/*	$NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $	*/

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_umc_v6_1.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include "umc_v6_1.h"
#include "amdgpu_ras.h"
#include "amdgpu.h"

#include "rsmu/rsmu_0_0_2_offset.h"
#include "rsmu/rsmu_0_0_2_sh_mask.h"
#include "umc/umc_6_1_1_offset.h"
#include "umc/umc_6_1_1_sh_mask.h"
#include "umc/umc_6_1_2_offset.h"

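/* register address stride between consecutive UMC instances,
 * see get_umc_6_reg_offset() below */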
#define UMC_6_INST_DIST			0x40000

/*
 * (addr / 256) * 8192; the higher 26 bits of ErrorAddr
 * are the index of the 8KB block
 */
#define ADDR_OF_8KB_BLOCK(addr)			(((addr) & ~0xffULL) << 5)
/* the channel index is the index of the 256B block */
#define ADDR_OF_256B_BLOCK(channel_index)	((channel_index) << 8)
/* offset in the 256B block */
#define OFFSET_IN_256B_BLOCK(addr)		((addr) & 0xffULL)

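/*
 * Worked example with hypothetical values: for err_addr = 0x1234 and
 * channel_index = 5, the three parts of the SoC physical address are
 *   ADDR_OF_8KB_BLOCK(0x1234)    = (0x1234 & ~0xff) << 5 = 0x24000
 *   ADDR_OF_256B_BLOCK(5)        = 5 << 8                = 0x00500
 *   OFFSET_IN_256B_BLOCK(0x1234) = 0x1234 & 0xff         = 0x00034
 * which OR together to the retired page address 0x24534.
 */
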
#define LOOP_UMC_INST(umc_inst) for ((umc_inst) = 0; (umc_inst) < adev->umc.umc_inst_num; (umc_inst)++)
#define LOOP_UMC_CH_INST(ch_inst) for ((ch_inst) = 0; (ch_inst) < adev->umc.channel_inst_num; (ch_inst)++)
#define LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) LOOP_UMC_INST((umc_inst)) LOOP_UMC_CH_INST((ch_inst))

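/*
 * Map (UMC instance, channel instance) to the physical DRAM channel index
 * used in UMC-channel-address to SoC-physical-address translation; consumed
 * through adev->umc.channel_idx_tbl in umc_v6_1_query_error_address().
 */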
const uint32_t
	umc_v6_1_channel_idx_tbl[UMC_V6_1_UMC_INSTANCE_NUM][UMC_V6_1_CHANNEL_INSTANCE_NUM] = {
		{2, 18, 11, 27},	{4, 20, 13, 29},
		{1, 17, 8, 24},		{7, 23, 14, 30},
		{10, 26, 3, 19},	{12, 28, 5, 21},
		{9, 25, 0, 16},		{15, 31, 6, 22}
};

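/*
 * With RSMU UMC index mode enabled, UMC register accesses go through the
 * RSMU index register; the callers below temporarily disable it so that each
 * UMC instance can be addressed directly via per-instance register offsets.
 */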
static void umc_v6_1_enable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 1);
}

static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
{
	WREG32_FIELD15(RSMU, 0, RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN, 0);
}

static uint32_t umc_v6_1_get_umc_index_mode_state(struct amdgpu_device *adev)
{
	uint32_t rsmu_umc_index;

	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);

	return REG_GET_FIELD(rsmu_umc_index,
			RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
			RSMU_UMC_INDEX_MODE_EN);
}

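/* compute the register offset of a channel within a given UMC instance */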
static inline uint32_t get_umc_6_reg_offset(struct amdgpu_device *adev,
					    uint32_t umc_inst,
					    uint32_t ch_inst)
{
	return adev->umc.channel_offs * ch_inst + UMC_6_INST_DIST * umc_inst;
}

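/*
 * Accumulate the correctable (CE) error count of one channel: read the ECC
 * error counter for both chip selects (counters are preloaded with
 * UMC_V6_1_CE_CNT_INIT, so the delta is the real count), reset them, and
 * also account for a correctable SRAM ECC error flagged in MCUMC_STATUS.
 */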
static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the lower chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and check the error count */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_1_CE_CNT_INIT);
	/* clear the higher chip error count */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* check for an SRAM correctable error;
	   MCUMC_STATUS is a 64-bit register */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
		*error_count += 1;
}

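/*
 * Accumulate the uncorrectable (UE) error count of one channel; any of the
 * Deferred/UECC/PCC/UC/TCC bits set in a valid MCUMC_STATUS counts as one UE.
 */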
static void umc_v6_1_query_uncorrectable_error_count(struct amdgpu_device *adev,
						     uint32_t umc_reg_offset,
						     unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
	}

	/* check the MCUMC_STATUS */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
		*error_count += 1;
}

static void umc_v6_1_query_ras_error_count(struct amdgpu_device *adev,
					   void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_correctable_error_count(adev,
						       umc_reg_offset,
						       &(err_data->ce_count));
		umc_v6_1_query_uncorrectable_error_count(adev,
							 umc_reg_offset,
							 &(err_data->ue_count));
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

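/*
 * Translate the raw MCA error address of one channel into a SoC physical
 * address and, for uncorrectable errors, record the page for retirement.
 */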
static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t lsb, mc_umc_status_addr;
	uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
	struct eeprom_table_record *err_rec;
	uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0_ARCT);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		mc_umc_status_addr =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
		mc_umc_addrt0 =
			SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);
	}

	/* skip error address processing if the err_addr
	 * buffer allocation failed (-ENOMEM) */
	if (!err_data->err_addr) {
		/* clear umc status */
		WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
		return;
	}

	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	/* calculate the error address if a ue/ce error is detected */
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {

		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		/* the lowest lsb bits should be ignored */
		lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
		err_addr &= ~((0x1ULL << lsb) - 1);

		/* translate the umc channel address to a soc pa composed of 3 parts */
		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

		/* only ue error information is saved currently, ce is skipped */
		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
				== 1) {
			err_rec->address = err_addr;
			/* the page frame address is saved */
			err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
			err_rec->ts = (uint64_t)ktime_get_real_seconds();
			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
			err_rec->cu = 0;
			err_rec->mem_channel = channel_index;
			err_rec->mcumc_id = umc_inst;

			err_data->err_addr_cnt++;
		}
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}

static void umc_v6_1_query_ras_error_address(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_query_error_address(adev,
					     err_data,
					     umc_reg_offset,
					     ch_inst,
					     umc_inst);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

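/*
 * Arm ECC error counting on one channel: route ce error reporting to an
 * APIC interrupt and preload both chip-select counters with
 * UMC_V6_1_CE_CNT_INIT so later reads can subtract the initial value.
 */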
static void umc_v6_1_err_cnt_init_per_channel(struct amdgpu_device *adev,
					      uint32_t umc_reg_offset)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt_addr;

	if (adev->asic_type == CHIP_ARCTURUS) {
		/* UMC 6_1_2 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel_ARCT);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt_ARCT);
	} else {
		/* UMC 6_1_1 registers */
		ecc_err_cnt_sel_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCntSel);
		ecc_err_cnt_addr =
			SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_EccErrCnt);
	}

	/* select the lower chip */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	/* set the ce error interrupt type to APIC based interrupt */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrInt, 0x1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	/* set the error count to its initial value */
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);

	/* select the higher chip and set the error count to its initial value */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
	WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V6_1_CE_CNT_INIT);
}

static void umc_v6_1_err_cnt_init(struct amdgpu_device *adev)
{
	uint32_t umc_inst        = 0;
	uint32_t ch_inst         = 0;
	uint32_t umc_reg_offset  = 0;

	uint32_t rsmu_umc_index_state = umc_v6_1_get_umc_index_mode_state(adev);

	if (rsmu_umc_index_state)
		umc_v6_1_disable_umc_index_mode(adev);

	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
		umc_reg_offset = get_umc_6_reg_offset(adev,
						      umc_inst,
						      ch_inst);

		umc_v6_1_err_cnt_init_per_channel(adev, umc_reg_offset);
	}

	if (rsmu_umc_index_state)
		umc_v6_1_enable_umc_index_mode(adev);
}

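/* UMC 6.1 RAS callbacks registered with the amdgpu RAS framework */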
const struct amdgpu_umc_funcs umc_v6_1_funcs = {
	.err_cnt_init = umc_v6_1_err_cnt_init,
	.ras_late_init = amdgpu_umc_ras_late_init,
	.query_ras_error_count = umc_v6_1_query_ras_error_count,
	.query_ras_error_address = umc_v6_1_query_ras_error_address,
};