1// SPDX-License-Identifier: MIT
2/*
3 * Copyright 2024 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25#include <generated/utsrelease.h>
26#include <linux/devcoredump.h>
27#include "amdgpu_dev_coredump.h"
28#include "atom.h"
29
30#ifndef CONFIG_DEV_COREDUMP
31void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
32		     struct amdgpu_reset_context *reset_context)
33{
34}
35#else
36
37const char *hw_ip_names[MAX_HWIP] = {
38	[GC_HWIP]		= "GC",
39	[HDP_HWIP]		= "HDP",
40	[SDMA0_HWIP]		= "SDMA0",
41	[SDMA1_HWIP]		= "SDMA1",
42	[SDMA2_HWIP]		= "SDMA2",
43	[SDMA3_HWIP]		= "SDMA3",
44	[SDMA4_HWIP]		= "SDMA4",
45	[SDMA5_HWIP]		= "SDMA5",
46	[SDMA6_HWIP]		= "SDMA6",
47	[SDMA7_HWIP]		= "SDMA7",
48	[LSDMA_HWIP]		= "LSDMA",
49	[MMHUB_HWIP]		= "MMHUB",
50	[ATHUB_HWIP]		= "ATHUB",
51	[NBIO_HWIP]		= "NBIO",
52	[MP0_HWIP]		= "MP0",
53	[MP1_HWIP]		= "MP1",
54	[UVD_HWIP]		= "UVD/JPEG/VCN",
55	[VCN1_HWIP]		= "VCN1",
56	[VCE_HWIP]		= "VCE",
57	[VPE_HWIP]		= "VPE",
58	[DF_HWIP]		= "DF",
59	[DCE_HWIP]		= "DCE",
60	[OSSSYS_HWIP]		= "OSSSYS",
61	[SMUIO_HWIP]		= "SMUIO",
62	[PWR_HWIP]		= "PWR",
63	[NBIF_HWIP]		= "NBIF",
64	[THM_HWIP]		= "THM",
65	[CLK_HWIP]		= "CLK",
66	[UMC_HWIP]		= "UMC",
67	[RSMU_HWIP]		= "RSMU",
68	[XGMI_HWIP]		= "XGMI",
69	[DCI_HWIP]		= "DCI",
70	[PCIE_HWIP]		= "PCIE",
71};
72
73static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev,
74				       struct drm_printer *p)
75{
76	uint32_t version;
77	uint32_t feature;
78	uint8_t smu_program, smu_major, smu_minor, smu_debug;
79	struct atom_context *ctx = adev->mode_info.atom_context;
80
81	drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n",
82		   adev->vce.fb_version, adev->vce.fw_version);
83	drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0,
84		   adev->uvd.fw_version);
85	drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0,
86		   adev->gmc.fw_version);
87	drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n",
88		   adev->gfx.me_feature_version, adev->gfx.me_fw_version);
89	drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n",
90		   adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version);
91	drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n",
92		   adev->gfx.ce_feature_version, adev->gfx.ce_fw_version);
93	drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n",
94		   adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version);
95
96	drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n",
97		   adev->gfx.rlc_srlc_feature_version,
98		   adev->gfx.rlc_srlc_fw_version);
99	drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n",
100		   adev->gfx.rlc_srlg_feature_version,
101		   adev->gfx.rlc_srlg_fw_version);
102	drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n",
103		   adev->gfx.rlc_srls_feature_version,
104		   adev->gfx.rlc_srls_fw_version);
105	drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n",
106		   adev->gfx.rlcp_ucode_feature_version,
107		   adev->gfx.rlcp_ucode_version);
108	drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n",
109		   adev->gfx.rlcv_ucode_feature_version,
110		   adev->gfx.rlcv_ucode_version);
111	drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n",
112		   adev->gfx.mec_feature_version, adev->gfx.mec_fw_version);
113
114	if (adev->gfx.mec2_fw)
115		drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n",
116			   adev->gfx.mec2_feature_version,
117			   adev->gfx.mec2_fw_version);
118
119	drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0,
120		   adev->gfx.imu_fw_version);
121	drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n",
122		   adev->psp.sos.feature_version, adev->psp.sos.fw_version);
123	drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n",
124		   adev->psp.asd_context.bin_desc.feature_version,
125		   adev->psp.asd_context.bin_desc.fw_version);
126
127	drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n",
128		   adev->psp.xgmi_context.context.bin_desc.feature_version,
129		   adev->psp.xgmi_context.context.bin_desc.fw_version);
130	drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n",
131		   adev->psp.ras_context.context.bin_desc.feature_version,
132		   adev->psp.ras_context.context.bin_desc.fw_version);
133	drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n",
134		   adev->psp.hdcp_context.context.bin_desc.feature_version,
135		   adev->psp.hdcp_context.context.bin_desc.fw_version);
136	drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n",
137		   adev->psp.dtm_context.context.bin_desc.feature_version,
138		   adev->psp.dtm_context.context.bin_desc.fw_version);
139	drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n",
140		   adev->psp.rap_context.context.bin_desc.feature_version,
141		   adev->psp.rap_context.context.bin_desc.fw_version);
142	drm_printf(p,
143		   "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n",
144		   adev->psp.securedisplay_context.context.bin_desc.feature_version,
145		   adev->psp.securedisplay_context.context.bin_desc.fw_version);
146
147	/* SMC firmware */
148	version = adev->pm.fw_version;
149
150	smu_program = (version >> 24) & 0xff;
151	smu_major = (version >> 16) & 0xff;
152	smu_minor = (version >> 8) & 0xff;
153	smu_debug = (version >> 0) & 0xff;
154	drm_printf(p,
155		   "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n",
156		   0, smu_program, version, smu_major, smu_minor, smu_debug);
157
158	/* SDMA firmware */
159	for (int i = 0; i < adev->sdma.num_instances; i++) {
160		drm_printf(p,
161			   "SDMA%d feature version: %u, firmware version: 0x%08x\n",
162			   i, adev->sdma.instance[i].feature_version,
163			   adev->sdma.instance[i].fw_version);
164	}
165
166	drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0,
167		   adev->vcn.fw_version);
168	drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0,
169		   adev->dm.dmcu_fw_version);
170	drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0,
171		   adev->dm.dmcub_fw_version);
172	drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n",
173		   adev->psp.toc.feature_version, adev->psp.toc.fw_version);
174
175	version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK;
176	feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
177		  AMDGPU_MES_FEAT_VERSION_SHIFT;
178	drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n",
179		   feature, version);
180
181	version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
182	feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >>
183		  AMDGPU_MES_FEAT_VERSION_SHIFT;
184	drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature,
185		   version);
186
187	drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n",
188		   adev->vpe.feature_version, adev->vpe.fw_version);
189
190	drm_printf(p, "\nVBIOS Information\n");
191	drm_printf(p, "vbios name       : %s\n", ctx->name);
192	drm_printf(p, "vbios pn         : %s\n", ctx->vbios_pn);
193	drm_printf(p, "vbios version    : %d\n", ctx->version);
194	drm_printf(p, "vbios ver_str    : %s\n", ctx->vbios_ver_str);
195	drm_printf(p, "vbios date       : %s\n", ctx->date);
196}
197
198static ssize_t
199amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
200			void *data, size_t datalen)
201{
202	struct drm_printer p;
203	struct amdgpu_coredump_info *coredump = data;
204	struct drm_print_iterator iter;
205	struct amdgpu_vm_fault_info *fault_info;
206	int i, ver;
207
208	iter.data = buffer;
209	iter.offset = 0;
210	iter.start = offset;
211	iter.remain = count;
212
213	p = drm_coredump_printer(&iter);
214
215	drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
216	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
217	drm_printf(&p, "kernel: " UTS_RELEASE "\n");
218	drm_printf(&p, "module: " KBUILD_MODNAME "\n");
219	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec,
220		   coredump->reset_time.tv_nsec);
221
222	if (coredump->reset_task_info.pid)
223		drm_printf(&p, "process_name: %s PID: %d\n",
224			   coredump->reset_task_info.process_name,
225			   coredump->reset_task_info.pid);
226
227	/* GPU IP's information of the SOC */
228	drm_printf(&p, "\nIP Information\n");
229	drm_printf(&p, "SOC Family: %d\n", coredump->adev->family);
230	drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id);
231	drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id);
232
233	for (int i = 1; i < MAX_HWIP; i++) {
234		for (int j = 0; j < HWIP_MAX_INSTANCE; j++) {
235			ver = coredump->adev->ip_versions[i][j];
236			if (ver)
237				drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n",
238					   hw_ip_names[i], i, j,
239					   IP_VERSION_MAJ(ver),
240					   IP_VERSION_MIN(ver),
241					   IP_VERSION_REV(ver),
242					   IP_VERSION_VARIANT(ver),
243					   IP_VERSION_SUBREV(ver));
244		}
245	}
246
247	/* IP firmware information */
248	drm_printf(&p, "\nIP Firmwares\n");
249	amdgpu_devcoredump_fw_info(coredump->adev, &p);
250
251	if (coredump->ring) {
252		drm_printf(&p, "\nRing timed out details\n");
253		drm_printf(&p, "IP Type: %d Ring Name: %s\n",
254			   coredump->ring->funcs->type,
255			   coredump->ring->name);
256	}
257
258	/* Add page fault information */
259	fault_info = &coredump->adev->vm_manager.fault_info;
260	drm_printf(&p, "\n[%s] Page fault observed\n",
261		   fault_info->vmhub ? "mmhub" : "gfxhub");
262	drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr);
263	drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status);
264
265	/* dump the ip state for each ip */
266	drm_printf(&p, "IP Dump\n");
267	for (int i = 0; i < coredump->adev->num_ip_blocks; i++) {
268		if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) {
269			drm_printf(&p, "IP: %s\n",
270				   coredump->adev->ip_blocks[i]
271					   .version->funcs->name);
272			coredump->adev->ip_blocks[i]
273				.version->funcs->print_ip_state(
274					(void *)coredump->adev, &p);
275			drm_printf(&p, "\n");
276		}
277	}
278
279	/* Add ring buffer information */
280	drm_printf(&p, "Ring buffer information\n");
281	for (int i = 0; i < coredump->adev->num_rings; i++) {
282		int j = 0;
283		struct amdgpu_ring *ring = coredump->adev->rings[i];
284
285		drm_printf(&p, "ring name: %s\n", ring->name);
286		drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n",
287			   amdgpu_ring_get_rptr(ring),
288			   amdgpu_ring_get_wptr(ring),
289			   ring->buf_mask);
290		drm_printf(&p, "Ring size in dwords: %d\n",
291			   ring->ring_size / 4);
292		drm_printf(&p, "Ring contents\n");
293		drm_printf(&p, "Offset \t Value\n");
294
295		while (j < ring->ring_size) {
296			drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]);
297			j += 4;
298		}
299	}
300
301	if (coredump->reset_vram_lost)
302		drm_printf(&p, "VRAM is lost due to GPU reset!\n");
303	if (coredump->adev->reset_info.num_regs) {
304		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
305
306		for (i = 0; i < coredump->adev->reset_info.num_regs; i++)
307			drm_printf(&p, "0x%08x: 0x%08x\n",
308				   coredump->adev->reset_info.reset_dump_reg_list[i],
309				   coredump->adev->reset_info.reset_dump_reg_value[i]);
310	}
311
312	return count - iter.remain;
313}
314
/*
 * amdgpu_devcoredump_free - free callback registered with dev_coredumpm()
 * @data: the struct amdgpu_coredump_info allocated in amdgpu_coredump()
 *
 * Releases the coredump descriptor with kfree().
 */
static void amdgpu_devcoredump_free(void *data)
{
	struct amdgpu_coredump_info *coredump = data;

	kfree(coredump);
}
319
320void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
321		     struct amdgpu_reset_context *reset_context)
322{
323	struct amdgpu_coredump_info *coredump;
324	struct drm_device *dev = adev_to_drm(adev);
325	struct amdgpu_job *job = reset_context->job;
326	struct drm_sched_job *s_job;
327
328	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
329
330	if (!coredump) {
331		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
332		return;
333	}
334
335	coredump->reset_vram_lost = vram_lost;
336
337	if (reset_context->job && reset_context->job->vm) {
338		struct amdgpu_task_info *ti;
339		struct amdgpu_vm *vm = reset_context->job->vm;
340
341		ti = amdgpu_vm_get_task_info_vm(vm);
342		if (ti) {
343			coredump->reset_task_info = *ti;
344			amdgpu_vm_put_task_info(ti);
345		}
346	}
347
348	if (job) {
349		s_job = &job->base;
350		coredump->ring = to_amdgpu_ring(s_job->sched);
351	}
352
353	coredump->adev = adev;
354
355	ktime_get_ts64(&coredump->reset_time);
356
357	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
358		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
359}
360#endif
361