1// SPDX-License-Identifier: MIT 2/* 3 * Copyright 2024 Advanced Micro Devices, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 * OTHER DEALINGS IN THE SOFTWARE. 22 * 23 */ 24 25#include <generated/utsrelease.h> 26#include <linux/devcoredump.h> 27#include "amdgpu_dev_coredump.h" 28#include "atom.h" 29 30#ifndef CONFIG_DEV_COREDUMP 31void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 32 struct amdgpu_reset_context *reset_context) 33{ 34} 35#else 36 37const char *hw_ip_names[MAX_HWIP] = { 38 [GC_HWIP] = "GC", 39 [HDP_HWIP] = "HDP", 40 [SDMA0_HWIP] = "SDMA0", 41 [SDMA1_HWIP] = "SDMA1", 42 [SDMA2_HWIP] = "SDMA2", 43 [SDMA3_HWIP] = "SDMA3", 44 [SDMA4_HWIP] = "SDMA4", 45 [SDMA5_HWIP] = "SDMA5", 46 [SDMA6_HWIP] = "SDMA6", 47 [SDMA7_HWIP] = "SDMA7", 48 [LSDMA_HWIP] = "LSDMA", 49 [MMHUB_HWIP] = "MMHUB", 50 [ATHUB_HWIP] = "ATHUB", 51 [NBIO_HWIP] = "NBIO", 52 [MP0_HWIP] = "MP0", 53 [MP1_HWIP] = "MP1", 54 [UVD_HWIP] = "UVD/JPEG/VCN", 55 [VCN1_HWIP] = "VCN1", 56 [VCE_HWIP] = "VCE", 57 [VPE_HWIP] = "VPE", 58 [DF_HWIP] = "DF", 59 [DCE_HWIP] = "DCE", 60 [OSSSYS_HWIP] = "OSSSYS", 61 [SMUIO_HWIP] = "SMUIO", 62 [PWR_HWIP] = "PWR", 63 [NBIF_HWIP] = "NBIF", 64 [THM_HWIP] = "THM", 65 [CLK_HWIP] = "CLK", 66 [UMC_HWIP] = "UMC", 67 [RSMU_HWIP] = "RSMU", 68 [XGMI_HWIP] = "XGMI", 69 [DCI_HWIP] = "DCI", 70 [PCIE_HWIP] = "PCIE", 71}; 72 73static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, 74 struct drm_printer *p) 75{ 76 uint32_t version; 77 uint32_t feature; 78 uint8_t smu_program, smu_major, smu_minor, smu_debug; 79 struct atom_context *ctx = adev->mode_info.atom_context; 80 81 drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n", 82 adev->vce.fb_version, adev->vce.fw_version); 83 drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0, 84 adev->uvd.fw_version); 85 drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0, 86 adev->gmc.fw_version); 87 drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n", 88 adev->gfx.me_feature_version, adev->gfx.me_fw_version); 89 drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n", 90 adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version); 91 drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n", 92 adev->gfx.ce_feature_version, adev->gfx.ce_fw_version); 93 drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n", 94 adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version); 95 96 drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n", 97 adev->gfx.rlc_srlc_feature_version, 98 adev->gfx.rlc_srlc_fw_version); 99 drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n", 100 adev->gfx.rlc_srlg_feature_version, 101 adev->gfx.rlc_srlg_fw_version); 102 drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n", 103 adev->gfx.rlc_srls_feature_version, 104 adev->gfx.rlc_srls_fw_version); 105 drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n", 106 adev->gfx.rlcp_ucode_feature_version, 107 adev->gfx.rlcp_ucode_version); 108 drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n", 109 adev->gfx.rlcv_ucode_feature_version, 110 adev->gfx.rlcv_ucode_version); 111 drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n", 112 adev->gfx.mec_feature_version, adev->gfx.mec_fw_version); 113 114 if (adev->gfx.mec2_fw) 115 drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n", 116 adev->gfx.mec2_feature_version, 117 adev->gfx.mec2_fw_version); 118 119 drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0, 120 adev->gfx.imu_fw_version); 121 drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n", 122 adev->psp.sos.feature_version, adev->psp.sos.fw_version); 123 drm_printf(p, "PSP ASD feature version: %u, fw version: 0x%08x\n", 124 adev->psp.asd_context.bin_desc.feature_version, 125 adev->psp.asd_context.bin_desc.fw_version); 126 127 drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n", 128 adev->psp.xgmi_context.context.bin_desc.feature_version, 129 adev->psp.xgmi_context.context.bin_desc.fw_version); 130 drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n", 131 adev->psp.ras_context.context.bin_desc.feature_version, 132 adev->psp.ras_context.context.bin_desc.fw_version); 133 drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n", 134 adev->psp.hdcp_context.context.bin_desc.feature_version, 135 adev->psp.hdcp_context.context.bin_desc.fw_version); 136 drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n", 137 adev->psp.dtm_context.context.bin_desc.feature_version, 138 adev->psp.dtm_context.context.bin_desc.fw_version); 139 drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n", 140 adev->psp.rap_context.context.bin_desc.feature_version, 141 adev->psp.rap_context.context.bin_desc.fw_version); 142 drm_printf(p, 143 "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n", 144 adev->psp.securedisplay_context.context.bin_desc.feature_version, 145 adev->psp.securedisplay_context.context.bin_desc.fw_version); 146 147 /* SMC firmware */ 148 version = adev->pm.fw_version; 149 150 smu_program = (version >> 24) & 0xff; 151 smu_major = (version >> 16) & 0xff; 152 smu_minor = (version >> 8) & 0xff; 153 smu_debug = (version >> 0) & 0xff; 154 drm_printf(p, 155 "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n", 156 0, smu_program, version, smu_major, smu_minor, smu_debug); 157 158 /* SDMA firmware */ 159 for (int i = 0; i < adev->sdma.num_instances; i++) { 160 drm_printf(p, 161 "SDMA%d feature version: %u, firmware version: 0x%08x\n", 162 i, adev->sdma.instance[i].feature_version, 163 adev->sdma.instance[i].fw_version); 164 } 165 166 drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0, 167 adev->vcn.fw_version); 168 drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0, 169 adev->dm.dmcu_fw_version); 170 drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0, 171 adev->dm.dmcub_fw_version); 172 drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n", 173 adev->psp.toc.feature_version, adev->psp.toc.fw_version); 174 175 version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK; 176 feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >> 177 AMDGPU_MES_FEAT_VERSION_SHIFT; 178 drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n", 179 feature, version); 180 181 version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK; 182 feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >> 183 AMDGPU_MES_FEAT_VERSION_SHIFT; 184 drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature, 185 version); 186 187 drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n", 188 adev->vpe.feature_version, adev->vpe.fw_version); 189 190 drm_printf(p, "\nVBIOS Information\n"); 191 drm_printf(p, "vbios name : %s\n", ctx->name); 192 drm_printf(p, "vbios pn : %s\n", ctx->vbios_pn); 193 drm_printf(p, "vbios version : %d\n", ctx->version); 194 drm_printf(p, "vbios ver_str : %s\n", ctx->vbios_ver_str); 195 drm_printf(p, "vbios date : %s\n", ctx->date); 196} 197 198static ssize_t 199amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, 200 void *data, size_t datalen) 201{ 202 struct drm_printer p; 203 struct amdgpu_coredump_info *coredump = data; 204 struct drm_print_iterator iter; 205 struct amdgpu_vm_fault_info *fault_info; 206 int i, ver; 207 208 iter.data = buffer; 209 iter.offset = 0; 210 iter.start = offset; 211 iter.remain = count; 212 213 p = drm_coredump_printer(&iter); 214 215 drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); 216 drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); 217 drm_printf(&p, "kernel: " UTS_RELEASE "\n"); 218 drm_printf(&p, "module: " KBUILD_MODNAME "\n"); 219 drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, 220 coredump->reset_time.tv_nsec); 221 222 if (coredump->reset_task_info.pid) 223 drm_printf(&p, "process_name: %s PID: %d\n", 224 coredump->reset_task_info.process_name, 225 coredump->reset_task_info.pid); 226 227 /* GPU IP's information of the SOC */ 228 drm_printf(&p, "\nIP Information\n"); 229 drm_printf(&p, "SOC Family: %d\n", coredump->adev->family); 230 drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id); 231 drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id); 232 233 for (int i = 1; i < MAX_HWIP; i++) { 234 for (int j = 0; j < HWIP_MAX_INSTANCE; j++) { 235 ver = coredump->adev->ip_versions[i][j]; 236 if (ver) 237 drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n", 238 hw_ip_names[i], i, j, 239 IP_VERSION_MAJ(ver), 240 IP_VERSION_MIN(ver), 241 IP_VERSION_REV(ver), 242 IP_VERSION_VARIANT(ver), 243 IP_VERSION_SUBREV(ver)); 244 } 245 } 246 247 /* IP firmware information */ 248 drm_printf(&p, "\nIP Firmwares\n"); 249 amdgpu_devcoredump_fw_info(coredump->adev, &p); 250 251 if (coredump->ring) { 252 drm_printf(&p, "\nRing timed out details\n"); 253 drm_printf(&p, "IP Type: %d Ring Name: %s\n", 254 coredump->ring->funcs->type, 255 coredump->ring->name); 256 } 257 258 /* Add page fault information */ 259 fault_info = &coredump->adev->vm_manager.fault_info; 260 drm_printf(&p, "\n[%s] Page fault observed\n", 261 fault_info->vmhub ? "mmhub" : "gfxhub"); 262 drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr); 263 drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status); 264 265 /* dump the ip state for each ip */ 266 drm_printf(&p, "IP Dump\n"); 267 for (int i = 0; i < coredump->adev->num_ip_blocks; i++) { 268 if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) { 269 drm_printf(&p, "IP: %s\n", 270 coredump->adev->ip_blocks[i] 271 .version->funcs->name); 272 coredump->adev->ip_blocks[i] 273 .version->funcs->print_ip_state( 274 (void *)coredump->adev, &p); 275 drm_printf(&p, "\n"); 276 } 277 } 278 279 /* Add ring buffer information */ 280 drm_printf(&p, "Ring buffer information\n"); 281 for (int i = 0; i < coredump->adev->num_rings; i++) { 282 int j = 0; 283 struct amdgpu_ring *ring = coredump->adev->rings[i]; 284 285 drm_printf(&p, "ring name: %s\n", ring->name); 286 drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n", 287 amdgpu_ring_get_rptr(ring), 288 amdgpu_ring_get_wptr(ring), 289 ring->buf_mask); 290 drm_printf(&p, "Ring size in dwords: %d\n", 291 ring->ring_size / 4); 292 drm_printf(&p, "Ring contents\n"); 293 drm_printf(&p, "Offset \t Value\n"); 294 295 while (j < ring->ring_size) { 296 drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]); 297 j += 4; 298 } 299 } 300 301 if (coredump->reset_vram_lost) 302 drm_printf(&p, "VRAM is lost due to GPU reset!\n"); 303 if (coredump->adev->reset_info.num_regs) { 304 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); 305 306 for (i = 0; i < coredump->adev->reset_info.num_regs; i++) 307 drm_printf(&p, "0x%08x: 0x%08x\n", 308 coredump->adev->reset_info.reset_dump_reg_list[i], 309 coredump->adev->reset_info.reset_dump_reg_value[i]); 310 } 311 312 return count - iter.remain; 313} 314 315static void amdgpu_devcoredump_free(void *data) 316{ 317 kfree(data); 318} 319 320void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, 321 struct amdgpu_reset_context *reset_context) 322{ 323 struct amdgpu_coredump_info *coredump; 324 struct drm_device *dev = adev_to_drm(adev); 325 struct amdgpu_job *job = reset_context->job; 326 struct drm_sched_job *s_job; 327 328 coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); 329 330 if (!coredump) { 331 DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); 332 return; 333 } 334 335 coredump->reset_vram_lost = vram_lost; 336 337 if (reset_context->job && reset_context->job->vm) { 338 struct amdgpu_task_info *ti; 339 struct amdgpu_vm *vm = reset_context->job->vm; 340 341 ti = amdgpu_vm_get_task_info_vm(vm); 342 if (ti) { 343 coredump->reset_task_info = *ti; 344 amdgpu_vm_put_task_info(ti); 345 } 346 } 347 348 if (job) { 349 s_job = &job->base; 350 coredump->ring = to_amdgpu_ring(s_job->sched); 351 } 352 353 coredump->adev = adev; 354 355 ktime_get_ts64(&coredump->reset_time); 356 357 dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, 358 amdgpu_devcoredump_read, amdgpu_devcoredump_free); 359} 360#endif 361