/*	$NetBSD: amdgpu_amdkfd_gfx_v10.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $	*/

/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: amdgpu_amdkfd_gfx_v10.c,v 1.2 2021/12/18 23:44:58 riastradh Exp $");

#include <linux/mmu_context.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_10_1_0_offset.h"
#include "gc/gc_10_1_0_sh_mask.h"
#include "navi10_enum.h"
#include "athub/athub_2_0_0_offset.h"
#include "athub/athub_2_0_0_sh_mask.h"
#include "oss/osssys_5_0_0_offset.h"
#include "oss/osssys_5_0_0_sh_mask.h"
#include "soc15_common.h"
#include "v10_structs.h"
#include "nv.h"
#include "nvd.h"
#include "gfxhub_v2_0.h"

enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES,
	SAVE_WAVES
};

/* Because of REG_GET_FIELD() being used, we put this function in the
 * asic specific file.
 */
static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd,
		struct tile_config *config)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;

	config->gb_addr_config = adev->gfx.config.gb_addr_config;
#if 0
/* TODO - confirm REG_GET_FIELD x2, should be OK as is... but
 * MC_ARB_RAMCFG register doesn't exist on Vega10 - initial amdgpu
 * changes commented out related code, doing the same here for now but
 * need to sync with Ken et al
 */
	config->num_banks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
					MC_ARB_RAMCFG, NOOFBANK);
	config->num_ranks = REG_GET_FIELD(adev->gfx.config.mc_arb_ramcfg,
					MC_ARB_RAMCFG, NOOFRANKS);
#endif

	config->tile_config_ptr = adev->gfx.config.tile_mode_array;
	config->num_tile_configs =
			ARRAY_SIZE(adev->gfx.config.tile_mode_array);
	config->macro_tile_config_ptr =
			adev->gfx.config.macrotile_mode_array;
	config->num_macro_tile_configs =
			ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);

	return 0;
}

static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
{
	return (struct amdgpu_device *)kgd;
}

static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe,
			uint32_t queue, uint32_t vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	mutex_lock(&adev->srbm_mutex);
	nv_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct kgd_dev *kgd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	nv_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, queue_id, 0);
}

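/*
 * Each HQD owns one bit in the CP_PQ_WPTR_POLL_CNTL1 queue mask written by
 * kgd_hqd_load(): pipes are laid out consecutively, with num_queue_per_pipe
 * queues per pipe.
 */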
static uint64_t get_queue_mask(struct amdgpu_device *adev,
			       uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
			queue_id;

	return 1ull << bit;
}

static void release_queue(struct kgd_dev *kgd)
{
	unlock_srbm(kgd);
}

static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	lock_srbm(kgd, 0, 0, 0, vmid);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases);
	/* APE1 no longer exists on GFX9 */

	unlock_srbm(kgd);
}

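/*
 * Associate a PASID with a hardware VMID in the ATC mapping registers and in
 * the IH (interrupt handler) VMID lookup table, so that faults and interrupts
 * raised under this VMID can be attributed to the owning process.
 */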
static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid,
					unsigned int vmid)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	/*
	 * We have to assume that there is no outstanding mapping.
	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
	 * a mapping is in progress or because a mapping finished
	 * and the SW cleared it.
	 * So the protocol is to always wait & clear.
	 */
	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
			ATC_VMID0_PASID_MAPPING__VALID_MASK;

	pr_debug("pasid 0x%x vmid %d, reg value %x\n", pasid, vmid, pasid_mapping);

	pr_debug("ATHUB, reg %x\n", SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid);
	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
	       pasid_mapping);

#if 0
	/* TODO: uncomment this code when the hardware support is ready. */
	while (!(RREG32(SOC15_REG_OFFSET(
			ATHUB, 0,
			mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << vmid)))
		cpu_relax();

	pr_debug("ATHUB mapping update finished\n");
	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << vmid);
#endif

	/* Mapping vmid to pasid also for IH block */
	pr_debug("update mapping for IH block and mmhub");
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
	       pasid_mapping);

	return 0;
}

/* TODO - RING0 form of field is obsolete, seems to date back to SI
 * but still works
 */

static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(kgd, mec, pipe, 0, 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL),
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	unlock_srbm(kgd);

	return 0;
}

static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base[2] = {
		SOC15_REG_OFFSET(SDMA0, 0,
				 mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL,
		/* On gfx10, mmSDMA1_xxx registers are defined NOT based
		 * on SDMA1 base address (dw 0x1860) but based on SDMA0
		 * base address (dw 0x1260). Therefore use mmSDMA0_RLC0_RB_CNTL
		 * instead of mmSDMA1_RLC0_RB_CNTL for the base address calc
		 * below
		 */
		SOC15_REG_OFFSET(SDMA1, 0,
				 mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL
	};

	uint32_t retval = sdma_engine_reg_base[engine_id]
		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
			queue_id, retval);

	return retval;
}

#if 0
static uint32_t get_watch_base_addr(struct amdgpu_device *adev)
{
	uint32_t retval = SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) -
			mmTCP_WATCH0_ADDR_H;

	pr_debug("kfd: reg watch base address: 0x%x\n", retval);

	return retval;
}
#endif

static inline struct v10_compute_mqd *get_mqd(void *mqd)
{
	return (struct v10_compute_mqd *)mqd;
}

static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v10_sdma_mqd *)mqd;
}

static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_compute_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	pr_debug("Load hqd of pipe %d queue %d\n", pipe_id, queue_id);
	acquire_queue(kgd, pipe_id, queue_id);

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32(reg, mqd_hqd[reg - hqd_base]);


	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
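		/*
		 * Illustrative example (assumed values): with the QUEUE_SIZE
		 * field set to 9 the ring holds 2 << 9 = 0x400 dwords.  If
		 * the saved 32-bit RPTR is 0x3f0 and the saved 64-bit WPTR is
		 * 0x1_0000_0100, the WPTR's low bits (0x100) are below the
		 * RPTR (0x3f0), so the WPTR must have wrapped once more:
		 * guessed WPTR = 0x3f0 + 0x400 + 0x0 + (0x1ULL << 32)
		 *              = 0x1_0000_07f0.
		 */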
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uint64_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uint64_t)wptr));
		pr_debug("%s setting CP_PQ_WPTR_POLL_CNTL1 to %x\n", __func__,
			 (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
		WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1),
		       (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data);

	release_queue(kgd);

	return 0;
}

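/*
 * Load the HIQ's MQD by handing it to the KIQ with a MAP_QUEUES packet
 * instead of programming the HQD registers directly.
 */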
static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
			    uint32_t pipe_id, uint32_t queue_id,
			    uint32_t doorbell_off)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
	struct v10_compute_mqd *m;
	uint32_t mec, pipe;
	int r;

	m = get_mqd(mqd);

	acquire_queue(kgd, pipe_id, queue_id);

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
		 mec, pipe, queue_id);

	spin_lock(&adev->gfx.kiq.ring_lock);
	r = amdgpu_ring_alloc(kiq_ring, 7);
	if (r) {
		pr_err("Failed to alloc KIQ (%d).\n", r);
		goto out_unlock;
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(queue_id) |
			  PACKET3_MAP_QUEUES_PIPE(pipe) |
			  PACKET3_MAP_QUEUES_ME((mec - 1)) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
	amdgpu_ring_commit(kiq_ring);

out_unlock:
	spin_unlock(&adev->gfx.kiq.ring_lock);
	release_queue(kgd);

	return r;
}

static int kgd_hqd_dump(struct kgd_dev *kgd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(kgd, pipe_id, queue_id);

	for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(kgd);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

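/*
 * Restore an SDMA user queue from its MQD: disable the ring buffer, wait for
 * the engine context to go idle, reprogram the doorbell and ring pointers
 * (picking up a fresh write pointer from user memory when possible), then
 * re-enable the ring buffer.
 */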
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
				m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int kgd_hqd_sdma_dump(struct kgd_dev *kgd,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
			engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address,
				uint32_t pipe_id, uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	acquire_queue(kgd, pipe_id, queue_id);
	act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
	if (act) {
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) &&
		   high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI)))
			retval = true;
	}
	release_queue(kgd);
	return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

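/*
 * Preempt a compute HQD: issue the requested dequeue type and poll
 * CP_HQD_ACTIVE until the queue goes inactive or the caller's timeout
 * (in milliseconds) expires.
 */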
static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v10_compute_mqd *m = get_mqd(mqd);

#if 0
	unsigned long flags;
	int retry;
#endif

	acquire_queue(kgd, pipe_id, queue_id);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

#if 0 /* Is this still needed? */
	/* Workaround: If IQ timer is active and the wait time is close to or
	 * equal to 0, dequeueing is not safe. Wait until either the wait time
	 * is larger or timer is cleared. Also, ensure that IQ_REQ_PEND is
	 * cleared before continuing. Also, ensure wait times are set to at
	 * least 0x3.
	 */
	local_irq_save(flags);
	preempt_disable();
	retry = 5000; /* wait for 500 usecs at maximum */
	while (true) {
		temp = RREG32(mmCP_HQD_IQ_TIMER);
		if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, PROCESSING_IQ)) {
			pr_debug("HW is processing IQ\n");
			goto loop;
		}
		if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, ACTIVE)) {
			if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, RETRY_TYPE)
					== 3) /* SEM-rearm is safe */
				break;
			/* Wait time 3 is safe for CP, but our MMIO read/write
			 * time is close to 1 microsecond, so check for 10 to
			 * leave more buffer room
			 */
			if (REG_GET_FIELD(temp, CP_HQD_IQ_TIMER, WAIT_TIME)
					>= 10)
				break;
			pr_debug("IQ timer is active\n");
		} else
			break;
loop:
		if (!retry) {
			pr_err("CP HQD IQ timer status time out\n");
			break;
		}
		ndelay(100);
		--retry;
	}
	retry = 1000;
	while (true) {
		temp = RREG32(mmCP_HQD_DEQUEUE_REQUEST);
		if (!(temp & CP_HQD_DEQUEUE_REQUEST__IQ_REQ_PEND_MASK))
			break;
		pr_debug("Dequeue request is pending\n");

		if (!retry) {
			pr_err("CP HQD dequeue request time out\n");
			break;
		}
		ndelay(100);
		--retry;
	}
	local_irq_restore(flags);
	preempt_enable();
#endif

	WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			release_queue(kgd);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(kgd);
	return 0;
}

static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
				unsigned int utimeout)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	struct v10_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

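/*
 * Read back the ATC VMID-to-PASID mapping register for @vmid; the PASID is
 * returned through *p_pasid and the return value reports whether the mapping
 * is marked valid.
 */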
static bool get_atc_vmid_pasid_mapping_info(struct kgd_dev *kgd,
					uint8_t vmid, uint16_t *p_pasid)
{
	uint32_t value;
	struct amdgpu_device *adev = (struct amdgpu_device *) kgd;

	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;

	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
}

static int kgd_address_watch_disable(struct kgd_dev *kgd)
{
	return 0;
}

static int kgd_address_watch_execute(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					uint32_t cntl_val,
					uint32_t addr_hi,
					uint32_t addr_lo)
{
	return 0;
}

static int kgd_wave_control_execute(struct kgd_dev *kgd,
					uint32_t gfx_index_val,
					uint32_t sq_cmd)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd);

	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SA_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd,
					unsigned int watch_point_id,
					unsigned int reg_offset)
{
	return 0;
}

static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid,
		uint64_t page_table_base)
{
	struct amdgpu_device *adev = get_amdgpu_device(kgd);

	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* SDMA is on gfxhub as well for Navi1* series */
	gfxhub_v2_0_setup_vm_pt_regs(adev, vmid, page_table_base);
}

const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
	.program_sh_mem_settings = kgd_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
	.init_interrupts = kgd_init_interrupts,
	.hqd_load = kgd_hqd_load,
	.hiq_mqd_load = kgd_hiq_mqd_load,
	.hqd_sdma_load = kgd_hqd_sdma_load,
	.hqd_dump = kgd_hqd_dump,
	.hqd_sdma_dump = kgd_hqd_sdma_dump,
	.hqd_is_occupied = kgd_hqd_is_occupied,
	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_hqd_destroy,
	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
	.address_watch_disable = kgd_address_watch_disable,
	.address_watch_execute = kgd_address_watch_execute,
	.wave_control_execute = kgd_wave_control_execute,
	.address_watch_get_offset = kgd_address_watch_get_offset,
	.get_atc_vmid_pasid_mapping_info =
			get_atc_vmid_pasid_mapping_info,
	.get_tile_config = amdgpu_amdkfd_get_tile_config,
	.set_vm_context_page_table_base = set_vm_context_page_table_base,
	.get_hive_id = amdgpu_amdkfd_get_hive_id,
};