/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>

#define MAX_WATCH_ADDRESSES	4

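/* kfd_dbg_ev_query_debug_event:
 *	Report the first pending exception the process has subscribed to,
 *	checking queues first, then devices, then the process itself.  Bits in
 *	exception_clear_mask are cleared on the reported source.  Returns
 *	-ENODATA if debugging is not enabled and -EAGAIN if nothing is pending.
 */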
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

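/* debug_event_write_work_handler:
 *	Deferred work that writes a single byte to the debugger's event file
 *	descriptor so that a poll/read on it wakes the debugger up.
 */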
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update the process/device/queue exception status and write to the event
 * descriptor only if the process has subscribed to the raised exception
 * (i.e. it is set in exception_enable_mask).
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
							process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
							exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

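/* kfd_dbg_send_exception_to_runtime:
 *	Forward debugger-raised exceptions to the HSA runtime.  Memory
 *	violations evict the process's queues on the faulting device and signal
 *	a VM fault event, EC_PROCESS_RUNTIME releases the runtime enable
 *	semaphore, and any remaining reasons are passed to
 *	kfd_send_exception_to_runtime() for the given queue.
 */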
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * The runtime-enable path only blocks on this semaphore after
		 * the debugger has received the runtime enable notice, so
		 * release it here to unblock it.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

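/* kfd_dbg_set_queue_workaround:
 *	Toggle the CWSR workaround on a single queue by updating its MQD.
 *	No-op on devices that do not need the workaround; enabling fails with
 *	-EBUSY if the user has already applied a CU mask to the queue.
 */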
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

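/* kfd_dbg_set_workaround:
 *	Apply or remove the CWSR workaround on every queue of the target
 *	process, rolling all queues back if enabling fails part way through.
 */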
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

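/* kfd_dbg_set_mes_debug_mode:
 *	Push the per-VMID debug settings (SPI debug override/launch mode, watch
 *	points and debug flags) to the MES scheduler.  No-op on devices without
 *	per-VMID debug support.
 */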
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
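/* Allocate a free device watch point ID for this process, or return -ENOMEM
 * if all MAX_WATCH_ADDRESSES slots are in use.
 */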
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

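/* kfd_dbg_trap_clear_dev_address_watch:
 *	Clear a previously programmed address watch point and release its ID.
 *	Queues are unmapped around the register write when MES is not in use;
 *	otherwise the MES shader debugger state is refreshed.
 */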
int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

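/* kfd_dbg_trap_set_dev_address_watch:
 *	Allocate a watch point ID and program the address watch on every XCC
 *	instance of the device, then remap queues (non-MES) or refresh the MES
 *	debug state.
 */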
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyway */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

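/* kfd_dbg_trap_set_flags:
 *	Update the debug flags for the target process and refresh every device
 *	that supports per-VMID debugging.  The previous flags are returned
 *	through *flags; on failure all devices are rewound to them.
 *	KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP requires per-VMID support on all
 *	devices.
 */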
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count: when unwind is true, how far down the pdd list we need
 *		      to unwind; ignored otherwise
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX OFF is already disabled by debug activate if RLC restore is not supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

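/* kfd_dbg_trap_disable:
 *	Tear down the debug session: deactivate if the runtime is enabled (or
 *	reset the runtime state for re-attach), release the event file, the
 *	debugger process link and the extra process reference, and clear all
 *	pending exception status.
 */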
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * If the runtime is not enabled, defer deactivation to runtime enable.
	 * Otherwise reset the attached, running target's runtime state back to
	 * enabled so that it can be re-attached later.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

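/* kfd_dbg_trap_activate:
 *	Activate debug trap handling on every device of the target process:
 *	reserve a debug VMID where per-VMID debugging is unsupported, enable
 *	the SPI debug trap and refresh the runlist or MES debug state.  Any
 *	failure unwinds the devices activated so far.
 */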
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep GFX OFF disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

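/* kfd_dbg_trap_enable:
 *	Enable debugging on the target process.  Holds an extra process
 *	reference and the event file for the debug session, copies the runtime
 *	info back to the debugger, and defers activation to runtime enable if
 *	the runtime is not yet enabled.
 */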
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime enable if the runtime is not yet enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						uint32_t trap_override,
						uint32_t trap_mask_request,
						uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
								pdd->dev->adev,
								trap_override,
								trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

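/* kfd_dbg_trap_set_wave_launch_override:
 *	Validate the requested trap mask against every device, then apply the
 *	wave launch trap override on all devices of the target process.
 */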
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						trap_override,
						trap_mask_request,
						trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

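/* kfd_dbg_trap_set_wave_launch_mode:
 *	Set the wave launch mode (normal, halt or debug) on every device of the
 *	target process and refresh the scheduler state.
 */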
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

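/* kfd_dbg_trap_query_exception_info:
 *	Copy the info payload for a queue, device or process exception to the
 *	debugger and optionally clear the exception status bit.  Returns
 *	-ENODATA if the requested exception is not pending.
 */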
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

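/* kfd_dbg_trap_device_snapshot:
 *	Copy a per-device snapshot (apertures, topology properties, exception
 *	status) for up to *number_of_device_infos devices to the debugger and
 *	report the total device count and entry size back.
 */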
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdds of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

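/* kfd_dbg_set_enabled_debug_exception_mask:
 *	Update the set of exceptions the debugger subscribes to and poke the
 *	event file descriptor immediately if any exception in the new mask is
 *	already pending.
 */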
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}