/*
 * Copyright 2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/slab.h>
#include <drm/drm_print.h>

#include "amdgpu_ring_mux.h"
#include "amdgpu_ring.h"
#include "amdgpu.h"

#define AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT (HZ / 2)
#define AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US 10000

static const struct ring_info {
	unsigned int hw_prio;
	const char *ring_name;
} sw_ring_info[] = {
	{ AMDGPU_RING_PRIO_DEFAULT, "gfx_low" },
	{ AMDGPU_RING_PRIO_2, "gfx_high" },
};

static struct kmem_cache *amdgpu_mux_chunk_slab;

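/*
 * Look up the mux entry that backs @ring; returns NULL if the ring's
 * entry_index is out of range.
 */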
static inline struct amdgpu_mux_entry *amdgpu_ring_mux_sw_entry(struct amdgpu_ring_mux *mux,
								struct amdgpu_ring *ring)
{
	return ring->entry_index < mux->ring_entry_size ?
			&mux->ring_entry[ring->entry_index] : NULL;
}

/* Copy packets from the sw ring in the range [s_start, s_end). */
static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
						  struct amdgpu_ring *ring,
						  u64 s_start, u64 s_end)
{
	u64 start, end;
	struct amdgpu_ring *real_ring = mux->real_ring;

	start = s_start & ring->buf_mask;
	end = s_end & ring->buf_mask;

	if (start == end) {
		DRM_ERROR("no data to copy from sw ring\n");
		return;
	}
	if (start > end) {
		amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
					   (ring->ring_size >> 2) - start);
		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
	} else {
		amdgpu_ring_alloc(real_ring, end - start);
		amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
	}
}

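/*
 * Re-copy the chunks of the low-priority ring whose fences have not signaled
 * yet (up to mux->seqno_to_resubmit) onto the real ring after a preemption.
 * The chunk whose fence matches the currently executing sequence number gets
 * its control/CE/DE packets patched first. Called with mux->lock held.
 */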
static void amdgpu_mux_resubmit_chunks(struct amdgpu_ring_mux *mux)
{
	struct amdgpu_mux_entry *e = NULL;
	struct amdgpu_mux_chunk *chunk;
	uint32_t seq, last_seq;
	int i;

	if (!mux->s_resubmit)
		return;

	/* Find the low-priority entry. */
	for (i = 0; i < mux->num_ring_entries; i++) {
		if (mux->ring_entry[i].ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
			e = &mux->ring_entry[i];
			break;
		}
	}

	if (!e) {
		DRM_ERROR("%s no low priority ring found\n", __func__);
		return;
	}

	last_seq = atomic_read(&e->ring->fence_drv.last_seq);
	seq = mux->seqno_to_resubmit;
	if (last_seq < seq) {
		/* Resubmit all the fences in the range (last_seq, seq]. */
		list_for_each_entry(chunk, &e->list, entry) {
			if (chunk->sync_seq > last_seq && chunk->sync_seq <= seq) {
				amdgpu_fence_update_start_timestamp(e->ring,
								    chunk->sync_seq,
								    ktime_get());
				if (chunk->sync_seq ==
					le32_to_cpu(*(e->ring->fence_drv.cpu_addr + 2))) {
					if (chunk->cntl_offset <= e->ring->buf_mask)
						amdgpu_ring_patch_cntl(e->ring,
								       chunk->cntl_offset);
					if (chunk->ce_offset <= e->ring->buf_mask)
						amdgpu_ring_patch_ce(e->ring, chunk->ce_offset);
					if (chunk->de_offset <= e->ring->buf_mask)
						amdgpu_ring_patch_de(e->ring, chunk->de_offset);
				}
				amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, e->ring,
								      chunk->start,
								      chunk->end);
				mux->wptr_resubmit = chunk->end;
				amdgpu_ring_commit(mux->real_ring);
			}
		}
	}

	del_timer(&mux->resubmit_timer);
	mux->s_resubmit = false;
}

static void amdgpu_ring_mux_schedule_resubmit(struct amdgpu_ring_mux *mux)
{
	mod_timer(&mux->resubmit_timer, jiffies + AMDGPU_MUX_RESUBMIT_JIFFIES_TIMEOUT);
}

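/* Timer fallback: resubmit the pending chunks if no submission has done so yet. */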
static void amdgpu_mux_resubmit_fallback(struct timer_list *t)
{
	struct amdgpu_ring_mux *mux = from_timer(mux, t, resubmit_timer);

	if (!spin_trylock(&mux->lock)) {
		amdgpu_ring_mux_schedule_resubmit(mux);
		DRM_ERROR("reschedule resubmit\n");
		return;
	}
	amdgpu_mux_resubmit_chunks(mux);
	spin_unlock(&mux->lock);
}

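/*
 * Set up the multiplexer on top of the real ring @ring: allocate room for
 * @entry_size software ring entries and create the chunk slab cache.
 */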
int amdgpu_ring_mux_init(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring,
			 unsigned int entry_size)
{
	mux->real_ring = ring;
	mux->num_ring_entries = 0;

	mux->ring_entry = kcalloc(entry_size, sizeof(struct amdgpu_mux_entry), GFP_KERNEL);
	if (!mux->ring_entry)
		return -ENOMEM;

	mux->ring_entry_size = entry_size;
	mux->s_resubmit = false;

	amdgpu_mux_chunk_slab = KMEM_CACHE(amdgpu_mux_chunk, SLAB_HWCACHE_ALIGN);
	if (!amdgpu_mux_chunk_slab) {
		DRM_ERROR("create amdgpu_mux_chunk cache failed\n");
		/* Don't leak the entry array if the slab cache can't be created. */
		kfree(mux->ring_entry);
		mux->ring_entry = NULL;
		return -ENOMEM;
	}

	spin_lock_init(&mux->lock);
	timer_setup(&mux->resubmit_timer, amdgpu_mux_resubmit_fallback, 0);

	return 0;
}

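/* Tear down the multiplexer, freeing all outstanding chunks and the entry array. */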
void amdgpu_ring_mux_fini(struct amdgpu_ring_mux *mux)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk, *chunk2;
	int i;

	for (i = 0; i < mux->num_ring_entries; i++) {
		e = &mux->ring_entry[i];
		list_for_each_entry_safe(chunk, chunk2, &e->list, entry) {
			list_del(&chunk->entry);
			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
		}
	}
	kmem_cache_destroy(amdgpu_mux_chunk_slab);
	kfree(mux->ring_entry);
	mux->ring_entry = NULL;
	mux->num_ring_entries = 0;
	mux->ring_entry_size = 0;
}

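/* Register a software ring as the next free entry of the multiplexer. */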
int amdgpu_ring_mux_add_sw_ring(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;

	if (mux->num_ring_entries >= mux->ring_entry_size) {
		DRM_ERROR("add sw ring exceeding max entry size\n");
		return -ENOENT;
	}

	e = &mux->ring_entry[mux->num_ring_entries];
	ring->entry_index = mux->num_ring_entries;
	e->ring = ring;

	INIT_LIST_HEAD(&e->list);
	mux->num_ring_entries += 1;
	return 0;
}

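/*
 * Advance the write pointer of a software ring and copy the newly written
 * packets onto the real ring, skipping anything the resubmit path has
 * already copied.
 */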
void amdgpu_ring_mux_set_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring, u64 wptr)
{
	struct amdgpu_mux_entry *e;

	spin_lock(&mux->lock);

	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT)
		amdgpu_mux_resubmit_chunks(mux);

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry for sw ring\n");
		spin_unlock(&mux->lock);
		return;
	}

	/* Skip this wptr update while a preemption is in progress. */
	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && mux->pending_trailing_fence_signaled) {
		spin_unlock(&mux->lock);
		return;
	}

	e->sw_cptr = e->sw_wptr;
	/* Advance cptr past the packets already copied by the resubmit path. */
	if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT && e->sw_cptr < mux->wptr_resubmit)
		e->sw_cptr = mux->wptr_resubmit;
	e->sw_wptr = wptr;
	e->start_ptr_in_hw_ring = mux->real_ring->wptr;

	/* Skip copying packets that have already been resubmitted. */
	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT || mux->wptr_resubmit < wptr) {
		amdgpu_ring_mux_copy_pkt_from_sw_ring(mux, ring, e->sw_cptr, wptr);
		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
		amdgpu_ring_commit(mux->real_ring);
	} else {
		e->end_ptr_in_hw_ring = mux->real_ring->wptr;
	}
	spin_unlock(&mux->lock);
}

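/* Return the last write pointer set on the software ring. */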
u64 amdgpu_ring_mux_get_wptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry for sw ring\n");
		return 0;
	}

	return e->sw_wptr;
}

/**
 * amdgpu_ring_mux_get_rptr - get the readptr of the software ring
 * @mux: the multiplexer the software rings attach to
 * @ring: the software ring of which we calculate the readptr
 *
 * The returned readptr is not precise because other rings may write data onto
 * the real ring buffer. Once the real ring has been overwritten, we cannot
 * tell whether our packets have been executed or not even read yet. However,
 * this function is only called by tools such as umr to collect the latest
 * packets for hang analysis. We assume the hang happens near our latest
 * submission, so we use the following logic to give a clue:
 * If the readptr is between start and end, return the copy pointer plus the
 * distance from start to readptr. If the readptr is before start, return the
 * copy pointer. Lastly, if the readptr is past end, return the write pointer.
 */
u64 amdgpu_ring_mux_get_rptr(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;
	u64 readp, offset, start, end;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("no sw entry found!\n");
		return 0;
	}

	readp = amdgpu_ring_get_rptr(mux->real_ring);

	start = e->start_ptr_in_hw_ring & mux->real_ring->buf_mask;
	end = e->end_ptr_in_hw_ring & mux->real_ring->buf_mask;
	if (start > end) {
		if (readp <= end)
			readp += mux->real_ring->ring_size >> 2;
		end += mux->real_ring->ring_size >> 2;
	}

	if (start <= readp && readp <= end) {
		offset = readp - start;
		e->sw_rptr = (e->sw_cptr + offset) & ring->buf_mask;
	} else if (readp < start) {
		e->sw_rptr = e->sw_cptr;
	} else {
		/* end < readptr */
		e->sw_rptr = e->sw_wptr;
	}

	return e->sw_rptr;
}

u64 amdgpu_sw_ring_get_rptr_gfx(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	return amdgpu_ring_mux_get_rptr(mux, ring);
}

u64 amdgpu_sw_ring_get_wptr_gfx(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	return amdgpu_ring_mux_get_wptr(mux, ring);
}

void amdgpu_sw_ring_set_wptr_gfx(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	amdgpu_ring_mux_set_wptr(mux, ring, ring->wptr);
}

/* Override insert_nop to prevent emitting nops to the software rings */
void amdgpu_sw_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
	WARN_ON(!ring->is_sw_ring);
}

const char *amdgpu_sw_ring_name(int idx)
{
	return idx < ARRAY_SIZE(sw_ring_info) ?
		sw_ring_info[idx].ring_name : NULL;
}

unsigned int amdgpu_sw_ring_priority(int idx)
{
	return idx < ARRAY_SIZE(sw_ring_info) ?
		sw_ring_info[idx].hw_prio : AMDGPU_RING_PRIO_DEFAULT;
}

/*
 * Preempt only when the low-priority ring has a fence that has been
 * unsignaled for too long and the high-priority ring has no fence pending.
 */
static int amdgpu_mcbp_scan(struct amdgpu_ring_mux *mux)
{
	struct amdgpu_ring *ring;
	int i, need_preempt;

	need_preempt = 0;
	for (i = 0; i < mux->num_ring_entries; i++) {
		ring = mux->ring_entry[i].ring;
		if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT &&
		    amdgpu_fence_count_emitted(ring) > 0)
			return 0;
		if (ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT &&
		    amdgpu_fence_last_unsignaled_time_us(ring) >
		    AMDGPU_MAX_LAST_UNSIGNALED_THRESHOLD_US)
			need_preempt = 1;
	}
	return need_preempt && !mux->s_resubmit;
}

/* Trigger Mid-Command Buffer Preemption (MCBP) and find out if we need to resubmit. */
static int amdgpu_mcbp_trigger_preempt(struct amdgpu_ring_mux *mux)
{
	int r;

	spin_lock(&mux->lock);
	mux->pending_trailing_fence_signaled = true;
	r = amdgpu_ring_preempt_ib(mux->real_ring);
	spin_unlock(&mux->lock);
	return r;
}

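/*
 * Begin an IB on a software ring. With MCBP enabled, a high-priority
 * submission only checks whether the low-priority ring should be preempted;
 * otherwise a new chunk is opened for possible resubmission.
 */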
void amdgpu_sw_ring_ib_begin(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	if (adev->gfx.mcbp && ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT) {
		if (amdgpu_mcbp_scan(mux) > 0)
			amdgpu_mcbp_trigger_preempt(mux);
		return;
	}

	amdgpu_ring_mux_start_ib(mux, ring);
}

void amdgpu_sw_ring_ib_end(struct amdgpu_ring *ring)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;

	WARN_ON(!ring->is_sw_ring);
	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
		return;
	amdgpu_ring_mux_end_ib(mux, ring);
}

void amdgpu_sw_ring_ib_mark_offset(struct amdgpu_ring *ring, enum amdgpu_ring_mux_offset_type type)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_ring_mux *mux = &adev->gfx.muxer;
	unsigned int offset;

	if (ring->hw_prio > AMDGPU_RING_PRIO_DEFAULT)
		return;

	offset = ring->wptr & ring->buf_mask;

	amdgpu_ring_mux_ib_mark_offset(mux, ring, offset, type);
}

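/*
 * Open a new chunk for the IB that is about to be written to the software
 * ring; any pending resubmission is flushed first.
 */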
void amdgpu_ring_mux_start_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk;

	spin_lock(&mux->lock);
	amdgpu_mux_resubmit_chunks(mux);
	spin_unlock(&mux->lock);

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

	chunk = kmem_cache_alloc(amdgpu_mux_chunk_slab, GFP_KERNEL);
	if (!chunk) {
		DRM_ERROR("alloc amdgpu_mux_chunk_slab failed\n");
		return;
	}

	chunk->start = ring->wptr;
	/*
	 * Initialize the offsets past the buffer so we can tell whether the
	 * IB submission set them.
	 */
	chunk->cntl_offset = ring->buf_mask + 1;
	chunk->de_offset = ring->buf_mask + 1;
	chunk->ce_offset = ring->buf_mask + 1;
	list_add_tail(&chunk->entry, &e->list);
}

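/* Drop all chunks whose fences have already signaled on the software ring. */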
static void scan_and_remove_signaled_chunk(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	uint32_t last_seq = 0;
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk, *tmp;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

	last_seq = atomic_read(&ring->fence_drv.last_seq);

	list_for_each_entry_safe(chunk, tmp, &e->list, entry) {
		if (chunk->sync_seq <= last_seq) {
			list_del(&chunk->entry);
			kmem_cache_free(amdgpu_mux_chunk_slab, chunk);
		}
	}
}

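/* Record the control/CE/DE packet offset in the chunk currently being built. */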
void amdgpu_ring_mux_ib_mark_offset(struct amdgpu_ring_mux *mux,
				    struct amdgpu_ring *ring, u64 offset,
				    enum amdgpu_ring_mux_offset_type type)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

	/* list_last_entry() never returns NULL; check for an empty list instead. */
	if (list_empty(&e->list)) {
		DRM_ERROR("cannot find chunk!\n");
		return;
	}

	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);

	switch (type) {
	case AMDGPU_MUX_OFFSET_TYPE_CONTROL:
		chunk->cntl_offset = offset;
		break;
	case AMDGPU_MUX_OFFSET_TYPE_DE:
		chunk->de_offset = offset;
		break;
	case AMDGPU_MUX_OFFSET_TYPE_CE:
		chunk->ce_offset = offset;
		break;
	default:
		DRM_ERROR("invalid type (%d)\n", type);
		break;
	}
}

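/*
 * Close the chunk currently being built: record its end pointer and fence
 * sequence number, then retire any chunks that have already signaled.
 */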
void amdgpu_ring_mux_end_ib(struct amdgpu_ring_mux *mux, struct amdgpu_ring *ring)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_mux_chunk *chunk;

	e = amdgpu_ring_mux_sw_entry(mux, ring);
	if (!e) {
		DRM_ERROR("cannot find entry!\n");
		return;
	}

	/* list_last_entry() never returns NULL; check for an empty list instead. */
	if (list_empty(&e->list)) {
		DRM_ERROR("cannot find chunk!\n");
		return;
	}

	chunk = list_last_entry(&e->list, struct amdgpu_mux_chunk, entry);

	chunk->end = ring->wptr;
	chunk->sync_seq = READ_ONCE(ring->fence_drv.sync_seq);

	scan_and_remove_signaled_chunk(mux, ring);
}

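/*
 * Handle the signaling of the trailing fence written by a preemption. If the
 * low-priority ring still has unsignaled fences, mark them for resubmission
 * and arm the fallback timer.
 */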
bool amdgpu_mcbp_handle_trailing_fence_irq(struct amdgpu_ring_mux *mux)
{
	struct amdgpu_mux_entry *e;
	struct amdgpu_ring *ring = NULL;
	int i;

	if (!mux->pending_trailing_fence_signaled)
		return false;

	if (mux->real_ring->trail_seq != le32_to_cpu(*mux->real_ring->trail_fence_cpu_addr))
		return false;

	for (i = 0; i < mux->num_ring_entries; i++) {
		e = &mux->ring_entry[i];
		if (e->ring->hw_prio <= AMDGPU_RING_PRIO_DEFAULT) {
			ring = e->ring;
			break;
		}
	}

	if (!ring) {
		DRM_ERROR("cannot find low priority ring\n");
		return false;
	}

	amdgpu_fence_process(ring);
	if (amdgpu_fence_count_emitted(ring) > 0) {
		mux->s_resubmit = true;
		mux->seqno_to_resubmit = ring->fence_drv.sync_seq;
		amdgpu_ring_mux_schedule_resubmit(mux);
	}

	mux->pending_trailing_fence_signaled = false;
	return true;
}