1// SPDX-License-Identifier: MIT
2/*
 * Copyright © 2021 Intel Corporation
4 */
5
6#include "xe_lrc.h"
7
8#include <linux/ascii85.h>
9
10#include "instructions/xe_mi_commands.h"
11#include "instructions/xe_gfxpipe_commands.h"
12#include "instructions/xe_gfx_state_commands.h"
13#include "regs/xe_engine_regs.h"
14#include "regs/xe_gpu_commands.h"
15#include "regs/xe_lrc_layout.h"
16#include "xe_bb.h"
17#include "xe_bo.h"
18#include "xe_device.h"
19#include "xe_drm_client.h"
20#include "xe_exec_queue_types.h"
21#include "xe_gt.h"
22#include "xe_gt_printk.h"
23#include "xe_hw_fence.h"
24#include "xe_map.h"
25#include "xe_memirq.h"
26#include "xe_sriov.h"
27#include "xe_vm.h"
28
29#define LRC_VALID				BIT_ULL(0)
30#define LRC_PRIVILEGE				BIT_ULL(8)
31#define LRC_ADDRESSING_MODE			GENMASK_ULL(4, 3)
32#define LRC_LEGACY_64B_CONTEXT			3
33
34#define LRC_ENGINE_CLASS			GENMASK_ULL(63, 61)
35#define LRC_ENGINE_INSTANCE			GENMASK_ULL(53, 48)
36
/*
 * State captured from an LRC for later inspection (e.g. error dump).
 * Fields are filled by snapshot code elsewhere in this file/driver.
 */
struct xe_lrc_snapshot {
	struct xe_bo *lrc_bo;		/* BO backing the snapshotted LRC */
	void *lrc_snapshot;		/* CPU-side copy of (part of) the LRC */
	unsigned long lrc_size, lrc_offset;	/* size/offset of the copied region */

	u32 context_desc;		/* HW context descriptor at capture time */
	u32 head;			/* ring head */
	struct {
		u32 internal;		/* tail as tracked by the driver */
		u32 memory;		/* tail as stored in the context image */
	} tail;
	u32 start_seqno;		/* presumably first pending seqno — confirm in snapshot code */
	u32 seqno;			/* last seqno value in the PPHWSP */
};
51
52static struct xe_device *
53lrc_to_xe(struct xe_lrc *lrc)
54{
55	return gt_to_xe(lrc->fence_ctx.gt);
56}
57
/*
 * xe_lrc_size() - Byte size of the context image for an engine class.
 * @xe: device (used to pick per-generation sizes)
 * @class: engine class
 *
 * Unknown classes warn and fall through to the common 2-page size.
 */
size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
{
	switch (class) {
	case XE_ENGINE_CLASS_RENDER:
		if (GRAPHICS_VER(xe) >= 20)
			return 4 * SZ_4K;
		else
			return 14 * SZ_4K;
	case XE_ENGINE_CLASS_COMPUTE:
		/* 14 pages since graphics_ver == 11 */
		if (GRAPHICS_VER(xe) >= 20)
			return 3 * SZ_4K;
		else
			return 14 * SZ_4K;
	default:
		WARN(1, "Unknown engine class: %d", class);
		fallthrough;
	case XE_ENGINE_CLASS_COPY:
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
	case XE_ENGINE_CLASS_OTHER:
		return 2 * SZ_4K;
	}
}
82
83/*
84 * The per-platform tables are u8-encoded in @data. Decode @data and set the
85 * addresses' offset and commands in @regs. The following encoding is used
86 * for each byte. There are 2 steps: decoding commands and decoding addresses.
87 *
88 * Commands:
89 * [7]: create NOPs - number of NOPs are set in lower bits
90 * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set
91 *      MI_LRI_FORCE_POSTED
92 * [5:0]: Number of NOPs or registers to set values to in case of
93 *        MI_LOAD_REGISTER_IMM
94 *
95 * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count"
96 * number of registers. They are set by using the REG/REG16 macros: the former
97 * is used for offsets smaller than 0x200 while the latter is for values bigger
98 * than that. Those macros already set all the bits documented below correctly:
99 *
100 * [7]: When a register offset needs more than 6 bits, use additional bytes, to
101 *      follow, for the lower bits
102 * [6:0]: Register offset, without considering the engine base.
103 *
104 * This function only tweaks the commands and register offsets. Values are not
105 * filled out.
106 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct xe_hw_engine *hwe)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | \
			   BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
{
	const u32 base = hwe->mmio_base;

	/* See the encoding description in the comment block above. */
	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			/* NOP(n): leave n dwords untouched in @regs */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI header byte: count in [5:0], flags in [7:6] */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		xe_gt_assert(hwe->gt, count);
		do {
			u32 offset = 0;
			u8 v;

			/*
			 * Offsets are 7 bits per byte, MSB first; bit 7 set
			 * means another (lower-order) byte follows.
			 */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;	/* skip the value dword; filled later */
		} while (--count);
	}

	/* Terminate the register state with a batch-buffer-end marker. */
	*regs = MI_BATCH_BUFFER_END | BIT(0);
}
158
/* Context-register offset table for Gen12 non-render engines (see set_offsets()). */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};
190
/* Context-register offset table for DG2 non-render engines (see set_offsets()). */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	0
};
224
/* Context-register offset table for the Gen12 render engine (see set_offsets()). */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	0
};
320
/* Context-register offset table for the Xe_HP render engine (see set_offsets()). */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};
361
/* Context-register offset table for the DG2 render engine (see set_offsets()). */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};
404
/* Context-register offset table for the MTL render engine (see set_offsets()). */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	0
};
447
/*
 * Common prefix shared by all Xe2 context-register offset tables below.
 * The bracketed numbers are dword indices into the register state area.
 */
#define XE2_CTX_COMMON \
	NOP(1),                 /* [0x00] */ \
	LRI(15, POSTED),        /* [0x01] */ \
	REG16(0x244),           /* [0x02] CTXT_SR_CTL */ \
	REG(0x034),             /* [0x04] RING_BUFFER_HEAD */ \
	REG(0x030),             /* [0x06] RING_BUFFER_TAIL */ \
	REG(0x038),             /* [0x08] RING_BUFFER_START */ \
	REG(0x03c),             /* [0x0a] RING_BUFFER_CONTROL */ \
	REG(0x168),             /* [0x0c] BB_ADDR_UDW */ \
	REG(0x140),             /* [0x0e] BB_ADDR */ \
	REG(0x110),             /* [0x10] BB_STATE */ \
	REG(0x1c0),             /* [0x12] BB_PER_CTX_PTR */ \
	REG(0x1c4),             /* [0x14] RCS_INDIRECT_CTX */ \
	REG(0x1c8),             /* [0x16] RCS_INDIRECT_CTX_OFFSET */ \
	REG(0x180),             /* [0x18] CCID */ \
	REG16(0x2b4),           /* [0x1a] SEMAPHORE_TOKEN */ \
	REG(0x120),             /* [0x1c] PRT_BB_STATE */ \
	REG(0x124),             /* [0x1e] PRT_BB_STATE_UDW */ \
	\
	NOP(1),                 /* [0x20] */ \
	LRI(9, POSTED),         /* [0x21] */ \
	REG16(0x3a8),           /* [0x22] CTX_TIMESTAMP */ \
	REG16(0x3ac),           /* [0x24] CTX_TIMESTAMP_UDW */ \
	REG(0x108),             /* [0x26] INDIRECT_RING_STATE */ \
	REG16(0x284),           /* [0x28] dummy reg */ \
	REG16(0x280),           /* [0x2a] CS_ACC_CTR_THOLD */ \
	REG16(0x27c),           /* [0x2c] CS_CTX_SYS_PASID */ \
	REG16(0x278),           /* [0x2e] CS_CTX_ASID */ \
	REG16(0x274),           /* [0x30] PTBP_UDW */ \
	REG16(0x270)            /* [0x32] PTBP_LDW */
478
/* Xe2 render engine: common prefix plus scheduling/preemption and RPCS regs. */
static const u8 xe2_rcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(2),                 /* [0x34] */
	LRI(2, POSTED),         /* [0x36] */
	REG16(0x5a8),           /* [0x37] CONTEXT_SCHEDULING_ATTRIBUTES */
	REG16(0x5ac),           /* [0x39] PREEMPTION_STATUS */

	NOP(6),                 /* [0x41] */
	LRI(1, 0),              /* [0x47] */
	REG(0x0c8),             /* [0x48] R_PWR_CLK_STATE */

	0
};
493
/* Xe2 blitter engine: common prefix plus BCS-specific control registers. */
static const u8 xe2_bcs_offsets[] = {
	XE2_CTX_COMMON,

	NOP(4 + 8 + 1),         /* [0x34] */
	LRI(2, POSTED),         /* [0x41] */
	REG16(0x200),           /* [0x42] BCS_SWCTRL */
	REG16(0x204),           /* [0x44] BLIT_CCTL */

	0
};
504
/* Xe2 video/other engines: just the common prefix. */
static const u8 xe2_xcs_offsets[] = {
	XE2_CTX_COMMON,

	0
};
510
511#undef REG16
512#undef REG
513#undef LRI
514#undef NOP
515
516static const u8 *reg_offsets(struct xe_device *xe, enum xe_engine_class class)
517{
518	if (class == XE_ENGINE_CLASS_RENDER) {
519		if (GRAPHICS_VER(xe) >= 20)
520			return xe2_rcs_offsets;
521		else if (GRAPHICS_VERx100(xe) >= 1270)
522			return mtl_rcs_offsets;
523		else if (GRAPHICS_VERx100(xe) >= 1255)
524			return dg2_rcs_offsets;
525		else if (GRAPHICS_VERx100(xe) >= 1250)
526			return xehp_rcs_offsets;
527		else
528			return gen12_rcs_offsets;
529	} else if (class == XE_ENGINE_CLASS_COPY) {
530		if (GRAPHICS_VER(xe) >= 20)
531			return xe2_bcs_offsets;
532		else
533			return gen12_xcs_offsets;
534	} else {
535		if (GRAPHICS_VER(xe) >= 20)
536			return xe2_xcs_offsets;
537		else if (GRAPHICS_VERx100(xe) >= 1255)
538			return dg2_xcs_offsets;
539		else
540			return gen12_xcs_offsets;
541	}
542}
543
/*
 * Program CTX_CONTEXT_CONTROL in the register state: inhibit implicit
 * context switching and context restore (masked-bit write).
 */
static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
{
	regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
						       CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);

	/* TODO: Timestamp */
}
551
/*
 * For SR-IOV VFs with memory-based interrupts, add LRM/LRI commands to the
 * register state so the context loads its interrupt-mask enable value and
 * publishes its status/source report pointers from/to the memirq pages.
 * No-op otherwise.
 */
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
	struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->sriov.vf.memirq;
	struct xe_device *xe = gt_to_xe(hwe->gt);

	if (!IS_SRIOV_VF(xe) || !xe_device_has_memirq(xe))
		return;

	/* MI_LOAD_REGISTER_MEM: load RING_IMR from the memirq enable page */
	regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
					MI_LRI_LRM_CS_MMIO | MI_LRM_USE_GGTT;
	regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
	regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);

	/* MI_LOAD_REGISTER_IMM: set the status/source report pointers */
	regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
				       MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
	regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
	regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq);
	regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
	regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq);
}
572
573static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
574{
575	struct xe_device *xe = gt_to_xe(hwe->gt);
576
577	if (GRAPHICS_VERx100(xe) >= 1250)
578		return 0x70;
579	else
580		return 0x60;
581}
582
583static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
584{
585	int x;
586
587	x = lrc_ring_mi_mode(hwe);
588	regs[x + 1] &= ~STOP_RING;
589	regs[x + 1] |= STOP_RING << 16;
590}
591
/* The ring buffer sits at the very start of the LRC BO. */
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
	return 0;
}
596
/* The per-process HW status page immediately follows the ring buffer. */
u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
{
	return lrc->ring.size;
}
601
602/* Make the magic macros work */
603#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
604
605#define LRC_SEQNO_PPHWSP_OFFSET 512
606#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
607#define LRC_PARALLEL_PPHWSP_OFFSET 2048
608#define LRC_PPHWSP_SIZE SZ_4K
609
610static size_t lrc_reg_size(struct xe_device *xe)
611{
612	if (GRAPHICS_VERx100(xe) >= 1250)
613		return 96 * sizeof(u32);
614	else
615		return 80 * sizeof(u32);
616}
617
/* Bytes at the start of a context image to skip: PPHWSP plus register state. */
size_t xe_lrc_skip_size(struct xe_device *xe)
{
	return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
}
622
/* Offset of the fence seqno slot within the LRC BO. */
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
{
	/* The seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_SEQNO_PPHWSP_OFFSET;
}
628
/* Offset of the start-seqno slot within the LRC BO. */
static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
{
	/* The start seqno is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}
634
/* Offset of the parallel-submission scratch area within the LRC BO. */
static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
	/* The parallel is stored in the driver-defined portion of PPHWSP */
	return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}
640
/* Offset of the context register state, directly after the PPHWSP. */
static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
{
	return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
}
645
/*
 * For each LRC element with a __xe_lrc_<elem>_offset() helper above,
 * generate __xe_lrc_<elem>_map() (iosys_map into the BO vmap) and
 * __xe_lrc_<elem>_ggtt_addr() (GGTT address of the element).
 */
#define DECL_MAP_ADDR_HELPERS(elem) \
static inline struct iosys_map __xe_lrc_##elem##_map(struct xe_lrc *lrc) \
{ \
	struct iosys_map map = lrc->bo->vmap; \
\
	xe_assert(lrc_to_xe(lrc), !iosys_map_is_null(&map));  \
	iosys_map_incr(&map, __xe_lrc_##elem##_offset(lrc)); \
	return map; \
} \
static inline u32 __maybe_unused __xe_lrc_##elem##_ggtt_addr(struct xe_lrc *lrc) \
{ \
	return xe_bo_ggtt_addr(lrc->bo) + __xe_lrc_##elem##_offset(lrc); \
} \

DECL_MAP_ADDR_HELPERS(ring)
DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
DECL_MAP_ADDR_HELPERS(parallel)

#undef DECL_MAP_ADDR_HELPERS
668
/* GGTT address of the context image (the PPHWSP), used in the descriptor. */
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_pphwsp_ggtt_addr(lrc);
}
673
674u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
675{
676	struct xe_device *xe = lrc_to_xe(lrc);
677	struct iosys_map map;
678
679	map = __xe_lrc_regs_map(lrc);
680	iosys_map_incr(&map, reg_nr * sizeof(u32));
681	return xe_map_read32(xe, &map);
682}
683
684void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
685{
686	struct xe_device *xe = lrc_to_xe(lrc);
687	struct iosys_map map;
688
689	map = __xe_lrc_regs_map(lrc);
690	iosys_map_incr(&map, reg_nr * sizeof(u32));
691	xe_map_write32(xe, &map, val);
692}
693
/*
 * Build a default context image for @hwe's class in a fresh kzalloc'ed
 * buffer: zeroed PPHWSP followed by the register state populated with the
 * platform's LRI skeleton and default control/interrupt settings.
 *
 * Returns the buffer (caller frees) or NULL on allocation failure.
 */
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
	struct xe_device *xe = gt_to_xe(hwe->gt);
	void *data;
	u32 *regs;

	data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
	if (!data)
		return NULL;

	/* 1st page: Per-Process of HW status Page */
	regs = data + LRC_PPHWSP_SIZE;
	set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
	set_context_control(regs, hwe);
	set_memory_based_intr(regs, hwe);
	reset_stop_ring(regs, hwe);

	return data;
}
713
714static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
715{
716	u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
717
718	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
719	xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
720}
721
722#define PVC_CTX_ASID		(0x2e + 1)
723#define PVC_CTX_ACC_CTR_THOLD	(0x2a + 1)
724
/**
 * xe_lrc_init() - Allocate and initialize a logical ring context.
 * @lrc: LRC to initialize
 * @hwe: hardware engine this context will run on
 * @q: exec queue (not referenced here)
 * @vm: VM providing the PPGTT, or NULL for a GGTT-only context
 * @ring_size: size in bytes of the ring placed at the start of the BO
 *
 * Allocates a pinned BO holding ring + context image, seeds the image from
 * the GT's default LRC (or a freshly built one), and programs ring state,
 * PPGTT, ASID and the context descriptor.
 *
 * Return: 0 on success, negative error code on failure.
 */
int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
		struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
{
	struct xe_gt *gt = hwe->gt;
	struct xe_tile *tile = gt_to_tile(gt);
	struct xe_device *xe = gt_to_xe(gt);
	struct iosys_map map;
	void *init_data = NULL;
	u32 arb_enable;
	int err;

	lrc->flags = 0;

	/*
	 * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
	 * via VM bind calls.
	 */
	lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
				      ring_size + xe_lrc_size(xe, hwe->class),
				      ttm_bo_type_kernel,
				      XE_BO_FLAG_VRAM_IF_DGFX(tile) |
				      XE_BO_FLAG_GGTT |
				      XE_BO_FLAG_GGTT_INVALIDATE);
	if (IS_ERR(lrc->bo))
		return PTR_ERR(lrc->bo);

	lrc->tile = gt_to_tile(hwe->gt);
	lrc->ring.size = ring_size;
	lrc->ring.tail = 0;

	xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
			     hwe->fence_irq, hwe->name);

	/* No default LRC recorded for this class yet: build one from scratch */
	if (!gt->default_lrc[hwe->class]) {
		init_data = empty_lrc_data(hwe);
		if (!init_data) {
			err = -ENOMEM;
			goto err_lrc_finish;
		}
	}

	/*
	 * Init Per-Process of HW status Page, LRC / context state to known
	 * values
	 */
	map = __xe_lrc_pphwsp_map(lrc);
	if (!init_data) {
		xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE);	/* PPHWSP */
		xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
				 gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
				 xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
	} else {
		xe_map_memcpy_to(xe, &map, 0, init_data,
				 xe_lrc_size(xe, hwe->class));
		kfree(init_data);
	}

	if (vm) {
		xe_lrc_set_ppgtt(lrc, vm);

		if (vm->xef)
			xe_drm_client_add_bo(vm->xef->client, lrc->bo);
	}

	/* Program ring location/geometry into the context image */
	xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
	xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
			     RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
	if (xe->info.has_asid && vm)
		xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);

	lrc->desc = LRC_VALID;
	lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
	/* TODO: Priority */

	/* While this appears to have something about privileged batches or
	 * some such, it really just means PPGTT mode.
	 */
	if (vm)
		lrc->desc |= LRC_PRIVILEGE;

	/* Pre-Xe_HP platforms encode engine class/instance in the descriptor */
	if (GRAPHICS_VERx100(xe) < 1250) {
		lrc->desc |= FIELD_PREP(LRC_ENGINE_INSTANCE, hwe->instance);
		lrc->desc |= FIELD_PREP(LRC_ENGINE_CLASS, hwe->class);
	}

	/* Start the ring with arbitration enabled */
	arb_enable = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	xe_lrc_write_ring(lrc, &arb_enable, sizeof(arb_enable));

	/* Seed seqno slots one before the first fence's seqno */
	map = __xe_lrc_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	map = __xe_lrc_start_seqno_map(lrc);
	xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);

	return 0;

err_lrc_finish:
	xe_lrc_finish(lrc);
	return err;
}
827
/* Tear down an LRC: finish its fence context, then unpin and release the BO. */
void xe_lrc_finish(struct xe_lrc *lrc)
{
	xe_hw_fence_ctx_finish(&lrc->fence_ctx);
	xe_bo_lock(lrc->bo, false);
	xe_bo_unpin(lrc->bo);
	xe_bo_unlock(lrc->bo);
	xe_bo_put(lrc->bo);
}
836
/* Write the ring head into the context image. */
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
	xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}
841
/* Read the ring head from the context image, masked to the address bits. */
u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
	return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}
846
847u32 xe_lrc_ring_space(struct xe_lrc *lrc)
848{
849	const u32 head = xe_lrc_ring_head(lrc);
850	const u32 tail = lrc->ring.tail;
851	const u32 size = lrc->ring.size;
852
853	return ((head - tail - 1) & (size - 1)) + 1;
854}
855
/*
 * Copy @size bytes into the ring at the current tail and advance the tail
 * modulo the (power-of-two) ring size. Caller handles wrap-around.
 */
static void __xe_lrc_write_ring(struct xe_lrc *lrc, struct iosys_map ring,
				const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);

	iosys_map_incr(&ring, lrc->ring.tail);
	xe_map_memcpy_to(xe, &ring, 0, data, size);
	lrc->ring.tail = (lrc->ring.tail + size) & (lrc->ring.size - 1);
}
865
/**
 * xe_lrc_write_ring() - Emit commands into the LRC's ring buffer.
 * @lrc: the LRC
 * @data: command dwords to copy
 * @size: byte count; must be a multiple of 4
 *
 * Splits the copy when it would run past the end of the ring, and appends
 * a single MI_NOOP when @size is not qword-aligned so the tail stays
 * 8-byte aligned.
 */
void xe_lrc_write_ring(struct xe_lrc *lrc, const void *data, size_t size)
{
	struct xe_device *xe = lrc_to_xe(lrc);
	struct iosys_map ring;
	u32 rhs;
	size_t aligned_size;

	xe_assert(xe, IS_ALIGNED(size, 4));
	aligned_size = ALIGN(size, 8);

	ring = __xe_lrc_ring_map(lrc);

	xe_assert(xe, lrc->ring.tail < lrc->ring.size);
	rhs = lrc->ring.size - lrc->ring.tail;	/* room before wrap */
	if (size > rhs) {
		__xe_lrc_write_ring(lrc, ring, data, rhs);
		__xe_lrc_write_ring(lrc, ring, data + rhs, size - rhs);
	} else {
		__xe_lrc_write_ring(lrc, ring, data, size);
	}

	if (aligned_size > size) {
		u32 noop = MI_NOOP;

		__xe_lrc_write_ring(lrc, ring, &noop, sizeof(noop));
	}
}
893
/* Full HW context descriptor: flag bits OR'ed with the context GGTT address. */
u64 xe_lrc_descriptor(struct xe_lrc *lrc)
{
	return lrc->desc | xe_lrc_ggtt_addr(lrc);
}
898
/* GGTT address of the seqno slot in the PPHWSP. */
u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_seqno_ggtt_addr(lrc);
}
903
/* Create a HW fence tracked via this LRC's seqno slot. */
struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
{
	return &xe_hw_fence_create(&lrc->fence_ctx,
				   __xe_lrc_seqno_map(lrc))->dma;
}
909
/* Current seqno value as written by the HW into the PPHWSP. */
s32 xe_lrc_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}
916
/* Current start-seqno value from the PPHWSP. */
s32 xe_lrc_start_seqno(struct xe_lrc *lrc)
{
	struct iosys_map map = __xe_lrc_start_seqno_map(lrc);

	return xe_map_read32(lrc_to_xe(lrc), &map);
}
923
/* GGTT address of the start-seqno slot in the PPHWSP. */
u32 xe_lrc_start_seqno_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_start_seqno_ggtt_addr(lrc);
}
928
/* GGTT address of the parallel-submission area in the PPHWSP. */
u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_ggtt_addr(lrc);
}
933
/* CPU mapping of the parallel-submission area in the PPHWSP. */
struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
{
	return __xe_lrc_parallel_map(lrc);
}
938
/*
 * Total dword length of the instruction whose header is @cmd_header.
 * Special cases must be checked before the generic length field.
 */
static int instr_dw(u32 cmd_header)
{
	/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
	if ((cmd_header & (XE_INSTR_CMD_TYPE | GFXPIPE_PIPELINE)) ==
	    GFXPIPE_SINGLE_DW_CMD(0, 0))
		return 1;

	/* 3DSTATE_SO_DECL_LIST has a 9-bit dword length rather than 8 */
	if ((cmd_header & GFXPIPE_MATCH_MASK) == CMD_3DSTATE_SO_DECL_LIST)
		return REG_FIELD_GET(CMD_3DSTATE_SO_DECL_LIST_DW_LEN, cmd_header) + 2;

	/* Most instructions have the # of dwords (minus 2) in 7:0 */
	return REG_FIELD_GET(XE_INSTR_LEN_MASK, cmd_header) + 2;
}
953
/*
 * Decode and print one MI command starting at @dw.
 * Returns the number of dwords consumed (never more than @remaining_dw).
 */
static int dump_mi_command(struct drm_printer *p,
			   struct xe_gt *gt,
			   u32 *dw,
			   int remaining_dw)
{
	u32 inst_header = *dw;
	u32 numdw = instr_dw(inst_header);
	u32 opcode = REG_FIELD_GET(MI_OPCODE, inst_header);
	int num_noop;

	/* First check for commands that don't have/use a '# DW' field */
	switch (inst_header & MI_OPCODE) {
	case MI_NOOP:
		/* Collapse a run of NOOPs into a single output line */
		num_noop = 1;
		while (num_noop < remaining_dw &&
		       (*(++dw) & REG_GENMASK(31, 23)) == MI_NOOP)
			num_noop++;
		drm_printf(p, "[%#010x] MI_NOOP (%d dwords)\n", inst_header, num_noop);
		return num_noop;

	case MI_TOPOLOGY_FILTER:
		drm_printf(p, "[%#010x] MI_TOPOLOGY_FILTER\n", inst_header);
		return 1;

	case MI_BATCH_BUFFER_END:
		drm_printf(p, "[%#010x] MI_BATCH_BUFFER_END\n", inst_header);
		/* Return 'remaining_dw' to consume the rest of the LRC */
		return remaining_dw;
	}

	/*
	 * Any remaining commands include a # of dwords.  We should make sure
	 * it doesn't exceed the remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (inst_header & MI_OPCODE) {
	case MI_LOAD_REGISTER_IMM:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_IMM: %d regs\n",
			   inst_header, (numdw - 1) / 2);
		for (int i = 1; i < numdw; i += 2)
			drm_printf(p, " - %#6x = %#010x\n", dw[i], dw[i + 1]);
		return numdw;

	case MI_LOAD_REGISTER_MEM & MI_OPCODE:
		drm_printf(p, "[%#010x] MI_LOAD_REGISTER_MEM: %s%s\n",
			   inst_header,
			   dw[0] & MI_LRI_LRM_CS_MMIO ? "CS_MMIO " : "",
			   dw[0] & MI_LRM_USE_GGTT ? "USE_GGTT " : "");
		/* A well-formed LRM is 4 dwords: header, reg, addr lo/hi */
		if (numdw == 4)
			drm_printf(p, " - %#6x = %#010llx\n",
				   dw[1], ((u64)(dw[3]) << 32 | (u64)(dw[2])));
		else
			drm_printf(p, " - %*ph (%s)\n",
				   (int)sizeof(u32) * (numdw - 1), dw + 1,
				   numdw < 4 ? "truncated" : "malformed");
		return numdw;

	case MI_FORCE_WAKEUP:
		drm_printf(p, "[%#010x] MI_FORCE_WAKEUP\n", inst_header);
		return numdw;

	default:
		drm_printf(p, "[%#010x] unknown MI opcode %#x, likely %d dwords\n",
			   inst_header, opcode, numdw);
		return numdw;
	}
}
1023
/*
 * Decode and print one GFXPIPE instruction starting at @dw.
 * Returns the number of dwords consumed (capped at @remaining_dw).
 * Note: the MATCH()/MATCH3D() helpers defined here are also used by
 * dump_gfx_state_command() below.
 */
static int dump_gfxpipe_command(struct drm_printer *p,
				struct xe_gt *gt,
				u32 *dw,
				int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 pipeline = REG_FIELD_GET(GFXPIPE_PIPELINE, *dw);
	u32 opcode = REG_FIELD_GET(GFXPIPE_OPCODE, *dw);
	u32 subopcode = REG_FIELD_GET(GFXPIPE_SUBOPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & GFXPIPE_MATCH_MASK) {
#define MATCH(cmd) \
	case cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw
#define MATCH3D(cmd) \
	case CMD_##cmd: \
		drm_printf(p, "[%#010x] " #cmd " (%d dwords)\n", *dw, numdw); \
		return numdw

	MATCH(STATE_BASE_ADDRESS);
	MATCH(STATE_SIP);
	MATCH(GPGPU_CSR_BASE_ADDRESS);
	MATCH(STATE_COMPUTE_MODE);
	MATCH3D(3DSTATE_BTD);
	MATCH(STATE_SYSTEM_MEM_FENCE_ADDRESS);
	MATCH(STATE_CONTEXT_DATA_BASE_ADDRESS);

	MATCH3D(3DSTATE_VF_STATISTICS);

	MATCH(PIPELINE_SELECT);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE_FAST);
	MATCH3D(3DSTATE_CLEAR_PARAMS);
	MATCH3D(3DSTATE_DEPTH_BUFFER);
	MATCH3D(3DSTATE_STENCIL_BUFFER);
	MATCH3D(3DSTATE_HIER_DEPTH_BUFFER);
	MATCH3D(3DSTATE_VERTEX_BUFFERS);
	MATCH3D(3DSTATE_VERTEX_ELEMENTS);
	MATCH3D(3DSTATE_INDEX_BUFFER);
	MATCH3D(3DSTATE_VF);
	MATCH3D(3DSTATE_MULTISAMPLE);
	MATCH3D(3DSTATE_CC_STATE_POINTERS);
	MATCH3D(3DSTATE_SCISSOR_STATE_POINTERS);
	MATCH3D(3DSTATE_VS);
	MATCH3D(3DSTATE_GS);
	MATCH3D(3DSTATE_CLIP);
	MATCH3D(3DSTATE_SF);
	MATCH3D(3DSTATE_WM);
	MATCH3D(3DSTATE_CONSTANT_VS);
	MATCH3D(3DSTATE_CONSTANT_GS);
	MATCH3D(3DSTATE_CONSTANT_PS);
	MATCH3D(3DSTATE_SAMPLE_MASK);
	MATCH3D(3DSTATE_CONSTANT_HS);
	MATCH3D(3DSTATE_CONSTANT_DS);
	MATCH3D(3DSTATE_HS);
	MATCH3D(3DSTATE_TE);
	MATCH3D(3DSTATE_DS);
	MATCH3D(3DSTATE_STREAMOUT);
	MATCH3D(3DSTATE_SBE);
	MATCH3D(3DSTATE_PS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP);
	MATCH3D(3DSTATE_CPS_POINTERS);
	MATCH3D(3DSTATE_VIEWPORT_STATE_POINTERS_CC);
	MATCH3D(3DSTATE_BLEND_STATE_POINTERS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_VS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_HS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_DS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_GS);
	MATCH3D(3DSTATE_BINDING_TABLE_POINTERS_PS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_VS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_HS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_DS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_GS);
	MATCH3D(3DSTATE_SAMPLER_STATE_POINTERS_PS);
	MATCH3D(3DSTATE_VF_INSTANCING);
	MATCH3D(3DSTATE_VF_SGVS);
	MATCH3D(3DSTATE_VF_TOPOLOGY);
	MATCH3D(3DSTATE_WM_CHROMAKEY);
	MATCH3D(3DSTATE_PS_BLEND);
	MATCH3D(3DSTATE_WM_DEPTH_STENCIL);
	MATCH3D(3DSTATE_PS_EXTRA);
	MATCH3D(3DSTATE_RASTER);
	MATCH3D(3DSTATE_SBE_SWIZ);
	MATCH3D(3DSTATE_WM_HZ_OP);
	MATCH3D(3DSTATE_VF_COMPONENT_PACKING);
	MATCH3D(3DSTATE_VF_SGVS_2);
	MATCH3D(3DSTATE_VFG);
	MATCH3D(3DSTATE_URB_ALLOC_VS);
	MATCH3D(3DSTATE_URB_ALLOC_HS);
	MATCH3D(3DSTATE_URB_ALLOC_DS);
	MATCH3D(3DSTATE_URB_ALLOC_GS);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_0);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_1);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_2);
	MATCH3D(3DSTATE_SO_BUFFER_INDEX_3);
	MATCH3D(3DSTATE_PRIMITIVE_REPLICATION);
	MATCH3D(3DSTATE_TBIMR_TILE_PASS_INFO);
	MATCH3D(3DSTATE_AMFS);
	MATCH3D(3DSTATE_DEPTH_BOUNDS);
	MATCH3D(3DSTATE_AMFS_TEXTURE_POINTERS);
	MATCH3D(3DSTATE_CONSTANT_TS_POINTER);
	MATCH3D(3DSTATE_MESH_CONTROL);
	MATCH3D(3DSTATE_MESH_DISTRIB);
	MATCH3D(3DSTATE_TASK_REDISTRIB);
	MATCH3D(3DSTATE_MESH_SHADER);
	MATCH3D(3DSTATE_MESH_SHADER_DATA);
	MATCH3D(3DSTATE_TASK_CONTROL);
	MATCH3D(3DSTATE_TASK_SHADER);
	MATCH3D(3DSTATE_TASK_SHADER_DATA);
	MATCH3D(3DSTATE_URB_ALLOC_MESH);
	MATCH3D(3DSTATE_URB_ALLOC_TASK);
	MATCH3D(3DSTATE_CLIP_MESH);
	MATCH3D(3DSTATE_SBE_MESH);
	MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);

	MATCH3D(3DSTATE_DRAWING_RECTANGLE);
	MATCH3D(3DSTATE_CHROMA_KEY);
	MATCH3D(3DSTATE_POLY_STIPPLE_OFFSET);
	MATCH3D(3DSTATE_POLY_STIPPLE_PATTERN);
	MATCH3D(3DSTATE_LINE_STIPPLE);
	MATCH3D(3DSTATE_AA_LINE_PARAMETERS);
	MATCH3D(3DSTATE_MONOFILTER_SIZE);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_VS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_HS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_DS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_GS);
	MATCH3D(3DSTATE_PUSH_CONSTANT_ALLOC_PS);
	MATCH3D(3DSTATE_SO_DECL_LIST);
	MATCH3D(3DSTATE_SO_BUFFER);
	MATCH3D(3DSTATE_BINDING_TABLE_POOL_ALLOC);
	MATCH3D(3DSTATE_SAMPLE_PATTERN);
	MATCH3D(3DSTATE_3D_MODE);
	MATCH3D(3DSTATE_SUBSLICE_HASH_TABLE);
	MATCH3D(3DSTATE_SLICE_TABLE_STATE_POINTERS);
	MATCH3D(3DSTATE_PTBR_TILE_PASS_INFO);

	default:
		drm_printf(p, "[%#010x] unknown GFXPIPE command (pipeline=%#x, opcode=%#x, subopcode=%#x), likely %d dwords\n",
			   *dw, pipeline, opcode, subopcode, numdw);
		return numdw;
	}
}
1174
/*
 * Decode and print one GFX_STATE instruction starting at @dw.
 * Returns the number of dwords consumed (capped at @remaining_dw).
 * Relies on the MATCH() helper defined in dump_gfxpipe_command() above.
 */
static int dump_gfx_state_command(struct drm_printer *p,
				  struct xe_gt *gt,
				  u32 *dw,
				  int remaining_dw)
{
	u32 numdw = instr_dw(*dw);
	u32 opcode = REG_FIELD_GET(GFX_STATE_OPCODE, *dw);

	/*
	 * Make sure we haven't mis-parsed a number of dwords that exceeds the
	 * remaining size of the LRC.
	 */
	if (xe_gt_WARN_ON(gt, numdw > remaining_dw))
		numdw = remaining_dw;

	switch (*dw & (XE_INSTR_GFX_STATE | GFX_STATE_OPCODE)) {
	MATCH(STATE_WRITE_INLINE);

	default:
		drm_printf(p, "[%#010x] unknown GFX_STATE command (opcode=%#x), likely %d dwords\n",
			   *dw, opcode, numdw);
		return numdw;
	}
}
1199
1200void xe_lrc_dump_default(struct drm_printer *p,
1201			 struct xe_gt *gt,
1202			 enum xe_engine_class hwe_class)
1203{
1204	u32 *dw;
1205	int remaining_dw, num_dw;
1206
1207	if (!gt->default_lrc[hwe_class]) {
1208		drm_printf(p, "No default LRC for class %d\n", hwe_class);
1209		return;
1210	}
1211
1212	/*
1213	 * Skip the beginning of the LRC since it contains the per-process
1214	 * hardware status page.
1215	 */
1216	dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
1217	remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;
1218
1219	while (remaining_dw > 0) {
1220		if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
1221			num_dw = dump_mi_command(p, gt, dw, remaining_dw);
1222		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE) {
1223			num_dw = dump_gfxpipe_command(p, gt, dw, remaining_dw);
1224		} else if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_GFX_STATE) {
1225			num_dw = dump_gfx_state_command(p, gt, dw, remaining_dw);
1226		} else {
1227			num_dw = min(instr_dw(*dw), remaining_dw);
1228			drm_printf(p, "[%#10x] Unknown instruction of type %#x, likely %d dwords\n",
1229				   *dw, REG_FIELD_GET(XE_INSTR_CMD_TYPE, *dw),
1230				   num_dw);
1231		}
1232
1233		dw += num_dw;
1234		remaining_dw -= num_dw;
1235	}
1236}
1237
/*
 * One GFXPIPE instruction to emit into an engine's initial context state,
 * together with the total space it occupies in the batch.
 */
struct instr_state {
	u32 instr;	/* instruction header dword */
	u16 num_dw;	/* total length in dwords, including the header */
};
1242
/*
 * SVG (geometry pipeline) state instructions emitted for render engines on
 * Xe_HPG-based platforms; consumed by xe_lrc_emit_hwe_state_instructions().
 * The table order is the order the instructions are written into the batch,
 * so entries must not be reordered.
 */
static const struct instr_state xe_hpg_svg_state[] = {
	{ .instr = CMD_3DSTATE_CONSTANT_VS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_HS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_CONSTANT_GS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_VERTEX_ELEMENTS, .num_dw = 69 },
	{ .instr = CMD_3DSTATE_VF_COMPONENT_PACKING, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_VF_SGVS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VF_SGVS_2, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_VS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_VS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_VS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_STREAMOUT, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_0, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_1, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_2, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_SO_BUFFER_INDEX_3, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_CLIP, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_PRIMITIVE_REPLICATION, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_CLIP_MESH, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SF, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_SCISSOR_STATE_POINTERS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_RASTER, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TBIMR_TILE_PASS_INFO, .num_dw = 4 },
	{ .instr = CMD_3DSTATE_WM_HZ_OP, .num_dw = 6 },
	{ .instr = CMD_3DSTATE_MULTISAMPLE, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_HS, .num_dw = 9 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_HS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_HS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TASK_SHADER, .num_dw = 7 },
	{ .instr = CMD_3DSTATE_TASK_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_TASK, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_TE, .num_dw = 5 },
	{ .instr = CMD_3DSTATE_TASK_REDISTRIB, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_DS, .num_dw = 11 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_DS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_DS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_GS, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_BINDING_TABLE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_SAMPLER_STATE_POINTERS_GS, .num_dw = 2 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_GS, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_CONTROL, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER_DATA, .num_dw = 10 },
	{ .instr = CMD_3DSTATE_URB_ALLOC_MESH, .num_dw = 3 },
	{ .instr = CMD_3DSTATE_MESH_SHADER, .num_dw = 8 },
	{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
1295
1296void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
1297{
1298	struct xe_gt *gt = q->hwe->gt;
1299	struct xe_device *xe = gt_to_xe(gt);
1300	const struct instr_state *state_table = NULL;
1301	int state_table_size = 0;
1302
1303	/*
1304	 * At the moment we only need to emit non-register state for the RCS
1305	 * engine.
1306	 */
1307	if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
1308		return;
1309
1310	switch (GRAPHICS_VERx100(xe)) {
1311	case 1255:
1312	case 1270 ... 2004:
1313		state_table = xe_hpg_svg_state;
1314		state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
1315		break;
1316	default:
1317		xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
1318			  GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
1319		return;
1320	}
1321
1322	for (int i = 0; i < state_table_size; i++) {
1323		u32 instr = state_table[i].instr;
1324		u16 num_dw = state_table[i].num_dw;
1325		bool is_single_dw = ((instr & GFXPIPE_PIPELINE) == PIPELINE_SINGLE_DW);
1326
1327		xe_gt_assert(gt, (instr & XE_INSTR_CMD_TYPE) == XE_INSTR_GFXPIPE);
1328		xe_gt_assert(gt, num_dw != 0);
1329		xe_gt_assert(gt, is_single_dw ^ (num_dw > 1));
1330
1331		/*
1332		 * Xe2's SVG context is the same as the one on DG2 / MTL
1333		 * except that 3DSTATE_DRAWING_RECTANGLE (non-pipelined) has
1334		 * been replaced by 3DSTATE_DRAWING_RECTANGLE_FAST (pipelined).
1335		 * Just make the replacement here rather than defining a
1336		 * whole separate table for the single trivial change.
1337		 */
1338		if (GRAPHICS_VER(xe) >= 20 &&
1339		    instr == CMD_3DSTATE_DRAWING_RECTANGLE)
1340			instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
1341
1342		bb->cs[bb->len] = instr;
1343		if (!is_single_dw)
1344			bb->cs[bb->len] |= (num_dw - 2);
1345
1346		bb->len += num_dw;
1347	}
1348}
1349
1350struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
1351{
1352	struct xe_lrc_snapshot *snapshot = kmalloc(sizeof(*snapshot), GFP_NOWAIT);
1353
1354	if (!snapshot)
1355		return NULL;
1356
1357	snapshot->context_desc = lower_32_bits(xe_lrc_ggtt_addr(lrc));
1358	snapshot->head = xe_lrc_ring_head(lrc);
1359	snapshot->tail.internal = lrc->ring.tail;
1360	snapshot->tail.memory = xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL);
1361	snapshot->start_seqno = xe_lrc_start_seqno(lrc);
1362	snapshot->seqno = xe_lrc_seqno(lrc);
1363	snapshot->lrc_bo = xe_bo_get(lrc->bo);
1364	snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
1365	snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
1366	snapshot->lrc_snapshot = NULL;
1367	return snapshot;
1368}
1369
1370void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
1371{
1372	struct xe_bo *bo;
1373	struct iosys_map src;
1374
1375	if (!snapshot)
1376		return;
1377
1378	bo = snapshot->lrc_bo;
1379	snapshot->lrc_bo = NULL;
1380
1381	snapshot->lrc_snapshot = kvmalloc(snapshot->lrc_size, GFP_KERNEL);
1382	if (!snapshot->lrc_snapshot)
1383		goto put_bo;
1384
1385	dma_resv_lock(bo->ttm.base.resv, NULL);
1386	if (!ttm_bo_vmap(&bo->ttm, &src)) {
1387		xe_map_memcpy_from(xe_bo_device(bo),
1388				   snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
1389				   snapshot->lrc_size);
1390		ttm_bo_vunmap(&bo->ttm, &src);
1391	} else {
1392		kvfree(snapshot->lrc_snapshot);
1393		snapshot->lrc_snapshot = NULL;
1394	}
1395	dma_resv_unlock(bo->ttm.base.resv);
1396put_bo:
1397	xe_bo_put(bo);
1398}
1399
1400void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer *p)
1401{
1402	unsigned long i;
1403
1404	if (!snapshot)
1405		return;
1406
1407	drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
1408	drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
1409	drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
1410		   snapshot->tail.internal, snapshot->tail.memory);
1411	drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
1412	drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
1413
1414	if (!snapshot->lrc_snapshot)
1415		return;
1416
1417	drm_printf(p, "\t[HWSP].length: 0x%x\n", LRC_PPHWSP_SIZE);
1418	drm_puts(p, "\t[HWSP].data: ");
1419	for (i = 0; i < LRC_PPHWSP_SIZE; i += sizeof(u32)) {
1420		u32 *val = snapshot->lrc_snapshot + i;
1421		char dumped[ASCII85_BUFSZ];
1422
1423		drm_puts(p, ascii85_encode(*val, dumped));
1424	}
1425
1426	drm_printf(p, "\n\t[HWCTX].length: 0x%lx\n", snapshot->lrc_size - LRC_PPHWSP_SIZE);
1427	drm_puts(p, "\t[HWCTX].data: ");
1428	for (; i < snapshot->lrc_size; i += sizeof(u32)) {
1429		u32 *val = snapshot->lrc_snapshot + i;
1430		char dumped[ASCII85_BUFSZ];
1431
1432		drm_puts(p, ascii85_encode(*val, dumped));
1433	}
1434	drm_puts(p, "\n");
1435}
1436
1437void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
1438{
1439	if (!snapshot)
1440		return;
1441
1442	kvfree(snapshot->lrc_snapshot);
1443	if (snapshot->lrc_bo)
1444		xe_bo_put(snapshot->lrc_bo);
1445	kfree(snapshot);
1446}
1447