1// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */
5
6#include "gem/i915_gem_lmem.h"
7
8#include "gen8_engine_cs.h"
9#include "i915_drv.h"
10#include "i915_perf.h"
11#include "i915_reg.h"
12#include "intel_context.h"
13#include "intel_engine.h"
14#include "intel_engine_regs.h"
15#include "intel_gpu_commands.h"
16#include "intel_gt.h"
17#include "intel_gt_regs.h"
18#include "intel_lrc.h"
19#include "intel_lrc_reg.h"
20#include "intel_ring.h"
21#include "shmem_utils.h"
22
/*
 * The per-platform tables are u8-encoded in @data. Decode @data and set the
 * addresses' offset and commands in @regs. The following encoding is used
 * for each byte. There are 2 steps: decoding commands and decoding addresses.
 *
 * Commands:
 * [7]: create NOPs - the number of NOPs is set in the lower bits
 * [6]: when creating an MI_LOAD_REGISTER_IMM command, also set
 *      MI_LRI_FORCE_POSTED
 * [5:0]: number of NOPs, or number of registers written by the
 *        MI_LOAD_REGISTER_IMM
 *
 * Addresses: these follow an MI_LOAD_REGISTER_IMM command, one per register
 * ("count" of them). They are encoded with the REG/REG16 macros: the former
 * is used for offsets smaller than 0x200, the latter for larger ones. Those
 * macros already set all the bits documented below correctly:
 *
 * [7]: continuation flag - the register offset needs more than the 7 payload
 *      bits of this byte, so additional bytes follow carrying the lower bits
 * [6:0]: register offset (in dwords), without considering the engine base
 *
 * This function only writes the commands and register offsets. Values are not
 * filled out.
 */
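/*
 * A worked decoding example (a sketch of the scheme above, not extra HW
 * documentation): gen8_xcs_offsets below starts with NOP(1), LRI(11, 0),
 * REG16(0x244), ... NOP(1) is the byte 0x81, so the decoder skips one dword
 * in @regs. LRI(11, 0) is the byte 0x0b and expands to
 * MI_LOAD_REGISTER_IMM(11). REG16(0x244) is the byte pair 0x81, 0x11: the
 * first byte has the continuation bit set and contributes 0x01, the second
 * contributes 0x11, giving the dword offset 0x91, i.e. engine->mmio_base +
 * 0x244 (the engine's RING_CONTEXT_CONTROL register). The value dword after
 * each programmed offset is left untouched here and filled in later.
 */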
47static void set_offsets(u32 *regs,
48			const u8 *data,
49			const struct intel_engine_cs *engine,
50			bool close)
51#define NOP(x) (BIT(7) | (x))
52#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
53#define POSTED BIT(0)
54#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
55#define REG16(x) \
56	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
57	(((x) >> 2) & 0x7f)
58#define END 0
59{
60	const u32 base = engine->mmio_base;
61
62	while (*data) {
63		u8 count, flags;
64
65		if (*data & BIT(7)) { /* skip */
66			count = *data++ & ~BIT(7);
67			regs += count;
68			continue;
69		}
70
71		count = *data & 0x3f;
72		flags = *data >> 6;
73		data++;
74
75		*regs = MI_LOAD_REGISTER_IMM(count);
76		if (flags & POSTED)
77			*regs |= MI_LRI_FORCE_POSTED;
78		if (GRAPHICS_VER(engine->i915) >= 11)
79			*regs |= MI_LRI_LRM_CS_MMIO;
80		regs++;
81
82		GEM_BUG_ON(!count);
83		do {
84			u32 offset = 0;
85			u8 v;
86
87			do {
88				v = *data++;
89				offset <<= 7;
90				offset |= v & ~BIT(7);
91			} while (v & BIT(7));
92
93			regs[0] = base + (offset << 2);
94			regs += 2;
95		} while (--count);
96	}
97
98	if (close) {
99		/* Close the batch; used mainly by live_lrc_layout() */
100		*regs = MI_BATCH_BUFFER_END;
101		if (GRAPHICS_VER(engine->i915) >= 11)
102			*regs |= BIT(0);
103	}
104}
105
106static const u8 gen8_xcs_offsets[] = {
107	NOP(1),
108	LRI(11, 0),
109	REG16(0x244),
110	REG(0x034),
111	REG(0x030),
112	REG(0x038),
113	REG(0x03c),
114	REG(0x168),
115	REG(0x140),
116	REG(0x110),
117	REG(0x11c),
118	REG(0x114),
119	REG(0x118),
120
121	NOP(9),
122	LRI(9, 0),
123	REG16(0x3a8),
124	REG16(0x28c),
125	REG16(0x288),
126	REG16(0x284),
127	REG16(0x280),
128	REG16(0x27c),
129	REG16(0x278),
130	REG16(0x274),
131	REG16(0x270),
132
133	NOP(13),
134	LRI(2, 0),
135	REG16(0x200),
136	REG(0x028),
137
138	END
139};
140
141static const u8 gen9_xcs_offsets[] = {
142	NOP(1),
143	LRI(14, POSTED),
144	REG16(0x244),
145	REG(0x034),
146	REG(0x030),
147	REG(0x038),
148	REG(0x03c),
149	REG(0x168),
150	REG(0x140),
151	REG(0x110),
152	REG(0x11c),
153	REG(0x114),
154	REG(0x118),
155	REG(0x1c0),
156	REG(0x1c4),
157	REG(0x1c8),
158
159	NOP(3),
160	LRI(9, POSTED),
161	REG16(0x3a8),
162	REG16(0x28c),
163	REG16(0x288),
164	REG16(0x284),
165	REG16(0x280),
166	REG16(0x27c),
167	REG16(0x278),
168	REG16(0x274),
169	REG16(0x270),
170
171	NOP(13),
172	LRI(1, POSTED),
173	REG16(0x200),
174
175	NOP(13),
176	LRI(44, POSTED),
177	REG(0x028),
178	REG(0x09c),
179	REG(0x0c0),
180	REG(0x178),
181	REG(0x17c),
182	REG16(0x358),
183	REG(0x170),
184	REG(0x150),
185	REG(0x154),
186	REG(0x158),
187	REG16(0x41c),
188	REG16(0x600),
189	REG16(0x604),
190	REG16(0x608),
191	REG16(0x60c),
192	REG16(0x610),
193	REG16(0x614),
194	REG16(0x618),
195	REG16(0x61c),
196	REG16(0x620),
197	REG16(0x624),
198	REG16(0x628),
199	REG16(0x62c),
200	REG16(0x630),
201	REG16(0x634),
202	REG16(0x638),
203	REG16(0x63c),
204	REG16(0x640),
205	REG16(0x644),
206	REG16(0x648),
207	REG16(0x64c),
208	REG16(0x650),
209	REG16(0x654),
210	REG16(0x658),
211	REG16(0x65c),
212	REG16(0x660),
213	REG16(0x664),
214	REG16(0x668),
215	REG16(0x66c),
216	REG16(0x670),
217	REG16(0x674),
218	REG16(0x678),
219	REG16(0x67c),
220	REG(0x068),
221
222	END
223};
224
225static const u8 gen12_xcs_offsets[] = {
226	NOP(1),
227	LRI(13, POSTED),
228	REG16(0x244),
229	REG(0x034),
230	REG(0x030),
231	REG(0x038),
232	REG(0x03c),
233	REG(0x168),
234	REG(0x140),
235	REG(0x110),
236	REG(0x1c0),
237	REG(0x1c4),
238	REG(0x1c8),
239	REG(0x180),
240	REG16(0x2b4),
241
242	NOP(5),
243	LRI(9, POSTED),
244	REG16(0x3a8),
245	REG16(0x28c),
246	REG16(0x288),
247	REG16(0x284),
248	REG16(0x280),
249	REG16(0x27c),
250	REG16(0x278),
251	REG16(0x274),
252	REG16(0x270),
253
254	END
255};
256
257static const u8 dg2_xcs_offsets[] = {
258	NOP(1),
259	LRI(15, POSTED),
260	REG16(0x244),
261	REG(0x034),
262	REG(0x030),
263	REG(0x038),
264	REG(0x03c),
265	REG(0x168),
266	REG(0x140),
267	REG(0x110),
268	REG(0x1c0),
269	REG(0x1c4),
270	REG(0x1c8),
271	REG(0x180),
272	REG16(0x2b4),
273	REG(0x120),
274	REG(0x124),
275
276	NOP(1),
277	LRI(9, POSTED),
278	REG16(0x3a8),
279	REG16(0x28c),
280	REG16(0x288),
281	REG16(0x284),
282	REG16(0x280),
283	REG16(0x27c),
284	REG16(0x278),
285	REG16(0x274),
286	REG16(0x270),
287
288	END
289};
290
291static const u8 gen8_rcs_offsets[] = {
292	NOP(1),
293	LRI(14, POSTED),
294	REG16(0x244),
295	REG(0x034),
296	REG(0x030),
297	REG(0x038),
298	REG(0x03c),
299	REG(0x168),
300	REG(0x140),
301	REG(0x110),
302	REG(0x11c),
303	REG(0x114),
304	REG(0x118),
305	REG(0x1c0),
306	REG(0x1c4),
307	REG(0x1c8),
308
309	NOP(3),
310	LRI(9, POSTED),
311	REG16(0x3a8),
312	REG16(0x28c),
313	REG16(0x288),
314	REG16(0x284),
315	REG16(0x280),
316	REG16(0x27c),
317	REG16(0x278),
318	REG16(0x274),
319	REG16(0x270),
320
321	NOP(13),
322	LRI(1, 0),
323	REG(0x0c8),
324
325	END
326};
327
328static const u8 gen9_rcs_offsets[] = {
329	NOP(1),
330	LRI(14, POSTED),
331	REG16(0x244),
332	REG(0x34),
333	REG(0x30),
334	REG(0x38),
335	REG(0x3c),
336	REG(0x168),
337	REG(0x140),
338	REG(0x110),
339	REG(0x11c),
340	REG(0x114),
341	REG(0x118),
342	REG(0x1c0),
343	REG(0x1c4),
344	REG(0x1c8),
345
346	NOP(3),
347	LRI(9, POSTED),
348	REG16(0x3a8),
349	REG16(0x28c),
350	REG16(0x288),
351	REG16(0x284),
352	REG16(0x280),
353	REG16(0x27c),
354	REG16(0x278),
355	REG16(0x274),
356	REG16(0x270),
357
358	NOP(13),
359	LRI(1, 0),
360	REG(0xc8),
361
362	NOP(13),
363	LRI(44, POSTED),
364	REG(0x28),
365	REG(0x9c),
366	REG(0xc0),
367	REG(0x178),
368	REG(0x17c),
369	REG16(0x358),
370	REG(0x170),
371	REG(0x150),
372	REG(0x154),
373	REG(0x158),
374	REG16(0x41c),
375	REG16(0x600),
376	REG16(0x604),
377	REG16(0x608),
378	REG16(0x60c),
379	REG16(0x610),
380	REG16(0x614),
381	REG16(0x618),
382	REG16(0x61c),
383	REG16(0x620),
384	REG16(0x624),
385	REG16(0x628),
386	REG16(0x62c),
387	REG16(0x630),
388	REG16(0x634),
389	REG16(0x638),
390	REG16(0x63c),
391	REG16(0x640),
392	REG16(0x644),
393	REG16(0x648),
394	REG16(0x64c),
395	REG16(0x650),
396	REG16(0x654),
397	REG16(0x658),
398	REG16(0x65c),
399	REG16(0x660),
400	REG16(0x664),
401	REG16(0x668),
402	REG16(0x66c),
403	REG16(0x670),
404	REG16(0x674),
405	REG16(0x678),
406	REG16(0x67c),
407	REG(0x68),
408
409	END
410};
411
412static const u8 gen11_rcs_offsets[] = {
413	NOP(1),
414	LRI(15, POSTED),
415	REG16(0x244),
416	REG(0x034),
417	REG(0x030),
418	REG(0x038),
419	REG(0x03c),
420	REG(0x168),
421	REG(0x140),
422	REG(0x110),
423	REG(0x11c),
424	REG(0x114),
425	REG(0x118),
426	REG(0x1c0),
427	REG(0x1c4),
428	REG(0x1c8),
429	REG(0x180),
430
431	NOP(1),
432	LRI(9, POSTED),
433	REG16(0x3a8),
434	REG16(0x28c),
435	REG16(0x288),
436	REG16(0x284),
437	REG16(0x280),
438	REG16(0x27c),
439	REG16(0x278),
440	REG16(0x274),
441	REG16(0x270),
442
443	LRI(1, POSTED),
444	REG(0x1b0),
445
446	NOP(10),
447	LRI(1, 0),
448	REG(0x0c8),
449
450	END
451};
452
453static const u8 gen12_rcs_offsets[] = {
454	NOP(1),
455	LRI(13, POSTED),
456	REG16(0x244),
457	REG(0x034),
458	REG(0x030),
459	REG(0x038),
460	REG(0x03c),
461	REG(0x168),
462	REG(0x140),
463	REG(0x110),
464	REG(0x1c0),
465	REG(0x1c4),
466	REG(0x1c8),
467	REG(0x180),
468	REG16(0x2b4),
469
470	NOP(5),
471	LRI(9, POSTED),
472	REG16(0x3a8),
473	REG16(0x28c),
474	REG16(0x288),
475	REG16(0x284),
476	REG16(0x280),
477	REG16(0x27c),
478	REG16(0x278),
479	REG16(0x274),
480	REG16(0x270),
481
482	LRI(3, POSTED),
483	REG(0x1b0),
484	REG16(0x5a8),
485	REG16(0x5ac),
486
487	NOP(6),
488	LRI(1, 0),
489	REG(0x0c8),
490	NOP(3 + 9 + 1),
491
492	LRI(51, POSTED),
493	REG16(0x588),
494	REG16(0x588),
495	REG16(0x588),
496	REG16(0x588),
497	REG16(0x588),
498	REG16(0x588),
499	REG(0x028),
500	REG(0x09c),
501	REG(0x0c0),
502	REG(0x178),
503	REG(0x17c),
504	REG16(0x358),
505	REG(0x170),
506	REG(0x150),
507	REG(0x154),
508	REG(0x158),
509	REG16(0x41c),
510	REG16(0x600),
511	REG16(0x604),
512	REG16(0x608),
513	REG16(0x60c),
514	REG16(0x610),
515	REG16(0x614),
516	REG16(0x618),
517	REG16(0x61c),
518	REG16(0x620),
519	REG16(0x624),
520	REG16(0x628),
521	REG16(0x62c),
522	REG16(0x630),
523	REG16(0x634),
524	REG16(0x638),
525	REG16(0x63c),
526	REG16(0x640),
527	REG16(0x644),
528	REG16(0x648),
529	REG16(0x64c),
530	REG16(0x650),
531	REG16(0x654),
532	REG16(0x658),
533	REG16(0x65c),
534	REG16(0x660),
535	REG16(0x664),
536	REG16(0x668),
537	REG16(0x66c),
538	REG16(0x670),
539	REG16(0x674),
540	REG16(0x678),
541	REG16(0x67c),
542	REG(0x068),
543	REG(0x084),
544	NOP(1),
545
546	END
547};
548
549static const u8 xehp_rcs_offsets[] = {
550	NOP(1),
551	LRI(13, POSTED),
552	REG16(0x244),
553	REG(0x034),
554	REG(0x030),
555	REG(0x038),
556	REG(0x03c),
557	REG(0x168),
558	REG(0x140),
559	REG(0x110),
560	REG(0x1c0),
561	REG(0x1c4),
562	REG(0x1c8),
563	REG(0x180),
564	REG16(0x2b4),
565
566	NOP(5),
567	LRI(9, POSTED),
568	REG16(0x3a8),
569	REG16(0x28c),
570	REG16(0x288),
571	REG16(0x284),
572	REG16(0x280),
573	REG16(0x27c),
574	REG16(0x278),
575	REG16(0x274),
576	REG16(0x270),
577
578	LRI(3, POSTED),
579	REG(0x1b0),
580	REG16(0x5a8),
581	REG16(0x5ac),
582
583	NOP(6),
584	LRI(1, 0),
585	REG(0x0c8),
586
587	END
588};
589
590static const u8 dg2_rcs_offsets[] = {
591	NOP(1),
592	LRI(15, POSTED),
593	REG16(0x244),
594	REG(0x034),
595	REG(0x030),
596	REG(0x038),
597	REG(0x03c),
598	REG(0x168),
599	REG(0x140),
600	REG(0x110),
601	REG(0x1c0),
602	REG(0x1c4),
603	REG(0x1c8),
604	REG(0x180),
605	REG16(0x2b4),
606	REG(0x120),
607	REG(0x124),
608
609	NOP(1),
610	LRI(9, POSTED),
611	REG16(0x3a8),
612	REG16(0x28c),
613	REG16(0x288),
614	REG16(0x284),
615	REG16(0x280),
616	REG16(0x27c),
617	REG16(0x278),
618	REG16(0x274),
619	REG16(0x270),
620
621	LRI(3, POSTED),
622	REG(0x1b0),
623	REG16(0x5a8),
624	REG16(0x5ac),
625
626	NOP(6),
627	LRI(1, 0),
628	REG(0x0c8),
629
630	END
631};
632
633static const u8 mtl_rcs_offsets[] = {
634	NOP(1),
635	LRI(15, POSTED),
636	REG16(0x244),
637	REG(0x034),
638	REG(0x030),
639	REG(0x038),
640	REG(0x03c),
641	REG(0x168),
642	REG(0x140),
643	REG(0x110),
644	REG(0x1c0),
645	REG(0x1c4),
646	REG(0x1c8),
647	REG(0x180),
648	REG16(0x2b4),
649	REG(0x120),
650	REG(0x124),
651
652	NOP(1),
653	LRI(9, POSTED),
654	REG16(0x3a8),
655	REG16(0x28c),
656	REG16(0x288),
657	REG16(0x284),
658	REG16(0x280),
659	REG16(0x27c),
660	REG16(0x278),
661	REG16(0x274),
662	REG16(0x270),
663
664	NOP(2),
665	LRI(2, POSTED),
666	REG16(0x5a8),
667	REG16(0x5ac),
668
669	NOP(6),
670	LRI(1, 0),
671	REG(0x0c8),
672
673	END
674};
675
676#undef END
677#undef REG16
678#undef REG
679#undef LRI
680#undef NOP
681
682static const u8 *reg_offsets(const struct intel_engine_cs *engine)
683{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines backing a virtual engine.
	 */
690	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
691		   !intel_engine_has_relative_mmio(engine));
692
693	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
694		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70))
695			return mtl_rcs_offsets;
696		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
697			return dg2_rcs_offsets;
698		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
699			return xehp_rcs_offsets;
700		else if (GRAPHICS_VER(engine->i915) >= 12)
701			return gen12_rcs_offsets;
702		else if (GRAPHICS_VER(engine->i915) >= 11)
703			return gen11_rcs_offsets;
704		else if (GRAPHICS_VER(engine->i915) >= 9)
705			return gen9_rcs_offsets;
706		else
707			return gen8_rcs_offsets;
708	} else {
709		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
710			return dg2_xcs_offsets;
711		else if (GRAPHICS_VER(engine->i915) >= 12)
712			return gen12_xcs_offsets;
713		else if (GRAPHICS_VER(engine->i915) >= 9)
714			return gen9_xcs_offsets;
715		else
716			return gen8_xcs_offsets;
717	}
718}
719
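/*
 * The lrc_ring_*() helpers below return the dword index, within the context
 * image register state, at which the named register's MMIO offset is stored;
 * the register's value lives at the returned index + 1 (see for example
 * __reset_stop_ring() and lrc_setup_indirect_ctx()). A return value of -1
 * means the register is not present in the context image for this
 * platform/engine combination.
 */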
720static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
721{
722	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
723		return 0x70;
724	else if (GRAPHICS_VER(engine->i915) >= 12)
725		return 0x60;
726	else if (GRAPHICS_VER(engine->i915) >= 9)
727		return 0x54;
728	else if (engine->class == RENDER_CLASS)
729		return 0x58;
730	else
731		return -1;
732}
733
734static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
735{
736	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
737		return 0x80;
738	else if (GRAPHICS_VER(engine->i915) >= 12)
739		return 0x70;
740	else if (GRAPHICS_VER(engine->i915) >= 9)
741		return 0x64;
742	else if (GRAPHICS_VER(engine->i915) >= 8 &&
743		 engine->class == RENDER_CLASS)
744		return 0xc4;
745	else
746		return -1;
747}
748
749static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
750{
751	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
752		return 0x84;
753	else if (GRAPHICS_VER(engine->i915) >= 12)
754		return 0x74;
755	else if (GRAPHICS_VER(engine->i915) >= 9)
756		return 0x68;
757	else if (engine->class == RENDER_CLASS)
758		return 0xd8;
759	else
760		return -1;
761}
762
763static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
764{
765	if (GRAPHICS_VER(engine->i915) >= 12)
766		return 0x12;
767	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
768		return 0x18;
769	else
770		return -1;
771}
772
773static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
774{
775	int x;
776
777	x = lrc_ring_wa_bb_per_ctx(engine);
778	if (x < 0)
779		return x;
780
781	return x + 2;
782}
783
784static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
785{
786	int x;
787
788	x = lrc_ring_indirect_ptr(engine);
789	if (x < 0)
790		return x;
791
792	return x + 2;
793}
794
795static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
796{
797
798	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
799		/*
800		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
801		 * simply to match the RCS context image layout.
802		 */
803		return 0xc6;
804	else if (engine->class != RENDER_CLASS)
805		return -1;
806	else if (GRAPHICS_VER(engine->i915) >= 12)
807		return 0xb6;
808	else if (GRAPHICS_VER(engine->i915) >= 11)
809		return 0xaa;
810	else
811		return -1;
812}
813
814static u32
815lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
816{
817	if (GRAPHICS_VER(engine->i915) >= 12)
818		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
819	else if (GRAPHICS_VER(engine->i915) >= 11)
820		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
821	else if (GRAPHICS_VER(engine->i915) >= 9)
822		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
823	else if (GRAPHICS_VER(engine->i915) >= 8)
824		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
825
826	GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8);
827
828	return 0;
829}
830
831static void
832lrc_setup_bb_per_ctx(u32 *regs,
833		     const struct intel_engine_cs *engine,
834		     u32 ctx_bb_ggtt_addr)
835{
836	GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
837	regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
838		ctx_bb_ggtt_addr |
839		PER_CTX_BB_FORCE |
840		PER_CTX_BB_VALID;
841}
842
843static void
844lrc_setup_indirect_ctx(u32 *regs,
845		       const struct intel_engine_cs *engine,
846		       u32 ctx_bb_ggtt_addr,
847		       u32 size)
848{
849	GEM_BUG_ON(!size);
850	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
851	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
852	regs[lrc_ring_indirect_ptr(engine) + 1] =
853		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
854
855	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
856	regs[lrc_ring_indirect_offset(engine) + 1] =
857		lrc_ring_indirect_offset_default(engine) << 6;
858}
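
/*
 * For example (illustrative numbers only): a 3-cacheline (192 byte) indirect
 * context batch at GGTT address A is programmed as A | 3 into the
 * INDIRECT_CTX slot, while the INDIRECT_CTX_OFFSET slot receives the
 * platform's default offset shifted up by 6 bits, as the hardware expects.
 */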
859
860static bool ctx_needs_runalone(const struct intel_context *ce)
861{
862	struct i915_gem_context *gem_ctx;
863	bool ctx_is_protected = false;
864
865	/*
866	 * On MTL and newer platforms, protected contexts require setting
867	 * the LRC run-alone bit or else the encryption will not happen.
868	 */
869	if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
870	    (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
871		rcu_read_lock();
872		gem_ctx = rcu_dereference(ce->gem_context);
873		if (gem_ctx)
874			ctx_is_protected = gem_ctx->uses_protected_content;
875		rcu_read_unlock();
876	}
877
878	return ctx_is_protected;
879}
880
881static void init_common_regs(u32 * const regs,
882			     const struct intel_context *ce,
883			     const struct intel_engine_cs *engine,
884			     bool inhibit)
885{
886	u32 ctl;
887	int loc;
888
889	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
890	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
891	if (inhibit)
892		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
893	if (GRAPHICS_VER(engine->i915) < 11)
894		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
895					   CTX_CTRL_RS_CTX_ENABLE);
896	if (ctx_needs_runalone(ce))
897		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
898	regs[CTX_CONTEXT_CONTROL] = ctl;
899
900	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
901
902	loc = lrc_ring_bb_offset(engine);
903	if (loc != -1)
904		regs[loc + 1] = 0;
905}
906
907static void init_wa_bb_regs(u32 * const regs,
908			    const struct intel_engine_cs *engine)
909{
910	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
911
912	if (wa_ctx->per_ctx.size) {
913		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
914
915		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
916		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
917			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
918	}
919
920	if (wa_ctx->indirect_ctx.size) {
921		lrc_setup_indirect_ctx(regs, engine,
922				       i915_ggtt_offset(wa_ctx->vma) +
923				       wa_ctx->indirect_ctx.offset,
924				       wa_ctx->indirect_ctx.size);
925	}
926}
927
928static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
929{
930	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/*
		 * 64b PPGTT (48bit canonical):
		 * PDP0_DESCRIPTOR contains the base address of the PML4 and
		 * the other PDP descriptors are ignored.
		 */
935		ASSIGN_CTX_PML4(ppgtt, regs);
936	} else {
937		ASSIGN_CTX_PDP(ppgtt, regs, 3);
938		ASSIGN_CTX_PDP(ppgtt, regs, 2);
939		ASSIGN_CTX_PDP(ppgtt, regs, 1);
940		ASSIGN_CTX_PDP(ppgtt, regs, 0);
941	}
942}
943
944static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
945{
946	if (i915_is_ggtt(vm))
947		return i915_vm_to_ggtt(vm)->alias;
948	else
949		return i915_vm_to_ppgtt(vm);
950}
951
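/*
 * RING_MI_MODE is a masked register: the upper 16 bits of the value written
 * select which of the lower 16 bits take effect. Programming STOP_RING << 16
 * with STOP_RING clear in the low half therefore ensures a sticky STOP_RING
 * is cleared when this register state is loaded on the next restore.
 */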
952static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
953{
954	int x;
955
956	x = lrc_ring_mi_mode(engine);
957	if (x != -1) {
958		regs[x + 1] &= ~STOP_RING;
959		regs[x + 1] |= STOP_RING << 16;
960	}
961}
962
963static void __lrc_init_regs(u32 *regs,
964			    const struct intel_context *ce,
965			    const struct intel_engine_cs *engine,
966			    bool inhibit)
967{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batch buffer with
	 * new values (including all the missing MI_LOAD_REGISTER_IMM commands
	 * that we are not initializing here).
	 *
	 * This must be kept consistent with virtual_update_register_offsets().
	 */
978
979	if (inhibit)
980		memset(regs, 0, PAGE_SIZE);
981
982	set_offsets(regs, reg_offsets(engine), engine, inhibit);
983
984	init_common_regs(regs, ce, engine, inhibit);
985	init_ppgtt_regs(regs, vm_alias(ce->vm));
986
987	init_wa_bb_regs(regs, engine);
988
989	__reset_stop_ring(regs, engine);
990}
991
992void lrc_init_regs(const struct intel_context *ce,
993		   const struct intel_engine_cs *engine,
994		   bool inhibit)
995{
996	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
997}
998
999void lrc_reset_regs(const struct intel_context *ce,
1000		    const struct intel_engine_cs *engine)
1001{
1002	__reset_stop_ring(ce->lrc_reg_state, engine);
1003}
1004
1005static void
1006set_redzone(void *vaddr, const struct intel_engine_cs *engine)
1007{
1008	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1009		return;
1010
1011	vaddr += engine->context_size;
1012
1013	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
1014}
1015
1016static void
1017check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
1018{
1019	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1020		return;
1021
1022	vaddr += engine->context_size;
1023
1024	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
1025		drm_err_once(&engine->i915->drm,
1026			     "%s context redzone overwritten!\n",
1027			     engine->name);
1028}
1029
1030static u32 context_wa_bb_offset(const struct intel_context *ce)
1031{
1032	return PAGE_SIZE * ce->wa_bb_page;
1033}
1034
1035/*
1036 * per_ctx below determines which WABB section is used.
1037 * When true, the function returns the location of the
1038 * PER_CTX_BB.  When false, the function returns the
1039 * location of the INDIRECT_CTX.
1040 */
1041static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
1042{
1043	void *ptr;
1044
1045	GEM_BUG_ON(!ce->wa_bb_page);
1046
1047	ptr = ce->lrc_reg_state;
1048	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1049	ptr += context_wa_bb_offset(ce);
1050	ptr += per_ctx ? PAGE_SIZE : 0;
1051
1052	return ptr;
1053}
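
/*
 * Rough layout of the context state object on gen12+, as a sketch derived
 * from __lrc_alloc_state() and the helpers above:
 *
 *   [0, engine->context_size):   HW context image (ppHWSP + register state)
 *   optional next page:          debug redzone (CONFIG_DRM_I915_DEBUG_GEM)
 *   page ce->wa_bb_page:         INDIRECT_CTX workaround batch (plus the
 *                                DG2_PREDICATE_RESULT_WA scratch dword)
 *   page ce->wa_bb_page + 1:     PER_CTX_BB workaround batch
 *   parent scratch:              only for GuC parallel submission parents
 */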
1054
1055void lrc_init_state(struct intel_context *ce,
1056		    struct intel_engine_cs *engine,
1057		    void *state)
1058{
1059	bool inhibit = true;
1060
1061	set_redzone(state, engine);
1062
1063	if (engine->default_state) {
1064		shmem_read(engine->default_state, 0,
1065			   state, engine->context_size);
1066		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
1067		inhibit = false;
1068	}
1069
1070	/* Clear the ppHWSP (inc. per-context counters) */
1071	memset(state, 0, PAGE_SIZE);
1072
1073	/* Clear the indirect wa and storage */
1074	if (ce->wa_bb_page)
1075		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
1076
1077	/*
1078	 * The second page of the context object contains some registers which
1079	 * must be set up prior to the first execution.
1080	 */
1081	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
1082}
1083
1084u32 lrc_indirect_bb(const struct intel_context *ce)
1085{
1086	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
1087}
1088
1089static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
1090{
1091	/* If predication is active, this will be noop'ed */
1092	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1093	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1094	*cs++ = 0;
1095	*cs++ = 0; /* No predication */
1096
1097	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
1098	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
1099	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
1100
1101	/* Instructions are no longer predicated (disabled), we can proceed */
1102	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
1103	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
1104	*cs++ = 0;
1105	*cs++ = 1; /* enable predication before the next BB */
1106
1107	*cs++ = MI_BATCH_BUFFER_END;
1108	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
1109
1110	return cs;
1111}
1112
1113static struct i915_vma *
1114__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1115{
1116	struct drm_i915_gem_object *obj;
1117	struct i915_vma *vma;
1118	u32 context_size;
1119
1120	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1121
1122	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1123		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1124
1125	if (GRAPHICS_VER(engine->i915) >= 12) {
1126		ce->wa_bb_page = context_size / PAGE_SIZE;
1127		/* INDIRECT_CTX and PER_CTX_BB need separate pages. */
1128		context_size += PAGE_SIZE * 2;
1129	}
1130
1131	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1132		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1133		context_size += PARENT_SCRATCH_SIZE;
1134	}
1135
1136	obj = i915_gem_object_create_lmem(engine->i915, context_size,
1137					  I915_BO_ALLOC_PM_VOLATILE);
1138	if (IS_ERR(obj)) {
1139		obj = i915_gem_object_create_shmem(engine->i915, context_size);
1140		if (IS_ERR(obj))
1141			return ERR_CAST(obj);
1142
1143		/*
1144		 * Wa_22016122933: For Media version 13.0, all Media GT shared
1145		 * memory needs to be mapped as WC on CPU side and UC (PAT
1146		 * index 2) on GPU side.
1147		 */
1148		if (intel_gt_needs_wa_22016122933(engine->gt))
1149			i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
1150	}
1151
1152	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1153	if (IS_ERR(vma)) {
1154		i915_gem_object_put(obj);
1155		return vma;
1156	}
1157
1158	return vma;
1159}
1160
1161static struct intel_timeline *
1162pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1163{
1164	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1165
1166	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1167}
1168
1169int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1170{
1171	struct intel_ring *ring;
1172	struct i915_vma *vma;
1173	int err;
1174
1175	GEM_BUG_ON(ce->state);
1176
1177	vma = __lrc_alloc_state(ce, engine);
1178	if (IS_ERR(vma))
1179		return PTR_ERR(vma);
1180
1181	ring = intel_engine_create_ring(engine, ce->ring_size);
1182	if (IS_ERR(ring)) {
1183		err = PTR_ERR(ring);
1184		goto err_vma;
1185	}
1186
1187	if (!page_mask_bits(ce->timeline)) {
1188		struct intel_timeline *tl;
1189
1190		/*
1191		 * Use the static global HWSP for the kernel context, and
1192		 * a dynamically allocated cacheline for everyone else.
1193		 */
1194		if (unlikely(ce->timeline))
1195			tl = pinned_timeline(ce, engine);
1196		else
1197			tl = intel_timeline_create(engine->gt);
1198		if (IS_ERR(tl)) {
1199			err = PTR_ERR(tl);
1200			goto err_ring;
1201		}
1202
1203		ce->timeline = tl;
1204	}
1205
1206	ce->ring = ring;
1207	ce->state = vma;
1208
1209	return 0;
1210
1211err_ring:
1212	intel_ring_put(ring);
1213err_vma:
1214	i915_vma_put(vma);
1215	return err;
1216}
1217
1218void lrc_reset(struct intel_context *ce)
1219{
1220	GEM_BUG_ON(!intel_context_is_pinned(ce));
1221
1222	intel_ring_reset(ce->ring, ce->ring->emit);
1223
1224	/* Scrub away the garbage */
1225	lrc_init_regs(ce, ce->engine, true);
1226	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1227}
1228
1229int
1230lrc_pre_pin(struct intel_context *ce,
1231	    struct intel_engine_cs *engine,
1232	    struct i915_gem_ww_ctx *ww,
1233	    void **vaddr)
1234{
1235	GEM_BUG_ON(!ce->state);
1236	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1237
1238	*vaddr = i915_gem_object_pin_map(ce->state->obj,
1239					 intel_gt_coherent_map_type(ce->engine->gt,
1240								    ce->state->obj,
1241								    false) |
1242					 I915_MAP_OVERRIDE);
1243
1244	return PTR_ERR_OR_ZERO(*vaddr);
1245}
1246
1247int
1248lrc_pin(struct intel_context *ce,
1249	struct intel_engine_cs *engine,
1250	void *vaddr)
1251{
1252	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1253
1254	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1255		lrc_init_state(ce, engine, vaddr);
1256
1257	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1258	return 0;
1259}
1260
1261void lrc_unpin(struct intel_context *ce)
1262{
1263	if (unlikely(ce->parallel.last_rq)) {
1264		i915_request_put(ce->parallel.last_rq);
1265		ce->parallel.last_rq = NULL;
1266	}
1267	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1268		      ce->engine);
1269}
1270
1271void lrc_post_unpin(struct intel_context *ce)
1272{
1273	i915_gem_object_unpin_map(ce->state->obj);
1274}
1275
1276void lrc_fini(struct intel_context *ce)
1277{
1278	if (!ce->state)
1279		return;
1280
1281	intel_ring_put(fetch_and_zero(&ce->ring));
1282	i915_vma_put(fetch_and_zero(&ce->state));
1283}
1284
1285void lrc_destroy(struct kref *kref)
1286{
1287	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1288
1289	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1290	GEM_BUG_ON(intel_context_is_pinned(ce));
1291
1292	lrc_fini(ce);
1293
1294	intel_context_fini(ce);
1295	intel_context_free(ce);
1296}
1297
1298static u32 *
1299gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1300{
1301	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1302		MI_SRM_LRM_GLOBAL_GTT |
1303		MI_LRI_LRM_CS_MMIO;
1304	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1305	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1306		CTX_TIMESTAMP * sizeof(u32);
1307	*cs++ = 0;
1308
1309	*cs++ = MI_LOAD_REGISTER_REG |
1310		MI_LRR_SOURCE_CS_MMIO |
1311		MI_LRI_LRM_CS_MMIO;
1312	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1313	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1314
1315	*cs++ = MI_LOAD_REGISTER_REG |
1316		MI_LRR_SOURCE_CS_MMIO |
1317		MI_LRI_LRM_CS_MMIO;
1318	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1319	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1320
1321	return cs;
1322}
1323
1324static u32 *
1325gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1326{
1327	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1328
1329	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1330		MI_SRM_LRM_GLOBAL_GTT |
1331		MI_LRI_LRM_CS_MMIO;
1332	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1333	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1334		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1335	*cs++ = 0;
1336
1337	return cs;
1338}
1339
1340static u32 *
1341gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1342{
1343	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1344
1345	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1346		MI_SRM_LRM_GLOBAL_GTT |
1347		MI_LRI_LRM_CS_MMIO;
1348	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1349	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1350		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1351	*cs++ = 0;
1352
1353	*cs++ = MI_LOAD_REGISTER_REG |
1354		MI_LRR_SOURCE_CS_MMIO |
1355		MI_LRI_LRM_CS_MMIO;
1356	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1357	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1358
1359	return cs;
1360}
1361
1362/*
1363 * The bspec's tuning guide asks us to program a vertical watermark value of
1364 * 0x3FF.  However this register is not saved/restored properly by the
1365 * hardware, so we're required to apply the desired value via INDIRECT_CTX
1366 * batch buffer to ensure the value takes effect properly.  All other bits
1367 * in this register should remain at 0 (the hardware default).
1368 */
1369static u32 *
1370dg2_emit_draw_watermark_setting(u32 *cs)
1371{
1372	*cs++ = MI_LOAD_REGISTER_IMM(1);
1373	*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1374	*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1375
1376	return cs;
1377}
1378
1379static u32 *
1380gen12_invalidate_state_cache(u32 *cs)
1381{
1382	*cs++ = MI_LOAD_REGISTER_IMM(1);
1383	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
1384	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
1385	return cs;
1386}
1387
1388static u32 *
1389gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1390{
1391	cs = gen12_emit_timestamp_wa(ce, cs);
1392	cs = gen12_emit_cmd_buf_wa(ce, cs);
1393	cs = gen12_emit_restore_scratch(ce, cs);
1394
1395	/* Wa_16013000631:dg2 */
1396	if (IS_DG2_G11(ce->engine->i915))
1397		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1398
1399	cs = gen12_emit_aux_table_inv(ce->engine, cs);
1400
1401	/* Wa_18022495364 */
1402	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
1403		cs = gen12_invalidate_state_cache(cs);
1404
1405	/* Wa_16014892111 */
1406	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1407	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
1408	    IS_DG2(ce->engine->i915))
1409		cs = dg2_emit_draw_watermark_setting(cs);
1410
1411	return cs;
1412}
1413
1414static u32 *
1415gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1416{
1417	cs = gen12_emit_timestamp_wa(ce, cs);
1418	cs = gen12_emit_restore_scratch(ce, cs);
1419
1420	/* Wa_16013000631:dg2 */
1421	if (IS_DG2_G11(ce->engine->i915))
1422		if (ce->engine->class == COMPUTE_CLASS)
1423			cs = gen8_emit_pipe_control(cs,
1424						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1425						    0);
1426
1427	return gen12_emit_aux_table_inv(ce->engine, cs);
1428}
1429
1430static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
1431{
1432	struct intel_gt *gt = ce->engine->gt;
1433	int mocs = gt->mocs.uc_index << 1;
1434
	/*
	 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
	 * main copy engine arbitration into round robin mode.  We
	 * additionally need to submit the following WABB blt command
	 * to produce 4 subblits with each subblit generating 0 byte
	 * write requests as WABB:
	 *
	 * XY_FASTCOLOR_BLT
	 *  BG0    -> 5100000E
	 *  BG1    -> 0000003F (Dest pitch)
	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
	 *  BG4    -> scratch
	 *  BG5    -> scratch
	 *  BG6-12 -> 00000000
	 *  BG13   -> 20004004 (Surf. Width = 2, Surf. Height = 5)
	 *  BG14   -> 00000010 (Qpitch = 4)
	 *  BG15   -> 00000000
	 */
1454	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
1455	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
1456	*cs++ = 0;
1457	*cs++ = 4 << 16 | 1;
1458	*cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1459	*cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
1460	*cs++ = 0;
1461	*cs++ = 0;
1462	*cs++ = 0;
1463	*cs++ = 0;
1464	*cs++ = 0;
1465	*cs++ = 0;
1466	*cs++ = 0;
1467	*cs++ = 0x20004004;
1468	*cs++ = 0x10;
1469	*cs++ = 0;
1470
1471	return cs;
1472}
1473
1474static u32 *
1475xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
1476{
1477	/* Wa_16018031267, Wa_16018063123 */
1478	if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine))
1479		cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
1480
1481	return cs;
1482}
1483
1484static void
1485setup_per_ctx_bb(const struct intel_context *ce,
1486		 const struct intel_engine_cs *engine,
1487		 u32 *(*emit)(const struct intel_context *, u32 *))
1488{
1489	/* Place PER_CTX_BB on next page after INDIRECT_CTX */
1490	u32 * const start = context_wabb(ce, true);
1491	u32 *cs;
1492
1493	cs = emit(ce, start);
1494
1495	/* PER_CTX_BB must manually terminate */
1496	*cs++ = MI_BATCH_BUFFER_END;
1497
1498	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1499	lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
1500			     lrc_indirect_bb(ce) + PAGE_SIZE);
1501}
1502
1503static void
1504setup_indirect_ctx_bb(const struct intel_context *ce,
1505		      const struct intel_engine_cs *engine,
1506		      u32 *(*emit)(const struct intel_context *, u32 *))
1507{
1508	u32 * const start = context_wabb(ce, false);
1509	u32 *cs;
1510
1511	cs = emit(ce, start);
1512	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1513	while ((unsigned long)cs % CACHELINE_BYTES)
1514		*cs++ = MI_NOOP;
1515
1516	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1517	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1518
1519	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1520			       lrc_indirect_bb(ce),
1521			       (cs - start) * sizeof(*cs));
1522}
1523
1524/*
1525 * The context descriptor encodes various attributes of a context,
1526 * including its GTT address and some flags. Because it's fairly
1527 * expensive to calculate, we'll just do it once and cache the result,
1528 * which remains valid until the context is unpinned.
1529 *
1530 * This is what a descriptor looks like, from LSB to MSB::
1531 *
1532 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1533 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1534 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1535 *      bits 53-54:    mbz, reserved for use by hardware
1536 *      bits 55-63:    group ID, currently unused and set to 0
1537 *
1538 * Starting from Gen11, the upper dword of the descriptor has a new format:
1539 *
1540 *      bits 32-36:    reserved
1541 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
1543 *      bit 54:        mbz, reserved for use by hardware
1544 *      bits 55-60:    SW counter
1545 *      bits 61-63:    engine class
1546 *
1547 * On Xe_HP, the upper dword of the descriptor has a new format:
1548 *
1549 *      bits 32-37:    virtual function number
1550 *      bit 38:        mbz, reserved for use by hardware
1551 *      bits 39-54:    SW context ID
1552 *      bits 55-57:    reserved
1553 *      bits 58-63:    SW counter
1554 *
1555 * engine info, SW context ID and SW counter need to form a unique number
1556 * (Context ID) per lrc.
1557 */
1558static u32 lrc_descriptor(const struct intel_context *ce)
1559{
1560	u32 desc;
1561
1562	desc = INTEL_LEGACY_32B_CONTEXT;
1563	if (i915_vm_is_4lvl(ce->vm))
1564		desc = INTEL_LEGACY_64B_CONTEXT;
1565	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1566
1567	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1568	if (GRAPHICS_VER(ce->vm->i915) == 8)
1569		desc |= GEN8_CTX_L3LLC_COHERENT;
1570
1571	return i915_ggtt_offset(ce->state) | desc;
1572}
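
/*
 * Note that lrc_descriptor() only assembles the low dword of the descriptor
 * documented above (the GGTT address of the state plus the GEN8_CTX_* flags);
 * the upper dword (SW context ID, SW counter, etc.) is managed separately by
 * the submission backend. Since ce->state is page aligned, the flag bits in
 * [11:0] never collide with the LRCA.
 */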
1573
1574u32 lrc_update_regs(const struct intel_context *ce,
1575		    const struct intel_engine_cs *engine,
1576		    u32 head)
1577{
1578	struct intel_ring *ring = ce->ring;
1579	u32 *regs = ce->lrc_reg_state;
1580
1581	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1582	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1583
1584	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1585	regs[CTX_RING_HEAD] = head;
1586	regs[CTX_RING_TAIL] = ring->tail;
1587	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1588
1589	/* RPCS */
1590	if (engine->class == RENDER_CLASS) {
1591		regs[CTX_R_PWR_CLK_STATE] =
1592			intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1593
1594		i915_oa_init_reg_state(ce, engine);
1595	}
1596
1597	if (ce->wa_bb_page) {
1598		u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1599
1600		fn = gen12_emit_indirect_ctx_xcs;
1601		if (ce->engine->class == RENDER_CLASS)
1602			fn = gen12_emit_indirect_ctx_rcs;
1603
		/* Mutually exclusive wrt the global indirect bb */
1605		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1606		setup_indirect_ctx_bb(ce, engine, fn);
1607		setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
1608	}
1609
1610	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1611}
1612
1613void lrc_update_offsets(struct intel_context *ce,
1614			struct intel_engine_cs *engine)
1615{
1616	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1617}
1618
1619void lrc_check_regs(const struct intel_context *ce,
1620		    const struct intel_engine_cs *engine,
1621		    const char *when)
1622{
1623	const struct intel_ring *ring = ce->ring;
1624	u32 *regs = ce->lrc_reg_state;
1625	bool valid = true;
1626	int x;
1627
1628	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1629		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1630		       engine->name,
1631		       regs[CTX_RING_START],
1632		       i915_ggtt_offset(ring->vma));
1633		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1634		valid = false;
1635	}
1636
1637	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1638	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1639		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1640		       engine->name,
1641		       regs[CTX_RING_CTL],
1642		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1643		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1644		valid = false;
1645	}
1646
1647	x = lrc_ring_mi_mode(engine);
1648	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1649		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1650		       engine->name, regs[x + 1]);
1651		regs[x + 1] &= ~STOP_RING;
1652		regs[x + 1] |= STOP_RING << 16;
1653		valid = false;
1654	}
1655
1656	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1657}
1658
/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication as this is applied in a WA
 * batch where the values are only initialized once, so we cannot take the
 * register value at the beginning and reuse it further; hence we save its
 * value to memory, upload a constant value with bit21 set and then restore
 * it back with the saved value. To simplify the WA, a constant value is
 * formed by using the default value of this register. This shouldn't be a
 * problem because we are only modifying it for a short period and this
 * batch is non-preemptible. We could of course use additional instructions
 * that read the actual value of the register at that time and set our bit
 * of interest, but that makes the WA more complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
1675static u32 *
1676gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1677{
1678	/* NB no one else is allowed to scribble over scratch + 256! */
1679	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1680	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1681	*batch++ = intel_gt_scratch_offset(engine->gt,
1682					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1683	*batch++ = 0;
1684
1685	*batch++ = MI_LOAD_REGISTER_IMM(1);
1686	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1687	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1688
1689	batch = gen8_emit_pipe_control(batch,
1690				       PIPE_CONTROL_CS_STALL |
1691				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1692				       0);
1693
1694	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1695	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1696	*batch++ = intel_gt_scratch_offset(engine->gt,
1697					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1698	*batch++ = 0;
1699
1700	return batch;
1701}
1702
/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them
 * based on a criterion. At the moment this batch always starts at the
 * beginning of the page and at this point we don't have multiple wa_ctx batch
 * buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
 * together make a complete batch buffer.
 */
1718static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1719{
1720	/* WaDisableCtxRestoreArbitration:bdw,chv */
1721	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1722
1723	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1724	if (IS_BROADWELL(engine->i915))
1725		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1726
1727	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1728	/* Actual scratch location is at 128 bytes offset */
1729	batch = gen8_emit_pipe_control(batch,
1730				       PIPE_CONTROL_FLUSH_L3 |
1731				       PIPE_CONTROL_STORE_DATA_INDEX |
1732				       PIPE_CONTROL_CS_STALL |
1733				       PIPE_CONTROL_QW_WRITE,
1734				       LRC_PPHWSP_SCRATCH_ADDR);
1735
1736	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1737
1738	/* Pad to end of cacheline */
1739	while ((unsigned long)batch % CACHELINE_BYTES)
1740		*batch++ = MI_NOOP;
1741
1742	/*
1743	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1744	 * execution depends on the length specified in terms of cache lines
1745	 * in the register CTX_RCS_INDIRECT_CTX
1746	 */
1747
1748	return batch;
1749}
1750
1751struct lri {
1752	i915_reg_t reg;
1753	u32 value;
1754};
1755
1756static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1757{
1758	GEM_BUG_ON(!count || count > 63);
1759
1760	*batch++ = MI_LOAD_REGISTER_IMM(count);
1761	do {
1762		*batch++ = i915_mmio_reg_offset(lri->reg);
1763		*batch++ = lri->value;
1764	} while (lri++, --count);
1765	*batch++ = MI_NOOP;
1766
1767	return batch;
1768}
1769
1770static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1771{
1772	static const struct lri lri[] = {
1773		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1774		{
1775			COMMON_SLICE_CHICKEN2,
1776			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1777				       0),
1778		},
1779
1780		/* BSpec: 11391 */
1781		{
1782			FF_SLICE_CHICKEN,
1783			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1784				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1785		},
1786
1787		/* BSpec: 11299 */
1788		{
1789			_3D_CHICKEN3,
1790			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1791				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1792		}
1793	};
1794
1795	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1796
1797	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1798	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1799
1800	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1801	batch = gen8_emit_pipe_control(batch,
1802				       PIPE_CONTROL_FLUSH_L3 |
1803				       PIPE_CONTROL_STORE_DATA_INDEX |
1804				       PIPE_CONTROL_CS_STALL |
1805				       PIPE_CONTROL_QW_WRITE,
1806				       LRC_PPHWSP_SCRATCH_ADDR);
1807
1808	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1809
1810	/* WaMediaPoolStateCmdInWABB:bxt,glk */
1811	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * The EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends on
		 * the device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6 devices.
		 * However, it is safe to load the default configuration of a
		 * 3x6 device instead of masking off the corresponding bits,
		 * because the HW ignores the bits of a disabled subslice and
		 * drops down to the appropriate config. Please see
		 * render_state_setup() in i915_gem_render_state.c for the
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
1825		*batch++ = GEN9_MEDIA_POOL_STATE;
1826		*batch++ = GEN9_MEDIA_POOL_ENABLE;
1827		*batch++ = 0x00777000;
1828		*batch++ = 0;
1829		*batch++ = 0;
1830		*batch++ = 0;
1831	}
1832
1833	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1834
1835	/* Pad to end of cacheline */
1836	while ((unsigned long)batch % CACHELINE_BYTES)
1837		*batch++ = MI_NOOP;
1838
1839	return batch;
1840}
1841
1842#define CTX_WA_BB_SIZE (PAGE_SIZE)
1843
1844static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1845{
1846	struct drm_i915_gem_object *obj;
1847	struct i915_vma *vma;
1848	int err;
1849
1850	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1851	if (IS_ERR(obj))
1852		return PTR_ERR(obj);
1853
1854	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1855	if (IS_ERR(vma)) {
1856		err = PTR_ERR(vma);
1857		goto err;
1858	}
1859
1860	engine->wa_ctx.vma = vma;
1861	return 0;
1862
1863err:
1864	i915_gem_object_put(obj);
1865	return err;
1866}
1867
1868void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1869{
1870	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1871}
1872
1873typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1874
1875void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1876{
1877	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1878	struct i915_wa_ctx_bb *wa_bb[] = {
1879		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1880	};
1881	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1882	struct i915_gem_ww_ctx ww;
1883	void *batch, *batch_ptr;
1884	unsigned int i;
1885	int err;
1886
1887	if (GRAPHICS_VER(engine->i915) >= 11 ||
1888	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1889		return;
1890
1891	if (GRAPHICS_VER(engine->i915) == 9) {
1892		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1893		wa_bb_fn[1] = NULL;
1894	} else if (GRAPHICS_VER(engine->i915) == 8) {
1895		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1896		wa_bb_fn[1] = NULL;
1897	}
1898
1899	err = lrc_create_wa_ctx(engine);
1900	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches, nothing critical
		 * enough to prevent us from using the GPU.
		 */
1906		drm_err(&engine->i915->drm,
1907			"Ignoring context switch w/a allocation error:%d\n",
1908			err);
1909		return;
1910	}
1911
1912	if (!engine->wa_ctx.vma)
1913		return;
1914
1915	i915_gem_ww_ctx_init(&ww, true);
1916retry:
1917	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1918	if (!err)
1919		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1920	if (err)
1921		goto err;
1922
1923	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1924	if (IS_ERR(batch)) {
1925		err = PTR_ERR(batch);
1926		goto err_unpin;
1927	}
1928
1929	/*
1930	 * Emit the two workaround batch buffers, recording the offset from the
1931	 * start of the workaround batch buffer object for each and their
1932	 * respective sizes.
1933	 */
1934	batch_ptr = batch;
1935	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1936		wa_bb[i]->offset = batch_ptr - batch;
1937		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1938						  CACHELINE_BYTES))) {
1939			err = -EINVAL;
1940			break;
1941		}
1942		if (wa_bb_fn[i])
1943			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1944		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1945	}
1946	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1947
1948	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1949	__i915_gem_object_release_map(wa_ctx->vma->obj);
1950
1951	/* Verify that we can handle failure to setup the wa_ctx */
1952	if (!err)
1953		err = i915_inject_probe_error(engine->i915, -ENODEV);
1954
1955err_unpin:
1956	if (err)
1957		i915_vma_unpin(wa_ctx->vma);
1958err:
1959	if (err == -EDEADLK) {
1960		err = i915_gem_ww_ctx_backoff(&ww);
1961		if (!err)
1962			goto retry;
1963	}
1964	i915_gem_ww_ctx_fini(&ww);
1965
1966	if (err) {
1967		i915_vma_put(engine->wa_ctx.vma);
1968
1969		/* Clear all flags to prevent further use */
1970		memset(wa_ctx, 0, sizeof(*wa_ctx));
1971	}
1972}
1973
1974static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1975{
1976#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1977	stats->runtime.num_underflow++;
1978	stats->runtime.max_underflow =
1979		max_t(u32, stats->runtime.max_underflow, -dt);
1980#endif
1981}
1982
1983static u32 lrc_get_runtime(const struct intel_context *ce)
1984{
	/*
	 * We can either use ppHWSP[16], which is recorded before the context
	 * switch (and so excludes the cost of context switches), or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
1991	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1992}
1993
1994void lrc_update_runtime(struct intel_context *ce)
1995{
1996	struct intel_context_stats *stats = &ce->stats;
1997	u32 old;
1998	s32 dt;
1999
2000	old = stats->runtime.last;
2001	stats->runtime.last = lrc_get_runtime(ce);
2002	dt = stats->runtime.last - old;
2003	if (!dt)
2004		return;
2005
2006	if (unlikely(dt < 0)) {
2007		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
2008			 old, stats->runtime.last, dt);
2009		st_runtime_underflow(stats, dt);
2010		return;
2011	}
2012
2013	ewma_runtime_add(&stats->runtime.avg, dt);
2014	stats->runtime.total += dt;
2015}
2016
2017#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
2018#include "selftest_lrc.c"
2019#endif
2020