1// SPDX-License-Identifier: MIT
2/*
 * Copyright © 2014-2018 Intel Corporation
4 */
5
6#include "i915_drv.h"
7#include "i915_reg.h"
8#include "intel_context.h"
9#include "intel_engine_pm.h"
10#include "intel_engine_regs.h"
11#include "intel_gpu_commands.h"
12#include "intel_gt.h"
13#include "intel_gt_ccs_mode.h"
14#include "intel_gt_mcr.h"
15#include "intel_gt_print.h"
16#include "intel_gt_regs.h"
17#include "intel_ring.h"
18#include "intel_workarounds.h"
19
20/**
21 * DOC: Hardware workarounds
22 *
23 * Hardware workarounds are register programming documented to be executed in
24 * the driver that fall outside of the normal programming sequences for a
25 * platform. There are some basic categories of workarounds, depending on
26 * how/when they are applied:
27 *
28 * - Context workarounds: workarounds that touch registers that are
29 *   saved/restored to/from the HW context image. The list is emitted (via Load
30 *   Register Immediate commands) once when initializing the device and saved in
31 *   the default context. That default context is then used on every context
32 *   creation to have a "primed golden context", i.e. a context image that
33 *   already contains the changes needed to all the registers.
34 *
35 *   Context workarounds should be implemented in the \*_ctx_workarounds_init()
36 *   variants respective to the targeted platforms.
37 *
38 * - Engine workarounds: the list of these WAs is applied whenever the specific
39 *   engine is reset. It's also possible that a set of engine classes share a
40 *   common power domain and they are reset together. This happens on some
41 *   platforms with render and compute engines. In this case (at least) one of
 *   them needs to keep the workaround programming: the approach taken in the
43 *   driver is to tie those workarounds to the first compute/render engine that
44 *   is registered.  When executing with GuC submission, engine resets are
 *   outside of kernel driver control, hence the list of registers involved is
 *   written once, on engine initialization, and then passed to GuC, that
47 *   saves/restores their values before/after the reset takes place. See
48 *   ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
49 *
50 *   Workarounds for registers specific to RCS and CCS should be implemented in
51 *   rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
52 *   registers belonging to BCS, VCS or VECS should be implemented in
53 *   xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
 *   engine's MMIO range but that are part of the common RCS/CCS reset domain
55 *   should be implemented in general_render_compute_wa_init(). The settings
56 *   about the CCS load balancing should be added in ccs_engine_wa_mode().
57 *
58 * - GT workarounds: the list of these WAs is applied whenever these registers
59 *   revert to their default values: on GPU reset, suspend/resume [1]_, etc.
60 *
61 *   GT workarounds should be implemented in the \*_gt_workarounds_init()
62 *   variants respective to the targeted platforms.
63 *
64 * - Register whitelist: some workarounds need to be implemented in userspace,
65 *   but need to touch privileged registers. The whitelist in the kernel
66 *   instructs the hardware to allow the access to happen. From the kernel side,
67 *   this is just a special case of a MMIO workaround (as we write the list of
68 *   these to/be-whitelisted registers to some special HW registers).
69 *
70 *   Register whitelisting should be done in the \*_whitelist_build() variants
71 *   respective to the targeted platforms.
72 *
73 * - Workaround batchbuffers: buffers that get executed automatically by the
74 *   hardware on every HW context restore. These buffers are created and
 *   programmed in the default context so the hardware always goes through
 *   those programming sequences when switching contexts. The support for
 *   workaround batchbuffers is enabled by these hardware mechanisms:
78 *
79 *   #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
80 *      context, pointing the hardware to jump to that location when that offset
81 *      is reached in the context restore. Workaround batchbuffer in the driver
82 *      currently uses this mechanism for all platforms.
83 *
84 *   #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
85 *      pointing the hardware to a buffer to continue executing after the
86 *      engine registers are restored in a context restore sequence. This is
87 *      currently not used in the driver.
88 *
89 * - Other:  There are WAs that, due to their nature, cannot be applied from a
90 *   central place. Those are peppered around the rest of the code, as needed.
91 *   Workarounds related to the display IP are the main example.
92 *
93 * .. [1] Technically, some registers are powercontext saved & restored, so they
94 *    survive a suspend/resume. In practice, writing them again is not too
95 *    costly and simplifies things, so it's the approach taken in the driver.
96 */
97
98static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
99			  const char *name, const char *engine_name)
100{
101	wal->gt = gt;
102	wal->name = name;
103	wal->engine_name = engine_name;
104}
105
/*
 * Workaround lists grow in steps of this many entries (1 << 4).
 * Must remain a power of two: _wa_add() asserts it.
 */
#define WA_LIST_CHUNK 16
107
108static void wa_init_finish(struct i915_wa_list *wal)
109{
110	/* Trim unused entries. */
111	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
112		struct i915_wa *list = kmemdup(wal->list,
113					       wal->count * sizeof(*list),
114					       GFP_KERNEL);
115
116		if (list) {
117			kfree(wal->list);
118			wal->list = list;
119		}
120	}
121
122	if (!wal->count)
123		return;
124
125	gt_dbg(wal->gt, "Initialized %u %s workarounds on %s\n",
126	       wal->wa_count, wal->name, wal->engine_name);
127}
128
129static enum forcewake_domains
130wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
131{
132	enum forcewake_domains fw = 0;
133	struct i915_wa *wa;
134	unsigned int i;
135
136	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
137		fw |= intel_uncore_forcewake_for_reg(uncore,
138						     wa->reg,
139						     FW_REG_READ |
140						     FW_REG_WRITE);
141
142	return fw;
143}
144
145static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
146{
147	unsigned int addr = i915_mmio_reg_offset(wa->reg);
148	struct drm_i915_private *i915 = wal->gt->i915;
149	unsigned int start = 0, end = wal->count;
150	const unsigned int grow = WA_LIST_CHUNK;
151	struct i915_wa *wa_;
152
153	GEM_BUG_ON(!is_power_of_2(grow));
154
155	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
156		struct i915_wa *list;
157
158		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
159				     GFP_KERNEL);
160		if (!list) {
161			drm_err(&i915->drm, "No space for workaround init!\n");
162			return;
163		}
164
165		if (wal->list) {
166			memcpy(list, wal->list, sizeof(*wa) * wal->count);
167			kfree(wal->list);
168		}
169
170		wal->list = list;
171	}
172
173	while (start < end) {
174		unsigned int mid = start + (end - start) / 2;
175
176		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
177			start = mid + 1;
178		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
179			end = mid;
180		} else {
181			wa_ = &wal->list[mid];
182
183			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
184				drm_err(&i915->drm,
185					"Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
186					i915_mmio_reg_offset(wa_->reg),
187					wa_->clr, wa_->set);
188
189				wa_->set &= ~wa->clr;
190			}
191
192			wal->wa_count++;
193			wa_->set |= wa->set;
194			wa_->clr |= wa->clr;
195			wa_->read |= wa->read;
196			return;
197		}
198	}
199
200	wal->wa_count++;
201	wa_ = &wal->list[wal->count++];
202	*wa_ = *wa;
203
204	while (wa_-- > wal->list) {
205		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
206			   i915_mmio_reg_offset(wa_[1].reg));
207		if (i915_mmio_reg_offset(wa_[1].reg) >
208		    i915_mmio_reg_offset(wa_[0].reg))
209			break;
210
211		swap(wa_[1], wa_[0]);
212	}
213}
214
215static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
216		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
217{
218	struct i915_wa wa = {
219		.reg  = reg,
220		.clr  = clear,
221		.set  = set,
222		.read = read_mask,
223		.masked_reg = masked_reg,
224	};
225
226	_wa_add(wal, &wa);
227}
228
229static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
230		       u32 clear, u32 set, u32 read_mask, bool masked_reg)
231{
232	struct i915_wa wa = {
233		.mcr_reg = reg,
234		.clr  = clear,
235		.set  = set,
236		.read = read_mask,
237		.masked_reg = masked_reg,
238		.is_mcr = 1,
239	};
240
241	_wa_add(wal, &wa);
242}
243
244static void
245wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
246{
247	wa_add(wal, reg, clear, set, clear | set, false);
248}
249
250static void
251wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
252{
253	wa_mcr_add(wal, reg, clear, set, clear | set, false);
254}
255
256static void
257wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
258{
259	wa_write_clr_set(wal, reg, ~0, set);
260}
261
262static void
263wa_mcr_write(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
264{
265	wa_mcr_write_clr_set(wal, reg, ~0, set);
266}
267
268static void
269wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
270{
271	wa_write_clr_set(wal, reg, set, set);
272}
273
274static void
275wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
276{
277	wa_mcr_write_clr_set(wal, reg, set, set);
278}
279
280static void
281wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
282{
283	wa_write_clr_set(wal, reg, clr, 0);
284}
285
286static void
287wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
288{
289	wa_mcr_write_clr_set(wal, reg, clr, 0);
290}
291
292/*
293 * WA operations on "masked register". A masked register has the upper 16 bits
294 * documented as "masked" in b-spec. Its purpose is to allow writing to just a
295 * portion of the register without a rmw: you simply write in the upper 16 bits
296 * the mask of bits you are going to modify.
297 *
298 * The wa_masked_* family of functions already does the necessary operations to
299 * calculate the mask based on the parameters passed, so user only has to
300 * provide the lower 16 bits of that register.
301 */
302
303static void
304wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
305{
306	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
307}
308
309static void
310wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
311{
312	wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
313}
314
315static void
316wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
317{
318	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
319}
320
321static void
322wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
323{
324	wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
325}
326
327static void
328wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
329		    u32 mask, u32 val)
330{
331	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
332}
333
334static void
335wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
336			u32 mask, u32 val)
337{
338	wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
339}
340
341static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
342				      struct i915_wa_list *wal)
343{
344	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
345}
346
347static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
348				      struct i915_wa_list *wal)
349{
350	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
351}
352
353static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
354				      struct i915_wa_list *wal)
355{
356	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
357
358	/* WaDisableAsyncFlipPerfMode:bdw,chv */
359	wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);
360
361	/* WaDisablePartialInstShootdown:bdw,chv */
362	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
363			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
364
365	/* Use Force Non-Coherent whenever executing a 3D context. This is a
366	 * workaround for a possible hang in the unlikely event a TLB
367	 * invalidation occurs during a PSD flush.
368	 */
369	/* WaForceEnableNonCoherent:bdw,chv */
370	/* WaHdcDisableFetchWhenMasked:bdw,chv */
371	wa_masked_en(wal, HDC_CHICKEN0,
372		     HDC_DONOT_FETCH_MEM_WHEN_MASKED |
373		     HDC_FORCE_NON_COHERENT);
374
375	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
376	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
377	 *  polygons in the same 8x4 pixel/sample area to be processed without
378	 *  stalling waiting for the earlier ones to write to Hierarchical Z
379	 *  buffer."
380	 *
381	 * This optimization is off by default for BDW and CHV; turn it on.
382	 */
383	wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
384
385	/* Wa4x4STCOptimizationDisable:bdw,chv */
386	wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);
387
388	/*
389	 * BSpec recommends 8x4 when MSAA is used,
390	 * however in practice 16x4 seems fastest.
391	 *
392	 * Note that PS/WM thread counts depend on the WIZ hashing
393	 * disable bit, which we don't touch here, but it's good
394	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
395	 */
396	wa_masked_field_set(wal, GEN7_GT_MODE,
397			    GEN6_WIZ_HASHING_MASK,
398			    GEN6_WIZ_HASHING_16x4);
399}
400
401static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
402				     struct i915_wa_list *wal)
403{
404	struct drm_i915_private *i915 = engine->i915;
405
406	gen8_ctx_workarounds_init(engine, wal);
407
408	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
409	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
410
411	/* WaDisableDopClockGating:bdw
412	 *
413	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
414	 * to disable EUTC clock gating.
415	 */
416	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
417			 DOP_CLOCK_GATING_DISABLE);
418
419	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
420			 GEN8_SAMPLER_POWER_BYPASS_DIS);
421
422	wa_masked_en(wal, HDC_CHICKEN0,
423		     /* WaForceContextSaveRestoreNonCoherent:bdw */
424		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
425		     /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
426		     (IS_BROADWELL_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
427}
428
429static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
430				     struct i915_wa_list *wal)
431{
432	gen8_ctx_workarounds_init(engine, wal);
433
434	/* WaDisableThreadStallDopClockGating:chv */
435	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
436
437	/* Improve HiZ throughput on CHV. */
438	wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
439}
440
441static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
442				      struct i915_wa_list *wal)
443{
444	struct drm_i915_private *i915 = engine->i915;
445
446	if (HAS_LLC(i915)) {
447		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
448		 *
449		 * Must match Display Engine. See
450		 * WaCompressedResourceDisplayNewHashMode.
451		 */
452		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
453			     GEN9_PBE_COMPRESSED_HASH_SELECTION);
454		wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
455				 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
456	}
457
458	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
459	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
460	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
461			 FLOW_CONTROL_ENABLE |
462			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
463
464	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
465	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
466	wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
467			 GEN9_ENABLE_YV12_BUGFIX |
468			 GEN9_ENABLE_GPGPU_PREEMPTION);
469
470	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
471	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
472	wa_masked_en(wal, CACHE_MODE_1,
473		     GEN8_4x4_STC_OPTIMIZATION_DISABLE |
474		     GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);
475
476	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
477	wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
478			  GEN9_CCS_TLB_PREFETCH_ENABLE);
479
480	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
481	wa_masked_en(wal, HDC_CHICKEN0,
482		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
483		     HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
484
485	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
486	 * both tied to WaForceContextSaveRestoreNonCoherent
487	 * in some hsds for skl. We keep the tie for all gen9. The
488	 * documentation is a bit hazy and so we want to get common behaviour,
489	 * even though there is no clear evidence we would need both on kbl/bxt.
490	 * This area has been source of system hangs so we play it safe
491	 * and mimic the skl regardless of what bspec says.
492	 *
493	 * Use Force Non-Coherent whenever executing a 3D context. This
494	 * is a workaround for a possible hang in the unlikely event
495	 * a TLB invalidation occurs during a PSD flush.
496	 */
497
498	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
499	wa_masked_en(wal, HDC_CHICKEN0,
500		     HDC_FORCE_NON_COHERENT);
501
502	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
503	if (IS_SKYLAKE(i915) ||
504	    IS_KABYLAKE(i915) ||
505	    IS_COFFEELAKE(i915) ||
506	    IS_COMETLAKE(i915))
507		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
508				 GEN8_SAMPLER_POWER_BYPASS_DIS);
509
510	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
511	wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
512
513	/*
514	 * Supporting preemption with fine-granularity requires changes in the
515	 * batch buffer programming. Since we can't break old userspace, we
516	 * need to set our default preemption level to safe value. Userspace is
517	 * still able to use more fine-grained preemption levels, since in
518	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
519	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
520	 * not real HW workarounds, but merely a way to start using preemption
521	 * while maintaining old contract with userspace.
522	 */
523
524	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
525	wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
526
527	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */
528	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
529			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
530			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
531
532	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
533	if (IS_GEN9_LP(i915))
534		wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
535}
536
537static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
538				struct i915_wa_list *wal)
539{
540	struct intel_gt *gt = engine->gt;
541	u8 vals[3] = { 0, 0, 0 };
542	unsigned int i;
543
544	for (i = 0; i < 3; i++) {
545		u8 ss;
546
547		/*
548		 * Only consider slices where one, and only one, subslice has 7
549		 * EUs
550		 */
551		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
552			continue;
553
554		/*
555		 * subslice_7eu[i] != 0 (because of the check above) and
556		 * ss_max == 4 (maximum number of subslices possible per slice)
557		 *
558		 * ->    0 <= ss <= 3;
559		 */
560		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
561		vals[i] = 3 - ss;
562	}
563
564	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
565		return;
566
567	/* Tune IZ hashing. See intel_device_info_runtime_init() */
568	wa_masked_field_set(wal, GEN7_GT_MODE,
569			    GEN9_IZ_HASHING_MASK(2) |
570			    GEN9_IZ_HASHING_MASK(1) |
571			    GEN9_IZ_HASHING_MASK(0),
572			    GEN9_IZ_HASHING(2, vals[2]) |
573			    GEN9_IZ_HASHING(1, vals[1]) |
574			    GEN9_IZ_HASHING(0, vals[0]));
575}
576
/* Skylake context workarounds: the gen9 common set, then IZ hashing tuning. */
static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	/* Common gen9 entries go in first. */
	gen9_ctx_workarounds_init(engine, wal);

	/* SKL-only addition. */
	skl_tune_iz_hashing(engine, wal);
}
583
584static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
585				     struct i915_wa_list *wal)
586{
587	gen9_ctx_workarounds_init(engine, wal);
588
589	/* WaDisableThreadStallDopClockGating:bxt */
590	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
591			 STALL_DOP_GATING_DISABLE);
592
593	/* WaToEnableHwFixForPushConstHWBug:bxt */
594	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
595		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
596}
597
598static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
599				     struct i915_wa_list *wal)
600{
601	struct drm_i915_private *i915 = engine->i915;
602
603	gen9_ctx_workarounds_init(engine, wal);
604
605	/* WaToEnableHwFixForPushConstHWBug:kbl */
606	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
607		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
608			     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
609
610	/* WaDisableSbeCacheDispatchPortSharing:kbl */
611	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
612			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
613}
614
615static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
616				     struct i915_wa_list *wal)
617{
618	gen9_ctx_workarounds_init(engine, wal);
619
620	/* WaToEnableHwFixForPushConstHWBug:glk */
621	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
622		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
623}
624
625static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
626				     struct i915_wa_list *wal)
627{
628	gen9_ctx_workarounds_init(engine, wal);
629
630	/* WaToEnableHwFixForPushConstHWBug:cfl */
631	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
632		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
633
634	/* WaDisableSbeCacheDispatchPortSharing:cfl */
635	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
636			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
637}
638
639static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
640				     struct i915_wa_list *wal)
641{
642	/* Wa_1406697149 (WaDisableBankHangMode:icl) */
643	wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);
644
645	/* WaForceEnableNonCoherent:icl
646	 * This is not the same workaround as in early Gen9 platforms, where
647	 * lacking this could cause system hangs, but coherency performance
648	 * overhead is high and only a few compute workloads really need it
649	 * (the register is whitelisted in hardware now, so UMDs can opt in
650	 * for coherency if they have a good reason).
651	 */
652	wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
653
654	/* WaEnableFloatBlendOptimization:icl */
655	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
656		   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
657		   0 /* write-only, so skip validation */,
658		   true);
659
660	/* WaDisableGPGPUMidThreadPreemption:icl */
661	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
662			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
663			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
664
665	/* allow headerless messages for preemptible GPGPU context */
666	wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
667			 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);
668
669	/* Wa_1604278689:icl,ehl */
670	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
671	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
672			 0,
673			 0xFFFFFFFF);
674
675	/* Wa_1406306137:icl,ehl */
676	wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
677}
678
679/*
680 * These settings aren't actually workarounds, but general tuning settings that
681 * need to be programmed on dg2 platform.
682 */
683static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
684				   struct i915_wa_list *wal)
685{
686	wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
687	wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
688			     REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
689	wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
690			     FF_MODE2_TDS_TIMER_128);
691}
692
693static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
694				       struct i915_wa_list *wal)
695{
696	struct drm_i915_private *i915 = engine->i915;
697
698	/*
699	 * Wa_1409142259:tgl,dg1,adl-p
700	 * Wa_1409347922:tgl,dg1,adl-p
701	 * Wa_1409252684:tgl,dg1,adl-p
702	 * Wa_1409217633:tgl,dg1,adl-p
703	 * Wa_1409207793:tgl,dg1,adl-p
704	 * Wa_1409178076:tgl,dg1,adl-p
705	 * Wa_1408979724:tgl,dg1,adl-p
706	 * Wa_14010443199:tgl,rkl,dg1,adl-p
707	 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
708	 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
709	 */
710	wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
711		     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);
712
713	/* WaDisableGPGPUMidThreadPreemption:gen12 */
714	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
715			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
716			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);
717
718	/*
719	 * Wa_16011163337 - GS_TIMER
720	 *
721	 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
722	 * need to program it even on those that don't explicitly list that
723	 * workaround.
724	 *
725	 * Note that the programming of GEN12_FF_MODE2 is further modified
726	 * according to the FF_MODE2 guidance given by Wa_1608008084.
727	 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
728	 * value when read from the CPU.
729	 *
730	 * The default value for this register is zero for all fields.
731	 * So instead of doing a RMW we should just write the desired values
732	 * for TDS and GS timers. Note that since the readback can't be trusted,
733	 * the clear mask is just set to ~0 to make sure other bits are not
734	 * inadvertently set. For the same reason read verification is ignored.
735	 */
736	wa_add(wal,
737	       GEN12_FF_MODE2,
738	       ~0,
739	       FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
740	       0, false);
741
742	if (!IS_DG1(i915)) {
743		/* Wa_1806527549 */
744		wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);
745
746		/* Wa_1606376872 */
747		wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
748	}
749}
750
751static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
752				     struct i915_wa_list *wal)
753{
754	gen12_ctx_workarounds_init(engine, wal);
755
756	/* Wa_1409044764 */
757	wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
758		      DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);
759
760	/* Wa_22010493298 */
761	wa_masked_en(wal, HIZ_CHICKEN,
762		     DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
763}
764
765static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
766				     struct i915_wa_list *wal)
767{
768	dg2_ctx_gt_tuning_init(engine, wal);
769
770	/* Wa_16013271637:dg2 */
771	wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
772			 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
773
774	/* Wa_14014947963:dg2 */
775	wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);
776
777	/* Wa_18018764978:dg2 */
778	wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
779
780	/* Wa_18019271663:dg2 */
781	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
782
783	/* Wa_14019877138:dg2 */
784	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
785}
786
787static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
788				     struct i915_wa_list *wal)
789{
790	struct intel_gt *gt = engine->gt;
791
792	dg2_ctx_gt_tuning_init(engine, wal);
793
794	/*
795	 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
796	 * gen12_emit_indirect_ctx_rcs() rather than here on some early
797	 * steppings.
798	 */
799	if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
800	      IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
801		wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
802}
803
804static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
805				       struct i915_wa_list *wal)
806{
807	struct intel_gt *gt = engine->gt;
808
809	xelpg_ctx_gt_tuning_init(engine, wal);
810
811	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
812	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
813		/* Wa_14014947963 */
814		wa_masked_field_set(wal, VF_PREEMPTION,
815				    PREEMPTION_VERTEX_COUNT, 0x4000);
816
817		/* Wa_16013271637 */
818		wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
819				 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);
820
821		/* Wa_18019627453 */
822		wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);
823
824		/* Wa_18018764978 */
825		wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
826	}
827
828	/* Wa_18019271663 */
829	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);
830
831	/* Wa_14019877138 */
832	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
833}
834
835static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
836					 struct i915_wa_list *wal)
837{
838	/*
839	 * This is a "fake" workaround defined by software to ensure we
840	 * maintain reliable, backward-compatible behavior for userspace with
841	 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
842	 *
843	 * The per-context setting of MI_MODE[12] determines whether the bits
844	 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
845	 * in the traditional manner or whether they should instead use a new
846	 * tgl+ meaning that breaks backward compatibility, but allows nesting
847	 * into 3rd-level batchbuffers.  When this new capability was first
848	 * added in TGL, it remained off by default unless a context
849	 * intentionally opted in to the new behavior.  However Xe_HPG now
850	 * flips this on by default and requires that we explicitly opt out if
851	 * we don't want the new behavior.
852	 *
853	 * From a SW perspective, we want to maintain the backward-compatible
854	 * behavior for userspace, so we'll apply a fake workaround to set it
855	 * back to the legacy behavior on platforms where the hardware default
856	 * is to break compatibility.  At the moment there is no Linux
857	 * userspace that utilizes third-level batchbuffers, so this will avoid
858	 * userspace from needing to make any changes.  using the legacy
859	 * meaning is the correct thing to do.  If/when we have userspace
860	 * consumers that want to utilize third-level batch nesting, we can
861	 * provide a context parameter to allow them to opt-in.
862	 */
863	wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
864}
865
866static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
867				   struct i915_wa_list *wal)
868{
869	u8 mocs;
870
871	/*
872	 * Some blitter commands do not have a field for MOCS, those
873	 * commands will use MOCS index pointed by BLIT_CCTL.
874	 * BLIT_CCTL registers are needed to be programmed to un-cached.
875	 */
876	if (engine->class == COPY_ENGINE_CLASS) {
877		mocs = engine->gt->mocs.uc_index;
878		wa_write_clr_set(wal,
879				 BLIT_CCTL(engine->mmio_base),
880				 BLIT_CCTL_MASK,
881				 BLIT_CCTL_MOCS(mocs, mocs));
882	}
883}
884
885/*
886 * gen12_ctx_gt_fake_wa_init() aren't programmingan official workaround
887 * defined by the hardware team, but it programming general context registers.
888 * Adding those context register programming in context workaround
889 * allow us to use the wa framework for proper application and validation.
890 */
891static void
892gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
893			  struct i915_wa_list *wal)
894{
895	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
896		fakewa_disable_nestedbb_mode(engine, wal);
897
898	gen12_ctx_gt_mocs_init(engine, wal);
899}
900
901static void
902__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
903			   struct i915_wa_list *wal,
904			   const char *name)
905{
906	struct drm_i915_private *i915 = engine->i915;
907
908	wa_init_start(wal, engine->gt, name, engine->name);
909
910	/* Applies to all engines */
911	/*
912	 * Fake workarounds are not the actual workaround but
913	 * programming of context registers using workaround framework.
914	 */
915	if (GRAPHICS_VER(i915) >= 12)
916		gen12_ctx_gt_fake_wa_init(engine, wal);
917
918	if (engine->class != RENDER_CLASS)
919		goto done;
920
921	if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
922		xelpg_ctx_workarounds_init(engine, wal);
923	else if (IS_PONTEVECCHIO(i915))
924		; /* noop; none at this time */
925	else if (IS_DG2(i915))
926		dg2_ctx_workarounds_init(engine, wal);
927	else if (IS_XEHPSDV(i915))
928		; /* noop; none at this time */
929	else if (IS_DG1(i915))
930		dg1_ctx_workarounds_init(engine, wal);
931	else if (GRAPHICS_VER(i915) == 12)
932		gen12_ctx_workarounds_init(engine, wal);
933	else if (GRAPHICS_VER(i915) == 11)
934		icl_ctx_workarounds_init(engine, wal);
935	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
936		cfl_ctx_workarounds_init(engine, wal);
937	else if (IS_GEMINILAKE(i915))
938		glk_ctx_workarounds_init(engine, wal);
939	else if (IS_KABYLAKE(i915))
940		kbl_ctx_workarounds_init(engine, wal);
941	else if (IS_BROXTON(i915))
942		bxt_ctx_workarounds_init(engine, wal);
943	else if (IS_SKYLAKE(i915))
944		skl_ctx_workarounds_init(engine, wal);
945	else if (IS_CHERRYVIEW(i915))
946		chv_ctx_workarounds_init(engine, wal);
947	else if (IS_BROADWELL(i915))
948		bdw_ctx_workarounds_init(engine, wal);
949	else if (GRAPHICS_VER(i915) == 7)
950		gen7_ctx_workarounds_init(engine, wal);
951	else if (GRAPHICS_VER(i915) == 6)
952		gen6_ctx_workarounds_init(engine, wal);
953	else if (GRAPHICS_VER(i915) < 8)
954		;
955	else
956		MISSING_CASE(GRAPHICS_VER(i915));
957
958done:
959	wa_init_finish(wal);
960}
961
962void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
963{
964	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
965}
966
967int intel_engine_emit_ctx_wa(struct i915_request *rq)
968{
969	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
970	struct intel_uncore *uncore = rq->engine->uncore;
971	enum forcewake_domains fw;
972	unsigned long flags;
973	struct i915_wa *wa;
974	unsigned int i;
975	u32 *cs;
976	int ret;
977
978	if (wal->count == 0)
979		return 0;
980
981	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
982	if (ret)
983		return ret;
984
985	cs = intel_ring_begin(rq, (wal->count * 2 + 2));
986	if (IS_ERR(cs))
987		return PTR_ERR(cs);
988
989	fw = wal_get_fw_for_rmw(uncore, wal);
990
991	intel_gt_mcr_lock(wal->gt, &flags);
992	spin_lock(&uncore->lock);
993	intel_uncore_forcewake_get__locked(uncore, fw);
994
995	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
996	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
997		u32 val;
998
999		/* Skip reading the register if it's not really needed */
1000		if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
1001			val = wa->set;
1002		} else {
1003			val = wa->is_mcr ?
1004				intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
1005				intel_uncore_read_fw(uncore, wa->reg);
1006			val &= ~wa->clr;
1007			val |= wa->set;
1008		}
1009
1010		*cs++ = i915_mmio_reg_offset(wa->reg);
1011		*cs++ = val;
1012	}
1013	*cs++ = MI_NOOP;
1014
1015	intel_uncore_forcewake_put__locked(uncore, fw);
1016	spin_unlock(&uncore->lock);
1017	intel_gt_mcr_unlock(wal->gt, flags);
1018
1019	intel_ring_advance(rq, cs);
1020
1021	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
1022	if (ret)
1023		return ret;
1024
1025	return 0;
1026}
1027
1028static void
1029gen4_gt_workarounds_init(struct intel_gt *gt,
1030			 struct i915_wa_list *wal)
1031{
1032	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
1033	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
1034}
1035
1036static void
1037g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1038{
1039	gen4_gt_workarounds_init(gt, wal);
1040
1041	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
1042	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
1043}
1044
1045static void
1046ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1047{
1048	g4x_gt_workarounds_init(gt, wal);
1049
1050	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
1051}
1052
static void snb_gt_workarounds_init(struct intel_gt *gt,
				    struct i915_wa_list *wal)
{
	/* Intentionally empty: no entries are added to the GT list for snb */
}
1057
1058static void
1059ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1060{
1061	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
1062	wa_masked_dis(wal,
1063		      GEN7_COMMON_SLICE_CHICKEN1,
1064		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
1065
1066	/* WaApplyL3ControlAndL3ChickenMode:ivb */
1067	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
1068	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
1069
1070	/* WaForceL3Serialization:ivb */
1071	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1072}
1073
1074static void
1075vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1076{
1077	/* WaForceL3Serialization:vlv */
1078	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
1079
1080	/*
1081	 * WaIncreaseL3CreditsForVLVB0:vlv
1082	 * This is the hardware default actually.
1083	 */
1084	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
1085}
1086
1087static void
1088hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1089{
1090	/* L3 caching of data atomics doesn't work -- disable it. */
1091	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
1092
1093	wa_add(wal,
1094	       HSW_ROW_CHICKEN3, 0,
1095	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
1096	       0 /* XXX does this reg exist? */, true);
1097
1098	/* WaVSRefCountFullforceMissDisable:hsw */
1099	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
1100}
1101
1102static void
1103gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
1104{
1105	const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
1106	unsigned int slice, subslice;
1107	u32 mcr, mcr_mask;
1108
1109	GEM_BUG_ON(GRAPHICS_VER(i915) != 9);
1110
1111	/*
1112	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
1113	 * Before any MMIO read into slice/subslice specific registers, MCR
1114	 * packet control register needs to be programmed to point to any
1115	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
1116	 * This means each subsequent MMIO read will be forwarded to an
1117	 * specific s/ss combination, but this is OK since these registers
1118	 * are consistent across s/ss in almost all cases. In the rare
1119	 * occasions, such as INSTDONE, where this value is dependent
1120	 * on s/ss combo, the read should be done with read_subslice_reg.
1121	 */
1122	slice = ffs(sseu->slice_mask) - 1;
1123	GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
1124	subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
1125	GEM_BUG_ON(!subslice);
1126	subslice--;
1127
1128	/*
1129	 * We use GEN8_MCR..() macros to calculate the |mcr| value for
1130	 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
1131	 */
1132	mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
1133	mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
1134
1135	drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);
1136
1137	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
1138}
1139
1140static void
1141gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1142{
1143	struct drm_i915_private *i915 = gt->i915;
1144
1145	/* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
1146	gen9_wa_init_mcr(i915, wal);
1147
1148	/* WaDisableKillLogic:bxt,skl,kbl */
1149	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
1150		wa_write_or(wal,
1151			    GAM_ECOCHK,
1152			    ECOCHK_DIS_TLB);
1153
1154	if (HAS_LLC(i915)) {
1155		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
1156		 *
1157		 * Must match Display Engine. See
1158		 * WaCompressedResourceDisplayNewHashMode.
1159		 */
1160		wa_write_or(wal,
1161			    MMCD_MISC_CTRL,
1162			    MMCD_PCLA | MMCD_HOTSPOT_EN);
1163	}
1164
1165	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
1166	wa_write_or(wal,
1167		    GAM_ECOCHK,
1168		    BDW_DISABLE_HDC_INVALIDATION);
1169}
1170
1171static void
1172skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1173{
1174	gen9_gt_workarounds_init(gt, wal);
1175
1176	/* WaDisableGafsUnitClkGating:skl */
1177	wa_write_or(wal,
1178		    GEN7_UCGCTL4,
1179		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1180
1181	/* WaInPlaceDecompressionHang:skl */
1182	if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
1183		wa_write_or(wal,
1184			    GEN9_GAMT_ECO_REG_RW_IA,
1185			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1186}
1187
1188static void
1189kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1190{
1191	gen9_gt_workarounds_init(gt, wal);
1192
1193	/* WaDisableDynamicCreditSharing:kbl */
1194	if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
1195		wa_write_or(wal,
1196			    GAMT_CHKN_BIT_REG,
1197			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
1198
1199	/* WaDisableGafsUnitClkGating:kbl */
1200	wa_write_or(wal,
1201		    GEN7_UCGCTL4,
1202		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1203
1204	/* WaInPlaceDecompressionHang:kbl */
1205	wa_write_or(wal,
1206		    GEN9_GAMT_ECO_REG_RW_IA,
1207		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1208}
1209
static void glk_gt_workarounds_init(struct intel_gt *gt,
				    struct i915_wa_list *wal)
{
	/* glk adds nothing on top of the common gen9 list */
	gen9_gt_workarounds_init(gt, wal);
}
1215
1216static void
1217cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1218{
1219	gen9_gt_workarounds_init(gt, wal);
1220
1221	/* WaDisableGafsUnitClkGating:cfl */
1222	wa_write_or(wal,
1223		    GEN7_UCGCTL4,
1224		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
1225
1226	/* WaInPlaceDecompressionHang:cfl */
1227	wa_write_or(wal,
1228		    GEN9_GAMT_ECO_REG_RW_IA,
1229		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
1230}
1231
1232static void __set_mcr_steering(struct i915_wa_list *wal,
1233			       i915_reg_t steering_reg,
1234			       unsigned int slice, unsigned int subslice)
1235{
1236	u32 mcr, mcr_mask;
1237
1238	mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
1239	mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
1240
1241	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
1242}
1243
1244static void debug_dump_steering(struct intel_gt *gt)
1245{
1246	struct drm_printer p = drm_dbg_printer(&gt->i915->drm, DRM_UT_DRIVER,
1247					       "MCR Steering:");
1248
1249	if (drm_debug_enabled(DRM_UT_DRIVER))
1250		intel_gt_mcr_report_steering(&p, gt, false);
1251}
1252
1253static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
1254			 unsigned int slice, unsigned int subslice)
1255{
1256	__set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);
1257
1258	gt->default_steering.groupid = slice;
1259	gt->default_steering.instanceid = subslice;
1260
1261	debug_dump_steering(gt);
1262}
1263
1264static void
1265icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1266{
1267	const struct sseu_dev_info *sseu = &gt->info.sseu;
1268	unsigned int subslice;
1269
1270	GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
1271	GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);
1272
1273	/*
1274	 * Although a platform may have subslices, we need to always steer
1275	 * reads to the lowest instance that isn't fused off.  When Render
1276	 * Power Gating is enabled, grabbing forcewake will only power up a
1277	 * single subslice (the "minconfig") if there isn't a real workload
1278	 * that needs to be run; this means that if we steer register reads to
1279	 * one of the higher subslices, we run the risk of reading back 0's or
1280	 * random garbage.
1281	 */
1282	subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));
1283
1284	/*
1285	 * If the subslice we picked above also steers us to a valid L3 bank,
1286	 * then we can just rely on the default steering and won't need to
1287	 * worry about explicitly re-steering L3BANK reads later.
1288	 */
1289	if (gt->info.l3bank_mask & BIT(subslice))
1290		gt->steering_table[L3BANK] = NULL;
1291
1292	__add_mcr_wa(gt, wal, 0, subslice);
1293}
1294
1295static void
1296xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1297{
1298	const struct sseu_dev_info *sseu = &gt->info.sseu;
1299	unsigned long slice, subslice = 0, slice_mask = 0;
1300	u32 lncf_mask = 0;
1301	int i;
1302
1303	/*
1304	 * On Xe_HP the steering increases in complexity. There are now several
1305	 * more units that require steering and we're not guaranteed to be able
1306	 * to find a common setting for all of them. These are:
1307	 * - GSLICE (fusable)
1308	 * - DSS (sub-unit within gslice; fusable)
1309	 * - L3 Bank (fusable)
1310	 * - MSLICE (fusable)
1311	 * - LNCF (sub-unit within mslice; always present if mslice is present)
1312	 *
1313	 * We'll do our default/implicit steering based on GSLICE (in the
1314	 * sliceid field) and DSS (in the subsliceid field).  If we can
1315	 * find overlap between the valid MSLICE and/or LNCF values with
1316	 * a suitable GSLICE, then we can just re-use the default value and
1317	 * skip and explicit steering at runtime.
1318	 *
1319	 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
1320	 * a valid sliceid value.  DSS steering is the only type of steering
1321	 * that utilizes the 'subsliceid' bits.
1322	 *
1323	 * Also note that, even though the steering domain is called "GSlice"
1324	 * and it is encoded in the register using the gslice format, the spec
1325	 * says that the combined (geometry | compute) fuse should be used to
1326	 * select the steering.
1327	 */
1328
1329	/* Find the potential gslice candidates */
1330	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
1331						       GEN_DSS_PER_GSLICE);
1332
1333	/*
1334	 * Find the potential LNCF candidates.  Either LNCF within a valid
1335	 * mslice is fine.
1336	 */
1337	for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
1338		lncf_mask |= (0x3 << (i * 2));
1339
1340	/*
1341	 * Are there any sliceid values that work for both GSLICE and LNCF
1342	 * steering?
1343	 */
1344	if (slice_mask & lncf_mask) {
1345		slice_mask &= lncf_mask;
1346		gt->steering_table[LNCF] = NULL;
1347	}
1348
1349	/* How about sliceid values that also work for MSLICE steering? */
1350	if (slice_mask & gt->info.mslice_mask) {
1351		slice_mask &= gt->info.mslice_mask;
1352		gt->steering_table[MSLICE] = NULL;
1353	}
1354
1355	if (IS_XEHPSDV(gt->i915) && slice_mask & BIT(0))
1356		gt->steering_table[GAM] = NULL;
1357
1358	slice = __ffs(slice_mask);
1359	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
1360		GEN_DSS_PER_GSLICE;
1361
1362	__add_mcr_wa(gt, wal, slice, subslice);
1363
1364	/*
1365	 * SQIDI ranges are special because they use different steering
1366	 * registers than everything else we work with.  On XeHP SDV and
1367	 * DG2-G10, any value in the steering registers will work fine since
1368	 * all instances are present, but DG2-G11 only has SQIDI instances at
1369	 * ID's 2 and 3, so we need to steer to one of those.  For simplicity
1370	 * we'll just steer to a hardcoded "2" since that value will work
1371	 * everywhere.
1372	 */
1373	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
1374	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);
1375
1376	/*
1377	 * On DG2, GAM registers have a dedicated steering control register
1378	 * and must always be programmed to a hardcoded groupid of "1."
1379	 */
1380	if (IS_DG2(gt->i915))
1381		__set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
1382}
1383
1384static void
1385pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
1386{
1387	unsigned int dss;
1388
1389	/*
1390	 * Setup implicit steering for COMPUTE and DSS ranges to the first
1391	 * non-fused-off DSS.  All other types of MCR registers will be
1392	 * explicitly steered.
1393	 */
1394	dss = intel_sseu_find_first_xehp_dss(&gt->info.sseu, 0, 0);
1395	__add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
1396}
1397
1398static void
1399icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1400{
1401	struct drm_i915_private *i915 = gt->i915;
1402
1403	icl_wa_init_mcr(gt, wal);
1404
1405	/* WaModifyGamTlbPartitioning:icl */
1406	wa_write_clr_set(wal,
1407			 GEN11_GACB_PERF_CTRL,
1408			 GEN11_HASH_CTRL_MASK,
1409			 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);
1410
1411	/* Wa_1405766107:icl
1412	 * Formerly known as WaCL2SFHalfMaxAlloc
1413	 */
1414	wa_write_or(wal,
1415		    GEN11_LSN_UNSLCVC,
1416		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
1417		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);
1418
1419	/* Wa_220166154:icl
1420	 * Formerly known as WaDisCtxReload
1421	 */
1422	wa_write_or(wal,
1423		    GEN8_GAMW_ECO_DEV_RW_IA,
1424		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);
1425
1426	/* Wa_1406463099:icl
1427	 * Formerly known as WaGamTlbPendError
1428	 */
1429	wa_write_or(wal,
1430		    GAMT_CHKN_BIT_REG,
1431		    GAMT_CHKN_DISABLE_L3_COH_PIPE);
1432
1433	/*
1434	 * Wa_1408615072:icl,ehl  (vsunit)
1435	 * Wa_1407596294:icl,ehl  (hsunit)
1436	 */
1437	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1438		    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);
1439
1440	/* Wa_1407352427:icl,ehl */
1441	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
1442		    PSDUNIT_CLKGATE_DIS);
1443
1444	/* Wa_1406680159:icl,ehl */
1445	wa_mcr_write_or(wal,
1446			GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1447			GWUNIT_CLKGATE_DIS);
1448
1449	/* Wa_1607087056:icl,ehl,jsl */
1450	if (IS_ICELAKE(i915) ||
1451		((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
1452		IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
1453		wa_write_or(wal,
1454			    GEN11_SLICE_UNIT_LEVEL_CLKGATE,
1455			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);
1456
1457	/*
1458	 * This is not a documented workaround, but rather an optimization
1459	 * to reduce sampler power.
1460	 */
1461	wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1462}
1463
1464/*
1465 * Though there are per-engine instances of these registers,
1466 * they retain their value through engine resets and should
1467 * only be provided on the GT workaround list rather than
1468 * the engine-specific workaround list.
1469 */
1470static void
1471wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
1472{
1473	struct intel_engine_cs *engine;
1474	int id;
1475
1476	for_each_engine(engine, gt, id) {
1477		if (engine->class != VIDEO_DECODE_CLASS ||
1478		    (engine->instance % 2))
1479			continue;
1480
1481		wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
1482			    IECPUNIT_CLKGATE_DIS);
1483	}
1484}
1485
1486static void
1487gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1488{
1489	icl_wa_init_mcr(gt, wal);
1490
1491	/* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
1492	wa_14011060649(gt, wal);
1493
1494	/* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
1495	wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
1496
1497	/*
1498	 * Wa_14015795083
1499	 *
1500	 * Firmware on some gen12 platforms locks the MISCCPCTL register,
1501	 * preventing i915 from modifying it for this workaround.  Skip the
1502	 * readback verification for this workaround on debug builds; if the
1503	 * workaround doesn't stick due to firmware behavior, it's not an error
1504	 * that we want CI to flag.
1505	 */
1506	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1507	       0, 0, false);
1508}
1509
1510static void
1511dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1512{
1513	gen12_gt_workarounds_init(gt, wal);
1514
1515	/* Wa_1409420604:dg1 */
1516	wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
1517			CPSSUNIT_CLKGATE_DIS);
1518
1519	/* Wa_1408615072:dg1 */
1520	/* Empirical testing shows this register is unaffected by engine reset. */
1521	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
1522}
1523
1524static void
1525xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1526{
1527	struct drm_i915_private *i915 = gt->i915;
1528
1529	xehp_init_mcr(gt, wal);
1530
1531	/* Wa_1409757795:xehpsdv */
1532	wa_mcr_write_or(wal, SCCGCTL94DC, CG3DDISURB);
1533
1534	/* Wa_18011725039:xehpsdv */
1535	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
1536		wa_mcr_masked_dis(wal, MLTICTXCTL, TDONRENDER);
1537		wa_mcr_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
1538	}
1539
1540	/* Wa_16011155590:xehpsdv */
1541	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
1542		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1543			    TSGUNIT_CLKGATE_DIS);
1544
1545	/* Wa_14011780169:xehpsdv */
1546	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
1547		wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
1548			    GAMTLBVDBOX7_CLKGATE_DIS |
1549			    GAMTLBVDBOX6_CLKGATE_DIS |
1550			    GAMTLBVDBOX5_CLKGATE_DIS |
1551			    GAMTLBVDBOX4_CLKGATE_DIS |
1552			    GAMTLBVDBOX3_CLKGATE_DIS |
1553			    GAMTLBVDBOX2_CLKGATE_DIS |
1554			    GAMTLBVDBOX1_CLKGATE_DIS |
1555			    GAMTLBVDBOX0_CLKGATE_DIS |
1556			    GAMTLBKCR_CLKGATE_DIS |
1557			    GAMTLBGUC_CLKGATE_DIS |
1558			    GAMTLBBLT_CLKGATE_DIS);
1559		wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
1560			    GAMTLBGFXA1_CLKGATE_DIS |
1561			    GAMTLBCOMPA0_CLKGATE_DIS |
1562			    GAMTLBCOMPA1_CLKGATE_DIS |
1563			    GAMTLBCOMPB0_CLKGATE_DIS |
1564			    GAMTLBCOMPB1_CLKGATE_DIS |
1565			    GAMTLBCOMPC0_CLKGATE_DIS |
1566			    GAMTLBCOMPC1_CLKGATE_DIS |
1567			    GAMTLBCOMPD0_CLKGATE_DIS |
1568			    GAMTLBCOMPD1_CLKGATE_DIS |
1569			    GAMTLBMERT_CLKGATE_DIS   |
1570			    GAMTLBVEBOX3_CLKGATE_DIS |
1571			    GAMTLBVEBOX2_CLKGATE_DIS |
1572			    GAMTLBVEBOX1_CLKGATE_DIS |
1573			    GAMTLBVEBOX0_CLKGATE_DIS);
1574	}
1575
1576	/* Wa_16012725990:xehpsdv */
1577	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
1578		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);
1579
1580	/* Wa_14011060649:xehpsdv */
1581	wa_14011060649(gt, wal);
1582
1583	/* Wa_14012362059:xehpsdv */
1584	wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);
1585
1586	/* Wa_14014368820:xehpsdv */
1587	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1588			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1589
1590	/* Wa_14010670810:xehpsdv */
1591	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1592}
1593
1594static void
1595dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1596{
1597	xehp_init_mcr(gt, wal);
1598
1599	/* Wa_14011060649:dg2 */
1600	wa_14011060649(gt, wal);
1601
1602	if (IS_DG2_G10(gt->i915)) {
1603		/* Wa_22010523718:dg2 */
1604		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1605			    CG3DDISCFEG_CLKGATE_DIS);
1606
1607		/* Wa_14011006942:dg2 */
1608		wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1609				DSS_ROUTER_CLKGATE_DIS);
1610	}
1611
1612	/* Wa_14014830051:dg2 */
1613	wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1614
1615	/*
1616	 * Wa_14015795083
1617	 * Skip verification for possibly locked register.
1618	 */
1619	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
1620	       0, 0, false);
1621
1622	/* Wa_18018781329 */
1623	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1624	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1625	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1626	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1627
1628	/* Wa_1509235366:dg2 */
1629	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1630			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1631
1632	/* Wa_14010648519:dg2 */
1633	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1634}
1635
1636static void
1637pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1638{
1639	pvc_init_mcr(gt, wal);
1640
1641	/* Wa_14015795083 */
1642	wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1643
1644	/* Wa_18018781329 */
1645	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1646	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1647	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1648	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1649
1650	/* Wa_16016694945 */
1651	wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC);
1652}
1653
1654static void
1655xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1656{
1657	/* Wa_14018575942 / Wa_18018781329 */
1658	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1659	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1660
1661	/* Wa_22016670082 */
1662	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1663
1664	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1665	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1666		/* Wa_14014830051 */
1667		wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1668
1669		/* Wa_14015795083 */
1670		wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1671	}
1672
1673	/*
1674	 * Unlike older platforms, we no longer setup implicit steering here;
1675	 * all MCR accesses are explicitly steered.
1676	 */
1677	debug_dump_steering(gt);
1678}
1679
1680static void
1681wa_16021867713(struct intel_gt *gt, struct i915_wa_list *wal)
1682{
1683	struct intel_engine_cs *engine;
1684	int id;
1685
1686	for_each_engine(engine, gt, id)
1687		if (engine->class == VIDEO_DECODE_CLASS)
1688			wa_write_or(wal, VDBOX_CGCTL3F1C(engine->mmio_base),
1689				    MFXPIPE_CLKGATE_DIS);
1690}
1691
1692static void
1693xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1694{
1695	wa_16021867713(gt, wal);
1696
1697	/*
1698	 * Wa_14018778641
1699	 * Wa_18018781329
1700	 *
1701	 * Note that although these registers are MCR on the primary
1702	 * GT, the media GT's versions are regular singleton registers.
1703	 */
1704	wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1705
1706	/* Wa_22016670082 */
1707	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1708
1709	debug_dump_steering(gt);
1710}
1711
1712/*
1713 * The bspec performance guide has recommended MMIO tuning settings.  These
1714 * aren't truly "workarounds" but we want to program them through the
1715 * workaround infrastructure to make sure they're (re)applied at the proper
1716 * times.
1717 *
1718 * The programming in this function is for settings that persist through
1719 * engine resets and also are not part of any engine's register state context.
1720 * I.e., settings that only need to be re-applied in the event of a full GT
1721 * reset.
1722 */
1723static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1724{
1725	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1726		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1727		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1728	}
1729
1730	if (IS_PONTEVECCHIO(gt->i915)) {
1731		wa_mcr_write(wal, XEHPC_L3SCRUB,
1732			     SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
1733		wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_HOSTCACHEEN);
1734	}
1735
1736	if (IS_DG2(gt->i915)) {
1737		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1738		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1739	}
1740}
1741
1742static void
1743gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1744{
1745	struct drm_i915_private *i915 = gt->i915;
1746
1747	gt_tuning_settings(gt, wal);
1748
1749	if (gt->type == GT_MEDIA) {
1750		if (MEDIA_VER_FULL(i915) == IP_VER(13, 0))
1751			xelpmp_gt_workarounds_init(gt, wal);
1752		else
1753			MISSING_CASE(MEDIA_VER_FULL(i915));
1754
1755		return;
1756	}
1757
1758	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1759		xelpg_gt_workarounds_init(gt, wal);
1760	else if (IS_PONTEVECCHIO(i915))
1761		pvc_gt_workarounds_init(gt, wal);
1762	else if (IS_DG2(i915))
1763		dg2_gt_workarounds_init(gt, wal);
1764	else if (IS_XEHPSDV(i915))
1765		xehpsdv_gt_workarounds_init(gt, wal);
1766	else if (IS_DG1(i915))
1767		dg1_gt_workarounds_init(gt, wal);
1768	else if (GRAPHICS_VER(i915) == 12)
1769		gen12_gt_workarounds_init(gt, wal);
1770	else if (GRAPHICS_VER(i915) == 11)
1771		icl_gt_workarounds_init(gt, wal);
1772	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1773		cfl_gt_workarounds_init(gt, wal);
1774	else if (IS_GEMINILAKE(i915))
1775		glk_gt_workarounds_init(gt, wal);
1776	else if (IS_KABYLAKE(i915))
1777		kbl_gt_workarounds_init(gt, wal);
1778	else if (IS_BROXTON(i915))
1779		gen9_gt_workarounds_init(gt, wal);
1780	else if (IS_SKYLAKE(i915))
1781		skl_gt_workarounds_init(gt, wal);
1782	else if (IS_HASWELL(i915))
1783		hsw_gt_workarounds_init(gt, wal);
1784	else if (IS_VALLEYVIEW(i915))
1785		vlv_gt_workarounds_init(gt, wal);
1786	else if (IS_IVYBRIDGE(i915))
1787		ivb_gt_workarounds_init(gt, wal);
1788	else if (GRAPHICS_VER(i915) == 6)
1789		snb_gt_workarounds_init(gt, wal);
1790	else if (GRAPHICS_VER(i915) == 5)
1791		ilk_gt_workarounds_init(gt, wal);
1792	else if (IS_G4X(i915))
1793		g4x_gt_workarounds_init(gt, wal);
1794	else if (GRAPHICS_VER(i915) == 4)
1795		gen4_gt_workarounds_init(gt, wal);
1796	else if (GRAPHICS_VER(i915) <= 8)
1797		;
1798	else
1799		MISSING_CASE(GRAPHICS_VER(i915));
1800}
1801
1802void intel_gt_init_workarounds(struct intel_gt *gt)
1803{
1804	struct i915_wa_list *wal = &gt->wa_list;
1805
1806	wa_init_start(wal, gt, "GT", "global");
1807	gt_init_workarounds(gt, wal);
1808	wa_init_finish(wal);
1809}
1810
1811static bool
1812wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1813	  const char *name, const char *from)
1814{
1815	if ((cur ^ wa->set) & wa->read) {
1816		gt_err(gt,
1817		       "%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1818		       name, from, i915_mmio_reg_offset(wa->reg),
1819		       cur, cur & wa->read, wa->set & wa->read);
1820
1821		return false;
1822	}
1823
1824	return true;
1825}
1826
1827static void wa_list_apply(const struct i915_wa_list *wal)
1828{
1829	struct intel_gt *gt = wal->gt;
1830	struct intel_uncore *uncore = gt->uncore;
1831	enum forcewake_domains fw;
1832	unsigned long flags;
1833	struct i915_wa *wa;
1834	unsigned int i;
1835
1836	if (!wal->count)
1837		return;
1838
1839	fw = wal_get_fw_for_rmw(uncore, wal);
1840
1841	intel_gt_mcr_lock(gt, &flags);
1842	spin_lock(&uncore->lock);
1843	intel_uncore_forcewake_get__locked(uncore, fw);
1844
1845	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1846		u32 val, old = 0;
1847
1848		/* open-coded rmw due to steering */
1849		if (wa->clr)
1850			old = wa->is_mcr ?
1851				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1852				intel_uncore_read_fw(uncore, wa->reg);
1853		val = (old & ~wa->clr) | wa->set;
1854		if (val != old || !wa->clr) {
1855			if (wa->is_mcr)
1856				intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1857			else
1858				intel_uncore_write_fw(uncore, wa->reg, val);
1859		}
1860
1861		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1862			u32 val = wa->is_mcr ?
1863				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1864				intel_uncore_read_fw(uncore, wa->reg);
1865
1866			wa_verify(gt, wa, val, wal->name, "application");
1867		}
1868	}
1869
1870	intel_uncore_forcewake_put__locked(uncore, fw);
1871	spin_unlock(&uncore->lock);
1872	intel_gt_mcr_unlock(gt, flags);
1873}
1874
1875void intel_gt_apply_workarounds(struct intel_gt *gt)
1876{
1877	wa_list_apply(&gt->wa_list);
1878}
1879
1880static bool wa_list_verify(struct intel_gt *gt,
1881			   const struct i915_wa_list *wal,
1882			   const char *from)
1883{
1884	struct intel_uncore *uncore = gt->uncore;
1885	struct i915_wa *wa;
1886	enum forcewake_domains fw;
1887	unsigned long flags;
1888	unsigned int i;
1889	bool ok = true;
1890
1891	fw = wal_get_fw_for_rmw(uncore, wal);
1892
1893	intel_gt_mcr_lock(gt, &flags);
1894	spin_lock(&uncore->lock);
1895	intel_uncore_forcewake_get__locked(uncore, fw);
1896
1897	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1898		ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1899				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1900				intel_uncore_read_fw(uncore, wa->reg),
1901				wal->name, from);
1902
1903	intel_uncore_forcewake_put__locked(uncore, fw);
1904	spin_unlock(&uncore->lock);
1905	intel_gt_mcr_unlock(gt, flags);
1906
1907	return ok;
1908}
1909
1910bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1911{
1912	return wa_list_verify(gt, &gt->wa_list, from);
1913}
1914
1915__maybe_unused
1916static bool is_nonpriv_flags_valid(u32 flags)
1917{
1918	/* Check only valid flag bits are set */
1919	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1920		return false;
1921
1922	/* NB: Only 3 out of 4 enum values are valid for access field */
1923	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1924	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1925		return false;
1926
1927	return true;
1928}
1929
1930static void
1931whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1932{
1933	struct i915_wa wa = {
1934		.reg = reg
1935	};
1936
1937	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1938		return;
1939
1940	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1941		return;
1942
1943	wa.reg.reg |= flags;
1944	_wa_add(wal, &wa);
1945}
1946
1947static void
1948whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1949{
1950	struct i915_wa wa = {
1951		.mcr_reg = reg,
1952		.is_mcr = 1,
1953	};
1954
1955	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1956		return;
1957
1958	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1959		return;
1960
1961	wa.mcr_reg.reg |= flags;
1962	_wa_add(wal, &wa);
1963}
1964
1965static void
1966whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1967{
1968	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1969}
1970
1971static void
1972whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1973{
1974	whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1975}
1976
1977static void gen9_whitelist_build(struct i915_wa_list *w)
1978{
1979	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1980	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1981
1982	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1983	whitelist_reg(w, GEN8_CS_CHICKEN1);
1984
1985	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1986	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1987
1988	/* WaSendPushConstantsFromMMIO:skl,bxt */
1989	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1990}
1991
1992static void skl_whitelist_build(struct intel_engine_cs *engine)
1993{
1994	struct i915_wa_list *w = &engine->whitelist;
1995
1996	if (engine->class != RENDER_CLASS)
1997		return;
1998
1999	gen9_whitelist_build(w);
2000
2001	/* WaDisableLSQCROPERFforOCL:skl */
2002	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
2003}
2004
2005static void bxt_whitelist_build(struct intel_engine_cs *engine)
2006{
2007	if (engine->class != RENDER_CLASS)
2008		return;
2009
2010	gen9_whitelist_build(&engine->whitelist);
2011}
2012
2013static void kbl_whitelist_build(struct intel_engine_cs *engine)
2014{
2015	struct i915_wa_list *w = &engine->whitelist;
2016
2017	if (engine->class != RENDER_CLASS)
2018		return;
2019
2020	gen9_whitelist_build(w);
2021
2022	/* WaDisableLSQCROPERFforOCL:kbl */
2023	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
2024}
2025
2026static void glk_whitelist_build(struct intel_engine_cs *engine)
2027{
2028	struct i915_wa_list *w = &engine->whitelist;
2029
2030	if (engine->class != RENDER_CLASS)
2031		return;
2032
2033	gen9_whitelist_build(w);
2034
2035	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
2036	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2037}
2038
2039static void cfl_whitelist_build(struct intel_engine_cs *engine)
2040{
2041	struct i915_wa_list *w = &engine->whitelist;
2042
2043	if (engine->class != RENDER_CLASS)
2044		return;
2045
2046	gen9_whitelist_build(w);
2047
2048	/*
2049	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
2050	 *
2051	 * This covers 4 register which are next to one another :
2052	 *   - PS_INVOCATION_COUNT
2053	 *   - PS_INVOCATION_COUNT_UDW
2054	 *   - PS_DEPTH_COUNT
2055	 *   - PS_DEPTH_COUNT_UDW
2056	 */
2057	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2058			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2059			  RING_FORCE_TO_NONPRIV_RANGE_4);
2060}
2061
2062static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
2063{
2064	struct i915_wa_list *w = &engine->whitelist;
2065
2066	if (engine->class != RENDER_CLASS)
2067		whitelist_reg_ext(w,
2068				  RING_CTX_TIMESTAMP(engine->mmio_base),
2069				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2070}
2071
/*
 * Comet Lake whitelist: the context timestamp read entry (added first),
 * followed by everything Coffee Lake whitelists.
 */
static void cml_whitelist_build(struct intel_engine_cs *engine)
{
	/* Applies to the non-render engines; see allow_read_ctx_timestamp() */
	allow_read_ctx_timestamp(engine);

	/* Render-only entries shared with cfl */
	cfl_whitelist_build(engine);
}
2078
2079static void icl_whitelist_build(struct intel_engine_cs *engine)
2080{
2081	struct i915_wa_list *w = &engine->whitelist;
2082
2083	allow_read_ctx_timestamp(engine);
2084
2085	switch (engine->class) {
2086	case RENDER_CLASS:
2087		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
2088		whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
2089
2090		/* WaAllowUMDToModifySamplerMode:icl */
2091		whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
2092
2093		/* WaEnableStateCacheRedirectToCS:icl */
2094		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2095
2096		/*
2097		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
2098		 *
2099		 * This covers 4 register which are next to one another :
2100		 *   - PS_INVOCATION_COUNT
2101		 *   - PS_INVOCATION_COUNT_UDW
2102		 *   - PS_DEPTH_COUNT
2103		 *   - PS_DEPTH_COUNT_UDW
2104		 */
2105		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2106				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2107				  RING_FORCE_TO_NONPRIV_RANGE_4);
2108		break;
2109
2110	case VIDEO_DECODE_CLASS:
2111		/* hucStatusRegOffset */
2112		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2113				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2114		/* hucUKernelHdrInfoRegOffset */
2115		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2116				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2117		/* hucStatus2RegOffset */
2118		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2119				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2120		break;
2121
2122	default:
2123		break;
2124	}
2125}
2126
2127static void tgl_whitelist_build(struct intel_engine_cs *engine)
2128{
2129	struct i915_wa_list *w = &engine->whitelist;
2130
2131	allow_read_ctx_timestamp(engine);
2132
2133	switch (engine->class) {
2134	case RENDER_CLASS:
2135		/*
2136		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2137		 * Wa_1408556865:tgl
2138		 *
2139		 * This covers 4 registers which are next to one another :
2140		 *   - PS_INVOCATION_COUNT
2141		 *   - PS_INVOCATION_COUNT_UDW
2142		 *   - PS_DEPTH_COUNT
2143		 *   - PS_DEPTH_COUNT_UDW
2144		 */
2145		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2146				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2147				  RING_FORCE_TO_NONPRIV_RANGE_4);
2148
2149		/*
2150		 * Wa_1808121037:tgl
2151		 * Wa_14012131227:dg1
2152		 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2153		 */
2154		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2155
2156		/* Wa_1806527549:tgl */
2157		whitelist_reg(w, HIZ_CHICKEN);
2158
2159		/* Required by recommended tuning setting (not a workaround) */
2160		whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2161
2162		break;
2163	default:
2164		break;
2165	}
2166}
2167
2168static void dg2_whitelist_build(struct intel_engine_cs *engine)
2169{
2170	struct i915_wa_list *w = &engine->whitelist;
2171
2172	switch (engine->class) {
2173	case RENDER_CLASS:
2174		/* Required by recommended tuning setting (not a workaround) */
2175		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2176
2177		break;
2178	default:
2179		break;
2180	}
2181}
2182
2183static void blacklist_trtt(struct intel_engine_cs *engine)
2184{
2185	struct i915_wa_list *w = &engine->whitelist;
2186
2187	/*
2188	 * Prevent read/write access to [0x4400, 0x4600) which covers
2189	 * the TRTT range across all engines. Note that normally userspace
2190	 * cannot access the other engines' trtt control, but for simplicity
2191	 * we cover the entire range on each engine.
2192	 */
2193	whitelist_reg_ext(w, _MMIO(0x4400),
2194			  RING_FORCE_TO_NONPRIV_DENY |
2195			  RING_FORCE_TO_NONPRIV_RANGE_64);
2196	whitelist_reg_ext(w, _MMIO(0x4500),
2197			  RING_FORCE_TO_NONPRIV_DENY |
2198			  RING_FORCE_TO_NONPRIV_RANGE_64);
2199}
2200
/* Ponte Vecchio whitelist: only the TRTT deny entries, on every engine. */
static void pvc_whitelist_build(struct intel_engine_cs *engine)
{
	/* Wa_16014440446:pvc */
	blacklist_trtt(engine);
}
2206
2207static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2208{
2209	struct i915_wa_list *w = &engine->whitelist;
2210
2211	switch (engine->class) {
2212	case RENDER_CLASS:
2213		/* Required by recommended tuning setting (not a workaround) */
2214		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2215
2216		break;
2217	default:
2218		break;
2219	}
2220}
2221
/*
 * Build the register whitelist (engine->whitelist) for @engine.
 *
 * The list is opened with wa_init_start(), filled in by exactly one
 * platform-specific builder, and closed with wa_init_finish(). The first
 * matching branch of the chain below wins, so the order of the checks
 * matters: the specific platform checks are tested before the generic
 * GRAPHICS_VER() ones.
 */
void intel_engine_init_whitelist(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;
	struct i915_wa_list *w = &engine->whitelist;

	wa_init_start(w, engine->gt, "whitelist", engine->name);

	if (engine->gt->type == GT_MEDIA)
		; /* none yet */
	else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
		xelpg_whitelist_build(engine);
	else if (IS_PONTEVECCHIO(i915))
		pvc_whitelist_build(engine);
	else if (IS_DG2(i915))
		dg2_whitelist_build(engine);
	else if (IS_XEHPSDV(i915))
		; /* none needed */
	else if (GRAPHICS_VER(i915) == 12)
		tgl_whitelist_build(engine);
	else if (GRAPHICS_VER(i915) == 11)
		icl_whitelist_build(engine);
	else if (IS_COMETLAKE(i915))
		cml_whitelist_build(engine);
	else if (IS_COFFEELAKE(i915))
		cfl_whitelist_build(engine);
	else if (IS_GEMINILAKE(i915))
		glk_whitelist_build(engine);
	else if (IS_KABYLAKE(i915))
		kbl_whitelist_build(engine);
	else if (IS_BROXTON(i915))
		bxt_whitelist_build(engine);
	else if (IS_SKYLAKE(i915))
		skl_whitelist_build(engine);
	else if (GRAPHICS_VER(i915) <= 8)
		; /* no whitelist entries are added for graphics version 8 and older */
	else
		/* A platform that none of the checks above recognised */
		MISSING_CASE(GRAPHICS_VER(i915));

	wa_init_finish(w);
}
2262
2263void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2264{
2265	const struct i915_wa_list *wal = &engine->whitelist;
2266	struct intel_uncore *uncore = engine->uncore;
2267	const u32 base = engine->mmio_base;
2268	struct i915_wa *wa;
2269	unsigned int i;
2270
2271	if (!wal->count)
2272		return;
2273
2274	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2275		intel_uncore_write(uncore,
2276				   RING_FORCE_TO_NONPRIV(base, i),
2277				   i915_mmio_reg_offset(wa->reg));
2278
2279	/* And clear the rest just in case of garbage */
2280	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2281		intel_uncore_write(uncore,
2282				   RING_FORCE_TO_NONPRIV(base, i),
2283				   i915_mmio_reg_offset(RING_NOPID(base)));
2284}
2285
2286/*
2287 * engine_fake_wa_init(), a place holder to program the registers
2288 * which are not part of an official workaround defined by the
2289 * hardware team.
2290 * Adding programming of those register inside workaround will
2291 * allow utilizing wa framework to proper application and verification.
2292 */
2293static void
2294engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2295{
2296	u8 mocs_w, mocs_r;
2297
2298	/*
2299	 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2300	 * by the command streamer when executing commands that don't have
2301	 * a way to explicitly specify a MOCS setting.  The default should
2302	 * usually reference whichever MOCS entry corresponds to uncached
2303	 * behavior, although use of a WB cached entry is recommended by the
2304	 * spec in certain circumstances on specific platforms.
2305	 */
2306	if (GRAPHICS_VER(engine->i915) >= 12) {
2307		mocs_r = engine->gt->mocs.uc_index;
2308		mocs_w = engine->gt->mocs.uc_index;
2309
2310		if (HAS_L3_CCS_READ(engine->i915) &&
2311		    engine->class == COMPUTE_CLASS) {
2312			mocs_r = engine->gt->mocs.wb_index;
2313
2314			/*
2315			 * Even on the few platforms where MOCS 0 is a
2316			 * legitimate table entry, it's never the correct
2317			 * setting to use here; we can assume the MOCS init
2318			 * just forgot to initialize wb_index.
2319			 */
2320			drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2321		}
2322
2323		wa_masked_field_set(wal,
2324				    RING_CMD_CCTL(engine->mmio_base),
2325				    CMD_CCTL_MOCS_MASK,
2326				    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2327	}
2328}
2329
/*
 * Append the render engine's workarounds and tuning settings to @wal.
 *
 * The body is a flat sequence of platform / IP-version / stepping checks;
 * each one that matches the device adds its register programming to @wal
 * through the wa_*() helpers. Several checks can match the same device, so
 * entries accumulate in the order written here.
 */
static void
rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = engine->i915;
	struct intel_gt *gt = engine->gt;

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
		/* Wa_22014600077 */
		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
				 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
	}

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
	    IS_DG2(i915)) {
		/* Wa_1509727124 */
		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
				 SC_DISABLE_POWER_OPTIMIZATION_EBB);
	}

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_DG2(i915)) {
		/* Wa_22012856258 */
		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
				 GEN12_DISABLE_READ_SUPPRESSION);
	}

	if (IS_DG2(i915)) {
		/*
		 * Wa_22010960976:dg2
		 * Wa_14013347512:dg2
		 */
		wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
				  LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
	}

	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
	    IS_DG2(i915)) {
		/* Wa_14015150844 */
		wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
			   _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
			   0, true);
	}

	if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
	    IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
		/*
		 * Wa_1606700617:tgl,dg1,adl-p
		 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
		 * Wa_14010826681:tgl,dg1,rkl,adl-p
		 * Wa_18019627453:dg2
		 */
		wa_masked_en(wal,
			     GEN9_CS_DEBUG_MODE1,
			     FF_DOP_CLOCK_GATE_DISABLE);
	}

	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
		/* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);

		/*
		 * Wa_1407928979:tgl A*
		 * Wa_18011464164:tgl[B0+],dg1[B0+]
		 * Wa_22010931296:tgl[B0+],dg1[B0+]
		 * Wa_14010919138:rkl,dg1,adl-s,adl-p
		 */
		wa_write_or(wal, GEN7_FF_THREAD_MODE,
			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);

		/* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
		wa_mcr_masked_en(wal,
				 GEN10_SAMPLER_MODE,
				 ENABLE_SMALLPL);
	}

	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
		/* Wa_1409804808 */
		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
				 GEN12_PUSH_CONST_DEREF_HOLD_DIS);

		/* Wa_14010229206 */
		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
	}

	if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
		/*
		 * Wa_1607297627
		 *
		 * On TGL and RKL there are multiple entries for this WA in the
		 * BSpec; some indicate this is an A0-only WA, others indicate
		 * it applies to all steppings so we trust the "all steppings."
		 */
		wa_masked_en(wal,
			     RING_PSMI_CTL(RENDER_RING_BASE),
			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
	}

	if (GRAPHICS_VER(i915) == 11) {
		/* This is not a Wa. Enable for better image quality */
		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);

		/*
		 * Wa_1405543622:icl
		 * Formerly known as WaGAPZPriorityScheme
		 */
		wa_write_or(wal,
			    GEN8_GARBCNTL,
			    GEN11_ARBITRATION_PRIO_ORDER_MASK);

		/*
		 * Wa_1604223664:icl
		 * Formerly known as WaL3BankAddressHashing
		 */
		wa_write_clr_set(wal,
				 GEN8_GARBCNTL,
				 GEN11_HASH_CTRL_EXCL_MASK,
				 GEN11_HASH_CTRL_EXCL_BIT0);
		wa_write_clr_set(wal,
				 GEN11_GLBLINVL,
				 GEN11_BANK_HASH_ADDR_EXCL_MASK,
				 GEN11_BANK_HASH_ADDR_EXCL_BIT0);

		/*
		 * Wa_1405733216:icl
		 * Formerly known as WaDisableCleanEvicts
		 */
		wa_mcr_write_or(wal,
				GEN8_L3SQCREG4,
				GEN11_LQSC_CLEAN_EVICT_DISABLE);

		/* Wa_1606682166:icl */
		wa_write_or(wal,
			    GEN7_SARCHKMD,
			    GEN7_DISABLE_SAMPLER_PREFETCH);

		/* Wa_1409178092:icl */
		wa_mcr_write_clr_set(wal,
				     GEN11_SCRATCH2,
				     GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
				     0);

		/* WaEnable32PlaneMode:icl */
		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
			     GEN11_ENABLE_32_PLANE_MODE);

		/*
		 * Wa_1408767742:icl[a2..forever],ehl[all]
		 * Wa_1605460711:icl[a0..c0]
		 */
		wa_write_or(wal,
			    GEN7_FF_THREAD_MODE,
			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);

		/* Wa_22010271021 */
		wa_masked_en(wal,
			     GEN9_CS_DEBUG_MODE1,
			     FF_DOP_CLOCK_GATE_DISABLE);
	}

	/*
	 * Intel platforms that support fine-grained preemption (i.e., gen9 and
	 * beyond) allow the kernel-mode driver to choose between two different
	 * options for controlling preemption granularity and behavior.
	 *
	 * Option 1 (hardware default):
	 *   Preemption settings are controlled in a global manner via
	 *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
	 *   and settings chosen by the kernel-mode driver will apply to all
	 *   userspace clients.
	 *
	 * Option 2:
	 *   Preemption settings are controlled on a per-context basis via
	 *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
	 *   context switch and is writable by userspace (e.g., via
	 *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
	 *   which allows different userspace drivers/clients to select
	 *   different settings, or to change those settings on the fly in
	 *   response to runtime needs.  This option was known by name
	 *   "FtrPerCtxtPreemptionGranularityControl" at one time, although
	 *   that name is somewhat misleading as other non-granularity
	 *   preemption settings are also impacted by this decision.
	 *
	 * On Linux, our policy has always been to let userspace drivers
	 * control preemption granularity/settings (Option 2).  This was
	 * originally mandatory on gen9 to prevent ABI breakage (old gen9
	 * userspace developed before object-level preemption was enabled would
	 * not behave well if i915 were to go with Option 1 and enable that
	 * preemption in a global manner).  On gen9 each context would have
	 * object-level preemption disabled by default (see
	 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
	 * userspace drivers could opt-in to object-level preemption as they
	 * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
	 * even though it is no longer necessary for ABI compatibility when
	 * enabling a new platform, it does ensure that userspace will be able
	 * to implement any workarounds that show up requiring temporary
	 * adjustments to preemption behavior at runtime.
	 *
	 * Notes/Workarounds:
	 *  - Wa_14015141709:  On DG2 and early steppings of MTL,
	 *      CS_CHICKEN1[0] does not disable object-level preemption as
	 *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
	 *      using Option 1).  Effectively this means userspace is unable
	 *      to disable object-level preemption on these platforms/steppings
	 *      despite the setting here.
	 *
	 *  - Wa_16013994831:  May require that userspace program
	 *      CS_CHICKEN1[10] when certain runtime conditions are true.
	 *      Userspace requires Option 2 to be in effect for their update of
	 *      CS_CHICKEN1[10] to be effective.
	 *
	 * Other workarounds may appear in the future that will also require
	 * Option 2 behavior to allow proper userspace implementation.
	 */
	if (GRAPHICS_VER(i915) >= 9)
		wa_masked_en(wal,
			     GEN7_FF_SLICE_CS_CHICKEN1,
			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);

	if (IS_SKYLAKE(i915) ||
	    IS_KABYLAKE(i915) ||
	    IS_COFFEELAKE(i915) ||
	    IS_COMETLAKE(i915)) {
		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
		wa_write_or(wal,
			    GEN8_GARBCNTL,
			    GEN9_GAPS_TSV_CREDIT_DISABLE);
	}

	if (IS_BROXTON(i915)) {
		/* WaDisablePooledEuLoadBalancingFix:bxt */
		wa_masked_en(wal,
			     FF_SLICE_CS_CHICKEN2,
			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
	}

	if (GRAPHICS_VER(i915) == 9) {
		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
		wa_masked_en(wal,
			     GEN9_CSFE_CHICKEN1_RCS,
			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);

		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
		wa_mcr_write_or(wal,
				BDW_SCRATCH1,
				GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);

		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
		if (IS_GEN9_LP(i915))
			wa_mcr_write_clr_set(wal,
					     GEN8_L3SQCREG1,
					     L3_PRIO_CREDITS_MASK,
					     L3_GENERAL_PRIO_CREDITS(62) |
					     L3_HIGH_PRIO_CREDITS(2));

		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
		wa_mcr_write_or(wal,
				GEN8_L3SQCREG4,
				GEN8_LQSC_FLUSH_COHERENT_LINES);

		/* Disable atomics in L3 to prevent unrecoverable hangs */
		wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
				 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
		wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
				     GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
		wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
				     EVICTION_PERF_FIX_ENABLE, 0);
	}

	if (IS_HASWELL(i915)) {
		/* WaSampleCChickenBitEnable:hsw */
		wa_masked_en(wal,
			     HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);

		wa_masked_dis(wal,
			      CACHE_MODE_0_GEN7,
			      /* enable HiZ Raw Stall Optimization */
			      HIZ_RAW_STALL_OPT_DISABLE);
	}

	if (IS_VALLEYVIEW(i915)) {
		/* WaDisableEarlyCull:vlv */
		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);

		/*
		 * WaVSThreadDispatchOverride:ivb,vlv
		 *
		 * This actually overrides the dispatch
		 * mode for all thread types.
		 */
		wa_write_clr_set(wal,
				 GEN7_FF_THREAD_MODE,
				 GEN7_FF_SCHED_MASK,
				 GEN7_FF_TS_SCHED_HW |
				 GEN7_FF_VS_SCHED_HW |
				 GEN7_FF_DS_SCHED_HW);

		/* WaPsdDispatchEnable:vlv */
		/* WaDisablePSDDualDispatchEnable:vlv */
		wa_masked_en(wal,
			     GEN7_HALF_SLICE_CHICKEN1,
			     GEN7_MAX_PS_THREAD_DEP |
			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
	}

	if (IS_IVYBRIDGE(i915)) {
		/* WaDisableEarlyCull:ivb */
		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);

		if (0) { /* causes HiZ corruption on ivb:gt1 */
			/* enable HiZ Raw Stall Optimization */
			wa_masked_dis(wal,
				      CACHE_MODE_0_GEN7,
				      HIZ_RAW_STALL_OPT_DISABLE);
		}

		/*
		 * WaVSThreadDispatchOverride:ivb,vlv
		 *
		 * This actually overrides the dispatch
		 * mode for all thread types.
		 */
		wa_write_clr_set(wal,
				 GEN7_FF_THREAD_MODE,
				 GEN7_FF_SCHED_MASK,
				 GEN7_FF_TS_SCHED_HW |
				 GEN7_FF_VS_SCHED_HW |
				 GEN7_FF_DS_SCHED_HW);

		/* WaDisablePSDDualDispatchEnable:ivb */
		if (IS_IVB_GT1(i915))
			wa_masked_en(wal,
				     GEN7_HALF_SLICE_CHICKEN1,
				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
	}

	if (GRAPHICS_VER(i915) == 7) {
		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
		wa_masked_en(wal,
			     RING_MODE_GEN7(RENDER_RING_BASE),
			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);

		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);

		/*
		 * BSpec says this must be set, even though
		 * WaDisable4x2SubspanOptimization:ivb,hsw
		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
		 */
		wa_masked_en(wal,
			     CACHE_MODE_1,
			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);

		/*
		 * BSpec recommends 8x4 when MSAA is used,
		 * however in practice 16x4 seems fastest.
		 *
		 * Note that PS/WM thread counts depend on the WIZ hashing
		 * disable bit, which we don't touch here, but it's good
		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
		 */
		wa_masked_field_set(wal,
				    GEN7_GT_MODE,
				    GEN6_WIZ_HASHING_MASK,
				    GEN6_WIZ_HASHING_16x4);
	}

	if (IS_GRAPHICS_VER(i915, 6, 7))
		/*
		 * We need to disable the AsyncFlip performance optimisations in
		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
		 * already be programmed to '1' on all products.
		 *
		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
		 */
		wa_masked_en(wal,
			     RING_MI_MODE(RENDER_RING_BASE),
			     ASYNC_FLIP_PERF_DISABLE);

	if (GRAPHICS_VER(i915) == 6) {
		/*
		 * Required for the hardware to program scanline values for
		 * waiting
		 * WaEnableFlushTlbInvalidationMode:snb
		 */
		wa_masked_en(wal,
			     GFX_MODE,
			     GFX_TLB_INVALIDATE_EXPLICIT);

		/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
		wa_masked_en(wal,
			     _3D_CHICKEN,
			     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);

		wa_masked_en(wal,
			     _3D_CHICKEN3,
			     /* WaStripsFansDisableFastClipPerformanceFix:snb */
			     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
			     /*
			      * Bspec says:
			      * "This bit must be set if 3DSTATE_CLIP clip mode is set
			      * to normal and 3DSTATE_SF number of SF output attributes
			      * is more than 16."
			      */
			     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);

		/*
		 * BSpec recommends 8x4 when MSAA is used,
		 * however in practice 16x4 seems fastest.
		 *
		 * Note that PS/WM thread counts depend on the WIZ hashing
		 * disable bit, which we don't touch here, but it's good
		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
		 */
		wa_masked_field_set(wal,
				    GEN6_GT_MODE,
				    GEN6_WIZ_HASHING_MASK,
				    GEN6_WIZ_HASHING_16x4);

		/* WaDisable_RenderCache_OperationalFlush:snb */
		wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);

		/*
		 * From the Sandybridge PRM, volume 1 part 3, page 24:
		 * "If this bit is set, STCunit will have LRA as replacement
		 *  policy. [...] This bit must be reset. LRA replacement
		 *  policy is not supported."
		 */
		wa_masked_dis(wal,
			      CACHE_MODE_0,
			      CM0_STC_EVICT_DISABLE_LRA_SNB);
	}

	if (IS_GRAPHICS_VER(i915, 4, 6))
		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
		wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
		       /* XXX bit doesn't stick on Broadwater */
		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);

	if (GRAPHICS_VER(i915) == 4)
		/*
		 * Disable CONSTANT_BUFFER before it is loaded from the context
		 * image. For as it is loaded, it is executed and the stored
		 * address may no longer be valid, leading to a GPU hang.
		 *
		 * This imposes the requirement that userspace reload their
		 * CONSTANT_BUFFER on every batch, fortunately a requirement
		 * they are already accustomed to from before contexts were
		 * enabled.
		 */
		wa_add(wal, ECOSKPD(RENDER_RING_BASE),
		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
		       0 /* XXX bit doesn't stick on Broadwater */,
		       true);
}
2797
2798static void
2799xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2800{
2801	struct drm_i915_private *i915 = engine->i915;
2802
2803	/* WaKBLVECSSemaphoreWaitPoll:kbl */
2804	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2805		wa_write(wal,
2806			 RING_SEMA_WAIT_POLL(engine->mmio_base),
2807			 1);
2808	}
2809	/* Wa_16018031267, Wa_16018063123 */
2810	if (NEEDS_FASTCOLOR_BLT_WABB(engine))
2811		wa_masked_field_set(wal, ECOSKPD(engine->mmio_base),
2812				    XEHP_BLITTER_SCHEDULING_MODE_MASK,
2813				    XEHP_BLITTER_ROUND_ROBIN_MODE);
2814}
2815
2816static void
2817ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2818{
2819	if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2820		/* Wa_14014999345:pvc */
2821		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2822	}
2823}
2824
2825/*
2826 * The bspec performance guide has recommended MMIO tuning settings.  These
2827 * aren't truly "workarounds" but we want to program them with the same
2828 * workaround infrastructure to ensure that they're automatically added to
2829 * the GuC save/restore lists, re-applied at the right times, and checked for
2830 * any conflicting programming requested by real workarounds.
2831 *
2832 * Programming settings should be added here only if their registers are not
2833 * part of an engine's register state context.  If a register is part of a
2834 * context, then any tuning settings should be programmed in an appropriate
2835 * function invoked by __intel_engine_init_ctx_wa().
2836 */
2837static void
2838add_render_compute_tuning_settings(struct intel_gt *gt,
2839				   struct i915_wa_list *wal)
2840{
2841	struct drm_i915_private *i915 = gt->i915;
2842
2843	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2844		wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2845
2846	/*
2847	 * This tuning setting proves beneficial only on ATS-M designs; the
2848	 * default "age based" setting is optimal on regular DG2 and other
2849	 * platforms.
2850	 */
2851	if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2852		wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2853					THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2854
2855	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
2856		wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2857}
2858
2859static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2860{
2861	struct intel_gt *gt = engine->gt;
2862
2863	if (!IS_DG2(gt->i915))
2864		return;
2865
2866	/*
2867	 * Wa_14019159160: This workaround, along with others, leads to
2868	 * significant challenges in utilizing load balancing among the
2869	 * CCS slices. Consequently, an architectural decision has been
2870	 * made to completely disable automatic CCS load balancing.
2871	 */
2872	wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2873
2874	/*
2875	 * After having disabled automatic load balancing we need to
2876	 * assign all slices to a single CCS. We will call it CCS mode 1
2877	 */
2878	intel_gt_apply_ccs_mode(gt);
2879}
2880
/*
 * The workarounds in this function apply to shared registers in
 * the general render reset domain that aren't tied to a
 * specific engine.  Since all render+compute engines get reset
 * together, and the contents of these registers are lost during
 * the shared render domain reset, we'll define such workarounds
 * here and then add them to just a single RCS or CCS engine's
 * workaround list (whichever engine has the
 * I915_ENGINE_FIRST_RENDER_COMPUTE flag).
 */
2890static void
2891general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2892{
2893	struct drm_i915_private *i915 = engine->i915;
2894	struct intel_gt *gt = engine->gt;
2895
2896	add_render_compute_tuning_settings(gt, wal);
2897
2898	if (GRAPHICS_VER(i915) >= 11) {
2899		/* This is not a Wa (although referred to as
2900		 * WaSetInidrectStateOverride in places), this allows
2901		 * applications that reference sampler states through
2902		 * the BindlessSamplerStateBaseAddress to have their
2903		 * border color relative to DynamicStateBaseAddress
2904		 * rather than BindlessSamplerStateBaseAddress.
2905		 *
2906		 * Otherwise SAMPLER_STATE border colors have to be
2907		 * copied in multiple heaps (DynamicStateBaseAddress &
2908		 * BindlessSamplerStateBaseAddress)
2909		 *
2910		 * BSpec: 46052
2911		 */
2912		wa_mcr_masked_en(wal,
2913				 GEN10_SAMPLER_MODE,
2914				 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2915	}
2916
2917	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2918	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
2919	    IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74)))
2920		/* Wa_14017856879 */
2921		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2922
2923	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2924	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2925		/*
2926		 * Wa_14017066071
2927		 * Wa_14017654203
2928		 */
2929		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2930				 MTL_DISABLE_SAMPLER_SC_OOO);
2931
2932	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2933		/* Wa_22015279794 */
2934		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2935				 DISABLE_PREFETCH_INTO_IC);
2936
2937	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2938	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2939	    IS_DG2(i915)) {
2940		/* Wa_22013037850 */
2941		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2942				DISABLE_128B_EVICTION_COMMAND_UDW);
2943
2944		/* Wa_18017747507 */
2945		wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2946	}
2947
2948	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2949	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2950	    IS_PONTEVECCHIO(i915) ||
2951	    IS_DG2(i915)) {
2952		/* Wa_22014226127 */
2953		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2954	}
2955
2956	if (IS_PONTEVECCHIO(i915) || IS_DG2(i915)) {
2957		/* Wa_14015227452:dg2,pvc */
2958		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2959
2960		/* Wa_16015675438:dg2,pvc */
2961		wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
2962	}
2963
2964	if (IS_DG2(i915)) {
2965		/*
2966		 * Wa_16011620976:dg2_g11
2967		 * Wa_22015475538:dg2
2968		 */
2969		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2970
2971		/* Wa_18028616096 */
2972		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, UGM_FRAGMENT_THRESHOLD_TO_3);
2973	}
2974
2975	if (IS_DG2_G11(i915)) {
2976		/*
2977		 * Wa_22012826095:dg2
2978		 * Wa_22013059131:dg2
2979		 */
2980		wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2981				     MAXREQS_PER_BANK,
2982				     REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2983
2984		/* Wa_22013059131:dg2 */
2985		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2986				FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2987
2988		/*
2989		 * Wa_22012654132
2990		 *
2991		 * Note that register 0xE420 is write-only and cannot be read
2992		 * back for verification on DG2 (due to Wa_14012342262), so
2993		 * we need to explicitly skip the readback.
2994		 */
2995		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2996			   _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2997			   0 /* write-only, so skip validation */,
2998			   true);
2999	}
3000
3001	if (IS_XEHPSDV(i915)) {
3002		/* Wa_1409954639 */
3003		wa_mcr_masked_en(wal,
3004				 GEN8_ROW_CHICKEN,
3005				 SYSTOLIC_DOP_CLOCK_GATING_DIS);
3006
3007		/* Wa_1607196519 */
3008		wa_mcr_masked_en(wal,
3009				 GEN9_ROW_CHICKEN4,
3010				 GEN12_DISABLE_GRF_CLEAR);
3011
3012		/* Wa_14010449647:xehpsdv */
3013		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
3014				 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
3015	}
3016}
3017
3018static void
3019engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
3020{
3021	if (GRAPHICS_VER(engine->i915) < 4)
3022		return;
3023
3024	engine_fake_wa_init(engine, wal);
3025
3026	/*
3027	 * These are common workarounds that just need to applied
3028	 * to a single RCS/CCS engine's workaround list since
3029	 * they're reset as part of the general render domain reset.
3030	 */
3031	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
3032		general_render_compute_wa_init(engine, wal);
3033		ccs_engine_wa_mode(engine, wal);
3034	}
3035
3036	if (engine->class == COMPUTE_CLASS)
3037		ccs_engine_wa_init(engine, wal);
3038	else if (engine->class == RENDER_CLASS)
3039		rcs_engine_wa_init(engine, wal);
3040	else
3041		xcs_engine_wa_init(engine, wal);
3042}
3043
3044void intel_engine_init_workarounds(struct intel_engine_cs *engine)
3045{
3046	struct i915_wa_list *wal = &engine->wa_list;
3047
3048	wa_init_start(wal, engine->gt, "engine", engine->name);
3049	engine_init_workarounds(engine, wal);
3050	wa_init_finish(wal);
3051}
3052
3053void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
3054{
3055	wa_list_apply(&engine->wa_list);
3056}
3057
3058static const struct i915_range mcr_ranges_gen8[] = {
3059	{ .start = 0x5500, .end = 0x55ff },
3060	{ .start = 0x7000, .end = 0x7fff },
3061	{ .start = 0x9400, .end = 0x97ff },
3062	{ .start = 0xb000, .end = 0xb3ff },
3063	{ .start = 0xe000, .end = 0xe7ff },
3064	{},
3065};
3066
3067static const struct i915_range mcr_ranges_gen12[] = {
3068	{ .start =  0x8150, .end =  0x815f },
3069	{ .start =  0x9520, .end =  0x955f },
3070	{ .start =  0xb100, .end =  0xb3ff },
3071	{ .start =  0xde80, .end =  0xe8ff },
3072	{ .start = 0x24a00, .end = 0x24a7f },
3073	{},
3074};
3075
3076static const struct i915_range mcr_ranges_xehp[] = {
3077	{ .start =  0x4000, .end =  0x4aff },
3078	{ .start =  0x5200, .end =  0x52ff },
3079	{ .start =  0x5400, .end =  0x7fff },
3080	{ .start =  0x8140, .end =  0x815f },
3081	{ .start =  0x8c80, .end =  0x8dff },
3082	{ .start =  0x94d0, .end =  0x955f },
3083	{ .start =  0x9680, .end =  0x96ff },
3084	{ .start =  0xb000, .end =  0xb3ff },
3085	{ .start =  0xc800, .end =  0xcfff },
3086	{ .start =  0xd800, .end =  0xd8ff },
3087	{ .start =  0xdc00, .end =  0xffff },
3088	{ .start = 0x17000, .end = 0x17fff },
3089	{ .start = 0x24a00, .end = 0x24a7f },
3090	{},
3091};
3092
3093static bool mcr_range(struct drm_i915_private *i915, u32 offset)
3094{
3095	const struct i915_range *mcr_ranges;
3096	int i;
3097
3098	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
3099		mcr_ranges = mcr_ranges_xehp;
3100	else if (GRAPHICS_VER(i915) >= 12)
3101		mcr_ranges = mcr_ranges_gen12;
3102	else if (GRAPHICS_VER(i915) >= 8)
3103		mcr_ranges = mcr_ranges_gen8;
3104	else
3105		return false;
3106
3107	/*
3108	 * Registers in these ranges are affected by the MCR selector
3109	 * which only controls CPU initiated MMIO. Routing does not
3110	 * work for CS access so we cannot verify them on this path.
3111	 */
3112	for (i = 0; mcr_ranges[i].start; i++)
3113		if (offset >= mcr_ranges[i].start &&
3114		    offset <= mcr_ranges[i].end)
3115			return true;
3116
3117	return false;
3118}
3119
3120static int
3121wa_list_srm(struct i915_request *rq,
3122	    const struct i915_wa_list *wal,
3123	    struct i915_vma *vma)
3124{
3125	struct drm_i915_private *i915 = rq->i915;
3126	unsigned int i, count = 0;
3127	const struct i915_wa *wa;
3128	u32 srm, *cs;
3129
3130	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
3131	if (GRAPHICS_VER(i915) >= 8)
3132		srm++;
3133
3134	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3135		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3136			count++;
3137	}
3138
3139	cs = intel_ring_begin(rq, 4 * count);
3140	if (IS_ERR(cs))
3141		return PTR_ERR(cs);
3142
3143	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3144		u32 offset = i915_mmio_reg_offset(wa->reg);
3145
3146		if (mcr_range(i915, offset))
3147			continue;
3148
3149		*cs++ = srm;
3150		*cs++ = offset;
3151		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3152		*cs++ = 0;
3153	}
3154	intel_ring_advance(rq, cs);
3155
3156	return 0;
3157}
3158
3159static int engine_wa_list_verify(struct intel_context *ce,
3160				 const struct i915_wa_list * const wal,
3161				 const char *from)
3162{
3163	const struct i915_wa *wa;
3164	struct i915_request *rq;
3165	struct i915_vma *vma;
3166	struct i915_gem_ww_ctx ww;
3167	unsigned int i;
3168	u32 *results;
3169	int err;
3170
3171	if (!wal->count)
3172		return 0;
3173
3174	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3175					   wal->count * sizeof(u32));
3176	if (IS_ERR(vma))
3177		return PTR_ERR(vma);
3178
3179	intel_engine_pm_get(ce->engine);
3180	i915_gem_ww_ctx_init(&ww, false);
3181retry:
3182	err = i915_gem_object_lock(vma->obj, &ww);
3183	if (err == 0)
3184		err = intel_context_pin_ww(ce, &ww);
3185	if (err)
3186		goto err_pm;
3187
3188	err = i915_vma_pin_ww(vma, &ww, 0, 0,
3189			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3190	if (err)
3191		goto err_unpin;
3192
3193	rq = i915_request_create(ce);
3194	if (IS_ERR(rq)) {
3195		err = PTR_ERR(rq);
3196		goto err_vma;
3197	}
3198
3199	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3200	if (err == 0)
3201		err = wa_list_srm(rq, wal, vma);
3202
3203	i915_request_get(rq);
3204	if (err)
3205		i915_request_set_error_once(rq, err);
3206	i915_request_add(rq);
3207
3208	if (err)
3209		goto err_rq;
3210
3211	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3212		err = -ETIME;
3213		goto err_rq;
3214	}
3215
3216	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3217	if (IS_ERR(results)) {
3218		err = PTR_ERR(results);
3219		goto err_rq;
3220	}
3221
3222	err = 0;
3223	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3224		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3225			continue;
3226
3227		if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3228			err = -ENXIO;
3229	}
3230
3231	i915_gem_object_unpin_map(vma->obj);
3232
3233err_rq:
3234	i915_request_put(rq);
3235err_vma:
3236	i915_vma_unpin(vma);
3237err_unpin:
3238	intel_context_unpin(ce);
3239err_pm:
3240	if (err == -EDEADLK) {
3241		err = i915_gem_ww_ctx_backoff(&ww);
3242		if (!err)
3243			goto retry;
3244	}
3245	i915_gem_ww_ctx_fini(&ww);
3246	intel_engine_pm_put(ce->engine);
3247	i915_vma_put(vma);
3248	return err;
3249}
3250
3251int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3252				    const char *from)
3253{
3254	return engine_wa_list_verify(engine->kernel_context,
3255				     &engine->wa_list,
3256				     from);
3257}
3258
3259#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3260#include "selftest_workarounds.c"
3261#endif
3262