// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014-2018 Intel Corporation
 */

#include "i915_drv.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_ccs_mode.h"
#include "intel_gt_mcr.h"
#include "intel_gt_regs.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

/**
 * DOC: Hardware workarounds
 *
 * Hardware workarounds are register programming documented to be executed in
 * the driver that fall outside of the normal programming sequences for a
 * platform. There are some basic categories of workarounds, depending on
 * how/when they are applied:
 *
 * - Context workarounds: workarounds that touch registers that are
 *   saved/restored to/from the HW context image. The list is emitted (via Load
 *   Register Immediate commands) once when initializing the device and saved in
 *   the default context. That default context is then used on every context
 *   creation to have a "primed golden context", i.e. a context image that
 *   already contains the changes needed to all the registers.
 *
 *   Context workarounds should be implemented in the \*_ctx_workarounds_init()
 *   variants respective to the targeted platforms.
 *
 * - Engine workarounds: the list of these WAs is applied whenever the specific
 *   engine is reset. It's also possible that a set of engine classes share a
 *   common power domain and they are reset together. This happens on some
 *   platforms with render and compute engines. In this case (at least) one of
 *   them needs to keep the workaround programming: the approach taken in the
 *   driver is to tie those workarounds to the first compute/render engine that
 *   is registered.  When executing with GuC submission, engine resets are
 *   outside of kernel driver control, hence the list of registers involved is
 *   written once, on engine initialization, and then passed to GuC, which
 *   saves/restores their values before/after the reset takes place. See
 *   ``drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c`` for reference.
 *
 *   Workarounds for registers specific to RCS and CCS should be implemented in
 *   rcs_engine_wa_init() and ccs_engine_wa_init(), respectively; those for
 *   registers belonging to BCS, VCS or VECS should be implemented in
 *   xcs_engine_wa_init(). Workarounds for registers not belonging to a specific
 *   engine's MMIO range but that are part of the common RCS/CCS reset domain
 *   should be implemented in general_render_compute_wa_init(). The settings
 *   for CCS load balancing should be added in ccs_engine_wa_mode().
 *
 * - GT workarounds: the list of these WAs is applied whenever these registers
 *   revert to their default values: on GPU reset, suspend/resume [1]_, etc.
 *
 *   GT workarounds should be implemented in the \*_gt_workarounds_init()
 *   variants respective to the targeted platforms.
 *
 * - Register whitelist: some workarounds need to be implemented in userspace,
 *   but need to touch privileged registers. The whitelist in the kernel
 *   instructs the hardware to allow the access to happen. From the kernel side,
 *   this is just a special case of a MMIO workaround (as we write the list of
 *   these to-be-whitelisted registers to some special HW registers).
 *
 *   Register whitelisting should be done in the \*_whitelist_build() variants
 *   respective to the targeted platforms.
 *
 * - Workaround batchbuffers: buffers that get executed automatically by the
 *   hardware on every HW context restore. These buffers are created and
 *   programmed in the default context so the hardware always goes through
 *   those programming sequences when switching contexts. Support for
 *   workaround batchbuffers is enabled by these hardware mechanisms:
 *
 *   #. INDIRECT_CTX: A batchbuffer and an offset are provided in the default
 *      context, pointing the hardware to jump to that location when that offset
 *      is reached in the context restore. The workaround batchbuffer in the
 *      driver currently uses this mechanism for all platforms.
 *
 *   #. BB_PER_CTX_PTR: A batchbuffer is provided in the default context,
 *      pointing the hardware to a buffer to continue executing after the
 *      engine registers are restored in a context restore sequence. This is
 *      currently not used in the driver.
 *
 * - Other:  There are WAs that, due to their nature, cannot be applied from a
 *   central place. Those are peppered around the rest of the code, as needed.
 *   Workarounds related to the display IP are the main example.
 *
 * .. [1] Technically, some registers are powercontext saved & restored, so they
 *    survive a suspend/resume. In practice, writing them again is not too
 *    costly and simplifies things, so it's the approach taken in the driver.
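 *
 * As an illustration only (not an actual workaround), adding an entry to one
 * of these lists boils down to a single helper call, e.g.::
 *
 *	wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);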
 */

static void wa_init_start(struct i915_wa_list *wal, struct intel_gt *gt,
			  const char *name, const char *engine_name)
{
	wal->gt = gt;
	wal->name = name;
	wal->engine_name = engine_name;
}

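/* Workaround lists grow, and are later trimmed, in chunks of 16 entries. */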
#define WA_LIST_CHUNK (1 << 4)

static void wa_init_finish(struct i915_wa_list *wal)
{
	/* Trim unused entries. */
	if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) {
		struct i915_wa *list = kmemdup(wal->list,
					       wal->count * sizeof(*list),
					       GFP_KERNEL);

		if (list) {
			kfree(wal->list);
			wal->list = list;
		}
	}

	if (!wal->count)
		return;

	drm_dbg(&wal->gt->i915->drm, "Initialized %u %s workarounds on %s\n",
		wal->wa_count, wal->name, wal->engine_name);
}

static enum forcewake_domains
wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal)
{
	enum forcewake_domains fw = 0;
	struct i915_wa *wa;
	unsigned int i;

	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
		fw |= intel_uncore_forcewake_for_reg(uncore,
						     wa->reg,
						     FW_REG_READ |
						     FW_REG_WRITE);

	return fw;
}

static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa)
{
	unsigned int addr = i915_mmio_reg_offset(wa->reg);
	struct drm_i915_private *i915 = wal->gt->i915;
	unsigned int start = 0, end = wal->count;
	const unsigned int grow = WA_LIST_CHUNK;
	struct i915_wa *wa_;

	GEM_BUG_ON(!is_power_of_2(grow));

	if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */
		struct i915_wa *list;

		list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa),
				     GFP_KERNEL);
		if (!list) {
			drm_err(&i915->drm, "No space for workaround init!\n");
			return;
		}

		if (wal->list) {
			memcpy(list, wal->list, sizeof(*wa) * wal->count);
			kfree(wal->list);
		}

		wal->list = list;
	}

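	/* Binary-search the sorted list for an existing entry at this offset. */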
	while (start < end) {
		unsigned int mid = start + (end - start) / 2;

		if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) {
			start = mid + 1;
		} else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) {
			end = mid;
		} else {
			wa_ = &wal->list[mid];

			if ((wa->clr | wa_->clr) && !(wa->clr & ~wa_->clr)) {
				drm_err(&i915->drm,
					"Discarding overwritten w/a for reg %04x (clear: %08x, set: %08x)\n",
					i915_mmio_reg_offset(wa_->reg),
					wa_->clr, wa_->set);

				wa_->set &= ~wa->clr;
			}

			wal->wa_count++;
			wa_->set |= wa->set;
			wa_->clr |= wa->clr;
			wa_->read |= wa->read;
			return;
		}
	}

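	/* No existing entry for this register; append a new one. */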
	wal->wa_count++;
	wa_ = &wal->list[wal->count++];
	*wa_ = *wa;

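	/* Bubble the new entry down to keep the list sorted by offset. */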
	while (wa_-- > wal->list) {
		GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) ==
			   i915_mmio_reg_offset(wa_[1].reg));
		if (i915_mmio_reg_offset(wa_[1].reg) >
		    i915_mmio_reg_offset(wa_[0].reg))
			break;

		swap(wa_[1], wa_[0]);
	}
}

static void wa_add(struct i915_wa_list *wal, i915_reg_t reg,
		   u32 clear, u32 set, u32 read_mask, bool masked_reg)
{
	struct i915_wa wa = {
		.reg  = reg,
		.clr  = clear,
		.set  = set,
		.read = read_mask,
		.masked_reg = masked_reg,
	};

	_wa_add(wal, &wa);
}

static void wa_mcr_add(struct i915_wa_list *wal, i915_mcr_reg_t reg,
		       u32 clear, u32 set, u32 read_mask, bool masked_reg)
{
	struct i915_wa wa = {
		.mcr_reg = reg,
		.clr  = clear,
		.set  = set,
		.read = read_mask,
		.masked_reg = masked_reg,
		.is_mcr = 1,
	};

	_wa_add(wal, &wa);
}

static void
wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set)
{
	wa_add(wal, reg, clear, set, clear | set, false);
}

static void
wa_mcr_write_clr_set(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clear, u32 set)
{
	wa_mcr_add(wal, reg, clear, set, clear | set, false);
}

static void
wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
{
	wa_write_clr_set(wal, reg, ~0, set);
}

static void
wa_mcr_write(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
{
	wa_mcr_write_clr_set(wal, reg, ~0, set);
}

static void
wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
{
	wa_write_clr_set(wal, reg, set, set);
}

static void
wa_mcr_write_or(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 set)
{
	wa_mcr_write_clr_set(wal, reg, set, set);
}

static void
wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
{
	wa_write_clr_set(wal, reg, clr, 0);
}

static void
wa_mcr_write_clr(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 clr)
{
	wa_mcr_write_clr_set(wal, reg, clr, 0);
}

/*
 * WA operations on "masked registers". A masked register has the upper 16 bits
 * documented as "masked" in the b-spec. Its purpose is to allow writing to
 * just a portion of the register without a rmw: you simply write in the upper
 * 16 bits the mask of bits you are going to modify.
 *
 * The wa_masked_* family of functions already does the necessary operations to
 * calculate the mask based on the parameters passed, so the user only has to
 * provide the lower 16 bits of that register.
 */
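
/*
 * For instance (an illustrative sketch, not an actual workaround), enabling
 * bit 3 of a masked register via
 *
 *	wa_masked_en(wal, reg, BIT(3));
 *
 * writes _MASKED_BIT_ENABLE(BIT(3)), i.e. (BIT(3) << 16) | BIT(3): the upper
 * half selects which bit to touch, the lower half supplies its new value, and
 * every other bit of the register is left unchanged by the write.
 */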

static void
wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
{
	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
}

static void
wa_mcr_masked_en(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
{
	wa_mcr_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val, true);
}

static void
wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
{
	wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
}

static void
wa_mcr_masked_dis(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 val)
{
	wa_mcr_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val, true);
}

static void
wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg,
		    u32 mask, u32 val)
{
	wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
}

static void
wa_mcr_masked_field_set(struct i915_wa_list *wal, i915_mcr_reg_t reg,
			u32 mask, u32 val)
{
	wa_mcr_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask, true);
}

static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
				      struct i915_wa_list *wal)
{
	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
}

static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
				      struct i915_wa_list *wal)
{
	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);
}

static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
				      struct i915_wa_list *wal)
{
	wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING);

	/* WaDisableAsyncFlipPerfMode:bdw,chv */
	wa_masked_en(wal, RING_MI_MODE(RENDER_RING_BASE), ASYNC_FLIP_PERF_DISABLE);

	/* WaDisablePartialInstShootdown:bdw,chv */
	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

	/* Use Force Non-Coherent whenever executing a 3D context. This is a
	 * workaround for a possible hang in the unlikely event a TLB
	 * invalidation occurs during a PSD flush.
	 */
	/* WaForceEnableNonCoherent:bdw,chv */
	/* WaHdcDisableFetchWhenMasked:bdw,chv */
	wa_masked_en(wal, HDC_CHICKEN0,
		     HDC_DONOT_FETCH_MEM_WHEN_MASKED |
		     HDC_FORCE_NON_COHERENT);

	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
	 *  polygons in the same 8x4 pixel/sample area to be processed without
	 *  stalling waiting for the earlier ones to write to Hierarchical Z
	 *  buffer."
	 *
	 * This optimization is off by default for BDW and CHV; turn it on.
	 */
	wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);

	/* Wa4x4STCOptimizationDisable:bdw,chv */
	wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE);

	/*
	 * BSpec recommends 8x4 when MSAA is used,
	 * however in practice 16x4 seems fastest.
	 *
	 * Note that PS/WM thread counts depend on the WIZ hashing
	 * disable bit, which we don't touch here, but it's good
	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
	 */
	wa_masked_field_set(wal, GEN7_GT_MODE,
			    GEN6_WIZ_HASHING_MASK,
			    GEN6_WIZ_HASHING_16x4);
}

static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = engine->i915;

	gen8_ctx_workarounds_init(engine, wal);

	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);

	/* WaDisableDopClockGating:bdw
	 *
	 * Also see the related UCGTCL1 write in bdw_init_clock_gating()
	 * to disable EUTC clock gating.
	 */
	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
			 DOP_CLOCK_GATING_DISABLE);

	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
			 GEN8_SAMPLER_POWER_BYPASS_DIS);

	wa_masked_en(wal, HDC_CHICKEN0,
		     /* WaForceContextSaveRestoreNonCoherent:bdw */
		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
		     /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
		     (IS_BROADWELL_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
}

static void chv_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	gen8_ctx_workarounds_init(engine, wal);

	/* WaDisableThreadStallDopClockGating:chv */
	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);

	/* Improve HiZ throughput on CHV. */
	wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
}

static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
				      struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = engine->i915;

	if (HAS_LLC(i915)) {
		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
		 *
		 * Must match Display Engine. See
		 * WaCompressedResourceDisplayNewHashMode.
		 */
		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
			     GEN9_PBE_COMPRESSED_HASH_SELECTION);
		wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
				 GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR);
	}

	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */
	/* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */
	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
			 FLOW_CONTROL_ENABLE |
			 PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);

	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */
	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */
	wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7,
			 GEN9_ENABLE_YV12_BUGFIX |
			 GEN9_ENABLE_GPGPU_PREEMPTION);

	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */
	/* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */
	wa_masked_en(wal, CACHE_MODE_1,
		     GEN8_4x4_STC_OPTIMIZATION_DISABLE |
		     GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE);

	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */
	wa_mcr_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5,
			  GEN9_CCS_TLB_PREFETCH_ENABLE);

	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */
	wa_masked_en(wal, HDC_CHICKEN0,
		     HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
		     HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);

	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
	 * both tied to WaForceContextSaveRestoreNonCoherent
	 * in some hsds for skl. We keep the tie for all gen9. The
	 * documentation is a bit hazy and so we want to get common behaviour,
	 * even though there is no clear evidence we would need both on kbl/bxt.
	 * This area has been a source of system hangs so we play it safe
	 * and mimic the skl regardless of what bspec says.
	 *
	 * Use Force Non-Coherent whenever executing a 3D context. This
	 * is a workaround for a possible hang in the unlikely event
	 * a TLB invalidation occurs during a PSD flush.
	 */

	/* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */
	wa_masked_en(wal, HDC_CHICKEN0,
		     HDC_FORCE_NON_COHERENT);

	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
	if (IS_SKYLAKE(i915) ||
	    IS_KABYLAKE(i915) ||
	    IS_COFFEELAKE(i915) ||
	    IS_COMETLAKE(i915))
		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN3,
				 GEN8_SAMPLER_POWER_BYPASS_DIS);

	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */
	wa_mcr_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);

	/*
	 * Supporting preemption with fine-granularity requires changes in the
	 * batch buffer programming. Since we can't break old userspace, we
	 * need to set our default preemption level to a safe value. Userspace
	 * is still able to use more fine-grained preemption levels, since in
	 * WaEnablePreemptionGranularityControlByUMD we're whitelisting the
	 * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are
	 * not real HW workarounds, but merely a way to start using preemption
	 * while maintaining old contract with userspace.
	 */

	/* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
	wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);

	/* WaDisableGPGPUMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */
	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
			    GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);

	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
	if (IS_GEN9_LP(i915))
		wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
}

static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
				struct i915_wa_list *wal)
{
	struct intel_gt *gt = engine->gt;
	u8 vals[3] = { 0, 0, 0 };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		u8 ss;

		/*
		 * Only consider slices where one, and only one, subslice has 7
		 * EUs
		 */
		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
			continue;

		/*
		 * subslice_7eu[i] != 0 (because of the check above) and
		 * ss_max == 4 (maximum number of subslices possible per slice)
		 *
		 * ->    0 <= ss <= 3;
		 */
		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
		vals[i] = 3 - ss;
	}

	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
		return;

	/* Tune IZ hashing. See intel_device_info_runtime_init() */
	wa_masked_field_set(wal, GEN7_GT_MODE,
			    GEN9_IZ_HASHING_MASK(2) |
			    GEN9_IZ_HASHING_MASK(1) |
			    GEN9_IZ_HASHING_MASK(0),
			    GEN9_IZ_HASHING(2, vals[2]) |
			    GEN9_IZ_HASHING(1, vals[1]) |
			    GEN9_IZ_HASHING(0, vals[0]));
}

static void skl_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	gen9_ctx_workarounds_init(engine, wal);
	skl_tune_iz_hashing(engine, wal);
}

static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	gen9_ctx_workarounds_init(engine, wal);

	/* WaDisableThreadStallDopClockGating:bxt */
	wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN,
			 STALL_DOP_GATING_DISABLE);

	/* WaToEnableHwFixForPushConstHWBug:bxt */
	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
}

static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = engine->i915;

	gen9_ctx_workarounds_init(engine, wal);

	/* WaToEnableHwFixForPushConstHWBug:kbl */
	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_C0, STEP_FOREVER))
		wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
			     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

	/* WaDisableSbeCacheDispatchPortSharing:kbl */
	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
}

static void glk_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	gen9_ctx_workarounds_init(engine, wal);

	/* WaToEnableHwFixForPushConstHWBug:glk */
	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
}

static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	gen9_ctx_workarounds_init(engine, wal);

	/* WaToEnableHwFixForPushConstHWBug:cfl */
	wa_masked_en(wal, COMMON_SLICE_CHICKEN2,
		     GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);

	/* WaDisableSbeCacheDispatchPortSharing:cfl */
	wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
			 GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
}

static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	/* Wa_1406697149 (WaDisableBankHangMode:icl) */
	wa_write(wal, GEN8_L3CNTLREG, GEN8_ERRDETBCTRL);

	/* WaForceEnableNonCoherent:icl
	 * This is not the same workaround as in early Gen9 platforms, where
	 * lacking this could cause system hangs, but coherency performance
	 * overhead is high and only a few compute workloads really need it
	 * (the register is whitelisted in hardware now, so UMDs can opt in
	 * for coherency if they have a good reason).
	 */
	wa_mcr_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);

	/* WaEnableFloatBlendOptimization:icl */
	wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
		   _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE),
		   0 /* write-only, so skip validation */,
		   true);

	/* WaDisableGPGPUMidThreadPreemption:icl */
	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);

	/* allow headerless messages for preemptible GPGPU context */
	wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
			 GEN11_SAMPLER_ENABLE_HEADLESS_MSG);

	/* Wa_1604278689:icl,ehl */
	wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID);
	wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER,
			 0,
			 0xFFFFFFFF);

	/* Wa_1406306137:icl,ehl */
	wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU);
}

/*
 * These settings aren't actually workarounds, but general tuning settings that
 * need to be programmed on the dg2 platform.
 */
static void dg2_ctx_gt_tuning_init(struct intel_engine_cs *engine,
				   struct i915_wa_list *wal)
{
	wa_mcr_masked_en(wal, CHICKEN_RASTER_2, TBIMR_FAST_CLIP);
	wa_mcr_write_clr_set(wal, XEHP_L3SQCREG5, L3_PWM_TIMER_INIT_VAL_MASK,
			     REG_FIELD_PREP(L3_PWM_TIMER_INIT_VAL_MASK, 0x7f));
	wa_mcr_write_clr_set(wal, XEHP_FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
			     FF_MODE2_TDS_TIMER_128);
}

static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine,
				       struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = engine->i915;

	/*
	 * Wa_1409142259:tgl,dg1,adl-p
	 * Wa_1409347922:tgl,dg1,adl-p
	 * Wa_1409252684:tgl,dg1,adl-p
	 * Wa_1409217633:tgl,dg1,adl-p
	 * Wa_1409207793:tgl,dg1,adl-p
	 * Wa_1409178076:tgl,dg1,adl-p
	 * Wa_1408979724:tgl,dg1,adl-p
	 * Wa_14010443199:tgl,rkl,dg1,adl-p
	 * Wa_14010698770:tgl,rkl,dg1,adl-s,adl-p
	 * Wa_1409342910:tgl,rkl,dg1,adl-s,adl-p
	 */
	wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3,
		     GEN12_DISABLE_CPS_AWARE_COLOR_PIPE);

	/* WaDisableGPGPUMidThreadPreemption:gen12 */
	wa_masked_field_set(wal, GEN8_CS_CHICKEN1,
			    GEN9_PREEMPT_GPGPU_LEVEL_MASK,
			    GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL);

	/*
	 * Wa_16011163337 - GS_TIMER
	 *
	 * TDS_TIMER: Although some platforms refer to it as Wa_1604555607, we
	 * need to program it even on those that don't explicitly list that
	 * workaround.
	 *
	 * Note that the programming of GEN12_FF_MODE2 is further modified
	 * according to the FF_MODE2 guidance given by Wa_1608008084.
	 * Wa_1608008084 tells us the FF_MODE2 register will return the wrong
	 * value when read from the CPU.
	 *
	 * The default value for this register is zero for all fields.
	 * So instead of doing a RMW we should just write the desired values
	 * for TDS and GS timers. Note that since the readback can't be trusted,
	 * the clear mask is just set to ~0 to make sure other bits are not
	 * inadvertently set. For the same reason read verification is ignored.
	 */
	wa_add(wal,
	       GEN12_FF_MODE2,
	       ~0,
	       FF_MODE2_TDS_TIMER_128 | FF_MODE2_GS_TIMER_224,
	       0, false);

	if (!IS_DG1(i915)) {
		/* Wa_1806527549 */
		wa_masked_en(wal, HIZ_CHICKEN, HZ_DEPTH_TEST_LE_GE_OPT_DISABLE);

		/* Wa_1606376872 */
		wa_masked_en(wal, COMMON_SLICE_CHICKEN4, DISABLE_TDC_LOAD_BALANCING_CALC);
	}
}

static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	gen12_ctx_workarounds_init(engine, wal);

	/* Wa_1409044764 */
	wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3,
		      DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN);

	/* Wa_22010493298 */
	wa_masked_en(wal, HIZ_CHICKEN,
		     DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE);
}

static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	dg2_ctx_gt_tuning_init(engine, wal);

	/* Wa_16013271637:dg2 */
	wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
			 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);

	/* Wa_14014947963:dg2 */
	wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000);

	/* Wa_18018764978:dg2 */
	wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);

	/* Wa_15010599737:dg2 */
	wa_mcr_masked_en(wal, CHICKEN_RASTER_1, DIS_SF_ROUND_NEAREST_EVEN);

	/* Wa_18019271663:dg2 */
	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);

	/* Wa_14019877138:dg2 */
	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
}

static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine,
				     struct i915_wa_list *wal)
{
	struct intel_gt *gt = engine->gt;

	dg2_ctx_gt_tuning_init(engine, wal);

	/*
	 * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in
	 * gen12_emit_indirect_ctx_rcs() rather than here on some early
	 * steppings.
	 */
	if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	      IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)))
		wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false);
}

static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine,
				       struct i915_wa_list *wal)
{
	struct intel_gt *gt = engine->gt;

	xelpg_ctx_gt_tuning_init(engine, wal);

	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
		/* Wa_14014947963 */
		wa_masked_field_set(wal, VF_PREEMPTION,
				    PREEMPTION_VERTEX_COUNT, 0x4000);

		/* Wa_16013271637 */
		wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1,
				 MSC_MSAA_REODER_BUF_BYPASS_DISABLE);

		/* Wa_18019627453 */
		wa_mcr_masked_en(wal, VFLSKPD, VF_PREFETCH_TLB_DIS);

		/* Wa_18018764978 */
		wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL);
	}

	/* Wa_18019271663 */
	wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE);

	/* Wa_14019877138 */
	wa_mcr_masked_en(wal, XEHP_PSS_CHICKEN, FD_END_COLLECT);
}

static void fakewa_disable_nestedbb_mode(struct intel_engine_cs *engine,
					 struct i915_wa_list *wal)
{
	/*
	 * This is a "fake" workaround defined by software to ensure we
	 * maintain reliable, backward-compatible behavior for userspace with
	 * regards to how nested MI_BATCH_BUFFER_START commands are handled.
	 *
	 * The per-context setting of MI_MODE[12] determines whether the bits
	 * of a nested MI_BATCH_BUFFER_START instruction should be interpreted
	 * in the traditional manner or whether they should instead use a new
	 * tgl+ meaning that breaks backward compatibility, but allows nesting
	 * into 3rd-level batchbuffers.  When this new capability was first
	 * added in TGL, it remained off by default unless a context
	 * intentionally opted in to the new behavior.  However Xe_HPG now
	 * flips this on by default and requires that we explicitly opt out if
	 * we don't want the new behavior.
	 *
	 * From a SW perspective, we want to maintain the backward-compatible
	 * behavior for userspace, so we'll apply a fake workaround to set it
	 * back to the legacy behavior on platforms where the hardware default
	 * is to break compatibility.  At the moment there is no Linux
	 * userspace that utilizes third-level batchbuffers, so this avoids
	 * userspace needing to make any changes; using the legacy meaning is
	 * the correct thing to do.  If/when we have userspace consumers that
	 * want to utilize third-level batch nesting, we can provide a context
	 * parameter to allow them to opt-in.
	 */
	wa_masked_dis(wal, RING_MI_MODE(engine->mmio_base), TGL_NESTED_BB_EN);
}

static void gen12_ctx_gt_mocs_init(struct intel_engine_cs *engine,
				   struct i915_wa_list *wal)
{
	u8 mocs;

	/*
	 * Some blitter commands do not have a field for MOCS, those
	 * commands will use the MOCS index pointed to by BLIT_CCTL.
	 * The BLIT_CCTL registers need to be programmed as un-cached.
	 */
	if (engine->class == COPY_ENGINE_CLASS) {
		mocs = engine->gt->mocs.uc_index;
		wa_write_clr_set(wal,
				 BLIT_CCTL(engine->mmio_base),
				 BLIT_CCTL_MASK,
				 BLIT_CCTL_MOCS(mocs, mocs));
	}
}

/*
 * gen12_ctx_gt_fake_wa_init() doesn't program an official workaround
 * defined by the hardware team, but rather general context registers.
 * Routing this context register programming through the workaround
 * framework allows us to use it for proper application and validation.
 */
static void
gen12_ctx_gt_fake_wa_init(struct intel_engine_cs *engine,
			  struct i915_wa_list *wal)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
		fakewa_disable_nestedbb_mode(engine, wal);

	gen12_ctx_gt_mocs_init(engine, wal);
}

static void
__intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
			   struct i915_wa_list *wal,
			   const char *name)
{
	struct drm_i915_private *i915 = engine->i915;

	wa_init_start(wal, engine->gt, name, engine->name);

	/* Applies to all engines */
	/*
	 * Fake workarounds are not actual workarounds, but programming of
	 * context registers using the workaround framework.
	 */
	if (GRAPHICS_VER(i915) >= 12)
		gen12_ctx_gt_fake_wa_init(engine, wal);

	if (engine->class != RENDER_CLASS)
		goto done;

	if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
		xelpg_ctx_workarounds_init(engine, wal);
	else if (IS_PONTEVECCHIO(i915))
		; /* noop; none at this time */
	else if (IS_DG2(i915))
		dg2_ctx_workarounds_init(engine, wal);
	else if (IS_XEHPSDV(i915))
		; /* noop; none at this time */
	else if (IS_DG1(i915))
		dg1_ctx_workarounds_init(engine, wal);
	else if (GRAPHICS_VER(i915) == 12)
		gen12_ctx_workarounds_init(engine, wal);
	else if (GRAPHICS_VER(i915) == 11)
		icl_ctx_workarounds_init(engine, wal);
	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
		cfl_ctx_workarounds_init(engine, wal);
	else if (IS_GEMINILAKE(i915))
		glk_ctx_workarounds_init(engine, wal);
	else if (IS_KABYLAKE(i915))
		kbl_ctx_workarounds_init(engine, wal);
	else if (IS_BROXTON(i915))
		bxt_ctx_workarounds_init(engine, wal);
	else if (IS_SKYLAKE(i915))
		skl_ctx_workarounds_init(engine, wal);
	else if (IS_CHERRYVIEW(i915))
		chv_ctx_workarounds_init(engine, wal);
	else if (IS_BROADWELL(i915))
		bdw_ctx_workarounds_init(engine, wal);
	else if (GRAPHICS_VER(i915) == 7)
		gen7_ctx_workarounds_init(engine, wal);
	else if (GRAPHICS_VER(i915) == 6)
		gen6_ctx_workarounds_init(engine, wal);
	else if (GRAPHICS_VER(i915) < 8)
		;
	else
		MISSING_CASE(GRAPHICS_VER(i915));

done:
	wa_init_finish(wal);
}

void intel_engine_init_ctx_wa(struct intel_engine_cs *engine)
{
	__intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context");
}

int intel_engine_emit_ctx_wa(struct i915_request *rq)
{
	struct i915_wa_list *wal = &rq->engine->ctx_wa_list;
	struct intel_uncore *uncore = rq->engine->uncore;
	enum forcewake_domains fw;
	unsigned long flags;
	struct i915_wa *wa;
	unsigned int i;
	u32 *cs;
	int ret;

	if (wal->count == 0)
		return 0;

	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
	if (ret)
		return ret;

	cs = intel_ring_begin(rq, (wal->count * 2 + 2));
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	fw = wal_get_fw_for_rmw(uncore, wal);

	intel_gt_mcr_lock(wal->gt, &flags);
	spin_lock(&uncore->lock);
	intel_uncore_forcewake_get__locked(uncore, fw);

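	/* One LRI packet programs every workaround register in a single shot. */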
	*cs++ = MI_LOAD_REGISTER_IMM(wal->count);
	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
		u32 val;

		/* Skip reading the register if it's not really needed */
		if (wa->masked_reg || (wa->clr | wa->set) == U32_MAX) {
			val = wa->set;
		} else {
			val = wa->is_mcr ?
				intel_gt_mcr_read_any_fw(wal->gt, wa->mcr_reg) :
				intel_uncore_read_fw(uncore, wa->reg);
			val &= ~wa->clr;
			val |= wa->set;
		}

		*cs++ = i915_mmio_reg_offset(wa->reg);
		*cs++ = val;
	}
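	/* A trailing NOOP fills the last dword reserved by intel_ring_begin(). */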
	*cs++ = MI_NOOP;

	intel_uncore_forcewake_put__locked(uncore, fw);
	spin_unlock(&uncore->lock);
	intel_gt_mcr_unlock(wal->gt, flags);

	intel_ring_advance(rq, cs);

	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
	if (ret)
		return ret;

	return 0;
}

static void
gen4_gt_workarounds_init(struct intel_gt *gt,
			 struct i915_wa_list *wal)
{
	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
}

static void
g4x_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	gen4_gt_workarounds_init(gt, wal);

	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
}

static void
ilk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	g4x_gt_workarounds_init(gt, wal);

	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
}

static void
snb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
}

static void
ivb_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
	wa_masked_dis(wal,
		      GEN7_COMMON_SLICE_CHICKEN1,
		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);

	/* WaApplyL3ControlAndL3ChickenMode:ivb */
	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);

	/* WaForceL3Serialization:ivb */
	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
}

static void
vlv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	/* WaForceL3Serialization:vlv */
	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);

	/*
	 * WaIncreaseL3CreditsForVLVB0:vlv
	 * This is the hardware default actually.
	 */
	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
}

static void
hsw_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	/* L3 caching of data atomics doesn't work -- disable it. */
	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);

	wa_add(wal,
	       HSW_ROW_CHICKEN3, 0,
	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
	       0 /* XXX does this reg exist? */, true);

	/* WaVSRefCountFullforceMissDisable:hsw */
	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
}

static void
gen9_wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
{
	const struct sseu_dev_info *sseu = &to_gt(i915)->info.sseu;
	unsigned int slice, subslice;
	u32 mcr, mcr_mask;

	GEM_BUG_ON(GRAPHICS_VER(i915) != 9);

	/*
	 * WaProgramMgsrForCorrectSliceSpecificMmioReads:gen9,glk,kbl,cml
	 * Before any MMIO read into slice/subslice specific registers, MCR
	 * packet control register needs to be programmed to point to any
	 * enabled s/ss pair. Otherwise, incorrect values will be returned.
	 * This means each subsequent MMIO read will be forwarded to a
	 * specific s/ss combination, but this is OK since these registers
	 * are consistent across s/ss in almost all cases. In the rare
	 * occasions, such as INSTDONE, where this value is dependent
	 * on s/ss combo, the read should be done with read_subslice_reg.
	 */
	slice = ffs(sseu->slice_mask) - 1;
	GEM_BUG_ON(slice >= ARRAY_SIZE(sseu->subslice_mask.hsw));
	subslice = ffs(intel_sseu_get_hsw_subslices(sseu, slice));
	GEM_BUG_ON(!subslice);
	subslice--;

	/*
	 * We use GEN8_MCR..() macros to calculate the |mcr| value for
	 * Gen9 to address WaProgramMgsrForCorrectSliceSpecificMmioReads
	 */
	mcr = GEN8_MCR_SLICE(slice) | GEN8_MCR_SUBSLICE(subslice);
	mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;

	drm_dbg(&i915->drm, "MCR slice:%d/subslice:%d = %x\n", slice, subslice, mcr);

	wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr);
}

static void
gen9_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = gt->i915;

	/* WaProgramMgsrForCorrectSliceSpecificMmioReads:glk,kbl,cml,gen9 */
	gen9_wa_init_mcr(i915, wal);

	/* WaDisableKillLogic:bxt,skl,kbl */
	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
		wa_write_or(wal,
			    GAM_ECOCHK,
			    ECOCHK_DIS_TLB);

	if (HAS_LLC(i915)) {
		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
		 *
		 * Must match Display Engine. See
		 * WaCompressedResourceDisplayNewHashMode.
		 */
		wa_write_or(wal,
			    MMCD_MISC_CTRL,
			    MMCD_PCLA | MMCD_HOTSPOT_EN);
	}

	/* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */
	wa_write_or(wal,
		    GAM_ECOCHK,
		    BDW_DISABLE_HDC_INVALIDATION);
}

static void
skl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	gen9_gt_workarounds_init(gt, wal);

	/* WaDisableGafsUnitClkGating:skl */
	wa_write_or(wal,
		    GEN7_UCGCTL4,
		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaInPlaceDecompressionHang:skl */
	if (IS_SKYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, STEP_A0, STEP_H0))
		wa_write_or(wal,
			    GEN9_GAMT_ECO_REG_RW_IA,
			    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
}

static void
kbl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	gen9_gt_workarounds_init(gt, wal);

	/* WaDisableDynamicCreditSharing:kbl */
	if (IS_KABYLAKE(gt->i915) && IS_GRAPHICS_STEP(gt->i915, 0, STEP_C0))
		wa_write_or(wal,
			    GAMT_CHKN_BIT_REG,
			    GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);

	/* WaDisableGafsUnitClkGating:kbl */
	wa_write_or(wal,
		    GEN7_UCGCTL4,
		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaInPlaceDecompressionHang:kbl */
	wa_write_or(wal,
		    GEN9_GAMT_ECO_REG_RW_IA,
		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
}

static void
glk_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	gen9_gt_workarounds_init(gt, wal);
}

static void
cfl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	gen9_gt_workarounds_init(gt, wal);

	/* WaDisableGafsUnitClkGating:cfl */
	wa_write_or(wal,
		    GEN7_UCGCTL4,
		    GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);

	/* WaInPlaceDecompressionHang:cfl */
	wa_write_or(wal,
		    GEN9_GAMT_ECO_REG_RW_IA,
		    GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS);
}

static void __set_mcr_steering(struct i915_wa_list *wal,
			       i915_reg_t steering_reg,
			       unsigned int slice, unsigned int subslice)
{
	u32 mcr, mcr_mask;

	mcr = GEN11_MCR_SLICE(slice) | GEN11_MCR_SUBSLICE(subslice);
	mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;

	wa_write_clr_set(wal, steering_reg, mcr_mask, mcr);
}

static void debug_dump_steering(struct intel_gt *gt)
{
	struct drm_printer p = drm_debug_printer("MCR Steering:");

	if (drm_debug_enabled(DRM_UT_DRIVER))
		intel_gt_mcr_report_steering(&p, gt, false);
}

static void __add_mcr_wa(struct intel_gt *gt, struct i915_wa_list *wal,
			 unsigned int slice, unsigned int subslice)
{
	__set_mcr_steering(wal, GEN8_MCR_SELECTOR, slice, subslice);

	gt->default_steering.groupid = slice;
	gt->default_steering.instanceid = subslice;

	debug_dump_steering(gt);
}

static void
icl_wa_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
{
	const struct sseu_dev_info *sseu = &gt->info.sseu;
	unsigned int subslice;

	GEM_BUG_ON(GRAPHICS_VER(gt->i915) < 11);
	GEM_BUG_ON(hweight8(sseu->slice_mask) > 1);

	/*
	 * Although a platform may have subslices, we need to always steer
	 * reads to the lowest instance that isn't fused off.  When Render
	 * Power Gating is enabled, grabbing forcewake will only power up a
	 * single subslice (the "minconfig") if there isn't a real workload
	 * that needs to be run; this means that if we steer register reads to
	 * one of the higher subslices, we run the risk of reading back 0's or
	 * random garbage.
	 */
	subslice = __ffs(intel_sseu_get_hsw_subslices(sseu, 0));

	/*
	 * If the subslice we picked above also steers us to a valid L3 bank,
	 * then we can just rely on the default steering and won't need to
	 * worry about explicitly re-steering L3BANK reads later.
	 */
	if (gt->info.l3bank_mask & BIT(subslice))
		gt->steering_table[L3BANK] = NULL;

	__add_mcr_wa(gt, wal, 0, subslice);
}

static void
xehp_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
{
	const struct sseu_dev_info *sseu = &gt->info.sseu;
	unsigned long slice, subslice = 0, slice_mask = 0;
	u32 lncf_mask = 0;
	int i;

	/*
	 * On Xe_HP the steering increases in complexity. There are now several
	 * more units that require steering and we're not guaranteed to be able
	 * to find a common setting for all of them. These are:
	 * - GSLICE (fusable)
	 * - DSS (sub-unit within gslice; fusable)
	 * - L3 Bank (fusable)
	 * - MSLICE (fusable)
	 * - LNCF (sub-unit within mslice; always present if mslice is present)
	 *
	 * We'll do our default/implicit steering based on GSLICE (in the
	 * sliceid field) and DSS (in the subsliceid field).  If we can
	 * find overlap between the valid MSLICE and/or LNCF values with
	 * a suitable GSLICE, then we can just re-use the default value and
	 * skip any explicit steering at runtime.
	 *
	 * We only need to look for overlap between GSLICE/MSLICE/LNCF to find
	 * a valid sliceid value.  DSS steering is the only type of steering
	 * that utilizes the 'subsliceid' bits.
	 *
	 * Also note that, even though the steering domain is called "GSlice"
	 * and it is encoded in the register using the gslice format, the spec
	 * says that the combined (geometry | compute) fuse should be used to
	 * select the steering.
	 */

	/* Find the potential gslice candidates */
	slice_mask = intel_slicemask_from_xehp_dssmask(sseu->subslice_mask,
						       GEN_DSS_PER_GSLICE);

	/*
	 * Find the potential LNCF candidates.  Either LNCF within a valid
	 * mslice is fine.
	 */
	for_each_set_bit(i, &gt->info.mslice_mask, GEN12_MAX_MSLICES)
		lncf_mask |= (0x3 << (i * 2));

	/*
	 * Are there any sliceid values that work for both GSLICE and LNCF
	 * steering?
	 */
	if (slice_mask & lncf_mask) {
		slice_mask &= lncf_mask;
		gt->steering_table[LNCF] = NULL;
	}

	/* How about sliceid values that also work for MSLICE steering? */
	if (slice_mask & gt->info.mslice_mask) {
		slice_mask &= gt->info.mslice_mask;
		gt->steering_table[MSLICE] = NULL;
	}

	if (IS_XEHPSDV(gt->i915) && slice_mask & BIT(0))
		gt->steering_table[GAM] = NULL;

	slice = __ffs(slice_mask);
	subslice = intel_sseu_find_first_xehp_dss(sseu, GEN_DSS_PER_GSLICE, slice) %
		GEN_DSS_PER_GSLICE;

	__add_mcr_wa(gt, wal, slice, subslice);

	/*
	 * SQIDI ranges are special because they use different steering
	 * registers than everything else we work with.  On XeHP SDV and
	 * DG2-G10, any value in the steering registers will work fine since
	 * all instances are present, but DG2-G11 only has SQIDI instances at
	 * ID's 2 and 3, so we need to steer to one of those.  For simplicity
	 * we'll just steer to a hardcoded "2" since that value will work
	 * everywhere.
	 */
	__set_mcr_steering(wal, MCFG_MCR_SELECTOR, 0, 2);
	__set_mcr_steering(wal, SF_MCR_SELECTOR, 0, 2);

	/*
	 * On DG2, GAM registers have a dedicated steering control register
	 * and must always be programmed to a hardcoded groupid of "1."
	 */
	if (IS_DG2(gt->i915))
		__set_mcr_steering(wal, GAM_MCR_SELECTOR, 1, 0);
}

static void
pvc_init_mcr(struct intel_gt *gt, struct i915_wa_list *wal)
{
	unsigned int dss;

	/*
	 * Setup implicit steering for COMPUTE and DSS ranges to the first
	 * non-fused-off DSS.  All other types of MCR registers will be
	 * explicitly steered.
	 */
	dss = intel_sseu_find_first_xehp_dss(&gt->info.sseu, 0, 0);
	__add_mcr_wa(gt, wal, dss / GEN_DSS_PER_CSLICE, dss % GEN_DSS_PER_CSLICE);
}

static void
icl_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = gt->i915;

	icl_wa_init_mcr(gt, wal);

	/* WaModifyGamTlbPartitioning:icl */
	wa_write_clr_set(wal,
			 GEN11_GACB_PERF_CTRL,
			 GEN11_HASH_CTRL_MASK,
			 GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4);

	/* Wa_1405766107:icl
	 * Formerly known as WaCL2SFHalfMaxAlloc
	 */
	wa_write_or(wal,
		    GEN11_LSN_UNSLCVC,
		    GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC |
		    GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC);

	/* Wa_220166154:icl
	 * Formerly known as WaDisCtxReload
	 */
	wa_write_or(wal,
		    GEN8_GAMW_ECO_DEV_RW_IA,
		    GAMW_ECO_DEV_CTX_RELOAD_DISABLE);

	/* Wa_1406463099:icl
	 * Formerly known as WaGamTlbPendError
	 */
	wa_write_or(wal,
		    GAMT_CHKN_BIT_REG,
		    GAMT_CHKN_DISABLE_L3_COH_PIPE);

	/*
	 * Wa_1408615072:icl,ehl  (vsunit)
	 * Wa_1407596294:icl,ehl  (hsunit)
	 */
	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
		    VSUNIT_CLKGATE_DIS | HSUNIT_CLKGATE_DIS);

	/* Wa_1407352427:icl,ehl */
	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
		    PSDUNIT_CLKGATE_DIS);

	/* Wa_1406680159:icl,ehl */
	wa_mcr_write_or(wal,
			GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
			GWUNIT_CLKGATE_DIS);

	/* Wa_1607087056:icl,ehl,jsl */
	if (IS_ICELAKE(i915) ||
	    ((IS_JASPERLAKE(i915) || IS_ELKHARTLAKE(i915)) &&
	     IS_GRAPHICS_STEP(i915, STEP_A0, STEP_B0)))
		wa_write_or(wal,
			    GEN11_SLICE_UNIT_LEVEL_CLKGATE,
			    L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS);

	/*
	 * This is not a documented workaround, but rather an optimization
	 * to reduce sampler power.
	 */
	wa_mcr_write_clr(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);
}

/*
 * Though there are per-engine instances of these registers,
 * they retain their value through engine resets and should
 * only be provided on the GT workaround list rather than
 * the engine-specific workaround list.
 */
static void
wa_14011060649(struct intel_gt *gt, struct i915_wa_list *wal)
{
	struct intel_engine_cs *engine;
	int id;

	for_each_engine(engine, gt, id) {
		if (engine->class != VIDEO_DECODE_CLASS ||
		    (engine->instance % 2))
			continue;

		wa_write_or(wal, VDBOX_CGCTL3F10(engine->mmio_base),
			    IECPUNIT_CLKGATE_DIS);
	}
}

static void
gen12_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	icl_wa_init_mcr(gt, wal);

	/* Wa_14011060649:tgl,rkl,dg1,adl-s,adl-p */
	wa_14011060649(gt, wal);

	/* Wa_14011059788:tgl,rkl,adl-s,dg1,adl-p */
	wa_mcr_write_or(wal, GEN10_DFR_RATIO_EN_AND_CHICKEN, DFR_DISABLE);

	/*
	 * Wa_14015795083
	 *
	 * Firmware on some gen12 platforms locks the MISCCPCTL register,
	 * preventing i915 from modifying it for this workaround.  Skip the
	 * readback verification for this workaround on debug builds; if the
	 * workaround doesn't stick due to firmware behavior, it's not an error
	 * that we want CI to flag.
	 */
	wa_add(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE,
	       0, 0, false);
}

static void
dg1_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	gen12_gt_workarounds_init(gt, wal);

	/* Wa_1409420604:dg1 */
	wa_mcr_write_or(wal, SUBSLICE_UNIT_LEVEL_CLKGATE2,
			CPSSUNIT_CLKGATE_DIS);

	/* Wa_1408615072:dg1 */
	/* Empirical testing shows this register is unaffected by engine reset. */
	wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, VSUNIT_CLKGATE_DIS_TGL);
}

static void
xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
{
	struct drm_i915_private *i915 = gt->i915;

	xehp_init_mcr(gt, wal);

	/* Wa_1409757795:xehpsdv */
	wa_mcr_write_or(wal, SCCGCTL94DC, CG3DDISURB);

	/* Wa_18011725039:xehpsdv */
	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_B0)) {
		wa_mcr_masked_dis(wal, MLTICTXCTL, TDONRENDER);
		wa_mcr_write_or(wal, L3SQCREG1_CCS0, FLUSHALLNONCOH);
	}

	/* Wa_16011155590:xehpsdv */
	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A0, STEP_B0))
		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
			    TSGUNIT_CLKGATE_DIS);

	/* Wa_14011780169:xehpsdv */
	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_B0, STEP_FOREVER)) {
		wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS |
			    GAMTLBVDBOX7_CLKGATE_DIS |
			    GAMTLBVDBOX6_CLKGATE_DIS |
			    GAMTLBVDBOX5_CLKGATE_DIS |
			    GAMTLBVDBOX4_CLKGATE_DIS |
			    GAMTLBVDBOX3_CLKGATE_DIS |
			    GAMTLBVDBOX2_CLKGATE_DIS |
			    GAMTLBVDBOX1_CLKGATE_DIS |
			    GAMTLBVDBOX0_CLKGATE_DIS |
			    GAMTLBKCR_CLKGATE_DIS |
			    GAMTLBGUC_CLKGATE_DIS |
			    GAMTLBBLT_CLKGATE_DIS);
		wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS |
			    GAMTLBGFXA1_CLKGATE_DIS |
			    GAMTLBCOMPA0_CLKGATE_DIS |
			    GAMTLBCOMPA1_CLKGATE_DIS |
			    GAMTLBCOMPB0_CLKGATE_DIS |
			    GAMTLBCOMPB1_CLKGATE_DIS |
			    GAMTLBCOMPC0_CLKGATE_DIS |
			    GAMTLBCOMPC1_CLKGATE_DIS |
			    GAMTLBCOMPD0_CLKGATE_DIS |
			    GAMTLBCOMPD1_CLKGATE_DIS |
			    GAMTLBMERT_CLKGATE_DIS   |
			    GAMTLBVEBOX3_CLKGATE_DIS |
			    GAMTLBVEBOX2_CLKGATE_DIS |
			    GAMTLBVEBOX1_CLKGATE_DIS |
			    GAMTLBVEBOX0_CLKGATE_DIS);
	}

	/* Wa_16012725990:xehpsdv */
	if (IS_XEHPSDV_GRAPHICS_STEP(i915, STEP_A1, STEP_FOREVER))
		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, VFUNIT_CLKGATE_DIS);

	/* Wa_14011060649:xehpsdv */
	wa_14011060649(gt, wal);

	/* Wa_14012362059:xehpsdv */
	wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB);

	/* Wa_14014368820:xehpsdv */
	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);

	/* Wa_14010670810:xehpsdv */
	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
}
1594
1595static void
1596dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1597{
1598	xehp_init_mcr(gt, wal);
1599
1600	/* Wa_14011060649:dg2 */
1601	wa_14011060649(gt, wal);
1602
1603	if (IS_DG2_G10(gt->i915)) {
1604		/* Wa_22010523718:dg2 */
1605		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE,
1606			    CG3DDISCFEG_CLKGATE_DIS);
1607
1608		/* Wa_14011006942:dg2 */
1609		wa_mcr_write_or(wal, GEN11_SUBSLICE_UNIT_LEVEL_CLKGATE,
1610				DSS_ROUTER_CLKGATE_DIS);
1611	}
1612
1613	/* Wa_14014830051:dg2 */
1614	wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1615
1616	/* Wa_14015795083 */
1617	wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1618
1619	/* Wa_18018781329 */
1620	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1621	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1622	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1623	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1624
1625	/* Wa_1509235366:dg2 */
1626	wa_mcr_write_or(wal, XEHP_GAMCNTRL_CTRL,
1627			INVALIDATION_BROADCAST_MODE_DIS | GLOBAL_INVALIDATION_MODE);
1628
1629	/* Wa_14010648519:dg2 */
1630	wa_mcr_write_or(wal, XEHP_L3NODEARBCFG, XEHP_LNESPARE);
1631}
1632
1633static void
1634pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1635{
1636	pvc_init_mcr(gt, wal);
1637
1638	/* Wa_14015795083 */
1639	wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1640
1641	/* Wa_18018781329 */
1642	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1643	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1644	wa_mcr_write_or(wal, XEHP_VDBX_MOD_CTRL, FORCE_MISS_FTLB);
1645	wa_mcr_write_or(wal, XEHP_VEBX_MOD_CTRL, FORCE_MISS_FTLB);
1646
1647	/* Wa_16016694945 */
1648	wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_OVRLSCCC);
1649}
1650
1651static void
1652xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1653{
1654	/* Wa_14018575942 / Wa_18018781329 */
1655	wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB);
1656	wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB);
1657
1658	/* Wa_22016670082 */
1659	wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE);
1660
1661	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
1662	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
1663		/* Wa_14014830051 */
1664		wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN);
1665
1666		/* Wa_14015795083 */
1667		wa_write_clr(wal, GEN7_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
1668	}
1669
1670	/*
	 * Unlike older platforms, we no longer set up implicit steering here;
1672	 * all MCR accesses are explicitly steered.
1673	 */
1674	debug_dump_steering(gt);
1675}
1676
1677static void
1678xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
1679{
1680	/*
1681	 * Wa_14018778641
1682	 * Wa_18018781329
1683	 *
1684	 * Note that although these registers are MCR on the primary
1685	 * GT, the media GT's versions are regular singleton registers.
1686	 */
1687	wa_write_or(wal, XELPMP_GSC_MOD_CTRL, FORCE_MISS_FTLB);
1688
1689	debug_dump_steering(gt);
1690}
1691
1692/*
1693 * The bspec performance guide has recommended MMIO tuning settings.  These
1694 * aren't truly "workarounds" but we want to program them through the
1695 * workaround infrastructure to make sure they're (re)applied at the proper
1696 * times.
1697 *
1698 * The programming in this function is for settings that persist through
1699 * engine resets and also are not part of any engine's register state context.
1700 * I.e., settings that only need to be re-applied in the event of a full GT
1701 * reset.
1702 */
1703static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal)
1704{
1705	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) {
1706		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1707		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1708	}
1709
1710	if (IS_PONTEVECCHIO(gt->i915)) {
1711		wa_mcr_write(wal, XEHPC_L3SCRUB,
1712			     SCRUB_CL_DWNGRADE_SHARED | SCRUB_RATE_4B_PER_CLK);
1713		wa_mcr_masked_en(wal, XEHPC_LNCFMISCCFGREG0, XEHPC_HOSTCACHEEN);
1714	}
1715
1716	if (IS_DG2(gt->i915)) {
1717		wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS);
1718		wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS);
1719	}
1720}
1721
1722static void
1723gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
1724{
1725	struct drm_i915_private *i915 = gt->i915;
1726
1727	gt_tuning_settings(gt, wal);
1728
1729	if (gt->type == GT_MEDIA) {
1730		if (MEDIA_VER(i915) >= 13)
1731			xelpmp_gt_workarounds_init(gt, wal);
1732		else
1733			MISSING_CASE(MEDIA_VER(i915));
1734
1735		return;
1736	}
1737
1738	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)))
1739		xelpg_gt_workarounds_init(gt, wal);
1740	else if (IS_PONTEVECCHIO(i915))
1741		pvc_gt_workarounds_init(gt, wal);
1742	else if (IS_DG2(i915))
1743		dg2_gt_workarounds_init(gt, wal);
1744	else if (IS_XEHPSDV(i915))
1745		xehpsdv_gt_workarounds_init(gt, wal);
1746	else if (IS_DG1(i915))
1747		dg1_gt_workarounds_init(gt, wal);
1748	else if (GRAPHICS_VER(i915) == 12)
1749		gen12_gt_workarounds_init(gt, wal);
1750	else if (GRAPHICS_VER(i915) == 11)
1751		icl_gt_workarounds_init(gt, wal);
1752	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
1753		cfl_gt_workarounds_init(gt, wal);
1754	else if (IS_GEMINILAKE(i915))
1755		glk_gt_workarounds_init(gt, wal);
1756	else if (IS_KABYLAKE(i915))
1757		kbl_gt_workarounds_init(gt, wal);
1758	else if (IS_BROXTON(i915))
1759		gen9_gt_workarounds_init(gt, wal);
1760	else if (IS_SKYLAKE(i915))
1761		skl_gt_workarounds_init(gt, wal);
1762	else if (IS_HASWELL(i915))
1763		hsw_gt_workarounds_init(gt, wal);
1764	else if (IS_VALLEYVIEW(i915))
1765		vlv_gt_workarounds_init(gt, wal);
1766	else if (IS_IVYBRIDGE(i915))
1767		ivb_gt_workarounds_init(gt, wal);
1768	else if (GRAPHICS_VER(i915) == 6)
1769		snb_gt_workarounds_init(gt, wal);
1770	else if (GRAPHICS_VER(i915) == 5)
1771		ilk_gt_workarounds_init(gt, wal);
1772	else if (IS_G4X(i915))
1773		g4x_gt_workarounds_init(gt, wal);
1774	else if (GRAPHICS_VER(i915) == 4)
1775		gen4_gt_workarounds_init(gt, wal);
1776	else if (GRAPHICS_VER(i915) <= 8)
1777		;
1778	else
1779		MISSING_CASE(GRAPHICS_VER(i915));
1780}
1781
1782void intel_gt_init_workarounds(struct intel_gt *gt)
1783{
1784	struct i915_wa_list *wal = &gt->wa_list;
1785
1786	wa_init_start(wal, gt, "GT", "global");
1787	gt_init_workarounds(gt, wal);
1788	wa_init_finish(wal);
1789}
1790
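/*
 * Verification is masked by wa->read: only the bits a workaround actually
 * claims to affect are compared against the expected value, so unrelated
 * bits may change without triggering the "workaround lost" error below.
 */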
1791static bool
1792wa_verify(struct intel_gt *gt, const struct i915_wa *wa, u32 cur,
1793	  const char *name, const char *from)
1794{
1795	if ((cur ^ wa->set) & wa->read) {
1796		drm_err(&gt->i915->drm,
1797			"%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n",
1798			name, from, i915_mmio_reg_offset(wa->reg),
1799			cur, cur & wa->read, wa->set & wa->read);
1800
1801		return false;
1802	}
1803
1804	return true;
1805}
1806
1807static void wa_list_apply(const struct i915_wa_list *wal)
1808{
1809	struct intel_gt *gt = wal->gt;
1810	struct intel_uncore *uncore = gt->uncore;
1811	enum forcewake_domains fw;
1812	unsigned long flags;
1813	struct i915_wa *wa;
1814	unsigned int i;
1815
1816	if (!wal->count)
1817		return;
1818
1819	fw = wal_get_fw_for_rmw(uncore, wal);
1820
1821	intel_gt_mcr_lock(gt, &flags);
1822	spin_lock(&uncore->lock);
1823	intel_uncore_forcewake_get__locked(uncore, fw);
1824
1825	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
1826		u32 val, old = 0;
1827
1828		/* open-coded rmw due to steering */
1829		if (wa->clr)
1830			old = wa->is_mcr ?
1831				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1832				intel_uncore_read_fw(uncore, wa->reg);
1833		val = (old & ~wa->clr) | wa->set;
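		/*
		 * Write unconditionally when there was nothing to clear (we
		 * never read the register in that case); otherwise skip
		 * writes that would be no-ops.
		 */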
1834		if (val != old || !wa->clr) {
1835			if (wa->is_mcr)
1836				intel_gt_mcr_multicast_write_fw(gt, wa->mcr_reg, val);
1837			else
1838				intel_uncore_write_fw(uncore, wa->reg, val);
1839		}
1840
1841		if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
1842			u32 val = wa->is_mcr ?
1843				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1844				intel_uncore_read_fw(uncore, wa->reg);
1845
1846			wa_verify(gt, wa, val, wal->name, "application");
1847		}
1848	}
1849
1850	intel_uncore_forcewake_put__locked(uncore, fw);
1851	spin_unlock(&uncore->lock);
1852	intel_gt_mcr_unlock(gt, flags);
1853}
1854
1855void intel_gt_apply_workarounds(struct intel_gt *gt)
1856{
1857	wa_list_apply(&gt->wa_list);
1858}
1859
1860static bool wa_list_verify(struct intel_gt *gt,
1861			   const struct i915_wa_list *wal,
1862			   const char *from)
1863{
1864	struct intel_uncore *uncore = gt->uncore;
1865	struct i915_wa *wa;
1866	enum forcewake_domains fw;
1867	unsigned long flags;
1868	unsigned int i;
1869	bool ok = true;
1870
1871	fw = wal_get_fw_for_rmw(uncore, wal);
1872
1873	intel_gt_mcr_lock(gt, &flags);
1874	spin_lock(&uncore->lock);
1875	intel_uncore_forcewake_get__locked(uncore, fw);
1876
1877	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
1878		ok &= wa_verify(wal->gt, wa, wa->is_mcr ?
1879				intel_gt_mcr_read_any_fw(gt, wa->mcr_reg) :
1880				intel_uncore_read_fw(uncore, wa->reg),
1881				wal->name, from);
1882
1883	intel_uncore_forcewake_put__locked(uncore, fw);
1884	spin_unlock(&uncore->lock);
1885	intel_gt_mcr_unlock(gt, flags);
1886
1887	return ok;
1888}
1889
1890bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from)
1891{
1892	return wa_list_verify(gt, &gt->wa_list, from);
1893}
1894
1895__maybe_unused
1896static bool is_nonpriv_flags_valid(u32 flags)
1897{
1898	/* Check only valid flag bits are set */
1899	if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID)
1900		return false;
1901
	/* NB: Only 3 of the 4 enum values are valid for the access field */
1903	if ((flags & RING_FORCE_TO_NONPRIV_ACCESS_MASK) ==
1904	    RING_FORCE_TO_NONPRIV_ACCESS_INVALID)
1905		return false;
1906
1907	return true;
1908}
1909
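/*
 * Implementation note: a RING_FORCE_TO_NONPRIV slot only decodes a
 * dword-aligned MMIO address, so the access-mode and range flags live in
 * otherwise-unused bits of the same dword. We fold the flags into the
 * register offset here and intel_engine_apply_whitelist() later writes the
 * combined value verbatim into the slot.
 */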
1910static void
1911whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags)
1912{
1913	struct i915_wa wa = {
1914		.reg = reg
1915	};
1916
1917	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1918		return;
1919
1920	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1921		return;
1922
1923	wa.reg.reg |= flags;
1924	_wa_add(wal, &wa);
1925}
1926
1927static void
1928whitelist_mcr_reg_ext(struct i915_wa_list *wal, i915_mcr_reg_t reg, u32 flags)
1929{
1930	struct i915_wa wa = {
1931		.mcr_reg = reg,
1932		.is_mcr = 1,
1933	};
1934
1935	if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS))
1936		return;
1937
1938	if (GEM_DEBUG_WARN_ON(!is_nonpriv_flags_valid(flags)))
1939		return;
1940
1941	wa.mcr_reg.reg |= flags;
1942	_wa_add(wal, &wa);
1943}
1944
1945static void
1946whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg)
1947{
1948	whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1949}
1950
1951static void
1952whitelist_mcr_reg(struct i915_wa_list *wal, i915_mcr_reg_t reg)
1953{
1954	whitelist_mcr_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW);
1955}
1956
1957static void gen9_whitelist_build(struct i915_wa_list *w)
1958{
1959	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
1960	whitelist_reg(w, GEN9_CTX_PREEMPT_REG);
1961
1962	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */
1963	whitelist_reg(w, GEN8_CS_CHICKEN1);
1964
1965	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */
1966	whitelist_reg(w, GEN8_HDC_CHICKEN1);
1967
1968	/* WaSendPushConstantsFromMMIO:skl,bxt */
1969	whitelist_reg(w, COMMON_SLICE_CHICKEN2);
1970}
1971
1972static void skl_whitelist_build(struct intel_engine_cs *engine)
1973{
1974	struct i915_wa_list *w = &engine->whitelist;
1975
1976	if (engine->class != RENDER_CLASS)
1977		return;
1978
1979	gen9_whitelist_build(w);
1980
1981	/* WaDisableLSQCROPERFforOCL:skl */
1982	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
1983}
1984
1985static void bxt_whitelist_build(struct intel_engine_cs *engine)
1986{
1987	if (engine->class != RENDER_CLASS)
1988		return;
1989
1990	gen9_whitelist_build(&engine->whitelist);
1991}
1992
1993static void kbl_whitelist_build(struct intel_engine_cs *engine)
1994{
1995	struct i915_wa_list *w = &engine->whitelist;
1996
1997	if (engine->class != RENDER_CLASS)
1998		return;
1999
2000	gen9_whitelist_build(w);
2001
2002	/* WaDisableLSQCROPERFforOCL:kbl */
2003	whitelist_mcr_reg(w, GEN8_L3SQCREG4);
2004}
2005
2006static void glk_whitelist_build(struct intel_engine_cs *engine)
2007{
2008	struct i915_wa_list *w = &engine->whitelist;
2009
2010	if (engine->class != RENDER_CLASS)
2011		return;
2012
2013	gen9_whitelist_build(w);
2014
2015	/* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */
2016	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2017}
2018
2019static void cfl_whitelist_build(struct intel_engine_cs *engine)
2020{
2021	struct i915_wa_list *w = &engine->whitelist;
2022
2023	if (engine->class != RENDER_CLASS)
2024		return;
2025
2026	gen9_whitelist_build(w);
2027
2028	/*
2029	 * WaAllowPMDepthAndInvocationCountAccessFromUMD:cfl,whl,cml,aml
2030	 *
	 * This covers 4 registers which are next to one another:
2032	 *   - PS_INVOCATION_COUNT
2033	 *   - PS_INVOCATION_COUNT_UDW
2034	 *   - PS_DEPTH_COUNT
2035	 *   - PS_DEPTH_COUNT_UDW
2036	 */
2037	whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2038			  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2039			  RING_FORCE_TO_NONPRIV_RANGE_4);
2040}
2041
2042static void allow_read_ctx_timestamp(struct intel_engine_cs *engine)
2043{
2044	struct i915_wa_list *w = &engine->whitelist;
2045
2046	if (engine->class != RENDER_CLASS)
2047		whitelist_reg_ext(w,
2048				  RING_CTX_TIMESTAMP(engine->mmio_base),
2049				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2050}
2051
2052static void cml_whitelist_build(struct intel_engine_cs *engine)
2053{
2054	allow_read_ctx_timestamp(engine);
2055
2056	cfl_whitelist_build(engine);
2057}
2058
2059static void icl_whitelist_build(struct intel_engine_cs *engine)
2060{
2061	struct i915_wa_list *w = &engine->whitelist;
2062
2063	allow_read_ctx_timestamp(engine);
2064
2065	switch (engine->class) {
2066	case RENDER_CLASS:
2067		/* WaAllowUMDToModifyHalfSliceChicken7:icl */
2068		whitelist_mcr_reg(w, GEN9_HALF_SLICE_CHICKEN7);
2069
2070		/* WaAllowUMDToModifySamplerMode:icl */
2071		whitelist_mcr_reg(w, GEN10_SAMPLER_MODE);
2072
2073		/* WaEnableStateCacheRedirectToCS:icl */
2074		whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
2075
2076		/*
2077		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:icl
2078		 *
		 * This covers 4 registers which are next to one another:
2080		 *   - PS_INVOCATION_COUNT
2081		 *   - PS_INVOCATION_COUNT_UDW
2082		 *   - PS_DEPTH_COUNT
2083		 *   - PS_DEPTH_COUNT_UDW
2084		 */
2085		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2086				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2087				  RING_FORCE_TO_NONPRIV_RANGE_4);
2088		break;
2089
2090	case VIDEO_DECODE_CLASS:
2091		/* hucStatusRegOffset */
2092		whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base),
2093				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2094		/* hucUKernelHdrInfoRegOffset */
2095		whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base),
2096				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2097		/* hucStatus2RegOffset */
2098		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
2099				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
2100		break;
2101
2102	default:
2103		break;
2104	}
2105}
2106
2107static void tgl_whitelist_build(struct intel_engine_cs *engine)
2108{
2109	struct i915_wa_list *w = &engine->whitelist;
2110
2111	allow_read_ctx_timestamp(engine);
2112
2113	switch (engine->class) {
2114	case RENDER_CLASS:
2115		/*
2116		 * WaAllowPMDepthAndInvocationCountAccessFromUMD:tgl
2117		 * Wa_1408556865:tgl
2118		 *
		 * This covers 4 registers which are next to one another:
2120		 *   - PS_INVOCATION_COUNT
2121		 *   - PS_INVOCATION_COUNT_UDW
2122		 *   - PS_DEPTH_COUNT
2123		 *   - PS_DEPTH_COUNT_UDW
2124		 */
2125		whitelist_reg_ext(w, PS_INVOCATION_COUNT,
2126				  RING_FORCE_TO_NONPRIV_ACCESS_RD |
2127				  RING_FORCE_TO_NONPRIV_RANGE_4);
2128
2129		/*
2130		 * Wa_1808121037:tgl
2131		 * Wa_14012131227:dg1
2132		 * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p
2133		 */
2134		whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1);
2135
2136		/* Wa_1806527549:tgl */
2137		whitelist_reg(w, HIZ_CHICKEN);
2138
2139		/* Required by recommended tuning setting (not a workaround) */
2140		whitelist_reg(w, GEN11_COMMON_SLICE_CHICKEN3);
2141
2142		break;
2143	default:
2144		break;
2145	}
2146}
2147
2148static void dg2_whitelist_build(struct intel_engine_cs *engine)
2149{
2150	struct i915_wa_list *w = &engine->whitelist;
2151
2152	switch (engine->class) {
2153	case RENDER_CLASS:
2154		/* Required by recommended tuning setting (not a workaround) */
2155		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2156
2157		break;
2158	default:
2159		break;
2160	}
2161}
2162
2163static void blacklist_trtt(struct intel_engine_cs *engine)
2164{
2165	struct i915_wa_list *w = &engine->whitelist;
2166
2167	/*
2168	 * Prevent read/write access to [0x4400, 0x4600) which covers
2169	 * the TRTT range across all engines. Note that normally userspace
2170	 * cannot access the other engines' trtt control, but for simplicity
2171	 * we cover the entire range on each engine.
2172	 */
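	/*
	 * Each RANGE_64 entry denies 64 consecutive dwords (0x100 bytes), so
	 * the two entries below at 0x4400 and 0x4500 cover the full
	 * 0x200-byte window.
	 */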
2173	whitelist_reg_ext(w, _MMIO(0x4400),
2174			  RING_FORCE_TO_NONPRIV_DENY |
2175			  RING_FORCE_TO_NONPRIV_RANGE_64);
2176	whitelist_reg_ext(w, _MMIO(0x4500),
2177			  RING_FORCE_TO_NONPRIV_DENY |
2178			  RING_FORCE_TO_NONPRIV_RANGE_64);
2179}
2180
2181static void pvc_whitelist_build(struct intel_engine_cs *engine)
2182{
2183	/* Wa_16014440446:pvc */
2184	blacklist_trtt(engine);
2185}
2186
2187static void xelpg_whitelist_build(struct intel_engine_cs *engine)
2188{
2189	struct i915_wa_list *w = &engine->whitelist;
2190
2191	switch (engine->class) {
2192	case RENDER_CLASS:
2193		/* Required by recommended tuning setting (not a workaround) */
2194		whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3);
2195
2196		break;
2197	default:
2198		break;
2199	}
2200}
2201
2202void intel_engine_init_whitelist(struct intel_engine_cs *engine)
2203{
2204	struct drm_i915_private *i915 = engine->i915;
2205	struct i915_wa_list *w = &engine->whitelist;
2206
2207	wa_init_start(w, engine->gt, "whitelist", engine->name);
2208
2209	if (engine->gt->type == GT_MEDIA)
2210		; /* none yet */
2211	else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74)))
2212		xelpg_whitelist_build(engine);
2213	else if (IS_PONTEVECCHIO(i915))
2214		pvc_whitelist_build(engine);
2215	else if (IS_DG2(i915))
2216		dg2_whitelist_build(engine);
2217	else if (IS_XEHPSDV(i915))
2218		; /* none needed */
2219	else if (GRAPHICS_VER(i915) == 12)
2220		tgl_whitelist_build(engine);
2221	else if (GRAPHICS_VER(i915) == 11)
2222		icl_whitelist_build(engine);
2223	else if (IS_COMETLAKE(i915))
2224		cml_whitelist_build(engine);
2225	else if (IS_COFFEELAKE(i915))
2226		cfl_whitelist_build(engine);
2227	else if (IS_GEMINILAKE(i915))
2228		glk_whitelist_build(engine);
2229	else if (IS_KABYLAKE(i915))
2230		kbl_whitelist_build(engine);
2231	else if (IS_BROXTON(i915))
2232		bxt_whitelist_build(engine);
2233	else if (IS_SKYLAKE(i915))
2234		skl_whitelist_build(engine);
2235	else if (GRAPHICS_VER(i915) <= 8)
2236		;
2237	else
2238		MISSING_CASE(GRAPHICS_VER(i915));
2239
2240	wa_init_finish(w);
2241}
2242
2243void intel_engine_apply_whitelist(struct intel_engine_cs *engine)
2244{
2245	const struct i915_wa_list *wal = &engine->whitelist;
2246	struct intel_uncore *uncore = engine->uncore;
2247	const u32 base = engine->mmio_base;
2248	struct i915_wa *wa;
2249	unsigned int i;
2250
2251	if (!wal->count)
2252		return;
2253
2254	for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
2255		intel_uncore_write(uncore,
2256				   RING_FORCE_TO_NONPRIV(base, i),
2257				   i915_mmio_reg_offset(wa->reg));
2258
2259	/* And clear the rest just in case of garbage */
2260	for (; i < RING_MAX_NONPRIV_SLOTS; i++)
2261		intel_uncore_write(uncore,
2262				   RING_FORCE_TO_NONPRIV(base, i),
2263				   i915_mmio_reg_offset(RING_NOPID(base)));
2264}
2265
/*
 * engine_fake_wa_init(), a placeholder to program registers that are not
 * part of an official workaround defined by the hardware team.
 * Adding the programming of those registers to a workaround list lets us
 * reuse the wa framework for proper application and verification.
 */
2273static void
2274engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2275{
2276	u8 mocs_w, mocs_r;
2277
2278	/*
2279	 * RING_CMD_CCTL specifies the default MOCS entry that will be used
2280	 * by the command streamer when executing commands that don't have
2281	 * a way to explicitly specify a MOCS setting.  The default should
2282	 * usually reference whichever MOCS entry corresponds to uncached
2283	 * behavior, although use of a WB cached entry is recommended by the
2284	 * spec in certain circumstances on specific platforms.
2285	 */
2286	if (GRAPHICS_VER(engine->i915) >= 12) {
2287		mocs_r = engine->gt->mocs.uc_index;
2288		mocs_w = engine->gt->mocs.uc_index;
2289
2290		if (HAS_L3_CCS_READ(engine->i915) &&
2291		    engine->class == COMPUTE_CLASS) {
2292			mocs_r = engine->gt->mocs.wb_index;
2293
2294			/*
2295			 * Even on the few platforms where MOCS 0 is a
2296			 * legitimate table entry, it's never the correct
2297			 * setting to use here; we can assume the MOCS init
2298			 * just forgot to initialize wb_index.
2299			 */
2300			drm_WARN_ON(&engine->i915->drm, mocs_r == 0);
2301		}
2302
2303		wa_masked_field_set(wal,
2304				    RING_CMD_CCTL(engine->mmio_base),
2305				    CMD_CCTL_MOCS_MASK,
2306				    CMD_CCTL_MOCS_OVERRIDE(mocs_w, mocs_r));
2307	}
2308}
2309
2310static void
2311rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2312{
2313	struct drm_i915_private *i915 = engine->i915;
2314	struct intel_gt *gt = engine->gt;
2315
2316	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2317	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) {
2318		/* Wa_22014600077 */
2319		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2320				 ENABLE_EU_COUNT_FOR_TDL_FLUSH);
2321	}
2322
2323	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2324	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2325	    IS_DG2(i915)) {
2326		/* Wa_1509727124 */
2327		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2328				 SC_DISABLE_POWER_OPTIMIZATION_EBB);
2329	}
2330
2331	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2332	    IS_DG2(i915)) {
2333		/* Wa_22012856258 */
2334		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2335				 GEN12_DISABLE_READ_SUPPRESSION);
2336	}
2337
2338	if (IS_DG2(i915)) {
2339		/*
2340		 * Wa_22010960976:dg2
2341		 * Wa_14013347512:dg2
2342		 */
2343		wa_mcr_masked_dis(wal, XEHP_HDC_CHICKEN0,
2344				  LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK);
2345	}
2346
2347	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) ||
2348	    IS_DG2(i915)) {
2349		/* Wa_14015150844 */
2350		wa_mcr_add(wal, XEHP_HDC_CHICKEN0, 0,
2351			   _MASKED_BIT_ENABLE(DIS_ATOMIC_CHAINING_TYPED_WRITES),
2352			   0, true);
2353	}
2354
2355	if (IS_DG2_G11(i915) || IS_DG2_G10(i915)) {
2356		/* Wa_22014600077:dg2 */
2357		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2358			   _MASKED_BIT_ENABLE(ENABLE_EU_COUNT_FOR_TDL_FLUSH),
2359			   0 /* Wa_14012342262 write-only reg, so skip verification */,
2360			   true);
2361	}
2362
2363	if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2364	    IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2365		/*
2366		 * Wa_1606700617:tgl,dg1,adl-p
2367		 * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p
2368		 * Wa_14010826681:tgl,dg1,rkl,adl-p
2369		 * Wa_18019627453:dg2
2370		 */
2371		wa_masked_en(wal,
2372			     GEN9_CS_DEBUG_MODE1,
2373			     FF_DOP_CLOCK_GATE_DISABLE);
2374	}
2375
2376	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) ||
2377	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2378		/* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */
2379		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_EARLY_READ);
2380
2381		/*
2382		 * Wa_1407928979:tgl A*
2383		 * Wa_18011464164:tgl[B0+],dg1[B0+]
2384		 * Wa_22010931296:tgl[B0+],dg1[B0+]
2385		 * Wa_14010919138:rkl,dg1,adl-s,adl-p
2386		 */
2387		wa_write_or(wal, GEN7_FF_THREAD_MODE,
2388			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2389
2390		/* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */
2391		wa_mcr_masked_en(wal,
2392				 GEN10_SAMPLER_MODE,
2393				 ENABLE_SMALLPL);
2394	}
2395
2396	if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) ||
2397	    IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) {
2398		/* Wa_1409804808 */
2399		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2,
2400				 GEN12_PUSH_CONST_DEREF_HOLD_DIS);
2401
2402		/* Wa_14010229206 */
2403		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
2404	}
2405
2406	if (IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || IS_ALDERLAKE_P(i915)) {
2407		/*
2408		 * Wa_1607297627
2409		 *
2410		 * On TGL and RKL there are multiple entries for this WA in the
2411		 * BSpec; some indicate this is an A0-only WA, others indicate
		 * it applies to all steppings, so we trust the "all
		 * steppings" entries.
2413		 */
2414		wa_masked_en(wal,
2415			     RING_PSMI_CTL(RENDER_RING_BASE),
2416			     GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE |
2417			     GEN8_RC_SEMA_IDLE_MSG_DISABLE);
2418	}
2419
2420	if (GRAPHICS_VER(i915) == 11) {
		/* This is not a Wa. Enable for better image quality */
2422		wa_masked_en(wal,
2423			     _3D_CHICKEN3,
2424			     _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE);
2425
2426		/*
2427		 * Wa_1405543622:icl
2428		 * Formerly known as WaGAPZPriorityScheme
2429		 */
2430		wa_write_or(wal,
2431			    GEN8_GARBCNTL,
2432			    GEN11_ARBITRATION_PRIO_ORDER_MASK);
2433
2434		/*
2435		 * Wa_1604223664:icl
2436		 * Formerly known as WaL3BankAddressHashing
2437		 */
2438		wa_write_clr_set(wal,
2439				 GEN8_GARBCNTL,
2440				 GEN11_HASH_CTRL_EXCL_MASK,
2441				 GEN11_HASH_CTRL_EXCL_BIT0);
2442		wa_write_clr_set(wal,
2443				 GEN11_GLBLINVL,
2444				 GEN11_BANK_HASH_ADDR_EXCL_MASK,
2445				 GEN11_BANK_HASH_ADDR_EXCL_BIT0);
2446
2447		/*
2448		 * Wa_1405733216:icl
2449		 * Formerly known as WaDisableCleanEvicts
2450		 */
2451		wa_mcr_write_or(wal,
2452				GEN8_L3SQCREG4,
2453				GEN11_LQSC_CLEAN_EVICT_DISABLE);
2454
2455		/* Wa_1606682166:icl */
2456		wa_write_or(wal,
2457			    GEN7_SARCHKMD,
2458			    GEN7_DISABLE_SAMPLER_PREFETCH);
2459
2460		/* Wa_1409178092:icl */
2461		wa_mcr_write_clr_set(wal,
2462				     GEN11_SCRATCH2,
2463				     GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE,
2464				     0);
2465
2466		/* WaEnable32PlaneMode:icl */
2467		wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS,
2468			     GEN11_ENABLE_32_PLANE_MODE);
2469
2470		/*
2471		 * Wa_1408767742:icl[a2..forever],ehl[all]
2472		 * Wa_1605460711:icl[a0..c0]
2473		 */
2474		wa_write_or(wal,
2475			    GEN7_FF_THREAD_MODE,
2476			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
2477
2478		/* Wa_22010271021 */
2479		wa_masked_en(wal,
2480			     GEN9_CS_DEBUG_MODE1,
2481			     FF_DOP_CLOCK_GATE_DISABLE);
2482	}
2483
2484	/*
2485	 * Intel platforms that support fine-grained preemption (i.e., gen9 and
2486	 * beyond) allow the kernel-mode driver to choose between two different
2487	 * options for controlling preemption granularity and behavior.
2488	 *
2489	 * Option 1 (hardware default):
2490	 *   Preemption settings are controlled in a global manner via
2491	 *   kernel-only register CS_DEBUG_MODE1 (0x20EC).  Any granularity
2492	 *   and settings chosen by the kernel-mode driver will apply to all
2493	 *   userspace clients.
2494	 *
2495	 * Option 2:
2496	 *   Preemption settings are controlled on a per-context basis via
2497	 *   register CS_CHICKEN1 (0x2580).  CS_CHICKEN1 is saved/restored on
2498	 *   context switch and is writable by userspace (e.g., via
2499	 *   MI_LOAD_REGISTER_IMMEDIATE instructions placed in a batch buffer)
2500	 *   which allows different userspace drivers/clients to select
2501	 *   different settings, or to change those settings on the fly in
2502	 *   response to runtime needs.  This option was known by name
2503	 *   "FtrPerCtxtPreemptionGranularityControl" at one time, although
2504	 *   that name is somewhat misleading as other non-granularity
2505	 *   preemption settings are also impacted by this decision.
2506	 *
2507	 * On Linux, our policy has always been to let userspace drivers
2508	 * control preemption granularity/settings (Option 2).  This was
2509	 * originally mandatory on gen9 to prevent ABI breakage (old gen9
2510	 * userspace developed before object-level preemption was enabled would
2511	 * not behave well if i915 were to go with Option 1 and enable that
2512	 * preemption in a global manner).  On gen9 each context would have
2513	 * object-level preemption disabled by default (see
2514	 * WaDisable3DMidCmdPreemption in gen9_ctx_workarounds_init), but
2515	 * userspace drivers could opt-in to object-level preemption as they
2516	 * saw fit.  For post-gen9 platforms, we continue to utilize Option 2;
2517	 * even though it is no longer necessary for ABI compatibility when
2518	 * enabling a new platform, it does ensure that userspace will be able
2519	 * to implement any workarounds that show up requiring temporary
2520	 * adjustments to preemption behavior at runtime.
2521	 *
2522	 * Notes/Workarounds:
2523	 *  - Wa_14015141709:  On DG2 and early steppings of MTL,
2524	 *      CS_CHICKEN1[0] does not disable object-level preemption as
2525	 *      it is supposed to (nor does CS_DEBUG_MODE1[0] if we had been
2526	 *      using Option 1).  Effectively this means userspace is unable
2527	 *      to disable object-level preemption on these platforms/steppings
2528	 *      despite the setting here.
2529	 *
2530	 *  - Wa_16013994831:  May require that userspace program
2531	 *      CS_CHICKEN1[10] when certain runtime conditions are true.
2532	 *      Userspace requires Option 2 to be in effect for their update of
2533	 *      CS_CHICKEN1[10] to be effective.
2534	 *
2535	 * Other workarounds may appear in the future that will also require
2536	 * Option 2 behavior to allow proper userspace implementation.
2537	 */
2538	if (GRAPHICS_VER(i915) >= 9)
2539		wa_masked_en(wal,
2540			     GEN7_FF_SLICE_CS_CHICKEN1,
2541			     GEN9_FFSC_PERCTX_PREEMPT_CTRL);
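
	/*
	 * Purely illustrative (not programmed by the kernel): with Option 2
	 * in effect, a userspace batch could adjust its own context's
	 * preemption behavior along the lines of
	 *
	 *   MI_LOAD_REGISTER_IMM(1)
	 *   CS_CHICKEN1 (0x2580)
	 *   _MASKED_BIT_ENABLE(<desired preemption setting>)
	 *
	 * since CS_CHICKEN1 is saved/restored with the context image.
	 */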
2542
2543	if (IS_SKYLAKE(i915) ||
2544	    IS_KABYLAKE(i915) ||
2545	    IS_COFFEELAKE(i915) ||
2546	    IS_COMETLAKE(i915)) {
2547		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
2548		wa_write_or(wal,
2549			    GEN8_GARBCNTL,
2550			    GEN9_GAPS_TSV_CREDIT_DISABLE);
2551	}
2552
2553	if (IS_BROXTON(i915)) {
2554		/* WaDisablePooledEuLoadBalancingFix:bxt */
2555		wa_masked_en(wal,
2556			     FF_SLICE_CS_CHICKEN2,
2557			     GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE);
2558	}
2559
2560	if (GRAPHICS_VER(i915) == 9) {
2561		/* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */
2562		wa_masked_en(wal,
2563			     GEN9_CSFE_CHICKEN1_RCS,
2564			     GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE);
2565
2566		/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */
2567		wa_mcr_write_or(wal,
2568				BDW_SCRATCH1,
2569				GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
2570
2571		/* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */
2572		if (IS_GEN9_LP(i915))
2573			wa_mcr_write_clr_set(wal,
2574					     GEN8_L3SQCREG1,
2575					     L3_PRIO_CREDITS_MASK,
2576					     L3_GENERAL_PRIO_CREDITS(62) |
2577					     L3_HIGH_PRIO_CREDITS(2));
2578
2579		/* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */
2580		wa_mcr_write_or(wal,
2581				GEN8_L3SQCREG4,
2582				GEN8_LQSC_FLUSH_COHERENT_LINES);
2583
2584		/* Disable atomics in L3 to prevent unrecoverable hangs */
2585		wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1,
2586				 GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2587		wa_mcr_write_clr_set(wal, GEN8_L3SQCREG4,
2588				     GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0);
2589		wa_mcr_write_clr_set(wal, GEN9_SCRATCH1,
2590				     EVICTION_PERF_FIX_ENABLE, 0);
2591	}
2592
2593	if (IS_HASWELL(i915)) {
2594		/* WaSampleCChickenBitEnable:hsw */
2595		wa_masked_en(wal,
2596			     HSW_HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
2597
2598		wa_masked_dis(wal,
2599			      CACHE_MODE_0_GEN7,
2600			      /* enable HiZ Raw Stall Optimization */
2601			      HIZ_RAW_STALL_OPT_DISABLE);
2602	}
2603
2604	if (IS_VALLEYVIEW(i915)) {
2605		/* WaDisableEarlyCull:vlv */
2606		wa_masked_en(wal,
2607			     _3D_CHICKEN3,
2608			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2609
2610		/*
2611		 * WaVSThreadDispatchOverride:ivb,vlv
2612		 *
2613		 * This actually overrides the dispatch
2614		 * mode for all thread types.
2615		 */
2616		wa_write_clr_set(wal,
2617				 GEN7_FF_THREAD_MODE,
2618				 GEN7_FF_SCHED_MASK,
2619				 GEN7_FF_TS_SCHED_HW |
2620				 GEN7_FF_VS_SCHED_HW |
2621				 GEN7_FF_DS_SCHED_HW);
2622
2623		/* WaPsdDispatchEnable:vlv */
2624		/* WaDisablePSDDualDispatchEnable:vlv */
2625		wa_masked_en(wal,
2626			     GEN7_HALF_SLICE_CHICKEN1,
2627			     GEN7_MAX_PS_THREAD_DEP |
2628			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2629	}
2630
2631	if (IS_IVYBRIDGE(i915)) {
2632		/* WaDisableEarlyCull:ivb */
2633		wa_masked_en(wal,
2634			     _3D_CHICKEN3,
2635			     _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
2636
2637		if (0) { /* causes HiZ corruption on ivb:gt1 */
2638			/* enable HiZ Raw Stall Optimization */
2639			wa_masked_dis(wal,
2640				      CACHE_MODE_0_GEN7,
2641				      HIZ_RAW_STALL_OPT_DISABLE);
2642		}
2643
2644		/*
2645		 * WaVSThreadDispatchOverride:ivb,vlv
2646		 *
2647		 * This actually overrides the dispatch
2648		 * mode for all thread types.
2649		 */
2650		wa_write_clr_set(wal,
2651				 GEN7_FF_THREAD_MODE,
2652				 GEN7_FF_SCHED_MASK,
2653				 GEN7_FF_TS_SCHED_HW |
2654				 GEN7_FF_VS_SCHED_HW |
2655				 GEN7_FF_DS_SCHED_HW);
2656
2657		/* WaDisablePSDDualDispatchEnable:ivb */
2658		if (IS_IVB_GT1(i915))
2659			wa_masked_en(wal,
2660				     GEN7_HALF_SLICE_CHICKEN1,
2661				     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
2662	}
2663
2664	if (GRAPHICS_VER(i915) == 7) {
2665		/* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */
2666		wa_masked_en(wal,
2667			     RING_MODE_GEN7(RENDER_RING_BASE),
2668			     GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE);
2669
2670		/* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */
2671		wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
2672
2673		/*
2674		 * BSpec says this must be set, even though
2675		 * WaDisable4x2SubspanOptimization:ivb,hsw
2676		 * WaDisable4x2SubspanOptimization isn't listed for VLV.
2677		 */
2678		wa_masked_en(wal,
2679			     CACHE_MODE_1,
2680			     PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
2681
2682		/*
		 * BSpec recommends 8x4 when MSAA is used; however, in
		 * practice 16x4 seems fastest.
2685		 *
2686		 * Note that PS/WM thread counts depend on the WIZ hashing
2687		 * disable bit, which we don't touch here, but it's good
2688		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2689		 */
2690		wa_masked_field_set(wal,
2691				    GEN7_GT_MODE,
2692				    GEN6_WIZ_HASHING_MASK,
2693				    GEN6_WIZ_HASHING_16x4);
2694	}
2695
2696	if (IS_GRAPHICS_VER(i915, 6, 7))
2697		/*
2698		 * We need to disable the AsyncFlip performance optimisations in
2699		 * order to use MI_WAIT_FOR_EVENT within the CS. It should
2700		 * already be programmed to '1' on all products.
2701		 *
2702		 * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv
2703		 */
2704		wa_masked_en(wal,
2705			     RING_MI_MODE(RENDER_RING_BASE),
2706			     ASYNC_FLIP_PERF_DISABLE);
2707
2708	if (GRAPHICS_VER(i915) == 6) {
2709		/*
2710		 * Required for the hardware to program scanline values for
2711		 * waiting
2712		 * WaEnableFlushTlbInvalidationMode:snb
2713		 */
2714		wa_masked_en(wal,
2715			     GFX_MODE,
2716			     GFX_TLB_INVALIDATE_EXPLICIT);
2717
2718		/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
2719		wa_masked_en(wal,
2720			     _3D_CHICKEN,
2721			     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
2722
2723		wa_masked_en(wal,
2724			     _3D_CHICKEN3,
2725			     /* WaStripsFansDisableFastClipPerformanceFix:snb */
2726			     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
2727			     /*
2728			      * Bspec says:
2729			      * "This bit must be set if 3DSTATE_CLIP clip mode is set
2730			      * to normal and 3DSTATE_SF number of SF output attributes
2731			      * is more than 16."
2732			      */
2733			     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
2734
2735		/*
		 * BSpec recommends 8x4 when MSAA is used; however, in
		 * practice 16x4 seems fastest.
2738		 *
2739		 * Note that PS/WM thread counts depend on the WIZ hashing
2740		 * disable bit, which we don't touch here, but it's good
2741		 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
2742		 */
2743		wa_masked_field_set(wal,
2744				    GEN6_GT_MODE,
2745				    GEN6_WIZ_HASHING_MASK,
2746				    GEN6_WIZ_HASHING_16x4);
2747
2748		/* WaDisable_RenderCache_OperationalFlush:snb */
2749		wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
2750
2751		/*
2752		 * From the Sandybridge PRM, volume 1 part 3, page 24:
2753		 * "If this bit is set, STCunit will have LRA as replacement
2754		 *  policy. [...] This bit must be reset. LRA replacement
2755		 *  policy is not supported."
2756		 */
2757		wa_masked_dis(wal,
2758			      CACHE_MODE_0,
2759			      CM0_STC_EVICT_DISABLE_LRA_SNB);
2760	}
2761
2762	if (IS_GRAPHICS_VER(i915, 4, 6))
2763		/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
2764		wa_add(wal, RING_MI_MODE(RENDER_RING_BASE),
2765		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
2766		       /* XXX bit doesn't stick on Broadwater */
2767		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH, true);
2768
2769	if (GRAPHICS_VER(i915) == 4)
2770		/*
2771		 * Disable CONSTANT_BUFFER before it is loaded from the context
	 * image. As soon as it is loaded it is executed, and the stored
	 * address may no longer be valid, leading to a GPU hang.
2774		 *
2775		 * This imposes the requirement that userspace reload their
2776		 * CONSTANT_BUFFER on every batch, fortunately a requirement
2777		 * they are already accustomed to from before contexts were
2778		 * enabled.
2779		 */
2780		wa_add(wal, ECOSKPD(RENDER_RING_BASE),
2781		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
2782		       0 /* XXX bit doesn't stick on Broadwater */,
2783		       true);
2784}
2785
2786static void
2787xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2788{
2789	struct drm_i915_private *i915 = engine->i915;
2790
2791	/* WaKBLVECSSemaphoreWaitPoll:kbl */
2792	if (IS_KABYLAKE(i915) && IS_GRAPHICS_STEP(i915, STEP_A0, STEP_F0)) {
2793		wa_write(wal,
2794			 RING_SEMA_WAIT_POLL(engine->mmio_base),
2795			 1);
2796	}
2797}
2798
2799static void
2800ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2801{
2802	if (IS_PVC_CT_STEP(engine->i915, STEP_A0, STEP_C0)) {
2803		/* Wa_14014999345:pvc */
2804		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_ECC);
2805	}
2806}
2807
2808/*
2809 * The bspec performance guide has recommended MMIO tuning settings.  These
2810 * aren't truly "workarounds" but we want to program them with the same
2811 * workaround infrastructure to ensure that they're automatically added to
2812 * the GuC save/restore lists, re-applied at the right times, and checked for
2813 * any conflicting programming requested by real workarounds.
2814 *
2815 * Programming settings should be added here only if their registers are not
2816 * part of an engine's register state context.  If a register is part of a
2817 * context, then any tuning settings should be programmed in an appropriate
2818 * function invoked by __intel_engine_init_ctx_wa().
2819 */
2820static void
2821add_render_compute_tuning_settings(struct intel_gt *gt,
2822				   struct i915_wa_list *wal)
2823{
2824	struct drm_i915_private *i915 = gt->i915;
2825
2826	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
2827		wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512);
2828
2829	/*
2830	 * This tuning setting proves beneficial only on ATS-M designs; the
2831	 * default "age based" setting is optimal on regular DG2 and other
2832	 * platforms.
2833	 */
2834	if (INTEL_INFO(i915)->tuning_thread_rr_after_dep)
2835		wa_mcr_masked_field_set(wal, GEN9_ROW_CHICKEN4, THREAD_EX_ARB_MODE,
2836					THREAD_EX_ARB_MODE_RR_AFTER_DEP);
2837
2838	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
2839		wa_write_clr(wal, GEN8_GARBCNTL, GEN12_BUS_HASH_CTL_BIT_EXC);
2840}
2841
2842static void ccs_engine_wa_mode(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2843{
2844	struct intel_gt *gt = engine->gt;
2845	u32 mode;
2846
2847	if (!IS_DG2(gt->i915))
2848		return;
2849
2850	/*
2851	 * Wa_14019159160: This workaround, along with others, leads to
2852	 * significant challenges in utilizing load balancing among the
2853	 * CCS slices. Consequently, an architectural decision has been
2854	 * made to completely disable automatic CCS load balancing.
2855	 */
2856	wa_masked_en(wal, GEN12_RCU_MODE, XEHP_RCU_MODE_FIXED_SLICE_CCS_MODE);
2857
2858	/*
2859	 * After having disabled automatic load balancing we need to
	 * assign all slices to a single CCS. We will call it CCS mode 1.
2861	 */
2862	mode = intel_gt_apply_ccs_mode(gt);
2863	wa_masked_en(wal, XEHP_CCS_MODE, mode);
2864}
2865
2866/*
2867 * The workarounds in this function apply to shared registers in
2868 * the general render reset domain that aren't tied to a
2869 * specific engine.  Since all render+compute engines get reset
2870 * together, and the contents of these registers are lost during
2871 * the shared render domain reset, we'll define such workarounds
2872 * here and then add them to just a single RCS or CCS engine's
 * workaround list (whichever engine has the
 * I915_ENGINE_FIRST_RENDER_COMPUTE flag).
2874 */
2875static void
2876general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
2877{
2878	struct drm_i915_private *i915 = engine->i915;
2879	struct intel_gt *gt = engine->gt;
2880
2881	add_render_compute_tuning_settings(gt, wal);
2882
2883	if (GRAPHICS_VER(i915) >= 11) {
		/*
		 * This is not a Wa (although referred to as
		 * WaSetInidrectStateOverride in places); it allows
		 * applications that reference sampler states through
		 * the BindlessSamplerStateBaseAddress to have their
		 * border color relative to DynamicStateBaseAddress
		 * rather than BindlessSamplerStateBaseAddress.
2890		 *
2891		 * Otherwise SAMPLER_STATE border colors have to be
2892		 * copied in multiple heaps (DynamicStateBaseAddress &
2893		 * BindlessSamplerStateBaseAddress)
2894		 *
2895		 * BSpec: 46052
2896		 */
2897		wa_mcr_masked_en(wal,
2898				 GEN10_SAMPLER_MODE,
2899				 GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE);
2900	}
2901
2902	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) ||
2903	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) ||
2904	    IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) {
2905		/* Wa_14017856879 */
2906		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH);
2907
2908		/* Wa_14020495402 */
2909		wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, XELPG_DISABLE_TDL_SVHS_GATING);
2910	}
2911
2912	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2913	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2914		/*
2915		 * Wa_14017066071
2916		 * Wa_14017654203
2917		 */
2918		wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE,
2919				 MTL_DISABLE_SAMPLER_SC_OOO);
2920
2921	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))
2922		/* Wa_22015279794 */
2923		wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS,
2924				 DISABLE_PREFETCH_INTO_IC);
2925
2926	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2927	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2928	    IS_DG2(i915)) {
2929		/* Wa_22013037850 */
2930		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW,
2931				DISABLE_128B_EVICTION_COMMAND_UDW);
2932
2933		/* Wa_18017747507 */
2934		wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE);
2935	}
2936
2937	if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
2938	    IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
2939	    IS_PONTEVECCHIO(i915) ||
2940	    IS_DG2(i915)) {
2941		/* Wa_22014226127 */
2942		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE);
2943	}
2944
2945	if (IS_PONTEVECCHIO(i915) || IS_DG2(i915)) {
2946		/* Wa_14015227452:dg2,pvc */
2947		wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE);
2948
2949		/* Wa_16015675438:dg2,pvc */
2950		wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE);
2951	}
2952
2953	if (IS_DG2(i915)) {
2954		/*
2955		 * Wa_16011620976:dg2_g11
2956		 * Wa_22015475538:dg2
2957		 */
2958		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8);
2959	}
2960
2961	if (IS_DG2_G11(i915)) {
2962		/*
2963		 * Wa_22012826095:dg2
2964		 * Wa_22013059131:dg2
2965		 */
2966		wa_mcr_write_clr_set(wal, LSC_CHICKEN_BIT_0_UDW,
2967				     MAXREQS_PER_BANK,
2968				     REG_FIELD_PREP(MAXREQS_PER_BANK, 2));
2969
2970		/* Wa_22013059131:dg2 */
2971		wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0,
2972				FORCE_1_SUB_MESSAGE_PER_FRAGMENT);
2973
2974		/*
2975		 * Wa_22012654132
2976		 *
2977		 * Note that register 0xE420 is write-only and cannot be read
2978		 * back for verification on DG2 (due to Wa_14012342262), so
2979		 * we need to explicitly skip the readback.
2980		 */
2981		wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0,
2982			   _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC),
2983			   0 /* write-only, so skip validation */,
2984			   true);
2985	}
2986
2987	if (IS_XEHPSDV(i915)) {
2988		/* Wa_1409954639 */
2989		wa_mcr_masked_en(wal,
2990				 GEN8_ROW_CHICKEN,
2991				 SYSTOLIC_DOP_CLOCK_GATING_DIS);
2992
2993		/* Wa_1607196519 */
2994		wa_mcr_masked_en(wal,
2995				 GEN9_ROW_CHICKEN4,
2996				 GEN12_DISABLE_GRF_CLEAR);
2997
2998		/* Wa_14010449647:xehpsdv */
2999		wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1,
3000				 GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
3001	}
3002}
3003
3004static void
3005engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal)
3006{
3007	if (GRAPHICS_VER(engine->i915) < 4)
3008		return;
3009
3010	engine_fake_wa_init(engine, wal);
3011
3012	/*
	 * These are common workarounds that just need to be applied
3014	 * to a single RCS/CCS engine's workaround list since
3015	 * they're reset as part of the general render domain reset.
3016	 */
3017	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE) {
3018		general_render_compute_wa_init(engine, wal);
3019		ccs_engine_wa_mode(engine, wal);
3020	}
3021
3022	if (engine->class == COMPUTE_CLASS)
3023		ccs_engine_wa_init(engine, wal);
3024	else if (engine->class == RENDER_CLASS)
3025		rcs_engine_wa_init(engine, wal);
3026	else
3027		xcs_engine_wa_init(engine, wal);
3028}
3029
3030void intel_engine_init_workarounds(struct intel_engine_cs *engine)
3031{
3032	struct i915_wa_list *wal = &engine->wa_list;
3033
3034	wa_init_start(wal, engine->gt, "engine", engine->name);
3035	engine_init_workarounds(engine, wal);
3036	wa_init_finish(wal);
3037}
3038
3039void intel_engine_apply_workarounds(struct intel_engine_cs *engine)
3040{
3041	wa_list_apply(&engine->wa_list);
3042}
3043
3044static const struct i915_range mcr_ranges_gen8[] = {
3045	{ .start = 0x5500, .end = 0x55ff },
3046	{ .start = 0x7000, .end = 0x7fff },
3047	{ .start = 0x9400, .end = 0x97ff },
3048	{ .start = 0xb000, .end = 0xb3ff },
3049	{ .start = 0xe000, .end = 0xe7ff },
3050	{},
3051};
3052
3053static const struct i915_range mcr_ranges_gen12[] = {
3054	{ .start =  0x8150, .end =  0x815f },
3055	{ .start =  0x9520, .end =  0x955f },
3056	{ .start =  0xb100, .end =  0xb3ff },
3057	{ .start =  0xde80, .end =  0xe8ff },
3058	{ .start = 0x24a00, .end = 0x24a7f },
3059	{},
3060};
3061
3062static const struct i915_range mcr_ranges_xehp[] = {
3063	{ .start =  0x4000, .end =  0x4aff },
3064	{ .start =  0x5200, .end =  0x52ff },
3065	{ .start =  0x5400, .end =  0x7fff },
3066	{ .start =  0x8140, .end =  0x815f },
3067	{ .start =  0x8c80, .end =  0x8dff },
3068	{ .start =  0x94d0, .end =  0x955f },
3069	{ .start =  0x9680, .end =  0x96ff },
3070	{ .start =  0xb000, .end =  0xb3ff },
3071	{ .start =  0xc800, .end =  0xcfff },
3072	{ .start =  0xd800, .end =  0xd8ff },
3073	{ .start =  0xdc00, .end =  0xffff },
3074	{ .start = 0x17000, .end = 0x17fff },
3075	{ .start = 0x24a00, .end = 0x24a7f },
3076	{},
3077};
3078
3079static bool mcr_range(struct drm_i915_private *i915, u32 offset)
3080{
3081	const struct i915_range *mcr_ranges;
3082	int i;
3083
3084	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
3085		mcr_ranges = mcr_ranges_xehp;
3086	else if (GRAPHICS_VER(i915) >= 12)
3087		mcr_ranges = mcr_ranges_gen12;
3088	else if (GRAPHICS_VER(i915) >= 8)
3089		mcr_ranges = mcr_ranges_gen8;
3090	else
3091		return false;
3092
3093	/*
3094	 * Registers in these ranges are affected by the MCR selector
3095	 * which only controls CPU initiated MMIO. Routing does not
3096	 * work for CS access so we cannot verify them on this path.
3097	 */
3098	for (i = 0; mcr_ranges[i].start; i++)
3099		if (offset >= mcr_ranges[i].start &&
3100		    offset <= mcr_ranges[i].end)
3101			return true;
3102
3103	return false;
3104}
3105
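/*
 * Emit one MI_STORE_REGISTER_MEM per verifiable workaround register so that
 * the command streamer itself dumps the current values into the scratch
 * buffer; engine_wa_list_verify() then checks the results from the CPU.
 * MCR registers are skipped since the steering selector does not apply to
 * CS-initiated accesses.
 */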
3106static int
3107wa_list_srm(struct i915_request *rq,
3108	    const struct i915_wa_list *wal,
3109	    struct i915_vma *vma)
3110{
3111	struct drm_i915_private *i915 = rq->i915;
3112	unsigned int i, count = 0;
3113	const struct i915_wa *wa;
3114	u32 srm, *cs;
3115
3116	srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
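	/* gen8+ MI_SRM takes a 64-bit address, one extra dword of command */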
3117	if (GRAPHICS_VER(i915) >= 8)
3118		srm++;
3119
3120	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3121		if (!mcr_range(i915, i915_mmio_reg_offset(wa->reg)))
3122			count++;
3123	}
3124
3125	cs = intel_ring_begin(rq, 4 * count);
3126	if (IS_ERR(cs))
3127		return PTR_ERR(cs);
3128
3129	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3130		u32 offset = i915_mmio_reg_offset(wa->reg);
3131
3132		if (mcr_range(i915, offset))
3133			continue;
3134
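		/* always store to slot i so results[] lines up with wal->list */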
3135		*cs++ = srm;
3136		*cs++ = offset;
3137		*cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i;
3138		*cs++ = 0;
3139	}
3140	intel_ring_advance(rq, cs);
3141
3142	return 0;
3143}
3144
3145static int engine_wa_list_verify(struct intel_context *ce,
3146				 const struct i915_wa_list * const wal,
3147				 const char *from)
3148{
3149	const struct i915_wa *wa;
3150	struct i915_request *rq;
3151	struct i915_vma *vma;
3152	struct i915_gem_ww_ctx ww;
3153	unsigned int i;
3154	u32 *results;
3155	int err;
3156
3157	if (!wal->count)
3158		return 0;
3159
3160	vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm,
3161					   wal->count * sizeof(u32));
3162	if (IS_ERR(vma))
3163		return PTR_ERR(vma);
3164
3165	intel_engine_pm_get(ce->engine);
3166	i915_gem_ww_ctx_init(&ww, false);
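	/* standard ww transaction: back off and retry on -EDEADLK */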
3167retry:
3168	err = i915_gem_object_lock(vma->obj, &ww);
3169	if (err == 0)
3170		err = intel_context_pin_ww(ce, &ww);
3171	if (err)
3172		goto err_pm;
3173
3174	err = i915_vma_pin_ww(vma, &ww, 0, 0,
3175			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
3176	if (err)
3177		goto err_unpin;
3178
3179	rq = i915_request_create(ce);
3180	if (IS_ERR(rq)) {
3181		err = PTR_ERR(rq);
3182		goto err_vma;
3183	}
3184
3185	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
3186	if (err == 0)
3187		err = wa_list_srm(rq, wal, vma);
3188
3189	i915_request_get(rq);
3190	if (err)
3191		i915_request_set_error_once(rq, err);
3192	i915_request_add(rq);
3193
3194	if (err)
3195		goto err_rq;
3196
3197	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
3198		err = -ETIME;
3199		goto err_rq;
3200	}
3201
3202	results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
3203	if (IS_ERR(results)) {
3204		err = PTR_ERR(results);
3205		goto err_rq;
3206	}
3207
3208	err = 0;
3209	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
3210		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
3211			continue;
3212
3213		if (!wa_verify(wal->gt, wa, results[i], wal->name, from))
3214			err = -ENXIO;
3215	}
3216
3217	i915_gem_object_unpin_map(vma->obj);
3218
3219err_rq:
3220	i915_request_put(rq);
3221err_vma:
3222	i915_vma_unpin(vma);
3223err_unpin:
3224	intel_context_unpin(ce);
3225err_pm:
3226	if (err == -EDEADLK) {
3227		err = i915_gem_ww_ctx_backoff(&ww);
3228		if (!err)
3229			goto retry;
3230	}
3231	i915_gem_ww_ctx_fini(&ww);
3232	intel_engine_pm_put(ce->engine);
3233	i915_vma_put(vma);
3234	return err;
3235}
3236
3237int intel_engine_verify_workarounds(struct intel_engine_cs *engine,
3238				    const char *from)
3239{
3240	return engine_wa_list_verify(engine->kernel_context,
3241				     &engine->wa_list,
3242				     from);
3243}
3244
3245#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
3246#include "selftest_workarounds.c"
3247#endif
3248