// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen2_engine_cs.h"
#include "i915_drv.h"
#include "i915_reg.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_ring.h"

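/*
 * Emit a flush on the gen2/gen3 command streamer: an MI_FLUSH (optionally
 * also requesting a read-cache invalidate), followed by a run of dummy
 * MI_STORE_DWORD_INDEX writes into the HWSP scratch slot interleaved with
 * further flushes, which appear to act as a delay so that the flush and
 * invalidate have taken effect by the final bracketing flush.
 */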
int gen2_emit_flush(struct i915_request *rq, u32 mode)
{
	unsigned int num_store_dw = 12;
	u32 cmd, *cs;

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE)
		cmd |= MI_READ_FLUSH;

	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;
	while (num_store_dw--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = 0;
		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
	}
	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

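/*
 * Flush the render ring on gen4/gen5; see the cache-domain notes below.
 * On G4x and Ironlake, EMIT_INVALIDATE additionally sets MI_INVALIDATE_ISP,
 * and the invalidate is followed by PIPE_CONTROL scratch writes and dummy
 * flushes as a settle delay (see the comment further down).
 */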
int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;
	int i;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_EXE_FLUSH;
		if (IS_G4X(rq->i915) || GRAPHICS_VER(rq->i915) == 5)
			cmd |= MI_INVALIDATE_ISP;
	}

	i = 2;
	if (mode & EMIT_INVALIDATE)
		i += 20;

	cs = intel_ring_begin(rq, i);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;

	/*
	 * A random delay to let the CS invalidate take effect? Without this
	 * delay, the GPU relocation path fails as the CS does not see
	 * the updated contents. Just as important, if we apply the flushes
	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
	 * write and before the invalidate on the next batch), the relocations
	 * still fail. This implies that it is a delay following invalidation
	 * that is required to reset the caches as opposed to a delay to
	 * ensure the memory is written.
	 */
	if (mode & EMIT_INVALIDATE) {
		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;

		for (i = 0; i < 12; i++)
			*cs++ = MI_FLUSH;

		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;
	}

	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

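/*
 * The gen4/gen5 video (BSD) ring only needs a plain MI_FLUSH; the MI_NOOP
 * keeps the emission qword aligned.
 */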
int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_FLUSH;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

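/*
 * Emit the breadcrumb for a request: a number of dummy stores into the
 * HWSP scratch slot (seemingly acting as a flush/settle delay), then the
 * seqno is written into the status page and MI_USER_INTERRUPT is raised
 * to wake any waiters.
 */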
static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
				   int flush, int post)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH;

	while (flush--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = rq->fence.seqno;
	}

	while (post--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
}

u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
}

/* Just a userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT SZ_256K
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)
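/*
 * The CS on 830/845 has a TLB invalidation bug: unless the caller promises
 * a pinned batch (I915_DISPATCH_PINNED), blit the batch into a permanently
 * pinned scratch area and execute it from there instead.
 */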
int i830_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs, cs_offset =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT);

	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Evict the invalid PTE TLBs */
	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
	*cs++ = cs_offset;
	*cs++ = 0xdeadbeef;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

		cs = intel_ring_begin(rq, 6 + 2);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Blit the batch (which now has all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
		*cs++ = cs_offset;
		*cs++ = 4096;
		*cs++ = offset;

		*cs++ = MI_FLUSH;
		*cs++ = MI_NOOP;
		intel_ring_advance(rq, cs);

		/* ... and execute it. */
		offset = cs_offset;
	}

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

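/*
 * Start a batch on gen3: a single MI_BATCH_BUFFER_START into the global
 * GTT, with the non-secure bit encoded into the batch address dword.
 */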
int gen3_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs;

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

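/*
 * Start a batch on gen4/gen5: as for gen3, but the non-secure bit lives in
 * the command dword (MI_BATCH_NON_SECURE_I965) rather than in the address.
 */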
int gen4_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 length,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

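/*
 * Gen2 has a 16-bit interrupt mask register, hence the 16-bit uncore
 * accessors; the posting read flushes the write to the hardware.
 */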
void gen2_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
	ENGINE_POSTING_READ16(engine, RING_IMR);
}

void gen2_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
}

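/* From gen3 onwards the interrupt mask register is 32 bits wide. */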
void gen3_irq_enable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
}

void gen3_irq_disable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
}

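/*
 * On Ironlake the engine interrupts are routed through the GT interrupt
 * mask, managed by the gen5_gt_*_irq() helpers.
 */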
void gen5_irq_enable(struct intel_engine_cs *engine)
{
	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen5_irq_disable(struct intel_engine_cs *engine)
{
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}