intel_lrc.c revision 1.4
1/*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 *    Ben Widawsky <ben@bwidawsk.net>
25 *    Michel Thierry <michel.thierry@intel.com>
26 *    Thomas Daniel <thomas.daniel@intel.com>
27 *    Oscar Mateo <oscar.mateo@intel.com>
28 *
29 */
30
31/**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
33 *
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
38 *
39 * One of the main differences from the legacy HW contexts is that logical
40 * ring contexts incorporate many more things into the context's state, like
41 * PDPs or ringbuffer control registers:
42 *
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
46 * instead, the GPU will do it for you on the context switch.
47 *
48 * But, what about the ringbuffer control registers (head, tail, etc.)?
49 * Shouldn't we just need a set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
55 *
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a context is via a context execution list, ergo "Execlists".
58 *
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
61 *
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
65 *
66 * Now that ringbuffers belong per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before), we need:
69 *
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
72 *
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
78 *
79 * The local context starts its life as a hollow or blank holder, that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
84 *
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
88 *
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
93 *
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time, but instead, kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
102 *
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bit submission ID.
108 *
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
114 *
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one request is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
122 *
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
132 *
133 */
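
/*
 * Illustrative sketch only (not part of the driver): the ELSP pairing rule
 * described in the DOC comment above, in hypothetical pseudo-code. Names
 * such as "queue", "first", "second" and "elsp_write" are assumptions for
 * illustration and do not correspond to identifiers in this file.
 *
 *	first = head(queue);
 *	second = next(queue, first);
 *	while (second && same_context(first, second))
 *		second = next(queue, second);	(coalesce into one RING_TAIL bump)
 *	elsp_write(engine, first, second);	(second may be NULL)
 */
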
134#include <linux/interrupt.h>
135
136#include "i915_drv.h"
137#include "i915_perf.h"
138#include "i915_trace.h"
139#include "i915_vgpu.h"
140#include "intel_context.h"
141#include "intel_engine_pm.h"
142#include "intel_gt.h"
143#include "intel_gt_pm.h"
144#include "intel_gt_requests.h"
145#include "intel_lrc_reg.h"
146#include "intel_mocs.h"
147#include "intel_reset.h"
148#include "intel_ring.h"
149#include "intel_workarounds.h"
150
151#define RING_EXECLIST_QFULL		(1 << 0x2)
152#define RING_EXECLIST1_VALID		(1 << 0x3)
153#define RING_EXECLIST0_VALID		(1 << 0x4)
154#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
155#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
156#define RING_EXECLIST0_ACTIVE		(1 << 0x12)
157
158#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
159#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
160#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
161#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
162#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
163#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
164
165#define GEN8_CTX_STATUS_COMPLETED_MASK \
166	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167
168#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169
170#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
171#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
172#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
173#define GEN12_IDLE_CTX_ID		0x7FF
174#define GEN12_CSB_CTX_VALID(csb_dw) \
175	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
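
/*
 * Usage sketch (illustrative only, assuming "csb" points at the two dwords
 * of a single Gen12 context-status-buffer entry; it is not a variable in
 * this file):
 *
 *	bool valid_lo = GEN12_CSB_CTX_VALID(csb[0]);
 *	bool valid_hi = GEN12_CSB_CTX_VALID(csb[1]);
 *	u32  detail   = GEN12_CTX_SWITCH_DETAIL(csb[1]);
 */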
176
177/* Typical size of the average request (2 pipecontrols and a MI_BB) */
178#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179
180struct virtual_engine {
181	struct intel_engine_cs base;
182	struct intel_context context;
183
184	/*
185	 * We allow only a single request through the virtual engine at a time
186	 * (each request in the timeline waits for the completion fence of
187	 * the previous before being submitted). By restricting ourselves to
188	 * only submitting a single request, each request is placed on to a
189	 * physical engine to maximise load spreading (by virtue of the late greedy
190	 * scheduling -- each real engine takes the next available request
191	 * upon idling).
192	 */
193	struct i915_request *request;
194
195	/*
196	 * We keep a rbtree of available virtual engines inside each physical
197	 * engine, sorted by priority. Here we preallocate the nodes we need
198	 * for the virtual engine, indexed by physical_engine->id.
199	 */
200	struct ve_node {
201		struct rb_node rb;
202		int prio;
203	} nodes[I915_NUM_ENGINES];
204
205	/*
206	 * Keep track of bonded pairs -- restrictions upon our selection
207	 * of physical engines any particular request may be submitted to.
208	 * If we receive a submit-fence from a master engine, we will only
209	 * use one of sibling_mask physical engines.
210	 */
211	struct ve_bond {
212		const struct intel_engine_cs *master;
213		intel_engine_mask_t sibling_mask;
214	} *bonds;
215	unsigned int num_bonds;
216
217	/* And finally, which physical engines this virtual engine maps onto. */
218	unsigned int num_siblings;
219	struct intel_engine_cs *siblings[0];
220};
221
222static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
223{
224	GEM_BUG_ON(!intel_engine_is_virtual(engine));
225	return container_of(engine, struct virtual_engine, base);
226}
227
228static int __execlists_context_alloc(struct intel_context *ce,
229				     struct intel_engine_cs *engine);
230
231static void execlists_init_reg_state(u32 *reg_state,
232				     const struct intel_context *ce,
233				     const struct intel_engine_cs *engine,
234				     const struct intel_ring *ring,
235				     bool close);
236static void
237__execlists_update_reg_state(const struct intel_context *ce,
238			     const struct intel_engine_cs *engine,
239			     u32 head);
240
241static void mark_eio(struct i915_request *rq)
242{
243	if (i915_request_completed(rq))
244		return;
245
246	GEM_BUG_ON(i915_request_signaled(rq));
247
248	i915_request_set_error_once(rq, -EIO);
249	i915_request_mark_complete(rq);
250}
251
252static struct i915_request *
253active_request(const struct intel_timeline * const tl, struct i915_request *rq)
254{
255	struct i915_request *active = rq;
256
257	rcu_read_lock();
258	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
259		if (i915_request_completed(rq))
260			break;
261
262		active = rq;
263	}
264	rcu_read_unlock();
265
266	return active;
267}
268
269static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
270{
271	return (i915_ggtt_offset(engine->status_page.vma) +
272		I915_GEM_HWS_PREEMPT_ADDR);
273}
274
275static inline void
276ring_set_paused(const struct intel_engine_cs *engine, int state)
277{
278	/*
279	 * We inspect HWS_PREEMPT with a semaphore inside
280	 * engine->emit_fini_breadcrumb. If the dword is true,
281	 * the ring is paused as the semaphore will busywait
282	 * until the dword is false.
283	 */
284	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
285	if (state)
286		wmb();
287}
288
289static inline struct i915_priolist *to_priolist(struct rb_node *rb)
290{
291	return rb_entry(rb, struct i915_priolist, node);
292}
293
294static inline int rq_prio(const struct i915_request *rq)
295{
296	return READ_ONCE(rq->sched.attr.priority);
297}
298
299static int effective_prio(const struct i915_request *rq)
300{
301	int prio = rq_prio(rq);
302
303	/*
304	 * If this request is special and must not be interrupted at any
305	 * cost, so be it. Note we are only checking the most recent request
306	 * in the context and so may be masking an earlier vip request. It
307	 * is hoped that under the conditions where nopreempt is used, this
308	 * will not matter (i.e. all requests to that context will be
309	 * nopreempt for as long as desired).
310	 */
311	if (i915_request_has_nopreempt(rq))
312		prio = I915_PRIORITY_UNPREEMPTABLE;
313
314	/*
315	 * On unwinding the active request, we give it a priority bump
316	 * if it has completed waiting on any semaphore. If we know that
317	 * the request has already started, we can prevent an unwanted
318	 * preempt-to-idle cycle by taking that into account now.
319	 */
320	if (__i915_request_has_started(rq))
321		prio |= I915_PRIORITY_NOSEMAPHORE;
322
323	/* Restrict mere WAIT boosts from triggering preemption */
324	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
325	return prio | __NO_PREEMPTION;
326}
327
328static int queue_prio(const struct intel_engine_execlists *execlists)
329{
330	struct i915_priolist *p;
331	struct rb_node *rb;
332
333	rb = rb_first_cached(&execlists->queue);
334	if (!rb)
335		return INT_MIN;
336
337	/*
338	 * As the priolist[] are inverted, with the highest priority in [0],
339	 * we have to flip the index value to become priority.
340	 */
341	p = to_priolist(rb);
342	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
343}
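
/*
 * Worked example (illustrative only): for a priolist bucket of user
 * priority P, p->used is a bitmask of the internal sub-levels in use,
 * with index 0 holding the highest sub-level. If that first sub-level
 * is populated, ffs(p->used) == 1 and queue_prio() returns
 *
 *	((P + 1) << I915_USER_PRIORITY_SHIFT) - 1
 *
 * i.e. user level P with all internal priority bits set; buckets whose
 * only populated sub-levels sit further down the array subtract
 * correspondingly more and so report a lower effective priority.
 */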
344
345static inline bool need_preempt(const struct intel_engine_cs *engine,
346				const struct i915_request *rq,
347				struct rb_node *rb)
348{
349	int last_prio;
350
351	if (!intel_engine_has_semaphores(engine))
352		return false;
353
354	/*
355	 * Check if the current priority hint merits a preemption attempt.
356	 *
357	 * We record the highest value priority we saw during rescheduling
358	 * prior to this dequeue, therefore we know that if it is strictly
359	 * less than the current tail of ELSP[0], we do not need to force
360	 * a preempt-to-idle cycle.
361	 *
362	 * However, the priority hint is a mere hint that we may need to
363	 * preempt. If that hint is stale or we may be trying to preempt
364	 * ourselves, ignore the request.
365	 *
366	 * More naturally we would write
367	 *      prio >= max(0, last);
368	 * except that we wish to prevent triggering preemption at the same
369	 * priority level: the task that is running should remain running
370	 * to preserve FIFO ordering of dependencies.
371	 */
372	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
373	if (engine->execlists.queue_priority_hint <= last_prio)
374		return false;
375
376	/*
377	 * Check against the first request in ELSP[1], it will, thanks to the
378	 * power of PI, be the highest priority of that context.
379	 */
380	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
381	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
382		return true;
383
384	if (rb) {
385		struct virtual_engine *ve =
386			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
387		bool preempt = false;
388
389		if (engine == ve->siblings[0]) { /* only preempt one sibling */
390			struct i915_request *next;
391
392			rcu_read_lock();
393			next = READ_ONCE(ve->request);
394			if (next)
395				preempt = rq_prio(next) > last_prio;
396			rcu_read_unlock();
397		}
398
399		if (preempt)
400			return preempt;
401	}
402
403	/*
404	 * If the inflight context did not trigger the preemption, then maybe
405	 * it was the set of queued requests? Pick the highest priority in
406	 * the queue (the first active priolist) and see if it deserves to be
407	 * running instead of ELSP[0].
408	 *
409	 * The highest priority request in the queue cannot be either
410	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
411	 * context, its priority would not exceed ELSP[0] aka last_prio.
412	 */
413	return queue_prio(&engine->execlists) > last_prio;
414}
415
416__maybe_unused static inline bool
417assert_priority_queue(const struct i915_request *prev,
418		      const struct i915_request *next)
419{
420	/*
421	 * Without preemption, the prev may refer to the still active element
422	 * which we refuse to let go.
423	 *
424	 * Even with preemption, there are times when we think it is better not
425	 * to preempt and leave an ostensibly lower priority request in flight.
426	 */
427	if (i915_request_is_active(prev))
428		return true;
429
430	return rq_prio(prev) >= rq_prio(next);
431}
432
433/*
434 * The context descriptor encodes various attributes of a context,
435 * including its GTT address and some flags. Because it's fairly
436 * expensive to calculate, we'll just do it once and cache the result,
437 * which remains valid until the context is unpinned.
438 *
439 * This is what a descriptor looks like, from LSB to MSB::
440 *
441 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
442 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
443 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
444 *      bits 53-54:    mbz, reserved for use by hardware
445 *      bits 55-63:    group ID, currently unused and set to 0
446 *
447 * Starting from Gen11, the upper dword of the descriptor has a new format:
448 *
449 *      bits 32-36:    reserved
450 *      bits 37-47:    SW context ID
451 *      bits 48-53:    engine instance
452 *      bit 54:        mbz, reserved for use by hardware
453 *      bits 55-60:    SW counter
454 *      bits 61-63:    engine class
455 *
456 * engine info, SW context ID and SW counter need to form a unique number
457 * (Context ID) per lrc.
458 */
459static u32
460lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
461{
462	u32 desc;
463
464	desc = INTEL_LEGACY_32B_CONTEXT;
465	if (i915_vm_is_4lvl(ce->vm))
466		desc = INTEL_LEGACY_64B_CONTEXT;
467	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
468
469	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
470	if (IS_GEN(engine->i915, 8))
471		desc |= GEN8_CTX_L3LLC_COHERENT;
472
473	return i915_ggtt_offset(ce->state) | desc;
474}
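
/*
 * Illustrative sketch only: for a context using a 4-level (48b) VM on
 * !Gen8 hardware, the value built above amounts to
 *
 *	i915_ggtt_offset(ce->state) |
 *	(INTEL_LEGACY_64B_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT) |
 *	GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE
 *
 * The ctx ID / SW context ID bits described in the layout above live in
 * the upper dword and are filled in separately (see ce->lrc.ccid being
 * assigned in __execlists_schedule_in() below).
 */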
475
476static inline unsigned int dword_in_page(void *addr)
477{
478	return offset_in_page(addr) / sizeof(u32);
479}
480
481static void set_offsets(u32 *regs,
482			const u8 *data,
483			const struct intel_engine_cs *engine,
484			bool clear)
485#define NOP(x) (BIT(7) | (x))
486#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
487#define POSTED BIT(0)
488#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
489#define REG16(x) \
490	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
491	(((x) >> 2) & 0x7f)
492#define END(x) 0, (x)
493{
494	const u32 base = engine->mmio_base;
495
496	while (*data) {
497		u8 count, flags;
498
499		if (*data & BIT(7)) { /* skip */
500			count = *data++ & ~BIT(7);
501			if (clear)
502				memset32(regs, MI_NOOP, count);
503			regs += count;
504			continue;
505		}
506
507		count = *data & 0x3f;
508		flags = *data >> 6;
509		data++;
510
511		*regs = MI_LOAD_REGISTER_IMM(count);
512		if (flags & POSTED)
513			*regs |= MI_LRI_FORCE_POSTED;
514		if (INTEL_GEN(engine->i915) >= 11)
515			*regs |= MI_LRI_CS_MMIO;
516		regs++;
517
518		GEM_BUG_ON(!count);
519		do {
520			u32 offset = 0;
521			u8 v;
522
523			do {
524				v = *data++;
525				offset <<= 7;
526				offset |= v & ~BIT(7);
527			} while (v & BIT(7));
528
529			regs[0] = base + (offset << 2);
530			if (clear)
531				regs[1] = 0;
532			regs += 2;
533		} while (--count);
534	}
535
536	if (clear) {
537		u8 count = *++data;
538
539		/* Clear past the tail for HW access */
540		GEM_BUG_ON(dword_in_page(regs) > count);
541		memset32(regs, MI_NOOP, count - dword_in_page(regs));
542
543		/* Close the batch; used mainly by live_lrc_layout() */
544		*regs = MI_BATCH_BUFFER_END;
545		if (INTEL_GEN(engine->i915) >= 10)
546			*regs |= BIT(0);
547	}
548}
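
/*
 * Worked example of the encoding consumed by set_offsets() (illustrative
 * only): in the tables below, REG16(0x244) expands to the two bytes
 * { 0x81, 0x11 }. The inner loop of set_offsets() reassembles them as
 *
 *	offset  = (0x01 << 7) | 0x11 = 0x91
 *	regs[0] = engine->mmio_base + (0x91 << 2)
 *	        = engine->mmio_base + 0x244
 *
 * while a single-byte REG(0x034) simply stores 0x034 >> 2 and decodes
 * back to engine->mmio_base + 0x034.
 */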
549
550static const u8 gen8_xcs_offsets[] = {
551	NOP(1),
552	LRI(11, 0),
553	REG16(0x244),
554	REG(0x034),
555	REG(0x030),
556	REG(0x038),
557	REG(0x03c),
558	REG(0x168),
559	REG(0x140),
560	REG(0x110),
561	REG(0x11c),
562	REG(0x114),
563	REG(0x118),
564
565	NOP(9),
566	LRI(9, 0),
567	REG16(0x3a8),
568	REG16(0x28c),
569	REG16(0x288),
570	REG16(0x284),
571	REG16(0x280),
572	REG16(0x27c),
573	REG16(0x278),
574	REG16(0x274),
575	REG16(0x270),
576
577	NOP(13),
578	LRI(2, 0),
579	REG16(0x200),
580	REG(0x028),
581
582	END(80)
583};
584
585static const u8 gen9_xcs_offsets[] = {
586	NOP(1),
587	LRI(14, POSTED),
588	REG16(0x244),
589	REG(0x034),
590	REG(0x030),
591	REG(0x038),
592	REG(0x03c),
593	REG(0x168),
594	REG(0x140),
595	REG(0x110),
596	REG(0x11c),
597	REG(0x114),
598	REG(0x118),
599	REG(0x1c0),
600	REG(0x1c4),
601	REG(0x1c8),
602
603	NOP(3),
604	LRI(9, POSTED),
605	REG16(0x3a8),
606	REG16(0x28c),
607	REG16(0x288),
608	REG16(0x284),
609	REG16(0x280),
610	REG16(0x27c),
611	REG16(0x278),
612	REG16(0x274),
613	REG16(0x270),
614
615	NOP(13),
616	LRI(1, POSTED),
617	REG16(0x200),
618
619	NOP(13),
620	LRI(44, POSTED),
621	REG(0x028),
622	REG(0x09c),
623	REG(0x0c0),
624	REG(0x178),
625	REG(0x17c),
626	REG16(0x358),
627	REG(0x170),
628	REG(0x150),
629	REG(0x154),
630	REG(0x158),
631	REG16(0x41c),
632	REG16(0x600),
633	REG16(0x604),
634	REG16(0x608),
635	REG16(0x60c),
636	REG16(0x610),
637	REG16(0x614),
638	REG16(0x618),
639	REG16(0x61c),
640	REG16(0x620),
641	REG16(0x624),
642	REG16(0x628),
643	REG16(0x62c),
644	REG16(0x630),
645	REG16(0x634),
646	REG16(0x638),
647	REG16(0x63c),
648	REG16(0x640),
649	REG16(0x644),
650	REG16(0x648),
651	REG16(0x64c),
652	REG16(0x650),
653	REG16(0x654),
654	REG16(0x658),
655	REG16(0x65c),
656	REG16(0x660),
657	REG16(0x664),
658	REG16(0x668),
659	REG16(0x66c),
660	REG16(0x670),
661	REG16(0x674),
662	REG16(0x678),
663	REG16(0x67c),
664	REG(0x068),
665
666	END(176)
667};
668
669static const u8 gen12_xcs_offsets[] = {
670	NOP(1),
671	LRI(13, POSTED),
672	REG16(0x244),
673	REG(0x034),
674	REG(0x030),
675	REG(0x038),
676	REG(0x03c),
677	REG(0x168),
678	REG(0x140),
679	REG(0x110),
680	REG(0x1c0),
681	REG(0x1c4),
682	REG(0x1c8),
683	REG(0x180),
684	REG16(0x2b4),
685
686	NOP(5),
687	LRI(9, POSTED),
688	REG16(0x3a8),
689	REG16(0x28c),
690	REG16(0x288),
691	REG16(0x284),
692	REG16(0x280),
693	REG16(0x27c),
694	REG16(0x278),
695	REG16(0x274),
696	REG16(0x270),
697
698	END(80)
699};
700
701static const u8 gen8_rcs_offsets[] = {
702	NOP(1),
703	LRI(14, POSTED),
704	REG16(0x244),
705	REG(0x034),
706	REG(0x030),
707	REG(0x038),
708	REG(0x03c),
709	REG(0x168),
710	REG(0x140),
711	REG(0x110),
712	REG(0x11c),
713	REG(0x114),
714	REG(0x118),
715	REG(0x1c0),
716	REG(0x1c4),
717	REG(0x1c8),
718
719	NOP(3),
720	LRI(9, POSTED),
721	REG16(0x3a8),
722	REG16(0x28c),
723	REG16(0x288),
724	REG16(0x284),
725	REG16(0x280),
726	REG16(0x27c),
727	REG16(0x278),
728	REG16(0x274),
729	REG16(0x270),
730
731	NOP(13),
732	LRI(1, 0),
733	REG(0x0c8),
734
735	END(80)
736};
737
738static const u8 gen9_rcs_offsets[] = {
739	NOP(1),
740	LRI(14, POSTED),
741	REG16(0x244),
742	REG(0x34),
743	REG(0x30),
744	REG(0x38),
745	REG(0x3c),
746	REG(0x168),
747	REG(0x140),
748	REG(0x110),
749	REG(0x11c),
750	REG(0x114),
751	REG(0x118),
752	REG(0x1c0),
753	REG(0x1c4),
754	REG(0x1c8),
755
756	NOP(3),
757	LRI(9, POSTED),
758	REG16(0x3a8),
759	REG16(0x28c),
760	REG16(0x288),
761	REG16(0x284),
762	REG16(0x280),
763	REG16(0x27c),
764	REG16(0x278),
765	REG16(0x274),
766	REG16(0x270),
767
768	NOP(13),
769	LRI(1, 0),
770	REG(0xc8),
771
772	NOP(13),
773	LRI(44, POSTED),
774	REG(0x28),
775	REG(0x9c),
776	REG(0xc0),
777	REG(0x178),
778	REG(0x17c),
779	REG16(0x358),
780	REG(0x170),
781	REG(0x150),
782	REG(0x154),
783	REG(0x158),
784	REG16(0x41c),
785	REG16(0x600),
786	REG16(0x604),
787	REG16(0x608),
788	REG16(0x60c),
789	REG16(0x610),
790	REG16(0x614),
791	REG16(0x618),
792	REG16(0x61c),
793	REG16(0x620),
794	REG16(0x624),
795	REG16(0x628),
796	REG16(0x62c),
797	REG16(0x630),
798	REG16(0x634),
799	REG16(0x638),
800	REG16(0x63c),
801	REG16(0x640),
802	REG16(0x644),
803	REG16(0x648),
804	REG16(0x64c),
805	REG16(0x650),
806	REG16(0x654),
807	REG16(0x658),
808	REG16(0x65c),
809	REG16(0x660),
810	REG16(0x664),
811	REG16(0x668),
812	REG16(0x66c),
813	REG16(0x670),
814	REG16(0x674),
815	REG16(0x678),
816	REG16(0x67c),
817	REG(0x68),
818
819	END(176)
820};
821
822static const u8 gen11_rcs_offsets[] = {
823	NOP(1),
824	LRI(15, POSTED),
825	REG16(0x244),
826	REG(0x034),
827	REG(0x030),
828	REG(0x038),
829	REG(0x03c),
830	REG(0x168),
831	REG(0x140),
832	REG(0x110),
833	REG(0x11c),
834	REG(0x114),
835	REG(0x118),
836	REG(0x1c0),
837	REG(0x1c4),
838	REG(0x1c8),
839	REG(0x180),
840
841	NOP(1),
842	LRI(9, POSTED),
843	REG16(0x3a8),
844	REG16(0x28c),
845	REG16(0x288),
846	REG16(0x284),
847	REG16(0x280),
848	REG16(0x27c),
849	REG16(0x278),
850	REG16(0x274),
851	REG16(0x270),
852
853	LRI(1, POSTED),
854	REG(0x1b0),
855
856	NOP(10),
857	LRI(1, 0),
858	REG(0x0c8),
859
860	END(80)
861};
862
863static const u8 gen12_rcs_offsets[] = {
864	NOP(1),
865	LRI(13, POSTED),
866	REG16(0x244),
867	REG(0x034),
868	REG(0x030),
869	REG(0x038),
870	REG(0x03c),
871	REG(0x168),
872	REG(0x140),
873	REG(0x110),
874	REG(0x1c0),
875	REG(0x1c4),
876	REG(0x1c8),
877	REG(0x180),
878	REG16(0x2b4),
879
880	NOP(5),
881	LRI(9, POSTED),
882	REG16(0x3a8),
883	REG16(0x28c),
884	REG16(0x288),
885	REG16(0x284),
886	REG16(0x280),
887	REG16(0x27c),
888	REG16(0x278),
889	REG16(0x274),
890	REG16(0x270),
891
892	LRI(3, POSTED),
893	REG(0x1b0),
894	REG16(0x5a8),
895	REG16(0x5ac),
896
897	NOP(6),
898	LRI(1, 0),
899	REG(0x0c8),
900
901	END(80)
902};
903
904#undef END
905#undef REG16
906#undef REG
907#undef LRI
908#undef NOP
909
910static const u8 *reg_offsets(const struct intel_engine_cs *engine)
911{
912	/*
913	 * The gen12+ lists only have the registers we program in the basic
914	 * addressing to automatically fix up the register state between the
915	 * physical engines of a virtual engine.
916	 * physical engines for virtual engine.
917	 */
918	GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
919		   !intel_engine_has_relative_mmio(engine));
920
921	if (engine->class == RENDER_CLASS) {
922		if (INTEL_GEN(engine->i915) >= 12)
923			return gen12_rcs_offsets;
924		else if (INTEL_GEN(engine->i915) >= 11)
925			return gen11_rcs_offsets;
926		else if (INTEL_GEN(engine->i915) >= 9)
927			return gen9_rcs_offsets;
928		else
929			return gen8_rcs_offsets;
930	} else {
931		if (INTEL_GEN(engine->i915) >= 12)
932			return gen12_xcs_offsets;
933		else if (INTEL_GEN(engine->i915) >= 9)
934			return gen9_xcs_offsets;
935		else
936			return gen8_xcs_offsets;
937	}
938}
939
940static struct i915_request *
941__unwind_incomplete_requests(struct intel_engine_cs *engine)
942{
943	struct i915_request *rq, *rn, *active = NULL;
944	struct list_head *uninitialized_var(pl);
945	int prio = I915_PRIORITY_INVALID;
946
947	lockdep_assert_held(&engine->active.lock);
948
949	list_for_each_entry_safe_reverse(rq, rn,
950					 &engine->active.requests,
951					 sched.link) {
952		if (i915_request_completed(rq))
953			continue; /* XXX */
954
955		__i915_request_unsubmit(rq);
956
957		/*
958		 * Push the request back into the queue for later resubmission.
959		 * If this request is not native to this physical engine (i.e.
960		 * it came from a virtual source), push it back onto the virtual
961		 * engine so that it can be moved across onto another physical
962		 * engine as load dictates.
963		 */
964		if (likely(rq->execution_mask == engine->mask)) {
965			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
966			if (rq_prio(rq) != prio) {
967				prio = rq_prio(rq);
968				pl = i915_sched_lookup_priolist(engine, prio);
969			}
970			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
971
972			list_move(&rq->sched.link, pl);
973			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
974
975			/* Check in case we rollback so far we wrap [size/2] */
976			if (intel_ring_direction(rq->ring,
977						 intel_ring_wrap(rq->ring,
978								 rq->tail),
979						 rq->ring->tail) > 0)
980				rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
981
982			active = rq;
983		} else {
984			struct intel_engine_cs *owner = rq->context->engine;
985
986			/*
987			 * Decouple the virtual breadcrumb before moving it
988			 * back to the virtual engine -- we don't want the
989			 * request to complete in the background and try
990			 * and cancel the breadcrumb on the virtual engine
991			 * (instead of the old engine where it is linked)!
992			 */
993			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
994				     &rq->fence.flags)) {
995				spin_lock_nested(&rq->lock,
996						 SINGLE_DEPTH_NESTING);
997				i915_request_cancel_breadcrumb(rq);
998				spin_unlock(&rq->lock);
999			}
1000			WRITE_ONCE(rq->engine, owner);
1001			owner->submit_request(rq);
1002			active = NULL;
1003		}
1004	}
1005
1006	return active;
1007}
1008
1009struct i915_request *
1010execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1011{
1012	struct intel_engine_cs *engine =
1013		container_of(execlists, typeof(*engine), execlists);
1014
1015	return __unwind_incomplete_requests(engine);
1016}
1017
1018static inline void
1019execlists_context_status_change(struct i915_request *rq, unsigned long status)
1020{
1021	/*
1022	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
1023	 * the compiler should eliminate this function as dead-code.
1024	 */
1025	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1026		return;
1027
1028#ifdef notyet
1029	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1030				   status, rq);
1031#endif
1032}
1033
1034static void intel_engine_context_in(struct intel_engine_cs *engine)
1035{
1036	unsigned long flags;
1037
1038	if (READ_ONCE(engine->stats.enabled) == 0)
1039		return;
1040
1041	write_seqlock_irqsave(&engine->stats.lock, flags);
1042
1043	if (engine->stats.enabled > 0) {
1044		if (engine->stats.active++ == 0)
1045			engine->stats.start = ktime_get();
1046		GEM_BUG_ON(engine->stats.active == 0);
1047	}
1048
1049	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1050}
1051
1052static void intel_engine_context_out(struct intel_engine_cs *engine)
1053{
1054	unsigned long flags;
1055
1056	if (READ_ONCE(engine->stats.enabled) == 0)
1057		return;
1058
1059	write_seqlock_irqsave(&engine->stats.lock, flags);
1060
1061	if (engine->stats.enabled > 0) {
1062		ktime_t last;
1063
1064		if (engine->stats.active && --engine->stats.active == 0) {
1065			/*
1066			 * Decrement the active context count and, in case the GPU
1067			 * is now idle, add the elapsed time to the running total.
1068			 */
1069			last = ktime_sub(ktime_get(), engine->stats.start);
1070
1071			engine->stats.total = ktime_add(engine->stats.total,
1072							last);
1073		} else if (engine->stats.active == 0) {
1074			/*
1075			 * After turning on engine stats, context out might be
1076			 * the first event in which case we account from the
1077			 * time stats gathering was turned on.
1078			 */
1079			last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1080
1081			engine->stats.total = ktime_add(engine->stats.total,
1082							last);
1083		}
1084	}
1085
1086	write_sequnlock_irqrestore(&engine->stats.lock, flags);
1087}
1088
1089static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1090{
1091	if (INTEL_GEN(engine->i915) >= 12)
1092		return 0x60;
1093	else if (INTEL_GEN(engine->i915) >= 9)
1094		return 0x54;
1095	else if (engine->class == RENDER_CLASS)
1096		return 0x58;
1097	else
1098		return -1;
1099}
1100
1101static void
1102execlists_check_context(const struct intel_context *ce,
1103			const struct intel_engine_cs *engine)
1104{
1105	const struct intel_ring *ring = ce->ring;
1106	u32 *regs = ce->lrc_reg_state;
1107	bool valid = true;
1108	int x;
1109
1110	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1111		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1112		       engine->name,
1113		       regs[CTX_RING_START],
1114		       i915_ggtt_offset(ring->vma));
1115		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1116		valid = false;
1117	}
1118
1119	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1120	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1121		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1122		       engine->name,
1123		       regs[CTX_RING_CTL],
1124		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1125		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1126		valid = false;
1127	}
1128
1129	x = lrc_ring_mi_mode(engine);
1130	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1131		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1132		       engine->name, regs[x + 1]);
1133		regs[x + 1] &= ~STOP_RING;
1134		regs[x + 1] |= STOP_RING << 16;
1135		valid = false;
1136	}
1137
1138	WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1139}
1140
1141static void restore_default_state(struct intel_context *ce,
1142				  struct intel_engine_cs *engine)
1143{
1144	u32 *regs = ce->lrc_reg_state;
1145
1146	if (engine->pinned_default_state)
1147		memcpy(regs, /* skip restoring the vanilla PPHWSP */
1148		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1149		       engine->context_size - PAGE_SIZE);
1150
1151	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1152}
1153
1154static void reset_active(struct i915_request *rq,
1155			 struct intel_engine_cs *engine)
1156{
1157	struct intel_context * const ce = rq->context;
1158	u32 head;
1159
1160	/*
1161	 * The executing context has been cancelled. We want to prevent
1162	 * further execution along this context and propagate the error on
1163	 * to anything depending on its results.
1164	 *
1165	 * In __i915_request_submit(), we apply the -EIO and remove the
1166	 * requests' payloads for any banned requests. But first, we must
1167	 * rewind the context back to the start of the incomplete request so
1168	 * that we do not jump back into the middle of the batch.
1169	 *
1170	 * We preserve the breadcrumbs and semaphores of the incomplete
1171	 * requests so that inter-timeline dependencies (i.e other timelines)
1172	 * remain correctly ordered. And we defer to __i915_request_submit()
1173	 * so that all asynchronous waits are correctly handled.
1174	 */
1175	ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1176		     rq->fence.context, rq->fence.seqno);
1177
1178	/* On resubmission of the active request, payload will be scrubbed */
1179	if (i915_request_completed(rq))
1180		head = rq->tail;
1181	else
1182		head = active_request(ce->timeline, rq)->head;
1183	head = intel_ring_wrap(ce->ring, head);
1184
1185	/* Scrub the context image to prevent replaying the previous batch */
1186	restore_default_state(ce, engine);
1187	__execlists_update_reg_state(ce, engine, head);
1188
1189	/* We've switched away, so this should be a no-op, but intent matters */
1190	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1191}
1192
1193static u32 intel_context_get_runtime(const struct intel_context *ce)
1194{
1195	/*
1196	 * We can use either ppHWSP[16] which is recorded before the context
1197	 * switch (and so excludes the cost of context switches) or use the
1198	 * value from the context image itself, which is saved/restored earlier
1199	 * and so includes the cost of the save.
1200	 */
1201	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1202}
1203
1204static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1205{
1206#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1207	ce->runtime.num_underflow += dt < 0;
1208	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1209#endif
1210}
1211
1212static void intel_context_update_runtime(struct intel_context *ce)
1213{
1214	u32 old;
1215	s32 dt;
1216
1217	if (intel_context_is_barrier(ce))
1218		return;
1219
1220	old = ce->runtime.last;
1221	ce->runtime.last = intel_context_get_runtime(ce);
1222	dt = ce->runtime.last - old;
1223
1224	if (unlikely(dt <= 0)) {
1225		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1226			 old, ce->runtime.last, dt);
1227		st_update_runtime_underflow(ce, dt);
1228		return;
1229	}
1230
1231	ewma_runtime_add(&ce->runtime.avg, dt);
1232	ce->runtime.total += dt;
1233}
1234
1235static inline struct intel_engine_cs *
1236__execlists_schedule_in(struct i915_request *rq)
1237{
1238	struct intel_engine_cs * const engine = rq->engine;
1239	struct intel_context * const ce = rq->context;
1240
1241	intel_context_get(ce);
1242
1243	if (unlikely(intel_context_is_banned(ce)))
1244		reset_active(rq, engine);
1245
1246	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1247		execlists_check_context(ce, engine);
1248
1249	if (ce->tag) {
1250		/* Use a fixed tag for OA and friends */
1251		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1252		ce->lrc.ccid = ce->tag;
1253	} else {
1254		/* We don't need a strict matching tag, just different values */
1255		unsigned int tag = ffs(engine->context_tag);
1256
1257		GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1258		clear_bit(tag - 1, &engine->context_tag);
1259		ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1260
1261		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1262	}
1263
1264	ce->lrc.ccid |= engine->execlists.ccid;
1265
1266	__intel_gt_pm_get(engine->gt);
1267	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1268	intel_engine_context_in(engine);
1269
1270	return engine;
1271}
1272
1273static inline struct i915_request *
1274execlists_schedule_in(struct i915_request *rq, int idx)
1275{
1276	struct intel_context * const ce = rq->context;
1277	struct intel_engine_cs *old;
1278
1279	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1280	trace_i915_request_in(rq, idx);
1281
1282	old = READ_ONCE(ce->inflight);
1283	do {
1284		if (!old) {
1285			WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1286			break;
1287		}
1288	} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1289
1290	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1291	return i915_request_get(rq);
1292}
1293
1294static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1295{
1296	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1297	struct i915_request *next = READ_ONCE(ve->request);
1298
1299	if (next && next->execution_mask & ~rq->execution_mask)
1300		tasklet_schedule(&ve->base.execlists.tasklet);
1301}
1302
1303static inline void
1304__execlists_schedule_out(struct i915_request *rq,
1305			 struct intel_engine_cs * const engine,
1306			 unsigned int ccid)
1307{
1308	struct intel_context * const ce = rq->context;
1309
1310	/*
1311	 * NB process_csb() is not under the engine->active.lock and hence
1312	 * schedule_out can race with schedule_in meaning that we should
1313	 * refrain from doing non-trivial work here.
1314	 */
1315
1316	/*
1317	 * If we have just completed this context, the engine may now be
1318	 * idle and we want to re-enter powersaving.
1319	 */
1320	if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1321	    i915_request_completed(rq))
1322		intel_engine_add_retire(engine, ce->timeline);
1323
1324	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1325	ccid &= GEN12_MAX_CONTEXT_HW_ID;
1326	if (ccid < BITS_PER_LONG) {
1327		GEM_BUG_ON(ccid == 0);
1328		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1329		set_bit(ccid - 1, &engine->context_tag);
1330	}
1331
1332	intel_context_update_runtime(ce);
1333	intel_engine_context_out(engine);
1334	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1335	intel_gt_pm_put_async(engine->gt);
1336
1337	/*
1338	 * If this is part of a virtual engine, its next request may
1339	 * have been blocked waiting for access to the active context.
1340	 * We have to kick all the siblings again in case we need to
1341	 * switch (e.g. the next request is not runnable on this
1342	 * engine). Hopefully, we will already have submitted the next
1343	 * request before the tasklet runs and do not need to rebuild
1344	 * each virtual tree and kick everyone again.
1345	 */
1346	if (ce->engine != engine)
1347		kick_siblings(rq, ce);
1348
1349	intel_context_put(ce);
1350}
1351
1352static inline void
1353execlists_schedule_out(struct i915_request *rq)
1354{
1355	struct intel_context * const ce = rq->context;
1356	struct intel_engine_cs *cur, *old;
1357	u32 ccid;
1358
1359	trace_i915_request_out(rq);
1360
1361	ccid = rq->context->lrc.ccid;
1362	old = READ_ONCE(ce->inflight);
1363	do
1364		cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1365	while (!try_cmpxchg(&ce->inflight, &old, cur));
1366	if (!cur)
1367		__execlists_schedule_out(rq, old, ccid);
1368
1369	i915_request_put(rq);
1370}
1371
1372static u64 execlists_update_context(struct i915_request *rq)
1373{
1374	struct intel_context *ce = rq->context;
1375	u64 desc = ce->lrc.desc;
1376	u32 tail, prev;
1377
1378	/*
1379	 * WaIdleLiteRestore:bdw,skl
1380	 *
1381	 * We should never submit the context with the same RING_TAIL twice
1382	 * just in case we submit an empty ring, which confuses the HW.
1383	 *
1384	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1385	 * the normal request to be able to always advance the RING_TAIL on
1386	 * subsequent resubmissions (for lite restore). Should that fail us,
1387	 * and we try and submit the same tail again, force the context
1388	 * reload.
1389	 *
1390	 * If we need to return to a preempted context, we need to skip the
1391	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1392	 * HW has a tendency to ignore us rewinding the TAIL to the end of
1393	 * an earlier request.
1394	 */
1395	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1396	prev = rq->ring->tail;
1397	tail = intel_ring_set_tail(rq->ring, rq->tail);
1398	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1399		desc |= CTX_DESC_FORCE_RESTORE;
1400	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1401	rq->tail = rq->wa_tail;
1402
1403	/*
1404	 * Make sure the context image is complete before we submit it to HW.
1405	 *
1406	 * Ostensibly, writes (including the WCB) should be flushed prior to
1407	 * an uncached write such as our mmio register access, the empirical
1408	 * evidence (esp. on Braswell) suggests that the WC write into memory
1409	 * may not be visible to the HW prior to the completion of the UC
1410	 * register write and that we may begin execution from the context
1411	 * before its image is complete leading to invalid PD chasing.
1412	 */
1413	wmb();
1414
1415	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1416	return desc;
1417}
1418
1419static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1420{
1421	if (execlists->ctrl_reg) {
1422		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1423		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1424	} else {
1425		writel(upper_32_bits(desc), execlists->submit_reg);
1426		writel(lower_32_bits(desc), execlists->submit_reg);
1427	}
1428}
1429
1430static __maybe_unused void
1431trace_ports(const struct intel_engine_execlists *execlists,
1432	    const char *msg,
1433	    struct i915_request * const *ports)
1434{
1435	const struct intel_engine_cs *engine =
1436		container_of(execlists, typeof(*engine), execlists);
1437
1438	if (!ports[0])
1439		return;
1440
1441	ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1442		     ports[0]->fence.context,
1443		     ports[0]->fence.seqno,
1444		     i915_request_completed(ports[0]) ? "!" :
1445		     i915_request_started(ports[0]) ? "*" :
1446		     "",
1447		     ports[1] ? ports[1]->fence.context : 0,
1448		     ports[1] ? ports[1]->fence.seqno : 0);
1449}
1450
1451static inline bool
1452reset_in_progress(const struct intel_engine_execlists *execlists)
1453{
1454	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1455}
1456
1457static __maybe_unused bool
1458assert_pending_valid(const struct intel_engine_execlists *execlists,
1459		     const char *msg)
1460{
1461	struct i915_request * const *port, *rq;
1462	struct intel_context *ce = NULL;
1463	bool sentinel = false;
1464
1465	trace_ports(execlists, msg, execlists->pending);
1466
1467	/* We may be messing around with the lists during reset, lalala */
1468	if (reset_in_progress(execlists))
1469		return true;
1470
1471	if (!execlists->pending[0]) {
1472		GEM_TRACE_ERR("Nothing pending for promotion!\n");
1473		return false;
1474	}
1475
1476	if (execlists->pending[execlists_num_ports(execlists)]) {
1477		GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1478			      execlists_num_ports(execlists));
1479		return false;
1480	}
1481
1482	for (port = execlists->pending; (rq = *port); port++) {
1483		unsigned long flags;
1484		bool ok = true;
1485
1486		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1487		GEM_BUG_ON(!i915_request_is_active(rq));
1488
1489		if (ce == rq->context) {
1490			GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1491				      ce->timeline->fence_context,
1492				      port - execlists->pending);
1493			return false;
1494		}
1495		ce = rq->context;
1496
1497		/*
1498		 * Sentinels are supposed to be lonely so they flush the
1499		 * current execution off the HW. Check that they are the
1500		 * only request in the pending submission.
1501		 */
1502		if (sentinel) {
1503			GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n",
1504				      ce->timeline->fence_context,
1505				      port - execlists->pending);
1506			return false;
1507		}
1508
1509		sentinel = i915_request_has_sentinel(rq);
1510		if (sentinel && port != execlists->pending) {
1511			GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n",
1512				      ce->timeline->fence_context,
1513				      port - execlists->pending);
1514			return false;
1515		}
1516
1517		/* Hold tightly onto the lock to prevent concurrent retires! */
1518		if (!spin_trylock_irqsave(&rq->lock, flags))
1519			continue;
1520
1521		if (i915_request_completed(rq))
1522			goto unlock;
1523
1524		if (i915_active_is_idle(&ce->active) &&
1525		    !intel_context_is_barrier(ce)) {
1526			GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1527				      ce->timeline->fence_context,
1528				      port - execlists->pending);
1529			ok = false;
1530			goto unlock;
1531		}
1532
1533		if (!i915_vma_is_pinned(ce->state)) {
1534			GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1535				      ce->timeline->fence_context,
1536				      port - execlists->pending);
1537			ok = false;
1538			goto unlock;
1539		}
1540
1541		if (!i915_vma_is_pinned(ce->ring->vma)) {
1542			GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1543				      ce->timeline->fence_context,
1544				      port - execlists->pending);
1545			ok = false;
1546			goto unlock;
1547		}
1548
1549unlock:
1550		spin_unlock_irqrestore(&rq->lock, flags);
1551		if (!ok)
1552			return false;
1553	}
1554
1555	return ce;
1556}
1557
1558static void execlists_submit_ports(struct intel_engine_cs *engine)
1559{
1560	struct intel_engine_execlists *execlists = &engine->execlists;
1561	unsigned int n;
1562
1563	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1564
1565	/*
1566	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1567	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1568	 * not be relinquished until the device is idle (see
1569	 * i915_gem_idle_work_handler()). As a precaution, we make sure
1570	 * that all ELSP are drained i.e. we have processed the CSB,
1571	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1572	 */
1573	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1574
1575	/*
1576	 * ELSQ note: the submit queue is not cleared after being submitted
1577	 * to the HW so we need to make sure we always clean it up. This is
1578	 * currently ensured by the fact that we always write the same number
1579	 * of elsq entries, keep this in mind before changing the loop below.
1580	 */
1581	for (n = execlists_num_ports(execlists); n--; ) {
1582		struct i915_request *rq = execlists->pending[n];
1583
1584		write_desc(execlists,
1585			   rq ? execlists_update_context(rq) : 0,
1586			   n);
1587	}
1588
1589	/* we need to manually load the submit queue */
1590	if (execlists->ctrl_reg)
1591		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1592}
1593
1594static bool ctx_single_port_submission(const struct intel_context *ce)
1595{
1596	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1597		intel_context_force_single_submission(ce));
1598}
1599
1600static bool can_merge_ctx(const struct intel_context *prev,
1601			  const struct intel_context *next)
1602{
1603	if (prev != next)
1604		return false;
1605
1606	if (ctx_single_port_submission(prev))
1607		return false;
1608
1609	return true;
1610}
1611
1612static unsigned long i915_request_flags(const struct i915_request *rq)
1613{
1614	return READ_ONCE(rq->fence.flags);
1615}
1616
1617static bool can_merge_rq(const struct i915_request *prev,
1618			 const struct i915_request *next)
1619{
1620	GEM_BUG_ON(prev == next);
1621	GEM_BUG_ON(!assert_priority_queue(prev, next));
1622
1623	/*
1624	 * We do not submit known completed requests. Therefore if the next
1625	 * request is already completed, we can pretend to merge it in
1626	 * with the previous context (and we will skip updating the ELSP
1627	 * and tracking). Thus hopefully keeping the ELSP full with active
1628	 * contexts, despite the best efforts of preempt-to-busy to confuse
1629	 * us.
1630	 */
1631	if (i915_request_completed(next))
1632		return true;
1633
1634	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1635		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1636		      BIT(I915_FENCE_FLAG_SENTINEL))))
1637		return false;
1638
1639	if (!can_merge_ctx(prev->context, next->context))
1640		return false;
1641
1642	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1643	return true;
1644}
1645
1646static void virtual_update_register_offsets(u32 *regs,
1647					    struct intel_engine_cs *engine)
1648{
1649	set_offsets(regs, reg_offsets(engine), engine, false);
1650}
1651
1652static bool virtual_matches(const struct virtual_engine *ve,
1653			    const struct i915_request *rq,
1654			    const struct intel_engine_cs *engine)
1655{
1656	const struct intel_engine_cs *inflight;
1657
1658	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1659		return false;
1660
1661	/*
1662	 * We track when the HW has completed saving the context image
1663	 * (i.e. when we have seen the final CS event switching out of
1664	 * the context) and must not overwrite the context image before
1665	 * then. This restricts us to only using the active engine
1666	 * while the previous virtualized request is inflight (so
1667	 * we reuse the register offsets). This is a very small
1668	 * hysteresis on the greedy selection algorithm.
1669	 */
1670	inflight = intel_context_inflight(&ve->context);
1671	if (inflight && inflight != engine)
1672		return false;
1673
1674	return true;
1675}
1676
1677static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1678				     struct i915_request *rq)
1679{
1680	struct intel_engine_cs *old = ve->siblings[0];
1681
1682	/* All unattached (rq->engine == old) must already be completed */
1683
1684	spin_lock(&old->breadcrumbs.irq_lock);
1685	if (!list_empty(&ve->context.signal_link)) {
1686		list_del_init(&ve->context.signal_link);
1687
1688		/*
1689		 * We cannot acquire the new engine->breadcrumbs.irq_lock
1690		 * (as we are holding a breadcrumbs.irq_lock already),
1691		 * so attach this request to the signaler on submission.
1692		 * The queued irq_work will occur when we finally drop
1693		 * the engine->active.lock after dequeue.
1694		 */
1695		set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags);
1696
1697		/* Also transfer the pending irq_work for the old breadcrumb. */
1698		intel_engine_signal_breadcrumbs(rq->engine);
1699	}
1700	spin_unlock(&old->breadcrumbs.irq_lock);
1701}
1702
1703#define for_each_waiter(p__, rq__) \
1704	list_for_each_entry_lockless(p__, \
1705				     &(rq__)->sched.waiters_list, \
1706				     wait_link)
1707
1708#define for_each_signaler(p__, rq__) \
1709	list_for_each_entry_rcu(p__, \
1710				&(rq__)->sched.signalers_list, \
1711				signal_link)
1712
1713static void defer_request(struct i915_request *rq, struct list_head * const pl)
1714{
1715	DRM_LIST_HEAD(list);
1716
1717	/*
1718	 * We want to move the interrupted request to the back of
1719	 * the round-robin list (i.e. its priority level), but
1720	 * in doing so, we must then move all requests that were in
1721	 * flight and were waiting for the interrupted request to
1722	 * be run after it again.
1723	 */
1724	do {
1725		struct i915_dependency *p;
1726
1727		GEM_BUG_ON(i915_request_is_active(rq));
1728		list_move_tail(&rq->sched.link, pl);
1729
1730		for_each_waiter(p, rq) {
1731			struct i915_request *w =
1732				container_of(p->waiter, typeof(*w), sched);
1733
1734			if (p->flags & I915_DEPENDENCY_WEAK)
1735				continue;
1736
1737			/* Leave semaphores spinning on the other engines */
1738			if (w->engine != rq->engine)
1739				continue;
1740
1741			/* No waiter should start before its signaler */
1742			GEM_BUG_ON(i915_request_started(w) &&
1743				   !i915_request_completed(rq));
1744
1745			GEM_BUG_ON(i915_request_is_active(w));
1746			if (!i915_request_is_ready(w))
1747				continue;
1748
1749			if (rq_prio(w) < rq_prio(rq))
1750				continue;
1751
1752			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1753			list_move_tail(&w->sched.link, &list);
1754		}
1755
1756		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1757	} while (rq);
1758}
1759
1760static void defer_active(struct intel_engine_cs *engine)
1761{
1762	struct i915_request *rq;
1763
1764	rq = __unwind_incomplete_requests(engine);
1765	if (!rq)
1766		return;
1767
1768	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1769}
1770
1771static bool
1772need_timeslice(const struct intel_engine_cs *engine,
1773	       const struct i915_request *rq)
1774{
1775	int hint;
1776
1777	if (!intel_engine_has_timeslices(engine))
1778		return false;
1779
1780	hint = engine->execlists.queue_priority_hint;
1781	if (!list_is_last(&rq->sched.link, &engine->active.requests))
1782		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1783
1784	return hint >= effective_prio(rq);
1785}
1786
1787static bool
1788timeslice_yield(const struct intel_engine_execlists *el,
1789		const struct i915_request *rq)
1790{
1791	/*
1792	 * Once bitten, forever smitten!
1793	 *
1794	 * If the active context ever busy-waited on a semaphore,
1795	 * it will be treated as a hog until the end of its timeslice (i.e.
1796	 * until it is scheduled out and replaced by a new submission,
1797	 * possibly even its own lite-restore). The HW only sends an interrupt
1798	 * on the first miss, and we do not know if that semaphore has been
1799	 * signaled, or even if it is now stuck on another semaphore. Play
1800	 * safe, yield if it might be stuck -- it will be given a fresh
1801	 * timeslice in the near future.
1802	 */
1803	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1804}
1805
1806static bool
1807timeslice_expired(const struct intel_engine_execlists *el,
1808		  const struct i915_request *rq)
1809{
1810	return timer_expired(&el->timer) || timeslice_yield(el, rq);
1811}
1812
1813static int
1814switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1815{
1816	if (list_is_last(&rq->sched.link, &engine->active.requests))
1817		return INT_MIN;
1818
1819	return rq_prio(list_next_entry(rq, sched.link));
1820}
1821
1822static inline unsigned long
1823timeslice(const struct intel_engine_cs *engine)
1824{
1825	return READ_ONCE(engine->props.timeslice_duration_ms);
1826}
1827
1828static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1829{
1830	const struct intel_engine_execlists *execlists = &engine->execlists;
1831	const struct i915_request *rq = *execlists->active;
1832
1833	if (!rq || i915_request_completed(rq))
1834		return 0;
1835
1836	if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1837		return 0;
1838
1839	return timeslice(engine);
1840}
1841
1842static void set_timeslice(struct intel_engine_cs *engine)
1843{
1844	if (!intel_engine_has_timeslices(engine))
1845		return;
1846
1847	set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1848}
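
/*
 * Illustrative sketch only (not part of the driver, never called): how the
 * timeslicing helpers above are meant to combine. On timer expiry (or a
 * semaphore yield) the submission tasklet re-runs the dequeue, which performs
 * roughly the equivalent of this check on the currently active request. The
 * function name is hypothetical.
 */
static void __maybe_unused sketch_check_timeslice(struct intel_engine_cs *engine,
						  const struct i915_request *active)
{
	struct intel_engine_execlists *el = &engine->execlists;

	if (need_timeslice(engine, active) && timeslice_expired(el, active)) {
		defer_active(engine);	/* rotate within the same priority level */
		set_timeslice(engine);	/* rearm the timer for the next submission */
	}
}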
1849
1850static void start_timeslice(struct intel_engine_cs *engine)
1851{
1852	struct intel_engine_execlists *execlists = &engine->execlists;
1853	int prio = queue_prio(execlists);
1854
1855	WRITE_ONCE(execlists->switch_priority_hint, prio);
1856	if (prio == INT_MIN)
1857		return;
1858
1859	if (timer_pending(&execlists->timer))
1860		return;
1861
1862	set_timer_ms(&execlists->timer, timeslice(engine));
1863}
1864
1865static void record_preemption(struct intel_engine_execlists *execlists)
1866{
1867	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1868}
1869
1870static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1871					    const struct i915_request *rq)
1872{
1873	if (!rq)
1874		return 0;
1875
1876	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
1877	if (unlikely(intel_context_is_banned(rq->context)))
1878		return 1;
1879
1880	return READ_ONCE(engine->props.preempt_timeout_ms);
1881}
1882
1883static void set_preempt_timeout(struct intel_engine_cs *engine,
1884				const struct i915_request *rq)
1885{
1886	if (!intel_engine_has_preempt_reset(engine))
1887		return;
1888
1889	set_timer_ms(&engine->execlists.preempt,
1890		     active_preempt_timeout(engine, rq));
1891}
1892
1893static inline void clear_ports(struct i915_request **ports, int count)
1894{
1895	memset_p((void **)ports, NULL, count);
1896}
1897
1898static void execlists_dequeue(struct intel_engine_cs *engine)
1899{
1900	struct intel_engine_execlists * const execlists = &engine->execlists;
1901	struct i915_request **port = execlists->pending;
1902	struct i915_request ** const last_port = port + execlists->port_mask;
1903	struct i915_request * const *active;
1904	struct i915_request *last;
1905	struct rb_node *rb;
1906	bool submit = false;
1907
1908	/*
1909	 * Hardware submission is through 2 ports. Conceptually each port
1910	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1911	 * static for a context, and unique to each, so we only execute
1912	 * requests belonging to a single context from each ring. RING_HEAD
1913	 * is maintained by the CS in the context image, it marks the place
1914	 * where it got up to last time, and through RING_TAIL we tell the CS
1915	 * where we want to execute up to this time.
1916	 *
1917	 * In this list the requests are in order of execution. Consecutive
1918	 * requests from the same context are adjacent in the ringbuffer. We
1919	 * can combine these requests into a single RING_TAIL update:
1920	 *
1921	 *              RING_HEAD...req1...req2
1922	 *                                    ^- RING_TAIL
1923	 * since to execute req2 the CS must first execute req1.
1924	 *
1925	 * Our goal then is to point each port to the end of a consecutive
1926	 * sequence of requests, as that gives the optimal (fewest wake-ups
1927	 * and context switches) submission.
1928	 */
1929
1930	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1931		struct virtual_engine *ve =
1932			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1933		struct i915_request *rq = READ_ONCE(ve->request);
1934
1935		if (!rq) { /* lazily cleanup after another engine handled rq */
1936			rb_erase_cached(rb, &execlists->virtual);
1937			RB_CLEAR_NODE(rb);
1938			rb = rb_first_cached(&execlists->virtual);
1939			continue;
1940		}
1941
1942		if (!virtual_matches(ve, rq, engine)) {
1943			rb = rb_next(rb);
1944			continue;
1945		}
1946
1947		break;
1948	}
1949
1950	/*
1951	 * If the queue is higher priority than the last
1952	 * request in the currently active context, submit afresh.
1953	 * We will resubmit again afterwards in case we need to split
1954	 * the active context to interject the preemption request,
1955	 * i.e. we will retrigger preemption following the ack in case
1956	 * of trouble.
1957	 */
1958	active = READ_ONCE(execlists->active);
1959	while ((last = *active) && i915_request_completed(last))
1960		active++;
1961
1962	if (last) {
1963		if (need_preempt(engine, last, rb)) {
1964			ENGINE_TRACE(engine,
1965				     "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1966				     last->fence.context,
1967				     last->fence.seqno,
1968				     last->sched.attr.priority,
1969				     execlists->queue_priority_hint);
1970			record_preemption(execlists);
1971
1972			/*
1973			 * Don't let the RING_HEAD advance past the breadcrumb
1974			 * as we unwind (and until we resubmit) so that we do
1975			 * not accidentally tell it to go backwards.
1976			 */
1977			ring_set_paused(engine, 1);
1978
1979			/*
1980			 * Note that we have not stopped the GPU at this point,
1981			 * so we are unwinding the incomplete requests as they
1982			 * remain inflight and so by the time we do complete
1983			 * the preemption, some of the unwound requests may
1984			 * complete!
1985			 */
1986			__unwind_incomplete_requests(engine);
1987
1988			last = NULL;
1989		} else if (need_timeslice(engine, last) &&
1990			   timeslice_expired(execlists, last)) {
1991			ENGINE_TRACE(engine,
1992				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
1993				     last->fence.context,
1994				     last->fence.seqno,
1995				     last->sched.attr.priority,
1996				     execlists->queue_priority_hint,
1997				     yesno(timeslice_yield(execlists, last)));
1998
1999			ring_set_paused(engine, 1);
2000			defer_active(engine);
2001
2002			/*
2003			 * Unlike for preemption, if we rewind and continue
2004			 * executing the same context as previously active,
2005			 * the order of execution will remain the same and
2006			 * the tail will only advance. We do not need to
2007			 * force a full context restore, as a lite-restore
2008			 * is sufficient to resample the monotonic TAIL.
2009			 *
2010			 * If we switch to any other context, similarly we
2011			 * will not rewind TAIL of current context, and
2012			 * normal save/restore will preserve state and allow
2013			 * us to later continue executing the same request.
2014			 */
2015			last = NULL;
2016		} else {
2017			/*
2018			 * Otherwise if we already have a request pending
2019			 * for execution after the current one, we can
2020			 * just wait until the next CS event before
2021			 * queuing more. In either case we will force a
2022			 * lite-restore preemption event, but if we wait
2023			 * we hopefully coalesce several updates into a single
2024			 * submission.
2025			 */
2026			if (!list_is_last(&last->sched.link,
2027					  &engine->active.requests)) {
2028				/*
2029				 * Even if ELSP[1] is occupied and not worthy
2030				 * of timeslices, our queue might be.
2031				 */
2032				start_timeslice(engine);
2033				return;
2034			}
2035		}
2036	}
2037
2038	while (rb) { /* XXX virtual is always taking precedence */
2039		struct virtual_engine *ve =
2040			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2041		struct i915_request *rq;
2042
2043		spin_lock(&ve->base.active.lock);
2044
2045		rq = ve->request;
2046		if (unlikely(!rq)) { /* lost the race to a sibling */
2047			spin_unlock(&ve->base.active.lock);
2048			rb_erase_cached(rb, &execlists->virtual);
2049			RB_CLEAR_NODE(rb);
2050			rb = rb_first_cached(&execlists->virtual);
2051			continue;
2052		}
2053
2054		GEM_BUG_ON(rq != ve->request);
2055		GEM_BUG_ON(rq->engine != &ve->base);
2056		GEM_BUG_ON(rq->context != &ve->context);
2057
2058		if (rq_prio(rq) >= queue_prio(execlists)) {
2059			if (!virtual_matches(ve, rq, engine)) {
2060				spin_unlock(&ve->base.active.lock);
2061				rb = rb_next(rb);
2062				continue;
2063			}
2064
2065			if (last && !can_merge_rq(last, rq)) {
2066				spin_unlock(&ve->base.active.lock);
2067				start_timeslice(engine);
2068				return; /* leave this for another sibling */
2069			}
2070
2071			ENGINE_TRACE(engine,
2072				     "virtual rq=%llx:%lld%s, new engine? %s\n",
2073				     rq->fence.context,
2074				     rq->fence.seqno,
2075				     i915_request_completed(rq) ? "!" :
2076				     i915_request_started(rq) ? "*" :
2077				     "",
2078				     yesno(engine != ve->siblings[0]));
2079
2080			WRITE_ONCE(ve->request, NULL);
2081			WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2082				   INT_MIN);
2083			rb_erase_cached(rb, &execlists->virtual);
2084			RB_CLEAR_NODE(rb);
2085
2086			GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2087			WRITE_ONCE(rq->engine, engine);
2088
2089			if (engine != ve->siblings[0]) {
2090				u32 *regs = ve->context.lrc_reg_state;
2091				unsigned int n;
2092
2093				GEM_BUG_ON(READ_ONCE(ve->context.inflight));
2094
2095				if (!intel_engine_has_relative_mmio(engine))
2096					virtual_update_register_offsets(regs,
2097									engine);
2098
2099				if (!list_empty(&ve->context.signals))
2100					virtual_xfer_breadcrumbs(ve, rq);
2101
2102				/*
2103				 * Move the bound engine to the top of the list
2104				 * for future execution. We then kick this
2105				 * tasklet first before checking others, so that
2106				 * we preferentially reuse this set of bound
2107				 * registers.
2108				 */
2109				for (n = 1; n < ve->num_siblings; n++) {
2110					if (ve->siblings[n] == engine) {
2111						swap(ve->siblings[n],
2112						     ve->siblings[0]);
2113						break;
2114					}
2115				}
2116
2117				GEM_BUG_ON(ve->siblings[0] != engine);
2118			}
2119
2120			if (__i915_request_submit(rq)) {
2121				submit = true;
2122				last = rq;
2123			}
2124			i915_request_put(rq);
2125
2126			/*
2127			 * Hmm, we have a bunch of virtual engine requests,
2128			 * but the first one was already completed (thanks
2129			 * preempt-to-busy!). Keep looking at the veng queue
2130			 * until we have no more relevant requests (i.e.
2131			 * the normal submit queue has higher priority).
2132			 */
2133			if (!submit) {
2134				spin_unlock(&ve->base.active.lock);
2135				rb = rb_first_cached(&execlists->virtual);
2136				continue;
2137			}
2138		}
2139
2140		spin_unlock(&ve->base.active.lock);
2141		break;
2142	}
2143
2144	while ((rb = rb_first_cached(&execlists->queue))) {
2145		struct i915_priolist *p = to_priolist(rb);
2146		struct i915_request *rq, *rn;
2147		int i;
2148
2149		priolist_for_each_request_consume(rq, rn, p, i) {
2150			bool merge = true;
2151
2152			/*
2153			 * Can we combine this request with the current port?
2154			 * It has to be the same context/ringbuffer and not
2155			 * have any exceptions (e.g. GVT saying never to
2156			 * combine contexts).
2157			 *
2158			 * If we can combine the requests, we can execute both
2159			 * by updating the RING_TAIL to point to the end of the
2160			 * second request, and so we never need to tell the
2161			 * hardware about the first.
2162			 */
2163			if (last && !can_merge_rq(last, rq)) {
2164				/*
2165				 * If we are on the second port and cannot
2166				 * combine this request with the last, then we
2167				 * are done.
2168				 */
2169				if (port == last_port)
2170					goto done;
2171
2172				/*
2173				 * We must not populate both ELSP[] with the
2174				 * same LRCA, i.e. we must submit 2 different
2175				 * contexts if we submit 2 ELSP.
2176				 */
2177				if (last->context == rq->context)
2178					goto done;
2179
2180				if (i915_request_has_sentinel(last))
2181					goto done;
2182
2183				/*
2184				 * If GVT overrides us we only ever submit
2185				 * port[0], leaving port[1] empty. Note that we
2186				 * also have to be careful that we don't queue
2187				 * the same context (even though a different
2188				 * request) to the second port.
2189				 */
2190				if (ctx_single_port_submission(last->context) ||
2191				    ctx_single_port_submission(rq->context))
2192					goto done;
2193
2194				merge = false;
2195			}
2196
2197			if (__i915_request_submit(rq)) {
2198				if (!merge) {
2199					*port = execlists_schedule_in(last, port - execlists->pending);
2200					port++;
2201					last = NULL;
2202				}
2203
2204				GEM_BUG_ON(last &&
2205					   !can_merge_ctx(last->context,
2206							  rq->context));
2207				GEM_BUG_ON(last &&
2208					   i915_seqno_passed(last->fence.seqno,
2209							     rq->fence.seqno));
2210
2211				submit = true;
2212				last = rq;
2213			}
2214		}
2215
2216		rb_erase_cached(&p->node, &execlists->queue);
2217		i915_priolist_free(p);
2218	}
2219
2220done:
2221	/*
2222	 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2223	 *
2224	 * We choose the priority hint such that if we add a request of greater
2225	 * priority than this, we kick the submission tasklet to decide on
2226	 * the right order of submitting the requests to hardware. We must
2227	 * also be prepared to reorder requests as they are in-flight on the
2228	 * HW. We derive the priority hint then as the first "hole" in
2229	 * the HW submission ports and if there are no available slots,
2230	 * the priority of the lowest executing request, i.e. last.
2231	 *
2232	 * When we do receive a higher priority request ready to run from the
2233	 * user, see queue_request(), the priority hint is bumped to that
2234	 * request triggering preemption on the next dequeue (or subsequent
2235	 * interrupt for secondary ports).
2236	 */
2237	execlists->queue_priority_hint = queue_prio(execlists);
2238
2239	if (submit) {
2240		*port = execlists_schedule_in(last, port - execlists->pending);
2241		execlists->switch_priority_hint =
2242			switch_prio(engine, *execlists->pending);
2243
2244		/*
2245		 * Skip if we ended up with exactly the same set of requests,
2246		 * e.g. trying to timeslice a pair of ordered contexts
2247		 */
2248		if (!memcmp(active, execlists->pending,
2249			    (port - execlists->pending + 1) * sizeof(*port))) {
2250			do
2251				execlists_schedule_out(fetch_and_zero(port));
2252			while (port-- != execlists->pending);
2253
2254			goto skip_submit;
2255		}
2256		clear_ports(port + 1, last_port - port);
2257
2258		WRITE_ONCE(execlists->yield, -1);
2259		execlists_submit_ports(engine);
2260		set_preempt_timeout(engine, *active);
2261	} else {
2262skip_submit:
2263		ring_set_paused(engine, 0);
2264	}
2265}
2266
2267static void
2268cancel_port_requests(struct intel_engine_execlists * const execlists)
2269{
2270	struct i915_request * const *port;
2271
2272	for (port = execlists->pending; *port; port++)
2273		execlists_schedule_out(*port);
2274	clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2275
2276	/* Mark the end of active before we overwrite *active */
2277	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2278		execlists_schedule_out(*port);
2279	clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2280
2281	smp_wmb(); /* complete the seqlock for execlists_active() */
2282	WRITE_ONCE(execlists->active, execlists->inflight);
2283}
2284
2285static inline void
2286invalidate_csb_entries(const u32 *first, const u32 *last)
2287{
2288	clflush((vaddr_t)first);
2289	clflush((vaddr_t)last);
2290}
2291
2292/*
2293 * Starting with Gen12, the status has a new format:
2294 *
2295 *     bit  0:     switched to new queue
2296 *     bit  1:     reserved
2297 *     bit  2:     semaphore wait mode (poll or signal), only valid when
2298 *                 switch detail is set to "wait on semaphore"
2299 *     bits 3-5:   engine class
2300 *     bits 6-11:  engine instance
2301 *     bits 12-14: reserved
2302 *     bits 15-25: sw context id of the lrc the GT switched to
2303 *     bits 26-31: sw counter of the lrc the GT switched to
2304 *     bits 32-35: context switch detail
2305 *                  - 0: ctx complete
2306 *                  - 1: wait on sync flip
2307 *                  - 2: wait on vblank
2308 *                  - 3: wait on scanline
2309 *                  - 4: wait on semaphore
2310 *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2311 *                       WAIT_FOR_EVENT)
2312 *     bit  36:    reserved
2313 *     bits 37-43: wait detail (for switch detail 1 to 4)
2314 *     bits 44-46: reserved
2315 *     bits 47-57: sw context id of the lrc the GT switched away from
2316 *     bits 58-63: sw counter of the lrc the GT switched away from
2317 */
2318static inline bool
2319gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2320{
2321	u32 lower_dw = csb[0];
2322	u32 upper_dw = csb[1];
2323	bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2324	bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2325	bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2326
2327	/*
2328	 * The context switch detail is not guaranteed to be 5 when a preemption
2329	 * occurs, so we can't just check for that. The check below works for
2330	 * all the cases we care about, including preemptions of WAIT
2331	 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2332	 * would require some extra handling, but we don't support that.
2333	 */
2334	if (!ctx_away_valid || new_queue) {
2335		GEM_BUG_ON(!ctx_to_valid);
2336		return true;
2337	}
2338
2339	/*
2340	 * switch detail = 5 is covered by the case above and we do not expect a
2341	 * context switch on an unsuccessful wait instruction since we always
2342	 * use polling mode.
2343	 */
2344	GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2345	return false;
2346}
2347
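/*
 * Illustrative sketch only (not part of the driver, never called): pulling
 * the Gen12 CSB fields documented above out of an entry for tracing. The
 * open-coded shift/mask for the "switched to" sw context id is derived from
 * the bit layout in the comment, not from an i915 register macro, and the
 * helper name is hypothetical.
 */
static void __maybe_unused sketch_trace_gen12_csb(const u32 *csb)
{
	u32 lower_dw = csb[0];
	u32 upper_dw = csb[1];

	DRM_DEBUG_DRIVER("to: valid=%d, sw ctx id=%u; away: valid=%d, switch detail=%u\n",
			 (int)GEN12_CSB_CTX_VALID(lower_dw),
			 (lower_dw >> 15) & 0x7ff,	/* bits 15-25 */
			 (int)GEN12_CSB_CTX_VALID(upper_dw),
			 (unsigned int)GEN12_CTX_SWITCH_DETAIL(upper_dw));	/* bits 32-35 */
}
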
2348static inline bool
2349gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2350{
2351	return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2352}
2353
2354static void process_csb(struct intel_engine_cs *engine)
2355{
2356	struct intel_engine_execlists * const execlists = &engine->execlists;
2357	const u32 * const buf = execlists->csb_status;
2358	const u8 num_entries = execlists->csb_size;
2359	u8 head, tail;
2360
2361	/*
2362	 * As we modify our execlists state tracking we require exclusive
2363	 * access. Either we are inside the tasklet, or the tasklet is disabled
2364	 * and we assume that is only inside the reset paths and so serialised.
2365	 */
2366	GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2367		   !reset_in_progress(execlists));
2368	GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2369
2370	/*
2371	 * Note that csb_write, csb_status may be either in HWSP or mmio.
2372	 * When reading from the csb_write mmio register, we have to be
2373	 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2374	 * the low 4 bits. As it happens we know the next 4 bits are always
2375	 * zero and so we can simply mask off the low u8 of the register
2376	 * and treat it identically to reading from the HWSP (without having
2377	 * to use explicit shifting and masking, and probably bifurcating
2378	 * the code to handle the legacy mmio read).
2379	 */
2380	head = execlists->csb_head;
2381	tail = READ_ONCE(*execlists->csb_write);
2382	if (unlikely(head == tail))
2383		return;
2384
2385	/*
2386	 * Hopefully paired with a wmb() in HW!
2387	 *
2388	 * We must complete the read of the write pointer before any reads
2389	 * from the CSB, so that we do not see stale values. Without an rmb
2390	 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2391	 * we perform the READ_ONCE(*csb_write).
2392	 */
2393	rmb();
2394
2395	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2396	do {
2397		bool promote;
2398
2399		if (++head == num_entries)
2400			head = 0;
2401
2402		/*
2403		 * We are flying near dragons again.
2404		 *
2405		 * We hold a reference to the request in execlist_port[]
2406		 * but no more than that. We are operating in softirq
2407		 * context and so cannot hold any mutex or sleep. That
2408		 * prevents us stopping the requests we are processing
2409		 * in port[] from being retired simultaneously (the
2410		 * breadcrumb will be complete before we see the
2411		 * context-switch). As we only hold the reference to the
2412		 * request, any pointer chasing underneath the request
2413		 * is subject to a potential use-after-free. Thus we
2414		 * store all of the bookkeeping within port[] as
2415		 * required, and avoid using unguarded pointers beneath
2416		 * request itself. The same applies to the atomic
2417		 * status notifier.
2418		 */
2419
2420		ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2421			     head, buf[2 * head + 0], buf[2 * head + 1]);
2422
2423		if (INTEL_GEN(engine->i915) >= 12)
2424			promote = gen12_csb_parse(execlists, buf + 2 * head);
2425		else
2426			promote = gen8_csb_parse(execlists, buf + 2 * head);
2427		if (promote) {
2428			struct i915_request * const *old = execlists->active;
2429
2430			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2431
2432			ring_set_paused(engine, 0);
2433
2434			/* Point active to the new ELSP; prevent overwriting */
2435			WRITE_ONCE(execlists->active, execlists->pending);
2436			smp_wmb(); /* notify execlists_active() */
2437
2438			/* cancel old inflight, prepare for switch */
2439			trace_ports(execlists, "preempted", old);
2440			while (*old)
2441				execlists_schedule_out(*old++);
2442
2443			/* switch pending to inflight */
2444			memcpy(execlists->inflight,
2445			       execlists->pending,
2446			       execlists_num_ports(execlists) *
2447			       sizeof(*execlists->pending));
2448			smp_wmb(); /* complete the seqlock */
2449			WRITE_ONCE(execlists->active, execlists->inflight);
2450
2451			WRITE_ONCE(execlists->pending[0], NULL);
2452		} else {
2453			GEM_BUG_ON(!*execlists->active);
2454
2455			/* port0 completed, advanced to port1 */
2456			trace_ports(execlists, "completed", execlists->active);
2457
2458			/*
2459			 * We rely on the hardware being strongly
2460			 * ordered, that the breadcrumb write is
2461			 * coherent (visible from the CPU) before the
2462			 * user interrupt and CSB is processed.
2463			 */
2464			if (GEM_SHOW_DEBUG() &&
2465			    !i915_request_completed(*execlists->active) &&
2466			    !reset_in_progress(execlists)) {
2467				struct i915_request *rq __maybe_unused =
2468					*execlists->active;
2469				const u32 *regs __maybe_unused =
2470					rq->context->lrc_reg_state;
2471
2472				ENGINE_TRACE(engine,
2473					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2474					     ENGINE_READ(engine, RING_START),
2475					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2476					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2477					     ENGINE_READ(engine, RING_CTL),
2478					     ENGINE_READ(engine, RING_MI_MODE));
2479				ENGINE_TRACE(engine,
2480					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2481					     i915_ggtt_offset(rq->ring->vma),
2482					     rq->head, rq->tail,
2483					     rq->fence.context,
2484					     lower_32_bits(rq->fence.seqno),
2485					     hwsp_seqno(rq));
2486				ENGINE_TRACE(engine,
2487					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2488					     regs[CTX_RING_START],
2489					     regs[CTX_RING_HEAD],
2490					     regs[CTX_RING_TAIL]);
2491
2492				GEM_BUG_ON("context completed before request");
2493			}
2494
2495			execlists_schedule_out(*execlists->active++);
2496
2497			GEM_BUG_ON(execlists->active - execlists->inflight >
2498				   execlists_num_ports(execlists));
2499		}
2500	} while (head != tail);
2501
2502	execlists->csb_head = head;
2503	set_timeslice(engine);
2504
2505	/*
2506	 * Gen11 has proven to fail wrt global observation point between
2507	 * entry and tail update, failing on the ordering and thus
2508	 * we see an old entry in the context status buffer.
2509	 *
2510	 * Forcibly evict out entries for the next gpu csb update,
2511	 * to increase the odds that we get fresh entries even with
2512	 * non-working hardware. The cost of doing so mostly comes out in
2513	 * the wash, as the hardware, working or not, will need to do the
2514	 * invalidation anyway.
2515	 */
2516	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2517}
2518
2519static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2520{
2521	lockdep_assert_held(&engine->active.lock);
2522	if (!READ_ONCE(engine->execlists.pending[0])) {
2523		rcu_read_lock(); /* protect peeking at execlists->active */
2524		execlists_dequeue(engine);
2525		rcu_read_unlock();
2526	}
2527}
2528
2529static void __execlists_hold(struct i915_request *rq)
2530{
2531	DRM_LIST_HEAD(list);
2532
2533	do {
2534		struct i915_dependency *p;
2535
2536		if (i915_request_is_active(rq))
2537			__i915_request_unsubmit(rq);
2538
2539		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2540		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2541		i915_request_set_hold(rq);
2542		RQ_TRACE(rq, "on hold\n");
2543
2544		for_each_waiter(p, rq) {
2545			struct i915_request *w =
2546				container_of(p->waiter, typeof(*w), sched);
2547
2548			/* Leave semaphores spinning on the other engines */
2549			if (w->engine != rq->engine)
2550				continue;
2551
2552			if (!i915_request_is_ready(w))
2553				continue;
2554
2555			if (i915_request_completed(w))
2556				continue;
2557
2558			if (i915_request_on_hold(w))
2559				continue;
2560
2561			list_move_tail(&w->sched.link, &list);
2562		}
2563
2564		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2565	} while (rq);
2566}
2567
2568static bool execlists_hold(struct intel_engine_cs *engine,
2569			   struct i915_request *rq)
2570{
2571	spin_lock_irq(&engine->active.lock);
2572
2573	if (i915_request_completed(rq)) { /* too late! */
2574		rq = NULL;
2575		goto unlock;
2576	}
2577
2578	if (rq->engine != engine) { /* preempted virtual engine */
2579		struct virtual_engine *ve = to_virtual_engine(rq->engine);
2580
2581		/*
2582		 * intel_context_inflight() is only protected by virtue
2583		 * of process_csb() being called only by the tasklet (or
2584		 * directly from inside reset while the tasklet is suspended).
2585		 * Assert that neither of those are allowed to run while we
2586		 * poke at the request queues.
2587		 */
2588		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2589
2590		/*
2591		 * An unsubmitted request along a virtual engine will
2592		 * remain on the active (this) engine until we are able
2593		 * to process the context switch away (and so mark the
2594		 * context as no longer in flight). That cannot have happened
2595		 * yet, otherwise we would not be hanging!
2596		 */
2597		spin_lock(&ve->base.active.lock);
2598		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2599		GEM_BUG_ON(ve->request != rq);
2600		ve->request = NULL;
2601		spin_unlock(&ve->base.active.lock);
2602		i915_request_put(rq);
2603
2604		rq->engine = engine;
2605	}
2606
2607	/*
2608	 * Transfer this request onto the hold queue to prevent it
2609	 * being resubmitted to HW (and potentially completed) before we have
2610	 * released it. Since we may have already submitted following
2611	 * requests, we need to remove those as well.
2612	 */
2613	GEM_BUG_ON(i915_request_on_hold(rq));
2614	GEM_BUG_ON(rq->engine != engine);
2615	__execlists_hold(rq);
2616	GEM_BUG_ON(list_empty(&engine->active.hold));
2617
2618unlock:
2619	spin_unlock_irq(&engine->active.lock);
2620	return rq;
2621}
2622
2623static bool hold_request(const struct i915_request *rq)
2624{
2625	struct i915_dependency *p;
2626	bool result = false;
2627
2628	/*
2629	 * If one of our ancestors is on hold, we must also be on hold,
2630	 * otherwise we will bypass it and execute before it.
2631	 */
2632	rcu_read_lock();
2633	for_each_signaler(p, rq) {
2634		const struct i915_request *s =
2635			container_of(p->signaler, typeof(*s), sched);
2636
2637		if (s->engine != rq->engine)
2638			continue;
2639
2640		result = i915_request_on_hold(s);
2641		if (result)
2642			break;
2643	}
2644	rcu_read_unlock();
2645
2646	return result;
2647}
2648
2649static void __execlists_unhold(struct i915_request *rq)
2650{
2651	DRM_LIST_HEAD(list);
2652
2653	do {
2654		struct i915_dependency *p;
2655
2656		RQ_TRACE(rq, "hold release\n");
2657
2658		GEM_BUG_ON(!i915_request_on_hold(rq));
2659		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2660
2661		i915_request_clear_hold(rq);
2662		list_move_tail(&rq->sched.link,
2663			       i915_sched_lookup_priolist(rq->engine,
2664							  rq_prio(rq)));
2665		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2666
2667		/* Also release any children on this engine that are ready */
2668		for_each_waiter(p, rq) {
2669			struct i915_request *w =
2670				container_of(p->waiter, typeof(*w), sched);
2671
2672			/* Propagate any change in error status */
2673			if (rq->fence.error)
2674				i915_request_set_error_once(w, rq->fence.error);
2675
2676			if (w->engine != rq->engine)
2677				continue;
2678
2679			if (!i915_request_on_hold(w))
2680				continue;
2681
2682			/* Check that no other parents are also on hold */
2683			if (hold_request(w))
2684				continue;
2685
2686			list_move_tail(&w->sched.link, &list);
2687		}
2688
2689		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2690	} while (rq);
2691}
2692
2693static void execlists_unhold(struct intel_engine_cs *engine,
2694			     struct i915_request *rq)
2695{
2696	spin_lock_irq(&engine->active.lock);
2697
2698	/*
2699	 * Move this request back to the priority queue, and all of its
2700	 * children and grandchildren that were suspended along with it.
2701	 */
2702	__execlists_unhold(rq);
2703
2704	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2705		engine->execlists.queue_priority_hint = rq_prio(rq);
2706		tasklet_hi_schedule(&engine->execlists.tasklet);
2707	}
2708
2709	spin_unlock_irq(&engine->active.lock);
2710}
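
/*
 * Illustrative sketch only (not part of the driver, never called): the
 * intended pairing of execlists_hold() and execlists_unhold(). A caller parks
 * a troublesome request (and everything queued behind it on this engine) off
 * the priority lists, does its slow work elsewhere, and then returns the whole
 * chain for execution. The function name is hypothetical; the real user is the
 * error capture path below.
 */
static void __maybe_unused sketch_park_and_release(struct intel_engine_cs *engine,
						   struct i915_request *rq)
{
	if (!execlists_hold(engine, rq))
		return;	/* too late, the request already completed */

	/* ... slow, sleepable work would normally happen in a worker ... */

	execlists_unhold(engine, rq);
}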
2711
2712struct execlists_capture {
2713	struct work_struct work;
2714	struct i915_request *rq;
2715	struct i915_gpu_coredump *error;
2716};
2717
2718static void execlists_capture_work(struct work_struct *work)
2719{
2720	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2721	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2722	struct intel_engine_cs *engine = cap->rq->engine;
2723	struct intel_gt_coredump *gt = cap->error->gt;
2724	struct intel_engine_capture_vma *vma;
2725
2726	/* Compress all the objects attached to the request, slow! */
2727	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2728	if (vma) {
2729		struct i915_vma_compress *compress =
2730			i915_vma_capture_prepare(gt);
2731
2732		intel_engine_coredump_add_vma(gt->engine, vma, compress);
2733		i915_vma_capture_finish(gt, compress);
2734	}
2735
2736	gt->simulated = gt->engine->simulated;
2737	cap->error->simulated = gt->simulated;
2738
2739	/* Publish the error state, and announce it to the world */
2740	i915_error_state_store(cap->error);
2741	i915_gpu_coredump_put(cap->error);
2742
2743	/* Return this request and all that depend upon it for signaling */
2744	execlists_unhold(engine, cap->rq);
2745	i915_request_put(cap->rq);
2746
2747	kfree(cap);
2748}
2749
2750static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2751{
2752	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2753	struct execlists_capture *cap;
2754
2755	cap = kmalloc(sizeof(*cap), gfp);
2756	if (!cap)
2757		return NULL;
2758
2759	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2760	if (!cap->error)
2761		goto err_cap;
2762
2763	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2764	if (!cap->error->gt)
2765		goto err_gpu;
2766
2767	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2768	if (!cap->error->gt->engine)
2769		goto err_gt;
2770
2771	return cap;
2772
2773err_gt:
2774	kfree(cap->error->gt);
2775err_gpu:
2776	kfree(cap->error);
2777err_cap:
2778	kfree(cap);
2779	return NULL;
2780}
2781
2782static bool execlists_capture(struct intel_engine_cs *engine)
2783{
2784	struct execlists_capture *cap;
2785
2786	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2787		return true;
2788
2789	/*
2790	 * We need to _quickly_ capture the engine state before we reset.
2791	 * We are inside an atomic section (softirq) here and we are delaying
2792	 * the forced preemption event.
2793	 */
2794	cap = capture_regs(engine);
2795	if (!cap)
2796		return true;
2797
2798	spin_lock_irq(&engine->active.lock);
2799	cap->rq = execlists_active(&engine->execlists);
2800	if (cap->rq) {
2801		cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2802		cap->rq = i915_request_get_rcu(cap->rq);
2803	}
2804	spin_unlock_irq(&engine->active.lock);
2805	if (!cap->rq)
2806		goto err_free;
2807
2808	/*
2809	 * Remove the request from the execlists queue, and take ownership
2810	 * of the request. We pass it to our worker who will _slowly_ compress
2811	 * all the pages the _user_ requested for debugging their batch, after
2812	 * which we return it to the queue for signaling.
2813	 *
2814	 * By removing them from the execlists queue, we also remove the
2815	 * By removing them from the execlists queue, we also prevent the
2816	 * during the intel_engine_reset(), and so they will *not* be replayed
2817	 * afterwards.
2818	 *
2819	 * Note that because we have not yet reset the engine at this point,
2820	 * it is possible that the request we have identified as being
2821	 * guilty did in fact complete, and we will then hit an arbitration
2822	 * point allowing the outstanding preemption to succeed. The likelihood
2823	 * of that is very low (as capturing of the engine registers should be
2824	 * fast enough to run inside an irq-off atomic section!), so we will
2825	 * simply hold that request accountable for being non-preemptible
2826	 * long enough to force the reset.
2827	 */
2828	if (!execlists_hold(engine, cap->rq))
2829		goto err_rq;
2830
2831	INIT_WORK(&cap->work, execlists_capture_work);
2832	schedule_work(&cap->work);
2833	return true;
2834
2835err_rq:
2836	i915_request_put(cap->rq);
2837err_free:
2838	i915_gpu_coredump_put(cap->error);
2839	kfree(cap);
2840	return false;
2841}
2842
2843static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
2844{
2845	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2846	unsigned long *lock = &engine->gt->reset.flags;
2847
2848	if (!intel_has_reset_engine(engine->gt))
2849		return;
2850
2851	if (test_and_set_bit(bit, lock))
2852		return;
2853
2854	ENGINE_TRACE(engine, "reset for %s\n", msg);
2855
2856	/* Mark this tasklet as disabled to avoid waiting for it to complete */
2857	tasklet_disable_nosync(&engine->execlists.tasklet);
2858
2859	ring_set_paused(engine, 1); /* Freeze the current request in place */
2860	if (execlists_capture(engine))
2861		intel_engine_reset(engine, msg);
2862	else
2863		ring_set_paused(engine, 0);
2864
2865	tasklet_enable(&engine->execlists.tasklet);
2866	clear_and_wake_up_bit(bit, lock);
2867}
2868
2869static bool preempt_timeout(const struct intel_engine_cs *const engine)
2870{
2871	const struct timeout *t = &engine->execlists.preempt;
2872
2873	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2874		return false;
2875
2876	if (!timer_expired(t))
2877		return false;
2878
2879	return READ_ONCE(engine->execlists.pending[0]);
2880}
2881
2882/*
2883 * Check the unread Context Status Buffers and manage the submission of new
2884 * contexts to the ELSP accordingly.
2885 */
2886static void execlists_submission_tasklet(unsigned long data)
2887{
2888	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2889	bool timeout = preempt_timeout(engine);
2890
2891	process_csb(engine);
2892
2893	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2894		engine->execlists.error_interrupt = 0;
2895		if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
2896			execlists_reset(engine, "CS error");
2897	}
2898
2899	if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2900		unsigned long flags;
2901
2902		spin_lock_irqsave(&engine->active.lock, flags);
2903		__execlists_submission_tasklet(engine);
2904		spin_unlock_irqrestore(&engine->active.lock, flags);
2905
2906		/* Recheck after serialising with direct-submission */
2907		if (unlikely(timeout && preempt_timeout(engine)))
2908			execlists_reset(engine, "preemption time out");
2909	}
2910}
2911
2912static void __execlists_kick(struct intel_engine_execlists *execlists)
2913{
2914	/* Kick the tasklet for some interrupt coalescing and reset handling */
2915	tasklet_hi_schedule(&execlists->tasklet);
2916}
2917
2918#define execlists_kick(t, member) \
2919	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2920
2921#ifdef __linux__
2922
2923static void execlists_timeslice(struct timer_list *timer)
2924{
2925	execlists_kick(timer, timer);
2926}
2927
2928static void execlists_preempt(struct timer_list *timer)
2929{
2930	execlists_kick(timer, preempt);
2931}
2932
2933#else
2934
2935static void execlists_timeslice(void *arg)
2936{
2937	struct timeout *timer = arg;
2938	execlists_kick(timer, timer);
2939}
2940
2941static void execlists_preempt(void *arg)
2942{
2943	struct timeout *timer = arg;
2944	execlists_kick(timer, preempt);
2945}
2946
2947#endif
2948
2949static void queue_request(struct intel_engine_cs *engine,
2950			  struct i915_request *rq)
2951{
2952	GEM_BUG_ON(!list_empty(&rq->sched.link));
2953	list_add_tail(&rq->sched.link,
2954		      i915_sched_lookup_priolist(engine, rq_prio(rq)));
2955	set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2956}
2957
2958static void __submit_queue_imm(struct intel_engine_cs *engine)
2959{
2960	struct intel_engine_execlists * const execlists = &engine->execlists;
2961
2962	if (reset_in_progress(execlists))
2963		return; /* defer until we restart the engine following reset */
2964
2965	if (execlists->tasklet.func == execlists_submission_tasklet)
2966		__execlists_submission_tasklet(engine);
2967	else
2968		tasklet_hi_schedule(&execlists->tasklet);
2969}
2970
2971static void submit_queue(struct intel_engine_cs *engine,
2972			 const struct i915_request *rq)
2973{
2974	struct intel_engine_execlists *execlists = &engine->execlists;
2975
2976	if (rq_prio(rq) <= execlists->queue_priority_hint)
2977		return;
2978
2979	execlists->queue_priority_hint = rq_prio(rq);
2980	__submit_queue_imm(engine);
2981}
2982
2983static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2984			     const struct i915_request *rq)
2985{
2986	GEM_BUG_ON(i915_request_on_hold(rq));
2987	return !list_empty(&engine->active.hold) && hold_request(rq);
2988}
2989
2990static void execlists_submit_request(struct i915_request *request)
2991{
2992	struct intel_engine_cs *engine = request->engine;
2993	unsigned long flags;
2994
2995	/* Will be called from irq-context when using foreign fences. */
2996	spin_lock_irqsave(&engine->active.lock, flags);
2997
2998	if (unlikely(ancestor_on_hold(engine, request))) {
2999		RQ_TRACE(request, "ancestor on hold\n");
3000		list_add_tail(&request->sched.link, &engine->active.hold);
3001		i915_request_set_hold(request);
3002	} else {
3003		queue_request(engine, request);
3004
3005		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3006		GEM_BUG_ON(list_empty(&request->sched.link));
3007
3008		submit_queue(engine, request);
3009	}
3010
3011	spin_unlock_irqrestore(&engine->active.lock, flags);
3012}
3013
3014static void __execlists_context_fini(struct intel_context *ce)
3015{
3016	intel_ring_put(ce->ring);
3017	i915_vma_put(ce->state);
3018}
3019
3020static void execlists_context_destroy(struct kref *kref)
3021{
3022	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3023
3024	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3025	GEM_BUG_ON(intel_context_is_pinned(ce));
3026
3027	if (ce->state)
3028		__execlists_context_fini(ce);
3029
3030	intel_context_fini(ce);
3031	intel_context_free(ce);
3032}
3033
3034static void
3035set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3036{
3037	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3038		return;
3039
3040	vaddr += engine->context_size;
3041
3042	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3043}
3044
3045static void
3046check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3047{
3048	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3049		return;
3050
3051	vaddr += engine->context_size;
3052
3053	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3054		dev_err_once(engine->i915->drm.dev,
3055			     "%s context redzone overwritten!\n",
3056			     engine->name);
3057}
3058
3059static void execlists_context_unpin(struct intel_context *ce)
3060{
3061	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
3062		      ce->engine);
3063
3064	i915_gem_object_unpin_map(ce->state->obj);
3065}
3066
3067static void
3068__execlists_update_reg_state(const struct intel_context *ce,
3069			     const struct intel_engine_cs *engine,
3070			     u32 head)
3071{
3072	struct intel_ring *ring = ce->ring;
3073	u32 *regs = ce->lrc_reg_state;
3074
3075	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3076	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3077
3078	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3079	regs[CTX_RING_HEAD] = head;
3080	regs[CTX_RING_TAIL] = ring->tail;
3081	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3082
3083	/* RPCS */
3084	if (engine->class == RENDER_CLASS) {
3085		regs[CTX_R_PWR_CLK_STATE] =
3086			intel_sseu_make_rpcs(engine->i915, &ce->sseu);
3087
3088		i915_oa_init_reg_state(ce, engine);
3089	}
3090}
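
/*
 * Illustrative sketch only (not part of the driver, never called): reading
 * back the ring registers that __execlists_update_reg_state() just wrote into
 * the context image, i.e. the values the hardware will load on the next
 * context restore. The helper name is hypothetical.
 */
static void __maybe_unused sketch_dump_ring_regs(const struct intel_context *ce)
{
	const u32 *regs = ce->lrc_reg_state;

	DRM_DEBUG_DRIVER("lrc ring: start=%08x, head=%04x, tail=%04x\n",
			 regs[CTX_RING_START],
			 regs[CTX_RING_HEAD],
			 regs[CTX_RING_TAIL]);
}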
3091
3092static int
3093__execlists_context_pin(struct intel_context *ce,
3094			struct intel_engine_cs *engine)
3095{
3096	void *vaddr;
3097
3098	GEM_BUG_ON(!ce->state);
3099	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3100
3101	vaddr = i915_gem_object_pin_map(ce->state->obj,
3102					i915_coherent_map_type(engine->i915) |
3103					I915_MAP_OVERRIDE);
3104	if (IS_ERR(vaddr))
3105		return PTR_ERR(vaddr);
3106
3107	ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3108	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
3109	__execlists_update_reg_state(ce, engine, ce->ring->tail);
3110
3111	return 0;
3112}
3113
3114static int execlists_context_pin(struct intel_context *ce)
3115{
3116	return __execlists_context_pin(ce, ce->engine);
3117}
3118
3119static int execlists_context_alloc(struct intel_context *ce)
3120{
3121	return __execlists_context_alloc(ce, ce->engine);
3122}
3123
3124static void execlists_context_reset(struct intel_context *ce)
3125{
3126	CE_TRACE(ce, "reset\n");
3127	GEM_BUG_ON(!intel_context_is_pinned(ce));
3128
3129	intel_ring_reset(ce->ring, ce->ring->emit);
3130
3131	/* Scrub away the garbage */
3132	execlists_init_reg_state(ce->lrc_reg_state,
3133				 ce, ce->engine, ce->ring, true);
3134	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3135
3136	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3137}
3138
3139static const struct intel_context_ops execlists_context_ops = {
3140	.alloc = execlists_context_alloc,
3141
3142	.pin = execlists_context_pin,
3143	.unpin = execlists_context_unpin,
3144
3145	.enter = intel_context_enter_engine,
3146	.exit = intel_context_exit_engine,
3147
3148	.reset = execlists_context_reset,
3149	.destroy = execlists_context_destroy,
3150};
3151
3152static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3153{
3154	u32 *cs;
3155
3156	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3157		return 0;
3158
3159	cs = intel_ring_begin(rq, 6);
3160	if (IS_ERR(cs))
3161		return PTR_ERR(cs);
3162
3163	/*
3164	 * Check if we have been preempted before we even get started.
3165	 *
3166	 * After this point i915_request_started() reports true, even if
3167	 * we get preempted and so are no longer running.
3168	 */
3169	*cs++ = MI_ARB_CHECK;
3170	*cs++ = MI_NOOP;
3171
3172	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3173	*cs++ = i915_request_timeline(rq)->hwsp_offset;
3174	*cs++ = 0;
3175	*cs++ = rq->fence.seqno - 1;
3176
3177	intel_ring_advance(rq, cs);
3178
3179	/* Record the updated position of the request's payload */
3180	rq->infix = intel_ring_offset(rq, cs);
3181
3182	return 0;
3183}
3184
3185static int execlists_request_alloc(struct i915_request *request)
3186{
3187	int ret;
3188
3189	GEM_BUG_ON(!intel_context_is_pinned(request->context));
3190
3191	/*
3192	 * Flush enough space to reduce the likelihood of waiting after
3193	 * we start building the request - in which case we will just
3194	 * have to repeat work.
3195	 */
3196	request->reserved_space += EXECLISTS_REQUEST_SIZE;
3197
3198	/*
3199	 * Note that after this point, we have committed to using
3200	 * this request as it is being used to both track the
3201	 * state of engine initialisation and liveness of the
3202	 * golden renderstate above. Think twice before you try
3203	 * to cancel/unwind this request now.
3204	 */
3205
3206	/* Unconditionally invalidate GPU caches and TLBs. */
3207	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3208	if (ret)
3209		return ret;
3210
3211	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3212	return 0;
3213}
3214
3215/*
3216 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3217 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3218 * but there is a slight complication as this is applied in WA batch where the
3219 * values are only initialized once so we cannot take register value at the
3220 * beginning and reuse it further; hence we save its value to memory, upload a
3221 * constant value with bit21 set and then we restore it back with the saved value.
3222 * To simplify the WA, a constant value is formed by using the default value
3223 * of this register. This shouldn't be a problem because we are only modifying
3224 * it for a short period and this batch is non-preemptible. We can of course
3225 * use additional instructions that read the actual value of the register
3226 * at that time and set our bit of interest but it makes the WA complicated.
3227 *
3228 * This WA is also required for Gen9 so extracting as a function avoids
3229 * code duplication.
3230 */
3231static u32 *
3232gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3233{
3234	/* NB no one else is allowed to scribble over scratch + 256! */
3235	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3236	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3237	*batch++ = intel_gt_scratch_offset(engine->gt,
3238					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3239	*batch++ = 0;
3240
3241	*batch++ = MI_LOAD_REGISTER_IMM(1);
3242	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3243	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3244
3245	batch = gen8_emit_pipe_control(batch,
3246				       PIPE_CONTROL_CS_STALL |
3247				       PIPE_CONTROL_DC_FLUSH_ENABLE,
3248				       0);
3249
3250	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3251	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3252	*batch++ = intel_gt_scratch_offset(engine->gt,
3253					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3254	*batch++ = 0;
3255
3256	return batch;
3257}
3258
3259/*
3260 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3261 * initialized at the beginning and shared across all contexts but this field
3262 * helps us to have multiple batches at different offsets and select them based
3263 * on some criteria. At the moment this batch always starts at the beginning of
3264 * the page and at this point we don't have multiple wa_ctx batch buffers.
3265 *
3266 * The number of WAs applied is not known at the beginning; we use this field
3267 * to return the number of DWORDS written.
3268 *
3269 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3270 * so it adds NOOPs as padding to make it cacheline aligned.
3271 * MI_BATCH_BUFFER_END will be added to the per_ctx batch and both of them
3272 * together make a complete batch buffer.
3273 */
3274static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3275{
3276	/* WaDisableCtxRestoreArbitration:bdw,chv */
3277	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3278
3279	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3280	if (IS_BROADWELL(engine->i915))
3281		batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3282
3283	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3284	/* Actual scratch location is at 128 bytes offset */
3285	batch = gen8_emit_pipe_control(batch,
3286				       PIPE_CONTROL_FLUSH_L3 |
3287				       PIPE_CONTROL_STORE_DATA_INDEX |
3288				       PIPE_CONTROL_CS_STALL |
3289				       PIPE_CONTROL_QW_WRITE,
3290				       LRC_PPHWSP_SCRATCH_ADDR);
3291
3292	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3293
3294	/* Pad to end of cacheline */
3295	while ((unsigned long)batch % CACHELINE_BYTES)
3296		*batch++ = MI_NOOP;
3297
3298	/*
3299	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3300	 * execution depends on the length specified in terms of cache lines
3301	 * in the register CTX_RCS_INDIRECT_CTX
3302	 */
3303
3304	return batch;
3305}
3306
3307struct lri {
3308	i915_reg_t reg;
3309	u32 value;
3310};
3311
3312static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3313{
3314	GEM_BUG_ON(!count || count > 63);
3315
3316	*batch++ = MI_LOAD_REGISTER_IMM(count);
3317	do {
3318		*batch++ = i915_mmio_reg_offset(lri->reg);
3319		*batch++ = lri->value;
3320	} while (lri++, --count);
3321	*batch++ = MI_NOOP;
3322
3323	return batch;
3324}
3325
3326static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3327{
3328	static const struct lri lri[] = {
3329		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3330		{
3331			COMMON_SLICE_CHICKEN2,
3332			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3333				       0),
3334		},
3335
3336		/* BSpec: 11391 */
3337		{
3338			FF_SLICE_CHICKEN,
3339			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3340				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3341		},
3342
3343		/* BSpec: 11299 */
3344		{
3345			_3D_CHICKEN3,
3346			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3347				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3348		}
3349	};
3350
3351	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3352
3353	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3354	batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3355
3356	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3357	batch = gen8_emit_pipe_control(batch,
3358				       PIPE_CONTROL_FLUSH_L3 |
3359				       PIPE_CONTROL_STORE_DATA_INDEX |
3360				       PIPE_CONTROL_CS_STALL |
3361				       PIPE_CONTROL_QW_WRITE,
3362				       LRC_PPHWSP_SCRATCH_ADDR);
3363
3364	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3365
3366	/* WaMediaPoolStateCmdInWABB:bxt,glk */
3367	if (HAS_POOLED_EU(engine->i915)) {
3368		/*
3369		 * EU pool configuration is setup along with golden context
3370		 * during context initialization. This value depends on
3371		 * device type (2x6 or 3x6) and needs to be updated based
3372		 * on which subslice is disabled especially for 2x6
3373		 * devices, however it is safe to load default
3374		 * configuration of 3x6 device instead of masking off
3375		 * corresponding bits because HW ignores bits of a disabled
3376		 * subslice and drops down to appropriate config. Please
3377		 * see render_state_setup() in i915_gem_render_state.c for
3378		 * possible configurations, to avoid duplication they are
3379		 * not shown here again.
3380		 */
3381		*batch++ = GEN9_MEDIA_POOL_STATE;
3382		*batch++ = GEN9_MEDIA_POOL_ENABLE;
3383		*batch++ = 0x00777000;
3384		*batch++ = 0;
3385		*batch++ = 0;
3386		*batch++ = 0;
3387	}
3388
3389	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3390
3391	/* Pad to end of cacheline */
3392	while ((unsigned long)batch % CACHELINE_BYTES)
3393		*batch++ = MI_NOOP;
3394
3395	return batch;
3396}
3397
3398static u32 *
3399gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3400{
3401	int i;
3402
3403	/*
3404	 * WaPipeControlBefore3DStateSamplePattern: cnl
3405	 *
3406	 * Ensure the engine is idle prior to programming a
3407	 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3408	 */
3409	batch = gen8_emit_pipe_control(batch,
3410				       PIPE_CONTROL_CS_STALL,
3411				       0);
3412	/*
3413	 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3414	 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3415	 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3416	 * confusing. Since gen8_emit_pipe_control() already advances the
3417	 * batch by 6 dwords, we advance the other 10 here, completing a
3418	 * cacheline. It's not clear if the workaround requires this padding
3419	 * before other commands, or if it's just the regular padding we would
3420	 * already have for the workaround bb, so leave it here for now.
3421	 */
3422	for (i = 0; i < 10; i++)
3423		*batch++ = MI_NOOP;
3424
3425	/* Pad to end of cacheline */
3426	while ((unsigned long)batch % CACHELINE_BYTES)
3427		*batch++ = MI_NOOP;
3428
3429	return batch;
3430}
3431
3432#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3433
3434static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3435{
3436	struct drm_i915_gem_object *obj;
3437	struct i915_vma *vma;
3438	int err;
3439
3440	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3441	if (IS_ERR(obj))
3442		return PTR_ERR(obj);
3443
3444	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3445	if (IS_ERR(vma)) {
3446		err = PTR_ERR(vma);
3447		goto err;
3448	}
3449
3450	err = i915_ggtt_pin(vma, 0, PIN_HIGH);
3451	if (err)
3452		goto err;
3453
3454	engine->wa_ctx.vma = vma;
3455	return 0;
3456
3457err:
3458	i915_gem_object_put(obj);
3459	return err;
3460}
3461
3462static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3463{
3464	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3465}
3466
3467typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3468
3469static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3470{
3471	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3472	struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3473					    &wa_ctx->per_ctx };
3474	wa_bb_func_t wa_bb_fn[2];
3475	struct vm_page *page;
3476	void *batch, *batch_ptr;
3477	unsigned int i;
3478	int ret;
3479
3480	if (engine->class != RENDER_CLASS)
3481		return 0;
3482
3483	switch (INTEL_GEN(engine->i915)) {
3484	case 12:
3485	case 11:
3486		return 0;
3487	case 10:
3488		wa_bb_fn[0] = gen10_init_indirectctx_bb;
3489		wa_bb_fn[1] = NULL;
3490		break;
3491	case 9:
3492		wa_bb_fn[0] = gen9_init_indirectctx_bb;
3493		wa_bb_fn[1] = NULL;
3494		break;
3495	case 8:
3496		wa_bb_fn[0] = gen8_init_indirectctx_bb;
3497		wa_bb_fn[1] = NULL;
3498		break;
3499	default:
3500		MISSING_CASE(INTEL_GEN(engine->i915));
3501		return 0;
3502	}
3503
3504	ret = lrc_setup_wa_ctx(engine);
3505	if (ret) {
3506		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3507		return ret;
3508	}
3509
3510	page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3511	batch = batch_ptr = kmap_atomic(page);
3512
3513	/*
3514	 * Emit the two workaround batch buffers, recording the offset from the
3515	 * start of the workaround batch buffer object for each and their
3516	 * respective sizes.
3517	 */
3518	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3519		wa_bb[i]->offset = batch_ptr - batch;
3520		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3521						  CACHELINE_BYTES))) {
3522			ret = -EINVAL;
3523			break;
3524		}
3525		if (wa_bb_fn[i])
3526			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3527		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3528	}
3529
3530	BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3531
3532	kunmap_atomic(batch);
3533	if (ret)
3534		lrc_destroy_wa_ctx(engine);
3535
3536	return ret;
3537}
3538
3539static void enable_error_interrupt(struct intel_engine_cs *engine)
3540{
3541	u32 status;
3542
3543	engine->execlists.error_interrupt = 0;
3544	ENGINE_WRITE(engine, RING_EMR, ~0u);
3545	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
3546
3547	status = ENGINE_READ(engine, RING_ESR);
3548	if (unlikely(status)) {
3549		dev_err(engine->i915->drm.dev,
3550			"engine '%s' resumed still in error: %08x\n",
3551			engine->name, status);
3552		__intel_gt_reset(engine->gt, engine->mask);
3553	}
3554
3555	/*
3556	 * On current gen8+, we have 2 signals to play with
3557	 *
3558	 * - I915_ERROR_INSTRUCTION (bit 0)
3559	 *
3560	 *    Generate an error if the command parser encounters an invalid
3561	 *    instruction
3562	 *
3563	 *    This is a fatal error.
3564	 *
3565	 * - CP_PRIV (bit 2)
3566	 *
3567	 *    Generate an error on privilege violation (where the CP replaces
3568	 *    the instruction with a no-op). This also fires for writes into
3569	 *    read-only scratch pages.
3570	 *
3571	 *    This is a non-fatal error, parsing continues.
3572	 *
3573	 * * there are a few others defined for odd HW that we do not use
3574	 *
3575	 * Since CP_PRIV fires for cases where we have chosen to ignore the
3576	 * error (as the HW is validating and suppressing the mistakes), we
3577	 * only unmask the instruction error bit.
3578	 */
3579	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
3580}
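
/*
 * Minimal sketch of the masking maths above (assuming the EMR semantics
 * described in the comment, where a set bit suppresses that error source):
 *
 *	u32 emr = ~0u;				mask every error source
 *	emr &= ~I915_ERROR_INSTRUCTION;		unmask only the fatal bit 0
 *
 * which leaves emr == ~I915_ERROR_INSTRUCTION, i.e. exactly the value the
 * final ENGINE_WRITE() above programs into RING_EMR.
 */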
3581
3582static void enable_execlists(struct intel_engine_cs *engine)
3583{
3584	u32 mode;
3585
3586	assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3587
3588	intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3589
3590	if (INTEL_GEN(engine->i915) >= 11)
3591		mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3592	else
3593		mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3594	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3595
3596	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3597
3598	ENGINE_WRITE_FW(engine,
3599			RING_HWS_PGA,
3600			i915_ggtt_offset(engine->status_page.vma));
3601	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3602
3603	enable_error_interrupt(engine);
3604
3605	engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
3606}
3607
3608static bool unexpected_starting_state(struct intel_engine_cs *engine)
3609{
3610	bool unexpected = false;
3611
3612	if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3613		DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3614		unexpected = true;
3615	}
3616
3617	return unexpected;
3618}
3619
3620static int execlists_resume(struct intel_engine_cs *engine)
3621{
3622	intel_mocs_init_engine(engine);
3623
3624	intel_engine_reset_breadcrumbs(engine);
3625
3626	if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3627		struct drm_printer p = drm_debug_printer(__func__);
3628
3629		intel_engine_dump(engine, &p, NULL);
3630	}
3631
3632	enable_execlists(engine);
3633
3634	return 0;
3635}
3636
3637static void execlists_reset_prepare(struct intel_engine_cs *engine)
3638{
3639	struct intel_engine_execlists * const execlists = &engine->execlists;
3640	unsigned long flags;
3641
3642	ENGINE_TRACE(engine, "depth<-%d\n",
3643		     atomic_read(&execlists->tasklet.count));
3644
3645	/*
3646	 * Prevent request submission to the hardware until we have
3647	 * completed the reset in i915_gem_reset_finish(). If a request
3648	 * is completed by one engine, it may then queue a request
3649	 * to a second engine via its execlists->tasklet *just* as we are
3650	 * calling engine->resume() and also writing the ELSP.
3651	 * Turning off the execlists->tasklet until the reset is over
3652	 * prevents the race.
3653	 */
3654	__tasklet_disable_sync_once(&execlists->tasklet);
3655	GEM_BUG_ON(!reset_in_progress(execlists));
3656
3657	/* And flush any current direct submission. */
3658	spin_lock_irqsave(&engine->active.lock, flags);
3659	spin_unlock_irqrestore(&engine->active.lock, flags);
3660
3661	/*
3662	 * We stop the engines, otherwise we might get a failed reset and a
3663	 * dead gpu (on elk). Even a gpu as modern as kbl can suffer a
3664	 * system hang if a batchbuffer is still progressing when
3665	 * the reset is issued, regardless of the READY_TO_RESET ack.
3666	 * Thus assume it is best to stop the engines on all gens
3667	 * where we have a gpu reset.
3668	 *
3669	 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3670	 *
3671	 * FIXME: Wa for more modern gens needs to be validated
3672	 */
3673	intel_engine_stop_cs(engine);
3674}
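
/*
 * The empty lock/unlock pair above is deliberate: any direct submitter that
 * raced with the reset and is still inside the engine->active.lock critical
 * section must drain out before we continue. A generic sketch of the idiom
 * (hypothetical helper name, same lock discipline as this file):
 *
 *	static void wait_for_racing_submitters(struct intel_engine_cs *engine)
 *	{
 *		unsigned long flags;
 *
 *		spin_lock_irqsave(&engine->active.lock, flags);
 *		spin_unlock_irqrestore(&engine->active.lock, flags);
 *	}
 *
 * Nothing is done under the lock; acquiring it simply waits out any holder
 * that was already inside.
 */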
3675
3676static void reset_csb_pointers(struct intel_engine_cs *engine)
3677{
3678	struct intel_engine_execlists * const execlists = &engine->execlists;
3679	const unsigned int reset_value = execlists->csb_size - 1;
3680
3681	ring_set_paused(engine, 0);
3682
3683	/*
3684	 * After a reset, the HW starts writing into CSB entry [0]. We
3685	 * therefore have to set our HEAD pointer back one entry so that
3686	 * the *first* entry we check is entry 0. To complicate this further,
3687	 * as we don't wait for the first interrupt after reset, we have to
3688	 * fake the HW write to point back to the last entry so that our
3689	 * inline comparison of our cached head position against the last HW
3690	 * write works even before the first interrupt.
3691	 */
3692	execlists->csb_head = reset_value;
3693	WRITE_ONCE(*execlists->csb_write, reset_value);
3694	wmb(); /* Make sure this is visible to HW (paranoia?) */
3695
3696	/*
3697	 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3698	 * Bludgeon them with a mmio update to be sure.
3699	 */
3700	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3701		     reset_value << 8 | reset_value);
3702	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3703
3704	invalidate_csb_entries(&execlists->csb_status[0],
3705			       &execlists->csb_status[reset_value]);
3706}
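
/*
 * Illustrative sketch (hypothetical names) of why both pointers are parked
 * at csb_size - 1: a CSB-style consumer advances its cached head *before*
 * reading, and only while head differs from the producer's last write.
 * Starting both at the final slot therefore makes entry 0, the first slot
 * the HW fills after reset, also the first entry inspected:
 *
 *	static void consume_csb(unsigned int *head, unsigned int write,
 *				unsigned int size, const u64 *entries)
 *	{
 *		while (*head != write) {
 *			*head = (*head + 1) % size;
 *			handle_csb_event(entries[*head]);
 *		}
 *	}
 */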
3707
3708static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3709{
3710	int x;
3711
3712	x = lrc_ring_mi_mode(engine);
3713	if (x != -1) {
3714		regs[x + 1] &= ~STOP_RING;
3715		regs[x + 1] |= STOP_RING << 16;
3716	}
3717}
3718
3719static void __execlists_reset_reg_state(const struct intel_context *ce,
3720					const struct intel_engine_cs *engine)
3721{
3722	u32 *regs = ce->lrc_reg_state;
3723
3724	__reset_stop_ring(regs, engine);
3725}
3726
3727static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3728{
3729	struct intel_engine_execlists * const execlists = &engine->execlists;
3730	struct intel_context *ce;
3731	struct i915_request *rq;
3732	u32 head;
3733
3734	mb(); /* paranoia: read the CSB pointers from after the reset */
3735	clflush((vaddr_t)execlists->csb_write);
3736	mb();
3737
3738	process_csb(engine); /* drain preemption events */
3739
3740	/* Following the reset, we need to reload the CSB read/write pointers */
3741	reset_csb_pointers(engine);
3742
3743	/*
3744	 * Save the currently executing context, even if we completed
3745	 * its request, it was still running at the time of the
3746	 * reset and will have been clobbered.
3747	 */
3748	rq = execlists_active(execlists);
3749	if (!rq)
3750		goto unwind;
3751
3752	ce = rq->context;
3753	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3754
3755	if (i915_request_completed(rq)) {
3756		/* Idle context; tidy up the ring so we can restart afresh */
3757		head = intel_ring_wrap(ce->ring, rq->tail);
3758		goto out_replay;
3759	}
3760
3761	/* We still have requests in-flight; the engine should be active */
3762	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3763
3764	/* Context has requests still in-flight; it should not be idle! */
3765	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3766
3767	rq = active_request(ce->timeline, rq);
3768	head = intel_ring_wrap(ce->ring, rq->head);
3769	GEM_BUG_ON(head == ce->ring->tail);
3770
3771	/*
3772	 * If this request hasn't started yet, e.g. it is waiting on a
3773	 * semaphore, we need to avoid skipping the request or else we
3774	 * break the signaling chain. However, if the context is corrupt
3775	 * the request will not restart and we will be stuck with a wedged
3776	 * device. It is quite often the case that if we issue a reset
3777	 * while the GPU is loading the context image, the context
3778	 * image becomes corrupt.
3779	 *
3780	 * Otherwise, if we have not started yet, the request should replay
3781	 * perfectly and we do not need to flag the result as being erroneous.
3782	 */
3783	if (!i915_request_started(rq))
3784		goto out_replay;
3785
3786	/*
3787	 * If the request was innocent, we leave the request in the ELSP
3788	 * and will try to replay it on restarting. The context image may
3789	 * have been corrupted by the reset, in which case we may have
3790	 * to service a new GPU hang, but more likely we can continue on
3791	 * without impact.
3792	 *
3793	 * If the request was guilty, we presume the context is corrupt
3794	 * and have to at least restore the RING register in the context
3795	 * image back to the expected values to skip over the guilty request.
3796	 */
3797	__i915_request_reset(rq, stalled);
3798	if (!stalled)
3799		goto out_replay;
3800
3801	/*
3802	 * We want a simple context + ring to execute the breadcrumb update.
3803	 * We cannot rely on the context being intact across the GPU hang,
3804	 * so clear it and rebuild just what we need for the breadcrumb.
3805	 * All pending requests for this context will be zapped, and any
3806	 * future request will be after userspace has had the opportunity
3807	 * to recreate its own state.
3808	 */
3809	GEM_BUG_ON(!intel_context_is_pinned(ce));
3810	restore_default_state(ce, engine);
3811
3812out_replay:
3813	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3814		     head, ce->ring->tail);
3815	__execlists_reset_reg_state(ce, engine);
3816	__execlists_update_reg_state(ce, engine, head);
3817	ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3818
3819unwind:
3820	/* Push back any incomplete requests for replay after the reset. */
3821	cancel_port_requests(execlists);
3822	__unwind_incomplete_requests(engine);
3823}
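
/*
 * Condensed view of the replay decision taken above (a summary only; the
 * authoritative flow is the code itself):
 *
 *	no request inflight          -> nothing to replay, just unwind
 *	request already completed    -> head = wrap(rq->tail); ring restarts clean
 *	request not yet started      -> head = wrap(rq->head); replay untouched
 *	started, not blamed          -> head = wrap(rq->head); replay untouched
 *	started and blamed (stalled) -> rebuild the context from default state,
 *	                                then replay only the breadcrumb
 */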
3824
3825static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3826{
3827	unsigned long flags;
3828
3829	ENGINE_TRACE(engine, "\n");
3830
3831	spin_lock_irqsave(&engine->active.lock, flags);
3832
3833	__execlists_reset(engine, stalled);
3834
3835	spin_unlock_irqrestore(&engine->active.lock, flags);
3836}
3837
3838static void nop_submission_tasklet(unsigned long data)
3839{
3840	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3841
3842	/* The driver is wedged; don't process any more events. */
3843	WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
3844}
3845
3846static void execlists_reset_cancel(struct intel_engine_cs *engine)
3847{
3848	struct intel_engine_execlists * const execlists = &engine->execlists;
3849	struct i915_request *rq, *rn;
3850	struct rb_node *rb;
3851	unsigned long flags;
3852
3853	ENGINE_TRACE(engine, "\n");
3854
3855	/*
3856	 * Before we call engine->cancel_requests(), we should have exclusive
3857	 * access to the submission state. This is arranged for us by the
3858	 * caller disabling the interrupt generation, the tasklet and other
3859	 * threads that may then access the same state, giving us a free hand
3860	 * to reset state. However, we still need to let lockdep be aware that
3861	 * we know this state may be accessed in hardirq context, so we
3862	 * disable the irq around this manipulation and we want to keep
3863	 * the spinlock focused on its duties and not accidentally conflate
3864	 * coverage to the submission's irq state. (Similarly, although we
3865	 * shouldn't need to disable irq around the manipulation of the
3866	 * submission's irq state, we also wish to remind ourselves that
3867	 * it is irq state.)
3868	 */
3869	spin_lock_irqsave(&engine->active.lock, flags);
3870
3871	__execlists_reset(engine, true);
3872
3873	/* Mark all executing requests as skipped. */
3874	list_for_each_entry(rq, &engine->active.requests, sched.link)
3875		mark_eio(rq);
3876
3877	/* Flush the queued requests to the timeline list (for retiring). */
3878	while ((rb = rb_first_cached(&execlists->queue))) {
3879		struct i915_priolist *p = to_priolist(rb);
3880		int i;
3881
3882		priolist_for_each_request_consume(rq, rn, p, i) {
3883			mark_eio(rq);
3884			__i915_request_submit(rq);
3885		}
3886
3887		rb_erase_cached(&p->node, &execlists->queue);
3888		i915_priolist_free(p);
3889	}
3890
3891	/* On-hold requests will be flushed to timeline upon their release */
3892	list_for_each_entry(rq, &engine->active.hold, sched.link)
3893		mark_eio(rq);
3894
3895	/* Cancel all attached virtual engines */
3896	while ((rb = rb_first_cached(&execlists->virtual))) {
3897		struct virtual_engine *ve =
3898			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3899
3900		rb_erase_cached(rb, &execlists->virtual);
3901		RB_CLEAR_NODE(rb);
3902
3903		spin_lock(&ve->base.active.lock);
3904		rq = fetch_and_zero(&ve->request);
3905		if (rq) {
3906			mark_eio(rq);
3907
3908			rq->engine = engine;
3909			__i915_request_submit(rq);
3910			i915_request_put(rq);
3911
3912			ve->base.execlists.queue_priority_hint = INT_MIN;
3913		}
3914		spin_unlock(&ve->base.active.lock);
3915	}
3916
3917	/* Remaining _unready_ requests will be nop'ed when submitted */
3918
3919	execlists->queue_priority_hint = INT_MIN;
3920	execlists->queue = RB_ROOT_CACHED;
3921
3922	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3923	execlists->tasklet.func = nop_submission_tasklet;
3924
3925	spin_unlock_irqrestore(&engine->active.lock, flags);
3926}
3927
3928static void execlists_reset_finish(struct intel_engine_cs *engine)
3929{
3930	struct intel_engine_execlists * const execlists = &engine->execlists;
3931
3932	/*
3933	 * After a GPU reset, we may have requests to replay. Do so now while
3934	 * we still have the forcewake to be sure that the GPU is not allowed
3935	 * to sleep before we restart and reload a context.
3936	 */
3937	GEM_BUG_ON(!reset_in_progress(execlists));
3938	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3939		execlists->tasklet.func(execlists->tasklet.data);
3940
3941	if (__tasklet_enable(&execlists->tasklet))
3942		/* And kick in case we missed a new request submission. */
3943		tasklet_hi_schedule(&execlists->tasklet);
3944	ENGINE_TRACE(engine, "depth->%d\n",
3945		     atomic_read(&execlists->tasklet.count));
3946}
3947
3948static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3949				    u64 offset, u32 len,
3950				    const unsigned int flags)
3951{
3952	u32 *cs;
3953
3954	cs = intel_ring_begin(rq, 4);
3955	if (IS_ERR(cs))
3956		return PTR_ERR(cs);
3957
3958	/*
3959	 * WaDisableCtxRestoreArbitration:bdw,chv
3960	 *
3961	 * We would not need to perform MI_ARB_ENABLE as often as we do (in
3962	 * particular on all the gens that do not need the w/a at all!) if we
3963	 * took care to make sure that on every switch into this context
3964	 * (both ordinary and for preemption) arbitration was enabled; we
3965	 * would then be fine.  However, for gen8 there is another w/a that
3966	 * requires us to not preempt inside GPGPU execution, so we keep
3967	 * arbitration disabled for gen8 batches. Arbitration will be
3968	 * re-enabled before we close the request
3969	 * (engine->emit_fini_breadcrumb).
3970	 */
3971	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3972
3973	/* FIXME(BDW+): Address space and security selectors. */
3974	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3975		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3976	*cs++ = lower_32_bits(offset);
3977	*cs++ = upper_32_bits(offset);
3978
3979	intel_ring_advance(rq, cs);
3980
3981	return 0;
3982}
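
/*
 * Layout of the four dwords emitted above for a non-secure batch
 * (illustrative; bit 8 is assumed here to be the MI_BATCH_BUFFER_START
 * address-space select, so user batches execute from the PPGTT):
 *
 *	dw0: MI_ARB_ON_OFF | MI_ARB_DISABLE	keep arbitration off (gen8 w/a)
 *	dw1: MI_BATCH_BUFFER_START_GEN8 | BIT(8)
 *	dw2: lower_32_bits(offset)
 *	dw3: upper_32_bits(offset)
 */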
3983
3984static int gen8_emit_bb_start(struct i915_request *rq,
3985			      u64 offset, u32 len,
3986			      const unsigned int flags)
3987{
3988	u32 *cs;
3989
3990	cs = intel_ring_begin(rq, 6);
3991	if (IS_ERR(cs))
3992		return PTR_ERR(cs);
3993
3994	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3995
3996	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
3997		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3998	*cs++ = lower_32_bits(offset);
3999	*cs++ = upper_32_bits(offset);
4000
4001	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4002	*cs++ = MI_NOOP;
4003
4004	intel_ring_advance(rq, cs);
4005
4006	return 0;
4007}
4008
4009static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4010{
4011	ENGINE_WRITE(engine, RING_IMR,
4012		     ~(engine->irq_enable_mask | engine->irq_keep_mask));
4013	ENGINE_POSTING_READ(engine, RING_IMR);
4014}
4015
4016static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4017{
4018	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4019}
4020
4021static int gen8_emit_flush(struct i915_request *request, u32 mode)
4022{
4023	u32 cmd, *cs;
4024
4025	cs = intel_ring_begin(request, 4);
4026	if (IS_ERR(cs))
4027		return PTR_ERR(cs);
4028
4029	cmd = MI_FLUSH_DW + 1;
4030
4031	/* We always require a command barrier so that subsequent
4032	 * commands, such as breadcrumb interrupts, are strictly ordered
4033	 * wrt the contents of the write cache being flushed to memory
4034	 * (and thus being coherent from the CPU).
4035	 */
4036	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4037
4038	if (mode & EMIT_INVALIDATE) {
4039		cmd |= MI_INVALIDATE_TLB;
4040		if (request->engine->class == VIDEO_DECODE_CLASS)
4041			cmd |= MI_INVALIDATE_BSD;
4042	}
4043
4044	*cs++ = cmd;
4045	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4046	*cs++ = 0; /* upper addr */
4047	*cs++ = 0; /* value */
4048	intel_ring_advance(request, cs);
4049
4050	return 0;
4051}
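
/*
 * Resulting packet for mode & EMIT_INVALIDATE on a video decode engine
 * (illustrative, simply restating what the code above builds):
 *
 *	dw0: MI_FLUSH_DW + 1 | MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
 *	     MI_INVALIDATE_TLB | MI_INVALIDATE_BSD
 *	dw1: LRC_PPHWSP_SCRATCH_ADDR		post-sync write target
 *	dw2: 0					upper address bits
 *	dw3: 0					dummy value for the post-sync write
 */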
4052
4053static int gen8_emit_flush_render(struct i915_request *request,
4054				  u32 mode)
4055{
4056	bool vf_flush_wa = false, dc_flush_wa = false;
4057	u32 *cs, flags = 0;
4058	int len;
4059
4060	flags |= PIPE_CONTROL_CS_STALL;
4061
4062	if (mode & EMIT_FLUSH) {
4063		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4064		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4065		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4066		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4067	}
4068
4069	if (mode & EMIT_INVALIDATE) {
4070		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4071		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4072		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4073		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4074		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4075		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4076		flags |= PIPE_CONTROL_QW_WRITE;
4077		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4078
4079		/*
4080		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4081		 * pipe control.
4082		 */
4083		if (IS_GEN(request->i915, 9))
4084			vf_flush_wa = true;
4085
4086		/* WaForGAMHang:kbl */
4087		if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
4088			dc_flush_wa = true;
4089	}
4090
4091	len = 6;
4092
4093	if (vf_flush_wa)
4094		len += 6;
4095
4096	if (dc_flush_wa)
4097		len += 12;
4098
4099	cs = intel_ring_begin(request, len);
4100	if (IS_ERR(cs))
4101		return PTR_ERR(cs);
4102
4103	if (vf_flush_wa)
4104		cs = gen8_emit_pipe_control(cs, 0, 0);
4105
4106	if (dc_flush_wa)
4107		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4108					    0);
4109
4110	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4111
4112	if (dc_flush_wa)
4113		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4114
4115	intel_ring_advance(request, cs);
4116
4117	return 0;
4118}
4119
4120static int gen11_emit_flush_render(struct i915_request *request,
4121				   u32 mode)
4122{
4123	if (mode & EMIT_FLUSH) {
4124		u32 *cs;
4125		u32 flags = 0;
4126
4127		flags |= PIPE_CONTROL_CS_STALL;
4128
4129		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4130		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4131		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4132		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4133		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4134		flags |= PIPE_CONTROL_QW_WRITE;
4135		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4136
4137		cs = intel_ring_begin(request, 6);
4138		if (IS_ERR(cs))
4139			return PTR_ERR(cs);
4140
4141		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4142		intel_ring_advance(request, cs);
4143	}
4144
4145	if (mode & EMIT_INVALIDATE) {
4146		u32 *cs;
4147		u32 flags = 0;
4148
4149		flags |= PIPE_CONTROL_CS_STALL;
4150
4151		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4152		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4153		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4154		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4155		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4156		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4157		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4158		flags |= PIPE_CONTROL_QW_WRITE;
4159		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4160
4161		cs = intel_ring_begin(request, 6);
4162		if (IS_ERR(cs))
4163			return PTR_ERR(cs);
4164
4165		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4166		intel_ring_advance(request, cs);
4167	}
4168
4169	return 0;
4170}
4171
4172static u32 preparser_disable(bool state)
4173{
4174	return MI_ARB_CHECK | 1 << 8 | state;
4175}
4176
4177static int gen12_emit_flush_render(struct i915_request *request,
4178				   u32 mode)
4179{
4180	if (mode & EMIT_FLUSH) {
4181		u32 flags = 0;
4182		u32 *cs;
4183
4184		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4185		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4186		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4187		/* Wa_1409600907:tgl */
4188		flags |= PIPE_CONTROL_DEPTH_STALL;
4189		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4190		flags |= PIPE_CONTROL_FLUSH_ENABLE;
4191		flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
4192
4193		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4194		flags |= PIPE_CONTROL_QW_WRITE;
4195
4196		flags |= PIPE_CONTROL_CS_STALL;
4197
4198		cs = intel_ring_begin(request, 6);
4199		if (IS_ERR(cs))
4200			return PTR_ERR(cs);
4201
4202		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4203		intel_ring_advance(request, cs);
4204	}
4205
4206	if (mode & EMIT_INVALIDATE) {
4207		u32 flags = 0;
4208		u32 *cs;
4209
4210		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4211		flags |= PIPE_CONTROL_TLB_INVALIDATE;
4212		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4213		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4214		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4215		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4216		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4217		flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
4218
4219		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4220		flags |= PIPE_CONTROL_QW_WRITE;
4221
4222		flags |= PIPE_CONTROL_CS_STALL;
4223
4224		cs = intel_ring_begin(request, 8);
4225		if (IS_ERR(cs))
4226			return PTR_ERR(cs);
4227
4228		/*
4229		 * Prevent the pre-parser from skipping past the TLB
4230		 * invalidate and loading a stale page for the batch
4231		 * buffer / request payload.
4232		 */
4233		*cs++ = preparser_disable(true);
4234
4235		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4236
4237		*cs++ = preparser_disable(false);
4238		intel_ring_advance(request, cs);
4239	}
4240
4241	return 0;
4242}
4243
4244static void assert_request_valid(struct i915_request *rq)
4245{
4246	struct intel_ring *ring __maybe_unused = rq->ring;
4247
4248	/* Can we unwind this request without appearing to go forwards? */
4249	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4250}
4251
4252/*
4253 * Reserve space for 2 NOOPs at the end of each request to be
4254 * used as a workaround for not being allowed to do lite
4255 * restore with HEAD==TAIL (WaIdleLiteRestore).
4256 */
4257static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4258{
4259	/* Ensure there's always at least one preemption point per-request. */
4260	*cs++ = MI_ARB_CHECK;
4261	*cs++ = MI_NOOP;
4262	request->wa_tail = intel_ring_offset(request, cs);
4263
4264	/* Check that entire request is less than half the ring */
4265	assert_request_valid(request);
4266
4267	return cs;
4268}
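
/*
 * Rough sketch of why the padding above matters (assumption: on a "lite
 * restore", i.e. resubmitting the context that is already active, the HW
 * requires RING_HEAD != RING_TAIL). The two extra dwords give every request
 * a second, later tail to submit with:
 *
 *	request->tail		offset just after the breadcrumb
 *	request->wa_tail	request->tail + 8 bytes (MI_ARB_CHECK, MI_NOOP)
 *
 * so a resubmission that would otherwise land exactly on the current HEAD
 * can use wa_tail instead and still make forward progress.
 */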
4269
4270static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4271{
4272	*cs++ = MI_SEMAPHORE_WAIT |
4273		MI_SEMAPHORE_GLOBAL_GTT |
4274		MI_SEMAPHORE_POLL |
4275		MI_SEMAPHORE_SAD_EQ_SDD;
4276	*cs++ = 0;
4277	*cs++ = intel_hws_preempt_address(request->engine);
4278	*cs++ = 0;
4279
4280	return cs;
4281}
4282
4283static __always_inline u32*
4284gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4285				 u32 *cs)
4286{
4287	*cs++ = MI_USER_INTERRUPT;
4288
4289	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4290	if (intel_engine_has_semaphores(request->engine))
4291		cs = emit_preempt_busywait(request, cs);
4292
4293	request->tail = intel_ring_offset(request, cs);
4294	assert_ring_tail_valid(request->ring, request->tail);
4295
4296	return gen8_emit_wa_tail(request, cs);
4297}
4298
4299static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4300{
4301	cs = gen8_emit_ggtt_write(cs,
4302				  request->fence.seqno,
4303				  i915_request_active_timeline(request)->hwsp_offset,
4304				  0);
4305
4306	return gen8_emit_fini_breadcrumb_footer(request, cs);
4307}
4308
4309static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4310{
4311	cs = gen8_emit_pipe_control(cs,
4312				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4313				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4314				    PIPE_CONTROL_DC_FLUSH_ENABLE,
4315				    0);
4316
4317	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4318	cs = gen8_emit_ggtt_write_rcs(cs,
4319				      request->fence.seqno,
4320				      i915_request_active_timeline(request)->hwsp_offset,
4321				      PIPE_CONTROL_FLUSH_ENABLE |
4322				      PIPE_CONTROL_CS_STALL);
4323
4324	return gen8_emit_fini_breadcrumb_footer(request, cs);
4325}
4326
4327static u32 *
4328gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4329{
4330	cs = gen8_emit_ggtt_write_rcs(cs,
4331				      request->fence.seqno,
4332				      i915_request_active_timeline(request)->hwsp_offset,
4333				      PIPE_CONTROL_CS_STALL |
4334				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4335				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4336				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4337				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4338				      PIPE_CONTROL_FLUSH_ENABLE);
4339
4340	return gen8_emit_fini_breadcrumb_footer(request, cs);
4341}
4342
4343/*
4344 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4345 * flush and will continue pre-fetching the instructions after it before the
4346 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4347 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4348 * of the next request before the memory has been flushed, we're guaranteed that
4349 * we won't access the batch itself too early.
4350 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4351 * so, if the current request is modifying an instruction in the next request on
4352 * the same intel_context, we might pre-fetch and then execute the pre-update
4353 * instruction. To avoid this, the users of self-modifying code should either
4354 * disable the parser around the code emitting the memory writes, via a new flag
4355 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4356 * the in-kernel use-cases we've opted to use a separate context, see
4357 * reloc_gpu() as an example.
4358 * All the above applies only to the instructions themselves. Non-inline data
4359 * used by the instructions is not pre-fetched.
4360 */
4361
4362static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4363{
4364	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4365		MI_SEMAPHORE_GLOBAL_GTT |
4366		MI_SEMAPHORE_POLL |
4367		MI_SEMAPHORE_SAD_EQ_SDD;
4368	*cs++ = 0;
4369	*cs++ = intel_hws_preempt_address(request->engine);
4370	*cs++ = 0;
4371	*cs++ = 0;
4372	*cs++ = MI_NOOP;
4373
4374	return cs;
4375}
4376
4377static __always_inline u32*
4378gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4379{
4380	*cs++ = MI_USER_INTERRUPT;
4381
4382	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4383	if (intel_engine_has_semaphores(request->engine))
4384		cs = gen12_emit_preempt_busywait(request, cs);
4385
4386	request->tail = intel_ring_offset(request, cs);
4387	assert_ring_tail_valid(request->ring, request->tail);
4388
4389	return gen8_emit_wa_tail(request, cs);
4390}
4391
4392static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4393{
4394	cs = gen8_emit_ggtt_write(cs,
4395				  request->fence.seqno,
4396				  i915_request_active_timeline(request)->hwsp_offset,
4397				  0);
4398
4399	return gen12_emit_fini_breadcrumb_footer(request, cs);
4400}
4401
4402static u32 *
4403gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4404{
4405	cs = gen8_emit_ggtt_write_rcs(cs,
4406				      request->fence.seqno,
4407				      i915_request_active_timeline(request)->hwsp_offset,
4408				      PIPE_CONTROL_CS_STALL |
4409				      PIPE_CONTROL_TILE_CACHE_FLUSH |
4410				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4411				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4412				      /* Wa_1409600907:tgl */
4413				      PIPE_CONTROL_DEPTH_STALL |
4414				      PIPE_CONTROL_DC_FLUSH_ENABLE |
4415				      PIPE_CONTROL_FLUSH_ENABLE |
4416				      PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4417
4418	return gen12_emit_fini_breadcrumb_footer(request, cs);
4419}
4420
4421static void execlists_park(struct intel_engine_cs *engine)
4422{
4423	cancel_timer(&engine->execlists.timer);
4424	cancel_timer(&engine->execlists.preempt);
4425}
4426
4427void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4428{
4429	engine->submit_request = execlists_submit_request;
4430	engine->schedule = i915_schedule;
4431	engine->execlists.tasklet.func = execlists_submission_tasklet;
4432
4433	engine->reset.prepare = execlists_reset_prepare;
4434	engine->reset.rewind = execlists_reset_rewind;
4435	engine->reset.cancel = execlists_reset_cancel;
4436	engine->reset.finish = execlists_reset_finish;
4437
4438	engine->park = execlists_park;
4439	engine->unpark = NULL;
4440
4441	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4442	if (!intel_vgpu_active(engine->i915)) {
4443		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4444		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
4445			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4446			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
4447				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
4448		}
4449	}
4450
4451	if (INTEL_GEN(engine->i915) >= 12)
4452		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4453
4454	if (intel_engine_has_preemption(engine))
4455		engine->emit_bb_start = gen8_emit_bb_start;
4456	else
4457		engine->emit_bb_start = gen8_emit_bb_start_noarb;
4458}
4459
4460static void execlists_shutdown(struct intel_engine_cs *engine)
4461{
4462	/* Synchronise with residual timers and any softirq they raise */
4463	del_timer_sync(&engine->execlists.timer);
4464	del_timer_sync(&engine->execlists.preempt);
4465	tasklet_kill(&engine->execlists.tasklet);
4466}
4467
4468static void execlists_release(struct intel_engine_cs *engine)
4469{
4470	execlists_shutdown(engine);
4471
4472	intel_engine_cleanup_common(engine);
4473	lrc_destroy_wa_ctx(engine);
4474}
4475
4476static void
4477logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4478{
4479	/* Default vfuncs which can be overridden by each engine. */
4480
4481	engine->resume = execlists_resume;
4482
4483	engine->cops = &execlists_context_ops;
4484	engine->request_alloc = execlists_request_alloc;
4485
4486	engine->emit_flush = gen8_emit_flush;
4487	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4488	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4489	if (INTEL_GEN(engine->i915) >= 12)
4490		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4491
4492	engine->set_default_submission = intel_execlists_set_default_submission;
4493
4494	if (INTEL_GEN(engine->i915) < 11) {
4495		engine->irq_enable = gen8_logical_ring_enable_irq;
4496		engine->irq_disable = gen8_logical_ring_disable_irq;
4497	} else {
4498		/*
4499		 * TODO: On Gen11 interrupt masks need to be clear
4500		 * to allow C6 entry. Keep interrupts enabled at all
4501		 * times and take the hit of generating extra interrupts
4502		 * until a more refined solution exists.
4503		 */
4504	}
4505}
4506
4507static inline void
4508logical_ring_default_irqs(struct intel_engine_cs *engine)
4509{
4510	unsigned int shift = 0;
4511
4512	if (INTEL_GEN(engine->i915) < 11) {
4513		const u8 irq_shifts[] = {
4514			[RCS0]  = GEN8_RCS_IRQ_SHIFT,
4515			[BCS0]  = GEN8_BCS_IRQ_SHIFT,
4516			[VCS0]  = GEN8_VCS0_IRQ_SHIFT,
4517			[VCS1]  = GEN8_VCS1_IRQ_SHIFT,
4518			[VECS0] = GEN8_VECS_IRQ_SHIFT,
4519		};
4520
4521		shift = irq_shifts[engine->id];
4522	}
4523
4524	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4525	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4526	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
4527	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
4528}
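
/*
 * Worked example of the shifts above for a gen8 VCS1 engine (values taken
 * directly from the table and masks in this function):
 *
 *	irq_enable_mask = GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT
 *	irq_keep_mask   = (GT_CONTEXT_SWITCH_INTERRUPT |
 *			   GT_CS_MASTER_ERROR_INTERRUPT |
 *			   GT_WAIT_SEMAPHORE_INTERRUPT) << GEN8_VCS1_IRQ_SHIFT
 *
 * i.e. on gen8 all per-engine interrupt bits share GT registers, offset by
 * the engine's shift; on gen11+ the shift is simply left at 0.
 */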
4529
4530static void rcs_submission_override(struct intel_engine_cs *engine)
4531{
4532	switch (INTEL_GEN(engine->i915)) {
4533	case 12:
4534		engine->emit_flush = gen12_emit_flush_render;
4535		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4536		break;
4537	case 11:
4538		engine->emit_flush = gen11_emit_flush_render;
4539		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4540		break;
4541	default:
4542		engine->emit_flush = gen8_emit_flush_render;
4543		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4544		break;
4545	}
4546}
4547
4548int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4549{
4550	struct intel_engine_execlists * const execlists = &engine->execlists;
4551	struct drm_i915_private *i915 = engine->i915;
4552	struct intel_uncore *uncore = engine->uncore;
4553	u32 base = engine->mmio_base;
4554
4555	tasklet_init(&engine->execlists.tasklet,
4556		     execlists_submission_tasklet, (unsigned long)engine);
4557#ifdef __linux__
4558	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4559	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4560#else
4561	timeout_set(&engine->execlists.timer, execlists_timeslice,
4562	    &engine->execlists.timer);
4563	timeout_set(&engine->execlists.preempt, execlists_preempt,
4564	    &engine->execlists.preempt);
4565#endif
4566
4567	logical_ring_default_vfuncs(engine);
4568	logical_ring_default_irqs(engine);
4569
4570	if (engine->class == RENDER_CLASS)
4571		rcs_submission_override(engine);
4572
4573	if (intel_init_workaround_bb(engine))
4574		/*
4575		 * We continue even if we fail to initialize the WA batch,
4576		 * because we only expect rare glitches and nothing critical
4577		 * enough to prevent us from using the GPU.
4578		 */
4579		DRM_ERROR("WA batch buffer initialization failed\n");
4580
4581	if (HAS_LOGICAL_RING_ELSQ(i915)) {
4582		execlists->submit_reg = uncore->regs +
4583			i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4584		execlists->ctrl_reg = uncore->regs +
4585			i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4586	} else {
4587		execlists->submit_reg = uncore->regs +
4588			i915_mmio_reg_offset(RING_ELSP(base));
4589	}
4590
4591	execlists->csb_status =
4592		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4593
4594	execlists->csb_write =
4595		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
4596
4597	if (INTEL_GEN(i915) < 11)
4598		execlists->csb_size = GEN8_CSB_ENTRIES;
4599	else
4600		execlists->csb_size = GEN11_CSB_ENTRIES;
4601
4602	if (INTEL_GEN(engine->i915) >= 11) {
4603		execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
4604		execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
4605	}
4606
4607	reset_csb_pointers(engine);
4608
4609	/* Finally, take ownership and responsibility for cleanup! */
4610	engine->release = execlists_release;
4611
4612	return 0;
4613}
4614
4615static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4616{
4617	u32 indirect_ctx_offset;
4618
4619	switch (INTEL_GEN(engine->i915)) {
4620	default:
4621		MISSING_CASE(INTEL_GEN(engine->i915));
4622		/* fall through */
4623	case 12:
4624		indirect_ctx_offset =
4625			GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4626		break;
4627	case 11:
4628		indirect_ctx_offset =
4629			GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4630		break;
4631	case 10:
4632		indirect_ctx_offset =
4633			GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4634		break;
4635	case 9:
4636		indirect_ctx_offset =
4637			GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4638		break;
4639	case 8:
4640		indirect_ctx_offset =
4641			GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4642		break;
4643	}
4644
4645	return indirect_ctx_offset;
4646}
4647
4648
4649static void init_common_reg_state(u32 * const regs,
4650				  const struct intel_engine_cs *engine,
4651				  const struct intel_ring *ring,
4652				  bool inhibit)
4653{
4654	u32 ctl;
4655
4656	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4657	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4658	if (inhibit)
4659		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4660	if (INTEL_GEN(engine->i915) < 11)
4661		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4662					   CTX_CTRL_RS_CTX_ENABLE);
4663	regs[CTX_CONTEXT_CONTROL] = ctl;
4664
4665	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4666}
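
/*
 * The CTX_CONTEXT_CONTROL value built above uses "masked" register writes:
 * the upper 16 bits select which bits the write may change, the lower 16
 * carry the new values. A minimal sketch of the helpers' effect (assuming
 * the usual i915 definitions):
 *
 *	_MASKED_BIT_ENABLE(bit)  == (bit << 16) | bit	set bit
 *	_MASKED_BIT_DISABLE(bit) == (bit << 16)		clear bit
 *
 * which is why the inhibit path only ORs in the bare
 * CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT value: its mask half was already set
 * by the preceding _MASKED_BIT_DISABLE().
 */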
4667
4668static void init_wa_bb_reg_state(u32 * const regs,
4669				 const struct intel_engine_cs *engine,
4670				 u32 pos_bb_per_ctx)
4671{
4672	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4673
4674	if (wa_ctx->per_ctx.size) {
4675		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4676
4677		regs[pos_bb_per_ctx] =
4678			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4679	}
4680
4681	if (wa_ctx->indirect_ctx.size) {
4682		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4683
4684		regs[pos_bb_per_ctx + 2] =
4685			(ggtt_offset + wa_ctx->indirect_ctx.offset) |
4686			(wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4687
4688		regs[pos_bb_per_ctx + 4] =
4689			intel_lr_indirect_ctx_offset(engine) << 6;
4690	}
4691}
4692
4693static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4694{
4695	if (i915_vm_is_4lvl(&ppgtt->vm)) {
4696		/* 64b PPGTT (48bit canonical)
4697		 * PDP0_DESCRIPTOR contains the base address to PML4 and
4698		 * other PDP Descriptors are ignored.
4699		 */
4700		ASSIGN_CTX_PML4(ppgtt, regs);
4701	} else {
4702		ASSIGN_CTX_PDP(ppgtt, regs, 3);
4703		ASSIGN_CTX_PDP(ppgtt, regs, 2);
4704		ASSIGN_CTX_PDP(ppgtt, regs, 1);
4705		ASSIGN_CTX_PDP(ppgtt, regs, 0);
4706	}
4707}
4708
4709static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4710{
4711	if (i915_is_ggtt(vm))
4712		return i915_vm_to_ggtt(vm)->alias;
4713	else
4714		return i915_vm_to_ppgtt(vm);
4715}
4716
4717static void execlists_init_reg_state(u32 *regs,
4718				     const struct intel_context *ce,
4719				     const struct intel_engine_cs *engine,
4720				     const struct intel_ring *ring,
4721				     bool inhibit)
4722{
4723	/*
4724	 * A context is actually a big batch buffer with several
4725	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4726	 * values we are setting here are only for the first context restore:
4727	 * on a subsequent save, the GPU will recreate this batchbuffer with new
4728	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4729	 * we are not initializing here).
4730	 *
4731	 * Must keep consistent with virtual_update_register_offsets().
4732	 */
4733	set_offsets(regs, reg_offsets(engine), engine, inhibit);
4734
4735	init_common_reg_state(regs, engine, ring, inhibit);
4736	init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4737
4738	init_wa_bb_reg_state(regs, engine,
4739			     INTEL_GEN(engine->i915) >= 12 ?
4740			     GEN12_CTX_BB_PER_CTX_PTR :
4741			     CTX_BB_PER_CTX_PTR);
4742
4743	__reset_stop_ring(regs, engine);
4744}
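
/*
 * Rough shape of the register state written by set_offsets() above (an
 * illustrative fragment only; the exact register list and ordering are
 * per-gen, see reg_offsets()):
 *
 *	MI_LOAD_REGISTER_IMM(N)
 *	  RING_CONTEXT_CONTROL(base)	regs[CTX_CONTEXT_CONTROL]
 *	  RING_HEAD(base)		regs[CTX_RING_HEAD]
 *	  RING_TAIL(base)		regs[CTX_RING_TAIL]
 *	  RING_CTL(base)		regs[CTX_RING_CTL]
 *	  ...
 *	MI_LOAD_REGISTER_IMM(M)
 *	  PDP / PML4 descriptors	from init_ppgtt_reg_state()
 */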
4745
4746static int
4747populate_lr_context(struct intel_context *ce,
4748		    struct drm_i915_gem_object *ctx_obj,
4749		    struct intel_engine_cs *engine,
4750		    struct intel_ring *ring)
4751{
4752	bool inhibit = true;
4753	void *vaddr;
4754	int ret;
4755
4756	vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4757	if (IS_ERR(vaddr)) {
4758		ret = PTR_ERR(vaddr);
4759		DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4760		return ret;
4761	}
4762
4763	set_redzone(vaddr, engine);
4764
4765	if (engine->default_state) {
4766		void *defaults;
4767
4768		defaults = i915_gem_object_pin_map(engine->default_state,
4769						   I915_MAP_WB);
4770		if (IS_ERR(defaults)) {
4771			ret = PTR_ERR(defaults);
4772			goto err_unpin_ctx;
4773		}
4774
4775		memcpy(vaddr, defaults, engine->context_size);
4776		i915_gem_object_unpin_map(engine->default_state);
4777		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
4778		inhibit = false;
4779	}
4780
4781	/* Clear the ppHWSP (inc. per-context counters) */
4782	memset(vaddr, 0, PAGE_SIZE);
4783
4784	/*
4785	 * The second page of the context object contains some registers which
4786	 * must be set up prior to the first execution.
4787	 */
4788	execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4789				 ce, engine, ring, inhibit);
4790
4791	ret = 0;
4792err_unpin_ctx:
4793	__i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4794	i915_gem_object_unpin_map(ctx_obj);
4795	return ret;
4796}
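
/*
 * Context object layout assumed by populate_lr_context() above (a sketch in
 * PAGE_SIZE units, not a per-gen guarantee):
 *
 *	page 0			per-process HWSP (ppHWSP), zeroed here
 *	page LRC_STATE_PN	register state, filled by execlists_init_reg_state()
 *	following pages		rest of the context image (from default_state, if any)
 *	[+1 trailing page]	redzone, only with CONFIG_DRM_I915_DEBUG_GEM
 */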
4797
4798static int __execlists_context_alloc(struct intel_context *ce,
4799				     struct intel_engine_cs *engine)
4800{
4801	struct drm_i915_gem_object *ctx_obj;
4802	struct intel_ring *ring;
4803	struct i915_vma *vma;
4804	u32 context_size;
4805	int ret;
4806
4807	GEM_BUG_ON(ce->state);
4808	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4809
4810	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4811		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4812
4813	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4814	if (IS_ERR(ctx_obj))
4815		return PTR_ERR(ctx_obj);
4816
4817	vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4818	if (IS_ERR(vma)) {
4819		ret = PTR_ERR(vma);
4820		goto error_deref_obj;
4821	}
4822
4823	if (!ce->timeline) {
4824		struct intel_timeline *tl;
4825		struct i915_vma *hwsp;
4826
4827		/*
4828		 * Use the static global HWSP for the kernel context, and
4829		 * a dynamically allocated cacheline for everyone else.
4830		 */
4831		hwsp = NULL;
4832		if (unlikely(intel_context_is_barrier(ce)))
4833			hwsp = engine->status_page.vma;
4834
4835		tl = intel_timeline_create(engine->gt, hwsp);
4836		if (IS_ERR(tl)) {
4837			ret = PTR_ERR(tl);
4838			goto error_deref_obj;
4839		}
4840
4841		ce->timeline = tl;
4842	}
4843
4844	ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4845	if (IS_ERR(ring)) {
4846		ret = PTR_ERR(ring);
4847		goto error_deref_obj;
4848	}
4849
4850	ret = populate_lr_context(ce, ctx_obj, engine, ring);
4851	if (ret) {
4852		DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4853		goto error_ring_free;
4854	}
4855
4856	ce->ring = ring;
4857	ce->state = vma;
4858
4859	return 0;
4860
4861error_ring_free:
4862	intel_ring_put(ring);
4863error_deref_obj:
4864	i915_gem_object_put(ctx_obj);
4865	return ret;
4866}
4867
4868static struct list_head *virtual_queue(struct virtual_engine *ve)
4869{
4870	return &ve->base.execlists.default_priolist.requests[0];
4871}
4872
4873static void virtual_context_destroy(struct kref *kref)
4874{
4875	struct virtual_engine *ve =
4876		container_of(kref, typeof(*ve), context.ref);
4877	unsigned int n;
4878
4879	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4880	GEM_BUG_ON(ve->request);
4881	GEM_BUG_ON(ve->context.inflight);
4882
4883	for (n = 0; n < ve->num_siblings; n++) {
4884		struct intel_engine_cs *sibling = ve->siblings[n];
4885		struct rb_node *node = &ve->nodes[sibling->id].rb;
4886		unsigned long flags;
4887
4888		if (RB_EMPTY_NODE(node))
4889			continue;
4890
4891		spin_lock_irqsave(&sibling->active.lock, flags);
4892
4893		/* Detachment is lazily performed in the execlists tasklet */
4894		if (!RB_EMPTY_NODE(node))
4895			rb_erase_cached(node, &sibling->execlists.virtual);
4896
4897		spin_unlock_irqrestore(&sibling->active.lock, flags);
4898	}
4899	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4900
4901	if (ve->context.state)
4902		__execlists_context_fini(&ve->context);
4903	intel_context_fini(&ve->context);
4904
4905	kfree(ve->bonds);
4906	kfree(ve);
4907}
4908
4909static void virtual_engine_initial_hint(struct virtual_engine *ve)
4910{
4911	int swp;
4912
4913	/*
4914	 * Pick a random sibling on starting to help spread the load around.
4915	 *
4916	 * New contexts are typically created with exactly the same order
4917	 * of siblings, and often started in batches. Due to the way we iterate
4918	 * the array of sibling when submitting requests, sibling[0] is
4919	 * the array of siblings when submitting requests, sibling[0] is
4920	 * randomised across the system, we also help spread the load by the
4921	 * first engine we inspect being different each time.
4922	 *
4923	 * NB This does not force us to execute on this engine, it will just
4924	 * typically be the first we inspect for submission.
4925	 */
4926	swp = prandom_u32_max(ve->num_siblings);
4927	if (!swp)
4928		return;
4929
4930	swap(ve->siblings[swp], ve->siblings[0]);
4931	if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4932		virtual_update_register_offsets(ve->context.lrc_reg_state,
4933						ve->siblings[0]);
4934}
4935
4936static int virtual_context_alloc(struct intel_context *ce)
4937{
4938	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4939
4940	return __execlists_context_alloc(ce, ve->siblings[0]);
4941}
4942
4943static int virtual_context_pin(struct intel_context *ce)
4944{
4945	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4946	int err;
4947
4948	/* Note: we must use a real engine class for setting up reg state */
4949	err = __execlists_context_pin(ce, ve->siblings[0]);
4950	if (err)
4951		return err;
4952
4953	virtual_engine_initial_hint(ve);
4954	return 0;
4955}
4956
4957static void virtual_context_enter(struct intel_context *ce)
4958{
4959	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4960	unsigned int n;
4961
4962	for (n = 0; n < ve->num_siblings; n++)
4963		intel_engine_pm_get(ve->siblings[n]);
4964
4965	intel_timeline_enter(ce->timeline);
4966}
4967
4968static void virtual_context_exit(struct intel_context *ce)
4969{
4970	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4971	unsigned int n;
4972
4973	intel_timeline_exit(ce->timeline);
4974
4975	for (n = 0; n < ve->num_siblings; n++)
4976		intel_engine_pm_put(ve->siblings[n]);
4977}
4978
4979static const struct intel_context_ops virtual_context_ops = {
4980	.alloc = virtual_context_alloc,
4981
4982	.pin = virtual_context_pin,
4983	.unpin = execlists_context_unpin,
4984
4985	.enter = virtual_context_enter,
4986	.exit = virtual_context_exit,
4987
4988	.destroy = virtual_context_destroy,
4989};
4990
4991static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4992{
4993	struct i915_request *rq;
4994	intel_engine_mask_t mask;
4995
4996	rq = READ_ONCE(ve->request);
4997	if (!rq)
4998		return 0;
4999
5000	/* The rq is ready for submission; rq->execution_mask is now stable. */
5001	mask = rq->execution_mask;
5002	if (unlikely(!mask)) {
5003		/* Invalid selection, submit to a random engine in error */
5004		i915_request_set_error_once(rq, -ENODEV);
5005		mask = ve->siblings[0]->mask;
5006	}
5007
5008	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5009		     rq->fence.context, rq->fence.seqno,
5010		     mask, ve->base.execlists.queue_priority_hint);
5011
5012	return mask;
5013}
5014
5015static void virtual_submission_tasklet(unsigned long data)
5016{
5017	struct virtual_engine * const ve = (struct virtual_engine *)data;
5018	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5019	intel_engine_mask_t mask;
5020	unsigned int n;
5021
5022	rcu_read_lock();
5023	mask = virtual_submission_mask(ve);
5024	rcu_read_unlock();
5025	if (unlikely(!mask))
5026		return;
5027
5028	local_irq_disable();
5029	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
5030		struct intel_engine_cs *sibling = ve->siblings[n];
5031		struct ve_node * const node = &ve->nodes[sibling->id];
5032		struct rb_node **parent, *rb;
5033		bool first;
5034
5035		if (unlikely(!(mask & sibling->mask))) {
5036			if (!RB_EMPTY_NODE(&node->rb)) {
5037				spin_lock(&sibling->active.lock);
5038				rb_erase_cached(&node->rb,
5039						&sibling->execlists.virtual);
5040				RB_CLEAR_NODE(&node->rb);
5041				spin_unlock(&sibling->active.lock);
5042			}
5043			continue;
5044		}
5045
5046		spin_lock(&sibling->active.lock);
5047
5048		if (!RB_EMPTY_NODE(&node->rb)) {
5049			/*
5050			 * Cheat and avoid rebalancing the tree if we can
5051			 * reuse this node in situ.
5052			 */
5053			first = rb_first_cached(&sibling->execlists.virtual) ==
5054				&node->rb;
5055			if (prio == node->prio || (prio > node->prio && first))
5056				goto submit_engine;
5057
5058			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5059		}
5060
5061		rb = NULL;
5062		first = true;
5063		parent = &sibling->execlists.virtual.rb_root.rb_node;
5064		while (*parent) {
5065			struct ve_node *other;
5066
5067			rb = *parent;
5068			other = rb_entry(rb, typeof(*other), rb);
5069			if (prio > other->prio) {
5070				parent = &rb->rb_left;
5071			} else {
5072				parent = &rb->rb_right;
5073				first = false;
5074			}
5075		}
5076
5077		rb_link_node(&node->rb, rb, parent);
5078		rb_insert_color_cached(&node->rb,
5079				       &sibling->execlists.virtual,
5080				       first);
5081
5082submit_engine:
5083		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5084		node->prio = prio;
5085		if (first && prio > sibling->execlists.queue_priority_hint) {
5086			sibling->execlists.queue_priority_hint = prio;
5087			tasklet_hi_schedule(&sibling->execlists.tasklet);
5088		}
5089
5090		spin_unlock(&sibling->active.lock);
5091	}
5092	local_irq_enable();
5093}
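
/*
 * The per-sibling rb-tree walked above is keyed by request priority in
 * descending order: a node is inserted to the left of an existing node iff
 * its priority is strictly higher, and "first" survives only while we keep
 * descending left. A minimal sketch of that predicate (hypothetical helper):
 *
 *	static bool ve_node_goes_left(int prio, const struct ve_node *other)
 *	{
 *		return prio > other->prio;	higher priority sorts leftmost
 *	}
 *
 * Only when the node ends up leftmost and its priority beats the sibling's
 * current queue_priority_hint is the sibling's tasklet kicked.
 */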
5094
5095static void virtual_submit_request(struct i915_request *rq)
5096{
5097	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5098	struct i915_request *old;
5099	unsigned long flags;
5100
5101	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5102		     rq->fence.context,
5103		     rq->fence.seqno);
5104
5105	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5106
5107	spin_lock_irqsave(&ve->base.active.lock, flags);
5108
5109	old = ve->request;
5110	if (old) { /* background completion event from preempt-to-busy */
5111		GEM_BUG_ON(!i915_request_completed(old));
5112		__i915_request_submit(old);
5113		i915_request_put(old);
5114	}
5115
5116	if (i915_request_completed(rq)) {
5117		__i915_request_submit(rq);
5118
5119		ve->base.execlists.queue_priority_hint = INT_MIN;
5120		ve->request = NULL;
5121	} else {
5122		ve->base.execlists.queue_priority_hint = rq_prio(rq);
5123		ve->request = i915_request_get(rq);
5124
5125		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5126		list_move_tail(&rq->sched.link, virtual_queue(ve));
5127
5128		tasklet_schedule(&ve->base.execlists.tasklet);
5129	}
5130
5131	spin_unlock_irqrestore(&ve->base.active.lock, flags);
5132}
5133
5134static struct ve_bond *
5135virtual_find_bond(struct virtual_engine *ve,
5136		  const struct intel_engine_cs *master)
5137{
5138	int i;
5139
5140	for (i = 0; i < ve->num_bonds; i++) {
5141		if (ve->bonds[i].master == master)
5142			return &ve->bonds[i];
5143	}
5144
5145	return NULL;
5146}
5147
5148static void
5149virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5150{
5151	struct virtual_engine *ve = to_virtual_engine(rq->engine);
5152	intel_engine_mask_t allowed, exec;
5153	struct ve_bond *bond;
5154
5155	allowed = ~to_request(signal)->engine->mask;
5156
5157	bond = virtual_find_bond(ve, to_request(signal)->engine);
5158	if (bond)
5159		allowed &= bond->sibling_mask;
5160
5161	/* Restrict the bonded request to run on only the available engines */
5162	exec = READ_ONCE(rq->execution_mask);
5163	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5164		;
5165
5166	/* Prevent the master from being re-run on the bonded engines */
5167	to_request(signal)->execution_mask &= ~allowed;
5168}
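
/*
 * Sketch of the lock-free narrowing loop above: try_cmpxchg() refreshes
 * 'exec' with the current value on failure, so the loop retries until our
 * 'exec & allowed' update lands, even if other writers are shrinking the
 * mask concurrently:
 *
 *	exec = READ_ONCE(rq->execution_mask);
 *	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
 *		;	exec now holds the value that beat us; retry with it
 */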
5169
5170struct intel_context *
5171intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5172			       unsigned int count)
5173{
5174	struct virtual_engine *ve;
5175	unsigned int n;
5176	int err;
5177
5178	if (count == 0)
5179		return ERR_PTR(-EINVAL);
5180
5181	if (count == 1)
5182		return intel_context_create(siblings[0]);
5183
5184	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5185	if (!ve)
5186		return ERR_PTR(-ENOMEM);
5187
5188	ve->base.i915 = siblings[0]->i915;
5189	ve->base.gt = siblings[0]->gt;
5190	ve->base.uncore = siblings[0]->uncore;
5191	ve->base.id = -1;
5192
5193	ve->base.class = OTHER_CLASS;
5194	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5195	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5196	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5197
5198	/*
5199	 * The decision on whether to submit a request using semaphores
5200	 * depends on the saturated state of the engine. We only compute
5201	 * this during HW submission of the request, and we need for this
5202	 * this during HW submission of the request, and we need this
5203	 * to this engine. Virtual engines encompass more than one physical
5204	 * engine and so we cannot accurately tell in advance if one of those
5205	 * engines is already saturated and so cannot afford to use a semaphore
5206	 * and be pessimized in priority for doing so -- if we are the only
5207	 * context using semaphores after all other clients have stopped, we
5208	 * will be starved on the saturated system. Such a global switch for
5209	 * semaphores is less than ideal, but alas is the current compromise.
5210	 */
5211	ve->base.saturated = ALL_ENGINES;
5212
5213	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5214
5215	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5216	intel_engine_init_breadcrumbs(&ve->base);
5217	intel_engine_init_execlists(&ve->base);
5218	ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */
5219
5220	ve->base.cops = &virtual_context_ops;
5221	ve->base.request_alloc = execlists_request_alloc;
5222
5223	ve->base.schedule = i915_schedule;
5224	ve->base.submit_request = virtual_submit_request;
5225	ve->base.bond_execute = virtual_bond_execute;
5226
5227	INIT_LIST_HEAD(virtual_queue(ve));
5228	ve->base.execlists.queue_priority_hint = INT_MIN;
5229	tasklet_init(&ve->base.execlists.tasklet,
5230		     virtual_submission_tasklet,
5231		     (unsigned long)ve);
5232
5233	intel_context_init(&ve->context, &ve->base);
5234
5235	for (n = 0; n < count; n++) {
5236		struct intel_engine_cs *sibling = siblings[n];
5237
5238		GEM_BUG_ON(!is_power_of_2(sibling->mask));
5239		if (sibling->mask & ve->base.mask) {
5240			DRM_DEBUG("duplicate %s entry in load balancer\n",
5241				  sibling->name);
5242			err = -EINVAL;
5243			goto err_put;
5244		}
5245
5246		/*
5247		 * The virtual engine implementation is tightly coupled to
5248		 * the execlists backend -- we push requests out directly
5249		 * into a tree inside each physical engine. We could support
5250		 * layering if we handle cloning of the requests and
5251		 * submitting a copy into each backend.
5252		 */
5253		if (sibling->execlists.tasklet.func !=
5254		    execlists_submission_tasklet) {
5255			err = -ENODEV;
5256			goto err_put;
5257		}
5258
5259		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5260		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5261
5262		ve->siblings[ve->num_siblings++] = sibling;
5263		ve->base.mask |= sibling->mask;
5264
5265		/*
5266		 * All physical engines must be compatible for their emission
5267		 * functions (as we build the instructions during request
5268		 * construction and do not alter them before submission
5269		 * on the physical engine). We use the engine class as a guide
5270		 * here, although that could be refined.
5271		 */
5272		if (ve->base.class != OTHER_CLASS) {
5273			if (ve->base.class != sibling->class) {
5274				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5275					  sibling->class, ve->base.class);
5276				err = -EINVAL;
5277				goto err_put;
5278			}
5279			continue;
5280		}
5281
5282		ve->base.class = sibling->class;
5283		ve->base.uabi_class = sibling->uabi_class;
5284		snprintf(ve->base.name, sizeof(ve->base.name),
5285			 "v%dx%d", ve->base.class, count);
5286		ve->base.context_size = sibling->context_size;
5287
5288		ve->base.emit_bb_start = sibling->emit_bb_start;
5289		ve->base.emit_flush = sibling->emit_flush;
5290		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5291		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5292		ve->base.emit_fini_breadcrumb_dw =
5293			sibling->emit_fini_breadcrumb_dw;
5294
5295		ve->base.flags = sibling->flags;
5296	}
5297
5298	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5299
5300	return &ve->context;
5301
5302err_put:
5303	intel_context_put(&ve->context);
5304	return ERR_PTR(err);
5305}
5306
5307struct intel_context *
5308intel_execlists_clone_virtual(struct intel_engine_cs *src)
5309{
5310	struct virtual_engine *se = to_virtual_engine(src);
5311	struct intel_context *dst;
5312
5313	dst = intel_execlists_create_virtual(se->siblings,
5314					     se->num_siblings);
5315	if (IS_ERR(dst))
5316		return dst;
5317
5318	if (se->num_bonds) {
5319		struct virtual_engine *de = to_virtual_engine(dst->engine);
5320
5321		de->bonds = kmemdup(se->bonds,
5322				    sizeof(*se->bonds) * se->num_bonds,
5323				    GFP_KERNEL);
5324		if (!de->bonds) {
5325			intel_context_put(dst);
5326			return ERR_PTR(-ENOMEM);
5327		}
5328
5329		de->num_bonds = se->num_bonds;
5330	}
5331
5332	return dst;
5333}
5334
5335int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5336				     const struct intel_engine_cs *master,
5337				     const struct intel_engine_cs *sibling)
5338{
5339	struct virtual_engine *ve = to_virtual_engine(engine);
5340	struct ve_bond *bond;
5341	int n;
5342
5343	/* Sanity check the sibling is part of the virtual engine */
5344	for (n = 0; n < ve->num_siblings; n++)
5345		if (sibling == ve->siblings[n])
5346			break;
5347	if (n == ve->num_siblings)
5348		return -EINVAL;
5349
5350	bond = virtual_find_bond(ve, master);
5351	if (bond) {
5352		bond->sibling_mask |= sibling->mask;
5353		return 0;
5354	}
5355
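	/*
	 * No bond for this master yet: grow the bond array by one entry.
	 * The non-Linux build open-codes krealloc() below as
	 * kmalloc() + memcpy() + kfree().
	 */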
5356#ifdef __linux__
5357	bond = krealloc(ve->bonds,
5358			sizeof(*bond) * (ve->num_bonds + 1),
5359			GFP_KERNEL);
5360	if (!bond)
5361		return -ENOMEM;
5362#else
5363	bond = kmalloc(sizeof(*bond) * (ve->num_bonds + 1),
5364		       GFP_KERNEL);
5365	if (!bond)
5366		return -ENOMEM;
5367
5368	memcpy(bond, ve->bonds, sizeof(*bond) * ve->num_bonds);
5369	kfree(ve->bonds);
5370#endif
5371
5372	bond[ve->num_bonds].master = master;
5373	bond[ve->num_bonds].sibling_mask = sibling->mask;
5374
5375	ve->bonds = bond;
5376	ve->num_bonds++;
5377
5378	return 0;
5379}
5380
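/*
 * intel_virtual_engine_get_sibling() returns the @sibling'th physical
 * engine backing the virtual engine, or NULL if the index is out of range.
 */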
5381struct intel_engine_cs *
5382intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5383				 unsigned int sibling)
5384{
5385	struct virtual_engine *ve = to_virtual_engine(engine);
5386
5387	if (sibling >= ve->num_siblings)
5388		return NULL;
5389
5390	return ve->siblings[sibling];
5391}
5392
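/*
 * intel_execlists_show_requests() prints up to @max requests from each of
 * the engine's lists via @show_request: requests already passed to the HW
 * ("E"), the priority-sorted submission queue ("Q") and requests parked on
 * the virtual engines ("V"), eliding the middle of over-long lists.
 */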
5393void intel_execlists_show_requests(struct intel_engine_cs *engine,
5394				   struct drm_printer *m,
5395				   void (*show_request)(struct drm_printer *m,
5396							struct i915_request *rq,
5397							const char *prefix),
5398				   unsigned int max)
5399{
5400	const struct intel_engine_execlists *execlists = &engine->execlists;
5401	struct i915_request *rq, *last;
5402	unsigned long flags;
5403	unsigned int count;
5404	struct rb_node *rb;
5405
5406	spin_lock_irqsave(&engine->active.lock, flags);
5407
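	/* Requests already submitted to the HW. */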
5408	last = NULL;
5409	count = 0;
5410	list_for_each_entry(rq, &engine->active.requests, sched.link) {
5411		if (count++ < max - 1)
5412			show_request(m, rq, "\t\tE ");
5413		else
5414			last = rq;
5415	}
5416	if (last) {
5417		if (count > max) {
5418			drm_printf(m,
5419				   "\t\t...skipping %d executing requests...\n",
5420				   count - max);
5421		}
5422		show_request(m, last, "\t\tE ");
5423	}
5424
5425	if (execlists->switch_priority_hint != INT_MIN)
5426		drm_printf(m, "\t\tSwitch priority hint: %d\n",
5427			   READ_ONCE(execlists->switch_priority_hint));
5428	if (execlists->queue_priority_hint != INT_MIN)
5429		drm_printf(m, "\t\tQueue priority hint: %d\n",
5430			   READ_ONCE(execlists->queue_priority_hint));
5431
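	/* Requests waiting in the priority-sorted submission queue. */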
5432	last = NULL;
5433	count = 0;
5434	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5435		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5436		int i;
5437
5438		priolist_for_each_request(rq, p, i) {
5439			if (count++ < max - 1)
5440				show_request(m, rq, "\t\tQ ");
5441			else
5442				last = rq;
5443		}
5444	}
5445	if (last) {
5446		if (count > max) {
5447			drm_printf(m,
5448				   "\t\t...skipping %d queued requests...\n",
5449				   count - max);
5450		}
5451		show_request(m, last, "\t\tQ ");
5452	}
5453
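	/* Requests held on the virtual engines, awaiting a physical sibling. */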
5454	last = NULL;
5455	count = 0;
5456	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5457		struct virtual_engine *ve =
5458			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5459		struct i915_request *rq = READ_ONCE(ve->request);
5460
5461		if (rq) {
5462			if (count++ < max - 1)
5463				show_request(m, rq, "\t\tV ");
5464			else
5465				last = rq;
5466		}
5467	}
5468	if (last) {
5469		if (count > max) {
5470			drm_printf(m,
5471				   "\t\t...skipping %d virtual requests...\n",
5472				   count - max);
5473		}
5474		show_request(m, last, "\t\tV ");
5475	}
5476
5477	spin_unlock_irqrestore(&engine->active.lock, flags);
5478}
5479
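/*
 * intel_lr_context_reset() readies a pinned context for reuse after a GPU
 * reset: optionally scrub the context image back to the default state and
 * rewind its ring registers to @head before it is resubmitted.
 */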
5480void intel_lr_context_reset(struct intel_engine_cs *engine,
5481			    struct intel_context *ce,
5482			    u32 head,
5483			    bool scrub)
5484{
5485	GEM_BUG_ON(!intel_context_is_pinned(ce));
5486
5487	/*
5488	 * We want a simple context + ring to execute the breadcrumb update.
5489	 * We cannot rely on the context being intact across the GPU hang,
5490	 * so clear it and rebuild just what we need for the breadcrumb.
5491	 * All pending requests for this context will be zapped, and any
5492	 * future request will be after userspace has had the opportunity
5493	 * to recreate its own state.
5494	 */
5495	if (scrub)
5496		restore_default_state(ce, engine);
5497
5498	/* Rerun the request; its payload has been neutered (if guilty). */
5499	__execlists_update_reg_state(ce, engine, head);
5500}
5501
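/*
 * An engine is in execlists submission mode iff its set_default_submission
 * vfunc is the execlists one.
 */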
5502bool
5503intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5504{
5505	return engine->set_default_submission ==
5506	       intel_execlists_set_default_submission;
5507}
5508
5509#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5510#include "selftest_lrc.c"
5511#endif
5512