1// SPDX-License-Identifier: MIT
2/*
 * Copyright © 2019 Intel Corporation
4 */
5
6#include "i915_drv.h"
7#include "i915_request.h"
8
9#include "intel_context.h"
10#include "intel_engine_heartbeat.h"
11#include "intel_engine_pm.h"
12#include "intel_engine.h"
13#include "intel_gt.h"
14#include "intel_reset.h"
15
16/*
17 * While the engine is active, we send a periodic pulse along the engine
18 * to check on its health and to flush any idle-barriers. If that request
19 * is stuck, and we fail to preempt it, we declare the engine hung and
20 * issue a reset -- in the hope that restores progress.
21 */
22
23static bool next_heartbeat(struct intel_engine_cs *engine)
24{
25	struct i915_request *rq;
26	long delay;
27
28	delay = READ_ONCE(engine->props.heartbeat_interval_ms);
29
30	rq = engine->heartbeat.systole;
31
32	/*
33	 * FIXME: The final period extension is disabled if the period has been
34	 * modified from the default. This is to prevent issues with certain
35	 * selftests which override the value and expect specific behaviour.
36	 * Once the selftests have been updated to either cope with variable
37	 * heartbeat periods (or to override the pre-emption timeout as well,
38	 * or just to add a selftest specific override of the extension), the
39	 * generic override can be removed.
40	 */
41	if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
42	    delay == engine->defaults.heartbeat_interval_ms) {
43		long longer;
44
45		/*
46		 * The final try is at the highest priority possible. Up until now
47		 * a pre-emption might not even have been attempted. So make sure
48		 * this last attempt allows enough time for a pre-emption to occur.
49		 */
50		longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
51		longer = intel_clamp_heartbeat_interval_ms(engine, longer);
52		if (longer > delay)
53			delay = longer;
54	}
55
56	if (!delay)
57		return false;
58
59	delay = msecs_to_jiffies_timeout(delay);
60	if (delay >= HZ)
61		delay = round_jiffies_up_relative(delay);
62	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
63
64	return true;
65}
66
67static struct i915_request *
68heartbeat_create(struct intel_context *ce, gfp_t gfp)
69{
70	struct i915_request *rq;
71
72	intel_context_enter(ce);
73	rq = __i915_request_create(ce, gfp);
74	intel_context_exit(ce);
75
76	return rq;
77}
78
79static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq)
80{
81	engine->wakeref_serial = READ_ONCE(engine->serial) + 1;
82	i915_request_add_active_barriers(rq);
83	if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine))
84		engine->heartbeat.systole = i915_request_get(rq);
85}
86
87static void heartbeat_commit(struct i915_request *rq,
88			     const struct i915_sched_attr *attr)
89{
90	idle_pulse(rq->engine, rq);
91
92	__i915_request_commit(rq);
93	__i915_request_queue(rq, attr);
94}
95
96static void show_heartbeat(const struct i915_request *rq,
97			   struct intel_engine_cs *engine)
98{
99	struct drm_printer p =
100		drm_dbg_printer(&engine->i915->drm, DRM_UT_DRIVER, "heartbeat");
101
102	if (!rq) {
103		intel_engine_dump(engine, &p,
104				  "%s heartbeat not ticking\n",
105				  engine->name);
106	} else {
107		intel_engine_dump(engine, &p,
108				  "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n",
109				  engine->name,
110				  rq->fence.context,
111				  rq->fence.seqno,
112				  rq->sched.attr.priority);
113	}
114}
115
116static void
117reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
118{
119	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
120		show_heartbeat(rq, engine);
121
122	if (intel_engine_uses_guc(engine))
123		/*
124		 * GuC itself is toast or GuC's hang detection
125		 * is disabled. Either way, need to find the
126		 * hang culprit manually.
127		 */
128		intel_guc_find_hung_context(engine);
129
130	intel_gt_handle_error(engine->gt, engine->mask,
131			      I915_ERROR_CAPTURE,
132			      "stopped heartbeat on %s",
133			      engine->name);
134}
135
/*
 * Delayed-work handler for engine->heartbeat.work.
 *
 * Each invocation does one of the following, then re-arms itself through
 * next_heartbeat() unless hangcheck is disabled or the interval is zero:
 *
 *  - if the tracked heartbeat request (systole) has completed, drop it;
 *  - if the engine is not awake, return immediately without re-arming;
 *  - if the GT is wedged, do nothing further;
 *  - if the scheduler engine is disabled, reset the engine;
 *  - if a heartbeat request is still outstanding, either wait (too early or
 *    not yet submitted), raise its priority one step, or reset the engine;
 *  - otherwise emit a new minimum-priority heartbeat request on the kernel
 *    context, provided the engine serial has moved on since the last pulse.
 */
static void heartbeat(struct work_struct *wrk)
{
	/* New heartbeats start at the lowest priority and are bumped later. */
	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
	struct intel_engine_cs *engine =
		container_of(wrk, typeof(*engine), heartbeat.work.work);
	struct intel_context *ce = engine->kernel_context;
	struct i915_request *rq;
	unsigned long serial;

	/* Just in case everything has gone horribly wrong, give it a kick */
	intel_engine_flush_submission(engine);

	/* Retire our reference to the previous heartbeat once it has completed. */
	rq = engine->heartbeat.systole;
	if (rq && i915_request_completed(rq)) {
		i915_request_put(rq);
		engine->heartbeat.systole = NULL;
	}

	/*
	 * Engine is not awake: nothing to monitor. Note this path returns
	 * without re-arming the worker and without touching any remaining
	 * systole reference.
	 */
	if (!intel_engine_pm_get_if_awake(engine))
		return;

	if (intel_gt_is_wedged(engine->gt))
		goto out;

	if (i915_sched_engine_disabled(engine->sched_engine)) {
		reset_engine(engine, engine->heartbeat.systole);
		goto out;
	}

	/*
	 * A heartbeat is still outstanding; rq was loaded from
	 * engine->heartbeat.systole above and was not completed at that point.
	 */
	if (engine->heartbeat.systole) {
		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);

		/* Safeguard against too-fast worker invocations */
		if (!time_after(jiffies,
				rq->emitted_jiffies + msecs_to_jiffies(delay)))
			goto out;

		if (!i915_sw_fence_signaled(&rq->submit)) {
			/*
			 * Not yet submitted, system is stalled.
			 *
			 * This more often happens for ring submission,
			 * where all contexts are funnelled into a common
			 * ringbuffer. If one context is blocked on an
			 * external fence, not only is it not submitted,
			 * but all other contexts, including the kernel
			 * context are stuck waiting for the signal.
			 */
		} else if (engine->sched_engine->schedule &&
			   rq->sched.attr.priority < I915_PRIORITY_BARRIER) {
			/*
			 * Gradually raise the priority of the heartbeat to
			 * give high priority work [which presumably desires
			 * low latency and no jitter] the chance to naturally
			 * complete before being preempted.
			 *
			 * The steps are NORMAL, then HEARTBEAT, then BARRIER:
			 * each check below moves to the next level only if
			 * the request has already reached the current one.
			 */
			attr.priority = I915_PRIORITY_NORMAL;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_HEARTBEAT;
			if (rq->sched.attr.priority >= attr.priority)
				attr.priority = I915_PRIORITY_BARRIER;

			/*
			 * NOTE(review): presumably the schedule callback
			 * expects bottom halves to be disabled -- confirm.
			 */
			local_bh_disable();
			engine->sched_engine->schedule(rq, &attr);
			local_bh_enable();
		} else {
			/*
			 * Submitted, and either already at barrier priority or
			 * there is no schedule callback to raise it with, yet
			 * still not completed: reset the engine.
			 */
			reset_engine(engine, rq);
		}

		/* Restart the per-step timer used by the safeguard above. */
		rq->emitted_jiffies = jiffies;
		goto out;
	}

	/*
	 * idle_pulse() stores the engine serial plus one in wakeref_serial when
	 * a pulse is emitted; an equal value here presumably means nothing new
	 * has been submitted since, so no fresh heartbeat is needed.
	 */
	serial = READ_ONCE(engine->serial);
	if (engine->wakeref_serial == serial)
		goto out;

	if (!mutex_trylock(&ce->timeline->mutex)) {
		/*
		 * Unable to lock the kernel timeline, is the engine stuck?
		 *
		 * Record the serial at which we were blocked; only if the
		 * previously recorded value was this same serial do we
		 * escalate to the error handler.
		 */
		if (xchg(&engine->heartbeat.blocked, serial) == serial)
			intel_gt_handle_error(engine->gt, engine->mask,
					      I915_ERROR_CAPTURE,
					      "no heartbeat on %s",
					      engine->name);
		goto out;
	}

	/* Non-blocking, non-warning allocation; on failure just skip this beat. */
	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
	if (IS_ERR(rq))
		goto unlock;

	heartbeat_commit(rq, &attr);

unlock:
	mutex_unlock(&ce->timeline->mutex);
out:
	/*
	 * If hangcheck is disabled, or the worker was not re-armed (zero
	 * interval), nobody will look at the systole again: drop our reference.
	 */
	if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
	intel_engine_pm_put(engine);
}
236
237void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
238{
239	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
240		return;
241
242	next_heartbeat(engine);
243}
244
245void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
246{
247	if (cancel_delayed_work(&engine->heartbeat.work))
248		i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
249}
250
251void intel_gt_unpark_heartbeats(struct intel_gt *gt)
252{
253	struct intel_engine_cs *engine;
254	enum intel_engine_id id;
255
256	for_each_engine(engine, gt, id)
257		if (intel_engine_pm_is_awake(engine))
258			intel_engine_unpark_heartbeat(engine);
259}
260
261void intel_gt_park_heartbeats(struct intel_gt *gt)
262{
263	struct intel_engine_cs *engine;
264	enum intel_engine_id id;
265
266	for_each_engine(engine, gt, id)
267		intel_engine_park_heartbeat(engine);
268}
269
/*
 * One-time setup of the heartbeat for @engine: bind the delayed work item to
 * the heartbeat() handler. The work is not queued here.
 */
void intel_engine_init_heartbeat(struct intel_engine_cs *engine)
{
	INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat);
}
274
275static int __intel_engine_pulse(struct intel_engine_cs *engine)
276{
277	struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER };
278	struct intel_context *ce = engine->kernel_context;
279	struct i915_request *rq;
280
281	lockdep_assert_held(&ce->timeline->mutex);
282	GEM_BUG_ON(!intel_engine_has_preemption(engine));
283	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
284
285	rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN);
286	if (IS_ERR(rq))
287		return PTR_ERR(rq);
288
289	__set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags);
290
291	heartbeat_commit(rq, &attr);
292	GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER);
293
294	/* Ensure the forced pulse gets a full period to execute */
295	next_heartbeat(engine);
296
297	return 0;
298}
299
300static unsigned long set_heartbeat(struct intel_engine_cs *engine,
301				   unsigned long delay)
302{
303	unsigned long old;
304
305	old = xchg(&engine->props.heartbeat_interval_ms, delay);
306	if (delay)
307		intel_engine_unpark_heartbeat(engine);
308	else
309		intel_engine_park_heartbeat(engine);
310
311	return old;
312}
313
314int intel_engine_set_heartbeat(struct intel_engine_cs *engine,
315			       unsigned long delay)
316{
317	struct intel_context *ce = engine->kernel_context;
318	int err = 0;
319
320	if (!delay && !intel_engine_has_preempt_reset(engine))
321		return -ENODEV;
322
323	/* FIXME: Remove together with equally marked hack in next_heartbeat. */
324	if (delay != engine->defaults.heartbeat_interval_ms &&
325	    delay < 2 * engine->props.preempt_timeout_ms) {
326		if (intel_engine_uses_guc(engine))
327			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
328				   engine->name);
329		else
330			drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
331				   engine->name);
332	}
333
334	intel_engine_pm_get(engine);
335
336	err = mutex_lock_interruptible(&ce->timeline->mutex);
337	if (err)
338		goto out_rpm;
339
340	if (delay != engine->props.heartbeat_interval_ms) {
341		unsigned long saved = set_heartbeat(engine, delay);
342
343		/* recheck current execution */
344		if (intel_engine_has_preemption(engine)) {
345			err = __intel_engine_pulse(engine);
346			if (err)
347				set_heartbeat(engine, saved);
348		}
349	}
350
351	mutex_unlock(&ce->timeline->mutex);
352
353out_rpm:
354	intel_engine_pm_put(engine);
355	return err;
356}
357
358int intel_engine_pulse(struct intel_engine_cs *engine)
359{
360	struct intel_context *ce = engine->kernel_context;
361	int err;
362
363	if (!intel_engine_has_preemption(engine))
364		return -ENODEV;
365
366	if (!intel_engine_pm_get_if_awake(engine))
367		return 0;
368
369	err = -EINTR;
370	if (!mutex_lock_interruptible(&ce->timeline->mutex)) {
371		err = __intel_engine_pulse(engine);
372		mutex_unlock(&ce->timeline->mutex);
373	}
374
375	intel_engine_flush_submission(engine);
376	intel_engine_pm_put(engine);
377	return err;
378}
379
380int intel_engine_flush_barriers(struct intel_engine_cs *engine)
381{
382	struct i915_sched_attr attr = { .priority = I915_PRIORITY_MIN };
383	struct intel_context *ce = engine->kernel_context;
384	struct i915_request *rq;
385	int err;
386
387	if (llist_empty(&engine->barrier_tasks))
388		return 0;
389
390	if (!intel_engine_pm_get_if_awake(engine))
391		return 0;
392
393	if (mutex_lock_interruptible(&ce->timeline->mutex)) {
394		err = -EINTR;
395		goto out_rpm;
396	}
397
398	rq = heartbeat_create(ce, GFP_KERNEL);
399	if (IS_ERR(rq)) {
400		err = PTR_ERR(rq);
401		goto out_unlock;
402	}
403
404	heartbeat_commit(rq, &attr);
405
406	err = 0;
407out_unlock:
408	mutex_unlock(&ce->timeline->mutex);
409out_rpm:
410	intel_engine_pm_put(engine);
411	return err;
412}
413
414#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
415#include "selftest_engine_heartbeat.c"
416#endif
417