1/*
 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25#include <linux/prime_numbers.h>
26#include <linux/pm_qos.h>
27#include <linux/sort.h>
28
29#include "gem/i915_gem_internal.h"
30#include "gem/i915_gem_pm.h"
31#include "gem/selftests/mock_context.h"
32
33#include "gt/intel_engine_heartbeat.h"
34#include "gt/intel_engine_pm.h"
35#include "gt/intel_engine_user.h"
36#include "gt/intel_gt.h"
37#include "gt/intel_gt_clock_utils.h"
38#include "gt/intel_gt_requests.h"
39#include "gt/selftest_engine_heartbeat.h"
40
41#include "i915_random.h"
42#include "i915_selftest.h"
43#include "igt_flush_test.h"
44#include "igt_live_test.h"
45#include "igt_spinner.h"
46#include "lib_sw_fence.h"
47
48#include "mock_drm.h"
49#include "mock_gem_device.h"
50
51static unsigned int num_uabi_engines(struct drm_i915_private *i915)
52{
53	struct intel_engine_cs *engine;
54	unsigned int count;
55
56	count = 0;
57	for_each_uabi_engine(engine, i915)
58		count++;
59
60	return count;
61}
62
63static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
64{
65	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
66}
67
68static int igt_add_request(void *arg)
69{
70	struct drm_i915_private *i915 = arg;
71	struct i915_request *request;
72
73	/* Basic preliminary test to create a request and let it loose! */
74
75	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
76	if (!request)
77		return -ENOMEM;
78
79	i915_request_add(request);
80
81	return 0;
82}
83
84static int igt_wait_request(void *arg)
85{
86	const long T = HZ / 4;
87	struct drm_i915_private *i915 = arg;
88	struct i915_request *request;
89	int err = -EINVAL;
90
91	/* Submit a request, then wait upon it */
92
93	request = mock_request(rcs0(i915)->kernel_context, T);
94	if (!request)
95		return -ENOMEM;
96
97	i915_request_get(request);
98
99	if (i915_request_wait(request, 0, 0) != -ETIME) {
100		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
101		goto out_request;
102	}
103
104	if (i915_request_wait(request, 0, T) != -ETIME) {
105		pr_err("request wait succeeded (expected timeout before submit!)\n");
106		goto out_request;
107	}
108
109	if (i915_request_completed(request)) {
110		pr_err("request completed before submit!!\n");
111		goto out_request;
112	}
113
114	i915_request_add(request);
115
116	if (i915_request_wait(request, 0, 0) != -ETIME) {
117		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
118		goto out_request;
119	}
120
121	if (i915_request_completed(request)) {
122		pr_err("request completed immediately!\n");
123		goto out_request;
124	}
125
126	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
127		pr_err("request wait succeeded (expected timeout!)\n");
128		goto out_request;
129	}
130
131	if (i915_request_wait(request, 0, T) == -ETIME) {
132		pr_err("request wait timed out!\n");
133		goto out_request;
134	}
135
136	if (!i915_request_completed(request)) {
137		pr_err("request not complete after waiting!\n");
138		goto out_request;
139	}
140
141	if (i915_request_wait(request, 0, T) == -ETIME) {
142		pr_err("request wait timed out when already complete!\n");
143		goto out_request;
144	}
145
146	err = 0;
147out_request:
148	i915_request_put(request);
149	mock_device_flush(i915);
150	return err;
151}
152
153static int igt_fence_wait(void *arg)
154{
155	const long T = HZ / 4;
156	struct drm_i915_private *i915 = arg;
157	struct i915_request *request;
158	int err = -EINVAL;
159
160	/* Submit a request, treat it as a fence and wait upon it */
161
162	request = mock_request(rcs0(i915)->kernel_context, T);
163	if (!request)
164		return -ENOMEM;
165
166	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
167		pr_err("fence wait success before submit (expected timeout)!\n");
168		goto out;
169	}
170
171	i915_request_add(request);
172
173	if (dma_fence_is_signaled(&request->fence)) {
174		pr_err("fence signaled immediately!\n");
175		goto out;
176	}
177
178	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
179		pr_err("fence wait success after submit (expected timeout)!\n");
180		goto out;
181	}
182
183	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
184		pr_err("fence wait timed out (expected success)!\n");
185		goto out;
186	}
187
188	if (!dma_fence_is_signaled(&request->fence)) {
189		pr_err("fence unsignaled after waiting!\n");
190		goto out;
191	}
192
193	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
194		pr_err("fence wait timed out when complete (expected success)!\n");
195		goto out;
196	}
197
198	err = 0;
199out:
200	mock_device_flush(i915);
201	return err;
202}
203
204static int igt_request_rewind(void *arg)
205{
206	struct drm_i915_private *i915 = arg;
207	struct i915_request *request, *vip;
208	struct i915_gem_context *ctx[2];
209	struct intel_context *ce;
210	int err = -EINVAL;
211
212	ctx[0] = mock_context(i915, "A");
213	if (!ctx[0]) {
214		err = -ENOMEM;
215		goto err_ctx_0;
216	}
217
218	ce = i915_gem_context_get_engine(ctx[0], RCS0);
219	GEM_BUG_ON(IS_ERR(ce));
220	request = mock_request(ce, 2 * HZ);
221	intel_context_put(ce);
222	if (!request) {
223		err = -ENOMEM;
224		goto err_context_0;
225	}
226
227	i915_request_get(request);
228	i915_request_add(request);
229
230	ctx[1] = mock_context(i915, "B");
231	if (!ctx[1]) {
232		err = -ENOMEM;
233		goto err_ctx_1;
234	}
235
236	ce = i915_gem_context_get_engine(ctx[1], RCS0);
237	GEM_BUG_ON(IS_ERR(ce));
238	vip = mock_request(ce, 0);
239	intel_context_put(ce);
240	if (!vip) {
241		err = -ENOMEM;
242		goto err_context_1;
243	}
244
245	/* Simulate preemption by manual reordering */
246	if (!mock_cancel_request(request)) {
247		pr_err("failed to cancel request (already executed)!\n");
248		i915_request_add(vip);
249		goto err_context_1;
250	}
251	i915_request_get(vip);
252	i915_request_add(vip);
253	rcu_read_lock();
254	request->engine->submit_request(request);
255	rcu_read_unlock();
256
258	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
259		pr_err("timed out waiting for high priority request\n");
260		goto err;
261	}
262
263	if (i915_request_completed(request)) {
264		pr_err("low priority request already completed\n");
265		goto err;
266	}
267
268	err = 0;
269err:
270	i915_request_put(vip);
271err_context_1:
272	mock_context_close(ctx[1]);
273err_ctx_1:
274	i915_request_put(request);
275err_context_0:
276	mock_context_close(ctx[0]);
277err_ctx_0:
278	mock_device_flush(i915);
279	return err;
280}
281
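/*
 * Bookkeeping shared by the breadcrumbs smoketests below: each worker picks
 * contexts from the contexts[] array at random, allocates up to max_batch
 * requests per iteration via request_alloc on the given engine, and folds its
 * totals into num_waits / num_fences for the final report.
 */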
282struct smoketest {
283	struct intel_engine_cs *engine;
284	struct i915_gem_context **contexts;
285	atomic_long_t num_waits, num_fences;
286	int ncontexts, max_batch;
287	struct i915_request *(*request_alloc)(struct intel_context *ce);
288};
289
290static struct i915_request *
291__mock_request_alloc(struct intel_context *ce)
292{
293	return mock_request(ce, 0);
294}
295
296static struct i915_request *
297__live_request_alloc(struct intel_context *ce)
298{
299	return intel_context_create_request(ce);
300}
301
302struct smoke_thread {
303	struct kthread_worker *worker;
304	struct kthread_work work;
305	struct smoketest *t;
306	bool stop;
307	int result;
308};
309
310static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
311{
312	struct smoke_thread *thread = container_of(work, typeof(*thread), work);
313	struct smoketest *t = thread->t;
314	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
315	const unsigned int total = 4 * t->ncontexts + 1;
316	unsigned int num_waits = 0, num_fences = 0;
317	struct i915_request **requests;
318	I915_RND_STATE(prng);
319	unsigned int *order;
320	int err = 0;
321
322	/*
323	 * A very simple test to catch the most egregious of list handling bugs.
324	 *
325	 * At its heart, we simply create oodles of requests running across
326	 * multiple kthreads and enable signaling on them, for the sole purpose
327	 * of stressing our breadcrumb handling. The only inspection we do is
328	 * that the fences were marked as signaled.
329	 */
330
331	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
332	if (!requests) {
333		thread->result = -ENOMEM;
334		return;
335	}
336
337	order = i915_random_order(total, &prng);
338	if (!order) {
339		err = -ENOMEM;
340		goto out_requests;
341	}
342
343	while (!READ_ONCE(thread->stop)) {
344		struct i915_sw_fence *submit, *wait;
345		unsigned int n, count;
346
347		submit = heap_fence_create(GFP_KERNEL);
348		if (!submit) {
349			err = -ENOMEM;
350			break;
351		}
352
353		wait = heap_fence_create(GFP_KERNEL);
354		if (!wait) {
355			i915_sw_fence_commit(submit);
356			heap_fence_put(submit);
357			err = -ENOMEM;
358			break;
359		}
360
361		i915_random_reorder(order, total, &prng);
362		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
363
364		for (n = 0; n < count; n++) {
365			struct i915_gem_context *ctx =
366				t->contexts[order[n] % t->ncontexts];
367			struct i915_request *rq;
368			struct intel_context *ce;
369
370			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
371			GEM_BUG_ON(IS_ERR(ce));
372			rq = t->request_alloc(ce);
373			intel_context_put(ce);
374			if (IS_ERR(rq)) {
375				err = PTR_ERR(rq);
376				count = n;
377				break;
378			}
379
380			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
381							       submit,
382							       GFP_KERNEL);
383
384			requests[n] = i915_request_get(rq);
385			i915_request_add(rq);
386
387			if (err >= 0)
388				err = i915_sw_fence_await_dma_fence(wait,
389								    &rq->fence,
390								    0,
391								    GFP_KERNEL);
392
393			if (err < 0) {
394				i915_request_put(rq);
395				count = n;
396				break;
397			}
398		}
399
400		i915_sw_fence_commit(submit);
401		i915_sw_fence_commit(wait);
402
403		if (!wait_event_timeout(wait->wait,
404					i915_sw_fence_done(wait),
405					5 * HZ)) {
406			struct i915_request *rq = requests[count - 1];
407
408			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
409			       atomic_read(&wait->pending), count,
410			       rq->fence.context, rq->fence.seqno,
411			       t->engine->name);
412			GEM_TRACE_DUMP();
413
414			intel_gt_set_wedged(t->engine->gt);
415			GEM_BUG_ON(!i915_request_completed(rq));
416			i915_sw_fence_wait(wait);
417			err = -EIO;
418		}
419
420		for (n = 0; n < count; n++) {
421			struct i915_request *rq = requests[n];
422
423			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
424				      &rq->fence.flags)) {
425				pr_err("%llu:%llu was not signaled!\n",
426				       rq->fence.context, rq->fence.seqno);
427				err = -EINVAL;
428			}
429
430			i915_request_put(rq);
431		}
432
433		heap_fence_put(wait);
434		heap_fence_put(submit);
435
436		if (err < 0)
437			break;
438
439		num_fences += count;
440		num_waits++;
441
442		cond_resched();
443	}
444
445	atomic_long_add(num_fences, &t->num_fences);
446	atomic_long_add(num_waits, &t->num_waits);
447
448	kfree(order);
449out_requests:
450	kfree(requests);
451	thread->result = err;
452}
453
454static int mock_breadcrumbs_smoketest(void *arg)
455{
456	struct drm_i915_private *i915 = arg;
457	struct smoketest t = {
458		.engine = rcs0(i915),
459		.ncontexts = 1024,
460		.max_batch = 1024,
461		.request_alloc = __mock_request_alloc
462	};
463	unsigned int ncpus = num_online_cpus();
464	struct smoke_thread *threads;
465	unsigned int n;
466	int ret = 0;
467
468	/*
469	 * Smoketest our breadcrumb/signal handling for requests across multiple
470	 * threads. A very simple test to only catch the most egregious of bugs.
471	 * See __igt_breadcrumbs_smoketest();
472	 */
473
474	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
475	if (!threads)
476		return -ENOMEM;
477
478	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
479	if (!t.contexts) {
480		ret = -ENOMEM;
481		goto out_threads;
482	}
483
484	for (n = 0; n < t.ncontexts; n++) {
485		t.contexts[n] = mock_context(t.engine->i915, "mock");
486		if (!t.contexts[n]) {
487			ret = -ENOMEM;
488			goto out_contexts;
489		}
490	}
491
492	for (n = 0; n < ncpus; n++) {
493		struct kthread_worker *worker;
494
495		worker = kthread_create_worker(0, "igt/%d", n);
496		if (IS_ERR(worker)) {
497			ret = PTR_ERR(worker);
498			ncpus = n;
499			break;
500		}
501
502		threads[n].worker = worker;
503		threads[n].t = &t;
504		threads[n].stop = false;
505		threads[n].result = 0;
506
507		kthread_init_work(&threads[n].work,
508				  __igt_breadcrumbs_smoketest);
509		kthread_queue_work(worker, &threads[n].work);
510	}
511
512	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
513
514	for (n = 0; n < ncpus; n++) {
515		int err;
516
517		WRITE_ONCE(threads[n].stop, true);
518		kthread_flush_work(&threads[n].work);
519		err = READ_ONCE(threads[n].result);
520		if (err < 0 && !ret)
521			ret = err;
522
523		kthread_destroy_worker(threads[n].worker);
524	}
	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
526		atomic_long_read(&t.num_waits),
527		atomic_long_read(&t.num_fences),
528		ncpus);
529
530out_contexts:
531	for (n = 0; n < t.ncontexts; n++) {
532		if (!t.contexts[n])
533			break;
534		mock_context_close(t.contexts[n]);
535	}
536	kfree(t.contexts);
537out_threads:
538	kfree(threads);
539	return ret;
540}
541
542int i915_request_mock_selftests(void)
543{
544	static const struct i915_subtest tests[] = {
545		SUBTEST(igt_add_request),
546		SUBTEST(igt_wait_request),
547		SUBTEST(igt_fence_wait),
548		SUBTEST(igt_request_rewind),
549		SUBTEST(mock_breadcrumbs_smoketest),
550	};
551	struct drm_i915_private *i915;
552	intel_wakeref_t wakeref;
553	int err = 0;
554
555	i915 = mock_gem_device();
556	if (!i915)
557		return -ENOMEM;
558
559	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
560		err = i915_subtests(tests, i915);
561
562	mock_destroy_device(i915);
563
564	return err;
565}
566
567static int live_nop_request(void *arg)
568{
569	struct drm_i915_private *i915 = arg;
570	struct intel_engine_cs *engine;
571	struct igt_live_test t;
572	int err = -ENODEV;
573
574	/*
575	 * Submit various sized batches of empty requests, to each engine
576	 * (individually), and wait for the batch to complete. We can check
577	 * the overhead of submitting requests to the hardware.
578	 */
579
580	for_each_uabi_engine(engine, i915) {
581		unsigned long n, prime;
582		IGT_TIMEOUT(end_time);
583		ktime_t times[2] = {};
584
585		err = igt_live_test_begin(&t, i915, __func__, engine->name);
586		if (err)
587			return err;
588
589		intel_engine_pm_get(engine);
590		for_each_prime_number_from(prime, 1, 8192) {
591			struct i915_request *request = NULL;
592
593			times[1] = ktime_get_raw();
594
595			for (n = 0; n < prime; n++) {
596				i915_request_put(request);
597				request = i915_request_create(engine->kernel_context);
598				if (IS_ERR(request))
599					return PTR_ERR(request);
600
601				/*
602				 * This space is left intentionally blank.
603				 *
604				 * We do not actually want to perform any
605				 * action with this request, we just want
606				 * to measure the latency in allocation
607				 * and submission of our breadcrumbs -
608				 * ensuring that the bare request is sufficient
609				 * for the system to work (i.e. proper HEAD
610				 * tracking of the rings, interrupt handling,
611				 * etc). It also gives us the lowest bounds
612				 * for latency.
613				 */
614
615				i915_request_get(request);
616				i915_request_add(request);
617			}
618			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
619			i915_request_put(request);
620
621			times[1] = ktime_sub(ktime_get_raw(), times[1]);
622			if (prime == 1)
623				times[0] = times[1];
624
625			if (__igt_timeout(end_time, NULL))
626				break;
627		}
628		intel_engine_pm_put(engine);
629
630		err = igt_live_test_end(&t);
631		if (err)
632			return err;
633
634		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
635			engine->name,
636			ktime_to_ns(times[0]),
637			prime, div64_u64(ktime_to_ns(times[1]), prime));
638	}
639
640	return err;
641}
642
643static int __cancel_inactive(struct intel_engine_cs *engine)
644{
645	struct intel_context *ce;
646	struct igt_spinner spin;
647	struct i915_request *rq;
648	int err = 0;
649
650	if (igt_spinner_init(&spin, engine->gt))
651		return -ENOMEM;
652
653	ce = intel_context_create(engine);
654	if (IS_ERR(ce)) {
655		err = PTR_ERR(ce);
656		goto out_spin;
657	}
658
659	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
660	if (IS_ERR(rq)) {
661		err = PTR_ERR(rq);
662		goto out_ce;
663	}
664
665	pr_debug("%s: Cancelling inactive request\n", engine->name);
666	i915_request_cancel(rq, -EINTR);
667	i915_request_get(rq);
668	i915_request_add(rq);
669
670	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
671		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
672
673		pr_err("%s: Failed to cancel inactive request\n", engine->name);
674		intel_engine_dump(engine, &p, "%s\n", engine->name);
675		err = -ETIME;
676		goto out_rq;
677	}
678
679	if (rq->fence.error != -EINTR) {
680		pr_err("%s: fence not cancelled (%u)\n",
681		       engine->name, rq->fence.error);
682		err = -EINVAL;
683	}
684
685out_rq:
686	i915_request_put(rq);
687out_ce:
688	intel_context_put(ce);
689out_spin:
690	igt_spinner_fini(&spin);
691	if (err)
692		pr_err("%s: %s error %d\n", __func__, engine->name, err);
693	return err;
694}
695
696static int __cancel_active(struct intel_engine_cs *engine)
697{
698	struct intel_context *ce;
699	struct igt_spinner spin;
700	struct i915_request *rq;
701	int err = 0;
702
703	if (igt_spinner_init(&spin, engine->gt))
704		return -ENOMEM;
705
706	ce = intel_context_create(engine);
707	if (IS_ERR(ce)) {
708		err = PTR_ERR(ce);
709		goto out_spin;
710	}
711
712	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
713	if (IS_ERR(rq)) {
714		err = PTR_ERR(rq);
715		goto out_ce;
716	}
717
718	pr_debug("%s: Cancelling active request\n", engine->name);
719	i915_request_get(rq);
720	i915_request_add(rq);
721	if (!igt_wait_for_spinner(&spin, rq)) {
722		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
723
724		pr_err("Failed to start spinner on %s\n", engine->name);
725		intel_engine_dump(engine, &p, "%s\n", engine->name);
726		err = -ETIME;
727		goto out_rq;
728	}
729	i915_request_cancel(rq, -EINTR);
730
731	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
732		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
733
734		pr_err("%s: Failed to cancel active request\n", engine->name);
735		intel_engine_dump(engine, &p, "%s\n", engine->name);
736		err = -ETIME;
737		goto out_rq;
738	}
739
740	if (rq->fence.error != -EINTR) {
741		pr_err("%s: fence not cancelled (%u)\n",
742		       engine->name, rq->fence.error);
743		err = -EINVAL;
744	}
745
746out_rq:
747	i915_request_put(rq);
748out_ce:
749	intel_context_put(ce);
750out_spin:
751	igt_spinner_fini(&spin);
752	if (err)
753		pr_err("%s: %s error %d\n", __func__, engine->name, err);
754	return err;
755}
756
757static int __cancel_completed(struct intel_engine_cs *engine)
758{
759	struct intel_context *ce;
760	struct igt_spinner spin;
761	struct i915_request *rq;
762	int err = 0;
763
764	if (igt_spinner_init(&spin, engine->gt))
765		return -ENOMEM;
766
767	ce = intel_context_create(engine);
768	if (IS_ERR(ce)) {
769		err = PTR_ERR(ce);
770		goto out_spin;
771	}
772
773	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
774	if (IS_ERR(rq)) {
775		err = PTR_ERR(rq);
776		goto out_ce;
777	}
778	igt_spinner_end(&spin);
779	i915_request_get(rq);
780	i915_request_add(rq);
781
782	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
783		err = -ETIME;
784		goto out_rq;
785	}
786
787	pr_debug("%s: Cancelling completed request\n", engine->name);
788	i915_request_cancel(rq, -EINTR);
789	if (rq->fence.error) {
		pr_err("%s: completed request unexpectedly errored (%d)\n",
		       engine->name, rq->fence.error);
792		err = -EINVAL;
793	}
794
795out_rq:
796	i915_request_put(rq);
797out_ce:
798	intel_context_put(ce);
799out_spin:
800	igt_spinner_fini(&spin);
801	if (err)
802		pr_err("%s: %s error %d\n", __func__, engine->name, err);
803	return err;
804}
805
/*
 * Test to prove a non-preemptible request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * Testing methodology is to create a non-preemptible request and submit it,
 * wait for spinner to start, create a NOP request and submit it, cancel the
 * spinner, wait for the spinner to complete and verify it failed with an
 * error, and finally wait for the NOP request to complete and verify it
 * succeeded without an error. The preemption timeout is also reduced /
 * restored so that the test runs in a timely manner.
 */
817static int __cancel_reset(struct drm_i915_private *i915,
818			  struct intel_engine_cs *engine)
819{
820	struct intel_context *ce;
821	struct igt_spinner spin;
822	struct i915_request *rq, *nop;
823	unsigned long preempt_timeout_ms;
824	int err = 0;
825
826	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
827	    !intel_has_reset_engine(engine->gt))
828		return 0;
829
830	preempt_timeout_ms = engine->props.preempt_timeout_ms;
831	engine->props.preempt_timeout_ms = 100;
832
	if (igt_spinner_init(&spin, engine->gt)) {
		err = -ENOMEM;
		goto out_restore;
	}
835
836	ce = intel_context_create(engine);
837	if (IS_ERR(ce)) {
838		err = PTR_ERR(ce);
839		goto out_spin;
840	}
841
842	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
843	if (IS_ERR(rq)) {
844		err = PTR_ERR(rq);
845		goto out_ce;
846	}
847
	pr_debug("%s: Cancelling active non-preemptible request\n",
849		 engine->name);
850	i915_request_get(rq);
851	i915_request_add(rq);
852	if (!igt_wait_for_spinner(&spin, rq)) {
853		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
854
855		pr_err("Failed to start spinner on %s\n", engine->name);
856		intel_engine_dump(engine, &p, "%s\n", engine->name);
857		err = -ETIME;
858		goto out_rq;
859	}
860
	nop = intel_context_create_request(ce);
	if (IS_ERR(nop)) {
		err = PTR_ERR(nop);
		goto out_rq;
	}
864	i915_request_get(nop);
865	i915_request_add(nop);
866
867	i915_request_cancel(rq, -EINTR);
868
869	if (i915_request_wait(rq, 0, HZ) < 0) {
870		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
871
872		pr_err("%s: Failed to cancel hung request\n", engine->name);
873		intel_engine_dump(engine, &p, "%s\n", engine->name);
874		err = -ETIME;
875		goto out_nop;
876	}
877
878	if (rq->fence.error != -EINTR) {
879		pr_err("%s: fence not cancelled (%u)\n",
880		       engine->name, rq->fence.error);
881		err = -EINVAL;
882		goto out_nop;
883	}
884
885	if (i915_request_wait(nop, 0, HZ) < 0) {
886		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
887
888		pr_err("%s: Failed to complete nop request\n", engine->name);
889		intel_engine_dump(engine, &p, "%s\n", engine->name);
890		err = -ETIME;
891		goto out_nop;
892	}
893
894	if (nop->fence.error != 0) {
895		pr_err("%s: Nop request errored (%u)\n",
896		       engine->name, nop->fence.error);
897		err = -EINVAL;
898	}
899
900out_nop:
901	i915_request_put(nop);
902out_rq:
903	i915_request_put(rq);
904out_ce:
905	intel_context_put(ce);
906out_spin:
907	igt_spinner_fini(&spin);
908out_restore:
909	engine->props.preempt_timeout_ms = preempt_timeout_ms;
910	if (err)
911		pr_err("%s: %s error %d\n", __func__, engine->name, err);
912	return err;
913}
914
915static int live_cancel_request(void *arg)
916{
917	struct drm_i915_private *i915 = arg;
918	struct intel_engine_cs *engine;
919
920	/*
921	 * Check cancellation of requests. We expect to be able to immediately
922	 * cancel active requests, even if they are currently on the GPU.
923	 */
924
925	for_each_uabi_engine(engine, i915) {
926		struct igt_live_test t;
927		int err, err2;
928
929		if (!intel_engine_has_preemption(engine))
930			continue;
931
932		err = igt_live_test_begin(&t, i915, __func__, engine->name);
933		if (err)
934			return err;
935
936		err = __cancel_inactive(engine);
937		if (err == 0)
938			err = __cancel_active(engine);
939		if (err == 0)
940			err = __cancel_completed(engine);
941
942		err2 = igt_live_test_end(&t);
943		if (err)
944			return err;
945		if (err2)
946			return err2;
947
948		/* Expects reset so call outside of igt_live_test_* */
949		err = __cancel_reset(i915, engine);
950		if (err)
951			return err;
952
953		if (igt_flush_test(i915))
954			return -EIO;
955	}
956
957	return 0;
958}
959
960static struct i915_vma *empty_batch(struct intel_gt *gt)
961{
962	struct drm_i915_gem_object *obj;
963	struct i915_vma *vma;
964	u32 *cmd;
965	int err;
966
967	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
968	if (IS_ERR(obj))
969		return ERR_CAST(obj);
970
971	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
972	if (IS_ERR(cmd)) {
973		err = PTR_ERR(cmd);
974		goto err;
975	}
976
977	*cmd = MI_BATCH_BUFFER_END;
978
979	__i915_gem_object_flush_map(obj, 0, 64);
980	i915_gem_object_unpin_map(obj);
981
982	intel_gt_chipset_flush(gt);
983
984	vma = i915_vma_instance(obj, gt->vm, NULL);
985	if (IS_ERR(vma)) {
986		err = PTR_ERR(vma);
987		goto err;
988	}
989
990	err = i915_vma_pin(vma, 0, 0, PIN_USER);
991	if (err)
992		goto err;
993
994	/* Force the wait now to avoid including it in the benchmark */
995	err = i915_vma_sync(vma);
996	if (err)
997		goto err_pin;
998
999	return vma;
1000
1001err_pin:
1002	i915_vma_unpin(vma);
1003err:
1004	i915_gem_object_put(obj);
1005	return ERR_PTR(err);
1006}
1007
1008static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch)
1009{
1010	return rq->engine->emit_bb_start(rq,
1011					 i915_vma_offset(batch),
1012					 i915_vma_size(batch),
1013					 0);
1014}
1015
1016static struct i915_request *
1017empty_request(struct intel_engine_cs *engine,
1018	      struct i915_vma *batch)
1019{
1020	struct i915_request *request;
1021	int err;
1022
1023	request = i915_request_create(engine->kernel_context);
1024	if (IS_ERR(request))
1025		return request;
1026
1027	err = emit_bb_start(request, batch);
1028	if (err)
1029		goto out_request;
1030
1031	i915_request_get(request);
1032out_request:
1033	i915_request_add(request);
1034	return err ? ERR_PTR(err) : request;
1035}
1036
1037static int live_empty_request(void *arg)
1038{
1039	struct drm_i915_private *i915 = arg;
1040	struct intel_engine_cs *engine;
1041	struct igt_live_test t;
1042	int err;
1043
1044	/*
1045	 * Submit various sized batches of empty requests, to each engine
1046	 * (individually), and wait for the batch to complete. We can check
1047	 * the overhead of submitting requests to the hardware.
1048	 */
1049
1050	for_each_uabi_engine(engine, i915) {
1051		IGT_TIMEOUT(end_time);
1052		struct i915_request *request;
1053		struct i915_vma *batch;
1054		unsigned long n, prime;
1055		ktime_t times[2] = {};
1056
1057		batch = empty_batch(engine->gt);
1058		if (IS_ERR(batch))
1059			return PTR_ERR(batch);
1060
1061		err = igt_live_test_begin(&t, i915, __func__, engine->name);
1062		if (err)
1063			goto out_batch;
1064
1065		intel_engine_pm_get(engine);
1066
1067		/* Warmup / preload */
1068		request = empty_request(engine, batch);
1069		if (IS_ERR(request)) {
1070			err = PTR_ERR(request);
1071			intel_engine_pm_put(engine);
1072			goto out_batch;
1073		}
1074		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1075
1076		for_each_prime_number_from(prime, 1, 8192) {
1077			times[1] = ktime_get_raw();
1078
1079			for (n = 0; n < prime; n++) {
1080				i915_request_put(request);
1081				request = empty_request(engine, batch);
1082				if (IS_ERR(request)) {
1083					err = PTR_ERR(request);
1084					intel_engine_pm_put(engine);
1085					goto out_batch;
1086				}
1087			}
1088			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1089
1090			times[1] = ktime_sub(ktime_get_raw(), times[1]);
1091			if (prime == 1)
1092				times[0] = times[1];
1093
1094			if (__igt_timeout(end_time, NULL))
1095				break;
1096		}
1097		i915_request_put(request);
1098		intel_engine_pm_put(engine);
1099
1100		err = igt_live_test_end(&t);
1101		if (err)
1102			goto out_batch;
1103
1104		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1105			engine->name,
1106			ktime_to_ns(times[0]),
1107			prime, div64_u64(ktime_to_ns(times[1]), prime));
1108out_batch:
1109		i915_vma_unpin(batch);
1110		i915_vma_put(batch);
1111		if (err)
1112			break;
1113	}
1114
1115	return err;
1116}
1117
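/*
 * A self-referencing batch: its first instruction jumps back to the start of
 * the buffer, so the request spins on the GPU until recursive_batch_resolve()
 * (or the error paths in the callers) overwrites that instruction with
 * MI_BATCH_BUFFER_END.
 */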
1118static struct i915_vma *recursive_batch(struct intel_gt *gt)
1119{
1120	struct drm_i915_gem_object *obj;
1121	const int ver = GRAPHICS_VER(gt->i915);
1122	struct i915_vma *vma;
1123	u32 *cmd;
1124	int err;
1125
1126	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
1127	if (IS_ERR(obj))
1128		return ERR_CAST(obj);
1129
1130	vma = i915_vma_instance(obj, gt->vm, NULL);
1131	if (IS_ERR(vma)) {
1132		err = PTR_ERR(vma);
1133		goto err;
1134	}
1135
1136	err = i915_vma_pin(vma, 0, 0, PIN_USER);
1137	if (err)
1138		goto err;
1139
1140	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1141	if (IS_ERR(cmd)) {
1142		err = PTR_ERR(cmd);
1143		goto err;
1144	}
1145
1146	if (ver >= 8) {
1147		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1148		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1149		*cmd++ = upper_32_bits(i915_vma_offset(vma));
1150	} else if (ver >= 6) {
1151		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1152		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1153	} else {
1154		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1155		*cmd++ = lower_32_bits(i915_vma_offset(vma));
1156	}
1157	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1158
1159	__i915_gem_object_flush_map(obj, 0, 64);
1160	i915_gem_object_unpin_map(obj);
1161
1162	intel_gt_chipset_flush(gt);
1163
1164	return vma;
1165
1166err:
1167	i915_gem_object_put(obj);
1168	return ERR_PTR(err);
1169}
1170
1171static int recursive_batch_resolve(struct i915_vma *batch)
1172{
1173	u32 *cmd;
1174
1175	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1176	if (IS_ERR(cmd))
1177		return PTR_ERR(cmd);
1178
1179	*cmd = MI_BATCH_BUFFER_END;
1180
1181	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1182	i915_gem_object_unpin_map(batch->obj);
1183
1184	intel_gt_chipset_flush(batch->vm->gt);
1185
1186	return 0;
1187}
1188
1189static int live_all_engines(void *arg)
1190{
1191	struct drm_i915_private *i915 = arg;
1192	const unsigned int nengines = num_uabi_engines(i915);
1193	struct intel_engine_cs *engine;
1194	struct i915_request **request;
1195	struct igt_live_test t;
1196	unsigned int idx;
1197	int err;
1198
1199	/*
1200	 * Check we can submit requests to all engines simultaneously. We
1201	 * send a recursive batch to each engine - checking that we don't
1202	 * block doing so, and that they don't complete too soon.
1203	 */
1204
1205	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1206	if (!request)
1207		return -ENOMEM;
1208
1209	err = igt_live_test_begin(&t, i915, __func__, "");
1210	if (err)
1211		goto out_free;
1212
1213	idx = 0;
1214	for_each_uabi_engine(engine, i915) {
1215		struct i915_vma *batch;
1216
1217		batch = recursive_batch(engine->gt);
1218		if (IS_ERR(batch)) {
1219			err = PTR_ERR(batch);
1220			pr_err("%s: Unable to create batch, err=%d\n",
1221			       __func__, err);
1222			goto out_free;
1223		}
1224
1225		i915_vma_lock(batch);
1226		request[idx] = intel_engine_create_kernel_request(engine);
1227		if (IS_ERR(request[idx])) {
1228			err = PTR_ERR(request[idx]);
1229			pr_err("%s: Request allocation failed with err=%d\n",
1230			       __func__, err);
1231			goto out_unlock;
1232		}
1233		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1234
1235		err = i915_vma_move_to_active(batch, request[idx], 0);
1236		GEM_BUG_ON(err);
1237
1238		err = emit_bb_start(request[idx], batch);
1239		GEM_BUG_ON(err);
1240		request[idx]->batch = batch;
1241
1242		i915_request_get(request[idx]);
1243		i915_request_add(request[idx]);
1244		idx++;
1245out_unlock:
1246		i915_vma_unlock(batch);
1247		if (err)
1248			goto out_request;
1249	}
1250
1251	idx = 0;
1252	for_each_uabi_engine(engine, i915) {
1253		if (i915_request_completed(request[idx])) {
1254			pr_err("%s(%s): request completed too early!\n",
1255			       __func__, engine->name);
1256			err = -EINVAL;
1257			goto out_request;
1258		}
1259		idx++;
1260	}
1261
1262	idx = 0;
1263	for_each_uabi_engine(engine, i915) {
1264		err = recursive_batch_resolve(request[idx]->batch);
1265		if (err) {
1266			pr_err("%s: failed to resolve batch, err=%d\n",
1267			       __func__, err);
1268			goto out_request;
1269		}
1270		idx++;
1271	}
1272
1273	idx = 0;
1274	for_each_uabi_engine(engine, i915) {
1275		struct i915_request *rq = request[idx];
1276		long timeout;
1277
1278		timeout = i915_request_wait(rq, 0,
1279					    MAX_SCHEDULE_TIMEOUT);
1280		if (timeout < 0) {
1281			err = timeout;
1282			pr_err("%s: error waiting for request on %s, err=%d\n",
1283			       __func__, engine->name, err);
1284			goto out_request;
1285		}
1286
1287		GEM_BUG_ON(!i915_request_completed(rq));
1288		i915_vma_unpin(rq->batch);
1289		i915_vma_put(rq->batch);
1290		i915_request_put(rq);
1291		request[idx] = NULL;
1292		idx++;
1293	}
1294
1295	err = igt_live_test_end(&t);
1296
1297out_request:
1298	idx = 0;
1299	for_each_uabi_engine(engine, i915) {
1300		struct i915_request *rq = request[idx];
1301
1302		if (!rq)
1303			continue;
1304
1305		if (rq->batch) {
1306			i915_vma_unpin(rq->batch);
1307			i915_vma_put(rq->batch);
1308		}
1309		i915_request_put(rq);
1310		idx++;
1311	}
1312out_free:
1313	kfree(request);
1314	return err;
1315}
1316
1317static int live_sequential_engines(void *arg)
1318{
1319	struct drm_i915_private *i915 = arg;
1320	const unsigned int nengines = num_uabi_engines(i915);
1321	struct i915_request **request;
1322	struct i915_request *prev = NULL;
1323	struct intel_engine_cs *engine;
1324	struct igt_live_test t;
1325	unsigned int idx;
1326	int err;
1327
1328	/*
1329	 * Check we can submit requests to all engines sequentially, such
1330	 * that each successive request waits for the earlier ones. This
1331	 * tests that we don't execute requests out of order, even though
1332	 * they are running on independent engines.
1333	 */
1334
1335	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1336	if (!request)
1337		return -ENOMEM;
1338
1339	err = igt_live_test_begin(&t, i915, __func__, "");
1340	if (err)
1341		goto out_free;
1342
1343	idx = 0;
1344	for_each_uabi_engine(engine, i915) {
1345		struct i915_vma *batch;
1346
1347		batch = recursive_batch(engine->gt);
1348		if (IS_ERR(batch)) {
1349			err = PTR_ERR(batch);
1350			pr_err("%s: Unable to create batch for %s, err=%d\n",
1351			       __func__, engine->name, err);
1352			goto out_free;
1353		}
1354
1355		i915_vma_lock(batch);
1356		request[idx] = intel_engine_create_kernel_request(engine);
1357		if (IS_ERR(request[idx])) {
1358			err = PTR_ERR(request[idx]);
1359			pr_err("%s: Request allocation failed for %s with err=%d\n",
1360			       __func__, engine->name, err);
1361			goto out_unlock;
1362		}
1363		GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1364
1365		if (prev) {
1366			err = i915_request_await_dma_fence(request[idx],
1367							   &prev->fence);
1368			if (err) {
1369				i915_request_add(request[idx]);
1370				pr_err("%s: Request await failed for %s with err=%d\n",
1371				       __func__, engine->name, err);
1372				goto out_unlock;
1373			}
1374		}
1375
1376		err = i915_vma_move_to_active(batch, request[idx], 0);
1377		GEM_BUG_ON(err);
1378
1379		err = emit_bb_start(request[idx], batch);
1380		GEM_BUG_ON(err);
1381		request[idx]->batch = batch;
1382
1383		i915_request_get(request[idx]);
1384		i915_request_add(request[idx]);
1385
1386		prev = request[idx];
1387		idx++;
1388
1389out_unlock:
1390		i915_vma_unlock(batch);
1391		if (err)
1392			goto out_request;
1393	}
1394
1395	idx = 0;
1396	for_each_uabi_engine(engine, i915) {
1397		long timeout;
1398
1399		if (i915_request_completed(request[idx])) {
1400			pr_err("%s(%s): request completed too early!\n",
1401			       __func__, engine->name);
1402			err = -EINVAL;
1403			goto out_request;
1404		}
1405
1406		err = recursive_batch_resolve(request[idx]->batch);
1407		if (err) {
1408			pr_err("%s: failed to resolve batch, err=%d\n",
1409			       __func__, err);
1410			goto out_request;
1411		}
1412
1413		timeout = i915_request_wait(request[idx], 0,
1414					    MAX_SCHEDULE_TIMEOUT);
1415		if (timeout < 0) {
1416			err = timeout;
1417			pr_err("%s: error waiting for request on %s, err=%d\n",
1418			       __func__, engine->name, err);
1419			goto out_request;
1420		}
1421
1422		GEM_BUG_ON(!i915_request_completed(request[idx]));
1423		idx++;
1424	}
1425
1426	err = igt_live_test_end(&t);
1427
1428out_request:
1429	idx = 0;
1430	for_each_uabi_engine(engine, i915) {
1431		u32 *cmd;
1432
1433		if (!request[idx])
1434			break;
1435
1436		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1437						       I915_MAP_WC);
1438		if (!IS_ERR(cmd)) {
1439			*cmd = MI_BATCH_BUFFER_END;
1440
1441			__i915_gem_object_flush_map(request[idx]->batch->obj,
1442						    0, sizeof(*cmd));
1443			i915_gem_object_unpin_map(request[idx]->batch->obj);
1444
1445			intel_gt_chipset_flush(engine->gt);
1446		}
1447
1448		i915_vma_put(request[idx]->batch);
1449		i915_request_put(request[idx]);
1450		idx++;
1451	}
1452out_free:
1453	kfree(request);
1454	return err;
1455}
1456
1457struct parallel_thread {
1458	struct kthread_worker *worker;
1459	struct kthread_work work;
1460	struct intel_engine_cs *engine;
1461	int result;
1462};
1463
1464static void __live_parallel_engine1(struct kthread_work *work)
1465{
1466	struct parallel_thread *thread =
1467		container_of(work, typeof(*thread), work);
1468	struct intel_engine_cs *engine = thread->engine;
1469	IGT_TIMEOUT(end_time);
1470	unsigned long count;
1471	int err = 0;
1472
1473	count = 0;
1474	intel_engine_pm_get(engine);
1475	do {
1476		struct i915_request *rq;
1477
1478		rq = i915_request_create(engine->kernel_context);
1479		if (IS_ERR(rq)) {
1480			err = PTR_ERR(rq);
1481			break;
1482		}
1483
1484		i915_request_get(rq);
1485		i915_request_add(rq);
1486
1487		err = 0;
1488		if (i915_request_wait(rq, 0, HZ) < 0)
1489			err = -ETIME;
1490		i915_request_put(rq);
1491		if (err)
1492			break;
1493
1494		count++;
1495	} while (!__igt_timeout(end_time, NULL));
1496	intel_engine_pm_put(engine);
1497
	pr_info("%s: %lu requests + sync\n", engine->name, count);
1499	thread->result = err;
1500}
1501
1502static void __live_parallel_engineN(struct kthread_work *work)
1503{
1504	struct parallel_thread *thread =
1505		container_of(work, typeof(*thread), work);
1506	struct intel_engine_cs *engine = thread->engine;
1507	IGT_TIMEOUT(end_time);
1508	unsigned long count;
1509	int err = 0;
1510
1511	count = 0;
1512	intel_engine_pm_get(engine);
1513	do {
1514		struct i915_request *rq;
1515
1516		rq = i915_request_create(engine->kernel_context);
1517		if (IS_ERR(rq)) {
1518			err = PTR_ERR(rq);
1519			break;
1520		}
1521
1522		i915_request_add(rq);
1523		count++;
1524	} while (!__igt_timeout(end_time, NULL));
1525	intel_engine_pm_put(engine);
1526
1527	pr_info("%s: %lu requests\n", engine->name, count);
1528	thread->result = err;
1529}
1530
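/*
 * wake_all()/wait_for_all() use i915->selftest.counter as a simple barrier so
 * that each parallel spinner keeps its engine busy until every engine has
 * checked in (or the selftest timeout expires).
 */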
1531static bool wake_all(struct drm_i915_private *i915)
1532{
1533	if (atomic_dec_and_test(&i915->selftest.counter)) {
1534		wake_up_var(&i915->selftest.counter);
1535		return true;
1536	}
1537
1538	return false;
1539}
1540
1541static int wait_for_all(struct drm_i915_private *i915)
1542{
1543	if (wake_all(i915))
1544		return 0;
1545
1546	if (wait_var_event_timeout(&i915->selftest.counter,
1547				   !atomic_read(&i915->selftest.counter),
1548				   i915_selftest.timeout_jiffies))
1549		return 0;
1550
1551	return -ETIME;
1552}
1553
1554static void __live_parallel_spin(struct kthread_work *work)
1555{
1556	struct parallel_thread *thread =
1557		container_of(work, typeof(*thread), work);
1558	struct intel_engine_cs *engine = thread->engine;
1559	struct igt_spinner spin;
1560	struct i915_request *rq;
1561	int err = 0;
1562
1563	/*
1564	 * Create a spinner running for eternity on each engine. If a second
1565	 * spinner is incorrectly placed on the same engine, it will not be
1566	 * able to start in time.
1567	 */
1568
1569	if (igt_spinner_init(&spin, engine->gt)) {
1570		wake_all(engine->i915);
1571		thread->result = -ENOMEM;
1572		return;
1573	}
1574
1575	intel_engine_pm_get(engine);
1576	rq = igt_spinner_create_request(&spin,
1577					engine->kernel_context,
1578					MI_NOOP); /* no preemption */
1579	intel_engine_pm_put(engine);
1580	if (IS_ERR(rq)) {
1581		err = PTR_ERR(rq);
1582		if (err == -ENODEV)
1583			err = 0;
1584		wake_all(engine->i915);
1585		goto out_spin;
1586	}
1587
1588	i915_request_get(rq);
1589	i915_request_add(rq);
1590	if (igt_wait_for_spinner(&spin, rq)) {
1591		/* Occupy this engine for the whole test */
1592		err = wait_for_all(engine->i915);
1593	} else {
1594		pr_err("Failed to start spinner on %s\n", engine->name);
1595		err = -EINVAL;
1596	}
1597	igt_spinner_end(&spin);
1598
1599	if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1600		err = -EIO;
1601	i915_request_put(rq);
1602
1603out_spin:
1604	igt_spinner_fini(&spin);
1605	thread->result = err;
1606}
1607
1608static int live_parallel_engines(void *arg)
1609{
1610	struct drm_i915_private *i915 = arg;
1611	static void (* const func[])(struct kthread_work *) = {
1612		__live_parallel_engine1,
1613		__live_parallel_engineN,
1614		__live_parallel_spin,
1615		NULL,
1616	};
1617	const unsigned int nengines = num_uabi_engines(i915);
1618	struct parallel_thread *threads;
1619	struct intel_engine_cs *engine;
1620	void (* const *fn)(struct kthread_work *);
1621	int err = 0;
1622
1623	/*
1624	 * Check we can submit requests to all engines concurrently. This
1625	 * tests that we load up the system maximally.
1626	 */
1627
1628	threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL);
1629	if (!threads)
1630		return -ENOMEM;
1631
1632	for (fn = func; !err && *fn; fn++) {
1633		char name[KSYM_NAME_LEN];
1634		struct igt_live_test t;
1635		unsigned int idx;
1636
1637		snprintf(name, sizeof(name), "%ps", *fn);
1638		err = igt_live_test_begin(&t, i915, __func__, name);
1639		if (err)
1640			break;
1641
1642		atomic_set(&i915->selftest.counter, nengines);
1643
1644		idx = 0;
1645		for_each_uabi_engine(engine, i915) {
1646			struct kthread_worker *worker;
1647
1648			worker = kthread_create_worker(0, "igt/parallel:%s",
1649						       engine->name);
1650			if (IS_ERR(worker)) {
1651				err = PTR_ERR(worker);
1652				break;
1653			}
1654
1655			threads[idx].worker = worker;
1656			threads[idx].result = 0;
1657			threads[idx].engine = engine;
1658
1659			kthread_init_work(&threads[idx].work, *fn);
1660			kthread_queue_work(worker, &threads[idx].work);
1661			idx++;
1662		}
1663
1664		idx = 0;
1665		for_each_uabi_engine(engine, i915) {
1666			int status;
1667
1668			if (!threads[idx].worker)
1669				break;
1670
1671			kthread_flush_work(&threads[idx].work);
1672			status = READ_ONCE(threads[idx].result);
1673			if (status && !err)
1674				err = status;
1675
1676			kthread_destroy_worker(threads[idx++].worker);
1677		}
1678
1679		if (igt_live_test_end(&t))
1680			err = -EIO;
1681	}
1682
1683	kfree(threads);
1684	return err;
1685}
1686
1687static int
1688max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1689{
1690	struct i915_request *rq;
1691	int ret;
1692
1693	/*
1694	 * Before execlists, all contexts share the same ringbuffer. With
1695	 * execlists, each context/engine has a separate ringbuffer and
1696	 * for the purposes of this test, inexhaustible.
1697	 *
1698	 * For the global ringbuffer though, we have to be very careful
1699	 * that we do not wrap while preventing the execution of requests
	 * with an unsignaled fence.
1701	 */
1702	if (HAS_EXECLISTS(ctx->i915))
1703		return INT_MAX;
1704
1705	rq = igt_request_alloc(ctx, engine);
1706	if (IS_ERR(rq)) {
1707		ret = PTR_ERR(rq);
1708	} else {
1709		int sz;
1710
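		/*
		 * Estimate how many requests fit in the legacy ring: take the
		 * usable ring space (size minus the reserved tail), divide by
		 * the footprint of the request just emitted, and keep only
		 * half of that as a safety margin.
		 */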
1711		ret = rq->ring->size - rq->reserved_space;
1712		i915_request_add(rq);
1713
1714		sz = rq->ring->emit - rq->head;
1715		if (sz < 0)
1716			sz += rq->ring->size;
1717		ret /= sz;
1718		ret /= 2; /* leave half spare, in case of emergency! */
1719	}
1720
1721	return ret;
1722}
1723
1724static int live_breadcrumbs_smoketest(void *arg)
1725{
1726	struct drm_i915_private *i915 = arg;
1727	const unsigned int nengines = num_uabi_engines(i915);
1728	const unsigned int ncpus = /* saturate with nengines * ncpus */
1729		max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines));
1730	unsigned long num_waits, num_fences;
1731	struct intel_engine_cs *engine;
1732	struct smoke_thread *threads;
1733	struct igt_live_test live;
1734	intel_wakeref_t wakeref;
1735	struct smoketest *smoke;
1736	unsigned int n, idx;
1737	struct file *file;
1738	int ret = 0;
1739
1740	/*
1741	 * Smoketest our breadcrumb/signal handling for requests across multiple
1742	 * threads. A very simple test to only catch the most egregious of bugs.
1743	 * See __igt_breadcrumbs_smoketest();
1744	 *
1745	 * On real hardware this time.
1746	 */
1747
1748	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1749
1750	file = mock_file(i915);
1751	if (IS_ERR(file)) {
1752		ret = PTR_ERR(file);
1753		goto out_rpm;
1754	}
1755
1756	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1757	if (!smoke) {
1758		ret = -ENOMEM;
1759		goto out_file;
1760	}
1761
1762	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1763	if (!threads) {
1764		ret = -ENOMEM;
1765		goto out_smoke;
1766	}
1767
1768	smoke[0].request_alloc = __live_request_alloc;
1769	smoke[0].ncontexts = 64;
1770	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1771				    sizeof(*smoke[0].contexts),
1772				    GFP_KERNEL);
1773	if (!smoke[0].contexts) {
1774		ret = -ENOMEM;
1775		goto out_threads;
1776	}
1777
1778	for (n = 0; n < smoke[0].ncontexts; n++) {
1779		smoke[0].contexts[n] = live_context(i915, file);
1780		if (IS_ERR(smoke[0].contexts[n])) {
1781			ret = PTR_ERR(smoke[0].contexts[n]);
1782			goto out_contexts;
1783		}
1784	}
1785
1786	ret = igt_live_test_begin(&live, i915, __func__, "");
1787	if (ret)
1788		goto out_contexts;
1789
1790	idx = 0;
1791	for_each_uabi_engine(engine, i915) {
1792		smoke[idx] = smoke[0];
1793		smoke[idx].engine = engine;
1794		smoke[idx].max_batch =
1795			max_batches(smoke[0].contexts[0], engine);
1796		if (smoke[idx].max_batch < 0) {
1797			ret = smoke[idx].max_batch;
1798			goto out_flush;
1799		}
1800		/* One ring interleaved between requests from all cpus */
1801		smoke[idx].max_batch /= ncpus + 1;
1802		pr_debug("Limiting batches to %d requests on %s\n",
1803			 smoke[idx].max_batch, engine->name);
1804
1805		for (n = 0; n < ncpus; n++) {
1806			unsigned int i = idx * ncpus + n;
1807			struct kthread_worker *worker;
1808
1809			worker = kthread_create_worker(0, "igt/%d.%d", idx, n);
1810			if (IS_ERR(worker)) {
1811				ret = PTR_ERR(worker);
1812				goto out_flush;
1813			}
1814
1815			threads[i].worker = worker;
1816			threads[i].t = &smoke[idx];
1817
1818			kthread_init_work(&threads[i].work,
1819					  __igt_breadcrumbs_smoketest);
1820			kthread_queue_work(worker, &threads[i].work);
1821		}
1822
1823		idx++;
1824	}
1825
1826	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1827
1828out_flush:
1829	idx = 0;
1830	num_waits = 0;
1831	num_fences = 0;
1832	for_each_uabi_engine(engine, i915) {
1833		for (n = 0; n < ncpus; n++) {
1834			unsigned int i = idx * ncpus + n;
1835			int err;
1836
1837			if (!threads[i].worker)
1838				continue;
1839
1840			WRITE_ONCE(threads[i].stop, true);
1841			kthread_flush_work(&threads[i].work);
1842			err = READ_ONCE(threads[i].result);
1843			if (err < 0 && !ret)
1844				ret = err;
1845
1846			kthread_destroy_worker(threads[i].worker);
1847		}
1848
1849		num_waits += atomic_long_read(&smoke[idx].num_waits);
1850		num_fences += atomic_long_read(&smoke[idx].num_fences);
1851		idx++;
1852	}
1853	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1854		num_waits, num_fences, idx, ncpus);
1855
1856	ret = igt_live_test_end(&live) ?: ret;
1857out_contexts:
1858	kfree(smoke[0].contexts);
1859out_threads:
1860	kfree(threads);
1861out_smoke:
1862	kfree(smoke);
1863out_file:
1864	fput(file);
1865out_rpm:
1866	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1867
1868	return ret;
1869}
1870
1871int i915_request_live_selftests(struct drm_i915_private *i915)
1872{
1873	static const struct i915_subtest tests[] = {
1874		SUBTEST(live_nop_request),
1875		SUBTEST(live_all_engines),
1876		SUBTEST(live_sequential_engines),
1877		SUBTEST(live_parallel_engines),
1878		SUBTEST(live_empty_request),
1879		SUBTEST(live_cancel_request),
1880		SUBTEST(live_breadcrumbs_smoketest),
1881	};
1882
1883	if (intel_gt_is_wedged(to_gt(i915)))
1884		return 0;
1885
1886	return i915_live_subtests(tests, i915);
1887}
1888
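/*
 * Quiesce the context by queueing a kernel-context request behind its last
 * request, waiting for it, and then flushing submission until the engine
 * reports idle. A wait failure only overwrites err if the caller had not
 * already recorded an error.
 */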
1889static int switch_to_kernel_sync(struct intel_context *ce, int err)
1890{
1891	struct i915_request *rq;
1892	struct dma_fence *fence;
1893
1894	rq = intel_engine_create_kernel_request(ce->engine);
1895	if (IS_ERR(rq))
1896		return PTR_ERR(rq);
1897
1898	fence = i915_active_fence_get(&ce->timeline->last_request);
1899	if (fence) {
1900		i915_request_await_dma_fence(rq, fence);
1901		dma_fence_put(fence);
1902	}
1903
1904	rq = i915_request_get(rq);
1905	i915_request_add(rq);
1906	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1907		err = -ETIME;
1908	i915_request_put(rq);
1909
1910	while (!err && !intel_engine_is_idle(ce->engine))
1911		intel_engine_flush_submission(ce->engine);
1912
1913	return err;
1914}
1915
1916struct perf_stats {
1917	struct intel_engine_cs *engine;
1918	unsigned long count;
1919	ktime_t time;
1920	ktime_t busy;
1921	u64 runtime;
1922};
1923
1924struct perf_series {
1925	struct drm_i915_private *i915;
1926	unsigned int nengines;
1927	struct intel_context *ce[] __counted_by(nengines);
1928};
1929
1930static int cmp_u32(const void *A, const void *B)
1931{
1932	const u32 *a = A, *b = B;
1933
1934	return *a - *b;
1935}
1936
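/*
 * Reduce TF_COUNT samples to a single value: sort them and take a weighted
 * sum of the middle three (the median counted twice). The result is therefore
 * scaled by 4, which callers undo by shifting right by TF_BIAS (or dividing
 * by 1 << TF_BIAS in cycles_to_ns()).
 */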
1937static u32 trifilter(u32 *a)
1938{
1939	u64 sum;
1940
1941#define TF_COUNT 5
1942	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1943
1944	sum = mul_u32_u32(a[2], 2);
1945	sum += a[1];
1946	sum += a[3];
1947
1948	GEM_BUG_ON(sum > U32_MAX);
1949	return sum;
1950#define TF_BIAS 2
1951}
1952
1953static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1954{
1955	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1956
1957	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1958}
1959
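/*
 * Helpers that write raw commands into the ring: a timestamp capture
 * (MI_STORE_REGISTER_MEM of RING_TIMESTAMP), an immediate dword store, and a
 * polling MI_SEMAPHORE_WAIT, all targeting offsets in the global GTT.
 */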
1960static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1961{
1962	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1963	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1964	*cs++ = offset;
1965	*cs++ = 0;
1966
1967	return cs;
1968}
1969
1970static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1971{
1972	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1973	*cs++ = offset;
1974	*cs++ = 0;
1975	*cs++ = value;
1976
1977	return cs;
1978}
1979
1980static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1981{
1982	*cs++ = MI_SEMAPHORE_WAIT |
1983		MI_SEMAPHORE_GLOBAL_GTT |
1984		MI_SEMAPHORE_POLL |
1985		mode;
1986	*cs++ = value;
1987	*cs++ = offset;
1988	*cs++ = 0;
1989
1990	return cs;
1991}
1992
1993static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1994{
1995	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1996}
1997
1998static void semaphore_set(u32 *sema, u32 value)
1999{
2000	WRITE_ONCE(*sema, value);
2001	wmb(); /* flush the update to the cache, and beyond */
2002}
2003
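/*
 * Carve a small scratch area out of the engine's status page (HWSP), well
 * clear of the dwords used by the driver itself: slot 0 acts as the CPU/GPU
 * semaphore and the following slots receive the timestamps recorded by the
 * measure_*() routines below.
 */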
2004static u32 *hwsp_scratch(const struct intel_context *ce)
2005{
2006	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
2007}
2008
2009static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
2010{
2011	return (i915_ggtt_offset(ce->engine->status_page.vma) +
2012		offset_in_page(dw));
2013}
2014
2015static int measure_semaphore_response(struct intel_context *ce)
2016{
2017	u32 *sema = hwsp_scratch(ce);
2018	const u32 offset = hwsp_offset(ce, sema);
2019	u32 elapsed[TF_COUNT], cycles;
2020	struct i915_request *rq;
2021	u32 *cs;
2022	int err;
2023	int i;
2024
2025	/*
2026	 * Measure how many cycles it takes for the HW to detect the change
2027	 * in a semaphore value.
2028	 *
2029	 *    A: read CS_TIMESTAMP from CPU
2030	 *    poke semaphore
2031	 *    B: read CS_TIMESTAMP on GPU
2032	 *
2033	 * Semaphore latency: B - A
2034	 */
2035
2036	semaphore_set(sema, -1);
2037
2038	rq = i915_request_create(ce);
2039	if (IS_ERR(rq))
2040		return PTR_ERR(rq);
2041
2042	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
2043	if (IS_ERR(cs)) {
2044		i915_request_add(rq);
2045		err = PTR_ERR(cs);
2046		goto err;
2047	}
2048
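	/*
	 * Emit one long request: clear the semaphore to signal that the GPU
	 * has started, then for each sample poll until the CPU writes i,
	 * record RING_TIMESTAMP into sema[i] and clear the semaphore again so
	 * the CPU knows the GPU has caught up.
	 */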
2049	cs = emit_store_dw(cs, offset, 0);
2050	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2051		cs = emit_semaphore_poll_until(cs, offset, i);
2052		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2053		cs = emit_store_dw(cs, offset, 0);
2054	}
2055
2056	intel_ring_advance(rq, cs);
2057	i915_request_add(rq);
2058
2059	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2060		err = -EIO;
2061		goto err;
2062	}
2063
2064	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2065		preempt_disable();
2066		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2067		semaphore_set(sema, i);
2068		preempt_enable();
2069
2070		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2071			err = -EIO;
2072			goto err;
2073		}
2074
2075		elapsed[i - 1] = sema[i] - cycles;
2076	}
2077
2078	cycles = trifilter(elapsed);
2079	pr_info("%s: semaphore response %d cycles, %lluns\n",
2080		ce->engine->name, cycles >> TF_BIAS,
2081		cycles_to_ns(ce->engine, cycles));
2082
2083	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2084
2085err:
2086	intel_gt_set_wedged(ce->engine->gt);
2087	return err;
2088}
2089
2090static int measure_idle_dispatch(struct intel_context *ce)
2091{
2092	u32 *sema = hwsp_scratch(ce);
2093	const u32 offset = hwsp_offset(ce, sema);
2094	u32 elapsed[TF_COUNT], cycles;
2095	u32 *cs;
2096	int err;
2097	int i;
2098
2099	/*
2100	 * Measure how long it takes for us to submit a request while the
2101	 * engine is idle, but is resting in our context.
2102	 *
2103	 *    A: read CS_TIMESTAMP from CPU
2104	 *    submit request
2105	 *    B: read CS_TIMESTAMP on GPU
2106	 *
2107	 * Submission latency: B - A
2108	 */
2109
2110	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2111		struct i915_request *rq;
2112
2113		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2114		if (err)
2115			return err;
2116
2117		rq = i915_request_create(ce);
2118		if (IS_ERR(rq)) {
2119			err = PTR_ERR(rq);
2120			goto err;
2121		}
2122
2123		cs = intel_ring_begin(rq, 4);
2124		if (IS_ERR(cs)) {
2125			i915_request_add(rq);
2126			err = PTR_ERR(cs);
2127			goto err;
2128		}
2129
2130		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2131
2132		intel_ring_advance(rq, cs);
2133
2134		preempt_disable();
2135		local_bh_disable();
2136		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2137		i915_request_add(rq);
2138		local_bh_enable();
2139		preempt_enable();
2140	}
2141
2142	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143	if (err)
2144		goto err;
2145
2146	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2147		elapsed[i] = sema[i] - elapsed[i];
2148
2149	cycles = trifilter(elapsed);
2150	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2151		ce->engine->name, cycles >> TF_BIAS,
2152		cycles_to_ns(ce->engine, cycles));
2153
2154	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156err:
2157	intel_gt_set_wedged(ce->engine->gt);
2158	return err;
2159}
2160
2161static int measure_busy_dispatch(struct intel_context *ce)
2162{
2163	u32 *sema = hwsp_scratch(ce);
2164	const u32 offset = hwsp_offset(ce, sema);
2165	u32 elapsed[TF_COUNT + 1], cycles;
2166	u32 *cs;
2167	int err;
2168	int i;
2169
2170	/*
2171	 * Measure how long it takes for us to submit a request while the
2172	 * engine is busy, polling on a semaphore in our context. With
2173	 * direct submission, this will include the cost of a lite restore.
2174	 *
2175	 *    A: read CS_TIMESTAMP from CPU
2176	 *    submit request
2177	 *    B: read CS_TIMESTAMP on GPU
2178	 *
2179	 * Submission latency: B - A
2180	 */
2181
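	/*
	 * Keep the previous request spinning on the semaphore while the
	 * next one is submitted, so that submission always hits a busy
	 * engine. The CPU-side timestamp is parked in elapsed[] and later
	 * subtracted from the GPU timestamp the spinner writes into sema[]
	 * once it is released.
	 */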
2182	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2183		struct i915_request *rq;
2184
2185		rq = i915_request_create(ce);
2186		if (IS_ERR(rq)) {
2187			err = PTR_ERR(rq);
2188			goto err;
2189		}
2190
2191		cs = intel_ring_begin(rq, 12);
2192		if (IS_ERR(cs)) {
2193			i915_request_add(rq);
2194			err = PTR_ERR(cs);
2195			goto err;
2196		}
2197
2198		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2199		cs = emit_semaphore_poll_until(cs, offset, i);
2200		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2201
2202		intel_ring_advance(rq, cs);
2203
2204		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2205			err = -EIO;
2206			goto err;
2207		}
2208
2209		preempt_disable();
2210		local_bh_disable();
2211		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2212		i915_request_add(rq);
2213		local_bh_enable();
2214		semaphore_set(sema, i - 1);
2215		preempt_enable();
2216	}
2217
2218	wait_for(READ_ONCE(sema[i - 1]), 500);
2219	semaphore_set(sema, i - 1);
2220
2221	for (i = 1; i <= TF_COUNT; i++) {
2222		GEM_BUG_ON(sema[i] == -1);
2223		elapsed[i - 1] = sema[i] - elapsed[i];
2224	}
2225
2226	cycles = trifilter(elapsed);
2227	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2228		ce->engine->name, cycles >> TF_BIAS,
2229		cycles_to_ns(ce->engine, cycles));
2230
2231	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2232
2233err:
2234	intel_gt_set_wedged(ce->engine->gt);
2235	return err;
2236}
2237
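/*
 * Submit a request on the engine's kernel context that spins on a semaphore
 * in the status page until the caller releases it with semaphore_set(). The
 * latency tests below use this to hold back execution while they queue up
 * their measurement requests.
 */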
2238static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2239{
2240	const u32 offset =
2241		i915_ggtt_offset(engine->status_page.vma) +
2242		offset_in_page(sema);
2243	struct i915_request *rq;
2244	u32 *cs;
2245
2246	rq = i915_request_create(engine->kernel_context);
2247	if (IS_ERR(rq))
2248		return PTR_ERR(rq);
2249
2250	cs = intel_ring_begin(rq, 4);
2251	if (IS_ERR(cs)) {
2252		i915_request_add(rq);
2253		return PTR_ERR(cs);
2254	}
2255
2256	cs = emit_semaphore_poll(cs, mode, value, offset);
2257
2258	intel_ring_advance(rq, cs);
2259	i915_request_add(rq);
2260
2261	return 0;
2262}
2263
2264static int measure_inter_request(struct intel_context *ce)
2265{
2266	u32 *sema = hwsp_scratch(ce);
2267	const u32 offset = hwsp_offset(ce, sema);
2268	u32 elapsed[TF_COUNT + 1], cycles;
2269	struct i915_sw_fence *submit;
2270	int i, err;
2271
2272	/*
2273	 * Measure how long it takes to advance from one request into the
2274	 * next. Between each request we flush the GPU caches to memory,
2275	 * update the breadcrumbs, and then invalidate those caches.
2276	 * We queue up all the requests to be submitted in one batch so
2277	 * that the measurements form one contiguous set.
2278	 *
2279	 *    A: read CS_TIMESTAMP on GPU
2280	 *    advance request
2281	 *    B: read CS_TIMESTAMP on GPU
2282	 *
2283	 * Request latency: B - A
2284	 */
2285
2286	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2287	if (err)
2288		return err;
2289
2290	submit = heap_fence_create(GFP_KERNEL);
2291	if (!submit) {
2292		semaphore_set(sema, 1);
2293		return -ENOMEM;
2294	}
2295
2296	intel_engine_flush_submission(ce->engine);
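	/*
	 * Every request below is gated on the 'submit' fence, so none of
	 * them reaches the engine until i915_sw_fence_commit() fires after
	 * the whole batch has been constructed.
	 */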
2297	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2298		struct i915_request *rq;
2299		u32 *cs;
2300
2301		rq = i915_request_create(ce);
2302		if (IS_ERR(rq)) {
2303			err = PTR_ERR(rq);
2304			goto err_submit;
2305		}
2306
2307		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2308						       submit,
2309						       GFP_KERNEL);
2310		if (err < 0) {
2311			i915_request_add(rq);
2312			goto err_submit;
2313		}
2314
2315		cs = intel_ring_begin(rq, 4);
2316		if (IS_ERR(cs)) {
2317			i915_request_add(rq);
2318			err = PTR_ERR(cs);
2319			goto err_submit;
2320		}
2321
2322		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2323
2324		intel_ring_advance(rq, cs);
2325		i915_request_add(rq);
2326	}
2327	i915_sw_fence_commit(submit);
2328	intel_engine_flush_submission(ce->engine);
2329	heap_fence_put(submit);
2330
2331	semaphore_set(sema, 1);
2332	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2333	if (err)
2334		goto err;
2335
2336	for (i = 1; i <= TF_COUNT; i++)
2337		elapsed[i - 1] = sema[i + 1] - sema[i];
2338
2339	cycles = trifilter(elapsed);
2340	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2341		ce->engine->name, cycles >> TF_BIAS,
2342		cycles_to_ns(ce->engine, cycles));
2343
2344	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2345
2346err_submit:
2347	i915_sw_fence_commit(submit);
2348	heap_fence_put(submit);
2349	semaphore_set(sema, 1);
2350err:
2351	intel_gt_set_wedged(ce->engine->gt);
2352	return err;
2353}
2354
2355static int measure_context_switch(struct intel_context *ce)
2356{
2357	u32 *sema = hwsp_scratch(ce);
2358	const u32 offset = hwsp_offset(ce, sema);
2359	struct i915_request *fence = NULL;
2360	u32 elapsed[TF_COUNT + 1], cycles;
2361	int i, j, err;
2362	u32 *cs;
2363
2364	/*
2365	 * Measure how long it takes to advance from one request in one
2366	 * context to a request in another context. This allows us to
2367	 * measure how long the context save/restore takes, along with all
2368	 * the inter-context setup we require.
2369	 *
2370	 *    A: read CS_TIMESTAMP on GPU
2371	 *    switch context
2372	 *    B: read CS_TIMESTAMP on GPU
2373	 *
2374	 * Context switch latency: B - A
2375	 */
2376
2377	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2378	if (err)
2379		return err;
2380
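	/*
	 * Build a chain that alternates between our context and the kernel
	 * context, each request fenced on the previous one, so that every
	 * adjacent pair of timestamps spans exactly one context switch.
	 */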
2381	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2382		struct intel_context *arr[] = {
2383			ce, ce->engine->kernel_context
2384		};
2385		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2386
2387		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2388			struct i915_request *rq;
2389
2390			rq = i915_request_create(arr[j]);
2391			if (IS_ERR(rq)) {
2392				err = PTR_ERR(rq);
2393				goto err_fence;
2394			}
2395
2396			if (fence) {
2397				err = i915_request_await_dma_fence(rq,
2398								   &fence->fence);
2399				if (err) {
2400					i915_request_add(rq);
2401					goto err_fence;
2402				}
2403			}
2404
2405			cs = intel_ring_begin(rq, 4);
2406			if (IS_ERR(cs)) {
2407				i915_request_add(rq);
2408				err = PTR_ERR(cs);
2409				goto err_fence;
2410			}
2411
2412			cs = emit_timestamp_store(cs, ce, addr);
2413			addr += sizeof(u32);
2414
2415			intel_ring_advance(rq, cs);
2416
2417			i915_request_put(fence);
2418			fence = i915_request_get(rq);
2419
2420			i915_request_add(rq);
2421		}
2422	}
2423	i915_request_put(fence);
2424	intel_engine_flush_submission(ce->engine);
2425
2426	semaphore_set(sema, 1);
2427	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2428	if (err)
2429		goto err;
2430
2431	for (i = 1; i <= TF_COUNT; i++)
2432		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2433
2434	cycles = trifilter(elapsed);
2435	pr_info("%s: context switch latency %d cycles, %lluns\n",
2436		ce->engine->name, cycles >> TF_BIAS,
2437		cycles_to_ns(ce->engine, cycles));
2438
2439	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2440
2441err_fence:
2442	i915_request_put(fence);
2443	semaphore_set(sema, 1);
2444err:
2445	intel_gt_set_wedged(ce->engine->gt);
2446	return err;
2447}
2448
2449static int measure_preemption(struct intel_context *ce)
2450{
2451	u32 *sema = hwsp_scratch(ce);
2452	const u32 offset = hwsp_offset(ce, sema);
2453	u32 elapsed[TF_COUNT], cycles;
2454	u32 *cs;
2455	int err;
2456	int i;
2457
2458	/*
2459	 * We measure two latencies while triggering preemption. The first
2460	 * latency is how long it takes for us to submit a preempting request.
2461	 * The second latency is how long it takes for us to return from the
2462	 * preemption back to the original context.
2463	 *
2464	 *    A: read CS_TIMESTAMP from CPU
2465	 *    submit preemption
2466	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2467	 *    context switch
2468	 *    C: read CS_TIMESTAMP on GPU (in original context)
2469	 *
2470	 * Preemption dispatch latency: B - A
2471	 * Preemption switch latency: C - B
2472	 */
2473
2474	if (!intel_engine_has_preemption(ce->engine))
2475		return 0;
2476
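	/*
	 * Each round queues a spinner in our context, then a kernel-context
	 * request at I915_PRIORITY_BARRIER which preempts it, records
	 * timestamp B and releases the semaphore so the original context
	 * resumes and records timestamp C.
	 */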
2477	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2478		u32 addr = offset + 2 * i * sizeof(u32);
2479		struct i915_request *rq;
2480
2481		rq = i915_request_create(ce);
2482		if (IS_ERR(rq)) {
2483			err = PTR_ERR(rq);
2484			goto err;
2485		}
2486
2487		cs = intel_ring_begin(rq, 12);
2488		if (IS_ERR(cs)) {
2489			i915_request_add(rq);
2490			err = PTR_ERR(cs);
2491			goto err;
2492		}
2493
2494		cs = emit_store_dw(cs, addr, -1);
2495		cs = emit_semaphore_poll_until(cs, offset, i);
2496		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2497
2498		intel_ring_advance(rq, cs);
2499		i915_request_add(rq);
2500
2501		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2502			err = -EIO;
2503			goto err;
2504		}
2505
2506		rq = i915_request_create(ce->engine->kernel_context);
2507		if (IS_ERR(rq)) {
2508			err = PTR_ERR(rq);
2509			goto err;
2510		}
2511
2512		cs = intel_ring_begin(rq, 8);
2513		if (IS_ERR(cs)) {
2514			i915_request_add(rq);
2515			err = PTR_ERR(cs);
2516			goto err;
2517		}
2518
2519		cs = emit_timestamp_store(cs, ce, addr);
2520		cs = emit_store_dw(cs, offset, i);
2521
2522		intel_ring_advance(rq, cs);
2523		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2524
2525		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2526		i915_request_add(rq);
2527	}
2528
2529	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2530		err = -EIO;
2531		goto err;
2532	}
2533
2534	for (i = 1; i <= TF_COUNT; i++)
2535		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2536
2537	cycles = trifilter(elapsed);
2538	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2539		ce->engine->name, cycles >> TF_BIAS,
2540		cycles_to_ns(ce->engine, cycles));
2541
2542	for (i = 1; i <= TF_COUNT; i++)
2543		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2544
2545	cycles = trifilter(elapsed);
2546	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2547		ce->engine->name, cycles >> TF_BIAS,
2548		cycles_to_ns(ce->engine, cycles));
2549
2550	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2551
2552err:
2553	intel_gt_set_wedged(ce->engine->gt);
2554	return err;
2555}
2556
2557struct signal_cb {
2558	struct dma_fence_cb base;
2559	bool seen;
2560};
2561
2562static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2563{
2564	struct signal_cb *s = container_of(cb, typeof(*s), base);
2565
2566	smp_store_mb(s->seen, true); /* be safe, be strong */
2567}
2568
2569static int measure_completion(struct intel_context *ce)
2570{
2571	u32 *sema = hwsp_scratch(ce);
2572	const u32 offset = hwsp_offset(ce, sema);
2573	u32 elapsed[TF_COUNT], cycles;
2574	u32 *cs;
2575	int err;
2576	int i;
2577
2578	/*
2579	 * Measure how long it takes for the signal (interrupt) to be
2580	 * sent from the GPU and then processed by the CPU.
2581	 *
2582	 *    A: read CS_TIMESTAMP on GPU
2583	 *    signal
2584	 *    B: read CS_TIMESTAMP from CPU
2585	 *
2586	 * Completion latency: B - A
2587	 */
2588
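	/*
	 * Each request spins on the semaphore and then writes its CS
	 * timestamp (A). Once the CPU releases the semaphore, we busy-wait
	 * for the fence callback and immediately sample RING_TIMESTAMP (B),
	 * so that B - A approximates the completion delivery time.
	 */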
2589	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2590		struct signal_cb cb = { .seen = false };
2591		struct i915_request *rq;
2592
2593		rq = i915_request_create(ce);
2594		if (IS_ERR(rq)) {
2595			err = PTR_ERR(rq);
2596			goto err;
2597		}
2598
2599		cs = intel_ring_begin(rq, 12);
2600		if (IS_ERR(cs)) {
2601			i915_request_add(rq);
2602			err = PTR_ERR(cs);
2603			goto err;
2604		}
2605
2606		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2607		cs = emit_semaphore_poll_until(cs, offset, i);
2608		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2609
2610		intel_ring_advance(rq, cs);
2611
2612		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2613		i915_request_add(rq);
2614
2615		intel_engine_flush_submission(ce->engine);
2616		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2617			err = -EIO;
2618			goto err;
2619		}
2620
2621		preempt_disable();
2622		semaphore_set(sema, i);
2623		while (!READ_ONCE(cb.seen))
2624			cpu_relax();
2625
2626		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2627		preempt_enable();
2628	}
2629
2630	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2631	if (err)
2632		goto err;
2633
2634	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2635		GEM_BUG_ON(sema[i + 1] == -1);
2636		elapsed[i] = elapsed[i] - sema[i + 1];
2637	}
2638
2639	cycles = trifilter(elapsed);
2640	pr_info("%s: completion latency %d cycles, %lluns\n",
2641		ce->engine->name, cycles >> TF_BIAS,
2642		cycles_to_ns(ce->engine, cycles));
2643
2644	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2645
2646err:
2647	intel_gt_set_wedged(ce->engine->gt);
2648	return err;
2649}
2650
2651static void rps_pin(struct intel_gt *gt)
2652{
2653	/* Pin the frequency to max */
2654	atomic_inc(&gt->rps.num_waiters);
2655	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2656
2657	mutex_lock(&gt->rps.lock);
2658	intel_rps_set(&gt->rps, gt->rps.max_freq);
2659	mutex_unlock(&gt->rps.lock);
2660}
2661
2662static void rps_unpin(struct intel_gt *gt)
2663{
2664	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2665	atomic_dec(&gt->rps.num_waiters);
2666}
2667
2668static int perf_request_latency(void *arg)
2669{
2670	struct drm_i915_private *i915 = arg;
2671	struct intel_engine_cs *engine;
2672	struct pm_qos_request qos;
2673	int err = 0;
2674
2675	if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2676		return 0;
2677
2678	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2679
2680	for_each_uabi_engine(engine, i915) {
2681		struct intel_context *ce;
2682
2683		ce = intel_context_create(engine);
2684		if (IS_ERR(ce)) {
2685			err = PTR_ERR(ce);
2686			goto out;
2687		}
2688
2689		err = intel_context_pin(ce);
2690		if (err) {
2691			intel_context_put(ce);
2692			goto out;
2693		}
2694
2695		st_engine_heartbeat_disable(engine);
2696		rps_pin(engine->gt);
2697
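		/*
		 * With the heartbeat parked and the frequency pinned to max,
		 * run each latency probe in turn; any failure wedges the GT
		 * inside the probe, so simply bail out afterwards.
		 */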
2698		if (err == 0)
2699			err = measure_semaphore_response(ce);
2700		if (err == 0)
2701			err = measure_idle_dispatch(ce);
2702		if (err == 0)
2703			err = measure_busy_dispatch(ce);
2704		if (err == 0)
2705			err = measure_inter_request(ce);
2706		if (err == 0)
2707			err = measure_context_switch(ce);
2708		if (err == 0)
2709			err = measure_preemption(ce);
2710		if (err == 0)
2711			err = measure_completion(ce);
2712
2713		rps_unpin(engine->gt);
2714		st_engine_heartbeat_enable(engine);
2715
2716		intel_context_unpin(ce);
2717		intel_context_put(ce);
2718		if (err)
2719			goto out;
2720	}
2721
2722out:
2723	if (igt_flush_test(i915))
2724		err = -EIO;
2725
2726	cpu_latency_qos_remove_request(&qos);
2727	return err;
2728}
2729
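/*
 * Three request series run over all engines: s_sync0 waits for each request
 * to complete before emitting the next, s_sync1 keeps one request in flight
 * by waiting on the previous one, and s_many submits continuously without
 * waiting at all.
 */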
2730static int s_sync0(void *arg)
2731{
2732	struct perf_series *ps = arg;
2733	IGT_TIMEOUT(end_time);
2734	unsigned int idx = 0;
2735	int err = 0;
2736
2737	GEM_BUG_ON(!ps->nengines);
2738	do {
2739		struct i915_request *rq;
2740
2741		rq = i915_request_create(ps->ce[idx]);
2742		if (IS_ERR(rq)) {
2743			err = PTR_ERR(rq);
2744			break;
2745		}
2746
2747		i915_request_get(rq);
2748		i915_request_add(rq);
2749
2750		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2751			err = -ETIME;
2752		i915_request_put(rq);
2753		if (err)
2754			break;
2755
2756		if (++idx == ps->nengines)
2757			idx = 0;
2758	} while (!__igt_timeout(end_time, NULL));
2759
2760	return err;
2761}
2762
2763static int s_sync1(void *arg)
2764{
2765	struct perf_series *ps = arg;
2766	struct i915_request *prev = NULL;
2767	IGT_TIMEOUT(end_time);
2768	unsigned int idx = 0;
2769	int err = 0;
2770
2771	GEM_BUG_ON(!ps->nengines);
2772	do {
2773		struct i915_request *rq;
2774
2775		rq = i915_request_create(ps->ce[idx]);
2776		if (IS_ERR(rq)) {
2777			err = PTR_ERR(rq);
2778			break;
2779		}
2780
2781		i915_request_get(rq);
2782		i915_request_add(rq);
2783
2784		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2785			err = -ETIME;
2786		i915_request_put(prev);
2787		prev = rq;
2788		if (err)
2789			break;
2790
2791		if (++idx == ps->nengines)
2792			idx = 0;
2793	} while (!__igt_timeout(end_time, NULL));
2794	i915_request_put(prev);
2795
2796	return err;
2797}
2798
2799static int s_many(void *arg)
2800{
2801	struct perf_series *ps = arg;
2802	IGT_TIMEOUT(end_time);
2803	unsigned int idx = 0;
2804
2805	GEM_BUG_ON(!ps->nengines);
2806	do {
2807		struct i915_request *rq;
2808
2809		rq = i915_request_create(ps->ce[idx]);
2810		if (IS_ERR(rq))
2811			return PTR_ERR(rq);
2812
2813		i915_request_add(rq);
2814
2815		if (++idx == ps->nengines)
2816			idx = 0;
2817	} while (!__igt_timeout(end_time, NULL));
2818
2819	return 0;
2820}
2821
2822static int perf_series_engines(void *arg)
2823{
2824	struct drm_i915_private *i915 = arg;
2825	static int (* const func[])(void *arg) = {
2826		s_sync0,
2827		s_sync1,
2828		s_many,
2829		NULL,
2830	};
2831	const unsigned int nengines = num_uabi_engines(i915);
2832	struct intel_engine_cs *engine;
2833	int (* const *fn)(void *arg);
2834	struct pm_qos_request qos;
2835	struct perf_stats *stats;
2836	struct perf_series *ps;
2837	unsigned int idx;
2838	int err = 0;
2839
2840	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2841	if (!stats)
2842		return -ENOMEM;
2843
2844	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2845	if (!ps) {
2846		kfree(stats);
2847		return -ENOMEM;
2848	}
2849
2850	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2851
2852	ps->i915 = i915;
2853	ps->nengines = nengines;
2854
2855	idx = 0;
2856	for_each_uabi_engine(engine, i915) {
2857		struct intel_context *ce;
2858
2859		ce = intel_context_create(engine);
2860		if (IS_ERR(ce)) {
2861			err = PTR_ERR(ce);
2862			goto out;
2863		}
2864
2865		err = intel_context_pin(ce);
2866		if (err) {
2867			intel_context_put(ce);
2868			goto out;
2869		}
2870
2871		ps->ce[idx++] = ce;
2872	}
2873	GEM_BUG_ON(idx != ps->nengines);
2874
2875	for (fn = func; *fn && !err; fn++) {
2876		char name[KSYM_NAME_LEN];
2877		struct igt_live_test t;
2878
2879		snprintf(name, sizeof(name), "%ps", *fn);
2880		err = igt_live_test_begin(&t, i915, __func__, name);
2881		if (err)
2882			break;
2883
2884		for (idx = 0; idx < nengines; idx++) {
2885			struct perf_stats *p =
2886				memset(&stats[idx], 0, sizeof(stats[idx]));
2887			struct intel_context *ce = ps->ce[idx];
2888
2889			p->engine = ps->ce[idx]->engine;
2890			intel_engine_pm_get(p->engine);
2891
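			/*
			 * Bias the busy-time sample by +1 so a valid reading
			 * is never zero; the later 'if (p->busy)' test relies
			 * on this, and the bias is removed when the delta is
			 * taken (p->busy - 1).
			 */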
2892			if (intel_engine_supports_stats(p->engine))
2893				p->busy = intel_engine_get_busy_time(p->engine,
2894								     &p->time) + 1;
2895			else
2896				p->time = ktime_get();
2897			p->runtime = -intel_context_get_total_runtime_ns(ce);
2898		}
2899
2900		err = (*fn)(ps);
2901		if (igt_live_test_end(&t))
2902			err = -EIO;
2903
2904		for (idx = 0; idx < nengines; idx++) {
2905			struct perf_stats *p = &stats[idx];
2906			struct intel_context *ce = ps->ce[idx];
2907			int integer, decimal;
2908			u64 busy, dt, now;
2909
2910			if (p->busy)
2911				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2912									       &now),
2913						    p->busy - 1);
2914			else
2915				now = ktime_get();
2916			p->time = ktime_sub(now, p->time);
2917
2918			err = switch_to_kernel_sync(ce, err);
2919			p->runtime += intel_context_get_total_runtime_ns(ce);
2920			intel_engine_pm_put(p->engine);
2921
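			/*
			 * Report busyness as a percentage with two decimal
			 * places: 'integer' is 100 * busy / walltime and
			 * 'decimal' holds the next two digits of the
			 * remainder.
			 */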
2922			busy = 100 * ktime_to_ns(p->busy);
2923			dt = ktime_to_ns(p->time);
2924			if (dt) {
2925				integer = div64_u64(busy, dt);
2926				busy -= integer * dt;
2927				decimal = div64_u64(100 * busy, dt);
2928			} else {
2929				integer = 0;
2930				decimal = 0;
2931			}
2932
2933			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2934				name, p->engine->name, ce->timeline->seqno,
2935				integer, decimal,
2936				div_u64(p->runtime, 1000 * 1000),
2937				div_u64(ktime_to_ns(p->time), 1000 * 1000));
2938		}
2939	}
2940
2941out:
2942	for (idx = 0; idx < nengines; idx++) {
2943		if (IS_ERR_OR_NULL(ps->ce[idx]))
2944			break;
2945
2946		intel_context_unpin(ps->ce[idx]);
2947		intel_context_put(ps->ce[idx]);
2948	}
2949	kfree(ps);
2950
2951	cpu_latency_qos_remove_request(&qos);
2952	kfree(stats);
2953	return err;
2954}
2955
2956struct p_thread {
2957	struct perf_stats p;
2958	struct kthread_worker *worker;
2959	struct kthread_work work;
2960	struct intel_engine_cs *engine;
2961	int result;
2962};
2963
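/*
 * Per-engine worker payloads mirroring the series tests above: p_sync0 waits
 * for each request before issuing the next, p_sync1 keeps a single request
 * in flight, and p_many submits without waiting. Each records the request
 * count, busy time and context runtime for its engine.
 */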
2964static void p_sync0(struct kthread_work *work)
2965{
2966	struct p_thread *thread = container_of(work, typeof(*thread), work);
2967	struct perf_stats *p = &thread->p;
2968	struct intel_engine_cs *engine = p->engine;
2969	struct intel_context *ce;
2970	IGT_TIMEOUT(end_time);
2971	unsigned long count;
2972	bool busy;
2973	int err = 0;
2974
2975	ce = intel_context_create(engine);
2976	if (IS_ERR(ce)) {
2977		thread->result = PTR_ERR(ce);
2978		return;
2979	}
2980
2981	err = intel_context_pin(ce);
2982	if (err) {
2983		intel_context_put(ce);
2984		thread->result = err;
2985		return;
2986	}
2987
2988	if (intel_engine_supports_stats(engine)) {
2989		p->busy = intel_engine_get_busy_time(engine, &p->time);
2990		busy = true;
2991	} else {
2992		p->time = ktime_get();
2993		busy = false;
2994	}
2995
2996	count = 0;
2997	do {
2998		struct i915_request *rq;
2999
3000		rq = i915_request_create(ce);
3001		if (IS_ERR(rq)) {
3002			err = PTR_ERR(rq);
3003			break;
3004		}
3005
3006		i915_request_get(rq);
3007		i915_request_add(rq);
3008
3009		err = 0;
3010		if (i915_request_wait(rq, 0, HZ) < 0)
3011			err = -ETIME;
3012		i915_request_put(rq);
3013		if (err)
3014			break;
3015
3016		count++;
3017	} while (!__igt_timeout(end_time, NULL));
3018
3019	if (busy) {
3020		ktime_t now;
3021
3022		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3023				    p->busy);
3024		p->time = ktime_sub(now, p->time);
3025	} else {
3026		p->time = ktime_sub(ktime_get(), p->time);
3027	}
3028
3029	err = switch_to_kernel_sync(ce, err);
3030	p->runtime = intel_context_get_total_runtime_ns(ce);
3031	p->count = count;
3032
3033	intel_context_unpin(ce);
3034	intel_context_put(ce);
3035	thread->result = err;
3036}
3037
3038static void p_sync1(struct kthread_work *work)
3039{
3040	struct p_thread *thread = container_of(work, typeof(*thread), work);
3041	struct perf_stats *p = &thread->p;
3042	struct intel_engine_cs *engine = p->engine;
3043	struct i915_request *prev = NULL;
3044	struct intel_context *ce;
3045	IGT_TIMEOUT(end_time);
3046	unsigned long count;
3047	bool busy;
3048	int err = 0;
3049
3050	ce = intel_context_create(engine);
3051	if (IS_ERR(ce)) {
3052		thread->result = PTR_ERR(ce);
3053		return;
3054	}
3055
3056	err = intel_context_pin(ce);
3057	if (err) {
3058		intel_context_put(ce);
3059		thread->result = err;
3060		return;
3061	}
3062
3063	if (intel_engine_supports_stats(engine)) {
3064		p->busy = intel_engine_get_busy_time(engine, &p->time);
3065		busy = true;
3066	} else {
3067		p->time = ktime_get();
3068		busy = false;
3069	}
3070
3071	count = 0;
3072	do {
3073		struct i915_request *rq;
3074
3075		rq = i915_request_create(ce);
3076		if (IS_ERR(rq)) {
3077			err = PTR_ERR(rq);
3078			break;
3079		}
3080
3081		i915_request_get(rq);
3082		i915_request_add(rq);
3083
3084		err = 0;
3085		if (prev && i915_request_wait(prev, 0, HZ) < 0)
3086			err = -ETIME;
3087		i915_request_put(prev);
3088		prev = rq;
3089		if (err)
3090			break;
3091
3092		count++;
3093	} while (!__igt_timeout(end_time, NULL));
3094	i915_request_put(prev);
3095
3096	if (busy) {
3097		ktime_t now;
3098
3099		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3100				    p->busy);
3101		p->time = ktime_sub(now, p->time);
3102	} else {
3103		p->time = ktime_sub(ktime_get(), p->time);
3104	}
3105
3106	err = switch_to_kernel_sync(ce, err);
3107	p->runtime = intel_context_get_total_runtime_ns(ce);
3108	p->count = count;
3109
3110	intel_context_unpin(ce);
3111	intel_context_put(ce);
3112	thread->result = err;
3113}
3114
3115static void p_many(struct kthread_work *work)
3116{
3117	struct p_thread *thread = container_of(work, typeof(*thread), work);
3118	struct perf_stats *p = &thread->p;
3119	struct intel_engine_cs *engine = p->engine;
3120	struct intel_context *ce;
3121	IGT_TIMEOUT(end_time);
3122	unsigned long count;
3123	int err = 0;
3124	bool busy;
3125
3126	ce = intel_context_create(engine);
3127	if (IS_ERR(ce)) {
3128		thread->result = PTR_ERR(ce);
3129		return;
3130	}
3131
3132	err = intel_context_pin(ce);
3133	if (err) {
3134		intel_context_put(ce);
3135		thread->result = err;
3136		return;
3137	}
3138
3139	if (intel_engine_supports_stats(engine)) {
3140		p->busy = intel_engine_get_busy_time(engine, &p->time);
3141		busy = true;
3142	} else {
3143		p->time = ktime_get();
3144		busy = false;
3145	}
3146
3147	count = 0;
3148	do {
3149		struct i915_request *rq;
3150
3151		rq = i915_request_create(ce);
3152		if (IS_ERR(rq)) {
3153			err = PTR_ERR(rq);
3154			break;
3155		}
3156
3157		i915_request_add(rq);
3158		count++;
3159	} while (!__igt_timeout(end_time, NULL));
3160
3161	if (busy) {
3162		ktime_t now;
3163
3164		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3165				    p->busy);
3166		p->time = ktime_sub(now, p->time);
3167	} else {
3168		p->time = ktime_sub(ktime_get(), p->time);
3169	}
3170
3171	err = switch_to_kernel_sync(ce, err);
3172	p->runtime = intel_context_get_total_runtime_ns(ce);
3173	p->count = count;
3174
3175	intel_context_unpin(ce);
3176	intel_context_put(ce);
3177	thread->result = err;
3178}
3179
3180static int perf_parallel_engines(void *arg)
3181{
3182	struct drm_i915_private *i915 = arg;
3183	static void (* const func[])(struct kthread_work *) = {
3184		p_sync0,
3185		p_sync1,
3186		p_many,
3187		NULL,
3188	};
3189	const unsigned int nengines = num_uabi_engines(i915);
3190	void (* const *fn)(struct kthread_work *);
3191	struct intel_engine_cs *engine;
3192	struct pm_qos_request qos;
3193	struct p_thread *engines;
3194	int err = 0;
3195
3196	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3197	if (!engines)
3198		return -ENOMEM;
3199
3200	cpu_latency_qos_add_request(&qos, 0);
3201
3202	for (fn = func; *fn; fn++) {
3203		char name[KSYM_NAME_LEN];
3204		struct igt_live_test t;
3205		unsigned int idx;
3206
3207		snprintf(name, sizeof(name), "%ps", *fn);
3208		err = igt_live_test_begin(&t, i915, __func__, name);
3209		if (err)
3210			break;
3211
3212		atomic_set(&i915->selftest.counter, nengines);
3213
3214		idx = 0;
3215		for_each_uabi_engine(engine, i915) {
3216			struct kthread_worker *worker;
3217
3218			intel_engine_pm_get(engine);
3219
3220			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3221
3222			worker = kthread_create_worker(0, "igt:%s",
3223						       engine->name);
3224			if (IS_ERR(worker)) {
3225				err = PTR_ERR(worker);
3226				intel_engine_pm_put(engine);
3227				break;
3228			}
3229			engines[idx].worker = worker;
3230			engines[idx].result = 0;
3231			engines[idx].p.engine = engine;
3232			engines[idx].engine = engine;
3233
3234			kthread_init_work(&engines[idx].work, *fn);
3235			kthread_queue_work(worker, &engines[idx].work);
3236			idx++;
3237		}
3238
3239		idx = 0;
3240		for_each_uabi_engine(engine, i915) {
3241			int status;
3242
3243			if (!engines[idx].worker)
3244				break;
3245
3246			kthread_flush_work(&engines[idx].work);
3247			status = READ_ONCE(engines[idx].result);
3248			if (status && !err)
3249				err = status;
3250
3251			intel_engine_pm_put(engine);
3252
3253			kthread_destroy_worker(engines[idx].worker);
3254			idx++;
3255		}
3256
3257		if (igt_live_test_end(&t))
3258			err = -EIO;
3259		if (err)
3260			break;
3261
3262		idx = 0;
3263		for_each_uabi_engine(engine, i915) {
3264			struct perf_stats *p = &engines[idx].p;
3265			u64 busy = 100 * ktime_to_ns(p->busy);
3266			u64 dt = ktime_to_ns(p->time);
3267			int integer, decimal;
3268
3269			if (dt) {
3270				integer = div64_u64(busy, dt);
3271				busy -= integer * dt;
3272				decimal = div64_u64(100 * busy, dt);
3273			} else {
3274				integer = 0;
3275				decimal = 0;
3276			}
3277
3278			GEM_BUG_ON(engine != p->engine);
3279			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3280				name, engine->name, p->count, integer, decimal,
3281				div_u64(p->runtime, 1000 * 1000),
3282				div_u64(ktime_to_ns(p->time), 1000 * 1000));
3283			idx++;
3284		}
3285	}
3286
3287	cpu_latency_qos_remove_request(&qos);
3288	kfree(engines);
3289	return err;
3290}
3291
3292int i915_request_perf_selftests(struct drm_i915_private *i915)
3293{
3294	static const struct i915_subtest tests[] = {
3295		SUBTEST(perf_request_latency),
3296		SUBTEST(perf_series_engines),
3297		SUBTEST(perf_parallel_engines),
3298	};
3299
3300	if (intel_gt_is_wedged(to_gt(i915)))
3301		return 0;
3302
3303	return i915_subtests(tests, i915);
3304}
3305