1// SPDX-License-Identifier: MIT
2/*
 * Copyright © 2021 Intel Corporation
4 */
5
6#include "xe_vm.h"
7
8#include <linux/dma-fence-array.h>
9#include <linux/nospec.h>
10
11#include <drm/drm_exec.h>
12#include <drm/drm_print.h>
13#include <drm/ttm/ttm_execbuf_util.h>
14#include <drm/ttm/ttm_tt.h>
15#include <drm/xe_drm.h>
16#include <linux/ascii85.h>
17#include <linux/delay.h>
18#include <linux/kthread.h>
19#include <linux/mm.h>
20#include <linux/swap.h>
21
22#include <generated/xe_wa_oob.h>
23
24#include "xe_assert.h"
25#include "xe_bo.h"
26#include "xe_device.h"
27#include "xe_drm_client.h"
28#include "xe_exec_queue.h"
29#include "xe_gt.h"
30#include "xe_gt_pagefault.h"
31#include "xe_gt_tlb_invalidation.h"
32#include "xe_migrate.h"
33#include "xe_pat.h"
34#include "xe_pm.h"
35#include "xe_preempt_fence.h"
36#include "xe_pt.h"
37#include "xe_res_cursor.h"
38#include "xe_sync.h"
39#include "xe_trace.h"
40#include "xe_wa.h"
41
42static struct drm_gem_object *xe_vm_obj(struct xe_vm *vm)
43{
44	return vm->gpuvm.r_obj;
45}
46
47/**
48 * xe_vma_userptr_check_repin() - Advisory check for repin needed
49 * @uvma: The userptr vma
50 *
51 * Check if the userptr vma has been invalidated since last successful
 * repin. The check is advisory only and the function can be called
53 * without the vm->userptr.notifier_lock held. There is no guarantee that the
54 * vma userptr will remain valid after a lockless check, so typically
55 * the call needs to be followed by a proper check under the notifier_lock.
56 *
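 * A minimal illustrative sketch (error handling elided); the authoritative
 * re-check is then done under the notifier_lock, for example via
 * __xe_vm_userptr_needs_repin():
 *
 *	if (xe_vma_userptr_check_repin(uvma) == -EAGAIN)
 *		err = xe_vma_userptr_pin_pages(uvma);
 *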
57 * Return: 0 if userptr vma is valid, -EAGAIN otherwise; repin recommended.
58 */
59int xe_vma_userptr_check_repin(struct xe_userptr_vma *uvma)
60{
61	return mmu_interval_check_retry(&uvma->userptr.notifier,
62					uvma->userptr.notifier_seq) ?
63		-EAGAIN : 0;
64}
65
66int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
67{
68	struct xe_userptr *userptr = &uvma->userptr;
69	struct xe_vma *vma = &uvma->vma;
70	struct xe_vm *vm = xe_vma_vm(vma);
71	struct xe_device *xe = vm->xe;
72	const unsigned long num_pages = xe_vma_size(vma) >> PAGE_SHIFT;
73	struct page **pages;
74	bool in_kthread = !current->mm;
75	unsigned long notifier_seq;
76	int pinned, ret, i;
77	bool read_only = xe_vma_read_only(vma);
78
79	lockdep_assert_held(&vm->lock);
80	xe_assert(xe, xe_vma_is_userptr(vma));
81retry:
82	if (vma->gpuva.flags & XE_VMA_DESTROYED)
83		return 0;
84
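	/*
	 * Sample the notifier sequence count; if it still matches the value
	 * recorded at the last successful pin, nothing was invalidated and
	 * there is nothing to repin.
	 */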
85	notifier_seq = mmu_interval_read_begin(&userptr->notifier);
86	if (notifier_seq == userptr->notifier_seq)
87		return 0;
88
89	pages = kvmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
90	if (!pages)
91		return -ENOMEM;
92
93	if (userptr->sg) {
94		dma_unmap_sgtable(xe->drm.dev,
95				  userptr->sg,
96				  read_only ? DMA_TO_DEVICE :
97				  DMA_BIDIRECTIONAL, 0);
98		sg_free_table(userptr->sg);
99		userptr->sg = NULL;
100	}
101
102	pinned = ret = 0;
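	/*
	 * Kernel workers have no current->mm, so take a reference on the mm
	 * backing the notifier and temporarily adopt it so that
	 * get_user_pages_fast() can resolve the user addresses below.
	 */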
103	if (in_kthread) {
104		if (!mmget_not_zero(userptr->notifier.mm)) {
105			ret = -EFAULT;
106			goto mm_closed;
107		}
108		kthread_use_mm(userptr->notifier.mm);
109	}
110
111	while (pinned < num_pages) {
112		ret = get_user_pages_fast(xe_vma_userptr(vma) +
113					  pinned * PAGE_SIZE,
114					  num_pages - pinned,
115					  read_only ? 0 : FOLL_WRITE,
116					  &pages[pinned]);
117		if (ret < 0)
118			break;
119
120		pinned += ret;
121		ret = 0;
122	}
123
124	if (in_kthread) {
125		kthread_unuse_mm(userptr->notifier.mm);
126		mmput(userptr->notifier.mm);
127	}
128mm_closed:
129	if (ret)
130		goto out;
131
132	ret = sg_alloc_table_from_pages_segment(&userptr->sgt, pages,
133						pinned, 0,
134						(u64)pinned << PAGE_SHIFT,
135						xe_sg_segment_size(xe->drm.dev),
136						GFP_KERNEL);
137	if (ret) {
138		userptr->sg = NULL;
139		goto out;
140	}
141	userptr->sg = &userptr->sgt;
142
143	ret = dma_map_sgtable(xe->drm.dev, userptr->sg,
144			      read_only ? DMA_TO_DEVICE :
145			      DMA_BIDIRECTIONAL,
146			      DMA_ATTR_SKIP_CPU_SYNC |
147			      DMA_ATTR_NO_KERNEL_MAPPING);
148	if (ret) {
149		sg_free_table(userptr->sg);
150		userptr->sg = NULL;
151		goto out;
152	}
153
154	for (i = 0; i < pinned; ++i) {
155		if (!read_only) {
156			lock_page(pages[i]);
157			set_page_dirty(pages[i]);
158			unlock_page(pages[i]);
159		}
160
161		mark_page_accessed(pages[i]);
162	}
163
164out:
165	release_pages(pages, pinned);
166	kvfree(pages);
167
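	/*
	 * If an invalidation raced with the pin above, the freshly pinned
	 * pages may already be stale: record the sequence number but loop
	 * and pin again rather than reporting success with stale mappings.
	 */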
168	if (!(ret < 0)) {
169		userptr->notifier_seq = notifier_seq;
170		if (xe_vma_userptr_check_repin(uvma) == -EAGAIN)
171			goto retry;
172	}
173
174	return ret < 0 ? ret : 0;
175}
176
177static bool preempt_fences_waiting(struct xe_vm *vm)
178{
179	struct xe_exec_queue *q;
180
181	lockdep_assert_held(&vm->lock);
182	xe_vm_assert_held(vm);
183
184	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
185		if (!q->compute.pfence ||
186		    (q->compute.pfence && test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
187						   &q->compute.pfence->flags))) {
188			return true;
189		}
190	}
191
192	return false;
193}
194
195static void free_preempt_fences(struct list_head *list)
196{
197	struct list_head *link, *next;
198
199	list_for_each_safe(link, next, list)
200		xe_preempt_fence_free(to_preempt_fence_from_link(link));
201}
202
203static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
204				unsigned int *count)
205{
206	lockdep_assert_held(&vm->lock);
207	xe_vm_assert_held(vm);
208
209	if (*count >= vm->preempt.num_exec_queues)
210		return 0;
211
212	for (; *count < vm->preempt.num_exec_queues; ++(*count)) {
213		struct xe_preempt_fence *pfence = xe_preempt_fence_alloc();
214
215		if (IS_ERR(pfence))
216			return PTR_ERR(pfence);
217
218		list_move_tail(xe_preempt_fence_link(pfence), list);
219	}
220
221	return 0;
222}
223
224static int wait_for_existing_preempt_fences(struct xe_vm *vm)
225{
226	struct xe_exec_queue *q;
227
228	xe_vm_assert_held(vm);
229
230	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
231		if (q->compute.pfence) {
232			long timeout = dma_fence_wait(q->compute.pfence, false);
233
234			if (timeout < 0)
235				return -ETIME;
236			dma_fence_put(q->compute.pfence);
237			q->compute.pfence = NULL;
238		}
239	}
240
241	return 0;
242}
243
244static bool xe_vm_is_idle(struct xe_vm *vm)
245{
246	struct xe_exec_queue *q;
247
248	xe_vm_assert_held(vm);
249	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
250		if (!xe_exec_queue_is_idle(q))
251			return false;
252	}
253
254	return true;
255}
256
257static void arm_preempt_fences(struct xe_vm *vm, struct list_head *list)
258{
259	struct list_head *link;
260	struct xe_exec_queue *q;
261
262	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
263		struct dma_fence *fence;
264
265		link = list->next;
266		xe_assert(vm->xe, link != list);
267
268		fence = xe_preempt_fence_arm(to_preempt_fence_from_link(link),
269					     q, q->compute.context,
270					     ++q->compute.seqno);
271		dma_fence_put(q->compute.pfence);
272		q->compute.pfence = fence;
273	}
274}
275
276static int add_preempt_fences(struct xe_vm *vm, struct xe_bo *bo)
277{
278	struct xe_exec_queue *q;
279	int err;
280
281	if (!vm->preempt.num_exec_queues)
282		return 0;
283
284	err = xe_bo_lock(bo, true);
285	if (err)
286		return err;
287
288	err = dma_resv_reserve_fences(bo->ttm.base.resv, vm->preempt.num_exec_queues);
289	if (err)
290		goto out_unlock;
291
292	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
293		if (q->compute.pfence) {
294			dma_resv_add_fence(bo->ttm.base.resv,
295					   q->compute.pfence,
296					   DMA_RESV_USAGE_BOOKKEEP);
297		}
298
299out_unlock:
300	xe_bo_unlock(bo);
301	return err;
302}
303
304static void resume_and_reinstall_preempt_fences(struct xe_vm *vm,
305						struct drm_exec *exec)
306{
307	struct xe_exec_queue *q;
308
309	lockdep_assert_held(&vm->lock);
310	xe_vm_assert_held(vm);
311
312	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link) {
313		q->ops->resume(q);
314
315		drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, q->compute.pfence,
316					 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
317	}
318}
319
320int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
321{
322	struct drm_gpuvm_exec vm_exec = {
323		.vm = &vm->gpuvm,
324		.flags = DRM_EXEC_INTERRUPTIBLE_WAIT,
325		.num_fences = 1,
326	};
327	struct drm_exec *exec = &vm_exec.exec;
328	struct dma_fence *pfence;
329	int err;
330	bool wait;
331
332	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
333
334	down_write(&vm->lock);
335	err = drm_gpuvm_exec_lock(&vm_exec);
336	if (err)
337		goto out_up_write;
338
339	pfence = xe_preempt_fence_create(q, q->compute.context,
340					 ++q->compute.seqno);
341	if (!pfence) {
342		err = -ENOMEM;
343		goto out_fini;
344	}
345
346	list_add(&q->compute.link, &vm->preempt.exec_queues);
347	++vm->preempt.num_exec_queues;
348	q->compute.pfence = pfence;
349
350	down_read(&vm->userptr.notifier_lock);
351
352	drm_gpuvm_resv_add_fence(&vm->gpuvm, exec, pfence,
353				 DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_BOOKKEEP);
354
355	/*
356	 * Check to see if a preemption on VM is in flight or userptr
357	 * invalidation, if so trigger this preempt fence to sync state with
358	 * other preempt fences on the VM.
359	 */
360	wait = __xe_vm_userptr_needs_repin(vm) || preempt_fences_waiting(vm);
361	if (wait)
362		dma_fence_enable_sw_signaling(pfence);
363
364	up_read(&vm->userptr.notifier_lock);
365
366out_fini:
367	drm_exec_fini(exec);
368out_up_write:
369	up_write(&vm->lock);
370
371	return err;
372}
373
374/**
375 * xe_vm_remove_compute_exec_queue() - Remove compute exec queue from VM
376 * @vm: The VM.
377 * @q: The exec_queue
378 */
379void xe_vm_remove_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
380{
381	if (!xe_vm_in_preempt_fence_mode(vm))
382		return;
383
384	down_write(&vm->lock);
385	list_del(&q->compute.link);
386	--vm->preempt.num_exec_queues;
387	if (q->compute.pfence) {
388		dma_fence_enable_sw_signaling(q->compute.pfence);
389		dma_fence_put(q->compute.pfence);
390		q->compute.pfence = NULL;
391	}
392	up_write(&vm->lock);
393}
394
395/**
396 * __xe_vm_userptr_needs_repin() - Check whether the VM does have userptrs
397 * that need repinning.
398 * @vm: The VM.
399 *
400 * This function checks for whether the VM has userptrs that need repinning,
401 * and provides a release-type barrier on the userptr.notifier_lock after
402 * checking.
403 *
404 * Return: 0 if there are no userptrs needing repinning, -EAGAIN if there are.
405 */
406int __xe_vm_userptr_needs_repin(struct xe_vm *vm)
407{
408	lockdep_assert_held_read(&vm->userptr.notifier_lock);
409
410	return (list_empty(&vm->userptr.repin_list) &&
411		list_empty(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
412}
413
414#define XE_VM_REBIND_RETRY_TIMEOUT_MS 1000
415
416static void xe_vm_kill(struct xe_vm *vm)
417{
418	struct xe_exec_queue *q;
419
420	lockdep_assert_held(&vm->lock);
421
422	xe_vm_lock(vm, false);
423	vm->flags |= XE_VM_FLAG_BANNED;
424	trace_xe_vm_kill(vm);
425
426	list_for_each_entry(q, &vm->preempt.exec_queues, compute.link)
427		q->ops->kill(q);
428	xe_vm_unlock(vm);
429
430	/* TODO: Inform user the VM is banned */
431}
432
433/**
434 * xe_vm_validate_should_retry() - Whether to retry after a validate error.
435 * @exec: The drm_exec object used for locking before validation.
436 * @err: The error returned from ttm_bo_validate().
437 * @end: A ktime_t cookie that should be set to 0 before first use and
438 * that should be reused on subsequent calls.
439 *
440 * With multiple active VMs, under memory pressure, it is possible that
 * ttm_bo_validate() runs into -EDEADLK and in such a case returns -ENOMEM.
442 * Until ttm properly handles locking in such scenarios, best thing the
443 * driver can do is retry with a timeout. Check if that is necessary, and
444 * if so unlock the drm_exec's objects while keeping the ticket to prepare
445 * for a rerun.
446 *
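 * Illustrative call pattern (sketch only; lock_and_validate() stands in for
 * the caller's own drm_exec_init(), locking and validation step):
 *
 *	ktime_t end = 0;
 *	int err;
 *
 *	do {
 *		err = lock_and_validate(&exec);
 *	} while (err && xe_vm_validate_should_retry(&exec, err, &end));
 *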
447 * Return: true if a retry after drm_exec_init() is recommended;
448 * false otherwise.
449 */
450bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
451{
452	ktime_t cur;
453
454	if (err != -ENOMEM)
455		return false;
456
457	cur = ktime_get();
458	*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
459	if (!ktime_before(cur, *end))
460		return false;
461
462	msleep(20);
463	return true;
464}
465
466static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
467{
468	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
469	struct drm_gpuva *gpuva;
470	int ret;
471
472	lockdep_assert_held(&vm->lock);
473	drm_gpuvm_bo_for_each_va(gpuva, vm_bo)
474		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
475			       &vm->rebind_list);
476
477	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
478	if (ret)
479		return ret;
480
481	vm_bo->evicted = false;
482	return 0;
483}
484
485/**
486 * xe_vm_validate_rebind() - Validate buffer objects and rebind vmas
487 * @vm: The vm for which we are rebinding.
488 * @exec: The struct drm_exec with the locked GEM objects.
489 * @num_fences: The number of fences to reserve for the operation, not
490 * including rebinds and validations.
491 *
492 * Validates all evicted gem objects and rebinds their vmas. Note that
493 * rebindings may cause evictions and hence the validation-rebind
494 * sequence is rerun until there are no more objects to validate.
495 *
496 * Return: 0 on success, negative error code on error. In particular,
497 * may return -EINTR or -ERESTARTSYS if interrupted, and -EDEADLK if
498 * the drm_exec transaction needs to be restarted.
499 */
500int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,
501			  unsigned int num_fences)
502{
503	struct drm_gem_object *obj;
504	unsigned long index;
505	int ret;
506
507	do {
508		ret = drm_gpuvm_validate(&vm->gpuvm, exec);
509		if (ret)
510			return ret;
511
512		ret = xe_vm_rebind(vm, false);
513		if (ret)
514			return ret;
515	} while (!list_empty(&vm->gpuvm.evict.list));
516
517	drm_exec_for_each_locked_object(exec, index, obj) {
518		ret = dma_resv_reserve_fences(obj->resv, num_fences);
519		if (ret)
520			return ret;
521	}
522
523	return 0;
524}
525
526static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
527				 bool *done)
528{
529	int err;
530
531	err = drm_gpuvm_prepare_vm(&vm->gpuvm, exec, 0);
532	if (err)
533		return err;
534
535	if (xe_vm_is_idle(vm)) {
536		vm->preempt.rebind_deactivated = true;
537		*done = true;
538		return 0;
539	}
540
541	if (!preempt_fences_waiting(vm)) {
542		*done = true;
543		return 0;
544	}
545
546	err = drm_gpuvm_prepare_objects(&vm->gpuvm, exec, 0);
547	if (err)
548		return err;
549
550	err = wait_for_existing_preempt_fences(vm);
551	if (err)
552		return err;
553
554	/*
555	 * Add validation and rebinding to the locking loop since both can
	 * cause evictions which may require blocking dma_resv locks.
557	 * The fence reservation here is intended for the new preempt fences
558	 * we attach at the end of the rebind work.
559	 */
560	return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
561}
562
563static void preempt_rebind_work_func(struct work_struct *w)
564{
565	struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
566	struct drm_exec exec;
567	unsigned int fence_count = 0;
568	LIST_HEAD(preempt_fences);
569	ktime_t end = 0;
570	int err = 0;
571	long wait;
572	int __maybe_unused tries = 0;
573
574	xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
575	trace_xe_vm_rebind_worker_enter(vm);
576
577	down_write(&vm->lock);
578
579	if (xe_vm_is_closed_or_banned(vm)) {
580		up_write(&vm->lock);
581		trace_xe_vm_rebind_worker_exit(vm);
582		return;
583	}
584
585retry:
586	if (xe_vm_userptr_check_repin(vm)) {
587		err = xe_vm_userptr_pin(vm);
588		if (err)
589			goto out_unlock_outer;
590	}
591
592	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
593
594	drm_exec_until_all_locked(&exec) {
595		bool done = false;
596
597		err = xe_preempt_work_begin(&exec, vm, &done);
598		drm_exec_retry_on_contention(&exec);
599		if (err || done) {
600			drm_exec_fini(&exec);
601			if (err && xe_vm_validate_should_retry(&exec, err, &end))
602				err = -EAGAIN;
603
604			goto out_unlock_outer;
605		}
606	}
607
608	err = alloc_preempt_fences(vm, &preempt_fences, &fence_count);
609	if (err)
610		goto out_unlock;
611
612	err = xe_vm_rebind(vm, true);
613	if (err)
614		goto out_unlock;
615
616	/* Wait on rebinds and munmap style VM unbinds */
617	wait = dma_resv_wait_timeout(xe_vm_resv(vm),
618				     DMA_RESV_USAGE_KERNEL,
619				     false, MAX_SCHEDULE_TIMEOUT);
620	if (wait <= 0) {
621		err = -ETIME;
622		goto out_unlock;
623	}
624
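/*
 * With CONFIG_DRM_XE_USERPTR_INVAL_INJECT, force at least one retry so the
 * userptr invalidation path gets exercised; otherwise simply re-check whether
 * any userptr was invalidated while we were rebinding.
 */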
625#define retry_required(__tries, __vm) \
626	(IS_ENABLED(CONFIG_DRM_XE_USERPTR_INVAL_INJECT) ? \
627	(!(__tries)++ || __xe_vm_userptr_needs_repin(__vm)) : \
628	__xe_vm_userptr_needs_repin(__vm))
629
630	down_read(&vm->userptr.notifier_lock);
631	if (retry_required(tries, vm)) {
632		up_read(&vm->userptr.notifier_lock);
633		err = -EAGAIN;
634		goto out_unlock;
635	}
636
637#undef retry_required
638
639	spin_lock(&vm->xe->ttm.lru_lock);
640	ttm_lru_bulk_move_tail(&vm->lru_bulk_move);
641	spin_unlock(&vm->xe->ttm.lru_lock);
642
643	/* Point of no return. */
644	arm_preempt_fences(vm, &preempt_fences);
645	resume_and_reinstall_preempt_fences(vm, &exec);
646	up_read(&vm->userptr.notifier_lock);
647
648out_unlock:
649	drm_exec_fini(&exec);
650out_unlock_outer:
651	if (err == -EAGAIN) {
652		trace_xe_vm_rebind_worker_retry(vm);
653		goto retry;
654	}
655
656	if (err) {
657		drm_warn(&vm->xe->drm, "VM worker error: %d\n", err);
658		xe_vm_kill(vm);
659	}
660	up_write(&vm->lock);
661
662	free_preempt_fences(&preempt_fences);
663
664	trace_xe_vm_rebind_worker_exit(vm);
665}
666
667static bool vma_userptr_invalidate(struct mmu_interval_notifier *mni,
668				   const struct mmu_notifier_range *range,
669				   unsigned long cur_seq)
670{
671	struct xe_userptr *userptr = container_of(mni, typeof(*userptr), notifier);
672	struct xe_userptr_vma *uvma = container_of(userptr, typeof(*uvma), userptr);
673	struct xe_vma *vma = &uvma->vma;
674	struct xe_vm *vm = xe_vma_vm(vma);
675	struct dma_resv_iter cursor;
676	struct dma_fence *fence;
677	long err;
678
679	xe_assert(vm->xe, xe_vma_is_userptr(vma));
680	trace_xe_vma_userptr_invalidate(vma);
681
682	if (!mmu_notifier_range_blockable(range))
683		return false;
684
685	down_write(&vm->userptr.notifier_lock);
686	mmu_interval_set_seq(mni, cur_seq);
687
688	/* No need to stop gpu access if the userptr is not yet bound. */
689	if (!userptr->initial_bind) {
690		up_write(&vm->userptr.notifier_lock);
691		return true;
692	}
693
694	/*
695	 * Tell exec and rebind worker they need to repin and rebind this
696	 * userptr.
697	 */
698	if (!xe_vm_in_fault_mode(vm) &&
699	    !(vma->gpuva.flags & XE_VMA_DESTROYED) && vma->tile_present) {
700		spin_lock(&vm->userptr.invalidated_lock);
701		list_move_tail(&userptr->invalidate_link,
702			       &vm->userptr.invalidated);
703		spin_unlock(&vm->userptr.invalidated_lock);
704	}
705
706	up_write(&vm->userptr.notifier_lock);
707
708	/*
709	 * Preempt fences turn into schedule disables, pipeline these.
710	 * Note that even in fault mode, we need to wait for binds and
	 * unbinds to complete, and those are attached as BOOKKEEP fences
712	 * to the vm.
713	 */
714	dma_resv_iter_begin(&cursor, xe_vm_resv(vm),
715			    DMA_RESV_USAGE_BOOKKEEP);
716	dma_resv_for_each_fence_unlocked(&cursor, fence)
717		dma_fence_enable_sw_signaling(fence);
718	dma_resv_iter_end(&cursor);
719
720	err = dma_resv_wait_timeout(xe_vm_resv(vm),
721				    DMA_RESV_USAGE_BOOKKEEP,
722				    false, MAX_SCHEDULE_TIMEOUT);
723	XE_WARN_ON(err <= 0);
724
725	if (xe_vm_in_fault_mode(vm)) {
726		err = xe_vm_invalidate_vma(vma);
727		XE_WARN_ON(err);
728	}
729
730	trace_xe_vma_userptr_invalidate_complete(vma);
731
732	return true;
733}
734
735static const struct mmu_interval_notifier_ops vma_userptr_notifier_ops = {
736	.invalidate = vma_userptr_invalidate,
737};
738
739int xe_vm_userptr_pin(struct xe_vm *vm)
740{
741	struct xe_userptr_vma *uvma, *next;
742	int err = 0;
743	LIST_HEAD(tmp_evict);
744
745	xe_assert(vm->xe, !xe_vm_in_fault_mode(vm));
746	lockdep_assert_held_write(&vm->lock);
747
748	/* Collect invalidated userptrs */
749	spin_lock(&vm->userptr.invalidated_lock);
750	list_for_each_entry_safe(uvma, next, &vm->userptr.invalidated,
751				 userptr.invalidate_link) {
752		list_del_init(&uvma->userptr.invalidate_link);
753		list_move_tail(&uvma->userptr.repin_link,
754			       &vm->userptr.repin_list);
755	}
756	spin_unlock(&vm->userptr.invalidated_lock);
757
758	/* Pin and move to temporary list */
759	list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list,
760				 userptr.repin_link) {
761		err = xe_vma_userptr_pin_pages(uvma);
762		if (err == -EFAULT) {
763			list_del_init(&uvma->userptr.repin_link);
764
765			/* Wait for pending binds */
766			xe_vm_lock(vm, false);
767			dma_resv_wait_timeout(xe_vm_resv(vm),
768					      DMA_RESV_USAGE_BOOKKEEP,
769					      false, MAX_SCHEDULE_TIMEOUT);
770
771			err = xe_vm_invalidate_vma(&uvma->vma);
772			xe_vm_unlock(vm);
773			if (err)
774				return err;
775		} else {
776			if (err < 0)
777				return err;
778
779			list_del_init(&uvma->userptr.repin_link);
780			list_move_tail(&uvma->vma.combined_links.rebind,
781				       &vm->rebind_list);
782		}
783	}
784
785	return 0;
786}
787
788/**
789 * xe_vm_userptr_check_repin() - Check whether the VM might have userptrs
790 * that need repinning.
791 * @vm: The VM.
792 *
793 * This function does an advisory check for whether the VM has userptrs that
794 * need repinning.
795 *
796 * Return: 0 if there are no indications of userptrs needing repinning,
797 * -EAGAIN if there are.
798 */
799int xe_vm_userptr_check_repin(struct xe_vm *vm)
800{
801	return (list_empty_careful(&vm->userptr.repin_list) &&
802		list_empty_careful(&vm->userptr.invalidated)) ? 0 : -EAGAIN;
803}
804
805static struct dma_fence *
806xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
807	       struct xe_sync_entry *syncs, u32 num_syncs,
808	       bool first_op, bool last_op);
809
810int xe_vm_rebind(struct xe_vm *vm, bool rebind_worker)
811{
812	struct dma_fence *fence;
813	struct xe_vma *vma, *next;
814
815	lockdep_assert_held(&vm->lock);
816	if (xe_vm_in_lr_mode(vm) && !rebind_worker)
817		return 0;
818
819	xe_vm_assert_held(vm);
820	list_for_each_entry_safe(vma, next, &vm->rebind_list,
821				 combined_links.rebind) {
822		xe_assert(vm->xe, vma->tile_present);
823
824		list_del_init(&vma->combined_links.rebind);
825		if (rebind_worker)
826			trace_xe_vma_rebind_worker(vma);
827		else
828			trace_xe_vma_rebind_exec(vma);
829		fence = xe_vm_bind_vma(vma, NULL, NULL, 0, false, false);
830		if (IS_ERR(fence))
831			return PTR_ERR(fence);
832		dma_fence_put(fence);
833	}
834
835	return 0;
836}
837
838static void xe_vma_free(struct xe_vma *vma)
839{
840	if (xe_vma_is_userptr(vma))
841		kfree(to_userptr_vma(vma));
842	else
843		kfree(vma);
844}
845
846#define VMA_CREATE_FLAG_READ_ONLY	BIT(0)
847#define VMA_CREATE_FLAG_IS_NULL		BIT(1)
848#define VMA_CREATE_FLAG_DUMPABLE	BIT(2)
849
850static struct xe_vma *xe_vma_create(struct xe_vm *vm,
851				    struct xe_bo *bo,
852				    u64 bo_offset_or_userptr,
853				    u64 start, u64 end,
854				    u16 pat_index, unsigned int flags)
855{
856	struct xe_vma *vma;
857	struct xe_tile *tile;
858	u8 id;
859	bool read_only = (flags & VMA_CREATE_FLAG_READ_ONLY);
860	bool is_null = (flags & VMA_CREATE_FLAG_IS_NULL);
861	bool dumpable = (flags & VMA_CREATE_FLAG_DUMPABLE);
862
863	xe_assert(vm->xe, start < end);
864	xe_assert(vm->xe, end < vm->size);
865
866	/*
867	 * Allocate and ensure that the xe_vma_is_userptr() return
868	 * matches what was allocated.
869	 */
870	if (!bo && !is_null) {
871		struct xe_userptr_vma *uvma = kzalloc(sizeof(*uvma), GFP_KERNEL);
872
873		if (!uvma)
874			return ERR_PTR(-ENOMEM);
875
876		vma = &uvma->vma;
877	} else {
878		vma = kzalloc(sizeof(*vma), GFP_KERNEL);
879		if (!vma)
880			return ERR_PTR(-ENOMEM);
881
882		if (is_null)
883			vma->gpuva.flags |= DRM_GPUVA_SPARSE;
884		if (bo)
885			vma->gpuva.gem.obj = &bo->ttm.base;
886	}
887
888	INIT_LIST_HEAD(&vma->combined_links.rebind);
889
890	INIT_LIST_HEAD(&vma->gpuva.gem.entry);
891	vma->gpuva.vm = &vm->gpuvm;
892	vma->gpuva.va.addr = start;
893	vma->gpuva.va.range = end - start + 1;
894	if (read_only)
895		vma->gpuva.flags |= XE_VMA_READ_ONLY;
896	if (dumpable)
897		vma->gpuva.flags |= XE_VMA_DUMPABLE;
898
899	for_each_tile(tile, vm->xe, id)
900		vma->tile_mask |= 0x1 << id;
901
902	if (GRAPHICS_VER(vm->xe) >= 20 || vm->xe->info.platform == XE_PVC)
903		vma->gpuva.flags |= XE_VMA_ATOMIC_PTE_BIT;
904
905	vma->pat_index = pat_index;
906
907	if (bo) {
908		struct drm_gpuvm_bo *vm_bo;
909
910		xe_bo_assert_held(bo);
911
912		vm_bo = drm_gpuvm_bo_obtain(vma->gpuva.vm, &bo->ttm.base);
913		if (IS_ERR(vm_bo)) {
914			xe_vma_free(vma);
915			return ERR_CAST(vm_bo);
916		}
917
918		drm_gpuvm_bo_extobj_add(vm_bo);
919		drm_gem_object_get(&bo->ttm.base);
920		vma->gpuva.gem.offset = bo_offset_or_userptr;
921		drm_gpuva_link(&vma->gpuva, vm_bo);
922		drm_gpuvm_bo_put(vm_bo);
923	} else /* userptr or null */ {
924		if (!is_null) {
925			struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
926			u64 size = end - start + 1;
927			int err;
928
929			INIT_LIST_HEAD(&userptr->invalidate_link);
930			INIT_LIST_HEAD(&userptr->repin_link);
931			vma->gpuva.gem.offset = bo_offset_or_userptr;
932
933			err = mmu_interval_notifier_insert(&userptr->notifier,
934							   current->mm,
935							   xe_vma_userptr(vma), size,
936							   &vma_userptr_notifier_ops);
937			if (err) {
938				xe_vma_free(vma);
939				return ERR_PTR(err);
940			}
941
942			userptr->notifier_seq = LONG_MAX;
943		}
944
945		xe_vm_get(vm);
946	}
947
948	return vma;
949}
950
951static void xe_vma_destroy_late(struct xe_vma *vma)
952{
953	struct xe_vm *vm = xe_vma_vm(vma);
954	struct xe_device *xe = vm->xe;
955	bool read_only = xe_vma_read_only(vma);
956
957	if (vma->ufence) {
958		xe_sync_ufence_put(vma->ufence);
959		vma->ufence = NULL;
960	}
961
962	if (xe_vma_is_userptr(vma)) {
963		struct xe_userptr *userptr = &to_userptr_vma(vma)->userptr;
964
965		if (userptr->sg) {
966			dma_unmap_sgtable(xe->drm.dev,
967					  userptr->sg,
968					  read_only ? DMA_TO_DEVICE :
969					  DMA_BIDIRECTIONAL, 0);
970			sg_free_table(userptr->sg);
971			userptr->sg = NULL;
972		}
973
974		/*
975		 * Since userptr pages are not pinned, we can't remove
		 * the notifier until we're sure the GPU is not accessing
977		 * them anymore
978		 */
979		mmu_interval_notifier_remove(&userptr->notifier);
980		xe_vm_put(vm);
981	} else if (xe_vma_is_null(vma)) {
982		xe_vm_put(vm);
983	} else {
984		xe_bo_put(xe_vma_bo(vma));
985	}
986
987	xe_vma_free(vma);
988}
989
990static void vma_destroy_work_func(struct work_struct *w)
991{
992	struct xe_vma *vma =
993		container_of(w, struct xe_vma, destroy_work);
994
995	xe_vma_destroy_late(vma);
996}
997
998static void vma_destroy_cb(struct dma_fence *fence,
999			   struct dma_fence_cb *cb)
1000{
1001	struct xe_vma *vma = container_of(cb, struct xe_vma, destroy_cb);
1002
1003	INIT_WORK(&vma->destroy_work, vma_destroy_work_func);
1004	queue_work(system_unbound_wq, &vma->destroy_work);
1005}
1006
1007static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence)
1008{
1009	struct xe_vm *vm = xe_vma_vm(vma);
1010
1011	lockdep_assert_held_write(&vm->lock);
1012	xe_assert(vm->xe, list_empty(&vma->combined_links.destroy));
1013
1014	if (xe_vma_is_userptr(vma)) {
1015		xe_assert(vm->xe, vma->gpuva.flags & XE_VMA_DESTROYED);
1016
1017		spin_lock(&vm->userptr.invalidated_lock);
1018		list_del(&to_userptr_vma(vma)->userptr.invalidate_link);
1019		spin_unlock(&vm->userptr.invalidated_lock);
1020	} else if (!xe_vma_is_null(vma)) {
1021		xe_bo_assert_held(xe_vma_bo(vma));
1022
1023		drm_gpuva_unlink(&vma->gpuva);
1024	}
1025
1026	xe_vm_assert_held(vm);
1027	if (fence) {
1028		int ret = dma_fence_add_callback(fence, &vma->destroy_cb,
1029						 vma_destroy_cb);
1030
1031		if (ret) {
1032			XE_WARN_ON(ret != -ENOENT);
1033			xe_vma_destroy_late(vma);
1034		}
1035	} else {
1036		xe_vma_destroy_late(vma);
1037	}
1038}
1039
1040/**
1041 * xe_vm_lock_vma() - drm_exec utility to lock a vma
1042 * @exec: The drm_exec object we're currently locking for.
 * @vma: The vma for which we want to lock the vm resv and any attached
1044 * object's resv.
1045 *
1046 * Return: 0 on success, negative error code on error. In particular
1047 * may return -EDEADLK on WW transaction contention and -EINTR if
1048 * an interruptible wait is terminated by a signal.
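 *
 * Intended to be called from within a drm_exec_until_all_locked() loop, as in
 * this sketch (mirroring xe_vma_destroy_unlocked() below):
 *
 *	drm_exec_until_all_locked(&exec) {
 *		err = xe_vm_lock_vma(&exec, vma);
 *		drm_exec_retry_on_contention(&exec);
 *	}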
1049 */
1050int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
1051{
1052	struct xe_vm *vm = xe_vma_vm(vma);
1053	struct xe_bo *bo = xe_vma_bo(vma);
1054	int err;
1055
1056	XE_WARN_ON(!vm);
1057
1058	err = drm_exec_lock_obj(exec, xe_vm_obj(vm));
1059	if (!err && bo && !bo->vm)
1060		err = drm_exec_lock_obj(exec, &bo->ttm.base);
1061
1062	return err;
1063}
1064
1065static void xe_vma_destroy_unlocked(struct xe_vma *vma)
1066{
1067	struct drm_exec exec;
1068	int err;
1069
1070	drm_exec_init(&exec, 0, 0);
1071	drm_exec_until_all_locked(&exec) {
1072		err = xe_vm_lock_vma(&exec, vma);
1073		drm_exec_retry_on_contention(&exec);
1074		if (XE_WARN_ON(err))
1075			break;
1076	}
1077
1078	xe_vma_destroy(vma, NULL);
1079
1080	drm_exec_fini(&exec);
1081}
1082
1083struct xe_vma *
1084xe_vm_find_overlapping_vma(struct xe_vm *vm, u64 start, u64 range)
1085{
1086	struct drm_gpuva *gpuva;
1087
1088	lockdep_assert_held(&vm->lock);
1089
1090	if (xe_vm_is_closed_or_banned(vm))
1091		return NULL;
1092
1093	xe_assert(vm->xe, start + range <= vm->size);
1094
1095	gpuva = drm_gpuva_find_first(&vm->gpuvm, start, range);
1096
1097	return gpuva ? gpuva_to_vma(gpuva) : NULL;
1098}
1099
1100static int xe_vm_insert_vma(struct xe_vm *vm, struct xe_vma *vma)
1101{
1102	int err;
1103
1104	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1105	lockdep_assert_held(&vm->lock);
1106
1107	mutex_lock(&vm->snap_mutex);
1108	err = drm_gpuva_insert(&vm->gpuvm, &vma->gpuva);
1109	mutex_unlock(&vm->snap_mutex);
1110	XE_WARN_ON(err);	/* Shouldn't be possible */
1111
1112	return err;
1113}
1114
1115static void xe_vm_remove_vma(struct xe_vm *vm, struct xe_vma *vma)
1116{
1117	xe_assert(vm->xe, xe_vma_vm(vma) == vm);
1118	lockdep_assert_held(&vm->lock);
1119
1120	mutex_lock(&vm->snap_mutex);
1121	drm_gpuva_remove(&vma->gpuva);
1122	mutex_unlock(&vm->snap_mutex);
1123	if (vm->usm.last_fault_vma == vma)
1124		vm->usm.last_fault_vma = NULL;
1125}
1126
1127static struct drm_gpuva_op *xe_vm_op_alloc(void)
1128{
1129	struct xe_vma_op *op;
1130
1131	op = kzalloc(sizeof(*op), GFP_KERNEL);
1132
1133	if (unlikely(!op))
1134		return NULL;
1135
1136	return &op->base;
1137}
1138
1139static void xe_vm_free(struct drm_gpuvm *gpuvm);
1140
1141static const struct drm_gpuvm_ops gpuvm_ops = {
1142	.op_alloc = xe_vm_op_alloc,
1143	.vm_bo_validate = xe_gpuvm_validate,
1144	.vm_free = xe_vm_free,
1145};
1146
1147static u64 pde_encode_pat_index(struct xe_device *xe, u16 pat_index)
1148{
1149	u64 pte = 0;
1150
1151	if (pat_index & BIT(0))
1152		pte |= XE_PPGTT_PTE_PAT0;
1153
1154	if (pat_index & BIT(1))
1155		pte |= XE_PPGTT_PTE_PAT1;
1156
1157	return pte;
1158}
1159
1160static u64 pte_encode_pat_index(struct xe_device *xe, u16 pat_index,
1161				u32 pt_level)
1162{
1163	u64 pte = 0;
1164
1165	if (pat_index & BIT(0))
1166		pte |= XE_PPGTT_PTE_PAT0;
1167
1168	if (pat_index & BIT(1))
1169		pte |= XE_PPGTT_PTE_PAT1;
1170
1171	if (pat_index & BIT(2)) {
1172		if (pt_level)
1173			pte |= XE_PPGTT_PDE_PDPE_PAT2;
1174		else
1175			pte |= XE_PPGTT_PTE_PAT2;
1176	}
1177
1178	if (pat_index & BIT(3))
1179		pte |= XELPG_PPGTT_PTE_PAT3;
1180
1181	if (pat_index & (BIT(4)))
1182		pte |= XE2_PPGTT_PTE_PAT4;
1183
1184	return pte;
1185}
1186
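/*
 * Select the huge-page size bit for an entry: 2M at level 1, 1G at level 2.
 * Level-0 (4K) entries need no size bit.
 */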
1187static u64 pte_encode_ps(u32 pt_level)
1188{
1189	XE_WARN_ON(pt_level > MAX_HUGEPTE_LEVEL);
1190
1191	if (pt_level == 1)
1192		return XE_PDE_PS_2M;
1193	else if (pt_level == 2)
1194		return XE_PDPE_PS_1G;
1195
1196	return 0;
1197}
1198
1199static u64 xelp_pde_encode_bo(struct xe_bo *bo, u64 bo_offset,
1200			      const u16 pat_index)
1201{
1202	struct xe_device *xe = xe_bo_device(bo);
1203	u64 pde;
1204
1205	pde = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1206	pde |= XE_PAGE_PRESENT | XE_PAGE_RW;
1207	pde |= pde_encode_pat_index(xe, pat_index);
1208
1209	return pde;
1210}
1211
1212static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
1213			      u16 pat_index, u32 pt_level)
1214{
1215	struct xe_device *xe = xe_bo_device(bo);
1216	u64 pte;
1217
1218	pte = xe_bo_addr(bo, bo_offset, XE_PAGE_SIZE);
1219	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1220	pte |= pte_encode_pat_index(xe, pat_index, pt_level);
1221	pte |= pte_encode_ps(pt_level);
1222
1223	if (xe_bo_is_vram(bo) || xe_bo_is_stolen_devmem(bo))
1224		pte |= XE_PPGTT_PTE_DM;
1225
1226	return pte;
1227}
1228
1229static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
1230			       u16 pat_index, u32 pt_level)
1231{
1232	struct xe_device *xe = xe_vma_vm(vma)->xe;
1233
1234	pte |= XE_PAGE_PRESENT;
1235
1236	if (likely(!xe_vma_read_only(vma)))
1237		pte |= XE_PAGE_RW;
1238
1239	pte |= pte_encode_pat_index(xe, pat_index, pt_level);
1240	pte |= pte_encode_ps(pt_level);
1241
1242	if (unlikely(xe_vma_is_null(vma)))
1243		pte |= XE_PTE_NULL;
1244
1245	return pte;
1246}
1247
1248static u64 xelp_pte_encode_addr(struct xe_device *xe, u64 addr,
1249				u16 pat_index,
1250				u32 pt_level, bool devmem, u64 flags)
1251{
1252	u64 pte;
1253
1254	/* Avoid passing random bits directly as flags */
1255	xe_assert(xe, !(flags & ~XE_PTE_PS64));
1256
1257	pte = addr;
1258	pte |= XE_PAGE_PRESENT | XE_PAGE_RW;
1259	pte |= pte_encode_pat_index(xe, pat_index, pt_level);
1260	pte |= pte_encode_ps(pt_level);
1261
1262	if (devmem)
1263		pte |= XE_PPGTT_PTE_DM;
1264
1265	pte |= flags;
1266
1267	return pte;
1268}
1269
1270static const struct xe_pt_ops xelp_pt_ops = {
1271	.pte_encode_bo = xelp_pte_encode_bo,
1272	.pte_encode_vma = xelp_pte_encode_vma,
1273	.pte_encode_addr = xelp_pte_encode_addr,
1274	.pde_encode_bo = xelp_pde_encode_bo,
1275};
1276
1277static void vm_destroy_work_func(struct work_struct *w);
1278
1279/**
1280 * xe_vm_create_scratch() - Setup a scratch memory pagetable tree for the
1281 * given tile and vm.
1282 * @xe: xe device.
1283 * @tile: tile to set up for.
1284 * @vm: vm to set up for.
1285 *
1286 * Sets up a pagetable tree with one page-table per level and a single
1287 * leaf PTE. All pagetable entries point to the single page-table or,
 * for MAX_HUGEPTE_LEVEL, a NULL huge PTE that returns 0 on reads and
 * turns writes into no-ops.
1290 *
1291 * Return: 0 on success, negative error code on error.
1292 */
1293static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
1294				struct xe_vm *vm)
1295{
1296	u8 id = tile->id;
1297	int i;
1298
1299	for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; i++) {
1300		vm->scratch_pt[id][i] = xe_pt_create(vm, tile, i);
1301		if (IS_ERR(vm->scratch_pt[id][i]))
1302			return PTR_ERR(vm->scratch_pt[id][i]);
1303
1304		xe_pt_populate_empty(tile, vm, vm->scratch_pt[id][i]);
1305	}
1306
1307	return 0;
1308}
1309
1310static void xe_vm_free_scratch(struct xe_vm *vm)
1311{
1312	struct xe_tile *tile;
1313	u8 id;
1314
1315	if (!xe_vm_has_scratch(vm))
1316		return;
1317
1318	for_each_tile(tile, vm->xe, id) {
1319		u32 i;
1320
1321		if (!vm->pt_root[id])
1322			continue;
1323
1324		for (i = MAX_HUGEPTE_LEVEL; i < vm->pt_root[id]->level; ++i)
1325			if (vm->scratch_pt[id][i])
1326				xe_pt_destroy(vm->scratch_pt[id][i], vm->flags, NULL);
1327	}
1328}
1329
1330struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags)
1331{
1332	struct drm_gem_object *vm_resv_obj;
1333	struct xe_vm *vm;
1334	int err, number_tiles = 0;
1335	struct xe_tile *tile;
1336	u8 id;
1337
1338	vm = kzalloc(sizeof(*vm), GFP_KERNEL);
1339	if (!vm)
1340		return ERR_PTR(-ENOMEM);
1341
1342	vm->xe = xe;
1343
1344	vm->size = 1ull << xe->info.va_bits;
1345
1346	vm->flags = flags;
1347
1348	init_rwsem(&vm->lock);
1349	mutex_init(&vm->snap_mutex);
1350
1351	INIT_LIST_HEAD(&vm->rebind_list);
1352
1353	INIT_LIST_HEAD(&vm->userptr.repin_list);
1354	INIT_LIST_HEAD(&vm->userptr.invalidated);
1355	init_rwsem(&vm->userptr.notifier_lock);
1356	spin_lock_init(&vm->userptr.invalidated_lock);
1357
1358	INIT_WORK(&vm->destroy_work, vm_destroy_work_func);
1359
1360	INIT_LIST_HEAD(&vm->preempt.exec_queues);
1361	vm->preempt.min_run_period_ms = 10;	/* FIXME: Wire up to uAPI */
1362
1363	for_each_tile(tile, xe, id)
1364		xe_range_fence_tree_init(&vm->rftree[id]);
1365
1366	vm->pt_ops = &xelp_pt_ops;
1367
1368	if (!(flags & XE_VM_FLAG_MIGRATION))
1369		xe_device_mem_access_get(xe);
1370
1371	vm_resv_obj = drm_gpuvm_resv_object_alloc(&xe->drm);
1372	if (!vm_resv_obj) {
1373		err = -ENOMEM;
1374		goto err_no_resv;
1375	}
1376
1377	drm_gpuvm_init(&vm->gpuvm, "Xe VM", DRM_GPUVM_RESV_PROTECTED, &xe->drm,
1378		       vm_resv_obj, 0, vm->size, 0, 0, &gpuvm_ops);
1379
1380	drm_gem_object_put(vm_resv_obj);
1381
1382	err = dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
1383	if (err)
1384		goto err_close;
1385
1386	if (IS_DGFX(xe) && xe->info.vram_flags & XE_VRAM_FLAGS_NEED64K)
1387		vm->flags |= XE_VM_FLAG_64K;
1388
1389	for_each_tile(tile, xe, id) {
1390		if (flags & XE_VM_FLAG_MIGRATION &&
1391		    tile->id != XE_VM_FLAG_TILE_ID(flags))
1392			continue;
1393
1394		vm->pt_root[id] = xe_pt_create(vm, tile, xe->info.vm_max_level);
1395		if (IS_ERR(vm->pt_root[id])) {
1396			err = PTR_ERR(vm->pt_root[id]);
1397			vm->pt_root[id] = NULL;
1398			goto err_unlock_close;
1399		}
1400	}
1401
1402	if (xe_vm_has_scratch(vm)) {
1403		for_each_tile(tile, xe, id) {
1404			if (!vm->pt_root[id])
1405				continue;
1406
1407			err = xe_vm_create_scratch(xe, tile, vm);
1408			if (err)
1409				goto err_unlock_close;
1410		}
1411		vm->batch_invalidate_tlb = true;
1412	}
1413
1414	if (flags & XE_VM_FLAG_LR_MODE) {
1415		INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
1416		vm->flags |= XE_VM_FLAG_LR_MODE;
1417		vm->batch_invalidate_tlb = false;
1418	}
1419
1420	/* Fill pt_root after allocating scratch tables */
1421	for_each_tile(tile, xe, id) {
1422		if (!vm->pt_root[id])
1423			continue;
1424
1425		xe_pt_populate_empty(tile, vm, vm->pt_root[id]);
1426	}
1427	dma_resv_unlock(xe_vm_resv(vm));
1428
1429	/* Kernel migration VM shouldn't have a circular loop.. */
1430	if (!(flags & XE_VM_FLAG_MIGRATION)) {
1431		for_each_tile(tile, xe, id) {
1432			struct xe_gt *gt = tile->primary_gt;
1433			struct xe_vm *migrate_vm;
1434			struct xe_exec_queue *q;
1435			u32 create_flags = EXEC_QUEUE_FLAG_VM;
1436
1437			if (!vm->pt_root[id])
1438				continue;
1439
1440			migrate_vm = xe_migrate_get_vm(tile->migrate);
1441			q = xe_exec_queue_create_class(xe, gt, migrate_vm,
1442						       XE_ENGINE_CLASS_COPY,
1443						       create_flags);
1444			xe_vm_put(migrate_vm);
1445			if (IS_ERR(q)) {
1446				err = PTR_ERR(q);
1447				goto err_close;
1448			}
1449			vm->q[id] = q;
1450			number_tiles++;
1451		}
1452	}
1453
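	/*
	 * Bind/unbind fences from multiple tiles are combined into a single
	 * dma_fence_array, which needs its own fence context and seqno space.
	 */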
1454	if (number_tiles > 1)
1455		vm->composite_fence_ctx = dma_fence_context_alloc(1);
1456
1457	mutex_lock(&xe->usm.lock);
1458	if (flags & XE_VM_FLAG_FAULT_MODE)
1459		xe->usm.num_vm_in_fault_mode++;
1460	else if (!(flags & XE_VM_FLAG_MIGRATION))
1461		xe->usm.num_vm_in_non_fault_mode++;
1462	mutex_unlock(&xe->usm.lock);
1463
1464	trace_xe_vm_create(vm);
1465
1466	return vm;
1467
1468err_unlock_close:
1469	dma_resv_unlock(xe_vm_resv(vm));
1470err_close:
1471	xe_vm_close_and_put(vm);
1472	return ERR_PTR(err);
1473
1474err_no_resv:
1475	mutex_destroy(&vm->snap_mutex);
1476	for_each_tile(tile, xe, id)
1477		xe_range_fence_tree_fini(&vm->rftree[id]);
1478	kfree(vm);
1479	if (!(flags & XE_VM_FLAG_MIGRATION))
1480		xe_device_mem_access_put(xe);
1481	return ERR_PTR(err);
1482}
1483
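/* A zero vm->size marks the VM as closed; clear it under the vm lock. */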
1484static void xe_vm_close(struct xe_vm *vm)
1485{
1486	down_write(&vm->lock);
1487	vm->size = 0;
1488	up_write(&vm->lock);
1489}
1490
1491void xe_vm_close_and_put(struct xe_vm *vm)
1492{
1493	LIST_HEAD(contested);
1494	struct xe_device *xe = vm->xe;
1495	struct xe_tile *tile;
1496	struct xe_vma *vma, *next_vma;
1497	struct drm_gpuva *gpuva, *next;
1498	u8 id;
1499
1500	xe_assert(xe, !vm->preempt.num_exec_queues);
1501
1502	xe_vm_close(vm);
1503	if (xe_vm_in_preempt_fence_mode(vm))
1504		flush_work(&vm->preempt.rebind_work);
1505
1506	down_write(&vm->lock);
1507	for_each_tile(tile, xe, id) {
1508		if (vm->q[id])
1509			xe_exec_queue_last_fence_put(vm->q[id], vm);
1510	}
1511	up_write(&vm->lock);
1512
1513	for_each_tile(tile, xe, id) {
1514		if (vm->q[id]) {
1515			xe_exec_queue_kill(vm->q[id]);
1516			xe_exec_queue_put(vm->q[id]);
1517			vm->q[id] = NULL;
1518		}
1519	}
1520
1521	down_write(&vm->lock);
1522	xe_vm_lock(vm, false);
1523	drm_gpuvm_for_each_va_safe(gpuva, next, &vm->gpuvm) {
1524		vma = gpuva_to_vma(gpuva);
1525
1526		if (xe_vma_has_no_bo(vma)) {
1527			down_read(&vm->userptr.notifier_lock);
1528			vma->gpuva.flags |= XE_VMA_DESTROYED;
1529			up_read(&vm->userptr.notifier_lock);
1530		}
1531
1532		xe_vm_remove_vma(vm, vma);
1533
1534		/* easy case, remove from VMA? */
1535		if (xe_vma_has_no_bo(vma) || xe_vma_bo(vma)->vm) {
1536			list_del_init(&vma->combined_links.rebind);
1537			xe_vma_destroy(vma, NULL);
1538			continue;
1539		}
1540
1541		list_move_tail(&vma->combined_links.destroy, &contested);
1542		vma->gpuva.flags |= XE_VMA_DESTROYED;
1543	}
1544
1545	/*
1546	 * All vm operations will add shared fences to resv.
1547	 * The only exception is eviction for a shared object,
1548	 * but even so, the unbind when evicted would still
1549	 * install a fence to resv. Hence it's safe to
1550	 * destroy the pagetables immediately.
1551	 */
1552	xe_vm_free_scratch(vm);
1553
1554	for_each_tile(tile, xe, id) {
1555		if (vm->pt_root[id]) {
1556			xe_pt_destroy(vm->pt_root[id], vm->flags, NULL);
1557			vm->pt_root[id] = NULL;
1558		}
1559	}
1560	xe_vm_unlock(vm);
1561
1562	/*
1563	 * VM is now dead, cannot re-add nodes to vm->vmas if it's NULL
1564	 * Since we hold a refcount to the bo, we can remove and free
1565	 * the members safely without locking.
1566	 */
1567	list_for_each_entry_safe(vma, next_vma, &contested,
1568				 combined_links.destroy) {
1569		list_del_init(&vma->combined_links.destroy);
1570		xe_vma_destroy_unlocked(vma);
1571	}
1572
1573	up_write(&vm->lock);
1574
1575	mutex_lock(&xe->usm.lock);
1576	if (vm->flags & XE_VM_FLAG_FAULT_MODE)
1577		xe->usm.num_vm_in_fault_mode--;
1578	else if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1579		xe->usm.num_vm_in_non_fault_mode--;
1580
1581	if (vm->usm.asid) {
1582		void *lookup;
1583
1584		xe_assert(xe, xe->info.has_asid);
1585		xe_assert(xe, !(vm->flags & XE_VM_FLAG_MIGRATION));
1586
1587		lookup = xa_erase(&xe->usm.asid_to_vm, vm->usm.asid);
1588		xe_assert(xe, lookup == vm);
1589	}
1590	mutex_unlock(&xe->usm.lock);
1591
1592	for_each_tile(tile, xe, id)
1593		xe_range_fence_tree_fini(&vm->rftree[id]);
1594
1595	xe_vm_put(vm);
1596}
1597
1598static void vm_destroy_work_func(struct work_struct *w)
1599{
1600	struct xe_vm *vm =
1601		container_of(w, struct xe_vm, destroy_work);
1602	struct xe_device *xe = vm->xe;
1603	struct xe_tile *tile;
1604	u8 id;
1605
1606	/* xe_vm_close_and_put was not called? */
1607	xe_assert(xe, !vm->size);
1608
1609	mutex_destroy(&vm->snap_mutex);
1610
1611	if (!(vm->flags & XE_VM_FLAG_MIGRATION))
1612		xe_device_mem_access_put(xe);
1613
1614	for_each_tile(tile, xe, id)
1615		XE_WARN_ON(vm->pt_root[id]);
1616
1617	trace_xe_vm_free(vm);
1618	kfree(vm);
1619}
1620
1621static void xe_vm_free(struct drm_gpuvm *gpuvm)
1622{
1623	struct xe_vm *vm = container_of(gpuvm, struct xe_vm, gpuvm);
1624
1625	/* To destroy the VM we need to be able to sleep */
1626	queue_work(system_unbound_wq, &vm->destroy_work);
1627}
1628
1629struct xe_vm *xe_vm_lookup(struct xe_file *xef, u32 id)
1630{
1631	struct xe_vm *vm;
1632
1633	mutex_lock(&xef->vm.lock);
1634	vm = xa_load(&xef->vm.xa, id);
1635	if (vm)
1636		xe_vm_get(vm);
1637	mutex_unlock(&xef->vm.lock);
1638
1639	return vm;
1640}
1641
1642u64 xe_vm_pdp4_descriptor(struct xe_vm *vm, struct xe_tile *tile)
1643{
1644	return vm->pt_ops->pde_encode_bo(vm->pt_root[tile->id]->bo, 0,
1645					 tile_to_xe(tile)->pat.idx[XE_CACHE_WB]);
1646}
1647
1648static struct xe_exec_queue *
1649to_wait_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
1650{
1651	return q ? q : vm->q[0];
1652}
1653
1654static struct dma_fence *
1655xe_vm_unbind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1656		 struct xe_sync_entry *syncs, u32 num_syncs,
1657		 bool first_op, bool last_op)
1658{
1659	struct xe_vm *vm = xe_vma_vm(vma);
1660	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
1661	struct xe_tile *tile;
1662	struct dma_fence *fence = NULL;
1663	struct dma_fence **fences = NULL;
1664	struct dma_fence_array *cf = NULL;
1665	int cur_fence = 0, i;
1666	int number_tiles = hweight8(vma->tile_present);
1667	int err;
1668	u8 id;
1669
1670	trace_xe_vma_unbind(vma);
1671
1672	if (vma->ufence) {
1673		struct xe_user_fence * const f = vma->ufence;
1674
1675		if (!xe_sync_ufence_get_status(f))
1676			return ERR_PTR(-EBUSY);
1677
1678		vma->ufence = NULL;
1679		xe_sync_ufence_put(f);
1680	}
1681
1682	if (number_tiles > 1) {
1683		fences = kmalloc_array(number_tiles, sizeof(*fences),
1684				       GFP_KERNEL);
1685		if (!fences)
1686			return ERR_PTR(-ENOMEM);
1687	}
1688
1689	for_each_tile(tile, vm->xe, id) {
1690		if (!(vma->tile_present & BIT(id)))
1691			goto next;
1692
1693		fence = __xe_pt_unbind_vma(tile, vma, q ? q : vm->q[id],
1694					   first_op ? syncs : NULL,
1695					   first_op ? num_syncs : 0);
1696		if (IS_ERR(fence)) {
1697			err = PTR_ERR(fence);
1698			goto err_fences;
1699		}
1700
1701		if (fences)
1702			fences[cur_fence++] = fence;
1703
1704next:
1705		if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1706			q = list_next_entry(q, multi_gt_list);
1707	}
1708
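	/*
	 * More than one tile produced an unbind fence: wrap them in a single
	 * fence array so syncs and the caller only ever see one fence.
	 */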
1709	if (fences) {
1710		cf = dma_fence_array_create(number_tiles, fences,
1711					    vm->composite_fence_ctx,
1712					    vm->composite_fence_seqno++,
1713					    false);
1714		if (!cf) {
1715			--vm->composite_fence_seqno;
1716			err = -ENOMEM;
1717			goto err_fences;
1718		}
1719	}
1720
1721	fence = cf ? &cf->base : !fence ?
1722		xe_exec_queue_last_fence_get(wait_exec_queue, vm) : fence;
1723	if (last_op) {
1724		for (i = 0; i < num_syncs; i++)
1725			xe_sync_entry_signal(&syncs[i], NULL, fence);
1726	}
1727
1728	return fence;
1729
1730err_fences:
1731	if (fences) {
1732		while (cur_fence)
1733			dma_fence_put(fences[--cur_fence]);
1734		kfree(fences);
1735	}
1736
1737	return ERR_PTR(err);
1738}
1739
1740static struct dma_fence *
1741xe_vm_bind_vma(struct xe_vma *vma, struct xe_exec_queue *q,
1742	       struct xe_sync_entry *syncs, u32 num_syncs,
1743	       bool first_op, bool last_op)
1744{
1745	struct xe_tile *tile;
1746	struct dma_fence *fence;
1747	struct dma_fence **fences = NULL;
1748	struct dma_fence_array *cf = NULL;
1749	struct xe_vm *vm = xe_vma_vm(vma);
1750	int cur_fence = 0, i;
1751	int number_tiles = hweight8(vma->tile_mask);
1752	int err;
1753	u8 id;
1754
1755	trace_xe_vma_bind(vma);
1756
1757	if (number_tiles > 1) {
1758		fences = kmalloc_array(number_tiles, sizeof(*fences),
1759				       GFP_KERNEL);
1760		if (!fences)
1761			return ERR_PTR(-ENOMEM);
1762	}
1763
1764	for_each_tile(tile, vm->xe, id) {
1765		if (!(vma->tile_mask & BIT(id)))
1766			goto next;
1767
1768		fence = __xe_pt_bind_vma(tile, vma, q ? q : vm->q[id],
1769					 first_op ? syncs : NULL,
1770					 first_op ? num_syncs : 0,
1771					 vma->tile_present & BIT(id));
1772		if (IS_ERR(fence)) {
1773			err = PTR_ERR(fence);
1774			goto err_fences;
1775		}
1776
1777		if (fences)
1778			fences[cur_fence++] = fence;
1779
1780next:
1781		if (q && vm->pt_root[id] && !list_empty(&q->multi_gt_list))
1782			q = list_next_entry(q, multi_gt_list);
1783	}
1784
1785	if (fences) {
1786		cf = dma_fence_array_create(number_tiles, fences,
1787					    vm->composite_fence_ctx,
1788					    vm->composite_fence_seqno++,
1789					    false);
1790		if (!cf) {
1791			--vm->composite_fence_seqno;
1792			err = -ENOMEM;
1793			goto err_fences;
1794		}
1795	}
1796
1797	if (last_op) {
1798		for (i = 0; i < num_syncs; i++)
1799			xe_sync_entry_signal(&syncs[i], NULL,
1800					     cf ? &cf->base : fence);
1801	}
1802
1803	return cf ? &cf->base : fence;
1804
1805err_fences:
1806	if (fences) {
1807		while (cur_fence)
1808			dma_fence_put(fences[--cur_fence]);
1809		kfree(fences);
1810	}
1811
1812	return ERR_PTR(err);
1813}
1814
1815static struct xe_user_fence *
1816find_ufence_get(struct xe_sync_entry *syncs, u32 num_syncs)
1817{
1818	unsigned int i;
1819
1820	for (i = 0; i < num_syncs; i++) {
1821		struct xe_sync_entry *e = &syncs[i];
1822
1823		if (xe_sync_is_ufence(e))
1824			return xe_sync_ufence_get(e);
1825	}
1826
1827	return NULL;
1828}
1829
1830static int __xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma,
1831			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1832			u32 num_syncs, bool immediate, bool first_op,
1833			bool last_op)
1834{
1835	struct dma_fence *fence;
1836	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
1837	struct xe_user_fence *ufence;
1838
1839	xe_vm_assert_held(vm);
1840
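	/*
	 * A user fence attached to this bind replaces any user fence still
	 * armed from a previous bind of the vma.
	 */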
1841	ufence = find_ufence_get(syncs, num_syncs);
1842	if (vma->ufence && ufence)
1843		xe_sync_ufence_put(vma->ufence);
1844
1845	vma->ufence = ufence ?: vma->ufence;
1846
1847	if (immediate) {
1848		fence = xe_vm_bind_vma(vma, q, syncs, num_syncs, first_op,
1849				       last_op);
1850		if (IS_ERR(fence))
1851			return PTR_ERR(fence);
1852	} else {
1853		int i;
1854
1855		xe_assert(vm->xe, xe_vm_in_fault_mode(vm));
1856
1857		fence = xe_exec_queue_last_fence_get(wait_exec_queue, vm);
1858		if (last_op) {
1859			for (i = 0; i < num_syncs; i++)
1860				xe_sync_entry_signal(&syncs[i], NULL, fence);
1861		}
1862	}
1863
1864	if (last_op)
1865		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
1866	dma_fence_put(fence);
1867
1868	return 0;
1869}
1870
1871static int xe_vm_bind(struct xe_vm *vm, struct xe_vma *vma, struct xe_exec_queue *q,
1872		      struct xe_bo *bo, struct xe_sync_entry *syncs,
1873		      u32 num_syncs, bool immediate, bool first_op,
1874		      bool last_op)
1875{
1876	int err;
1877
1878	xe_vm_assert_held(vm);
1879	xe_bo_assert_held(bo);
1880
1881	if (bo && immediate) {
1882		err = xe_bo_validate(bo, vm, true);
1883		if (err)
1884			return err;
1885	}
1886
1887	return __xe_vm_bind(vm, vma, q, syncs, num_syncs, immediate, first_op,
1888			    last_op);
1889}
1890
1891static int xe_vm_unbind(struct xe_vm *vm, struct xe_vma *vma,
1892			struct xe_exec_queue *q, struct xe_sync_entry *syncs,
1893			u32 num_syncs, bool first_op, bool last_op)
1894{
1895	struct dma_fence *fence;
1896	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
1897
1898	xe_vm_assert_held(vm);
1899	xe_bo_assert_held(xe_vma_bo(vma));
1900
1901	fence = xe_vm_unbind_vma(vma, q, syncs, num_syncs, first_op, last_op);
1902	if (IS_ERR(fence))
1903		return PTR_ERR(fence);
1904
1905	xe_vma_destroy(vma, fence);
1906	if (last_op)
1907		xe_exec_queue_last_fence_set(wait_exec_queue, vm, fence);
1908	dma_fence_put(fence);
1909
1910	return 0;
1911}
1912
1913#define ALL_DRM_XE_VM_CREATE_FLAGS (DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE | \
1914				    DRM_XE_VM_CREATE_FLAG_LR_MODE | \
1915				    DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
1916
1917int xe_vm_create_ioctl(struct drm_device *dev, void *data,
1918		       struct drm_file *file)
1919{
1920	struct xe_device *xe = to_xe_device(dev);
1921	struct xe_file *xef = to_xe_file(file);
1922	struct drm_xe_vm_create *args = data;
1923	struct xe_tile *tile;
1924	struct xe_vm *vm;
1925	u32 id, asid;
1926	int err;
1927	u32 flags = 0;
1928
1929	if (XE_IOCTL_DBG(xe, args->extensions))
1930		return -EINVAL;
1931
1932	if (XE_WA(xe_root_mmio_gt(xe), 14016763929))
1933		args->flags |= DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE;
1934
1935	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
1936			 !xe->info.has_usm))
1937		return -EINVAL;
1938
1939	if (XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
1940		return -EINVAL;
1941
1942	if (XE_IOCTL_DBG(xe, args->flags & ~ALL_DRM_XE_VM_CREATE_FLAGS))
1943		return -EINVAL;
1944
1945	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE &&
1946			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
1947		return -EINVAL;
1948
1949	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE) &&
1950			 args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE))
1951		return -EINVAL;
1952
1953	if (XE_IOCTL_DBG(xe, args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE &&
1954			 xe_device_in_non_fault_mode(xe)))
1955		return -EINVAL;
1956
1957	if (XE_IOCTL_DBG(xe, !(args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE) &&
1958			 xe_device_in_fault_mode(xe)))
1959		return -EINVAL;
1960
1961	if (XE_IOCTL_DBG(xe, args->extensions))
1962		return -EINVAL;
1963
1964	if (args->flags & DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE)
1965		flags |= XE_VM_FLAG_SCRATCH_PAGE;
1966	if (args->flags & DRM_XE_VM_CREATE_FLAG_LR_MODE)
1967		flags |= XE_VM_FLAG_LR_MODE;
1968	if (args->flags & DRM_XE_VM_CREATE_FLAG_FAULT_MODE)
1969		flags |= XE_VM_FLAG_FAULT_MODE;
1970
1971	vm = xe_vm_create(xe, flags);
1972	if (IS_ERR(vm))
1973		return PTR_ERR(vm);
1974
1975	mutex_lock(&xef->vm.lock);
1976	err = xa_alloc(&xef->vm.xa, &id, vm, xa_limit_32b, GFP_KERNEL);
1977	mutex_unlock(&xef->vm.lock);
1978	if (err)
1979		goto err_close_and_put;
1980
1981	if (xe->info.has_asid) {
1982		mutex_lock(&xe->usm.lock);
1983		err = xa_alloc_cyclic(&xe->usm.asid_to_vm, &asid, vm,
1984				      XA_LIMIT(1, XE_MAX_ASID - 1),
1985				      &xe->usm.next_asid, GFP_KERNEL);
1986		mutex_unlock(&xe->usm.lock);
1987		if (err < 0)
1988			goto err_free_id;
1989
1990		vm->usm.asid = asid;
1991	}
1992
1993	args->vm_id = id;
1994	vm->xef = xef;
1995
1996	/* Record BO memory for VM pagetable created against client */
1997	for_each_tile(tile, xe, id)
1998		if (vm->pt_root[id])
1999			xe_drm_client_add_bo(vm->xef->client, vm->pt_root[id]->bo);
2000
2001#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_MEM)
2002	/* Warning: Security issue - never enable by default */
2003	args->reserved[0] = xe_bo_main_addr(vm->pt_root[0]->bo, XE_PAGE_SIZE);
2004#endif
2005
2006	return 0;
2007
2008err_free_id:
2009	mutex_lock(&xef->vm.lock);
2010	xa_erase(&xef->vm.xa, id);
2011	mutex_unlock(&xef->vm.lock);
2012err_close_and_put:
2013	xe_vm_close_and_put(vm);
2014
2015	return err;
2016}
2017
2018int xe_vm_destroy_ioctl(struct drm_device *dev, void *data,
2019			struct drm_file *file)
2020{
2021	struct xe_device *xe = to_xe_device(dev);
2022	struct xe_file *xef = to_xe_file(file);
2023	struct drm_xe_vm_destroy *args = data;
2024	struct xe_vm *vm;
2025	int err = 0;
2026
2027	if (XE_IOCTL_DBG(xe, args->pad) ||
2028	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2029		return -EINVAL;
2030
2031	mutex_lock(&xef->vm.lock);
2032	vm = xa_load(&xef->vm.xa, args->vm_id);
2033	if (XE_IOCTL_DBG(xe, !vm))
2034		err = -ENOENT;
2035	else if (XE_IOCTL_DBG(xe, vm->preempt.num_exec_queues))
2036		err = -EBUSY;
2037	else
2038		xa_erase(&xef->vm.xa, args->vm_id);
2039	mutex_unlock(&xef->vm.lock);
2040
2041	if (!err)
2042		xe_vm_close_and_put(vm);
2043
2044	return err;
2045}
2046
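/* Map the uAPI prefetch region index onto a TTM memory type. */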
2047static const u32 region_to_mem_type[] = {
2048	XE_PL_TT,
2049	XE_PL_VRAM0,
2050	XE_PL_VRAM1,
2051};
2052
2053static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma,
2054			  struct xe_exec_queue *q, u32 region,
2055			  struct xe_sync_entry *syncs, u32 num_syncs,
2056			  bool first_op, bool last_op)
2057{
2058	struct xe_exec_queue *wait_exec_queue = to_wait_exec_queue(vm, q);
2059	int err;
2060
	xe_assert(vm->xe, region < ARRAY_SIZE(region_to_mem_type));
2062
2063	if (!xe_vma_has_no_bo(vma)) {
2064		err = xe_bo_migrate(xe_vma_bo(vma), region_to_mem_type[region]);
2065		if (err)
2066			return err;
2067	}
2068
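	/*
	 * Only rebind if some tile in the mask lacks a valid binding;
	 * otherwise the prefetch is a no-op and we just signal the syncs.
	 */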
2069	if (vma->tile_mask != (vma->tile_present & ~vma->tile_invalidated)) {
2070		return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs,
2071				  true, first_op, last_op);
2072	} else {
2073		int i;
2074
2075		/* Nothing to do, signal fences now */
2076		if (last_op) {
2077			for (i = 0; i < num_syncs; i++) {
2078				struct dma_fence *fence =
2079					xe_exec_queue_last_fence_get(wait_exec_queue, vm);
2080
2081				xe_sync_entry_signal(&syncs[i], NULL, fence);
2082				dma_fence_put(fence);
2083			}
2084		}
2085
2086		return 0;
2087	}
2088}
2089
2090static void prep_vma_destroy(struct xe_vm *vm, struct xe_vma *vma,
2091			     bool post_commit)
2092{
2093	down_read(&vm->userptr.notifier_lock);
2094	vma->gpuva.flags |= XE_VMA_DESTROYED;
2095	up_read(&vm->userptr.notifier_lock);
2096	if (post_commit)
2097		xe_vm_remove_vma(vm, vma);
2098}
2099
2100#undef ULL
2101#define ULL	unsigned long long
2102
2103#if IS_ENABLED(CONFIG_DRM_XE_DEBUG_VM)
2104static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2105{
2106	struct xe_vma *vma;
2107
2108	switch (op->op) {
2109	case DRM_GPUVA_OP_MAP:
2110		vm_dbg(&xe->drm, "MAP: addr=0x%016llx, range=0x%016llx",
2111		       (ULL)op->map.va.addr, (ULL)op->map.va.range);
2112		break;
2113	case DRM_GPUVA_OP_REMAP:
2114		vma = gpuva_to_vma(op->remap.unmap->va);
2115		vm_dbg(&xe->drm, "REMAP:UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2116		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2117		       op->remap.unmap->keep ? 1 : 0);
2118		if (op->remap.prev)
2119			vm_dbg(&xe->drm,
2120			       "REMAP:PREV: addr=0x%016llx, range=0x%016llx",
2121			       (ULL)op->remap.prev->va.addr,
2122			       (ULL)op->remap.prev->va.range);
2123		if (op->remap.next)
2124			vm_dbg(&xe->drm,
2125			       "REMAP:NEXT: addr=0x%016llx, range=0x%016llx",
2126			       (ULL)op->remap.next->va.addr,
2127			       (ULL)op->remap.next->va.range);
2128		break;
2129	case DRM_GPUVA_OP_UNMAP:
2130		vma = gpuva_to_vma(op->unmap.va);
2131		vm_dbg(&xe->drm, "UNMAP: addr=0x%016llx, range=0x%016llx, keep=%d",
2132		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma),
2133		       op->unmap.keep ? 1 : 0);
2134		break;
2135	case DRM_GPUVA_OP_PREFETCH:
2136		vma = gpuva_to_vma(op->prefetch.va);
2137		vm_dbg(&xe->drm, "PREFETCH: addr=0x%016llx, range=0x%016llx",
2138		       (ULL)xe_vma_start(vma), (ULL)xe_vma_size(vma));
2139		break;
2140	default:
2141		drm_warn(&xe->drm, "NOT POSSIBLE");
2142	}
2143}
2144#else
2145static void print_op(struct xe_device *xe, struct drm_gpuva_op *op)
2146{
2147}
2148#endif
2149
2150/*
2151 * Create the operations list from IOCTL arguments and set up operation fields
2152 * so the parse and commit steps are decoupled from them. This step can fail.
2153 */
2154static struct drm_gpuva_ops *
2155vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
2156			 u64 bo_offset_or_userptr, u64 addr, u64 range,
2157			 u32 operation, u32 flags,
2158			 u32 prefetch_region, u16 pat_index)
2159{
2160	struct drm_gem_object *obj = bo ? &bo->ttm.base : NULL;
2161	struct drm_gpuva_ops *ops;
2162	struct drm_gpuva_op *__op;
2163	struct drm_gpuvm_bo *vm_bo;
2164	int err;
2165
2166	lockdep_assert_held_write(&vm->lock);
2167
2168	vm_dbg(&vm->xe->drm,
2169	       "op=%d, addr=0x%016llx, range=0x%016llx, bo_offset_or_userptr=0x%016llx",
2170	       operation, (ULL)addr, (ULL)range,
2171	       (ULL)bo_offset_or_userptr);
2172
2173	switch (operation) {
2174	case DRM_XE_VM_BIND_OP_MAP:
2175	case DRM_XE_VM_BIND_OP_MAP_USERPTR:
2176		ops = drm_gpuvm_sm_map_ops_create(&vm->gpuvm, addr, range,
2177						  obj, bo_offset_or_userptr);
2178		break;
2179	case DRM_XE_VM_BIND_OP_UNMAP:
2180		ops = drm_gpuvm_sm_unmap_ops_create(&vm->gpuvm, addr, range);
2181		break;
2182	case DRM_XE_VM_BIND_OP_PREFETCH:
2183		ops = drm_gpuvm_prefetch_ops_create(&vm->gpuvm, addr, range);
2184		break;
2185	case DRM_XE_VM_BIND_OP_UNMAP_ALL:
2186		xe_assert(vm->xe, bo);
2187
2188		err = xe_bo_lock(bo, true);
2189		if (err)
2190			return ERR_PTR(err);
2191
2192		vm_bo = drm_gpuvm_bo_obtain(&vm->gpuvm, obj);
2193		if (IS_ERR(vm_bo)) {
2194			xe_bo_unlock(bo);
2195			return ERR_CAST(vm_bo);
2196		}
2197
2198		ops = drm_gpuvm_bo_unmap_ops_create(vm_bo);
2199		drm_gpuvm_bo_put(vm_bo);
2200		xe_bo_unlock(bo);
2201		break;
2202	default:
2203		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2204		ops = ERR_PTR(-EINVAL);
2205	}
2206	if (IS_ERR(ops))
2207		return ops;
2208
2209	drm_gpuva_for_each_op(__op, ops) {
2210		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2211
2212		if (__op->op == DRM_GPUVA_OP_MAP) {
2213			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2214			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
2215			op->map.pat_index = pat_index;
2216		} else if (__op->op == DRM_GPUVA_OP_PREFETCH) {
2217			op->prefetch.region = prefetch_region;
2218		}
2219
2220		print_op(vm->xe, __op);
2221	}
2222
2223	return ops;
2224}
2225
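/*
 * Descriptive note (added for clarity): create and prepare a VMA for a MAP
 * (or REMAP prev/next) GPUVA op: lock the BO (and the VM resv for external
 * BOs) with drm_exec, pin pages for userptr VMAs, and add preempt fences for
 * external BOs where required.
 */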
2226static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
2227			      u16 pat_index, unsigned int flags)
2228{
2229	struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2230	struct drm_exec exec;
2231	struct xe_vma *vma;
2232	int err;
2233
2234	lockdep_assert_held_write(&vm->lock);
2235
2236	if (bo) {
2237		drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2238		drm_exec_until_all_locked(&exec) {
2239			err = 0;
2240			if (!bo->vm) {
2241				err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
2242				drm_exec_retry_on_contention(&exec);
2243			}
2244			if (!err) {
2245				err = drm_exec_lock_obj(&exec, &bo->ttm.base);
2246				drm_exec_retry_on_contention(&exec);
2247			}
2248			if (err) {
2249				drm_exec_fini(&exec);
2250				return ERR_PTR(err);
2251			}
2252		}
2253	}
2254	vma = xe_vma_create(vm, bo, op->gem.offset,
2255			    op->va.addr, op->va.addr +
2256			    op->va.range - 1, pat_index, flags);
2257	if (bo)
2258		drm_exec_fini(&exec);
	if (IS_ERR(vma))
		return vma;
2259
2260	if (xe_vma_is_userptr(vma)) {
2261		err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2262		if (err) {
2263			prep_vma_destroy(vm, vma, false);
2264			xe_vma_destroy_unlocked(vma);
2265			return ERR_PTR(err);
2266		}
2267	} else if (!xe_vma_has_no_bo(vma) && !bo->vm) {
2268		err = add_preempt_fences(vm, bo);
2269		if (err) {
2270			prep_vma_destroy(vm, vma, false);
2271			xe_vma_destroy_unlocked(vma);
2272			return ERR_PTR(err);
2273		}
2274	}
2275
2276	return vma;
2277}
2278
2279static u64 xe_vma_max_pte_size(struct xe_vma *vma)
2280{
2281	if (vma->gpuva.flags & XE_VMA_PTE_1G)
2282		return SZ_1G;
2283	else if (vma->gpuva.flags & (XE_VMA_PTE_2M | XE_VMA_PTE_COMPACT))
2284		return SZ_2M;
2285	else if (vma->gpuva.flags & XE_VMA_PTE_64K)
2286		return SZ_64K;
2287	else if (vma->gpuva.flags & XE_VMA_PTE_4K)
2288		return SZ_4K;
2289
2290	return SZ_1G;	/* Uninitialized, use max size */
2291}
2292
2293static void xe_vma_set_pte_size(struct xe_vma *vma, u64 size)
2294{
2295	switch (size) {
2296	case SZ_1G:
2297		vma->gpuva.flags |= XE_VMA_PTE_1G;
2298		break;
2299	case SZ_2M:
2300		vma->gpuva.flags |= XE_VMA_PTE_2M;
2301		break;
2302	case SZ_64K:
2303		vma->gpuva.flags |= XE_VMA_PTE_64K;
2304		break;
2305	case SZ_4K:
2306		vma->gpuva.flags |= XE_VMA_PTE_4K;
2307		break;
2308	}
2309}
2310
2311static int xe_vma_op_commit(struct xe_vm *vm, struct xe_vma_op *op)
2312{
2313	int err = 0;
2314
2315	lockdep_assert_held_write(&vm->lock);
2316
2317	switch (op->base.op) {
2318	case DRM_GPUVA_OP_MAP:
2319		err |= xe_vm_insert_vma(vm, op->map.vma);
2320		if (!err)
2321			op->flags |= XE_VMA_OP_COMMITTED;
2322		break;
2323	case DRM_GPUVA_OP_REMAP:
2324	{
2325		u8 tile_present =
2326			gpuva_to_vma(op->base.remap.unmap->va)->tile_present;
2327
2328		prep_vma_destroy(vm, gpuva_to_vma(op->base.remap.unmap->va),
2329				 true);
2330		op->flags |= XE_VMA_OP_COMMITTED;
2331
2332		if (op->remap.prev) {
2333			err |= xe_vm_insert_vma(vm, op->remap.prev);
2334			if (!err)
2335				op->flags |= XE_VMA_OP_PREV_COMMITTED;
2336			if (!err && op->remap.skip_prev) {
2337				op->remap.prev->tile_present =
2338					tile_present;
2339				op->remap.prev = NULL;
2340			}
2341		}
2342		if (op->remap.next) {
2343			err |= xe_vm_insert_vma(vm, op->remap.next);
2344			if (!err)
2345				op->flags |= XE_VMA_OP_NEXT_COMMITTED;
2346			if (!err && op->remap.skip_next) {
2347				op->remap.next->tile_present =
2348					tile_present;
2349				op->remap.next = NULL;
2350			}
2351		}
2352
2353		/* Adjust for partial unbind after removing the VMA from the VM */
2354		if (!err) {
2355			op->base.remap.unmap->va->va.addr = op->remap.start;
2356			op->base.remap.unmap->va->va.range = op->remap.range;
2357		}
2358		break;
2359	}
2360	case DRM_GPUVA_OP_UNMAP:
2361		prep_vma_destroy(vm, gpuva_to_vma(op->base.unmap.va), true);
2362		op->flags |= XE_VMA_OP_COMMITTED;
2363		break;
2364	case DRM_GPUVA_OP_PREFETCH:
2365		op->flags |= XE_VMA_OP_COMMITTED;
2366		break;
2367	default:
2368		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2369	}
2370
2371	return err;
2372}
2373
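/*
 * Descriptive note (added for clarity): parse a list of GPUVA operations,
 * allocating the new VMAs needed for MAP and REMAP operations, linking each
 * operation onto @ops_list and committing it to the VM. On failure the caller
 * unwinds via vm_bind_ioctl_ops_unwind().
 */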
2375static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
2376				   struct drm_gpuva_ops *ops,
2377				   struct xe_sync_entry *syncs, u32 num_syncs,
2378				   struct list_head *ops_list, bool last)
2379{
2380	struct xe_device *xe = vm->xe;
2381	struct xe_vma_op *last_op = NULL;
2382	struct drm_gpuva_op *__op;
2383	int err = 0;
2384
2385	lockdep_assert_held_write(&vm->lock);
2386
2387	drm_gpuva_for_each_op(__op, ops) {
2388		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2389		struct xe_vma *vma;
2390		bool first = list_empty(ops_list);
2391		unsigned int flags = 0;
2392
2393		INIT_LIST_HEAD(&op->link);
2394		list_add_tail(&op->link, ops_list);
2395
2396		if (first) {
2397			op->flags |= XE_VMA_OP_FIRST;
2398			op->num_syncs = num_syncs;
2399			op->syncs = syncs;
2400		}
2401
2402		op->q = q;
2403
2404		switch (op->base.op) {
2405		case DRM_GPUVA_OP_MAP:
2406		{
2407			flags |= op->map.is_null ?
2408				VMA_CREATE_FLAG_IS_NULL : 0;
2409			flags |= op->map.dumpable ?
2410				VMA_CREATE_FLAG_DUMPABLE : 0;
2411
2412			vma = new_vma(vm, &op->base.map, op->map.pat_index,
2413				      flags);
2414			if (IS_ERR(vma))
2415				return PTR_ERR(vma);
2416
2417			op->map.vma = vma;
2418			break;
2419		}
2420		case DRM_GPUVA_OP_REMAP:
2421		{
2422			struct xe_vma *old =
2423				gpuva_to_vma(op->base.remap.unmap->va);
2424
2425			op->remap.start = xe_vma_start(old);
2426			op->remap.range = xe_vma_size(old);
2427
2428			if (op->base.remap.prev) {
2429				flags |= op->base.remap.unmap->va->flags &
2430					XE_VMA_READ_ONLY ?
2431					VMA_CREATE_FLAG_READ_ONLY : 0;
2432				flags |= op->base.remap.unmap->va->flags &
2433					DRM_GPUVA_SPARSE ?
2434					VMA_CREATE_FLAG_IS_NULL : 0;
2435				flags |= op->base.remap.unmap->va->flags &
2436					XE_VMA_DUMPABLE ?
2437					VMA_CREATE_FLAG_DUMPABLE : 0;
2438
2439				vma = new_vma(vm, op->base.remap.prev,
2440					      old->pat_index, flags);
2441				if (IS_ERR(vma))
2442					return PTR_ERR(vma);
2443
2444				op->remap.prev = vma;
2445
2446				/*
2447				 * Userptr creates a new SG mapping so
2448				 * we must also rebind.
2449				 */
2450				op->remap.skip_prev = !xe_vma_is_userptr(old) &&
2451					IS_ALIGNED(xe_vma_end(vma),
2452						   xe_vma_max_pte_size(old));
2453				if (op->remap.skip_prev) {
2454					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2455					op->remap.range -=
2456						xe_vma_end(vma) -
2457						xe_vma_start(old);
2458					op->remap.start = xe_vma_end(vma);
2459					vm_dbg(&xe->drm, "REMAP:SKIP_PREV: addr=0x%016llx, range=0x%016llx",
2460					       (ULL)op->remap.start,
2461					       (ULL)op->remap.range);
2462				}
2463			}
2464
2465			if (op->base.remap.next) {
2466				flags |= op->base.remap.unmap->va->flags &
2467					XE_VMA_READ_ONLY ?
2468					VMA_CREATE_FLAG_READ_ONLY : 0;
2469				flags |= op->base.remap.unmap->va->flags &
2470					DRM_GPUVA_SPARSE ?
2471					VMA_CREATE_FLAG_IS_NULL : 0;
2472				flags |= op->base.remap.unmap->va->flags &
2473					XE_VMA_DUMPABLE ?
2474					VMA_CREATE_FLAG_DUMPABLE : 0;
2475
2476				vma = new_vma(vm, op->base.remap.next,
2477					      old->pat_index, flags);
2478				if (IS_ERR(vma))
2479					return PTR_ERR(vma);
2480
2481				op->remap.next = vma;
2482
2483				/*
2484				 * Userptr creates a new SG mapping so
2485				 * we must also rebind.
2486				 */
2487				op->remap.skip_next = !xe_vma_is_userptr(old) &&
2488					IS_ALIGNED(xe_vma_start(vma),
2489						   xe_vma_max_pte_size(old));
2490				if (op->remap.skip_next) {
2491					xe_vma_set_pte_size(vma, xe_vma_max_pte_size(old));
2492					op->remap.range -=
2493						xe_vma_end(old) -
2494						xe_vma_start(vma);
2495					vm_dbg(&xe->drm, "REMAP:SKIP_NEXT: addr=0x%016llx, range=0x%016llx",
2496					       (ULL)op->remap.start,
2497					       (ULL)op->remap.range);
2498				}
2499			}
2500			break;
2501		}
2502		case DRM_GPUVA_OP_UNMAP:
2503		case DRM_GPUVA_OP_PREFETCH:
2504			/* Nothing to do */
2505			break;
2506		default:
2507			drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2508		}
2509
2510		last_op = op;
2511
2512		err = xe_vma_op_commit(vm, op);
2513		if (err)
2514			return err;
2515	}
2516
2517	/* FIXME: Unhandled corner case */
2518	XE_WARN_ON(!last_op && last && !list_empty(ops_list));
2519
2520	if (!last_op)
2521		return 0;
2522
2523	last_op->ops = ops;
2524	if (last) {
2525		last_op->flags |= XE_VMA_OP_LAST;
2526		last_op->num_syncs = num_syncs;
2527		last_op->syncs = syncs;
2528	}
2529
2530	return 0;
2531}
2532
2533static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
2534		      struct xe_vma *vma, struct xe_vma_op *op)
2535{
2536	int err;
2537
2538	lockdep_assert_held_write(&vm->lock);
2539
2540	err = xe_vm_lock_vma(exec, vma);
2541	if (err)
2542		return err;
2543
2544	xe_vm_assert_held(vm);
2545	xe_bo_assert_held(xe_vma_bo(vma));
2546
2547	switch (op->base.op) {
2548	case DRM_GPUVA_OP_MAP:
2549		err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
2550				 op->syncs, op->num_syncs,
2551				 !xe_vm_in_fault_mode(vm),
2552				 op->flags & XE_VMA_OP_FIRST,
2553				 op->flags & XE_VMA_OP_LAST);
2554		break;
2555	case DRM_GPUVA_OP_REMAP:
2556	{
2557		bool prev = !!op->remap.prev;
2558		bool next = !!op->remap.next;
2559
2560		if (!op->remap.unmap_done) {
2561			if (prev || next)
2562				vma->gpuva.flags |= XE_VMA_FIRST_REBIND;
2563			err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2564					   op->num_syncs,
2565					   op->flags & XE_VMA_OP_FIRST,
2566					   op->flags & XE_VMA_OP_LAST &&
2567					   !prev && !next);
2568			if (err)
2569				break;
2570			op->remap.unmap_done = true;
2571		}
2572
2573		if (prev) {
2574			op->remap.prev->gpuva.flags |= XE_VMA_LAST_REBIND;
2575			err = xe_vm_bind(vm, op->remap.prev, op->q,
2576					 xe_vma_bo(op->remap.prev), op->syncs,
2577					 op->num_syncs, true, false,
2578					 op->flags & XE_VMA_OP_LAST && !next);
2579			op->remap.prev->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2580			if (err)
2581				break;
2582			op->remap.prev = NULL;
2583		}
2584
2585		if (next) {
2586			op->remap.next->gpuva.flags |= XE_VMA_LAST_REBIND;
2587			err = xe_vm_bind(vm, op->remap.next, op->q,
2588					 xe_vma_bo(op->remap.next),
2589					 op->syncs, op->num_syncs,
2590					 true, false,
2591					 op->flags & XE_VMA_OP_LAST);
2592			op->remap.next->gpuva.flags &= ~XE_VMA_LAST_REBIND;
2593			if (err)
2594				break;
2595			op->remap.next = NULL;
2596		}
2597
2598		break;
2599	}
2600	case DRM_GPUVA_OP_UNMAP:
2601		err = xe_vm_unbind(vm, vma, op->q, op->syncs,
2602				   op->num_syncs, op->flags & XE_VMA_OP_FIRST,
2603				   op->flags & XE_VMA_OP_LAST);
2604		break;
2605	case DRM_GPUVA_OP_PREFETCH:
2606		err = xe_vm_prefetch(vm, vma, op->q, op->prefetch.region,
2607				     op->syncs, op->num_syncs,
2608				     op->flags & XE_VMA_OP_FIRST,
2609				     op->flags & XE_VMA_OP_LAST);
2610		break;
2611	default:
2612		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2613	}
2614
2615	if (err)
2616		trace_xe_vma_fail(vma);
2617
2618	return err;
2619}
2620
2621static int __xe_vma_op_execute(struct xe_vm *vm, struct xe_vma *vma,
2622			       struct xe_vma_op *op)
2623{
2624	struct drm_exec exec;
2625	int err;
2626
2627retry_userptr:
2628	drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2629	drm_exec_until_all_locked(&exec) {
2630		err = op_execute(&exec, vm, vma, op);
2631		drm_exec_retry_on_contention(&exec);
2632		if (err)
2633			break;
2634	}
2635	drm_exec_fini(&exec);
2636
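	/*
	 * -EAGAIN indicates the userptr backing store was invalidated while
	 * the bind was being set up; repin the pages and retry the operation.
	 */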
2637	if (err == -EAGAIN) {
2638		lockdep_assert_held_write(&vm->lock);
2639
2640		if (op->base.op == DRM_GPUVA_OP_REMAP) {
2641			if (!op->remap.unmap_done)
2642				vma = gpuva_to_vma(op->base.remap.unmap->va);
2643			else if (op->remap.prev)
2644				vma = op->remap.prev;
2645			else
2646				vma = op->remap.next;
2647		}
2648
2649		if (xe_vma_is_userptr(vma)) {
2650			err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2651			if (!err)
2652				goto retry_userptr;
2653
2654			trace_xe_vma_fail(vma);
2655		}
2656	}
2657
2658	return err;
2659}
2660
2661static int xe_vma_op_execute(struct xe_vm *vm, struct xe_vma_op *op)
2662{
2663	int ret = 0;
2664
2665	lockdep_assert_held_write(&vm->lock);
2666
2667	switch (op->base.op) {
2668	case DRM_GPUVA_OP_MAP:
2669		ret = __xe_vma_op_execute(vm, op->map.vma, op);
2670		break;
2671	case DRM_GPUVA_OP_REMAP:
2672	{
2673		struct xe_vma *vma;
2674
2675		if (!op->remap.unmap_done)
2676			vma = gpuva_to_vma(op->base.remap.unmap->va);
2677		else if (op->remap.prev)
2678			vma = op->remap.prev;
2679		else
2680			vma = op->remap.next;
2681
2682		ret = __xe_vma_op_execute(vm, vma, op);
2683		break;
2684	}
2685	case DRM_GPUVA_OP_UNMAP:
2686		ret = __xe_vma_op_execute(vm, gpuva_to_vma(op->base.unmap.va),
2687					  op);
2688		break;
2689	case DRM_GPUVA_OP_PREFETCH:
2690		ret = __xe_vma_op_execute(vm,
2691					  gpuva_to_vma(op->base.prefetch.va),
2692					  op);
2693		break;
2694	default:
2695		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2696	}
2697
2698	return ret;
2699}
2700
2701static void xe_vma_op_cleanup(struct xe_vm *vm, struct xe_vma_op *op)
2702{
2703	bool last = op->flags & XE_VMA_OP_LAST;
2704
2705	if (last) {
2706		while (op->num_syncs--)
2707			xe_sync_entry_cleanup(&op->syncs[op->num_syncs]);
2708		kfree(op->syncs);
2709		if (op->q)
2710			xe_exec_queue_put(op->q);
2711	}
2712	if (!list_empty(&op->link))
2713		list_del(&op->link);
2714	if (op->ops)
2715		drm_gpuva_ops_free(&vm->gpuvm, op->ops);
2716	if (last)
2717		xe_vm_put(vm);
2718}
2719
2720static void xe_vma_op_unwind(struct xe_vm *vm, struct xe_vma_op *op,
2721			     bool post_commit, bool prev_post_commit,
2722			     bool next_post_commit)
2723{
2724	lockdep_assert_held_write(&vm->lock);
2725
2726	switch (op->base.op) {
2727	case DRM_GPUVA_OP_MAP:
2728		if (op->map.vma) {
2729			prep_vma_destroy(vm, op->map.vma, post_commit);
2730			xe_vma_destroy_unlocked(op->map.vma);
2731		}
2732		break;
2733	case DRM_GPUVA_OP_UNMAP:
2734	{
2735		struct xe_vma *vma = gpuva_to_vma(op->base.unmap.va);
2736
2737		if (vma) {
2738			down_read(&vm->userptr.notifier_lock);
2739			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2740			up_read(&vm->userptr.notifier_lock);
2741			if (post_commit)
2742				xe_vm_insert_vma(vm, vma);
2743		}
2744		break;
2745	}
2746	case DRM_GPUVA_OP_REMAP:
2747	{
2748		struct xe_vma *vma = gpuva_to_vma(op->base.remap.unmap->va);
2749
2750		if (op->remap.prev) {
2751			prep_vma_destroy(vm, op->remap.prev, prev_post_commit);
2752			xe_vma_destroy_unlocked(op->remap.prev);
2753		}
2754		if (op->remap.next) {
2755			prep_vma_destroy(vm, op->remap.next, next_post_commit);
2756			xe_vma_destroy_unlocked(op->remap.next);
2757		}
2758		if (vma) {
2759			down_read(&vm->userptr.notifier_lock);
2760			vma->gpuva.flags &= ~XE_VMA_DESTROYED;
2761			up_read(&vm->userptr.notifier_lock);
2762			if (post_commit)
2763				xe_vm_insert_vma(vm, vma);
2764		}
2765		break;
2766	}
2767	case DRM_GPUVA_OP_PREFETCH:
2768		/* Nothing to do */
2769		break;
2770	default:
2771		drm_warn(&vm->xe->drm, "NOT POSSIBLE");
2772	}
2773}
2774
2775static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm,
2776				     struct drm_gpuva_ops **ops,
2777				     int num_ops_list)
2778{
2779	int i;
2780
2781	for (i = num_ops_list - 1; i >= 0; --i) {
2782		struct drm_gpuva_ops *__ops = ops[i];
2783		struct drm_gpuva_op *__op;
2784
2785		if (!__ops)
2786			continue;
2787
2788		drm_gpuva_for_each_op_reverse(__op, __ops) {
2789			struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
2790
2791			xe_vma_op_unwind(vm, op,
2792					 op->flags & XE_VMA_OP_COMMITTED,
2793					 op->flags & XE_VMA_OP_PREV_COMMITTED,
2794					 op->flags & XE_VMA_OP_NEXT_COMMITTED);
2795		}
2796
2797		drm_gpuva_ops_free(&vm->gpuvm, __ops);
2798	}
2799}
2800
2801static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
2802				     struct list_head *ops_list)
2803{
2804	struct xe_vma_op *op, *next;
2805	int err;
2806
2807	lockdep_assert_held_write(&vm->lock);
2808
2809	list_for_each_entry_safe(op, next, ops_list, link) {
2810		err = xe_vma_op_execute(vm, op);
2811		if (err) {
2812			drm_warn(&vm->xe->drm, "VM op(%d) failed with %d",
2813				 op->base.op, err);
2814			/*
2815			 * FIXME: Killing VM rather than proper error handling
2816			 */
2817			xe_vm_kill(vm);
2818			return -ENOSPC;
2819		}
2820		xe_vma_op_cleanup(vm, op);
2821	}
2822
2823	return 0;
2824}
2825
2826#define SUPPORTED_FLAGS	(DRM_XE_VM_BIND_FLAG_NULL | \
2827	 DRM_XE_VM_BIND_FLAG_DUMPABLE)
2828#define XE_64K_PAGE_MASK 0xffffull
2829#define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
2830
2831static int vm_bind_ioctl_check_args(struct xe_device *xe,
2832				    struct drm_xe_vm_bind *args,
2833				    struct drm_xe_vm_bind_op **bind_ops)
2834{
2835	int err;
2836	int i;
2837
2838	if (XE_IOCTL_DBG(xe, args->pad || args->pad2) ||
2839	    XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]))
2840		return -EINVAL;
2841
2842	if (XE_IOCTL_DBG(xe, args->extensions))
2843		return -EINVAL;
2844
2845	if (args->num_binds > 1) {
2846		u64 __user *bind_user =
2847			u64_to_user_ptr(args->vector_of_binds);
2848
2849		*bind_ops = kvmalloc_array(args->num_binds,
2850					   sizeof(struct drm_xe_vm_bind_op),
2851					   GFP_KERNEL | __GFP_ACCOUNT);
2852		if (!*bind_ops)
2853			return -ENOMEM;
2854
2855		err = copy_from_user(*bind_ops, bind_user,
2856				     sizeof(struct drm_xe_vm_bind_op) *
2857				     args->num_binds);
2858		if (XE_IOCTL_DBG(xe, err)) {
2859			err = -EFAULT;
2860			goto free_bind_ops;
2861		}
2862	} else {
2863		*bind_ops = &args->bind;
2864	}
2865
2866	for (i = 0; i < args->num_binds; ++i) {
2867		u64 range = (*bind_ops)[i].range;
2868		u64 addr = (*bind_ops)[i].addr;
2869		u32 op = (*bind_ops)[i].op;
2870		u32 flags = (*bind_ops)[i].flags;
2871		u32 obj = (*bind_ops)[i].obj;
2872		u64 obj_offset = (*bind_ops)[i].obj_offset;
2873		u32 prefetch_region = (*bind_ops)[i].prefetch_mem_region_instance;
2874		bool is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
2875		u16 pat_index = (*bind_ops)[i].pat_index;
2876		u16 coh_mode;
2877
2878		if (XE_IOCTL_DBG(xe, pat_index >= xe->pat.n_entries)) {
2879			err = -EINVAL;
2880			goto free_bind_ops;
2881		}
2882
2883		pat_index = array_index_nospec(pat_index, xe->pat.n_entries);
2884		(*bind_ops)[i].pat_index = pat_index;
2885		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
2886		if (XE_IOCTL_DBG(xe, !coh_mode)) { /* hw reserved */
2887			err = -EINVAL;
2888			goto free_bind_ops;
2889		}
2890
2891		if (XE_WARN_ON(coh_mode > XE_COH_AT_LEAST_1WAY)) {
2892			err = -EINVAL;
2893			goto free_bind_ops;
2894		}
2895
2896		if (XE_IOCTL_DBG(xe, op > DRM_XE_VM_BIND_OP_PREFETCH) ||
2897		    XE_IOCTL_DBG(xe, flags & ~SUPPORTED_FLAGS) ||
2898		    XE_IOCTL_DBG(xe, obj && is_null) ||
2899		    XE_IOCTL_DBG(xe, obj_offset && is_null) ||
2900		    XE_IOCTL_DBG(xe, op != DRM_XE_VM_BIND_OP_MAP &&
2901				 is_null) ||
2902		    XE_IOCTL_DBG(xe, !obj &&
2903				 op == DRM_XE_VM_BIND_OP_MAP &&
2904				 !is_null) ||
2905		    XE_IOCTL_DBG(xe, !obj &&
2906				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2907		    XE_IOCTL_DBG(xe, addr &&
2908				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2909		    XE_IOCTL_DBG(xe, range &&
2910				 op == DRM_XE_VM_BIND_OP_UNMAP_ALL) ||
2911		    XE_IOCTL_DBG(xe, obj &&
2912				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
2913		    XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
2914				 op == DRM_XE_VM_BIND_OP_MAP_USERPTR) ||
2915		    XE_IOCTL_DBG(xe, obj &&
2916				 op == DRM_XE_VM_BIND_OP_PREFETCH) ||
2917		    XE_IOCTL_DBG(xe, prefetch_region &&
2918				 op != DRM_XE_VM_BIND_OP_PREFETCH) ||
2919		    XE_IOCTL_DBG(xe, !(BIT(prefetch_region) &
2920				       xe->info.mem_region_mask)) ||
2921		    XE_IOCTL_DBG(xe, obj &&
2922				 op == DRM_XE_VM_BIND_OP_UNMAP)) {
2923			err = -EINVAL;
2924			goto free_bind_ops;
2925		}
2926
2927		if (XE_IOCTL_DBG(xe, obj_offset & ~PAGE_MASK) ||
2928		    XE_IOCTL_DBG(xe, addr & ~PAGE_MASK) ||
2929		    XE_IOCTL_DBG(xe, range & ~PAGE_MASK) ||
2930		    XE_IOCTL_DBG(xe, !range &&
2931				 op != DRM_XE_VM_BIND_OP_UNMAP_ALL)) {
2932			err = -EINVAL;
2933			goto free_bind_ops;
2934		}
2935	}
2936
2937	return 0;
2938
2939free_bind_ops:
2940	if (args->num_binds > 1)
2941		kvfree(*bind_ops);
2942	return err;
2943}
2944
2945static int vm_bind_ioctl_signal_fences(struct xe_vm *vm,
2946				       struct xe_exec_queue *q,
2947				       struct xe_sync_entry *syncs,
2948				       int num_syncs)
2949{
2950	struct dma_fence *fence;
2951	int i, err = 0;
2952
2953	fence = xe_sync_in_fence_get(syncs, num_syncs,
2954				     to_wait_exec_queue(vm, q), vm);
2955	if (IS_ERR(fence))
2956		return PTR_ERR(fence);
2957
2958	for (i = 0; i < num_syncs; i++)
2959		xe_sync_entry_signal(&syncs[i], NULL, fence);
2960
2961	xe_exec_queue_last_fence_set(to_wait_exec_queue(vm, q), vm,
2962				     fence);
2963	dma_fence_put(fence);
2964
2965	return err;
2966}
2967
2968int xe_vm_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
2969{
2970	struct xe_device *xe = to_xe_device(dev);
2971	struct xe_file *xef = to_xe_file(file);
2972	struct drm_xe_vm_bind *args = data;
2973	struct drm_xe_sync __user *syncs_user;
2974	struct xe_bo **bos = NULL;
2975	struct drm_gpuva_ops **ops = NULL;
2976	struct xe_vm *vm;
2977	struct xe_exec_queue *q = NULL;
2978	u32 num_syncs, num_ufence = 0;
2979	struct xe_sync_entry *syncs = NULL;
2980	struct drm_xe_vm_bind_op *bind_ops;
2981	LIST_HEAD(ops_list);
2982	int err;
2983	int i;
2984
2985	err = vm_bind_ioctl_check_args(xe, args, &bind_ops);
2986	if (err)
2987		return err;
2988
2989	if (args->exec_queue_id) {
2990		q = xe_exec_queue_lookup(xef, args->exec_queue_id);
2991		if (XE_IOCTL_DBG(xe, !q)) {
2992			err = -ENOENT;
2993			goto free_objs;
2994		}
2995
2996		if (XE_IOCTL_DBG(xe, !(q->flags & EXEC_QUEUE_FLAG_VM))) {
2997			err = -EINVAL;
2998			goto put_exec_queue;
2999		}
3000	}
3001
3002	vm = xe_vm_lookup(xef, args->vm_id);
3003	if (XE_IOCTL_DBG(xe, !vm)) {
3004		err = -EINVAL;
3005		goto put_exec_queue;
3006	}
3007
3008	err = down_write_killable(&vm->lock);
3009	if (err)
3010		goto put_vm;
3011
3012	if (XE_IOCTL_DBG(xe, xe_vm_is_closed_or_banned(vm))) {
3013		err = -ENOENT;
3014		goto release_vm_lock;
3015	}
3016
3017	for (i = 0; i < args->num_binds; ++i) {
3018		u64 range = bind_ops[i].range;
3019		u64 addr = bind_ops[i].addr;
3020
3021		if (XE_IOCTL_DBG(xe, range > vm->size) ||
3022		    XE_IOCTL_DBG(xe, addr > vm->size - range)) {
3023			err = -EINVAL;
3024			goto release_vm_lock;
3025		}
3026	}
3027
3028	if (args->num_binds) {
3029		bos = kvcalloc(args->num_binds, sizeof(*bos),
3030			       GFP_KERNEL | __GFP_ACCOUNT);
3031		if (!bos) {
3032			err = -ENOMEM;
3033			goto release_vm_lock;
3034		}
3035
3036		ops = kvcalloc(args->num_binds, sizeof(*ops),
3037			       GFP_KERNEL | __GFP_ACCOUNT);
3038		if (!ops) {
3039			err = -ENOMEM;
3040			goto release_vm_lock;
3041		}
3042	}
3043
3044	for (i = 0; i < args->num_binds; ++i) {
3045		struct drm_gem_object *gem_obj;
3046		u64 range = bind_ops[i].range;
3047		u64 addr = bind_ops[i].addr;
3048		u32 obj = bind_ops[i].obj;
3049		u64 obj_offset = bind_ops[i].obj_offset;
3050		u16 pat_index = bind_ops[i].pat_index;
3051		u16 coh_mode;
3052
3053		if (!obj)
3054			continue;
3055
3056		gem_obj = drm_gem_object_lookup(file, obj);
3057		if (XE_IOCTL_DBG(xe, !gem_obj)) {
3058			err = -ENOENT;
3059			goto put_obj;
3060		}
3061		bos[i] = gem_to_xe_bo(gem_obj);
3062
3063		if (XE_IOCTL_DBG(xe, range > bos[i]->size) ||
3064		    XE_IOCTL_DBG(xe, obj_offset >
3065				 bos[i]->size - range)) {
3066			err = -EINVAL;
3067			goto put_obj;
3068		}
3069
3070		if (bos[i]->flags & XE_BO_INTERNAL_64K) {
3071			if (XE_IOCTL_DBG(xe, obj_offset &
3072					 XE_64K_PAGE_MASK) ||
3073			    XE_IOCTL_DBG(xe, addr & XE_64K_PAGE_MASK) ||
3074			    XE_IOCTL_DBG(xe, range & XE_64K_PAGE_MASK)) {
3075				err = -EINVAL;
3076				goto put_obj;
3077			}
3078		}
3079
3080		coh_mode = xe_pat_index_get_coh_mode(xe, pat_index);
3081		if (bos[i]->cpu_caching) {
3082			if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE &&
3083					 bos[i]->cpu_caching == DRM_XE_GEM_CPU_CACHING_WB)) {
3084				err = -EINVAL;
3085				goto put_obj;
3086			}
3087		} else if (XE_IOCTL_DBG(xe, coh_mode == XE_COH_NONE)) {
3088			/*
3089			 * An imported dma-buf from a different device should
3090			 * require 1-way or 2-way coherency since we don't know
3091			 * how it was mapped on the CPU. Just assume it is
3092			 * potentially cached on the CPU side.
3093			 */
3094			err = -EINVAL;
3095			goto put_obj;
3096		}
3097	}
3098
3099	if (args->num_syncs) {
3100		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
3101		if (!syncs) {
3102			err = -ENOMEM;
3103			goto put_obj;
3104		}
3105	}
3106
3107	syncs_user = u64_to_user_ptr(args->syncs);
3108	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
3109		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
3110					  &syncs_user[num_syncs],
3111					  (xe_vm_in_lr_mode(vm) ?
3112					   SYNC_PARSE_FLAG_LR_MODE : 0) |
3113					  (!args->num_binds ?
3114					   SYNC_PARSE_FLAG_DISALLOW_USER_FENCE : 0));
3115		if (err)
3116			goto free_syncs;
3117
3118		if (xe_sync_is_ufence(&syncs[num_syncs]))
3119			num_ufence++;
3120	}
3121
3122	if (XE_IOCTL_DBG(xe, num_ufence > 1)) {
3123		err = -EINVAL;
3124		goto free_syncs;
3125	}
3126
3127	if (!args->num_binds) {
3128		err = -ENODATA;
3129		goto free_syncs;
3130	}
3131
3132	for (i = 0; i < args->num_binds; ++i) {
3133		u64 range = bind_ops[i].range;
3134		u64 addr = bind_ops[i].addr;
3135		u32 op = bind_ops[i].op;
3136		u32 flags = bind_ops[i].flags;
3137		u64 obj_offset = bind_ops[i].obj_offset;
3138		u32 prefetch_region = bind_ops[i].prefetch_mem_region_instance;
3139		u16 pat_index = bind_ops[i].pat_index;
3140
3141		ops[i] = vm_bind_ioctl_ops_create(vm, bos[i], obj_offset,
3142						  addr, range, op, flags,
3143						  prefetch_region, pat_index);
3144		if (IS_ERR(ops[i])) {
3145			err = PTR_ERR(ops[i]);
3146			ops[i] = NULL;
3147			goto unwind_ops;
3148		}
3149
3150		err = vm_bind_ioctl_ops_parse(vm, q, ops[i], syncs, num_syncs,
3151					      &ops_list,
3152					      i == args->num_binds - 1);
3153		if (err)
3154			goto unwind_ops;
3155	}
3156
3157	/* Nothing to do */
3158	if (list_empty(&ops_list)) {
3159		err = -ENODATA;
3160		goto unwind_ops;
3161	}
3162
3163	xe_vm_get(vm);
3164	if (q)
3165		xe_exec_queue_get(q);
3166
3167	err = vm_bind_ioctl_ops_execute(vm, &ops_list);
3168
3169	up_write(&vm->lock);
3170
3171	if (q)
3172		xe_exec_queue_put(q);
3173	xe_vm_put(vm);
3174
3175	for (i = 0; bos && i < args->num_binds; ++i)
3176		xe_bo_put(bos[i]);
3177
3178	kvfree(bos);
3179	kvfree(ops);
3180	if (args->num_binds > 1)
3181		kvfree(bind_ops);
3182
3183	return err;
3184
3185unwind_ops:
3186	vm_bind_ioctl_ops_unwind(vm, ops, args->num_binds);
3187free_syncs:
3188	if (err == -ENODATA)
3189		err = vm_bind_ioctl_signal_fences(vm, q, syncs, num_syncs);
3190	while (num_syncs--)
3191		xe_sync_entry_cleanup(&syncs[num_syncs]);
3192
3193	kfree(syncs);
3194put_obj:
3195	for (i = 0; i < args->num_binds; ++i)
3196		xe_bo_put(bos[i]);
3197release_vm_lock:
3198	up_write(&vm->lock);
3199put_vm:
3200	xe_vm_put(vm);
3201put_exec_queue:
3202	if (q)
3203		xe_exec_queue_put(q);
3204free_objs:
3205	kvfree(bos);
3206	kvfree(ops);
3207	if (args->num_binds > 1)
3208		kvfree(bind_ops);
3209	return err;
3210}
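/*
 * Illustrative sketch of how userspace drives the bind IOCTL above (not part
 * of the driver): a single MAP of a BO at a fixed GPU virtual address, with
 * no syncs and the default bind queue. vm_id, bo_handle, bo_size, pat_index
 * and fd are assumed to come from earlier VM/BO creation.
 *
 *	struct drm_xe_vm_bind bind = {
 *		.vm_id = vm_id,
 *		.num_binds = 1,
 *		.bind = {
 *			.obj = bo_handle,
 *			.obj_offset = 0,
 *			.range = bo_size,
 *			.addr = 0x1a0000,
 *			.op = DRM_XE_VM_BIND_OP_MAP,
 *			.pat_index = pat_index,
 *		},
 *	};
 *
 *	err = ioctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);
 */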
3211
3212/**
3213 * xe_vm_lock() - Lock the vm's dma_resv object
3214 * @vm: The struct xe_vm whose lock is to be locked
3215 * @intr: Whether the wait for a contended lock should be interruptible
3216 *
3217 * Return: 0 on success, -EINTR if @intr is true and the wait for a
3218 * contended lock was interrupted. If @intr is false, the function
3219 * always returns 0.
3220 */
3221int xe_vm_lock(struct xe_vm *vm, bool intr)
3222{
3223	if (intr)
3224		return dma_resv_lock_interruptible(xe_vm_resv(vm), NULL);
3225
3226	return dma_resv_lock(xe_vm_resv(vm), NULL);
3227}
3228
3229/**
3230 * xe_vm_unlock() - Unlock the vm's dma_resv object
3231 * @vm: The struct xe_vm whose lock is to be released.
3232 *
3233 * Unlock the dma_resv object that was locked by xe_vm_lock().
3234 */
3235void xe_vm_unlock(struct xe_vm *vm)
3236{
3237	dma_resv_unlock(xe_vm_resv(vm));
3238}
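/*
 * Illustrative sketch (not called anywhere in this file) of the expected
 * xe_vm_lock()/xe_vm_unlock() pairing for an interruptible acquisition of the
 * vm's dma_resv; the body between the two calls is a placeholder.
 *
 *	err = xe_vm_lock(vm, true);
 *	if (err)
 *		return err;
 *
 *	... access state protected by xe_vm_resv(vm) ...
 *
 *	xe_vm_unlock(vm);
 */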
3239
3240/**
3241 * xe_vm_invalidate_vma() - invalidate GPU mappings for VMA without a lock
3242 * @vma: VMA to invalidate
3243 *
3244 * Walks the list of page table leaves, zeroing the entries owned by this
3245 * VMA, invalidates the TLBs, and blocks until the TLB invalidation is
3246 * complete.
3247 *
3248 * Return: 0 for success, negative error code otherwise.
3249 */
3250int xe_vm_invalidate_vma(struct xe_vma *vma)
3251{
3252	struct xe_device *xe = xe_vma_vm(vma)->xe;
3253	struct xe_tile *tile;
3254	u32 tile_needs_invalidate = 0;
3255	int seqno[XE_MAX_TILES_PER_DEVICE];
3256	u8 id;
3257	int ret;
3258
3259	xe_assert(xe, !xe_vma_is_null(vma));
3260	trace_xe_vma_invalidate(vma);
3261
3262	/* Check that we don't race with page-table updates */
3263	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
3264		if (xe_vma_is_userptr(vma)) {
3265			WARN_ON_ONCE(!mmu_interval_check_retry
3266				     (&to_userptr_vma(vma)->userptr.notifier,
3267				      to_userptr_vma(vma)->userptr.notifier_seq));
3268			WARN_ON_ONCE(!dma_resv_test_signaled(xe_vm_resv(xe_vma_vm(vma)),
3269							     DMA_RESV_USAGE_BOOKKEEP));
3270
3271		} else {
3272			xe_bo_assert_held(xe_vma_bo(vma));
3273		}
3274	}
3275
3276	for_each_tile(tile, xe, id) {
3277		if (xe_pt_zap_ptes(tile, vma)) {
3278			tile_needs_invalidate |= BIT(id);
3279			xe_device_wmb(xe);
3280			/*
3281			 * FIXME: We potentially need to invalidate multiple
3282			 * GTs within the tile
3283			 */
3284			seqno[id] = xe_gt_tlb_invalidation_vma(tile->primary_gt, NULL, vma);
3285			if (seqno[id] < 0)
3286				return seqno[id];
3287		}
3288	}
3289
3290	for_each_tile(tile, xe, id) {
3291		if (tile_needs_invalidate & BIT(id)) {
3292			ret = xe_gt_tlb_invalidation_wait(tile->primary_gt, seqno[id]);
3293			if (ret < 0)
3294				return ret;
3295		}
3296	}
3297
3298	vma->tile_invalidated = vma->tile_mask;
3299
3300	return 0;
3301}
3302
3303int xe_analyze_vm(struct drm_printer *p, struct xe_vm *vm, int gt_id)
3304{
3305	struct drm_gpuva *gpuva;
3306	bool is_vram;
3307	u64 addr;
3308
3309	if (!down_read_trylock(&vm->lock)) {
3310		drm_printf(p, " Failed to acquire VM lock to dump capture");
3311		return 0;
3312	}
3313	if (vm->pt_root[gt_id]) {
3314		addr = xe_bo_addr(vm->pt_root[gt_id]->bo, 0, XE_PAGE_SIZE);
3315		is_vram = xe_bo_is_vram(vm->pt_root[gt_id]->bo);
3316		drm_printf(p, " VM root: A:0x%llx %s\n", addr,
3317			   is_vram ? "VRAM" : "SYS");
3318	}
3319
3320	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3321		struct xe_vma *vma = gpuva_to_vma(gpuva);
3322		bool is_userptr = xe_vma_is_userptr(vma);
3323		bool is_null = xe_vma_is_null(vma);
3324
3325		if (is_null) {
3326			addr = 0;
3327		} else if (is_userptr) {
3328			struct sg_table *sg = to_userptr_vma(vma)->userptr.sg;
3329			struct xe_res_cursor cur;
3330
3331			if (sg) {
3332				xe_res_first_sg(sg, 0, XE_PAGE_SIZE, &cur);
3333				addr = xe_res_dma(&cur);
3334			} else {
3335				addr = 0;
3336			}
3337		} else {
3338			addr = __xe_bo_addr(xe_vma_bo(vma), 0, XE_PAGE_SIZE);
3339			is_vram = xe_bo_is_vram(xe_vma_bo(vma));
3340		}
3341		drm_printf(p, " [%016llx-%016llx] S:0x%016llx A:%016llx %s\n",
3342			   xe_vma_start(vma), xe_vma_end(vma) - 1,
3343			   xe_vma_size(vma),
3344			   addr, is_null ? "NULL" : is_userptr ? "USR" :
3345			   is_vram ? "VRAM" : "SYS");
3346	}
3347	up_read(&vm->lock);
3348
3349	return 0;
3350}
3351
3352struct xe_vm_snapshot {
3353	unsigned long num_snaps;
3354	struct {
3355		u64 ofs, bo_ofs;
3356		unsigned long len;
3357		struct xe_bo *bo;
3358		void *data;
3359		struct mm_struct *mm;
3360	} snap[];
3361};
3362
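/*
 * Illustrative sketch of the snapshot lifecycle implemented below: the
 * capture step only records ranges and takes BO/mm references (GFP_NOWAIT,
 * suitable for the crash capture path), the delayed step copies the actual
 * contents, and print/free consume and release the snapshot. Callers must
 * handle a NULL return from the capture step.
 *
 *	snap = xe_vm_snapshot_capture(vm);
 *	...
 *	xe_vm_snapshot_capture_delayed(snap);
 *	xe_vm_snapshot_print(snap, p);
 *	xe_vm_snapshot_free(snap);
 */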
3363struct xe_vm_snapshot *xe_vm_snapshot_capture(struct xe_vm *vm)
3364{
3365	unsigned long num_snaps = 0, i;
3366	struct xe_vm_snapshot *snap = NULL;
3367	struct drm_gpuva *gpuva;
3368
3369	if (!vm)
3370		return NULL;
3371
3372	mutex_lock(&vm->snap_mutex);
3373	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3374		if (gpuva->flags & XE_VMA_DUMPABLE)
3375			num_snaps++;
3376	}
3377
3378	if (num_snaps)
3379		snap = kvzalloc(offsetof(struct xe_vm_snapshot, snap[num_snaps]), GFP_NOWAIT);
3380	if (!snap)
3381		goto out_unlock;
3382
3383	snap->num_snaps = num_snaps;
3384	i = 0;
3385	drm_gpuvm_for_each_va(gpuva, &vm->gpuvm) {
3386		struct xe_vma *vma = gpuva_to_vma(gpuva);
3387		struct xe_bo *bo = vma->gpuva.gem.obj ?
3388			gem_to_xe_bo(vma->gpuva.gem.obj) : NULL;
3389
3390		if (!(gpuva->flags & XE_VMA_DUMPABLE))
3391			continue;
3392
3393		snap->snap[i].ofs = xe_vma_start(vma);
3394		snap->snap[i].len = xe_vma_size(vma);
3395		if (bo) {
3396			snap->snap[i].bo = xe_bo_get(bo);
3397			snap->snap[i].bo_ofs = xe_vma_bo_offset(vma);
3398		} else if (xe_vma_is_userptr(vma)) {
3399			struct mm_struct *mm =
3400				to_userptr_vma(vma)->userptr.notifier.mm;
3401
3402			if (mmget_not_zero(mm))
3403				snap->snap[i].mm = mm;
3404			else
3405				snap->snap[i].data = ERR_PTR(-EFAULT);
3406
3407			snap->snap[i].bo_ofs = xe_vma_userptr(vma);
3408		} else {
3409			snap->snap[i].data = ERR_PTR(-ENOENT);
3410		}
3411		i++;
3412	}
3413
3414out_unlock:
3415	mutex_unlock(&vm->snap_mutex);
3416	return snap;
3417}
3418
3419void xe_vm_snapshot_capture_delayed(struct xe_vm_snapshot *snap)
3420{
3421	for (int i = 0; i < snap->num_snaps; i++) {
3422		struct xe_bo *bo = snap->snap[i].bo;
3423		struct iosys_map src;
3424		int err;
3425
3426		if (IS_ERR(snap->snap[i].data))
3427			continue;
3428
3429		snap->snap[i].data = kvmalloc(snap->snap[i].len, GFP_USER);
3430		if (!snap->snap[i].data) {
3431			snap->snap[i].data = ERR_PTR(-ENOMEM);
3432			goto cleanup_bo;
3433		}
3434
3435		if (bo) {
3436			dma_resv_lock(bo->ttm.base.resv, NULL);
3437			err = ttm_bo_vmap(&bo->ttm, &src);
3438			if (!err) {
3439				xe_map_memcpy_from(xe_bo_device(bo),
3440						   snap->snap[i].data,
3441						   &src, snap->snap[i].bo_ofs,
3442						   snap->snap[i].len);
3443				ttm_bo_vunmap(&bo->ttm, &src);
3444			}
3445			dma_resv_unlock(bo->ttm.base.resv);
3446		} else {
3447			void __user *userptr = (void __user *)(size_t)snap->snap[i].bo_ofs;
3448
3449			kthread_use_mm(snap->snap[i].mm);
3450			if (!copy_from_user(snap->snap[i].data, userptr, snap->snap[i].len))
3451				err = 0;
3452			else
3453				err = -EFAULT;
3454			kthread_unuse_mm(snap->snap[i].mm);
3455
3456			mmput(snap->snap[i].mm);
3457			snap->snap[i].mm = NULL;
3458		}
3459
3460		if (err) {
3461			kvfree(snap->snap[i].data);
3462			snap->snap[i].data = ERR_PTR(err);
3463		}
3464
3465cleanup_bo:
3466		xe_bo_put(bo);
3467		snap->snap[i].bo = NULL;
3468	}
3469}
3470
3471void xe_vm_snapshot_print(struct xe_vm_snapshot *snap, struct drm_printer *p)
3472{
3473	unsigned long i, j;
3474
3475	for (i = 0; i < snap->num_snaps; i++) {
3476		if (IS_ERR(snap->snap[i].data))
3477			goto uncaptured;
3478
3479		drm_printf(p, "[%llx].length: 0x%lx\n", snap->snap[i].ofs, snap->snap[i].len);
3480		drm_printf(p, "[%llx].data: ",
3481			   snap->snap[i].ofs);
3482
3483		for (j = 0; j < snap->snap[i].len; j += sizeof(u32)) {
3484			u32 *val = snap->snap[i].data + j;
3485			char dumped[ASCII85_BUFSZ];
3486
3487			drm_puts(p, ascii85_encode(*val, dumped));
3488		}
3489
3490		drm_puts(p, "\n");
3491		continue;
3492
3493uncaptured:
3494		drm_printf(p, "Unable to capture range [%llx-%llx]: %li\n",
3495			   snap->snap[i].ofs, snap->snap[i].ofs + snap->snap[i].len - 1,
3496			   PTR_ERR(snap->snap[i].data));
3497	}
3498}
3499
3500void xe_vm_snapshot_free(struct xe_vm_snapshot *snap)
3501{
3502	unsigned long i;
3503
3504	if (!snap)
3505		return;
3506
3507	for (i = 0; i < snap->num_snaps; i++) {
3508		if (!IS_ERR(snap->snap[i].data))
3509			kvfree(snap->snap[i].data);
3510		xe_bo_put(snap->snap[i].bo);
3511		if (snap->snap[i].mm)
3512			mmput(snap->snap[i].mm);
3513	}
3514	kvfree(snap);
3515}
3516