// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/types.h>
#include <linux/hmm.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/migrate.h>
#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_res_cursor.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
#include "kfd_smi_events.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_migrate: " fmt

static uint64_t
svm_migrate_direct_mapping_addr(struct amdgpu_device *adev, uint64_t addr)
{
	return addr + amdgpu_ttm_domain_start(adev, TTM_PL_VRAM);
}

static int
svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages,
		     dma_addr_t *addr, uint64_t *gart_addr, uint64_t flags)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_job *job;
	unsigned int num_dw, num_bytes;
	struct dma_fence *fence;
	uint64_t src_addr, dst_addr;
	uint64_t pte_flags;
	void *cpu_addr;
	int r;

	/* use gart window 0 */
	*gart_addr = adev->gmc.gart_start;

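	/* One IB holds the buffer-copy command followed by npages 8-byte
	 * GART PTEs for the pages to map.
	 */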
	num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
	num_bytes = npages * 8;

	r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr,
				     AMDGPU_FENCE_OWNER_UNDEFINED,
				     num_dw * 4 + num_bytes,
				     AMDGPU_IB_POOL_DELAYED,
				     &job);
	if (r)
		return r;

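	/* The PTEs staged at the end of the IB are copied by SDMA into the
	 * GART table BO, which backs GART window 0.
	 */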
	src_addr = num_dw * 4;
	src_addr += job->ibs[0].gpu_addr;

	dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
				dst_addr, num_bytes, false);

	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
	WARN_ON(job->ibs[0].length_dw > num_dw);

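	/* System pages mapped through GART are snooped and writeable unless
	 * the caller requested a read-only GPU mapping.
	 */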
	pte_flags = AMDGPU_PTE_VALID | AMDGPU_PTE_READABLE;
	pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
	if (!(flags & KFD_IOCTL_SVM_FLAG_GPU_RO))
		pte_flags |= AMDGPU_PTE_WRITEABLE;
	pte_flags |= adev->gart.gart_pte_flags;

	cpu_addr = &job->ibs[0].ptr[num_dw];

	amdgpu_gart_map(adev, 0, npages, addr, pte_flags, cpu_addr);
	fence = amdgpu_job_submit(job);
	dma_fence_put(fence);

	return r;
}

/**
 * svm_migrate_copy_memory_gart - use sdma to copy data between ram and vram
 *
 * @adev: amdgpu device the sdma ring is running on
 * @sys: system DMA pointer to be copied
 * @vram: vram destination DMA pointer
 * @npages: number of pages to copy
 * @direction: enum MIGRATION_COPY_DIR
 * @mfence: output, sdma fence to signal after sdma is done
 *
 * The ram address uses continuous GART table entries mapped to the ram pages,
 * the vram address uses a direct mapping of the vram pages, which must be
 * npages continuous pages.
 * GART updates and the sdma copies use the same buffer-copy ring. The copy is
 * split into transfers of at most GTT_MAX_PAGES each, all sdma operations are
 * serialized, and the fence of the last sdma operation is returned so the
 * caller can wait for the whole copy to finish.
 *
 * Context: Process context, takes and releases gtt_window_lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */

static int
svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
			     uint64_t *vram, uint64_t npages,
			     enum MIGRATION_COPY_DIR direction,
			     struct dma_fence **mfence)
{
	const uint64_t GTT_MAX_PAGES = AMDGPU_GTT_MAX_TRANSFER_SIZE;
	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
	uint64_t gart_s, gart_d;
	struct dma_fence *next;
	uint64_t size;
	int r;

	mutex_lock(&adev->mman.gtt_window_lock);

	while (npages) {
		size = min(GTT_MAX_PAGES, npages);

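		/* System pages are mapped through a GART window for this
		 * chunk; vram pages are addressed directly in the vram
		 * aperture.
		 */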
		if (direction == FROM_VRAM_TO_RAM) {
			gart_s = svm_migrate_direct_mapping_addr(adev, *vram);
			r = svm_migrate_gart_map(ring, size, sys, &gart_d, 0);

		} else if (direction == FROM_RAM_TO_VRAM) {
			r = svm_migrate_gart_map(ring, size, sys, &gart_s,
						 KFD_IOCTL_SVM_FLAG_GPU_RO);
			gart_d = svm_migrate_direct_mapping_addr(adev, *vram);
		}
		if (r) {
			dev_err(adev->dev, "fail %d create gart mapping\n", r);
			goto out_unlock;
		}

		r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE,
				       NULL, &next, false, true, false);
		if (r) {
			dev_err(adev->dev, "fail %d to copy memory\n", r);
			goto out_unlock;
		}

		dma_fence_put(*mfence);
		*mfence = next;
		npages -= size;
		if (npages) {
			sys += size;
			vram += size;
		}
	}

out_unlock:
	mutex_unlock(&adev->mman.gtt_window_lock);

	return r;
}

/**
 * svm_migrate_copy_done - wait for the sdma memory copy to finish
 *
 * @adev: amdgpu device the sdma memory copy is executing on
 * @mfence: migrate fence
 *
 * Wait for the dma fence to be signaled. If the copy was split into multiple
 * sdma operations, this is the fence of the last sdma operation.
 *
 * Context: called after svm_migrate_copy_memory
 *
 * Return:
 * 0		- success
 * otherwise	- error code from dma fence signal
 */
static int
svm_migrate_copy_done(struct amdgpu_device *adev, struct dma_fence *mfence)
{
	int r = 0;

	if (mfence) {
		r = dma_fence_wait(mfence, false);
		dma_fence_put(mfence);
		pr_debug("sdma copy memory fence done\n");
	}

	return r;
}

unsigned long
svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr)
{
	return (addr + adev->kfd.pgmap.range.start) >> PAGE_SHIFT;
}

static void
svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
{
	struct page *page;

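	/* The zone device page holds a reference on the backing svm_bo,
	 * dropped again in svm_migrate_page_free().
	 */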
	page = pfn_to_page(pfn);
	svm_range_bo_ref(prange->svm_bo);
	page->zone_device_data = prange->svm_bo;
	zone_device_page_init(page);
}

static void
svm_migrate_put_vram_page(struct amdgpu_device *adev, unsigned long addr)
{
	struct page *page;

	page = pfn_to_page(svm_migrate_addr_to_pfn(adev, addr));
	unlock_page(page);
	put_page(page);
}

static unsigned long
svm_migrate_addr(struct amdgpu_device *adev, struct page *page)
{
	unsigned long addr;

	addr = page_to_pfn(page) << PAGE_SHIFT;
	return (addr - adev->kfd.pgmap.range.start);
}

static struct page *
svm_migrate_get_sys_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (page)
		lock_page(page);

	return page;
}

static void svm_migrate_put_sys_page(unsigned long addr)
{
	struct page *page;

	page = pfn_to_page(addr >> PAGE_SHIFT);
	unlock_page(page);
	put_page(page);
}

static unsigned long svm_migrate_successful_pages(struct migrate_vma *migrate)
{
	unsigned long cpages = 0;
	unsigned long i;

	for (i = 0; i < migrate->npages; i++) {
		if (migrate->src[i] & MIGRATE_PFN_VALID &&
		    migrate->src[i] & MIGRATE_PFN_MIGRATE)
			cpages++;
	}
	return cpages;
}

static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma *migrate)
{
	unsigned long upages = 0;
	unsigned long i;

	for (i = 0; i < migrate->npages; i++) {
		if (migrate->src[i] & MIGRATE_PFN_VALID &&
		    !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
			upages++;
	}
	return upages;
}

static int
svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange,
			 struct migrate_vma *migrate, struct dma_fence **mfence,
			 dma_addr_t *scratch, uint64_t ttm_res_offset)
{
	uint64_t npages = migrate->cpages;
	struct amdgpu_device *adev = node->adev;
	struct device *dev = adev->dev;
	struct amdgpu_res_cursor cursor;
	dma_addr_t *src;
	uint64_t *dst;
	uint64_t i, j;
	int r;

	pr_debug("svms 0x%p [0x%lx 0x%lx 0x%llx]\n", prange->svms, prange->start,
		 prange->last, ttm_res_offset);

	src = scratch;
	dst = (uint64_t *)(scratch + npages);

	amdgpu_res_first(prange->ttm_res, ttm_res_offset,
			 npages << PAGE_SHIFT, &cursor);
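	/* dma map the source system pages and batch them into sdma copies,
	 * one batch per contiguous vram chunk from the resource cursor;
	 * pages without a usable source page flush the pending batch and
	 * are skipped.
	 */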
	for (i = j = 0; i < npages; i++) {
		struct page *spage;

		dst[i] = cursor.start + (j << PAGE_SHIFT);
		migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
		svm_migrate_get_vram_page(prange, migrate->dst[i]);
		migrate->dst[i] = migrate_pfn(migrate->dst[i]);

		spage = migrate_pfn_to_page(migrate->src[i]);
		if (spage && !is_zone_device_page(spage)) {
			src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
					      DMA_TO_DEVICE);
			r = dma_mapping_error(dev, src[i]);
			if (r) {
				dev_err(dev, "%s: fail %d dma_map_page\n",
					__func__, r);
				goto out_free_vram_pages;
			}
		} else {
			if (j) {
				r = svm_migrate_copy_memory_gart(
						adev, src + i - j,
						dst + i - j, j,
						FROM_RAM_TO_VRAM,
						mfence);
				if (r)
					goto out_free_vram_pages;
				amdgpu_res_next(&cursor, (j + 1) << PAGE_SHIFT);
				j = 0;
			} else {
				amdgpu_res_next(&cursor, PAGE_SIZE);
			}
			continue;
		}

		pr_debug_ratelimited("dma mapping src to 0x%llx, pfn 0x%lx\n",
				     src[i] >> PAGE_SHIFT, page_to_pfn(spage));

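		/* Flush the pending batch when the current contiguous vram
		 * chunk of the cursor is exhausted.
		 */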
		if (j >= (cursor.size >> PAGE_SHIFT) - 1 && i < npages - 1) {
			r = svm_migrate_copy_memory_gart(adev, src + i - j,
							 dst + i - j, j + 1,
							 FROM_RAM_TO_VRAM,
							 mfence);
			if (r)
				goto out_free_vram_pages;
			amdgpu_res_next(&cursor, (j + 1) * PAGE_SIZE);
			j = 0;
		} else {
			j++;
		}
	}

	r = svm_migrate_copy_memory_gart(adev, src + i - j, dst + i - j, j,
					 FROM_RAM_TO_VRAM, mfence);

out_free_vram_pages:
	if (r) {
		pr_debug("failed %d to copy memory to vram\n", r);
		while (i--) {
			svm_migrate_put_vram_page(adev, dst[i]);
			migrate->dst[i] = 0;
		}
	}

#ifdef DEBUG_FORCE_MIXED_DOMAINS
	for (i = 0, j = 0; i < npages; i += 4, j++) {
		if (j & 1)
			continue;
		svm_migrate_put_vram_page(adev, dst[i]);
		migrate->dst[i] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 1]);
		migrate->dst[i + 1] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 2]);
		migrate->dst[i + 2] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 3]);
		migrate->dst[i + 3] = 0;
	}
#endif

	return r;
}

static long
svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
			struct vm_area_struct *vma, uint64_t start,
			uint64_t end, uint32_t trigger, uint64_t ttm_res_offset)
{
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
	uint64_t npages = (end - start) >> PAGE_SHIFT;
	struct amdgpu_device *adev = node->adev;
	struct kfd_process_device *pdd;
	struct dma_fence *mfence = NULL;
	struct migrate_vma migrate = { 0 };
	unsigned long cpages = 0;
	dma_addr_t *scratch;
	void *buf;
	int r = -ENOMEM;

	memset(&migrate, 0, sizeof(migrate));
	migrate.vma = vma;
	migrate.start = start;
	migrate.end = end;
	migrate.flags = MIGRATE_VMA_SELECT_SYSTEM;
	migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);

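	/* One allocation backs the migrate src and dst pfn arrays plus the
	 * scratch space used for dma addresses and vram addresses.
	 */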
	buf = kvcalloc(npages,
		       2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
		       GFP_KERNEL);
	if (!buf)
		goto out;

	migrate.src = buf;
	migrate.dst = migrate.src + npages;
	scratch = (dma_addr_t *)(migrate.dst + npages);

	kfd_smi_event_migration_start(node, p->lead_thread->pid,
				      start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				      0, node->id, prange->prefetch_loc,
				      prange->preferred_loc, trigger);

	r = migrate_vma_setup(&migrate);
	if (r) {
		dev_err(adev->dev, "%s: vma setup fail %d range [0x%lx 0x%lx]\n",
			__func__, r, prange->start, prange->last);
		goto out_free;
	}

	cpages = migrate.cpages;
	if (!cpages) {
		pr_debug("failed collect migrate sys pages [0x%lx 0x%lx]\n",
			 prange->start, prange->last);
		goto out_free;
	}
	if (cpages != npages)
		pr_debug("partial migration, 0x%lx/0x%llx pages migrated\n",
			 cpages, npages);
	else
		pr_debug("0x%lx pages migrated\n", cpages);

	r = svm_migrate_copy_to_vram(node, prange, &migrate, &mfence, scratch, ttm_res_offset);
	migrate_vma_pages(&migrate);

	pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
		svm_migrate_successful_pages(&migrate), cpages, migrate.npages);

	svm_migrate_copy_done(adev, mfence);
	migrate_vma_finalize(&migrate);

	kfd_smi_event_migration_end(node, p->lead_thread->pid,
				    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				    0, node->id, trigger);

	svm_range_dma_unmap(adev->dev, scratch, 0, npages);

out_free:
	kvfree(buf);
out:
	if (!r && cpages) {
		pdd = svm_range_get_pdd_by_node(prange, node);
		if (pdd)
			WRITE_ONCE(pdd->page_in, pdd->page_in + cpages);

		return cpages;
	}
	return r;
}

/**
 * svm_migrate_ram_to_vram - migrate svm range from system to device
 * @prange: range structure
 * @best_loc: the device to migrate to
 * @mm: the process mm structure
 * @trigger: reason for the migration
 *
 * Context: Process context, caller holds mmap read lock, svms lock, prange lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
			struct mm_struct *mm, uint32_t trigger)
{
	unsigned long addr, start, end;
	struct vm_area_struct *vma;
	uint64_t ttm_res_offset;
	struct kfd_node *node;
	unsigned long cpages = 0;
	long r = 0;

	if (prange->actual_loc == best_loc) {
		pr_debug("svms 0x%p [0x%lx 0x%lx] already on best_loc 0x%x\n",
			 prange->svms, prange->start, prange->last, best_loc);
		return 0;
	}

	node = svm_range_get_node_by_id(prange, best_loc);
	if (!node) {
		pr_debug("failed to get kfd node by id 0x%x\n", best_loc);
		return -ENODEV;
	}

	pr_debug("svms 0x%p [0x%lx 0x%lx] to gpu 0x%x\n", prange->svms,
		 prange->start, prange->last, best_loc);

	start = prange->start << PAGE_SHIFT;
	end = (prange->last + 1) << PAGE_SHIFT;

	r = amdgpu_amdkfd_reserve_mem_limit(node->adev,
					prange->npages * PAGE_SIZE,
					KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
					node->xcp ? node->xcp->id : 0);
	if (r) {
		dev_dbg(node->adev->dev, "failed to reserve VRAM, r: %ld\n", r);
		return -ENOSPC;
	}

	r = svm_range_vram_node_new(node, prange, true);
	if (r) {
		dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r);
		goto out;
	}
	ttm_res_offset = prange->offset << PAGE_SHIFT;

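	/* The prange may span several vmas; migrate it one vma at a time */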
	for (addr = start; addr < end;) {
		unsigned long next;

		vma = vma_lookup(mm, addr);
		if (!vma)
			break;

		next = min(vma->vm_end, end);
		r = svm_migrate_vma_to_vram(node, prange, vma, addr, next, trigger, ttm_res_offset);
		if (r < 0) {
			pr_debug("failed %ld to migrate\n", r);
			break;
		} else {
			cpages += r;
		}
		ttm_res_offset += next - addr;
		addr = next;
	}

	if (cpages) {
		prange->actual_loc = best_loc;
		svm_range_free_dma_mappings(prange, true);
	} else {
		svm_range_vram_node_free(prange);
	}

out:
	amdgpu_amdkfd_unreserve_mem_limit(node->adev,
					prange->npages * PAGE_SIZE,
					KFD_IOC_ALLOC_MEM_FLAGS_VRAM,
					node->xcp ? node->xcp->id : 0);
	return r < 0 ? r : 0;
}

static void svm_migrate_page_free(struct page *page)
{
	struct svm_range_bo *svm_bo = page->zone_device_data;

	if (svm_bo) {
		pr_debug_ratelimited("ref: %d\n", kref_read(&svm_bo->kref));
		svm_range_bo_unref_async(svm_bo);
	}
}

static int
svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
			struct migrate_vma *migrate, struct dma_fence **mfence,
			dma_addr_t *scratch, uint64_t npages)
{
	struct device *dev = adev->dev;
	uint64_t *src;
	dma_addr_t *dst;
	struct page *dpage;
	uint64_t i = 0, j;
	uint64_t addr;
	int r = 0;

	pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
		 prange->last);

	addr = prange->start << PAGE_SHIFT;

	src = (uint64_t *)(scratch + npages);
	dst = scratch;

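	/* Coalesce contiguous vram source pages into one sdma copy; a page
	 * that is already in system memory flushes the pending copy and is
	 * skipped.
	 */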
	for (i = 0, j = 0; i < npages; i++, addr += PAGE_SIZE) {
		struct page *spage;

		spage = migrate_pfn_to_page(migrate->src[i]);
		if (!spage || !is_zone_device_page(spage)) {
			pr_debug("invalid page. Could be in CPU already svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);
			if (j) {
				r = svm_migrate_copy_memory_gart(adev, dst + i - j,
								 src + i - j, j,
								 FROM_VRAM_TO_RAM,
								 mfence);
				if (r)
					goto out_oom;
				j = 0;
			}
			continue;
		}
		src[i] = svm_migrate_addr(adev, spage);
		if (j > 0 && src[i] != src[i - 1] + PAGE_SIZE) {
			r = svm_migrate_copy_memory_gart(adev, dst + i - j,
							 src + i - j, j,
							 FROM_VRAM_TO_RAM,
							 mfence);
			if (r)
				goto out_oom;
			j = 0;
		}

		dpage = svm_migrate_get_sys_page(migrate->vma, addr);
		if (!dpage) {
			pr_debug("failed get page svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);
			r = -ENOMEM;
			goto out_oom;
		}

		dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		r = dma_mapping_error(dev, dst[i]);
		if (r) {
			dev_err(adev->dev, "%s: fail %d dma_map_page\n", __func__, r);
			goto out_oom;
		}

		pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n",
				     dst[i] >> PAGE_SHIFT, page_to_pfn(dpage));

		migrate->dst[i] = migrate_pfn(page_to_pfn(dpage));
		j++;
	}

	r = svm_migrate_copy_memory_gart(adev, dst + i - j, src + i - j, j,
					 FROM_VRAM_TO_RAM, mfence);

out_oom:
	if (r) {
		pr_debug("failed %d copy to ram\n", r);
		while (i--) {
			svm_migrate_put_sys_page(dst[i]);
			migrate->dst[i] = 0;
		}
	}

	return r;
}

/**
 * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
 *
 * @node: kfd node the range is migrated from
 * @prange: svm range structure
 * @vma: vm_area_struct that the range [start, end] belongs to
 * @start: range start virtual address in bytes
 * @end: range end virtual address in bytes
 * @trigger: reason for the migration
 * @fault_page: vmf->page of the faulting address, set when called from the
 *              svm_migrate_to_ram() CPU page fault handler
 *
 * Context: Process context, caller holds mmap read lock, prange->migrate_mutex
 *
 * Return:
 *   0 - success with all pages migrated
 *   negative values - indicate error
 *   positive values - partial migration, number of pages not migrated
 */
static long
svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
		       struct vm_area_struct *vma, uint64_t start, uint64_t end,
		       uint32_t trigger, struct page *fault_page)
{
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
	uint64_t npages = (end - start) >> PAGE_SHIFT;
	unsigned long upages = npages;
	unsigned long cpages = 0;
	struct amdgpu_device *adev = node->adev;
	struct kfd_process_device *pdd;
	struct dma_fence *mfence = NULL;
	struct migrate_vma migrate = { 0 };
	dma_addr_t *scratch;
	void *buf;
	int r = -ENOMEM;

	memset(&migrate, 0, sizeof(migrate));
	migrate.vma = vma;
	migrate.start = start;
	migrate.end = end;
	migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
	if (adev->gmc.xgmi.connected_to_cpu)
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
	else
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	buf = kvcalloc(npages,
		       2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
		       GFP_KERNEL);
	if (!buf)
		goto out;

	migrate.src = buf;
	migrate.dst = migrate.src + npages;
	migrate.fault_page = fault_page;
	scratch = (dma_addr_t *)(migrate.dst + npages);

	kfd_smi_event_migration_start(node, p->lead_thread->pid,
				      start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				      node->id, 0, prange->prefetch_loc,
				      prange->preferred_loc, trigger);

	r = migrate_vma_setup(&migrate);
	if (r) {
		dev_err(adev->dev, "%s: vma setup fail %d range [0x%lx 0x%lx]\n",
			__func__, r, prange->start, prange->last);
		goto out_free;
	}

	cpages = migrate.cpages;
	if (!cpages) {
		pr_debug("failed collect migrate device pages [0x%lx 0x%lx]\n",
			 prange->start, prange->last);
		upages = svm_migrate_unsuccessful_pages(&migrate);
		goto out_free;
	}
	if (cpages != npages)
		pr_debug("partial migration, 0x%lx/0x%llx pages migrated\n",
			 cpages, npages);
	else
		pr_debug("0x%lx pages migrated\n", cpages);

	r = svm_migrate_copy_to_ram(adev, prange, &migrate, &mfence,
				    scratch, npages);
	migrate_vma_pages(&migrate);

	upages = svm_migrate_unsuccessful_pages(&migrate);
	pr_debug("unsuccessful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
		 upages, cpages, migrate.npages);

	svm_migrate_copy_done(adev, mfence);
	migrate_vma_finalize(&migrate);

	kfd_smi_event_migration_end(node, p->lead_thread->pid,
				    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				    node->id, 0, trigger);

	svm_range_dma_unmap(adev->dev, scratch, 0, npages);

out_free:
	kvfree(buf);
out:
	if (!r && cpages) {
		pdd = svm_range_get_pdd_by_node(prange, node);
		if (pdd)
			WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
	}
	return r ? r : upages;
}

/**
 * svm_migrate_vram_to_ram - migrate svm range from device to system
 * @prange: range structure
 * @mm: process mm, use current->mm if NULL
 * @trigger: reason for the migration
 * @fault_page: vmf->page of the faulting address, set when called from the
 *              svm_migrate_to_ram() CPU page fault handler
 *
 * Context: Process context, caller holds mmap read lock, prange->migrate_mutex
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
			    uint32_t trigger, struct page *fault_page)
{
	struct kfd_node *node;
	struct vm_area_struct *vma;
	unsigned long addr;
	unsigned long start;
	unsigned long end;
	unsigned long upages = 0;
	long r = 0;

	if (!prange->actual_loc) {
		pr_debug("[0x%lx 0x%lx] already migrated to ram\n",
			 prange->start, prange->last);
		return 0;
	}

	node = svm_range_get_node_by_id(prange, prange->actual_loc);
	if (!node) {
		pr_debug("failed to get kfd node by id 0x%x\n", prange->actual_loc);
		return -ENODEV;
	}
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu 0x%x to ram\n",
		 prange->svms, prange, prange->start, prange->last,
		 prange->actual_loc);

	start = prange->start << PAGE_SHIFT;
	end = (prange->last + 1) << PAGE_SHIFT;

	for (addr = start; addr < end;) {
		unsigned long next;

		vma = vma_lookup(mm, addr);
		if (!vma) {
			pr_debug("failed to find vma for prange %p\n", prange);
			r = -EFAULT;
			break;
		}

		next = min(vma->vm_end, end);
		r = svm_migrate_vma_to_ram(node, prange, vma, addr, next, trigger,
			fault_page);
		if (r < 0) {
			pr_debug("failed %ld to migrate prange %p\n", r, prange);
			break;
		} else {
			upages += r;
		}
		addr = next;
	}

	if (r >= 0 && !upages) {
		svm_range_vram_node_free(prange);
		prange->actual_loc = 0;
	}

	return r < 0 ? r : 0;
}

/**
 * svm_migrate_vram_to_vram - migrate svm range from device to device
 * @prange: range structure
 * @best_loc: the device to migrate to
 * @mm: process mm, use current->mm if NULL
 * @trigger: reason for the migration
 *
 * Context: Process context, caller holds mmap read lock, svms lock, prange lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
			 struct mm_struct *mm, uint32_t trigger)
{
	int r, retries = 3;

	/*
	 * TODO: for both devices with PCIe large bar or on same xgmi hive, skip
	 * system memory as migration bridge
	 */

	pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);

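	/* Bounce through system memory: migrate vram to ram, retrying while
	 * pages remain on the source gpu, then migrate ram to the new gpu.
	 */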
	do {
		r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
		if (r)
			return r;
	} while (prange->actual_loc && --retries);

	if (prange->actual_loc)
		return -EDEADLK;

	return svm_migrate_ram_to_vram(prange, best_loc, mm, trigger);
}

int
svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
		    struct mm_struct *mm, uint32_t trigger)
{
	if (!prange->actual_loc)
		return svm_migrate_ram_to_vram(prange, best_loc, mm, trigger);
	else
		return svm_migrate_vram_to_vram(prange, best_loc, mm, trigger);
}

/**
 * svm_migrate_to_ram - CPU page fault handler
 * @vmf: CPU vm fault information, vma and address
 *
 * Context: vm fault handler, caller holds the mmap read lock
 *
 * Return:
 * 0 - OK
 * VM_FAULT_SIGBUS - notify the application of a SIGBUS page fault
 */
static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long addr = vmf->address;
	struct svm_range_bo *svm_bo;
	enum svm_work_list_ops op;
	struct svm_range *parent;
	struct svm_range *prange;
	struct kfd_process *p;
	struct mm_struct *mm;
	int r = 0;

	svm_bo = vmf->page->zone_device_data;
	if (!svm_bo) {
		pr_debug("failed get device page at addr 0x%lx\n", addr);
		return VM_FAULT_SIGBUS;
	}
	if (!mmget_not_zero(svm_bo->eviction_fence->mm)) {
		pr_debug("addr 0x%lx of process mm is destroyed\n", addr);
		return VM_FAULT_SIGBUS;
	}

	mm = svm_bo->eviction_fence->mm;
	if (mm != vmf->vma->vm_mm)
		pr_debug("addr 0x%lx is COW mapping in child process\n", addr);

	p = kfd_lookup_process_by_mm(mm);
	if (!p) {
		pr_debug("failed find process at fault address 0x%lx\n", addr);
		r = VM_FAULT_SIGBUS;
		goto out_mmput;
	}
	if (READ_ONCE(p->svms.faulting_task) == current) {
		pr_debug("skipping ram migration\n");
		r = 0;
		goto out_unref_process;
	}

	pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr);
	addr >>= PAGE_SHIFT;

	mutex_lock(&p->svms.lock);

	prange = svm_range_from_addr(&p->svms, addr, &parent);
	if (!prange) {
		pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, addr);
		r = -EFAULT;
		goto out_unlock_svms;
	}

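	/* The faulting prange may be a child of parent; take the parent
	 * migrate_mutex first, then the child with a nested lockdep class.
	 */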
	mutex_lock(&parent->migrate_mutex);
	if (prange != parent)
		mutex_lock_nested(&prange->migrate_mutex, 1);

	if (!prange->actual_loc)
		goto out_unlock_prange;

	svm_range_lock(parent);
	if (prange != parent)
		mutex_lock_nested(&prange->lock, 1);
	r = svm_range_split_by_granularity(p, mm, addr, parent, prange);
	if (prange != parent)
		mutex_unlock(&prange->lock);
	svm_range_unlock(parent);
	if (r) {
		pr_debug("failed %d to split range by granularity\n", r);
		goto out_unlock_prange;
	}

	r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm,
				    KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
				    vmf->page);
	if (r)
		pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n",
			 r, prange->svms, prange, prange->start, prange->last);

	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
	if (p->xnack_enabled && parent == prange)
		op = SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP;
	else
		op = SVM_OP_UPDATE_RANGE_NOTIFIER;
	svm_range_add_list_work(&p->svms, parent, mm, op);
	schedule_deferred_list_work(&p->svms);

out_unlock_prange:
	if (prange != parent)
		mutex_unlock(&prange->migrate_mutex);
	mutex_unlock(&parent->migrate_mutex);
out_unlock_svms:
	mutex_unlock(&p->svms.lock);
out_unref_process:
	pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr);
	kfd_unref_process(p);
out_mmput:
	mmput(mm);
	return r ? VM_FAULT_SIGBUS : 0;
}

static const struct dev_pagemap_ops svm_migrate_pgmap_ops = {
	.page_free		= svm_migrate_page_free,
	.migrate_to_ram		= svm_migrate_to_ram,
};

/* Each VRAM page uses sizeof(struct page) on system memory */
#define SVM_HMM_PAGE_STRUCT_SIZE(size) ((size)/PAGE_SIZE * sizeof(struct page))

int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
{
	struct amdgpu_kfd_dev *kfddev = &adev->kfd;
	struct dev_pagemap *pgmap;
	struct resource *res = NULL;
	unsigned long size;
	void *r;

	/* Page migration works on gfx9 or newer */
	if (adev->ip_versions[GC_HWIP][0] < IP_VERSION(9, 0, 1))
		return -EINVAL;

	if (adev->gmc.is_app_apu)
		return 0;

	pgmap = &kfddev->pgmap;
	memset(pgmap, 0, sizeof(*pgmap));

	/* TODO: register all vram to HMM for now.
	 * should remove reserved size
	 */
	size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
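	/* GPUs coherently connected to the CPU expose vram through the CPU
	 * aperture as DEVICE_COHERENT memory; otherwise register
	 * DEVICE_PRIVATE pages backed by a free physical address range.
	 */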
	if (adev->gmc.xgmi.connected_to_cpu) {
		pgmap->range.start = adev->gmc.aper_base;
		pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
		pgmap->type = MEMORY_DEVICE_COHERENT;
	} else {
		res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
		if (IS_ERR(res))
			return PTR_ERR(res);
		pgmap->range.start = res->start;
		pgmap->range.end = res->end;
		pgmap->type = MEMORY_DEVICE_PRIVATE;
	}

	pgmap->nr_range = 1;
	pgmap->ops = &svm_migrate_pgmap_ops;
	pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
	pgmap->flags = 0;
	/* Device manager releases device-specific resources, memory region and
	 * pgmap when driver disconnects from device.
	 */
	r = devm_memremap_pages(adev->dev, pgmap);
	if (IS_ERR(r)) {
		pr_err("failed to register HMM device memory\n");
		if (pgmap->type == MEMORY_DEVICE_PRIVATE)
			devm_release_mem_region(adev->dev, res->start, resource_size(res));
		/* Disable SVM support capability */
		pgmap->type = 0;
		return PTR_ERR(r);
	}

	pr_debug("reserve %ldMB system memory for VRAM pages struct\n",
		 SVM_HMM_PAGE_STRUCT_SIZE(size) >> 20);

	amdgpu_amdkfd_reserve_system_mem(SVM_HMM_PAGE_STRUCT_SIZE(size));

	pr_info("HMM registered %ldMB device memory\n", size >> 20);

	return 0;
}