kfd_migrate.c revision 1.5
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/types.h>
#include <linux/hmm.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/migrate.h>
#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_res_cursor.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
#include "kfd_smi_events.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_migrate: " fmt

static uint64_t
svm_migrate_direct_mapping_addr(struct amdgpu_device *adev, uint64_t addr)
{
	return addr + amdgpu_ttm_domain_start(adev, TTM_PL_VRAM);
}

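/*
 * Map @npages of system memory, described by the DMA addresses in @addr,
 * into GART window 0 so that the sdma engine can reach them. The new PTEs
 * are staged in the IB and copied into the GART table by an sdma job on the
 * buffer funcs ring; @flags selects read-only vs. writeable mappings via
 * KFD_IOCTL_SVM_FLAG_GPU_RO. The GPU address of the window is returned in
 * @gart_addr.
 */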
static int
svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages,
		     dma_addr_t *addr, uint64_t *gart_addr, uint64_t flags)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_job *job;
	unsigned int num_dw, num_bytes;
	struct dma_fence *fence;
	uint64_t src_addr, dst_addr;
	uint64_t pte_flags;
	void *cpu_addr;
	int r;

	/* use gart window 0 */
	*gart_addr = adev->gmc.gart_start;

	num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
	num_bytes = npages * 8;

	r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr,
				     AMDGPU_FENCE_OWNER_UNDEFINED,
				     num_dw * 4 + num_bytes,
				     AMDGPU_IB_POOL_DELAYED,
				     &job);
	if (r)
		return r;

	src_addr = num_dw * 4;
	src_addr += job->ibs[0].gpu_addr;

	dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
				dst_addr, num_bytes, false);

	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
	WARN_ON(job->ibs[0].length_dw > num_dw);

	pte_flags = AMDGPU_PTE_VALID | AMDGPU_PTE_READABLE;
	pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
	if (!(flags & KFD_IOCTL_SVM_FLAG_GPU_RO))
		pte_flags |= AMDGPU_PTE_WRITEABLE;
	pte_flags |= adev->gart.gart_pte_flags;

	cpu_addr = &job->ibs[0].ptr[num_dw];

	amdgpu_gart_map(adev, 0, npages, addr, pte_flags, cpu_addr);
	fence = amdgpu_job_submit(job);
	dma_fence_put(fence);

	return r;
}

/**
 * svm_migrate_copy_memory_gart - sdma copy data between ram and vram
 *
 * @adev: amdgpu device the sdma ring runs on
 * @sys: system DMA pointer to be copied
 * @vram: vram destination DMA pointer
 * @npages: number of pages to copy
 * @direction: enum MIGRATION_COPY_DIR
 * @mfence: output, sdma fence to signal after sdma is done
 *
 * The ram address uses continuous GART table entries mapping the ram pages;
 * the vram address uses a direct mapping of the vram pages, which must be
 * npages continuous pages.
 * GART update and sdma use the same buffer copy function ring. The copy is
 * split into transfers of at most GTT_MAX_PAGES pages; all sdma operations
 * are serialized, so it is enough to wait for the returned fence of the last
 * sdma operation to know the whole copy is done.
 *
 * Context: Process context, takes and releases gtt_window_lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
			     uint64_t *vram, uint64_t npages,
			     enum MIGRATION_COPY_DIR direction,
			     struct dma_fence **mfence)
{
	const uint64_t GTT_MAX_PAGES = AMDGPU_GTT_MAX_TRANSFER_SIZE;
	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
	uint64_t gart_s, gart_d;
	struct dma_fence *next;
	uint64_t size;
	int r;

	mutex_lock(&adev->mman.gtt_window_lock);

	while (npages) {
		size = min(GTT_MAX_PAGES, npages);

		if (direction == FROM_VRAM_TO_RAM) {
			gart_s = svm_migrate_direct_mapping_addr(adev, *vram);
			r = svm_migrate_gart_map(ring, size, sys, &gart_d, 0);

		} else if (direction == FROM_RAM_TO_VRAM) {
			r = svm_migrate_gart_map(ring, size, sys, &gart_s,
						 KFD_IOCTL_SVM_FLAG_GPU_RO);
			gart_d = svm_migrate_direct_mapping_addr(adev, *vram);
		}
		if (r) {
			dev_err(adev->dev, "fail %d create gart mapping\n", r);
			goto out_unlock;
		}

		r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE,
				       NULL, &next, false, true, false);
		if (r) {
			dev_err(adev->dev, "fail %d to copy memory\n", r);
			goto out_unlock;
		}

		dma_fence_put(*mfence);
		*mfence = next;
		npages -= size;
		if (npages) {
			sys += size;
			vram += size;
		}
	}

out_unlock:
	mutex_unlock(&adev->mman.gtt_window_lock);

	return r;
}

/**
 * svm_migrate_copy_done - wait for the sdma memory copy to complete
 *
 * @adev: amdgpu device the sdma memory copy is executing on
 * @mfence: migrate fence
 *
 * Wait for the dma fence to be signaled. If the copy was split into multiple
 * sdma operations, this is the fence of the last sdma operation.
 *
 * Context: called after svm_migrate_copy_memory
 *
 * Return:
 * 0		- success
 * otherwise	- error code from dma fence signal
 */
static int
svm_migrate_copy_done(struct amdgpu_device *adev, struct dma_fence *mfence)
{
	int r = 0;

	if (mfence) {
		r = dma_fence_wait(mfence, false);
		dma_fence_put(mfence);
		pr_debug("sdma copy memory fence done\n");
	}

	return r;
}

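/*
 * Convert a device-relative VRAM address to the kernel pfn of the
 * corresponding ZONE_DEVICE page registered in adev->kfd.pgmap.
 */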
unsigned long
svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr)
{
	return (addr + adev->kfd.pgmap.range.start) >> PAGE_SHIFT;
}

static void
svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
{
	struct page *page;

	page = pfn_to_page(pfn);
	svm_range_bo_ref(prange->svm_bo);
	page->zone_device_data = prange->svm_bo;
	zone_device_page_init(page);
}

static void
svm_migrate_put_vram_page(struct amdgpu_device *adev, unsigned long addr)
{
	struct page *page;

	page = pfn_to_page(svm_migrate_addr_to_pfn(adev, addr));
	unlock_page(page);
	put_page(page);
}

static unsigned long
svm_migrate_addr(struct amdgpu_device *adev, struct page *page)
{
	unsigned long addr;

	addr = page_to_pfn(page) << PAGE_SHIFT;
	return (addr - adev->kfd.pgmap.range.start);
}

static struct page *
svm_migrate_get_sys_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (page)
		lock_page(page);

	return page;
}

static void svm_migrate_put_sys_page(unsigned long addr)
{
	struct page *page;

	page = pfn_to_page(addr >> PAGE_SHIFT);
	unlock_page(page);
	put_page(page);
}

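/*
 * Count the collected source pages that migrate_vma_setup() marked as
 * migrating (MIGRATE_PFN_MIGRATE). The "unsuccessful" variant below counts
 * the valid pages that will not be migrated.
 */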
static unsigned long svm_migrate_successful_pages(struct migrate_vma *migrate)
{
	unsigned long cpages = 0;
	unsigned long i;

	for (i = 0; i < migrate->npages; i++) {
		if (migrate->src[i] & MIGRATE_PFN_VALID &&
		    migrate->src[i] & MIGRATE_PFN_MIGRATE)
			cpages++;
	}
	return cpages;
}

static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma *migrate)
{
	unsigned long upages = 0;
	unsigned long i;

	for (i = 0; i < migrate->npages; i++) {
		if (migrate->src[i] & MIGRATE_PFN_VALID &&
		    !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
			upages++;
	}
	return upages;
}

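/*
 * Copy the collected system pages of @migrate into the VRAM backing of
 * @prange. Destination VRAM addresses are walked with an amdgpu_res_cursor
 * starting at @ttm_res_offset, source pages are DMA-mapped into @scratch,
 * and contiguous batches are copied with svm_migrate_copy_memory_gart().
 * On error, the VRAM pages acquired so far are released again.
 */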
static int
svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange,
			 struct migrate_vma *migrate, struct dma_fence **mfence,
			 dma_addr_t *scratch, uint64_t ttm_res_offset)
{
	uint64_t npages = migrate->cpages;
	struct amdgpu_device *adev = node->adev;
	struct device *dev = adev->dev;
	struct amdgpu_res_cursor cursor;
	dma_addr_t *src;
	uint64_t *dst;
	uint64_t i, j;
	int r;

	pr_debug("svms 0x%p [0x%lx 0x%lx 0x%llx]\n", prange->svms, prange->start,
		 prange->last, ttm_res_offset);

	src = scratch;
	dst = (uint64_t *)(scratch + npages);

	amdgpu_res_first(prange->ttm_res, ttm_res_offset,
			 npages << PAGE_SHIFT, &cursor);
	for (i = j = 0; i < npages; i++) {
		struct page *spage;

		dst[i] = cursor.start + (j << PAGE_SHIFT);
		migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
		svm_migrate_get_vram_page(prange, migrate->dst[i]);
		migrate->dst[i] = migrate_pfn(migrate->dst[i]);

		spage = migrate_pfn_to_page(migrate->src[i]);
		if (spage && !is_zone_device_page(spage)) {
			src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
					      DMA_TO_DEVICE);
			r = dma_mapping_error(dev, src[i]);
			if (r) {
				dev_err(dev, "%s: fail %d dma_map_page\n",
					__func__, r);
				goto out_free_vram_pages;
			}
		} else {
			if (j) {
				r = svm_migrate_copy_memory_gart(
						adev, src + i - j,
						dst + i - j, j,
						FROM_RAM_TO_VRAM,
						mfence);
				if (r)
					goto out_free_vram_pages;
				amdgpu_res_next(&cursor, (j + 1) << PAGE_SHIFT);
				j = 0;
			} else {
				amdgpu_res_next(&cursor, PAGE_SIZE);
			}
			continue;
		}

		pr_debug_ratelimited("dma mapping src to 0x%llx, pfn 0x%lx\n",
				     src[i] >> PAGE_SHIFT, page_to_pfn(spage));

		if (j >= (cursor.size >> PAGE_SHIFT) - 1 && i < npages - 1) {
			r = svm_migrate_copy_memory_gart(adev, src + i - j,
							 dst + i - j, j + 1,
							 FROM_RAM_TO_VRAM,
							 mfence);
			if (r)
				goto out_free_vram_pages;
			amdgpu_res_next(&cursor, (j + 1) * PAGE_SIZE);
			j = 0;
		} else {
			j++;
		}
	}

	r = svm_migrate_copy_memory_gart(adev, src + i - j, dst + i - j, j,
					 FROM_RAM_TO_VRAM, mfence);

out_free_vram_pages:
	if (r) {
		pr_debug("failed %d to copy memory to vram\n", r);
		while (i--) {
			svm_migrate_put_vram_page(adev, dst[i]);
			migrate->dst[i] = 0;
		}
	}

#ifdef DEBUG_FORCE_MIXED_DOMAINS
	for (i = 0, j = 0; i < npages; i += 4, j++) {
		if (j & 1)
			continue;
		svm_migrate_put_vram_page(adev, dst[i]);
		migrate->dst[i] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 1]);
		migrate->dst[i + 1] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 2]);
		migrate->dst[i + 2] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 3]);
		migrate->dst[i + 3] = 0;
	}
#endif

	return r;
}

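/*
 * Migrate the pages of one vma in [@start, @end) to VRAM on @node: collect
 * the system pages with migrate_vma_setup(), copy them with the sdma engine,
 * then finalize the migration and report it via SMI events. Returns the
 * number of pages collected for migration, or a negative error code.
 */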
static long
svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
			struct vm_area_struct *vma, uint64_t start,
			uint64_t end, uint32_t trigger, uint64_t ttm_res_offset)
{
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
	uint64_t npages = (end - start) >> PAGE_SHIFT;
	struct amdgpu_device *adev = node->adev;
	struct kfd_process_device *pdd;
	struct dma_fence *mfence = NULL;
	struct migrate_vma migrate = { 0 };
	unsigned long cpages = 0;
	dma_addr_t *scratch;
	void *buf;
	int r = -ENOMEM;

	memset(&migrate, 0, sizeof(migrate));
	migrate.vma = vma;
	migrate.start = start;
	migrate.end = end;
	migrate.flags = MIGRATE_VMA_SELECT_SYSTEM;
	migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);

	buf = kvcalloc(npages,
		       2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
		       GFP_KERNEL);
	if (!buf)
		goto out;

	migrate.src = buf;
	migrate.dst = migrate.src + npages;
	scratch = (dma_addr_t *)(migrate.dst + npages);

	kfd_smi_event_migration_start(node, p->lead_thread->pid,
				      start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				      0, node->id, prange->prefetch_loc,
				      prange->preferred_loc, trigger);

	r = migrate_vma_setup(&migrate);
	if (r) {
		dev_err(adev->dev, "%s: vma setup fail %d range [0x%lx 0x%lx]\n",
			__func__, r, prange->start, prange->last);
		goto out_free;
	}

	cpages = migrate.cpages;
	if (!cpages) {
		pr_debug("failed collect migrate sys pages [0x%lx 0x%lx]\n",
			 prange->start, prange->last);
		goto out_free;
	}
	if (cpages != npages)
		pr_debug("partial migration, 0x%lx/0x%llx pages migrated\n",
			 cpages, npages);
	else
		pr_debug("0x%lx pages migrated\n", cpages);

	r = svm_migrate_copy_to_vram(node, prange, &migrate, &mfence, scratch, ttm_res_offset);
	migrate_vma_pages(&migrate);

	pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
		svm_migrate_successful_pages(&migrate), cpages, migrate.npages);

	svm_migrate_copy_done(adev, mfence);
	migrate_vma_finalize(&migrate);

	kfd_smi_event_migration_end(node, p->lead_thread->pid,
				    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				    0, node->id, trigger);

	svm_range_dma_unmap(adev->dev, scratch, 0, npages);

out_free:
	kvfree(buf);
out:
	if (!r && cpages) {
		pdd = svm_range_get_pdd_by_node(prange, node);
		if (pdd)
			WRITE_ONCE(pdd->page_in, pdd->page_in + cpages);

		return cpages;
	}
	return r;
}

/**
 * svm_migrate_ram_to_vram - migrate svm range from system to device
 * @prange: range structure
 * @best_loc: the device to migrate to
 * @mm: the process mm structure
 * @trigger: reason of migration
 *
 * Context: Process context, caller holds mmap read lock, svms lock, prange lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
			struct mm_struct *mm, uint32_t trigger)
{
	unsigned long addr, start, end;
	struct vm_area_struct *vma;
	uint64_t ttm_res_offset;
	struct kfd_node *node;
	unsigned long cpages = 0;
	long r = 0;

	if (prange->actual_loc == best_loc) {
		pr_debug("svms 0x%p [0x%lx 0x%lx] already on best_loc 0x%x\n",
			 prange->svms, prange->start, prange->last, best_loc);
		return 0;
	}

	node = svm_range_get_node_by_id(prange, best_loc);
	if (!node) {
		pr_debug("failed to get kfd node by id 0x%x\n", best_loc);
		return -ENODEV;
	}

	pr_debug("svms 0x%p [0x%lx 0x%lx] to gpu 0x%x\n", prange->svms,
		 prange->start, prange->last, best_loc);

	start = prange->start << PAGE_SHIFT;
	end = (prange->last + 1) << PAGE_SHIFT;

	r = svm_range_vram_node_new(node, prange, true);
	if (r) {
		dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r);
		return r;
	}
	ttm_res_offset = prange->offset << PAGE_SHIFT;

	for (addr = start; addr < end;) {
		unsigned long next;

		vma = vma_lookup(mm, addr);
		if (!vma)
			break;

		next = min(vma->vm_end, end);
		r = svm_migrate_vma_to_vram(node, prange, vma, addr, next, trigger, ttm_res_offset);
		if (r < 0) {
			pr_debug("failed %ld to migrate\n", r);
			break;
		} else {
			cpages += r;
		}
		ttm_res_offset += next - addr;
		addr = next;
	}

	if (cpages) {
		prange->actual_loc = best_loc;
		svm_range_free_dma_mappings(prange, true);
	} else {
		svm_range_vram_node_free(prange);
	}

	return r < 0 ? r : 0;
}

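/*
 * dev_pagemap page_free callback: drop the svm_bo reference taken in
 * svm_migrate_get_vram_page() when the device page was handed out.
 */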
static void svm_migrate_page_free(struct page *page)
{
	struct svm_range_bo *svm_bo = page->zone_device_data;

	if (svm_bo) {
		pr_debug_ratelimited("ref: %d\n", kref_read(&svm_bo->kref));
		svm_range_bo_unref_async(svm_bo);
	}
}

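/*
 * Copy the device pages of @migrate back to newly allocated system pages.
 * Contiguous VRAM source addresses are batched and copied with
 * svm_migrate_copy_memory_gart(); destination pages are DMA-mapped into
 * @scratch. On error, the system pages allocated so far are released again.
 */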
static int
svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
			struct migrate_vma *migrate, struct dma_fence **mfence,
			dma_addr_t *scratch, uint64_t npages)
{
	struct device *dev = adev->dev;
	uint64_t *src;
	dma_addr_t *dst;
	struct page *dpage;
	uint64_t i = 0, j;
	uint64_t addr;
	int r = 0;

	pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
		 prange->last);

	addr = prange->start << PAGE_SHIFT;

	src = (uint64_t *)(scratch + npages);
	dst = scratch;

	for (i = 0, j = 0; i < npages; i++, addr += PAGE_SIZE) {
		struct page *spage;

		spage = migrate_pfn_to_page(migrate->src[i]);
		if (!spage || !is_zone_device_page(spage)) {
			pr_debug("invalid page. Could be in CPU already svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);
			if (j) {
				r = svm_migrate_copy_memory_gart(adev, dst + i - j,
								 src + i - j, j,
								 FROM_VRAM_TO_RAM,
								 mfence);
				if (r)
					goto out_oom;
				j = 0;
			}
			continue;
		}
		src[i] = svm_migrate_addr(adev, spage);
		if (j > 0 && src[i] != src[i - 1] + PAGE_SIZE) {
			r = svm_migrate_copy_memory_gart(adev, dst + i - j,
							 src + i - j, j,
							 FROM_VRAM_TO_RAM,
							 mfence);
			if (r)
				goto out_oom;
			j = 0;
		}

		dpage = svm_migrate_get_sys_page(migrate->vma, addr);
		if (!dpage) {
			pr_debug("failed get page svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);
			r = -ENOMEM;
			goto out_oom;
		}

		dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		r = dma_mapping_error(dev, dst[i]);
		if (r) {
			dev_err(adev->dev, "%s: fail %d dma_map_page\n", __func__, r);
			goto out_oom;
		}

		pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n",
				     dst[i] >> PAGE_SHIFT, page_to_pfn(dpage));

		migrate->dst[i] = migrate_pfn(page_to_pfn(dpage));
		j++;
	}

	r = svm_migrate_copy_memory_gart(adev, dst + i - j, src + i - j, j,
					 FROM_VRAM_TO_RAM, mfence);

out_oom:
	if (r) {
		pr_debug("failed %d copy to ram\n", r);
		while (i--) {
			svm_migrate_put_sys_page(dst[i]);
			migrate->dst[i] = 0;
		}
	}

	return r;
}

/**
 * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
 *
 * @node: kfd node device to migrate from
 * @prange: svm range structure
 * @vma: vm_area_struct that range [start, end] belongs to
 * @start: range start virtual address (page aligned)
 * @end: range end virtual address (page aligned, exclusive)
 * @trigger: reason of migration
 * @fault_page: the fault page from vmf->page when called from the
 *              svm_migrate_to_ram() CPU page fault handler, NULL otherwise
 *
 * Context: Process context, caller holds mmap read lock, prange->migrate_mutex
 *
 * Return:
 *   0 - success with all pages migrated
 *   negative values - indicate error
 *   positive values - partial migration, number of pages not migrated
 */
static long
svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
		       struct vm_area_struct *vma, uint64_t start, uint64_t end,
		       uint32_t trigger, struct page *fault_page)
{
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
	uint64_t npages = (end - start) >> PAGE_SHIFT;
	unsigned long upages = npages;
	unsigned long cpages = 0;
	struct amdgpu_device *adev = node->adev;
	struct kfd_process_device *pdd;
	struct dma_fence *mfence = NULL;
	struct migrate_vma migrate = { 0 };
	dma_addr_t *scratch;
	void *buf;
	int r = -ENOMEM;

	memset(&migrate, 0, sizeof(migrate));
	migrate.vma = vma;
	migrate.start = start;
	migrate.end = end;
	migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
	if (adev->gmc.xgmi.connected_to_cpu)
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
	else
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	buf = kvcalloc(npages,
		       2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
		       GFP_KERNEL);
	if (!buf)
		goto out;

	migrate.src = buf;
	migrate.dst = migrate.src + npages;
	migrate.fault_page = fault_page;
	scratch = (dma_addr_t *)(migrate.dst + npages);

	kfd_smi_event_migration_start(node, p->lead_thread->pid,
				      start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				      node->id, 0, prange->prefetch_loc,
				      prange->preferred_loc, trigger);

	r = migrate_vma_setup(&migrate);
	if (r) {
		dev_err(adev->dev, "%s: vma setup fail %d range [0x%lx 0x%lx]\n",
			__func__, r, prange->start, prange->last);
		goto out_free;
	}

	cpages = migrate.cpages;
	if (!cpages) {
		pr_debug("failed collect migrate device pages [0x%lx 0x%lx]\n",
			 prange->start, prange->last);
		upages = svm_migrate_unsuccessful_pages(&migrate);
		goto out_free;
	}
	if (cpages != npages)
		pr_debug("partial migration, 0x%lx/0x%llx pages migrated\n",
			 cpages, npages);
	else
		pr_debug("0x%lx pages migrated\n", cpages);

	r = svm_migrate_copy_to_ram(adev, prange, &migrate, &mfence,
				    scratch, npages);
	migrate_vma_pages(&migrate);

	upages = svm_migrate_unsuccessful_pages(&migrate);
	pr_debug("unsuccessful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
		 upages, cpages, migrate.npages);

	svm_migrate_copy_done(adev, mfence);
	migrate_vma_finalize(&migrate);

	kfd_smi_event_migration_end(node, p->lead_thread->pid,
				    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				    node->id, 0, trigger);

	svm_range_dma_unmap(adev->dev, scratch, 0, npages);

out_free:
	kvfree(buf);
out:
	if (!r && cpages) {
		pdd = svm_range_get_pdd_by_node(prange, node);
		if (pdd)
			WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
	}
	return r ? r : upages;
}

/**
 * svm_migrate_vram_to_ram - migrate svm range from device to system
 * @prange: range structure
 * @mm: process mm, use current->mm if NULL
 * @trigger: reason of migration
 * @fault_page: the fault page from vmf->page when called from the
 *              svm_migrate_to_ram() CPU page fault handler, NULL otherwise
 *
 * Context: Process context, caller holds mmap read lock, prange->migrate_mutex
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
			    uint32_t trigger, struct page *fault_page)
{
	struct kfd_node *node;
	struct vm_area_struct *vma;
	unsigned long addr;
	unsigned long start;
	unsigned long end;
	unsigned long upages = 0;
	long r = 0;

	if (!prange->actual_loc) {
		pr_debug("[0x%lx 0x%lx] already migrated to ram\n",
			 prange->start, prange->last);
		return 0;
	}

	node = svm_range_get_node_by_id(prange, prange->actual_loc);
	if (!node) {
		pr_debug("failed to get kfd node by id 0x%x\n", prange->actual_loc);
		return -ENODEV;
	}
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu 0x%x to ram\n",
		 prange->svms, prange, prange->start, prange->last,
		 prange->actual_loc);

	start = prange->start << PAGE_SHIFT;
	end = (prange->last + 1) << PAGE_SHIFT;

	for (addr = start; addr < end;) {
		unsigned long next;

		vma = vma_lookup(mm, addr);
		if (!vma) {
			pr_debug("failed to find vma for prange %p\n", prange);
			r = -EFAULT;
			break;
		}

		next = min(vma->vm_end, end);
		r = svm_migrate_vma_to_ram(node, prange, vma, addr, next, trigger,
			fault_page);
		if (r < 0) {
			pr_debug("failed %ld to migrate prange %p\n", r, prange);
			break;
		} else {
			upages += r;
		}
		addr = next;
	}

	if (r >= 0 && !upages) {
		svm_range_vram_node_free(prange);
		prange->actual_loc = 0;
	}

	return r < 0 ? r : 0;
}

/**
 * svm_migrate_vram_to_vram - migrate svm range from device to device
 * @prange: range structure
 * @best_loc: the device to migrate to
 * @mm: process mm, use current->mm if NULL
 * @trigger: reason of migration
 *
 * Context: Process context, caller holds mmap read lock, svms lock, prange lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
			 struct mm_struct *mm, uint32_t trigger)
{
	int r, retries = 3;

	/*
	 * TODO: for both devices with PCIe large bar or on same xgmi hive, skip
	 * system memory as migration bridge
	 */

	pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);

	do {
		r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
		if (r)
			return r;
	} while (prange->actual_loc && --retries);

	if (prange->actual_loc)
		return -EDEADLK;

	return svm_migrate_ram_to_vram(prange, best_loc, mm, trigger);
}

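/*
 * Migrate @prange to @best_loc, bouncing through system memory first when
 * the range currently resides in another device's VRAM.
 */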
int
svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
		    struct mm_struct *mm, uint32_t trigger)
{
	if (!prange->actual_loc)
		return svm_migrate_ram_to_vram(prange, best_loc, mm, trigger);
	else
		return svm_migrate_vram_to_vram(prange, best_loc, mm, trigger);
}

/**
 * svm_migrate_to_ram - CPU page fault handler
 * @vmf: CPU vm fault vma, address
 *
 * Context: vm fault handler, caller holds the mmap read lock
 *
 * Return:
 * 0 - OK
 * VM_FAULT_SIGBUS - notify the application of a SIGBUS page fault
 */
static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long addr = vmf->address;
	struct svm_range_bo *svm_bo;
	enum svm_work_list_ops op;
	struct svm_range *parent;
	struct svm_range *prange;
	struct kfd_process *p;
	struct mm_struct *mm;
	int r = 0;

	svm_bo = vmf->page->zone_device_data;
	if (!svm_bo) {
		pr_debug("failed get device page at addr 0x%lx\n", addr);
		return VM_FAULT_SIGBUS;
	}
	if (!mmget_not_zero(svm_bo->eviction_fence->mm)) {
		pr_debug("addr 0x%lx of process mm is destroyed\n", addr);
		return VM_FAULT_SIGBUS;
	}

	mm = svm_bo->eviction_fence->mm;
	if (mm != vmf->vma->vm_mm)
		pr_debug("addr 0x%lx is COW mapping in child process\n", addr);

	p = kfd_lookup_process_by_mm(mm);
	if (!p) {
		pr_debug("failed find process at fault address 0x%lx\n", addr);
		r = VM_FAULT_SIGBUS;
		goto out_mmput;
	}
	if (READ_ONCE(p->svms.faulting_task) == current) {
		pr_debug("skipping ram migration\n");
		r = 0;
		goto out_unref_process;
	}

	pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr);
	addr >>= PAGE_SHIFT;

	mutex_lock(&p->svms.lock);

	prange = svm_range_from_addr(&p->svms, addr, &parent);
	if (!prange) {
		pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, addr);
		r = -EFAULT;
		goto out_unlock_svms;
	}

	mutex_lock(&parent->migrate_mutex);
	if (prange != parent)
		mutex_lock_nested(&prange->migrate_mutex, 1);

	if (!prange->actual_loc)
		goto out_unlock_prange;

	svm_range_lock(parent);
	if (prange != parent)
		mutex_lock_nested(&prange->lock, 1);
	r = svm_range_split_by_granularity(p, mm, addr, parent, prange);
	if (prange != parent)
		mutex_unlock(&prange->lock);
	svm_range_unlock(parent);
	if (r) {
		pr_debug("failed %d to split range by granularity\n", r);
		goto out_unlock_prange;
	}

	r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm,
				    KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
				    vmf->page);
	if (r)
		pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n",
			 r, prange->svms, prange, prange->start, prange->last);

	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
	if (p->xnack_enabled && parent == prange)
		op = SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP;
	else
		op = SVM_OP_UPDATE_RANGE_NOTIFIER;
	svm_range_add_list_work(&p->svms, parent, mm, op);
	schedule_deferred_list_work(&p->svms);

out_unlock_prange:
	if (prange != parent)
		mutex_unlock(&prange->migrate_mutex);
	mutex_unlock(&parent->migrate_mutex);
out_unlock_svms:
	mutex_unlock(&p->svms.lock);
out_unref_process:
	pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr);
	kfd_unref_process(p);
out_mmput:
	mmput(mm);
	return r ? VM_FAULT_SIGBUS : 0;
}

static const struct dev_pagemap_ops svm_migrate_pgmap_ops = {
	.page_free		= svm_migrate_page_free,
	.migrate_to_ram		= svm_migrate_to_ram,
};

/* Each VRAM page uses sizeof(struct page) on system memory */
#define SVM_HMM_PAGE_STRUCT_SIZE(size) ((size)/PAGE_SIZE * sizeof(struct page))

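/*
 * Register the device VRAM with HMM as ZONE_DEVICE memory so SVM ranges can
 * be migrated between system memory and VRAM. When the GPU is XGMI-connected
 * to the CPU, the VRAM aperture is registered as coherent device memory,
 * otherwise as device-private memory using a free physical address range
 * requested from iomem_resource.
 */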
int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
{
	struct amdgpu_kfd_dev *kfddev = &adev->kfd;
	struct dev_pagemap *pgmap;
	struct resource *res = NULL;
	unsigned long size;
	void *r;

	/* Page migration works on gfx9 or newer */
	if (adev->ip_versions[GC_HWIP][0] < IP_VERSION(9, 0, 1))
		return -EINVAL;

	if (adev->gmc.is_app_apu)
		return 0;

	pgmap = &kfddev->pgmap;
	memset(pgmap, 0, sizeof(*pgmap));

	/* TODO: register all vram to HMM for now.
	 * should remove reserved size
	 */
	size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
	if (adev->gmc.xgmi.connected_to_cpu) {
		pgmap->range.start = adev->gmc.aper_base;
		pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
		pgmap->type = MEMORY_DEVICE_COHERENT;
	} else {
		res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
		if (IS_ERR(res))
			return -ENOMEM;
		pgmap->range.start = res->start;
		pgmap->range.end = res->end;
		pgmap->type = MEMORY_DEVICE_PRIVATE;
	}

	pgmap->nr_range = 1;
	pgmap->ops = &svm_migrate_pgmap_ops;
	pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
	pgmap->flags = 0;
	/* Device manager releases device-specific resources, memory region and
	 * pgmap when driver disconnects from device.
	 */
	r = devm_memremap_pages(adev->dev, pgmap);
	if (IS_ERR(r)) {
		pr_err("failed to register HMM device memory\n");
		/* Release the reserved region before disabling SVM support;
		 * the type must still be valid when it is checked here.
		 */
		if (pgmap->type == MEMORY_DEVICE_PRIVATE)
			devm_release_mem_region(adev->dev, res->start,
						resource_size(res));
		/* Disable SVM support capability */
		pgmap->type = 0;
		return PTR_ERR(r);
	}

	pr_debug("reserve %ldMB system memory for VRAM pages struct\n",
		 SVM_HMM_PAGE_STRUCT_SIZE(size) >> 20);

	amdgpu_amdkfd_reserve_system_mem(SVM_HMM_PAGE_STRUCT_SIZE(size));

	pr_info("HMM registered %ldMB device memory\n", size >> 20);

	return 0;
}
1056