1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 *  mm/userfaultfd.c
4 *
5 *  Copyright (C) 2015  Red Hat, Inc.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched/signal.h>
10#include <linux/pagemap.h>
11#include <linux/rmap.h>
12#include <linux/swap.h>
13#include <linux/swapops.h>
14#include <linux/userfaultfd_k.h>
15#include <linux/mmu_notifier.h>
16#include <linux/hugetlb.h>
17#include <linux/shmem_fs.h>
18#include <asm/tlbflush.h>
19#include <asm/tlb.h>
20#include "internal.h"
21
22static __always_inline
23bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
24{
25	/* Make sure that the dst range is fully within dst_vma. */
26	if (dst_end > dst_vma->vm_end)
27		return false;
28
	/*
	 * Check that the vma is registered in uffd; this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
34	if (!dst_vma->vm_userfaultfd_ctx.ctx)
35		return false;
36
37	return true;
38}
39
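/*
 * Look up the VMA containing @addr and, for private mappings, make sure its
 * anon_vma has been allocated.  Returns the VMA, ERR_PTR(-ENOENT) if no VMA
 * covers @addr, or ERR_PTR(-ENOMEM) if anon_vma_prepare() fails.  The caller
 * must hold mmap_lock.
 */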
40static __always_inline
41struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
42						 unsigned long addr)
43{
44	struct vm_area_struct *vma;
45
46	mmap_assert_locked(mm);
47	vma = vma_lookup(mm, addr);
48	if (!vma)
49		vma = ERR_PTR(-ENOENT);
50	else if (!(vma->vm_flags & VM_SHARED) &&
51		 unlikely(anon_vma_prepare(vma)))
52		vma = ERR_PTR(-ENOMEM);
53
54	return vma;
55}
56
57#ifdef CONFIG_PER_VMA_LOCK
58/*
59 * lock_vma() - Lookup and lock vma corresponding to @address.
60 * @mm: mm to search vma in.
61 * @address: address that the vma should contain.
62 *
 * Should be called without holding mmap_lock. The vma should be unlocked
 * after use with vma_end_read().
65 *
66 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
67 * -ENOMEM if anon_vma couldn't be allocated.
68 */
69static struct vm_area_struct *lock_vma(struct mm_struct *mm,
70				       unsigned long address)
71{
72	struct vm_area_struct *vma;
73
74	vma = lock_vma_under_rcu(mm, address);
75	if (vma) {
76		/*
77		 * lock_vma_under_rcu() only checks anon_vma for private
78		 * anonymous mappings. But we need to ensure it is assigned in
79		 * private file-backed vmas as well.
80		 */
81		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
82			vma_end_read(vma);
83		else
84			return vma;
85	}
86
87	mmap_read_lock(mm);
88	vma = find_vma_and_prepare_anon(mm, address);
89	if (!IS_ERR(vma)) {
		/*
		 * We cannot use vma_start_read() as it may fail due to a
		 * false-locked result (see the comment in vma_start_read()).
		 * We can avoid that by directly locking vm_lock under
		 * mmap_lock, which guarantees that nobody can lock the
		 * vma for write (vma_start_write()) under us.
		 */
97		down_read(&vma->vm_lock->lock);
98	}
99
100	mmap_read_unlock(mm);
101	return vma;
102}
103
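/*
 * Find and lock the destination VMA for an mfill operation using the per-VMA
 * lock, and check that [dst_start, dst_start + len) lies within a single
 * uffd-registered VMA.  Returns the read-locked VMA or an ERR_PTR; release it
 * with uffd_mfill_unlock().
 */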
104static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
105					      unsigned long dst_start,
106					      unsigned long len)
107{
108	struct vm_area_struct *dst_vma;
109
110	dst_vma = lock_vma(dst_mm, dst_start);
111	if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
112		return dst_vma;
113
114	vma_end_read(dst_vma);
115	return ERR_PTR(-ENOENT);
116}
117
118static void uffd_mfill_unlock(struct vm_area_struct *vma)
119{
120	vma_end_read(vma);
121}
122
123#else
124
125static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
126					      unsigned long dst_start,
127					      unsigned long len)
128{
129	struct vm_area_struct *dst_vma;
130
131	mmap_read_lock(dst_mm);
132	dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
133	if (IS_ERR(dst_vma))
134		goto out_unlock;
135
136	if (validate_dst_vma(dst_vma, dst_start + len))
137		return dst_vma;
138
139	dst_vma = ERR_PTR(-ENOENT);
140out_unlock:
141	mmap_read_unlock(dst_mm);
142	return dst_vma;
143}
144
145static void uffd_mfill_unlock(struct vm_area_struct *vma)
146{
147	mmap_read_unlock(vma->vm_mm);
148}
149#endif
150
/* Check if dst_addr is outside of the file's size. Must be called with ptl held. */
152static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
153				 unsigned long dst_addr)
154{
155	struct inode *inode;
156	pgoff_t offset, max_off;
157
158	if (!dst_vma->vm_file)
159		return false;
160
161	inode = dst_vma->vm_file->f_inode;
162	offset = linear_page_index(dst_vma, dst_addr);
163	max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
164	return offset >= max_off;
165}
166
/*
 * Install PTEs to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MFILL_ATOMIC_COPY and MFILL_ATOMIC_CONTINUE, for
 * both shmem and anon, and for both shared and private VMAs.
 */
173int mfill_atomic_install_pte(pmd_t *dst_pmd,
174			     struct vm_area_struct *dst_vma,
175			     unsigned long dst_addr, struct page *page,
176			     bool newly_allocated, uffd_flags_t flags)
177{
178	int ret;
179	struct mm_struct *dst_mm = dst_vma->vm_mm;
180	pte_t _dst_pte, *dst_pte;
181	bool writable = dst_vma->vm_flags & VM_WRITE;
182	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
183	bool page_in_cache = page_mapping(page);
184	spinlock_t *ptl;
185	struct folio *folio;
186
187	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
188	_dst_pte = pte_mkdirty(_dst_pte);
189	if (page_in_cache && !vm_shared)
190		writable = false;
191	if (writable)
192		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
193	if (flags & MFILL_ATOMIC_WP)
194		_dst_pte = pte_mkuffd_wp(_dst_pte);
195
196	ret = -EAGAIN;
197	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
198	if (!dst_pte)
199		goto out;
200
201	if (mfill_file_over_size(dst_vma, dst_addr)) {
202		ret = -EFAULT;
203		goto out_unlock;
204	}
205
206	ret = -EEXIST;
	/*
	 * We allow overwriting a pte marker: consider the case where both
	 * MISSING|WP are registered; we first wr-protect a none pte which has
	 * no page cache page backing it, and then access the page.
	 */
212	if (!pte_none_mostly(ptep_get(dst_pte)))
213		goto out_unlock;
214
215	folio = page_folio(page);
216	if (page_in_cache) {
217		/* Usually, cache pages are already added to LRU */
218		if (newly_allocated)
219			folio_add_lru(folio);
220		folio_add_file_rmap_pte(folio, page, dst_vma);
221	} else {
222		folio_add_new_anon_rmap(folio, dst_vma, dst_addr);
223		folio_add_lru_vma(folio, dst_vma);
224	}
225
226	/*
227	 * Must happen after rmap, as mm_counter() checks mapping (via
228	 * PageAnon()), which is set by __page_set_anon_rmap().
229	 */
230	inc_mm_counter(dst_mm, mm_counter(folio));
231
232	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
233
234	/* No need to invalidate - it was non-present before */
235	update_mmu_cache(dst_vma, dst_addr, dst_pte);
236	ret = 0;
237out_unlock:
238	pte_unmap_unlock(dst_pte, ptl);
239out:
240	return ret;
241}
242
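/*
 * Handle MFILL_ATOMIC_COPY for a private destination: allocate an anonymous
 * folio, copy PAGE_SIZE bytes from @src_addr with page faults disabled, and
 * map the folio at @dst_addr.  If the copy faults, the folio is handed back
 * via @foliop together with -ENOENT so the caller can redo the copy outside
 * the lock and retry.
 */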
243static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
244				 struct vm_area_struct *dst_vma,
245				 unsigned long dst_addr,
246				 unsigned long src_addr,
247				 uffd_flags_t flags,
248				 struct folio **foliop)
249{
250	void *kaddr;
251	int ret;
252	struct folio *folio;
253
254	if (!*foliop) {
255		ret = -ENOMEM;
256		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
257					dst_addr, false);
258		if (!folio)
259			goto out;
260
261		kaddr = kmap_local_folio(folio, 0);
262		/*
		 * The read mmap_lock is held here.  Despite the
		 * mmap_lock being read-recursive, a deadlock is still
		 * possible if a writer has taken a lock.  For example:
266		 *
267		 * process A thread 1 takes read lock on own mmap_lock
268		 * process A thread 2 calls mmap, blocks taking write lock
269		 * process B thread 1 takes page fault, read lock on own mmap lock
270		 * process B thread 2 calls mmap, blocks taking write lock
271		 * process A thread 1 blocks taking read lock on process B
272		 * process B thread 1 blocks taking read lock on process A
273		 *
274		 * Disable page faults to prevent potential deadlock
275		 * and retry the copy outside the mmap_lock.
276		 */
277		pagefault_disable();
278		ret = copy_from_user(kaddr, (const void __user *) src_addr,
279				     PAGE_SIZE);
280		pagefault_enable();
281		kunmap_local(kaddr);
282
		/* fall back to copy_from_user outside mmap_lock */
284		if (unlikely(ret)) {
285			ret = -ENOENT;
286			*foliop = folio;
287			/* don't free the page */
288			goto out;
289		}
290
291		flush_dcache_folio(folio);
292	} else {
293		folio = *foliop;
294		*foliop = NULL;
295	}
296
297	/*
298	 * The memory barrier inside __folio_mark_uptodate makes sure that
299	 * preceding stores to the page contents become visible before
300	 * the set_pte_at() write.
301	 */
302	__folio_mark_uptodate(folio);
303
304	ret = -ENOMEM;
305	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
306		goto out_release;
307
308	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
309				       &folio->page, true, flags);
310	if (ret)
311		goto out_release;
312out:
313	return ret;
314out_release:
315	folio_put(folio);
316	goto out;
317}
318
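/*
 * Handle MFILL_ATOMIC_ZEROPAGE for a private destination: install a special
 * zero-page PTE at @dst_addr, or fail with -EEXIST if a PTE is already there.
 */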
319static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
320				     struct vm_area_struct *dst_vma,
321				     unsigned long dst_addr)
322{
323	pte_t _dst_pte, *dst_pte;
324	spinlock_t *ptl;
325	int ret;
326
327	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
328					 dst_vma->vm_page_prot));
329	ret = -EAGAIN;
330	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
331	if (!dst_pte)
332		goto out;
333	if (mfill_file_over_size(dst_vma, dst_addr)) {
334		ret = -EFAULT;
335		goto out_unlock;
336	}
337	ret = -EEXIST;
338	if (!pte_none(ptep_get(dst_pte)))
339		goto out_unlock;
340	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
341	/* No need to invalidate - it was non-present before */
342	update_mmu_cache(dst_vma, dst_addr, dst_pte);
343	ret = 0;
344out_unlock:
345	pte_unmap_unlock(dst_pte, ptl);
346out:
347	return ret;
348}
349
350/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
351static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
352				     struct vm_area_struct *dst_vma,
353				     unsigned long dst_addr,
354				     uffd_flags_t flags)
355{
356	struct inode *inode = file_inode(dst_vma->vm_file);
357	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
358	struct folio *folio;
359	struct page *page;
360	int ret;
361
362	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find the folio */
364	if (ret == -ENOENT)
365		ret = -EFAULT;
366	if (ret)
367		goto out;
368	if (!folio) {
369		ret = -EFAULT;
370		goto out;
371	}
372
373	page = folio_file_page(folio, pgoff);
374	if (PageHWPoison(page)) {
375		ret = -EIO;
376		goto out_release;
377	}
378
379	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
380				       page, false, flags);
381	if (ret)
382		goto out_release;
383
384	folio_unlock(folio);
385	ret = 0;
386out:
387	return ret;
388out_release:
389	folio_unlock(folio);
390	folio_put(folio);
391	goto out;
392}
393
394/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
395static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
396				   struct vm_area_struct *dst_vma,
397				   unsigned long dst_addr,
398				   uffd_flags_t flags)
399{
400	int ret;
401	struct mm_struct *dst_mm = dst_vma->vm_mm;
402	pte_t _dst_pte, *dst_pte;
403	spinlock_t *ptl;
404
405	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
406	ret = -EAGAIN;
407	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
408	if (!dst_pte)
409		goto out;
410
411	if (mfill_file_over_size(dst_vma, dst_addr)) {
412		ret = -EFAULT;
413		goto out_unlock;
414	}
415
416	ret = -EEXIST;
417	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
418	if (!pte_none(ptep_get(dst_pte)))
419		goto out_unlock;
420
421	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
422
423	/* No need to invalidate - it was non-present before */
424	update_mmu_cache(dst_vma, dst_addr, dst_pte);
425	ret = 0;
426out_unlock:
427	pte_unmap_unlock(dst_pte, ptl);
428out:
429	return ret;
430}
431
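/*
 * Walk (and, where missing, allocate) the page table levels down to the pmd
 * covering @address.  Returns the pmd pointer, or NULL if an allocation
 * failed.
 */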
432static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
433{
434	pgd_t *pgd;
435	p4d_t *p4d;
436	pud_t *pud;
437
438	pgd = pgd_offset(mm, address);
439	p4d = p4d_alloc(mm, pgd, address);
440	if (!p4d)
441		return NULL;
442	pud = pud_alloc(mm, p4d, address);
443	if (!pud)
444		return NULL;
	/*
	 * Note that this is not only run when the pmd is missing: *pmd may
	 * already be established, and it may even be a trans_huge_pmd.
	 */
450	return pmd_alloc(mm, pud, address);
451}
452
453#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either the vma lock or mmap_lock held; it will release the
 * lock before returning.
 */
459static __always_inline ssize_t mfill_atomic_hugetlb(
460					      struct userfaultfd_ctx *ctx,
461					      struct vm_area_struct *dst_vma,
462					      unsigned long dst_start,
463					      unsigned long src_start,
464					      unsigned long len,
465					      uffd_flags_t flags)
466{
467	struct mm_struct *dst_mm = dst_vma->vm_mm;
468	ssize_t err;
469	pte_t *dst_pte;
470	unsigned long src_addr, dst_addr;
471	long copied;
472	struct folio *folio;
473	unsigned long vma_hpagesize;
474	pgoff_t idx;
475	u32 hash;
476	struct address_space *mapping;
477
	/*
	 * There is no default zero huge page for all huge page sizes
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we cannot reliably insert a zero page, this
	 * feature is not supported.
	 */
484	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
485		up_read(&ctx->map_changing_lock);
486		uffd_mfill_unlock(dst_vma);
487		return -EINVAL;
488	}
489
490	src_addr = src_start;
491	dst_addr = dst_start;
492	copied = 0;
493	folio = NULL;
494	vma_hpagesize = vma_kernel_pagesize(dst_vma);
495
496	/*
497	 * Validate alignment based on huge page size
498	 */
499	err = -EINVAL;
500	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
501		goto out_unlock;
502
503retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must look it up again.
	 */
508	if (!dst_vma) {
509		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
510		if (IS_ERR(dst_vma)) {
511			err = PTR_ERR(dst_vma);
512			goto out;
513		}
514
515		err = -ENOENT;
516		if (!is_vm_hugetlb_page(dst_vma))
517			goto out_unlock_vma;
518
519		err = -EINVAL;
520		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
521			goto out_unlock_vma;
522
		/*
		 * If memory mappings are changing because of a non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later.
		 */
528		down_read(&ctx->map_changing_lock);
529		err = -EAGAIN;
530		if (atomic_read(&ctx->mmap_changing))
531			goto out_unlock;
532	}
533
534	while (src_addr < src_start + len) {
535		BUG_ON(dst_addr >= dst_start + len);
536
		/*
		 * Serialize via vma_lock and hugetlb_fault_mutex.
		 * vma_lock ensures the dst_pte remains valid even
		 * in the case of shared pmds.  The fault mutex
		 * prevents races with other faulting threads.
		 */
543		idx = linear_page_index(dst_vma, dst_addr);
544		mapping = dst_vma->vm_file->f_mapping;
545		hash = hugetlb_fault_mutex_hash(mapping, idx);
546		mutex_lock(&hugetlb_fault_mutex_table[hash]);
547		hugetlb_vma_lock_read(dst_vma);
548
549		err = -ENOMEM;
550		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
551		if (!dst_pte) {
552			hugetlb_vma_unlock_read(dst_vma);
553			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
554			goto out_unlock;
555		}
556
557		if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
558		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
559			err = -EEXIST;
560			hugetlb_vma_unlock_read(dst_vma);
561			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
562			goto out_unlock;
563		}
564
565		err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
566					       src_addr, flags, &folio);
567
568		hugetlb_vma_unlock_read(dst_vma);
569		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
570
571		cond_resched();
572
573		if (unlikely(err == -ENOENT)) {
574			up_read(&ctx->map_changing_lock);
575			uffd_mfill_unlock(dst_vma);
576			BUG_ON(!folio);
577
578			err = copy_folio_from_user(folio,
579						   (const void __user *)src_addr, true);
580			if (unlikely(err)) {
581				err = -EFAULT;
582				goto out;
583			}
584
585			dst_vma = NULL;
586			goto retry;
587		} else
588			BUG_ON(folio);
589
590		if (!err) {
591			dst_addr += vma_hpagesize;
592			src_addr += vma_hpagesize;
593			copied += vma_hpagesize;
594
595			if (fatal_signal_pending(current))
596				err = -EINTR;
597		}
598		if (err)
599			break;
600	}
601
602out_unlock:
603	up_read(&ctx->map_changing_lock);
604out_unlock_vma:
605	uffd_mfill_unlock(dst_vma);
606out:
607	if (folio)
608		folio_put(folio);
609	BUG_ON(copied < 0);
610	BUG_ON(err > 0);
611	BUG_ON(!copied && !err);
612	return copied ? copied : err;
613}
614#else /* !CONFIG_HUGETLB_PAGE */
615/* fail at build time if gcc attempts to use this */
616extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
617				    struct vm_area_struct *dst_vma,
618				    unsigned long dst_start,
619				    unsigned long src_start,
620				    unsigned long len,
621				    uffd_flags_t flags);
622#endif /* CONFIG_HUGETLB_PAGE */
623
624static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
625						struct vm_area_struct *dst_vma,
626						unsigned long dst_addr,
627						unsigned long src_addr,
628						uffd_flags_t flags,
629						struct folio **foliop)
630{
631	ssize_t err;
632
633	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
634		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
635						 dst_addr, flags);
636	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
637		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
638					       dst_addr, flags);
639	}
640
	/*
	 * The normal page fault path for a shmem mapping will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
651	if (!(dst_vma->vm_flags & VM_SHARED)) {
652		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
653			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
654						    dst_addr, src_addr,
655						    flags, foliop);
656		else
657			err = mfill_atomic_pte_zeropage(dst_pmd,
658						 dst_vma, dst_addr);
659	} else {
660		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
661					     dst_addr, src_addr,
662					     flags, foliop);
663	}
664
665	return err;
666}
667
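/*
 * Common implementation of the UFFDIO_COPY/ZEROPAGE/CONTINUE/POISON ioctls:
 * look up and lock the destination VMA, hand hugetlb VMAs off to
 * mfill_atomic_hugetlb(), and otherwise fill the range one PAGE_SIZE unit at
 * a time, dropping the locks and retrying whenever the copy from userspace
 * has to be redone with page faults enabled.
 */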
668static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
669					    unsigned long dst_start,
670					    unsigned long src_start,
671					    unsigned long len,
672					    uffd_flags_t flags)
673{
674	struct mm_struct *dst_mm = ctx->mm;
675	struct vm_area_struct *dst_vma;
676	ssize_t err;
677	pmd_t *dst_pmd;
678	unsigned long src_addr, dst_addr;
679	long copied;
680	struct folio *folio;
681
682	/*
683	 * Sanitize the command parameters:
684	 */
685	BUG_ON(dst_start & ~PAGE_MASK);
686	BUG_ON(len & ~PAGE_MASK);
687
688	/* Does the address range wrap, or is the span zero-sized? */
689	BUG_ON(src_start + len <= src_start);
690	BUG_ON(dst_start + len <= dst_start);
691
692	src_addr = src_start;
693	dst_addr = dst_start;
694	copied = 0;
695	folio = NULL;
696retry:
	/*
	 * Make sure the vma is not shared and that the dst range is
	 * both valid and fully within a single existing vma.
	 */
701	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
702	if (IS_ERR(dst_vma)) {
703		err = PTR_ERR(dst_vma);
704		goto out;
705	}
706
	/*
	 * If memory mappings are changing because of a non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later.
	 */
712	down_read(&ctx->map_changing_lock);
713	err = -EAGAIN;
714	if (atomic_read(&ctx->mmap_changing))
715		goto out_unlock;
716
717	err = -EINVAL;
718	/*
719	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
720	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
721	 */
722	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
723	    dst_vma->vm_flags & VM_SHARED))
724		goto out_unlock;
725
726	/*
727	 * validate 'mode' now that we know the dst_vma: don't allow
728	 * a wrprotect copy if the userfaultfd didn't register as WP.
729	 */
730	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
731		goto out_unlock;
732
	/*
	 * If this is a HUGETLB vma, pass off to the appropriate routine.
	 */
736	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
738					     src_start, len, flags);
739
740	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
741		goto out_unlock;
742	if (!vma_is_shmem(dst_vma) &&
743	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
744		goto out_unlock;
745
746	while (src_addr < src_start + len) {
747		pmd_t dst_pmdval;
748
749		BUG_ON(dst_addr >= dst_start + len);
750
751		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
752		if (unlikely(!dst_pmd)) {
753			err = -ENOMEM;
754			break;
755		}
756
757		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP, don't
		 * override it and just be strict.
		 */
762		if (unlikely(pmd_trans_huge(dst_pmdval))) {
763			err = -EEXIST;
764			break;
765		}
766		if (unlikely(pmd_none(dst_pmdval)) &&
767		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
768			err = -ENOMEM;
769			break;
770		}
		/* If a huge pmd materialized from under us, fail */
772		if (unlikely(pmd_trans_huge(*dst_pmd))) {
773			err = -EFAULT;
774			break;
775		}
776
777		BUG_ON(pmd_none(*dst_pmd));
778		BUG_ON(pmd_trans_huge(*dst_pmd));
779
780		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
781				       src_addr, flags, &folio);
782		cond_resched();
783
784		if (unlikely(err == -ENOENT)) {
785			void *kaddr;
786
787			up_read(&ctx->map_changing_lock);
788			uffd_mfill_unlock(dst_vma);
789			BUG_ON(!folio);
790
791			kaddr = kmap_local_folio(folio, 0);
792			err = copy_from_user(kaddr,
793					     (const void __user *) src_addr,
794					     PAGE_SIZE);
795			kunmap_local(kaddr);
796			if (unlikely(err)) {
797				err = -EFAULT;
798				goto out;
799			}
800			flush_dcache_folio(folio);
801			goto retry;
802		} else
803			BUG_ON(folio);
804
805		if (!err) {
806			dst_addr += PAGE_SIZE;
807			src_addr += PAGE_SIZE;
808			copied += PAGE_SIZE;
809
810			if (fatal_signal_pending(current))
811				err = -EINTR;
812		}
813		if (err)
814			break;
815	}
816
817out_unlock:
818	up_read(&ctx->map_changing_lock);
819	uffd_mfill_unlock(dst_vma);
820out:
821	if (folio)
822		folio_put(folio);
823	BUG_ON(copied < 0);
824	BUG_ON(err > 0);
825	BUG_ON(!copied && !err);
826	return copied ? copied : err;
827}
828
829ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
830			  unsigned long src_start, unsigned long len,
831			  uffd_flags_t flags)
832{
833	return mfill_atomic(ctx, dst_start, src_start, len,
834			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
835}
836
837ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
838			      unsigned long start,
839			      unsigned long len)
840{
841	return mfill_atomic(ctx, start, 0, len,
842			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
843}
844
845ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
846			      unsigned long len, uffd_flags_t flags)
847{
848
849	/*
850	 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
851	 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
852	 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
853	 * subsequent loads from the page through the newly mapped address range.
854	 */
855	smp_wmb();
856
857	return mfill_atomic(ctx, start, 0, len,
858			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
859}
860
861ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
862			    unsigned long len, uffd_flags_t flags)
863{
864	return mfill_atomic(ctx, start, 0, len,
865			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
866}
867
868long uffd_wp_range(struct vm_area_struct *dst_vma,
869		   unsigned long start, unsigned long len, bool enable_wp)
870{
871	unsigned int mm_cp_flags;
872	struct mmu_gather tlb;
873	long ret;
874
875	VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
876			"The address range exceeds VMA boundary.\n");
877	if (enable_wp)
878		mm_cp_flags = MM_CP_UFFD_WP;
879	else
880		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
881
	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected by default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
888	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
889		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
890	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
891	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
892	tlb_finish_mmu(&tlb);
893
894	return ret;
895}
896
897int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
898			unsigned long len, bool enable_wp)
899{
900	struct mm_struct *dst_mm = ctx->mm;
901	unsigned long end = start + len;
902	unsigned long _start, _end;
903	struct vm_area_struct *dst_vma;
904	unsigned long page_mask;
905	long err;
906	VMA_ITERATOR(vmi, dst_mm, start);
907
908	/*
909	 * Sanitize the command parameters:
910	 */
911	BUG_ON(start & ~PAGE_MASK);
912	BUG_ON(len & ~PAGE_MASK);
913
914	/* Does the address range wrap, or is the span zero-sized? */
915	BUG_ON(start + len <= start);
916
917	mmap_read_lock(dst_mm);
918
	/*
	 * If memory mappings are changing because of a non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later.
	 */
924	down_read(&ctx->map_changing_lock);
925	err = -EAGAIN;
926	if (atomic_read(&ctx->mmap_changing))
927		goto out_unlock;
928
929	err = -ENOENT;
930	for_each_vma_range(vmi, dst_vma, end) {
931
932		if (!userfaultfd_wp(dst_vma)) {
933			err = -ENOENT;
934			break;
935		}
936
937		if (is_vm_hugetlb_page(dst_vma)) {
938			err = -EINVAL;
939			page_mask = vma_kernel_pagesize(dst_vma) - 1;
940			if ((start & page_mask) || (len & page_mask))
941				break;
942		}
943
944		_start = max(dst_vma->vm_start, start);
945		_end = min(dst_vma->vm_end, end);
946
947		err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
948
		/* Return 0 on success, <0 on failure */
950		if (err < 0)
951			break;
952		err = 0;
953	}
954out_unlock:
955	up_read(&ctx->map_changing_lock);
956	mmap_read_unlock(dst_mm);
957	return err;
958}
959
960
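/*
 * Lock two page table spinlocks, always acquiring them in the same order so
 * that concurrent callers locking the same pair cannot deadlock.  Release
 * with double_pt_unlock().
 */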
961void double_pt_lock(spinlock_t *ptl1,
962		    spinlock_t *ptl2)
963	__acquires(ptl1)
964	__acquires(ptl2)
965{
966	spinlock_t *ptl_tmp;
967
968	if (ptl1 > ptl2) {
969		/* exchange ptl1 and ptl2 */
970		ptl_tmp = ptl1;
971		ptl1 = ptl2;
972		ptl2 = ptl_tmp;
973	}
974	/* lock in virtual address order to avoid lock inversion */
975	spin_lock(ptl1);
976	if (ptl1 != ptl2)
977		spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
978	else
979		__acquire(ptl2);
980}
981
982void double_pt_unlock(spinlock_t *ptl1,
983		      spinlock_t *ptl2)
984	__releases(ptl1)
985	__releases(ptl2)
986{
987	spin_unlock(ptl1);
988	if (ptl1 != ptl2)
989		spin_unlock(ptl2);
990	else
991		__release(ptl2);
992}
993
994
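/*
 * Move a present anonymous PTE (and its folio) from @src_addr to @dst_addr.
 * With both page table locks held, the PTEs are re-checked against the values
 * the caller sampled; the folio must be small, unpinned and anon-exclusive.
 * The folio's anon_vma and index are then switched over to @dst_vma before
 * the new PTE is installed.
 */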
995static int move_present_pte(struct mm_struct *mm,
996			    struct vm_area_struct *dst_vma,
997			    struct vm_area_struct *src_vma,
998			    unsigned long dst_addr, unsigned long src_addr,
999			    pte_t *dst_pte, pte_t *src_pte,
1000			    pte_t orig_dst_pte, pte_t orig_src_pte,
1001			    spinlock_t *dst_ptl, spinlock_t *src_ptl,
1002			    struct folio *src_folio)
1003{
1004	int err = 0;
1005
1006	double_pt_lock(dst_ptl, src_ptl);
1007
1008	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1009	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1010		err = -EAGAIN;
1011		goto out;
1012	}
1013	if (folio_test_large(src_folio) ||
1014	    folio_maybe_dma_pinned(src_folio) ||
1015	    !PageAnonExclusive(&src_folio->page)) {
1016		err = -EBUSY;
1017		goto out;
1018	}
1019
1020	orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
1021	/* Folio got pinned from under us. Put it back and fail the move. */
1022	if (folio_maybe_dma_pinned(src_folio)) {
1023		set_pte_at(mm, src_addr, src_pte, orig_src_pte);
1024		err = -EBUSY;
1025		goto out;
1026	}
1027
1028	folio_move_anon_rmap(src_folio, dst_vma);
1029	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
1030
1031	orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
1032	/* Follow mremap() behavior and treat the entry dirty after the move */
1033	orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
1034
1035	set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
1036out:
1037	double_pt_unlock(dst_ptl, src_ptl);
1038	return err;
1039}
1040
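/*
 * Move a swap PTE from @src_addr to @dst_addr.  Only exclusive swap entries
 * can be moved; both PTEs are re-checked under their page table locks before
 * the entry is transferred.
 */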
1041static int move_swap_pte(struct mm_struct *mm,
1042			 unsigned long dst_addr, unsigned long src_addr,
1043			 pte_t *dst_pte, pte_t *src_pte,
1044			 pte_t orig_dst_pte, pte_t orig_src_pte,
1045			 spinlock_t *dst_ptl, spinlock_t *src_ptl)
1046{
1047	if (!pte_swp_exclusive(orig_src_pte))
1048		return -EBUSY;
1049
1050	double_pt_lock(dst_ptl, src_ptl);
1051
1052	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1053	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1054		double_pt_unlock(dst_ptl, src_ptl);
1055		return -EAGAIN;
1056	}
1057
1058	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
1059	set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
1060	double_pt_unlock(dst_ptl, src_ptl);
1061
1062	return 0;
1063}
1064
1065static int move_zeropage_pte(struct mm_struct *mm,
1066			     struct vm_area_struct *dst_vma,
1067			     struct vm_area_struct *src_vma,
1068			     unsigned long dst_addr, unsigned long src_addr,
1069			     pte_t *dst_pte, pte_t *src_pte,
1070			     pte_t orig_dst_pte, pte_t orig_src_pte,
1071			     spinlock_t *dst_ptl, spinlock_t *src_ptl)
1072{
1073	pte_t zero_pte;
1074
1075	double_pt_lock(dst_ptl, src_ptl);
1076	if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
1077	    !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
1078		double_pt_unlock(dst_ptl, src_ptl);
1079		return -EAGAIN;
1080	}
1081
1082	zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
1083					 dst_vma->vm_page_prot));
1084	ptep_clear_flush(src_vma, src_addr, src_pte);
1085	set_pte_at(mm, dst_addr, dst_pte, zero_pte);
1086	double_pt_unlock(dst_ptl, src_ptl);
1087
1088	return 0;
1089}
1090
1091
/*
 * The mmap_lock for reading is held by the caller. Just move the page
 * from src_pmd to dst_pmd if possible. Returns zero on success or a
 * negative error code otherwise.
 */
1097static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
1098			  struct vm_area_struct *dst_vma,
1099			  struct vm_area_struct *src_vma,
1100			  unsigned long dst_addr, unsigned long src_addr,
1101			  __u64 mode)
1102{
1103	swp_entry_t entry;
1104	pte_t orig_src_pte, orig_dst_pte;
1105	pte_t src_folio_pte;
1106	spinlock_t *src_ptl, *dst_ptl;
1107	pte_t *src_pte = NULL;
1108	pte_t *dst_pte = NULL;
1109
1110	struct folio *src_folio = NULL;
1111	struct anon_vma *src_anon_vma = NULL;
1112	struct mmu_notifier_range range;
1113	int err = 0;
1114
1115	flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
1116	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
1117				src_addr, src_addr + PAGE_SIZE);
1118	mmu_notifier_invalidate_range_start(&range);
1119retry:
1120	dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
1121
1122	/* Retry if a huge pmd materialized from under us */
1123	if (unlikely(!dst_pte)) {
1124		err = -EAGAIN;
1125		goto out;
1126	}
1127
1128	src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
1129
	/*
	 * We only hold the mmap_lock for reading, so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
1136	if (unlikely(!src_pte)) {
1137		err = -EAGAIN;
1138		goto out;
1139	}
1140
1141	/* Sanity checks before the operation */
	if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
	    WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
1144		err = -EINVAL;
1145		goto out;
1146	}
1147
1148	spin_lock(dst_ptl);
1149	orig_dst_pte = ptep_get(dst_pte);
1150	spin_unlock(dst_ptl);
1151	if (!pte_none(orig_dst_pte)) {
1152		err = -EEXIST;
1153		goto out;
1154	}
1155
1156	spin_lock(src_ptl);
1157	orig_src_pte = ptep_get(src_pte);
1158	spin_unlock(src_ptl);
1159	if (pte_none(orig_src_pte)) {
1160		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
1161			err = -ENOENT;
1162		else /* nothing to do to move a hole */
1163			err = 0;
1164		goto out;
1165	}
1166
	/* If the PTE changed after we locked the folio then start over */
1168	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
1169		err = -EAGAIN;
1170		goto out;
1171	}
1172
1173	if (pte_present(orig_src_pte)) {
1174		if (is_zero_pfn(pte_pfn(orig_src_pte))) {
1175			err = move_zeropage_pte(mm, dst_vma, src_vma,
1176					       dst_addr, src_addr, dst_pte, src_pte,
1177					       orig_dst_pte, orig_src_pte,
1178					       dst_ptl, src_ptl);
1179			goto out;
1180		}
1181
		/*
		 * Pin and lock both the source folio and anon_vma. Since we
		 * are in an RCU read section, we can't block, so on contention
		 * we have to unmap the ptes, obtain the lock and retry.
		 */
1187		if (!src_folio) {
1188			struct folio *folio;
1189
1190			/*
1191			 * Pin the page while holding the lock to be sure the
1192			 * page isn't freed under us
1193			 */
1194			spin_lock(src_ptl);
1195			if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
1196				spin_unlock(src_ptl);
1197				err = -EAGAIN;
1198				goto out;
1199			}
1200
1201			folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
1202			if (!folio || !PageAnonExclusive(&folio->page)) {
1203				spin_unlock(src_ptl);
1204				err = -EBUSY;
1205				goto out;
1206			}
1207
1208			folio_get(folio);
1209			src_folio = folio;
1210			src_folio_pte = orig_src_pte;
1211			spin_unlock(src_ptl);
1212
1213			if (!folio_trylock(src_folio)) {
1214				pte_unmap(&orig_src_pte);
1215				pte_unmap(&orig_dst_pte);
1216				src_pte = dst_pte = NULL;
1217				/* now we can block and wait */
1218				folio_lock(src_folio);
1219				goto retry;
1220			}
1221
1222			if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
1223				err = -EBUSY;
1224				goto out;
1225			}
1226		}
1227
1228		/* at this point we have src_folio locked */
1229		if (folio_test_large(src_folio)) {
1230			/* split_folio() can block */
1231			pte_unmap(&orig_src_pte);
1232			pte_unmap(&orig_dst_pte);
1233			src_pte = dst_pte = NULL;
1234			err = split_folio(src_folio);
1235			if (err)
1236				goto out;
1237			/* have to reacquire the folio after it got split */
1238			folio_unlock(src_folio);
1239			folio_put(src_folio);
1240			src_folio = NULL;
1241			goto retry;
1242		}
1243
1244		if (!src_anon_vma) {
			/*
			 * folio_referenced walks the anon_vma chain
			 * without the folio lock. Serialize against it with
			 * the anon_vma lock; the folio lock is not enough.
			 */
1250			src_anon_vma = folio_get_anon_vma(src_folio);
1251			if (!src_anon_vma) {
1252				/* page was unmapped from under us */
1253				err = -EAGAIN;
1254				goto out;
1255			}
1256			if (!anon_vma_trylock_write(src_anon_vma)) {
1257				pte_unmap(&orig_src_pte);
1258				pte_unmap(&orig_dst_pte);
1259				src_pte = dst_pte = NULL;
1260				/* now we can block and wait */
1261				anon_vma_lock_write(src_anon_vma);
1262				goto retry;
1263			}
1264		}
1265
		err = move_present_pte(mm, dst_vma, src_vma,
1267				       dst_addr, src_addr, dst_pte, src_pte,
1268				       orig_dst_pte, orig_src_pte,
1269				       dst_ptl, src_ptl, src_folio);
1270	} else {
1271		entry = pte_to_swp_entry(orig_src_pte);
1272		if (non_swap_entry(entry)) {
1273			if (is_migration_entry(entry)) {
1274				pte_unmap(&orig_src_pte);
1275				pte_unmap(&orig_dst_pte);
1276				src_pte = dst_pte = NULL;
1277				migration_entry_wait(mm, src_pmd, src_addr);
1278				err = -EAGAIN;
1279			} else
1280				err = -EFAULT;
1281			goto out;
1282		}
1283
1284		err = move_swap_pte(mm, dst_addr, src_addr,
1285				    dst_pte, src_pte,
1286				    orig_dst_pte, orig_src_pte,
1287				    dst_ptl, src_ptl);
1288	}
1289
1290out:
1291	if (src_anon_vma) {
1292		anon_vma_unlock_write(src_anon_vma);
1293		put_anon_vma(src_anon_vma);
1294	}
1295	if (src_folio) {
1296		folio_unlock(src_folio);
1297		folio_put(src_folio);
1298	}
1299	if (dst_pte)
1300		pte_unmap(dst_pte);
1301	if (src_pte)
1302		pte_unmap(src_pte);
1303	mmu_notifier_invalidate_range_end(&range);
1304
1305	return err;
1306}
1307
1308#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1309static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1310					unsigned long src_addr,
1311					unsigned long src_end)
1312{
1313	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
1314		src_end - src_addr < HPAGE_PMD_SIZE;
1315}
1316#else
1317static inline bool move_splits_huge_pmd(unsigned long dst_addr,
1318					unsigned long src_addr,
1319					unsigned long src_end)
1320{
1321	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
1322	return false;
1323}
1324#endif
1325
1326static inline bool vma_move_compatible(struct vm_area_struct *vma)
1327{
	return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
				  VM_MIXEDMAP | VM_SHADOW_STACK));
1330}
1331
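/* Check that a UFFDIO_MOVE between @src_vma and @dst_vma is permitted. */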
1332static int validate_move_areas(struct userfaultfd_ctx *ctx,
1333			       struct vm_area_struct *src_vma,
1334			       struct vm_area_struct *dst_vma)
1335{
1336	/* Only allow moving if both have the same access and protection */
1337	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
1338	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
1339		return -EINVAL;
1340
1341	/* Only allow moving if both are mlocked or both aren't */
1342	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
1343		return -EINVAL;
1344
	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
1349	if (!(src_vma->vm_flags & VM_WRITE))
1350		return -EINVAL;
1351
1352	/* Check if vma flags indicate content which can be moved */
1353	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
1354		return -EINVAL;
1355
	/* Ensure dst_vma is registered in the uffd context we are operating on */
1357	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
1358	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
1359		return -EINVAL;
1360
1361	/* Only allow moving across anonymous vmas */
1362	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
1363		return -EINVAL;
1364
1365	return 0;
1366}
1367
1368static __always_inline
1369int find_vmas_mm_locked(struct mm_struct *mm,
1370			unsigned long dst_start,
1371			unsigned long src_start,
1372			struct vm_area_struct **dst_vmap,
1373			struct vm_area_struct **src_vmap)
1374{
1375	struct vm_area_struct *vma;
1376
1377	mmap_assert_locked(mm);
1378	vma = find_vma_and_prepare_anon(mm, dst_start);
1379	if (IS_ERR(vma))
1380		return PTR_ERR(vma);
1381
1382	*dst_vmap = vma;
1383	/* Skip finding src_vma if src_start is in dst_vma */
1384	if (src_start >= vma->vm_start && src_start < vma->vm_end)
1385		goto out_success;
1386
1387	vma = vma_lookup(mm, src_start);
1388	if (!vma)
1389		return -ENOENT;
1390out_success:
1391	*src_vmap = vma;
1392	return 0;
1393}
1394
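/*
 * uffd_move_lock() looks up and locks both the destination and source VMAs
 * for a UFFDIO_MOVE request, using per-VMA locks when CONFIG_PER_VMA_LOCK is
 * enabled and mmap_lock otherwise; uffd_move_unlock() drops whatever locks
 * were taken.
 */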
1395#ifdef CONFIG_PER_VMA_LOCK
1396static int uffd_move_lock(struct mm_struct *mm,
1397			  unsigned long dst_start,
1398			  unsigned long src_start,
1399			  struct vm_area_struct **dst_vmap,
1400			  struct vm_area_struct **src_vmap)
1401{
1402	struct vm_area_struct *vma;
1403	int err;
1404
1405	vma = lock_vma(mm, dst_start);
1406	if (IS_ERR(vma))
1407		return PTR_ERR(vma);
1408
1409	*dst_vmap = vma;
1410	/*
1411	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
1412	 * that we don't lock the same vma twice.
1413	 */
1414	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
1415		*src_vmap = vma;
1416		return 0;
1417	}
1418
1419	/*
1420	 * Using lock_vma() to get src_vma can lead to following deadlock:
1421	 *
1422	 * Thread1				Thread2
1423	 * -------				-------
1424	 * vma_start_read(dst_vma)
1425	 *					mmap_write_lock(mm)
1426	 *					vma_start_write(src_vma)
1427	 * vma_start_read(src_vma)
1428	 * mmap_read_lock(mm)
1429	 *					vma_start_write(dst_vma)
1430	 */
1431	*src_vmap = lock_vma_under_rcu(mm, src_start);
1432	if (likely(*src_vmap))
1433		return 0;
1434
1435	/* Undo any locking and retry in mmap_lock critical section */
1436	vma_end_read(*dst_vmap);
1437
1438	mmap_read_lock(mm);
1439	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1440	if (!err) {
1441		/*
1442		 * See comment in lock_vma() as to why not using
1443		 * vma_start_read() here.
1444		 */
1445		down_read(&(*dst_vmap)->vm_lock->lock);
1446		if (*dst_vmap != *src_vmap)
1447			down_read_nested(&(*src_vmap)->vm_lock->lock,
1448					 SINGLE_DEPTH_NESTING);
1449	}
1450	mmap_read_unlock(mm);
1451	return err;
1452}
1453
1454static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1455			     struct vm_area_struct *src_vma)
1456{
1457	vma_end_read(src_vma);
1458	if (src_vma != dst_vma)
1459		vma_end_read(dst_vma);
1460}
1461
1462#else
1463
1464static int uffd_move_lock(struct mm_struct *mm,
1465			  unsigned long dst_start,
1466			  unsigned long src_start,
1467			  struct vm_area_struct **dst_vmap,
1468			  struct vm_area_struct **src_vmap)
1469{
1470	int err;
1471
1472	mmap_read_lock(mm);
1473	err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
1474	if (err)
1475		mmap_read_unlock(mm);
1476	return err;
1477}
1478
1479static void uffd_move_unlock(struct vm_area_struct *dst_vma,
1480			     struct vm_area_struct *src_vma)
1481{
1482	mmap_assert_locked(src_vma->vm_mm);
1483	mmap_read_unlock(dst_vma->vm_mm);
1484}
1485#endif
1486
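/*
 * Illustrative sketch only, not kernel code: roughly how userspace might
 * drive this operation through the UFFDIO_MOVE ioctl and honour the
 * "short move" retry rule documented below.  The way the moved byte count
 * is reported back (assumed here to be the uffdio_move.move field, with
 * -EAGAIN signalling a short move) belongs to the ioctl layer, not to this
 * file, and the error handling policy is the example's own.
 *
 *	struct uffdio_move mv = {
 *		.dst = dst, .src = src, .len = len,
 *		.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES,
 *	};
 *	while (ioctl(uffd, UFFDIO_MOVE, &mv)) {
 *		if (errno != EAGAIN || mv.move <= 0)
 *			break;	/* hard error, give up */
 *		/* short move: retry with src+retval, dst+retval, len-retval */
 *		mv.dst += mv.move;
 *		mv.src += mv.move;
 *		mv.len -= mv.move;
 *	}
 */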
1487/**
1488 * move_pages - move arbitrary anonymous pages of an existing vma
1489 * @ctx: pointer to the userfaultfd context
1490 * @dst_start: start of the destination virtual memory range
1491 * @src_start: start of the source virtual memory range
1492 * @len: length of the virtual memory range
1493 * @mode: flags from uffdio_move.mode
1494 *
 * It will either use the mmap_lock in read mode or per-vma locks.
1496 *
1497 * move_pages() remaps arbitrary anonymous pages atomically in zero
1498 * copy. It only works on non shared anonymous pages because those can
1499 * be relocated without generating non linear anon_vmas in the rmap
1500 * code.
1501 *
1502 * It provides a zero copy mechanism to handle userspace page faults.
1503 * The source vma pages should have mapcount == 1, which can be
1504 * enforced by using madvise(MADV_DONTFORK) on src vma.
1505 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * prevents move_pages() from failing with -EBUSY if the process forks
 * before move_pages() is called), then it will call move_pages() to map
 * the page at the faulting address in the destination vma.
1512 *
1513 * This userfaultfd command works purely via pagetables, so it's the
1514 * most efficient way to move physical non shared anonymous pages
1515 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
1516 * it does not create any new vmas. The mapping in the destination
1517 * address is atomic.
1518 *
1519 * It only works if the vma protection bits are identical from the
1520 * source and destination vma.
1521 *
1522 * It can remap non shared anonymous pages within the same vma too.
1523 *
1524 * If the source virtual memory range has any unmapped holes, or if
1525 * the destination virtual memory range is not a whole unmapped hole,
1526 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
1527 * provides a very strict behavior to avoid any chance of memory
1528 * corruption going unnoticed if there are userland race conditions.
1529 * Only one thread should resolve the userland page fault at any given
1530 * time for any given faulting address. This means that if two threads
1531 * try to both call move_pages() on the same destination address at the
1532 * same time, the second thread will get an explicit error from this
1533 * command.
1534 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
1544 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap naturally hugepage-aligned
 * virtual regions without knowing whether there are transparent
 * hugepages in the regions or not, while avoiding the risk of having
 * to split the hugepmd during the remap.
1553 *
1554 * If there's any rmap walk that is taking the anon_vma locks without
1555 * first obtaining the folio lock (the only current instance is
1556 * folio_referenced), they will have to verify if the folio->mapping
1557 * has changed after taking the anon_vma lock. If it changed they
1558 * should release the lock and retry obtaining a new anon_vma, because
1559 * it means the anon_vma was changed by move_pages() before the lock
1560 * could be obtained. This is the only additional complexity added to
1561 * the rmap code to provide this anonymous page remapping functionality.
1562 */
1563ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
1564		   unsigned long src_start, unsigned long len, __u64 mode)
1565{
1566	struct mm_struct *mm = ctx->mm;
1567	struct vm_area_struct *src_vma, *dst_vma;
1568	unsigned long src_addr, dst_addr;
1569	pmd_t *src_pmd, *dst_pmd;
1570	long err = -EINVAL;
1571	ssize_t moved = 0;
1572
1573	/* Sanitize the command parameters. */
1574	if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
1575	    WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
1576	    WARN_ON_ONCE(len & ~PAGE_MASK))
1577		goto out;
1578
1579	/* Does the address range wrap, or is the span zero-sized? */
1580	if (WARN_ON_ONCE(src_start + len <= src_start) ||
1581	    WARN_ON_ONCE(dst_start + len <= dst_start))
1582		goto out;
1583
1584	err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
1585	if (err)
1586		goto out;
1587
1588	/* Re-check after taking map_changing_lock */
1589	err = -EAGAIN;
1590	down_read(&ctx->map_changing_lock);
1591	if (likely(atomic_read(&ctx->mmap_changing)))
1592		goto out_unlock;
	/*
	 * Make sure the vma is not shared and that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
1598	err = -EINVAL;
1599	if (src_vma->vm_flags & VM_SHARED)
1600		goto out_unlock;
1601	if (src_start + len > src_vma->vm_end)
1602		goto out_unlock;
1603
1604	if (dst_vma->vm_flags & VM_SHARED)
1605		goto out_unlock;
1606	if (dst_start + len > dst_vma->vm_end)
1607		goto out_unlock;
1608
1609	err = validate_move_areas(ctx, src_vma, dst_vma);
1610	if (err)
1611		goto out_unlock;
1612
1613	for (src_addr = src_start, dst_addr = dst_start;
1614	     src_addr < src_start + len;) {
1615		spinlock_t *ptl;
1616		pmd_t dst_pmdval;
1617		unsigned long step_size;
1618
		/*
		 * The below works because an anonymous area would not have a
		 * transparent huge PUD. If file-backed support is added,
		 * that case would need to be handled here.
		 */
1624		src_pmd = mm_find_pmd(mm, src_addr);
1625		if (unlikely(!src_pmd)) {
1626			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1627				err = -ENOENT;
1628				break;
1629			}
1630			src_pmd = mm_alloc_pmd(mm, src_addr);
1631			if (unlikely(!src_pmd)) {
1632				err = -ENOMEM;
1633				break;
1634			}
1635		}
1636		dst_pmd = mm_alloc_pmd(mm, dst_addr);
1637		if (unlikely(!dst_pmd)) {
1638			err = -ENOMEM;
1639			break;
1640		}
1641
1642		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP, don't override it and just
		 * be strict. If dst_pmd changes into THP after this check,
		 * move_pages_huge_pmd() will detect the change and retry,
		 * while move_pages_pte() will detect the change and fail.
		 */
1649		if (unlikely(pmd_trans_huge(dst_pmdval))) {
1650			err = -EEXIST;
1651			break;
1652		}
1653
1654		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
1655		if (ptl) {
1656			if (pmd_devmap(*src_pmd)) {
1657				spin_unlock(ptl);
1658				err = -ENOENT;
1659				break;
1660			}
1661
1662			/* Check if we can move the pmd without splitting it. */
1663			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
1664			    !pmd_none(dst_pmdval)) {
1665				struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
1666
1667				if (!folio || (!is_huge_zero_page(&folio->page) &&
1668					       !PageAnonExclusive(&folio->page))) {
1669					spin_unlock(ptl);
1670					err = -EBUSY;
1671					break;
1672				}
1673
1674				spin_unlock(ptl);
1675				split_huge_pmd(src_vma, src_pmd, src_addr);
1676				/* The folio will be split by move_pages_pte() */
1677				continue;
1678			}
1679
1680			err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
1681						  dst_pmdval, dst_vma, src_vma,
1682						  dst_addr, src_addr);
1683			step_size = HPAGE_PMD_SIZE;
1684		} else {
1685			if (pmd_none(*src_pmd)) {
1686				if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
1687					err = -ENOENT;
1688					break;
1689				}
1690				if (unlikely(__pte_alloc(mm, src_pmd))) {
1691					err = -ENOMEM;
1692					break;
1693				}
1694			}
1695
1696			if (unlikely(pte_alloc(mm, dst_pmd))) {
1697				err = -ENOMEM;
1698				break;
1699			}
1700
1701			err = move_pages_pte(mm, dst_pmd, src_pmd,
1702					     dst_vma, src_vma,
1703					     dst_addr, src_addr, mode);
1704			step_size = PAGE_SIZE;
1705		}
1706
1707		cond_resched();
1708
1709		if (fatal_signal_pending(current)) {
1710			/* Do not override an error */
1711			if (!err || err == -EAGAIN)
1712				err = -EINTR;
1713			break;
1714		}
1715
1716		if (err) {
1717			if (err == -EAGAIN)
1718				continue;
1719			break;
1720		}
1721
1722		/* Proceed to the next page */
1723		dst_addr += step_size;
1724		src_addr += step_size;
1725		moved += step_size;
1726	}
1727
1728out_unlock:
1729	up_read(&ctx->map_changing_lock);
1730	uffd_move_unlock(dst_vma, src_vma);
1731out:
1732	VM_WARN_ON(moved < 0);
1733	VM_WARN_ON(err > 0);
1734	VM_WARN_ON(!moved && !err);
1735	return moved ? moved : err;
1736}
1737