// SPDX-License-Identifier: GPL-2.0
/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>

#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>

#include "internal.h"

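/*
 * Walk the existing page tables and return the PUD entry mapping @addr,
 * or NULL if the PGD, P4D or PUD level is not present.
 */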
static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	return pud;
}

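/*
 * Return the PMD entry mapping @addr in the old location, or NULL if no
 * page table is present there.
 */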
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

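/*
 * Allocate (if necessary) the P4D and PUD levels for @addr in the
 * destination page tables and return the PUD entry, or NULL if the
 * allocation failed.
 */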
static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pgd_t *pgd;
	p4d_t *p4d;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_alloc(mm, pgd, addr);
	if (!p4d)
		return NULL;

	return pud_alloc(mm, p4d, addr);
}

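/*
 * Allocate (if necessary) the page-table levels down to the PMD for @addr
 * in the destination page tables and return the PMD entry, or NULL if the
 * allocation failed.
 */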
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = alloc_new_pud(mm, vma, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}

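/*
 * Take the rmap locks (i_mmap_rwsem for file mappings, the anon_vma lock
 * for anonymous mappings) so that rmap walkers cannot race with the page
 * table move; drop_rmap_locks() releases them in the reverse order.
 */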
static void take_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		i_mmap_lock_write(vma->vm_file->f_mapping);
	if (vma->anon_vma)
		anon_vma_lock_write(vma->anon_vma);
}

static void drop_rmap_locks(struct vm_area_struct *vma)
{
	if (vma->anon_vma)
		anon_vma_unlock_write(vma->anon_vma);
	if (vma->vm_file)
		i_mmap_unlock_write(vma->vm_file->f_mapping);
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
	/*
	 * Set the soft-dirty bit so that userspace can notice
	 * that the ptes were moved.
	 */
#ifdef CONFIG_MEM_SOFT_DIRTY
	if (pte_present(pte))
		pte = pte_mksoft_dirty(pte);
	else if (is_swap_pte(pte))
		pte = pte_swp_mksoft_dirty(pte);
#endif
	return pte;
}

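/*
 * Move the PTEs covering [old_addr, old_end) from @old_pmd to @new_pmd at
 * @new_addr. Returns 0 on success, or -EAGAIN if one of the page tables
 * could not be mapped (the caller then retries).
 */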
static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long len = old_end - old_addr;
	int err = 0;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	if (!old_pte) {
		err = -EAGAIN;
		goto out;
	}
	new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl);
	if (!new_pte) {
		pte_unmap_unlock(old_pte, old_ptl);
		err = -EAGAIN;
		goto out;
	}
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		if (pte_none(ptep_get(old_pte)))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with page_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(pte))
			force_flush = true;
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
out:
	if (need_rmap_locks)
		drop_rmap_locks(vma);
	return err;
}

#ifndef arch_supports_page_table_move
#define arch_supports_page_table_move arch_supports_page_table_move
static inline bool arch_supports_page_table_move(void)
{
	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
}
#endif

#ifdef CONFIG_HAVE_MOVE_PMD
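/*
 * Speed up the move by moving a whole PMD's worth of page table entries at
 * once: detach the page table from @old_pmd and hook it up at @new_pmd.
 * Returns true on success; on false the caller falls back to moving PTEs
 * individually.
 */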
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pmd_t pmd;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 *
	 * However, there's a case during execve() where we use mremap
	 * to move the initial stack, and in that case the target area
	 * may overlap the source area (always moving down).
	 *
	 * If everything is PMD-aligned, that works fine, as moving
	 * each pmd down will clear the source pmd. But if we first
	 * have a few 4kB-only pages that get moved down, and then
	 * hit the "now the rest is PMD-aligned, let's do everything
	 * one pmd at a time", we will still have the old (now empty
	 * of any 4kB pages, but still there) PMD in the page table
	 * tree.
	 *
	 * Warn on it once - because we really should try to figure
	 * out how to do this better - but then say "I won't move
	 * this pmd".
	 *
	 * One alternative might be to just unmap the target pmd at
	 * this point, and verify that it really is empty. We'll see.
	 */
	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pmd */
	pmd = *old_pmd;
	pmd_clear(old_pmd);

	VM_BUG_ON(!pmd_none(*new_pmd));

	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pmd(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
		pmd_t *new_pmd)
{
	return false;
}
#endif

#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
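/*
 * Like move_normal_pmd(), but moves a whole PUD's worth of page tables at
 * once. Returns true on success; on false the caller falls back to moving
 * at a finer granularity.
 */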
static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(vma->vm_mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static inline bool move_normal_pud(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
		pud_t *new_pud)
{
	return false;
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
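/*
 * Move a PUD-sized transparent huge page mapping by transferring the PUD
 * entry itself from @old_pud to @new_pud. Returns true on success.
 */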
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(vma->vm_mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Set the new pud */
	/* mark soft_dirty when we add pud level soft dirty support */
	set_pud_at(mm, new_addr, new_pud, pud);
	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
			  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	WARN_ON_ONCE(1);
	return false;
}
#endif

enum pgt_entry {
	NORMAL_PMD,
	HPAGE_PMD,
	NORMAL_PUD,
	HPAGE_PUD,
};

/*
 * Returns an extent of the corresponding size for the pgt_entry specified if
 * valid. Else returns a smaller extent bounded by the end of the source and
 * destination pgt_entry.
 */
static __always_inline unsigned long get_extent(enum pgt_entry entry,
			unsigned long old_addr, unsigned long old_end,
			unsigned long new_addr)
{
	unsigned long next, extent, mask, size;

	switch (entry) {
	case HPAGE_PMD:
	case NORMAL_PMD:
		mask = PMD_MASK;
		size = PMD_SIZE;
		break;
	case HPAGE_PUD:
	case NORMAL_PUD:
		mask = PUD_MASK;
		size = PUD_SIZE;
		break;
	default:
		BUILD_BUG();
		break;
	}

	next = (old_addr + size) & mask;
	/* even if next overflowed, extent below will be ok */
	extent = next - old_addr;
	if (extent > old_end - old_addr)
		extent = old_end - old_addr;
	next = (new_addr + size) & mask;
	if (extent > next - new_addr)
		extent = next - new_addr;
	return extent;
}

/*
 * Attempts to speed up the move by moving the entry at the level
 * corresponding to pgt_entry. Returns true if the move was successful,
 * else false.
 */
static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
			unsigned long old_addr, unsigned long new_addr,
			void *old_entry, void *new_entry, bool need_rmap_locks)
{
	bool moved = false;

	/* See comment in move_ptes() */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	switch (entry) {
	case NORMAL_PMD:
		moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
					new_entry);
		break;
	case NORMAL_PUD:
		moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
					new_entry);
		break;
	case HPAGE_PMD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pmd(vma, old_addr, new_addr, old_entry,
				      new_entry);
		break;
	case HPAGE_PUD:
		moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
			move_huge_pud(vma, old_addr, new_addr, old_entry,
				      new_entry);
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (need_rmap_locks)
		drop_rmap_locks(vma);

	return moved;
}

/*
 * A helper to check if aligning down is OK. The aligned address should fall
 * on *no mapping*. For the stack moving down, that's a special move within
 * the VMA that is created to span the source and destination of the move,
 * so we make an exception for it.
 */
static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align,
			    unsigned long mask, bool for_stack)
{
	unsigned long addr_masked = addr_to_align & mask;

	/*
	 * If @addr_to_align of either source or destination is not the beginning
	 * of the corresponding VMA, we can't align down or we will destroy part
	 * of the current mapping.
	 */
	if (!for_stack && vma->vm_start != addr_to_align)
		return false;

	/* In the stack case we explicitly permit in-VMA alignment. */
	if (for_stack && addr_masked >= vma->vm_start)
		return true;

	/*
	 * Make sure the realignment doesn't cause the address to fall on an
	 * existing mapping.
	 */
	return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
}

/* Opportunistically realign to specified boundary for faster copy. */
static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma,
			     unsigned long *new_addr, struct vm_area_struct *new_vma,
			     unsigned long mask, bool for_stack)
{
	/* Skip if the addresses are already aligned. */
	if ((*old_addr & ~mask) == 0)
		return;

	/* Only realign if the new and old addresses are mutually aligned. */
	if ((*old_addr & ~mask) != (*new_addr & ~mask))
		return;

	/* Ensure realignment doesn't cause overlap with existing mappings. */
	if (!can_align_down(old_vma, *old_addr, mask, for_stack) ||
	    !can_align_down(new_vma, *new_addr, mask, for_stack))
		return;

	*old_addr = *old_addr & mask;
	*new_addr = *new_addr & mask;
}

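/*
 * Move the page tables backing [old_addr, old_addr + len) of @vma so that
 * the same pages are mapped at @new_addr in @new_vma. Returns the number
 * of bytes whose page tables were actually moved, which may be less than
 * @len if a page table allocation fails part way through.
 */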
unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks, bool for_stack)
{
	unsigned long extent, old_end;
	struct mmu_notifier_range range;
	pmd_t *old_pmd, *new_pmd;
	pud_t *old_pud, *new_pud;

	if (!len)
		return 0;

	old_end = old_addr + len;

	if (is_vm_hugetlb_page(vma))
		return move_hugetlb_page_tables(vma, new_vma, old_addr,
						new_addr, len);

	/*
	 * If possible, realign addresses to PMD boundary for faster copy.
	 * Only realign if the mremap copying hits a PMD boundary.
	 */
	if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
		try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK,
				 for_stack);

	flush_cache_range(vma, old_addr, old_end);
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
				old_addr, old_end);
	mmu_notifier_invalidate_range_start(&range);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		/*
		 * If extent is PUD-sized try to speed up the move by moving at the
		 * PUD level if possible.
		 */
		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);

		old_pud = get_old_pud(vma->vm_mm, old_addr);
		if (!old_pud)
			continue;
		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
		if (!new_pud)
			break;
		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
			if (extent == HPAGE_PUD_SIZE) {
				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
					       old_pud, new_pud, need_rmap_locks);
				/* We ignore and continue on error? */
				continue;
			}
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {

			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
					   old_pud, new_pud, true))
				continue;
		}

		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
again:
		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
		    pmd_devmap(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE &&
			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, need_rmap_locks))
				continue;
			split_huge_pmd(vma, old_pmd, old_addr);
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
			   extent == PMD_SIZE) {
			/*
			 * If the extent is PMD-sized, try to speed the move by
			 * moving at the PMD level if possible.
			 */
			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, true))
				continue;
		}
		if (pmd_none(*old_pmd))
			continue;
		if (pte_alloc(new_vma->vm_mm, new_pmd))
			break;
		if (move_ptes(vma, old_pmd, old_addr, old_addr + extent,
			      new_vma, new_pmd, new_addr, need_rmap_locks) < 0)
			goto again;
	}

	mmu_notifier_invalidate_range_end(&range);

	/*
	 * Prevent negative return values when {old,new}_addr was realigned
	 * but we broke out of the above loop for the first PMD itself.
	 */
	if (len + old_addr < old_end)
		return 0;

	return len + old_addr - old_end;	/* how much done */
}

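/*
 * Move the mapping described by @vma from @old_addr/@old_len to
 * @new_addr/@new_len: copy the VMA, move its page tables, fix up memory
 * accounting and, unless MREMAP_DONTUNMAP was requested, unmap the old
 * range. Returns the new address on success or a negative error code.
 */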
static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr,
		bool *locked, unsigned long flags,
		struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
	long to_account = new_len - old_len;
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	unsigned long account_start = 0;
	unsigned long account_end = 0;
	unsigned long hiwater_vm;
	int err = 0;
	bool need_rmap_locks;
	struct vma_iterator vmi;

	/*
	 * We'd prefer to avoid failure later on in do_munmap(), which
	 * may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (unlikely(flags & MREMAP_DONTUNMAP))
		to_account = new_len;

	if (vma->vm_ops && vma->vm_ops->may_split) {
		if (vma->vm_start != old_addr)
			err = vma->vm_ops->may_split(vma, old_addr);
		if (!err && vma->vm_end != old_addr + old_len)
			err = vma->vm_ops->may_split(vma, old_addr + old_len);
		if (err)
			return err;
	}

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped.  But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
						MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	if (vm_flags & VM_ACCOUNT) {
		if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
			return -ENOMEM;
	}

	vma_start_write(vma);
	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma) {
		if (vm_flags & VM_ACCOUNT)
			vm_unacct_memory(to_account >> PAGE_SHIFT);
		return -ENOMEM;
	}

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks, false);
	if (moved_len < old_len) {
		err = -ENOMEM;
	} else if (vma->vm_ops && vma->vm_ops->mremap) {
		err = vma->vm_ops->mremap(new_vma);
	}

	if (unlikely(err)) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true, false);
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = err;
	} else {
		mremap_userfaultfd_prep(new_vma, uf);
	}

	if (is_vm_hugetlb_page(vma)) {
		clear_vma_resv_huge_pages(vma);
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
		vm_flags_clear(vma, VM_ACCOUNT);
		if (vma->vm_start < old_addr)
			account_start = vma->vm_start;
		if (vma->vm_end > old_addr + old_len)
			account_end = vma->vm_end;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);

	/* Tell the pfn tracking code that the pfnmap has moved from this vma */
	if (unlikely(vma->vm_flags & VM_PFNMAP))
		untrack_pfn_clear(vma);

	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
		vm_flags_clear(vma, VM_LOCKED_MASK);

		/*
		 * The anon_vma links of the old vma are no longer needed
		 * after its page tables have been moved.
		 */
		if (new_vma != vma && vma->vm_start == old_addr &&
			vma->vm_end == (old_addr + old_len))
			unlink_anon_vmas(vma);

		/* Because we won't unmap we don't need to touch locked_vm */
		return new_addr;
	}

	vma_iter_init(&vmi, mm, old_addr);
	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
			vm_acct_memory(old_len >> PAGE_SHIFT);
		account_start = account_end = 0;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		*locked = true;
	}

	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (account_start) {
		vma = vma_prev(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}

	if (account_end) {
		vma = vma_next(&vmi);
		vm_flags_set(vma, VM_ACCOUNT);
	}

	return new_addr;
}

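/*
 * Look up and validate the VMA at @addr for a resize from @old_len to
 * @new_len with the given mremap @flags. Returns the VMA on success or an
 * ERR_PTR() value describing why the resize is not possible.
 */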
static struct vm_area_struct *vma_to_resize(unsigned long addr,
	unsigned long old_len, unsigned long new_len, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long pgoff;

	vma = vma_lookup(mm, addr);
	if (!vma)
		return ERR_PTR(-EFAULT);

	/*
	 * !old_len is a special case where an attempt is made to 'duplicate'
	 * a mapping.  This makes no sense for private mappings as it will
	 * instead create a fresh/new mapping unrelated to the original.  This
	 * is contrary to the basic idea of mremap which creates new mappings
	 * based on the original.  There are no known use cases for this
	 * behavior.  As a result, fail such attempts.
	 */
	if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
		pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap.  This is not supported.\n", current->comm, current->pid);
		return ERR_PTR(-EINVAL);
	}

	if ((flags & MREMAP_DONTUNMAP) &&
			(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
		return ERR_PTR(-EINVAL);

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		return ERR_PTR(-EFAULT);

	if (new_len == old_len)
		return vma;

	/* Need to be careful about a growing mapping */
	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
	pgoff += vma->vm_pgoff;
	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
		return ERR_PTR(-EINVAL);

	if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
		return ERR_PTR(-EFAULT);

	if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len))
		return ERR_PTR(-EAGAIN);

	if (!may_expand_vm(mm, vma->vm_flags,
				(new_len - old_len) >> PAGE_SHIFT))
		return ERR_PTR(-ENOMEM);

	return vma;
}

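/*
 * Handle the MREMAP_FIXED and MREMAP_DONTUNMAP cases: move (and possibly
 * resize) the mapping at @addr to the caller-supplied @new_addr, or to a
 * kernel-chosen address when MREMAP_FIXED is not set. Returns the new
 * address or a negative error code.
 */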
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
		unsigned long new_addr, unsigned long new_len, bool *locked,
		unsigned long flags, struct vm_userfaultfd_ctx *uf,
		struct list_head *uf_unmap_early,
		struct list_head *uf_unmap)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long map_flags = 0;

	if (offset_in_page(new_addr))
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		goto out;

	/* Ensure the old/new locations do not overlap */
	if (addr + old_len > new_addr && new_addr + new_len > addr)
		goto out;

	/*
	 * move_vma() needs us to stay 4 maps below the threshold, otherwise
	 * it will bail out at the very beginning.
	 * That is a problem if we have already unmapped the regions here
	 * (new_addr and old_addr), because userspace will not know the
	 * state of the VMAs after it gets -ENOMEM.
	 * So, to avoid such a scenario we can pre-compute whether the whole
	 * operation has a good chance of succeeding map-wise.
	 * The worst case is when both VMAs (new_addr and old_addr) get
	 * split into 3 before unmapping them.
	 * That means 2 more maps (1 for each) to the ones we already hold.
	 * Check whether the current map count plus 2 still leaves us 4 maps
	 * below the threshold, otherwise return -ENOMEM here to be safe.
	 */
	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
		return -ENOMEM;

	if (flags & MREMAP_FIXED) {
		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
		if (ret)
			goto out;
	}

	if (old_len > new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
		if (ret)
			goto out;
		old_len = new_len;
	}

	vma = vma_to_resize(addr, old_len, new_len, flags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
	if (flags & MREMAP_DONTUNMAP &&
		!may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) {
		ret = -ENOMEM;
		goto out;
	}

	if (flags & MREMAP_FIXED)
		map_flags |= MAP_FIXED;

	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (IS_ERR_VALUE(ret))
		goto out;

	/* We got a new mapping */
	if (!(flags & MREMAP_FIXED))
		new_addr = ret;

	ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
		       uf_unmap);

out:
	return ret;
}

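/*
 * Return 1 if @vma can be expanded in place by @delta bytes: the extended
 * range must not overflow, must not intersect another VMA and must be
 * acceptable to get_unmapped_area().
 */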
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;

	if (end < vma->vm_end) /* overflow */
		return 0;
	if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	bool locked = false;
	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
	LIST_HEAD(uf_unmap_early);
	LIST_HEAD(uf_unmap);

	/*
	 * There is a deliberate asymmetry here: we strip the pointer tag
	 * from the old address but leave the new address alone. This is
	 * for consistency with mmap(), where we prevent the creation of
	 * aliasing mappings in userspace by leaving the tag bits of the
	 * mapping address intact. A non-zero tag will cause the subsequent
	 * range checks to reject the address as invalid.
	 *
	 * See Documentation/arch/arm64/tagged-address-abi.rst for more
	 * information.
	 */
	addr = untagged_addr(addr);

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
		return ret;

	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
		return ret;

	/*
	 * MREMAP_DONTUNMAP is always a move and it does not allow resizing
	 * in the process.
	 */
	if (flags & MREMAP_DONTUNMAP &&
			(!(flags & MREMAP_MAYMOVE) || old_len != new_len))
		return ret;

	if (offset_in_page(addr))
		return ret;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!new_len)
		return ret;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;
	vma = vma_lookup(mm, addr);
	if (!vma) {
		ret = -EFAULT;
		goto out;
	}

	if (is_vm_hugetlb_page(vma)) {
		struct hstate *h __maybe_unused = hstate_vma(vma);

		old_len = ALIGN(old_len, huge_page_size(h));
		new_len = ALIGN(new_len, huge_page_size(h));

		/* addrs must be huge page aligned */
		if (addr & ~huge_page_mask(h))
			goto out;
		if (new_addr & ~huge_page_mask(h))
			goto out;

		/*
		 * Don't allow remap expansion, because the underlying hugetlb
		 * reservation is not yet capable of handling split reservations.
		 */
		if (new_len > old_len)
			goto out;
	}

	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
		ret = mremap_to(addr, old_len, new_addr, new_len,
				&locked, flags, &uf, &uf_unmap_early,
				&uf_unmap);
		goto out;
	}

	/*
	 * Always allow a shrinking remap: that just unmaps
	 * the unnecessary pages..
	 * do_vmi_munmap does all the needed commit accounting, and
	 * unlocks the mmap_lock if so directed.
	 */
	if (old_len >= new_len) {
		VMA_ITERATOR(vmi, mm, addr + new_len);

		if (old_len == new_len) {
			ret = addr;
			goto out;
		}

		ret = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len,
				    &uf_unmap, true);
		if (ret)
			goto out;

		ret = addr;
		goto out_unlocked;
	}

	/*
	 * Ok, we need to grow..
	 */
	vma = vma_to_resize(addr, old_len, new_len, flags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* old_len reaches exactly to the end of the area.. */
	if (old_len == vma->vm_end - addr) {
		unsigned long delta = new_len - old_len;

		/* can we just expand the current mapping? */
		if (vma_expandable(vma, delta)) {
			long pages = delta >> PAGE_SHIFT;
			VMA_ITERATOR(vmi, mm, vma->vm_end);
			long charged = 0;

			if (vma->vm_flags & VM_ACCOUNT) {
				if (security_vm_enough_memory_mm(mm, pages)) {
					ret = -ENOMEM;
					goto out;
				}
				charged = pages;
			}

			/*
			 * vma_merge_extend() is called on the extension we
			 * are adding to the already existing vma; it will
			 * merge this extension with the existing vma (the
			 * expand operation itself) and possibly also with
			 * the next vma, if that becomes adjacent to the
			 * expanded vma and is otherwise compatible.
			 */
			vma = vma_merge_extend(&vmi, vma, delta);
			if (!vma) {
				vm_unacct_memory(charged);
				ret = -ENOMEM;
				goto out;
			}

			vm_stat_account(mm, vma->vm_flags, pages);
			if (vma->vm_flags & VM_LOCKED) {
				mm->locked_vm += pages;
				locked = true;
				new_addr = addr;
			}
			ret = addr;
			goto out;
		}
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * so we need to create a new one and move it.
	 */
	ret = -ENOMEM;
	if (flags & MREMAP_MAYMOVE) {
		unsigned long map_flags = 0;

		if (vma->vm_flags & VM_MAYSHARE)
			map_flags |= MAP_SHARED;

		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
					vma->vm_pgoff +
					((addr - vma->vm_start) >> PAGE_SHIFT),
					map_flags);
		if (IS_ERR_VALUE(new_addr)) {
			ret = new_addr;
			goto out;
		}

		ret = move_vma(vma, addr, old_len, new_len, new_addr,
			       &locked, flags, &uf, &uf_unmap);
	}
out:
	if (offset_in_page(ret))
		locked = false;
	mmap_write_unlock(current->mm);
	if (locked && new_len > old_len)
		mm_populate(new_addr + old_len, new_len - old_len);
out_unlocked:
	userfaultfd_unmap_complete(mm, &uf_unmap_early);
	mremap_userfaultfd_complete(&uf, addr, ret, old_len);
	userfaultfd_unmap_complete(mm, &uf_unmap);
	return ret;
}