// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on region of a vma, splitting it or merging it as
 * necessary.  Must be called with mmap_lock held for writing;
 * Caller should ensure anon_name stability by raising its refcount even when
 * anon_name belongs to a valid vma because this function might free that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	VMA_ITERATOR(vmi, mm, start);

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
				    anon_name);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct folio *folio;

		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, addr, &splug);
		if (folio)
			folio_put(folio);
	}

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry		= swapin_walk_pmd_entry,
	.walk_lock		= PGWALK_RDLOCK,
};

static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct folio *folio;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, folio, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		if (!xa_is_value(folio))
			continue;
		entry = radix_to_swp_entry(folio);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		xas_pause(&xas);
		rcu_read_unlock();

		folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					     vma, addr, &splug);
		if (folio)
			folio_put(folio);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
#endif		/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * Page out the page cache only for non-anonymous mappings that map
	 * files the calling process could (if it tried) open for writing;
	 * otherwise we would be including shared non-exclusive mappings,
	 * which opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;
	unsigned int batch_count = 0;

	if (fatal_signal_pending(current))
		return -EINTR;

	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pfn_folio(pmd_pfn(orig_pmd));

		/* Do not interfere with other mappings of this folio */
		if (folio_estimated_sharers(folio) != 1)
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		if (!pageout && pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list, true);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
restart:
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (++batch_count == SWAP_CLUSTER_MAX) {
			batch_count = 0;
			if (need_resched()) {
				arch_leave_lazy_mmu_mode();
				pte_unmap_unlock(start_pte, ptl);
				cond_resched();
				goto restart;
			}
		}

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it. Split it only if we are the sole
		 * owner.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_estimated_sharers(folio) > 1)
				break;
			if (pageout_anon_only_filter && !folio_test_anon(folio))
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this folio, and
		 * skip folios that are not on the LRU.
		 */
		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

		if (!pageout && pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a folio to accelerate its reclaim.
		 * The VM cannot reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list, true);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
	.walk_lock = PGWALK_RDLOCK,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out even if this process is neither
	 * the owner of nor write-capable on the file. We additionally allow
	 * private file mappings to page out their dirty anon pages.
	 */
	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
				(vma->vm_flags & VM_MAYSHARE)))
		return 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte, ptent;
	struct folio *folio;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			}
			continue;
		}

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * If pmd isn't transhuge but the folio is large and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_estimated_sharers(folio) != 1)
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
			if (!folio_trylock(folio))
				continue;
			/*
			 * If folio is shared with others, we mustn't clear
			 * the folio's dirty flag.
			 */
			if (folio_mapcount(folio) != 1) {
				folio_unlock(folio);
				continue;
			}

			if (folio_test_swapcache(folio) &&
			    !folio_free_swap(folio)) {
				folio_unlock(folio);
				continue;
			}

			folio_clear_dirty(folio);
			folio_unlock(folio);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g., PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so for
			 * portability, re-install the pte as old|clean after
			 * clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		folio_mark_lazyfree(folio);
	}

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	cond_resched();

	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry		= madvise_free_pte_range,
	.walk_lock		= PGWALK_RDLOCK,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works for only anon vma at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	zap_page_range_single(vma, start, end - start, NULL);
	return 0;
}

static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
					    unsigned long start,
					    unsigned long *end,
					    int behavior)
{
	if (!is_vm_hugetlb_page(vma)) {
		unsigned int forbidden = VM_PFNMAP;

		if (behavior != MADV_DONTNEED_LOCKED)
			forbidden |= VM_LOCKED;

		return !(vma->vm_flags & forbidden);
	}

	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
		return false;
	if (start & ~huge_page_mask(hstate_vma(vma)))
		return false;

	/*
	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
	 * boundaries, and may be unaware that this VMA uses huge pages.
	 * Avoid unexpected data loss by rounding down the number of
	 * huge pages freed.
	 */
	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

	return true;
}
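
/*
 * Worked example for the hugetlb rounding above (an illustration; it assumes
 * a 2 MB huge page size): a MADV_DONTNEED request whose end lands 1 MB past
 * a huge page boundary has its end rounded down to that boundary, so the
 * partially covered huge page is left intact rather than discarded.
 */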

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
		return -EINVAL;

	if (start == end)
		return 0;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = vma_lookup(mm, start);
		if (!vma)
			return -ENOMEM;
		/*
		 * Potential end adjustment for hugetlb vma is OK as
		 * the check below keeps end within vma.
		 */
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, the effect of the concurrent
			 * operation should not leave madvise() with
			 * an undefined result. There may be an
			 * adjacent next vma that we'll walk
			 * next. userfaultfd_remove() will generate an
			 * UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_page_range(mm, start, end, write, &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM: /* No VMA or out of memory. */
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}
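
/*
 * Illustrative userspace use of the populate hints (a sketch, not part of
 * this file; it assumes a successful anonymous mmap()): prefault all page
 * tables up front so that later stores do not take minor faults.
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (p != MAP_FAILED)
 *		madvise(p, len, MADV_POPULATE_WRITE);
 */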

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if (!vma_is_shared_maywrite(vma))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	case MADV_COLLAPSE:
		return madvise_collapse(vma, prev, start, end);
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
	case MADV_COLLAPSE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
	case MADV_COLLAPSE:
		return true;
	default:
		return false;
	}
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = find_vma(mm, prev->vm_end);
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file && !vma_is_anon_shmem(vma))
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
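
/*
 * Illustrative userspace path to the helper above (a sketch; it assumes the
 * prctl() plumbing in kernel/sys.c and a page-aligned anonymous mapping at
 * addr): naming a region so it shows up annotated in /proc/<pid>/maps.
 *
 *	prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, addr, len, "my-arena");
 */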
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the application
 *		will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *              range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	start = untagged_addr_remote(mm, start);
	end = start + len;

	blk_start_plug(&plug);
	error = madvise_walk_vmas(mm, start, end, behavior,
			madvise_vma_behavior);
	blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}
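
/*
 * Illustrative userspace usage of the hints documented above (a sketch, not
 * part of the kernel build; it assumes a successful file-backed mmap()):
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(p, len, MADV_SEQUENTIAL);	// expect streaming access
 *	// ... read through the mapping ...
 *	madvise(p, len, MADV_DONTNEED);		// drop the range when done
 */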

SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
		goto free_iov;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
					iter_iov_len(&iter), behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iter_iov_len(&iter));
	}

	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
free_iov:
	kfree(iov);
out:
	return ret;
}
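
/*
 * Illustrative caller of process_madvise() (a sketch; it assumes a pidfd
 * obtained via pidfd_open(2), CAP_SYS_NICE plus PTRACE_MODE_READ access to
 * the target, and that libc exposes the raw syscall number):
 *
 *	struct iovec vec = { .iov_base = addr, .iov_len = len };
 *	syscall(__NR_process_madvise, pidfd, &vec, 1, MADV_COLD, 0);
 */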