1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_PGTABLE_H
3#define _LINUX_PGTABLE_H
4
5#include <linux/pfn.h>
6#include <asm/pgtable.h>
7
8#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)
9#define PUD_ORDER	(PUD_SHIFT - PAGE_SHIFT)
10
11#ifndef __ASSEMBLY__
12#ifdef CONFIG_MMU
13
14#include <linux/mm_types.h>
15#include <linux/bug.h>
16#include <linux/errno.h>
17#include <asm-generic/pgtable_uffd.h>
18#include <linux/page_table_check.h>
19
20#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
21	defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
22#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
23#endif
24
25/*
26 * On almost all architectures and configurations, 0 can be used as the
27 * upper ceiling to free_pgtables(): on many architectures it has the same
28 * effect as using TASK_SIZE.  However, there is one configuration which
29 * must impose a more careful limit, to avoid freeing kernel pgtables.
30 */
31#ifndef USER_PGTABLES_CEILING
32#define USER_PGTABLES_CEILING	0UL
33#endif
34
35/*
36 * This defines the first usable user address. Platforms
37 * can override its value with custom FIRST_USER_ADDRESS
38 * defined in their respective <asm/pgtable.h>.
39 */
40#ifndef FIRST_USER_ADDRESS
41#define FIRST_USER_ADDRESS	0UL
42#endif
43
/*
 * This defines the generic helper for accessing the PMD page
 * table page. Platforms can still override it via their
 * respective <asm/pgtable.h>.
 */
49#ifndef pmd_pgtable
50#define pmd_pgtable(pmd) pmd_page(pmd)
51#endif
52
/*
 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address.
 *
 * As these functions may be used by the same code for different levels of
 * page table folding, they are always available, regardless of the
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */
64
65static inline unsigned long pte_index(unsigned long address)
66{
67	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
68}
69
70#ifndef pmd_index
71static inline unsigned long pmd_index(unsigned long address)
72{
73	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
74}
75#define pmd_index pmd_index
76#endif
77
78#ifndef pud_index
79static inline unsigned long pud_index(unsigned long address)
80{
81	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
82}
83#define pud_index pud_index
84#endif
85
86#ifndef pgd_index
87/* Must be a compile-time constant, so implement it as a macro */
88#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
89#endif
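
/*
 * Example (illustrative only): assuming the common x86-64 4-level layout
 * (PAGE_SHIFT == 12, PMD_SHIFT == 21, PUD_SHIFT == 30, PGDIR_SHIFT == 39,
 * 512 entries per table), the address 0x7f1234567000 decomposes as:
 *
 *	pgd_index(a) == (a >> 39) & 511 == 254
 *	pud_index(a) == (a >> 30) & 511 == 72
 *	pmd_index(a) == (a >> 21) & 511 == 418
 *	pte_index(a) == (a >> 12) & 511 == 359
 *
 * The actual shifts and table sizes are architecture- and config-specific.
 */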
90
91#ifndef pte_offset_kernel
92static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
93{
94	return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
95}
96#define pte_offset_kernel pte_offset_kernel
97#endif
98
99#ifdef CONFIG_HIGHPTE
100#define __pte_map(pmd, address) \
101	((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
102#define pte_unmap(pte)	do {	\
103	kunmap_local((pte));	\
104	rcu_read_unlock();	\
105} while (0)
106#else
107static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
108{
109	return pte_offset_kernel(pmd, address);
110}
111static inline void pte_unmap(pte_t *pte)
112{
113	rcu_read_unlock();
114}
115#endif
116
117void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
118
/* Find an entry in the second-level page table. */
120#ifndef pmd_offset
121static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
122{
123	return pud_pgtable(*pud) + pmd_index(address);
124}
125#define pmd_offset pmd_offset
126#endif
127
128#ifndef pud_offset
129static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
130{
131	return p4d_pgtable(*p4d) + pud_index(address);
132}
133#define pud_offset pud_offset
134#endif
135
136static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
137{
138	return (pgd + pgd_index(address));
139};
140
141/*
142 * a shortcut to get a pgd_t in a given mm
143 */
144#ifndef pgd_offset
145#define pgd_offset(mm, address)		pgd_offset_pgd((mm)->pgd, (address))
146#endif
147
148/*
149 * a shortcut which implies the use of the kernel's pgd, instead
150 * of a process's
151 */
152#ifndef pgd_offset_k
153#define pgd_offset_k(address)		pgd_offset(&init_mm, (address))
154#endif
155
/*
 * In many cases it is known that a virtual address is mapped at PMD or PTE
 * level, so instead of traversing all the page table levels, these helpers
 * get a pointer to the PMD entry in a user or kernel page table, or
 * translate a kernel virtual address directly to a pointer to its PTE in
 * the kernel page tables.
 */
163static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
164{
165	return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
166}
167
168static inline pmd_t *pmd_off_k(unsigned long va)
169{
170	return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
171}
172
173static inline pte_t *virt_to_kpte(unsigned long vaddr)
174{
175	pmd_t *pmd = pmd_off_k(vaddr);
176
177	return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
178}
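
/*
 * Example (an illustrative sketch, not part of the generic API): checking
 * whether a kernel virtual address is currently mapped at PTE level could
 * look like this, assuming the range is known not to be mapped by huge
 * (leaf) PMDs:
 *
 *	pte_t *ptep = virt_to_kpte(vaddr);
 *
 *	if (ptep && pte_present(ptep_get(ptep)))
 *		...the address is mapped...
 */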
179
180#ifndef pmd_young
181static inline int pmd_young(pmd_t pmd)
182{
183	return 0;
184}
185#endif
186
187#ifndef pmd_dirty
188static inline int pmd_dirty(pmd_t pmd)
189{
190	return 0;
191}
192#endif
193
194/*
195 * A facility to provide lazy MMU batching.  This allows PTE updates and
196 * page invalidations to be delayed until a call to leave lazy MMU mode
197 * is issued.  Some architectures may benefit from doing this, and it is
198 * beneficial for both shadow and direct mode hypervisors, which may batch
199 * the PTE updates which happen during this window.  Note that using this
200 * interface requires that read hazards be removed from the code.  A read
201 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
203 * a raw PTE pointer after it has been modified are not guaranteed to be
204 * up to date.  This mode can only be entered and left under the protection of
205 * the page table locks for all page tables which may be modified.  In the UP
206 * case, this is required so that preemption is disabled, and in the SMP case,
207 * it must synchronize the delayed page table writes properly on other CPUs.
208 */
209#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
210#define arch_enter_lazy_mmu_mode()	do {} while (0)
211#define arch_leave_lazy_mmu_mode()	do {} while (0)
212#define arch_flush_lazy_mmu_mode()	do {} while (0)
213#endif
214
215#ifndef pte_batch_hint
216/**
217 * pte_batch_hint - Number of pages that can be added to batch without scanning.
218 * @ptep: Page table pointer for the entry.
219 * @pte: Page table entry.
220 *
221 * Some architectures know that a set of contiguous ptes all map the same
222 * contiguous memory with the same permissions. In this case, it can provide a
223 * hint to aid pte batching without the core code needing to scan every pte.
224 *
225 * An architecture implementation may ignore the PTE accessed state. Further,
226 * the dirty state must apply atomically to all the PTEs described by the hint.
227 *
228 * May be overridden by the architecture, else pte_batch_hint is always 1.
229 */
230static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
231{
232	return 1;
233}
234#endif
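
/*
 * A minimal sketch of how core code can consume the hint while scanning a
 * range of PTEs (the names here are illustrative, not an existing helper):
 *
 *	unsigned int i = 0;
 *
 *	while (i < max_nr) {
 *		pte_t pte = ptep_get(ptep + i);
 *
 *		...process pte...
 *		i += pte_batch_hint(ptep + i, pte);
 *	}
 */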
235
236#ifndef pte_advance_pfn
237static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
238{
239	return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
240}
241#endif
242
243#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)
244
245#ifndef set_ptes
246/**
247 * set_ptes - Map consecutive pages to a contiguous range of addresses.
248 * @mm: Address space to map the pages into.
249 * @addr: Address to map the first page at.
250 * @ptep: Page table pointer for the first entry.
251 * @pte: Page table entry for the first page.
252 * @nr: Number of pages to map.
253 *
 * When nr==1, the initial state of the PTE may be present or not present, and
 * so may the new state. When nr>1, the initial state of all PTEs must be
 * not present, and the new state must be present.
257 *
258 * May be overridden by the architecture, or the architecture can define
259 * set_pte() and PFN_PTE_SHIFT.
260 *
261 * Context: The caller holds the page table lock.  The pages all belong
262 * to the same folio.  The PTEs are all in the same PMD.
263 */
264static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
265		pte_t *ptep, pte_t pte, unsigned int nr)
266{
267	page_table_check_ptes_set(mm, ptep, pte, nr);
268
269	arch_enter_lazy_mmu_mode();
270	for (;;) {
271		set_pte(ptep, pte);
272		if (--nr == 0)
273			break;
274		ptep++;
275		pte = pte_next_pfn(pte);
276	}
277	arch_leave_lazy_mmu_mode();
278}
279#endif
280#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)
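
/*
 * Example (a sketch; the real call sites live in mm/): mapping @nr pages of
 * a folio starting at @addr could look roughly like:
 *
 *	pte_t pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot);
 *
 *	set_ptes(vma->vm_mm, addr, ptep, pte, nr);
 *
 * where all @nr pages belong to the same folio and the caller holds the
 * page table lock.
 */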
281
282#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
283extern int ptep_set_access_flags(struct vm_area_struct *vma,
284				 unsigned long address, pte_t *ptep,
285				 pte_t entry, int dirty);
286#endif
287
288#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
289#ifdef CONFIG_TRANSPARENT_HUGEPAGE
290extern int pmdp_set_access_flags(struct vm_area_struct *vma,
291				 unsigned long address, pmd_t *pmdp,
292				 pmd_t entry, int dirty);
293extern int pudp_set_access_flags(struct vm_area_struct *vma,
294				 unsigned long address, pud_t *pudp,
295				 pud_t entry, int dirty);
296#else
297static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
298					unsigned long address, pmd_t *pmdp,
299					pmd_t entry, int dirty)
300{
301	BUILD_BUG();
302	return 0;
303}
304static inline int pudp_set_access_flags(struct vm_area_struct *vma,
305					unsigned long address, pud_t *pudp,
306					pud_t entry, int dirty)
307{
308	BUILD_BUG();
309	return 0;
310}
311#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
312#endif
313
314#ifndef ptep_get
315static inline pte_t ptep_get(pte_t *ptep)
316{
317	return READ_ONCE(*ptep);
318}
319#endif
320
321#ifndef pmdp_get
322static inline pmd_t pmdp_get(pmd_t *pmdp)
323{
324	return READ_ONCE(*pmdp);
325}
326#endif
327
328#ifndef pudp_get
329static inline pud_t pudp_get(pud_t *pudp)
330{
331	return READ_ONCE(*pudp);
332}
333#endif
334
335#ifndef p4dp_get
336static inline p4d_t p4dp_get(p4d_t *p4dp)
337{
338	return READ_ONCE(*p4dp);
339}
340#endif
341
342#ifndef pgdp_get
343static inline pgd_t pgdp_get(pgd_t *pgdp)
344{
345	return READ_ONCE(*pgdp);
346}
347#endif
348
349#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
350static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
351					    unsigned long address,
352					    pte_t *ptep)
353{
354	pte_t pte = ptep_get(ptep);
355	int r = 1;
356	if (!pte_young(pte))
357		r = 0;
358	else
359		set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
360	return r;
361}
362#endif
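
/*
 * Example (illustrative): page aging typically samples and clears the
 * accessed bit, e.g.:
 *
 *	if (ptep_test_and_clear_young(vma, addr, ptep))
 *		...the page was referenced since the last check...
 *
 * Unlike ptep_clear_flush_young(), no TLB flush is performed here, so on
 * architectures that set the accessed bit in hardware a still-cached TLB
 * entry may prevent the PTE from being marked young again until that entry
 * is evicted or flushed.
 */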
363
364#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
365#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
366static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
367					    unsigned long address,
368					    pmd_t *pmdp)
369{
370	pmd_t pmd = *pmdp;
371	int r = 1;
372	if (!pmd_young(pmd))
373		r = 0;
374	else
375		set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
376	return r;
377}
378#else
379static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
380					    unsigned long address,
381					    pmd_t *pmdp)
382{
383	BUILD_BUG();
384	return 0;
385}
386#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
387#endif
388
389#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
390int ptep_clear_flush_young(struct vm_area_struct *vma,
391			   unsigned long address, pte_t *ptep);
392#endif
393
394#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
395#ifdef CONFIG_TRANSPARENT_HUGEPAGE
396extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
397				  unsigned long address, pmd_t *pmdp);
398#else
/*
 * Despite being relevant only to THP, this API is called from generic rmap
 * code under PageTransHuge(), hence it needs a dummy implementation for !THP.
 */
403static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
404					 unsigned long address, pmd_t *pmdp)
405{
406	BUILD_BUG();
407	return 0;
408}
409#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
410#endif
411
412#ifndef arch_has_hw_nonleaf_pmd_young
413/*
414 * Return whether the accessed bit in non-leaf PMD entries is supported on the
415 * local CPU.
416 */
417static inline bool arch_has_hw_nonleaf_pmd_young(void)
418{
419	return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
420}
421#endif
422
423#ifndef arch_has_hw_pte_young
424/*
425 * Return whether the accessed bit is supported on the local CPU.
426 *
 * This stub assumes that accessing through an old PTE triggers a page fault.
 * Architectures that automatically set the access bit should override it.
429 */
430static inline bool arch_has_hw_pte_young(void)
431{
432	return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
433}
434#endif
435
436#ifndef arch_check_zapped_pte
437static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
438					 pte_t pte)
439{
440}
441#endif
442
443#ifndef arch_check_zapped_pmd
444static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
445					 pmd_t pmd)
446{
447}
448#endif
449
450#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
451static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
452				       unsigned long address,
453				       pte_t *ptep)
454{
455	pte_t pte = ptep_get(ptep);
456	pte_clear(mm, address, ptep);
457	page_table_check_pte_clear(mm, pte);
458	return pte;
459}
460#endif
461
462static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
463			      pte_t *ptep)
464{
465	ptep_get_and_clear(mm, addr, ptep);
466}
467
468#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
469/*
470 * For walking the pagetables without holding any locks.  Some architectures
471 * (eg x86-32 PAE) cannot load the entries atomically without using expensive
472 * instructions.  We are guaranteed that a PTE will only either go from not
473 * present to present, or present to not present -- it will not switch to a
 * completely different present page without a TLB flush in between, which we
 * prevent by holding interrupts off.
476 *
477 * Setting ptes from not present to present goes:
478 *
479 *   ptep->pte_high = h;
480 *   smp_wmb();
481 *   ptep->pte_low = l;
482 *
483 * And present to not present goes:
484 *
485 *   ptep->pte_low = 0;
486 *   smp_wmb();
487 *   ptep->pte_high = 0;
488 *
489 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
490 * We load pte_high *after* loading pte_low, which ensures we don't see an older
491 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
492 * picked up a changed pte high. We might have gotten rubbish values from
493 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
494 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
495 * operates on present ptes we're safe.
496 */
497static inline pte_t ptep_get_lockless(pte_t *ptep)
498{
499	pte_t pte;
500
501	do {
502		pte.pte_low = ptep->pte_low;
503		smp_rmb();
504		pte.pte_high = ptep->pte_high;
505		smp_rmb();
506	} while (unlikely(pte.pte_low != ptep->pte_low));
507
508	return pte;
509}
510#define ptep_get_lockless ptep_get_lockless
511
512#if CONFIG_PGTABLE_LEVELS > 2
513static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
514{
515	pmd_t pmd;
516
517	do {
518		pmd.pmd_low = pmdp->pmd_low;
519		smp_rmb();
520		pmd.pmd_high = pmdp->pmd_high;
521		smp_rmb();
522	} while (unlikely(pmd.pmd_low != pmdp->pmd_low));
523
524	return pmd;
525}
526#define pmdp_get_lockless pmdp_get_lockless
527#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
528#endif /* CONFIG_PGTABLE_LEVELS > 2 */
529#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
530
531/*
532 * We require that the PTE can be read atomically.
533 */
534#ifndef ptep_get_lockless
535static inline pte_t ptep_get_lockless(pte_t *ptep)
536{
537	return ptep_get(ptep);
538}
539#endif
540
541#ifndef pmdp_get_lockless
542static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
543{
544	return pmdp_get(pmdp);
545}
546static inline void pmdp_get_lockless_sync(void)
547{
548}
549#endif
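
/*
 * A minimal usage sketch (GUP-fast style): the caller disables interrupts
 * (or otherwise prevents the page tables from being freed and TLB flushes
 * from completing) and then reads the entry without the page table lock:
 *
 *	local_irq_save(flags);
 *	pte = ptep_get_lockless(ptep);
 *	if (pte_present(pte))
 *		...speculatively operate on the entry...
 *	local_irq_restore(flags);
 */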
550
551#ifdef CONFIG_TRANSPARENT_HUGEPAGE
552#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
553static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
554					    unsigned long address,
555					    pmd_t *pmdp)
556{
557	pmd_t pmd = *pmdp;
558
559	pmd_clear(pmdp);
560	page_table_check_pmd_clear(mm, pmd);
561
562	return pmd;
563}
564#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
565#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
566static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
567					    unsigned long address,
568					    pud_t *pudp)
569{
570	pud_t pud = *pudp;
571
572	pud_clear(pudp);
573	page_table_check_pud_clear(mm, pud);
574
575	return pud;
576}
577#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
578#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
579
580#ifdef CONFIG_TRANSPARENT_HUGEPAGE
581#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
582static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
583					    unsigned long address, pmd_t *pmdp,
584					    int full)
585{
586	return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
587}
588#endif
589
590#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
591static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
592					    unsigned long address, pud_t *pudp,
593					    int full)
594{
595	return pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
596}
597#endif
598#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
599
600#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
601static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
602					    unsigned long address, pte_t *ptep,
603					    int full)
604{
605	return ptep_get_and_clear(mm, address, ptep);
606}
607#endif
608
609#ifndef get_and_clear_full_ptes
610/**
611 * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
612 *			     the same folio, collecting dirty/accessed bits.
613 * @mm: Address space the pages are mapped into.
614 * @addr: Address the first page is mapped at.
615 * @ptep: Page table pointer for the first entry.
616 * @nr: Number of entries to clear.
617 * @full: Whether we are clearing a full mm.
618 *
619 * May be overridden by the architecture; otherwise, implemented as a simple
620 * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
621 * returned PTE.
622 *
623 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
624 * some PTEs might be write-protected.
625 *
626 * Context: The caller holds the page table lock.  The PTEs map consecutive
627 * pages that belong to the same folio.  The PTEs are all in the same PMD.
628 */
629static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
630		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
631{
632	pte_t pte, tmp_pte;
633
634	pte = ptep_get_and_clear_full(mm, addr, ptep, full);
635	while (--nr) {
636		ptep++;
637		addr += PAGE_SIZE;
638		tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
639		if (pte_dirty(tmp_pte))
640			pte = pte_mkdirty(pte);
641		if (pte_young(tmp_pte))
642			pte = pte_mkyoung(pte);
643	}
644	return pte;
645}
646#endif
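
/*
 * Example (a sketch of the intended use in unmap/zap paths): clear @nr PTEs
 * of one folio and propagate the collected bits to the folio:
 *
 *	pte_t pte = get_and_clear_full_ptes(mm, addr, ptep, nr, full);
 *
 *	if (pte_dirty(pte))
 *		folio_mark_dirty(folio);
 *	if (pte_young(pte))
 *		folio_mark_accessed(folio);
 */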
647
648#ifndef clear_full_ptes
649/**
650 * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
651 *		     folio.
652 * @mm: Address space the pages are mapped into.
653 * @addr: Address the first page is mapped at.
654 * @ptep: Page table pointer for the first entry.
655 * @nr: Number of entries to clear.
656 * @full: Whether we are clearing a full mm.
657 *
658 * May be overridden by the architecture; otherwise, implemented as a simple
659 * loop over ptep_get_and_clear_full().
660 *
661 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
662 * some PTEs might be write-protected.
663 *
664 * Context: The caller holds the page table lock.  The PTEs map consecutive
665 * pages that belong to the same folio.  The PTEs are all in the same PMD.
666 */
667static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
668		pte_t *ptep, unsigned int nr, int full)
669{
670	for (;;) {
671		ptep_get_and_clear_full(mm, addr, ptep, full);
672		if (--nr == 0)
673			break;
674		ptep++;
675		addr += PAGE_SIZE;
676	}
677}
678#endif
679
/*
 * If two threads concurrently fault at the same page, the thread that
 * won the race updates the PTE and its local TLB/cache. The other thread
 * gives up, simply does nothing, and continues; on architectures where
 * software can update the TLB, the local TLB can be updated here to avoid
 * the next page fault. This function updates the TLB only; it does nothing
 * with the cache or anything else, which is what distinguishes it from
 * update_mmu_cache().
 */
688#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
689static inline void update_mmu_tlb(struct vm_area_struct *vma,
690				unsigned long address, pte_t *ptep)
691{
692}
693#define __HAVE_ARCH_UPDATE_MMU_TLB
694#endif
695
696/*
697 * Some architectures may be able to avoid expensive synchronization
698 * primitives when modifications are made to PTE's which are already
699 * not present, or in the process of an address space destruction.
700 */
701#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
702static inline void pte_clear_not_present_full(struct mm_struct *mm,
703					      unsigned long address,
704					      pte_t *ptep,
705					      int full)
706{
707	pte_clear(mm, address, ptep);
708}
709#endif
710
711#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
712extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
713			      unsigned long address,
714			      pte_t *ptep);
715#endif
716
717#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
718extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
719			      unsigned long address,
720			      pmd_t *pmdp);
721extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
722			      unsigned long address,
723			      pud_t *pudp);
724#endif
725
726#ifndef pte_mkwrite
727static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
728{
729	return pte_mkwrite_novma(pte);
730}
731#endif
732
733#if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite)
734static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
735{
736	return pmd_mkwrite_novma(pmd);
737}
738#endif
739
740#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
741struct mm_struct;
742static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
743{
744	pte_t old_pte = ptep_get(ptep);
745	set_pte_at(mm, address, ptep, pte_wrprotect(old_pte));
746}
747#endif
748
749#ifndef wrprotect_ptes
750/**
751 * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
752 *		    folio.
753 * @mm: Address space the pages are mapped into.
754 * @addr: Address the first page is mapped at.
755 * @ptep: Page table pointer for the first entry.
756 * @nr: Number of entries to write-protect.
757 *
758 * May be overridden by the architecture; otherwise, implemented as a simple
759 * loop over ptep_set_wrprotect().
760 *
761 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
762 * some PTEs might be write-protected.
763 *
764 * Context: The caller holds the page table lock.  The PTEs map consecutive
765 * pages that belong to the same folio.  The PTEs are all in the same PMD.
766 */
767static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
768		pte_t *ptep, unsigned int nr)
769{
770	for (;;) {
771		ptep_set_wrprotect(mm, addr, ptep);
772		if (--nr == 0)
773			break;
774		ptep++;
775		addr += PAGE_SIZE;
776	}
777}
778#endif
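
/*
 * Example (illustrative): when copying page tables for a CoW mapping, the
 * parent's PTEs for one folio can be write-protected as a batch:
 *
 *	wrprotect_ptes(src_mm, addr, src_ptep, nr);
 *
 * before write-protected copies are installed in the child with set_ptes().
 */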
779
/*
 * On some architectures the hardware does not set the page access bit when a
 * page is accessed; it is the responsibility of software to set it, which
 * incurs an extra page fault penalty just to track the access bit. As an
 * optimization, the access bit can be set during every page fault on these
 * architectures. To differentiate it from the pte_mkyoung() macro, this macro
 * is only used on platforms where software maintains the page access bit.
 */
788#ifndef pte_sw_mkyoung
789static inline pte_t pte_sw_mkyoung(pte_t pte)
790{
791	return pte;
792}
793#define pte_sw_mkyoung	pte_sw_mkyoung
794#endif
795
796#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
797#ifdef CONFIG_TRANSPARENT_HUGEPAGE
798static inline void pmdp_set_wrprotect(struct mm_struct *mm,
799				      unsigned long address, pmd_t *pmdp)
800{
801	pmd_t old_pmd = *pmdp;
802	set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
803}
804#else
805static inline void pmdp_set_wrprotect(struct mm_struct *mm,
806				      unsigned long address, pmd_t *pmdp)
807{
808	BUILD_BUG();
809}
810#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
811#endif
812#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
813#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
814#ifdef CONFIG_TRANSPARENT_HUGEPAGE
815static inline void pudp_set_wrprotect(struct mm_struct *mm,
816				      unsigned long address, pud_t *pudp)
817{
818	pud_t old_pud = *pudp;
819
820	set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
821}
822#else
823static inline void pudp_set_wrprotect(struct mm_struct *mm,
824				      unsigned long address, pud_t *pudp)
825{
826	BUILD_BUG();
827}
828#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
829#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
830#endif
831
832#ifndef pmdp_collapse_flush
833#ifdef CONFIG_TRANSPARENT_HUGEPAGE
834extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
835				 unsigned long address, pmd_t *pmdp);
836#else
837static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
838					unsigned long address,
839					pmd_t *pmdp)
840{
841	BUILD_BUG();
842	return *pmdp;
843}
844#define pmdp_collapse_flush pmdp_collapse_flush
845#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
846#endif
847
848#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
849extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
850				       pgtable_t pgtable);
851#endif
852
853#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
854extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
855#endif
856
857#ifndef arch_needs_pgtable_deposit
858#define arch_needs_pgtable_deposit() (false)
859#endif
860
861#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * This is an implementation of pmdp_establish() that is only suitable for an
 * architecture that doesn't have hardware dirty/accessed bits. In this case we
 * can't race with the CPU setting these bits, so a non-atomic approach is fine.
 */
867static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
868		unsigned long address, pmd_t *pmdp, pmd_t pmd)
869{
870	pmd_t old_pmd = *pmdp;
871	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
872	return old_pmd;
873}
874#endif
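
/*
 * An architecture without hardware dirty/accessed bits can simply alias its
 * pmdp_establish() to the generic version in its <asm/pgtable.h>, e.g.:
 *
 *	#define pmdp_establish generic_pmdp_establish
 */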
875
876#ifndef __HAVE_ARCH_PMDP_INVALIDATE
877extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
878			    pmd_t *pmdp);
879#endif
880
881#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
882
/*
 * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
 * hugepage mapping in the page tables. This function is similar to
 * pmdp_invalidate(), but should only be used if the access and dirty bits
 * would not be cleared by the software in the new PMD value. The function
 * ensures that hardware updates of the access and dirty bits are not lost.
 *
 * On certain architectures this allows a TLB flush to be avoided in most
 * cases. Another TLB flush might still be necessary later if the PMD update
 * itself requires one (e.g., if protection was made stricter), but even then
 * the caller may be able to batch these flushing operations, so fewer TLB
 * flushes are needed overall.
 */
897extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
898				unsigned long address, pmd_t *pmdp);
899#endif
900
901#ifndef __HAVE_ARCH_PTE_SAME
902static inline int pte_same(pte_t pte_a, pte_t pte_b)
903{
904	return pte_val(pte_a) == pte_val(pte_b);
905}
906#endif
907
908#ifndef __HAVE_ARCH_PTE_UNUSED
909/*
910 * Some architectures provide facilities to virtualization guests
911 * so that they can flag allocated pages as unused. This allows the
912 * host to transparently reclaim unused pages. This function returns
913 * whether the pte's page is unused.
914 */
915static inline int pte_unused(pte_t pte)
916{
917	return 0;
918}
919#endif
920
921#ifndef pte_access_permitted
922#define pte_access_permitted(pte, write) \
923	(pte_present(pte) && (!(write) || pte_write(pte)))
924#endif
925
926#ifndef pmd_access_permitted
927#define pmd_access_permitted(pmd, write) \
928	(pmd_present(pmd) && (!(write) || pmd_write(pmd)))
929#endif
930
931#ifndef pud_access_permitted
932#define pud_access_permitted(pud, write) \
933	(pud_present(pud) && (!(write) || pud_write(pud)))
934#endif
935
936#ifndef p4d_access_permitted
937#define p4d_access_permitted(p4d, write) \
938	(p4d_present(p4d) && (!(write) || p4d_write(p4d)))
939#endif
940
941#ifndef pgd_access_permitted
942#define pgd_access_permitted(pgd, write) \
943	(pgd_present(pgd) && (!(write) || pgd_write(pgd)))
944#endif
945
946#ifndef __HAVE_ARCH_PMD_SAME
947static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
948{
949	return pmd_val(pmd_a) == pmd_val(pmd_b);
950}
951#endif
952
953#ifndef pud_same
954static inline int pud_same(pud_t pud_a, pud_t pud_b)
955{
956	return pud_val(pud_a) == pud_val(pud_b);
957}
958#define pud_same pud_same
959#endif
960
961#ifndef __HAVE_ARCH_P4D_SAME
962static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
963{
964	return p4d_val(p4d_a) == p4d_val(p4d_b);
965}
966#endif
967
968#ifndef __HAVE_ARCH_PGD_SAME
969static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
970{
971	return pgd_val(pgd_a) == pgd_val(pgd_b);
972}
973#endif
974
975/*
976 * Use set_p*_safe(), and elide TLB flushing, when confident that *no*
977 * TLB flush will be required as a result of the "set". For example, use
978 * in scenarios where it is known ahead of time that the routine is
979 * setting non-present entries, or re-setting an existing entry to the
980 * same value. Otherwise, use the typical "set" helpers and flush the
981 * TLB.
982 */
983#define set_pte_safe(ptep, pte) \
984({ \
985	WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \
986	set_pte(ptep, pte); \
987})
988
989#define set_pmd_safe(pmdp, pmd) \
990({ \
991	WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \
992	set_pmd(pmdp, pmd); \
993})
994
995#define set_pud_safe(pudp, pud) \
996({ \
997	WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \
998	set_pud(pudp, pud); \
999})
1000
1001#define set_p4d_safe(p4dp, p4d) \
1002({ \
1003	WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \
1004	set_p4d(p4dp, p4d); \
1005})
1006
1007#define set_pgd_safe(pgdp, pgd) \
1008({ \
1009	WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \
1010	set_pgd(pgdp, pgd); \
1011})
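
/*
 * Example (illustrative): early boot code that populates brand-new, known
 * non-present entries can use the _safe variants to document (and verify)
 * that no TLB flush is required, e.g.:
 *
 *	set_pte_safe(pte, __pte(paddr | pgprot_val(PAGE_KERNEL)));
 */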
1012
1013#ifndef __HAVE_ARCH_DO_SWAP_PAGE
1014/*
1015 * Some architectures support metadata associated with a page. When a
1016 * page is being swapped out, this metadata must be saved so it can be
1017 * restored when the page is swapped back in. SPARC M7 and newer
1018 * processors support an ADI (Application Data Integrity) tag for the
1019 * page as metadata for the page. arch_do_swap_page() can restore this
1020 * metadata when a page is swapped back in.
1021 */
1022static inline void arch_do_swap_page(struct mm_struct *mm,
1023				     struct vm_area_struct *vma,
1024				     unsigned long addr,
1025				     pte_t pte, pte_t oldpte)
1026{
1027
1028}
1029#endif
1030
1031#ifndef __HAVE_ARCH_UNMAP_ONE
1032/*
1033 * Some architectures support metadata associated with a page. When a
1034 * page is being swapped out, this metadata must be saved so it can be
1035 * restored when the page is swapped back in. SPARC M7 and newer
1036 * processors support an ADI (Application Data Integrity) tag for the
1037 * page as metadata for the page. arch_unmap_one() can save this
1038 * metadata on a swap-out of a page.
1039 */
1040static inline int arch_unmap_one(struct mm_struct *mm,
1041				  struct vm_area_struct *vma,
1042				  unsigned long addr,
1043				  pte_t orig_pte)
1044{
1045	return 0;
1046}
1047#endif
1048
1049/*
1050 * Allow architectures to preserve additional metadata associated with
1051 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
1052 * prototypes must be defined in the arch-specific asm/pgtable.h file.
1053 */
1054#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
1055static inline int arch_prepare_to_swap(struct page *page)
1056{
1057	return 0;
1058}
1059#endif
1060
1061#ifndef __HAVE_ARCH_SWAP_INVALIDATE
1062static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
1063{
1064}
1065
1066static inline void arch_swap_invalidate_area(int type)
1067{
1068}
1069#endif
1070
1071#ifndef __HAVE_ARCH_SWAP_RESTORE
1072static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
1073{
1074}
1075#endif
1076
1077#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
1078#define pgd_offset_gate(mm, addr)	pgd_offset(mm, addr)
1079#endif
1080
1081#ifndef __HAVE_ARCH_MOVE_PTE
1082#define move_pte(pte, prot, old_addr, new_addr)	(pte)
1083#endif
1084
1085#ifndef pte_accessible
1086# define pte_accessible(mm, pte)	((void)(pte), 1)
1087#endif
1088
1089#ifndef flush_tlb_fix_spurious_fault
1090#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
1091#endif
1092
1093/*
1094 * When walking page tables, get the address of the next boundary,
1095 * or the end address of the range if that comes earlier.  Although no
1096 * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout.
1097 */
1098
1099#define pgd_addr_end(addr, end)						\
1100({	unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;	\
1101	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
1102})
1103
1104#ifndef p4d_addr_end
1105#define p4d_addr_end(addr, end)						\
1106({	unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;	\
1107	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
1108})
1109#endif
1110
1111#ifndef pud_addr_end
1112#define pud_addr_end(addr, end)						\
1113({	unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;	\
1114	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
1115})
1116#endif
1117
1118#ifndef pmd_addr_end
1119#define pmd_addr_end(addr, end)						\
1120({	unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;	\
1121	(__boundary - 1 < (end) - 1)? __boundary: (end);		\
1122})
1123#endif
1124
1125/*
1126 * When walking page tables, we usually want to skip any p?d_none entries;
1127 * and any p?d_bad entries - reporting the error before resetting to none.
1128 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
1129 */
1130void pgd_clear_bad(pgd_t *);
1131
1132#ifndef __PAGETABLE_P4D_FOLDED
1133void p4d_clear_bad(p4d_t *);
1134#else
1135#define p4d_clear_bad(p4d)        do { } while (0)
1136#endif
1137
1138#ifndef __PAGETABLE_PUD_FOLDED
1139void pud_clear_bad(pud_t *);
1140#else
#define pud_clear_bad(pud)        do { } while (0)
1142#endif
1143
1144void pmd_clear_bad(pmd_t *);
1145
1146static inline int pgd_none_or_clear_bad(pgd_t *pgd)
1147{
1148	if (pgd_none(*pgd))
1149		return 1;
1150	if (unlikely(pgd_bad(*pgd))) {
1151		pgd_clear_bad(pgd);
1152		return 1;
1153	}
1154	return 0;
1155}
1156
1157static inline int p4d_none_or_clear_bad(p4d_t *p4d)
1158{
1159	if (p4d_none(*p4d))
1160		return 1;
1161	if (unlikely(p4d_bad(*p4d))) {
1162		p4d_clear_bad(p4d);
1163		return 1;
1164	}
1165	return 0;
1166}
1167
1168static inline int pud_none_or_clear_bad(pud_t *pud)
1169{
1170	if (pud_none(*pud))
1171		return 1;
1172	if (unlikely(pud_bad(*pud))) {
1173		pud_clear_bad(pud);
1174		return 1;
1175	}
1176	return 0;
1177}
1178
1179static inline int pmd_none_or_clear_bad(pmd_t *pmd)
1180{
1181	if (pmd_none(*pmd))
1182		return 1;
1183	if (unlikely(pmd_bad(*pmd))) {
1184		pmd_clear_bad(pmd);
1185		return 1;
1186	}
1187	return 0;
1188}
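
/*
 * Together with the p?d_addr_end() helpers above, these form the canonical
 * range-walking pattern (a simplified sketch of what the mm/ walkers do):
 *
 *	pmd_t *pmd = pmd_offset(pud, addr);
 *
 *	do {
 *		next = pmd_addr_end(addr, end);
 *		if (pmd_none_or_clear_bad(pmd))
 *			continue;
 *		...walk the PTEs from addr to next...
 *	} while (pmd++, addr = next, addr != end);
 */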
1189
1190static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
1191					     unsigned long addr,
1192					     pte_t *ptep)
1193{
1194	/*
1195	 * Get the current pte state, but zero it out to make it
1196	 * non-present, preventing the hardware from asynchronously
1197	 * updating it.
1198	 */
1199	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
1200}
1201
1202static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
1203					     unsigned long addr,
1204					     pte_t *ptep, pte_t pte)
1205{
1206	/*
1207	 * The pte is non-present, so there's no hardware state to
1208	 * preserve.
1209	 */
1210	set_pte_at(vma->vm_mm, addr, ptep, pte);
1211}
1212
1213#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
1214/*
1215 * Start a pte protection read-modify-write transaction, which
1216 * protects against asynchronous hardware modifications to the pte.
1217 * The intention is not to prevent the hardware from making pte
1218 * updates, but to prevent any updates it may make from being lost.
1219 *
1220 * This does not protect against other software modifications of the
1221 * pte; the appropriate pte lock must be held over the transaction.
1222 *
1223 * Note that this interface is intended to be batchable, meaning that
1224 * ptep_modify_prot_commit may not actually update the pte, but merely
1225 * queue the update to be done at some later time.  The update must be
1226 * actually committed before the pte lock is released, however.
1227 */
1228static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
1229					   unsigned long addr,
1230					   pte_t *ptep)
1231{
1232	return __ptep_modify_prot_start(vma, addr, ptep);
1233}
1234
1235/*
1236 * Commit an update to a pte, leaving any hardware-controlled bits in
1237 * the PTE unmodified.
1238 */
1239static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
1240					   unsigned long addr,
1241					   pte_t *ptep, pte_t old_pte, pte_t pte)
1242{
1243	__ptep_modify_prot_commit(vma, addr, ptep, pte);
1244}
1245#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
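
/*
 * Example (a sketch of mprotect-style usage): change the protection of a
 * mapped PTE without losing concurrent hardware access/dirty updates:
 *
 *	oldpte = ptep_modify_prot_start(vma, addr, ptep);
 *	ptent = pte_modify(oldpte, newprot);
 *	...adjust write/soft-dirty bits as needed...
 *	ptep_modify_prot_commit(vma, addr, ptep, oldpte, ptent);
 */
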
1246#endif /* CONFIG_MMU */
1247
1248/*
1249 * No-op macros that just return the current protection value. Defined here
1250 * because these macros can be used even if CONFIG_MMU is not defined.
1251 */
1252
1253#ifndef pgprot_nx
1254#define pgprot_nx(prot)	(prot)
1255#endif
1256
1257#ifndef pgprot_noncached
1258#define pgprot_noncached(prot)	(prot)
1259#endif
1260
1261#ifndef pgprot_writecombine
1262#define pgprot_writecombine pgprot_noncached
1263#endif
1264
1265#ifndef pgprot_writethrough
1266#define pgprot_writethrough pgprot_noncached
1267#endif
1268
1269#ifndef pgprot_device
1270#define pgprot_device pgprot_noncached
1271#endif
1272
1273#ifndef pgprot_mhp
1274#define pgprot_mhp(prot)	(prot)
1275#endif
1276
1277#ifdef CONFIG_MMU
1278#ifndef pgprot_modify
1279#define pgprot_modify pgprot_modify
1280static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
1281{
1282	if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot)))
1283		newprot = pgprot_noncached(newprot);
1284	if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot)))
1285		newprot = pgprot_writecombine(newprot);
1286	if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot)))
1287		newprot = pgprot_device(newprot);
1288	return newprot;
1289}
1290#endif
1291#endif /* CONFIG_MMU */
1292
1293#ifndef pgprot_encrypted
1294#define pgprot_encrypted(prot)	(prot)
1295#endif
1296
1297#ifndef pgprot_decrypted
1298#define pgprot_decrypted(prot)	(prot)
1299#endif
1300
1301/*
1302 * A facility to provide batching of the reload of page tables and
1303 * other process state with the actual context switch code for
1304 * paravirtualized guests.  By convention, only one of the batched
1305 * update (lazy) modes (CPU, MMU) should be active at any given time,
1306 * entry should never be nested, and entry and exits should always be
1307 * paired.  This is for sanity of maintaining and reasoning about the
1308 * kernel code.  In this case, the exit (end of the context switch) is
1309 * in architecture-specific code, and so doesn't need a generic
1310 * definition.
1311 */
1312#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
1313#define arch_start_context_switch(prev)	do {} while (0)
1314#endif
1315
1316#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
1317#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
1318static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
1319{
1320	return pmd;
1321}
1322
1323static inline int pmd_swp_soft_dirty(pmd_t pmd)
1324{
1325	return 0;
1326}
1327
1328static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
1329{
1330	return pmd;
1331}
1332#endif
1333#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
1334static inline int pte_soft_dirty(pte_t pte)
1335{
1336	return 0;
1337}
1338
1339static inline int pmd_soft_dirty(pmd_t pmd)
1340{
1341	return 0;
1342}
1343
1344static inline pte_t pte_mksoft_dirty(pte_t pte)
1345{
1346	return pte;
1347}
1348
1349static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
1350{
1351	return pmd;
1352}
1353
1354static inline pte_t pte_clear_soft_dirty(pte_t pte)
1355{
1356	return pte;
1357}
1358
1359static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
1360{
1361	return pmd;
1362}
1363
1364static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
1365{
1366	return pte;
1367}
1368
1369static inline int pte_swp_soft_dirty(pte_t pte)
1370{
1371	return 0;
1372}
1373
1374static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
1375{
1376	return pte;
1377}
1378
1379static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
1380{
1381	return pmd;
1382}
1383
1384static inline int pmd_swp_soft_dirty(pmd_t pmd)
1385{
1386	return 0;
1387}
1388
1389static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
1390{
1391	return pmd;
1392}
1393#endif
1394
1395#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * the memory type of pfn mappings specified by remap_pfn_range()
 * and vmf_insert_pfn().
 */
1401
1402/*
1403 * track_pfn_remap is called when a _new_ pfn mapping is being established
1404 * by remap_pfn_range() for physical range indicated by pfn and size.
1405 */
1406static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
1407				  unsigned long pfn, unsigned long addr,
1408				  unsigned long size)
1409{
1410	return 0;
1411}
1412
1413/*
1414 * track_pfn_insert is called when a _new_ single pfn is established
1415 * by vmf_insert_pfn().
1416 */
1417static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
1418				    pfn_t pfn)
1419{
1420}
1421
1422/*
1423 * track_pfn_copy is called when vma that is covering the pfnmap gets
1424 * copied through copy_page_range().
1425 */
1426static inline int track_pfn_copy(struct vm_area_struct *vma)
1427{
1428	return 0;
1429}
1430
1431/*
1432 * untrack_pfn is called while unmapping a pfnmap for a region.
1433 * untrack can be called for a specific region indicated by pfn and size or
1434 * can be for the entire vma (in which case pfn, size are zero).
1435 */
1436static inline void untrack_pfn(struct vm_area_struct *vma,
1437			       unsigned long pfn, unsigned long size,
1438			       bool mm_wr_locked)
1439{
1440}
1441
/*
 * untrack_pfn_clear is called when mremapping a pfnmap to a new region,
 * or when copying the page tables fails while duplicating a vm area.
 */
1446static inline void untrack_pfn_clear(struct vm_area_struct *vma)
1447{
1448}
1449#else
1450extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
1451			   unsigned long pfn, unsigned long addr,
1452			   unsigned long size);
1453extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
1454			     pfn_t pfn);
1455extern int track_pfn_copy(struct vm_area_struct *vma);
1456extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
1457			unsigned long size, bool mm_wr_locked);
1458extern void untrack_pfn_clear(struct vm_area_struct *vma);
1459#endif
1460
1461#ifdef CONFIG_MMU
1462#ifdef __HAVE_COLOR_ZERO_PAGE
1463static inline int is_zero_pfn(unsigned long pfn)
1464{
1465	extern unsigned long zero_pfn;
1466	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
1467	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
1468}
1469
1470#define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
1471
1472#else
1473static inline int is_zero_pfn(unsigned long pfn)
1474{
1475	extern unsigned long zero_pfn;
1476	return pfn == zero_pfn;
1477}
1478
1479static inline unsigned long my_zero_pfn(unsigned long addr)
1480{
1481	extern unsigned long zero_pfn;
1482	return zero_pfn;
1483}
1484#endif
1485#else
1486static inline int is_zero_pfn(unsigned long pfn)
1487{
1488	return 0;
1489}
1490
1491static inline unsigned long my_zero_pfn(unsigned long addr)
1492{
1493	return 0;
1494}
1495#endif /* CONFIG_MMU */
1496
1497#ifdef CONFIG_MMU
1498
1499#ifndef CONFIG_TRANSPARENT_HUGEPAGE
1500static inline int pmd_trans_huge(pmd_t pmd)
1501{
1502	return 0;
1503}
1504#ifndef pmd_write
1505static inline int pmd_write(pmd_t pmd)
1506{
1507	BUG();
1508	return 0;
1509}
1510#endif /* pmd_write */
1511#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1512
1513#ifndef pud_write
1514static inline int pud_write(pud_t pud)
1515{
1516	BUG();
1517	return 0;
1518}
1519#endif /* pud_write */
1520
1521#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
1522static inline int pmd_devmap(pmd_t pmd)
1523{
1524	return 0;
1525}
1526static inline int pud_devmap(pud_t pud)
1527{
1528	return 0;
1529}
1530static inline int pgd_devmap(pgd_t pgd)
1531{
1532	return 0;
1533}
1534#endif
1535
1536#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
1537	!defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1538static inline int pud_trans_huge(pud_t pud)
1539{
1540	return 0;
1541}
1542#endif
1543
1544static inline int pud_trans_unstable(pud_t *pud)
1545{
1546#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
1547	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1548	pud_t pudval = READ_ONCE(*pud);
1549
1550	if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval))
1551		return 1;
1552	if (unlikely(pud_bad(pudval))) {
1553		pud_clear_bad(pud);
1554		return 1;
1555	}
1556#endif
1557	return 0;
1558}
1559
1560#ifndef CONFIG_NUMA_BALANCING
1561/*
1562 * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
1563 * perfectly valid to indicate "no" in that case, which is why our default
1564 * implementation defaults to "always no".
1565 *
1566 * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE
1567 * page protection due to NUMA hinting. NUMA hinting faults only apply in
1568 * accessible VMAs.
1569 *
1570 * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault,
1571 * looking at the VMA accessibility is sufficient.
1572 */
1573static inline int pte_protnone(pte_t pte)
1574{
1575	return 0;
1576}
1577
1578static inline int pmd_protnone(pmd_t pmd)
1579{
1580	return 0;
1581}
1582#endif /* CONFIG_NUMA_BALANCING */
1583
1584#endif /* CONFIG_MMU */
1585
1586#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
1587
1588#ifndef __PAGETABLE_P4D_FOLDED
1589int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
1590void p4d_clear_huge(p4d_t *p4d);
1591#else
1592static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
1593{
1594	return 0;
1595}
1596static inline void p4d_clear_huge(p4d_t *p4d) { }
1597#endif /* !__PAGETABLE_P4D_FOLDED */
1598
1599int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
1600int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
1601int pud_clear_huge(pud_t *pud);
1602int pmd_clear_huge(pmd_t *pmd);
1603int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
1604int pud_free_pmd_page(pud_t *pud, unsigned long addr);
1605int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
1606#else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
1607static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
1608{
1609	return 0;
1610}
1611static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1612{
1613	return 0;
1614}
1615static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1616{
1617	return 0;
1618}
1619static inline void p4d_clear_huge(p4d_t *p4d) { }
1620static inline int pud_clear_huge(pud_t *pud)
1621{
1622	return 0;
1623}
1624static inline int pmd_clear_huge(pmd_t *pmd)
1625{
1626	return 0;
1627}
1628static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
1629{
1630	return 0;
1631}
1632static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1633{
1634	return 0;
1635}
1636static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1637{
1638	return 0;
1639}
1640#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
1641
1642#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
1643#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * ARCHes with special requirements for evicting THP backing TLB entries can
 * implement this. Even otherwise, it can help optimize the normal TLB flush
 * in the THP regime: the stock flush_tlb_range() typically has an optimization
 * to nuke the entire TLB if the flush span is greater than a threshold, which
 * will likely be true for a single huge page. Thus a single THP flush would
 * invalidate the entire TLB, which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 */
1653#define flush_pmd_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
1654#define flush_pud_tlb_range(vma, addr, end)	flush_tlb_range(vma, addr, end)
1655#else
1656#define flush_pmd_tlb_range(vma, addr, end)	BUILD_BUG()
1657#define flush_pud_tlb_range(vma, addr, end)	BUILD_BUG()
1658#endif
1659#endif
1660
1661struct file;
1662int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
1663			unsigned long size, pgprot_t *vma_prot);
1664
1665#ifndef CONFIG_X86_ESPFIX64
1666static inline void init_espfix_bsp(void) { }
1667#endif
1668
1669extern void __init pgtable_cache_init(void);
1670
1671#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
1672static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
1673{
1674	return true;
1675}
1676
1677static inline bool arch_has_pfn_modify_check(void)
1678{
1679	return false;
1680}
1681#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
1682
1683/*
1684 * Architecture PAGE_KERNEL_* fallbacks
1685 *
1686 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
1687 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below is a set of relatively safe,
 * best-effort fallbacks that we can count on until the architectures define
 * them on their own.
1691 */
1692
1693#ifndef PAGE_KERNEL_RO
1694# define PAGE_KERNEL_RO PAGE_KERNEL
1695#endif
1696
1697#ifndef PAGE_KERNEL_EXEC
1698# define PAGE_KERNEL_EXEC PAGE_KERNEL
1699#endif
1700
1701/*
1702 * Page Table Modification bits for pgtbl_mod_mask.
1703 *
 * These are used by the p?d_alloc_track*() set of functions and in the generic
1705 * vmalloc/ioremap code to track at which page-table levels entries have been
1706 * modified. Based on that the code can better decide when vmalloc and ioremap
1707 * mapping changes need to be synchronized to other page-tables in the system.
1708 */
1709#define		__PGTBL_PGD_MODIFIED	0
1710#define		__PGTBL_P4D_MODIFIED	1
1711#define		__PGTBL_PUD_MODIFIED	2
1712#define		__PGTBL_PMD_MODIFIED	3
1713#define		__PGTBL_PTE_MODIFIED	4
1714
1715#define		PGTBL_PGD_MODIFIED	BIT(__PGTBL_PGD_MODIFIED)
1716#define		PGTBL_P4D_MODIFIED	BIT(__PGTBL_P4D_MODIFIED)
1717#define		PGTBL_PUD_MODIFIED	BIT(__PGTBL_PUD_MODIFIED)
1718#define		PGTBL_PMD_MODIFIED	BIT(__PGTBL_PMD_MODIFIED)
1719#define		PGTBL_PTE_MODIFIED	BIT(__PGTBL_PTE_MODIFIED)
1720
1721/* Page-Table Modification Mask */
1722typedef unsigned int pgtbl_mod_mask;
1723
1724#endif /* !__ASSEMBLY__ */
1725
1726#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT)
1727#ifdef CONFIG_PHYS_ADDR_T_64BIT
1728/*
1729 * ZSMALLOC needs to know the highest PFN on 32-bit architectures
1730 * with physical address space extension, but falls back to
1731 * BITS_PER_LONG otherwise.
1732 */
1733#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
1734#else
1735#define MAX_POSSIBLE_PHYSMEM_BITS 32
1736#endif
1737#endif
1738
1739#ifndef has_transparent_hugepage
1740#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
1741#endif
1742
1743#ifndef has_transparent_pud_hugepage
1744#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
1745#endif
/*
 * On some architectures, whether the p4d/pud or pmd layer of the page table
 * hierarchy is folded or not depends on the mm.
 */
1750#ifndef mm_p4d_folded
1751#define mm_p4d_folded(mm)	__is_defined(__PAGETABLE_P4D_FOLDED)
1752#endif
1753
1754#ifndef mm_pud_folded
1755#define mm_pud_folded(mm)	__is_defined(__PAGETABLE_PUD_FOLDED)
1756#endif
1757
1758#ifndef mm_pmd_folded
1759#define mm_pmd_folded(mm)	__is_defined(__PAGETABLE_PMD_FOLDED)
1760#endif
1761
1762#ifndef p4d_offset_lockless
1763#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
1764#endif
1765#ifndef pud_offset_lockless
1766#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
1767#endif
1768#ifndef pmd_offset_lockless
1769#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
1770#endif
1771
/*
 * p?d_leaf() - true if this entry is a final mapping to a physical address.
 * These differ from p?d_huge() in that they are always available (if the
 * architecture supports large pages at the appropriate level) even if
 * CONFIG_HUGETLB_PAGE is not defined.
 * Only meaningful when called on a valid entry.
 */
1779#ifndef pgd_leaf
1780#define pgd_leaf(x)	false
1781#endif
1782#ifndef p4d_leaf
1783#define p4d_leaf(x)	false
1784#endif
1785#ifndef pud_leaf
1786#define pud_leaf(x)	false
1787#endif
1788#ifndef pmd_leaf
1789#define pmd_leaf(x)	false
1790#endif
1791
1792#ifndef pgd_leaf_size
1793#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
1794#endif
1795#ifndef p4d_leaf_size
1796#define p4d_leaf_size(x) P4D_SIZE
1797#endif
1798#ifndef pud_leaf_size
1799#define pud_leaf_size(x) PUD_SIZE
1800#endif
1801#ifndef pmd_leaf_size
1802#define pmd_leaf_size(x) PMD_SIZE
1803#endif
1804#ifndef pte_leaf_size
1805#define pte_leaf_size(x) PAGE_SIZE
1806#endif
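
/*
 * Example (illustrative): a walker that only needs the size of the final
 * mapping can stop at the first leaf entry, e.g. at PMD level:
 *
 *	pmd_t pmd = pmdp_get(pmdp);
 *
 *	if (pmd_leaf(pmd))
 *		return pmd_leaf_size(pmd);
 */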
1807
1808/*
1809 * Some architectures have MMUs that are configurable or selectable at boot
1810 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
1811 * helps to have a static maximum value.
1812 */
1813
1814#ifndef MAX_PTRS_PER_PTE
1815#define MAX_PTRS_PER_PTE PTRS_PER_PTE
1816#endif
1817
1818#ifndef MAX_PTRS_PER_PMD
1819#define MAX_PTRS_PER_PMD PTRS_PER_PMD
1820#endif
1821
1822#ifndef MAX_PTRS_PER_PUD
1823#define MAX_PTRS_PER_PUD PTRS_PER_PUD
1824#endif
1825
1826#ifndef MAX_PTRS_PER_P4D
1827#define MAX_PTRS_PER_P4D PTRS_PER_P4D
1828#endif
1829
1830/* description of effects of mapping type and prot in current implementation.
1831 * this is due to the limited x86 page protection hardware.  The expected
1832 * behavior is in parens:
1833 *
1834 * map_type	prot
1835 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
1836 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
1837 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
1838 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
1839 *
1840 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
1841 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
1842 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
1843 *
1844 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
1845 * MAP_PRIVATE (with Enhanced PAN supported):
1846 *								r: (no) no
1847 *								w: (no) no
1848 *								x: (yes) yes
1849 */
1850#define DECLARE_VM_GET_PAGE_PROT					\
1851pgprot_t vm_get_page_prot(unsigned long vm_flags)			\
1852{									\
1853		return protection_map[vm_flags &			\
1854			(VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];	\
1855}									\
1856EXPORT_SYMBOL(vm_get_page_prot);
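
/*
 * Architectures that rely on a plain protection_map[] table can provide the
 * table and then instantiate the default vm_get_page_prot() with the macro
 * above; a sketch of such arch code:
 *
 *	static pgprot_t protection_map[16] __ro_after_init = {
 *		[VM_NONE] = PAGE_NONE,
 *		...
 *	};
 *	DECLARE_VM_GET_PAGE_PROT
 */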
1857
1858#endif /* _LINUX_PGTABLE_H */
1859