// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2023 ARM Ltd.
 */

#include <linux/mm.h>
#include <linux/efi.h>
#include <linux/export.h>
#include <asm/tlbflush.h>

static inline bool mm_is_user(struct mm_struct *mm)
{
	/*
	 * Don't attempt to apply the contig bit to kernel mappings, because
	 * dynamically adding/removing the contig bit can cause page faults.
	 * These racing faults are ok for user space, since they get serialized
	 * on the PTL. But kernel mappings can't tolerate faults.
	 */
	if (unlikely(mm_is_efi(mm)))
		return false;
	return mm != &init_mm;
}

static inline pte_t *contpte_align_down(pte_t *ptep)
{
	return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
}

static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
					pte_t *ptep, unsigned int nr)
{
	/*
	 * Unfold any partially covered contpte block at the beginning and end
	 * of the range.
	 */

	if (ptep != contpte_align_down(ptep) || nr < CONT_PTES)
		contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));

	if (ptep + nr != contpte_align_down(ptep + nr)) {
		unsigned long last_addr = addr + PAGE_SIZE * (nr - 1);
		pte_t *last_ptep = ptep + nr - 1;

		contpte_try_unfold(mm, last_addr, last_ptep,
				   __ptep_get(last_ptep));
	}
}

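/*
 * Repaint a whole contpte block with a new "template" pte: clear all
 * CONT_PTES entries, OR any access/dirty bits found into the template,
 * invalidate the TLB for the block, then write the template back across
 * the block. Callers pass a template with the contig bit either set
 * (folding) or cleared (unfolding).
 */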
static void contpte_convert(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep, pte_t pte)
{
	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
	unsigned long start_addr;
	pte_t *start_ptep;
	int i;

	start_ptep = ptep = contpte_align_down(ptep);
	start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
	pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte));

	for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) {
		pte_t ptent = __ptep_get_and_clear(mm, addr, ptep);

		if (pte_dirty(ptent))
			pte = pte_mkdirty(pte);

		if (pte_young(ptent))
			pte = pte_mkyoung(pte);
	}

	__flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);

	__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
}

void __contpte_try_fold(struct mm_struct *mm, unsigned long addr,
			pte_t *ptep, pte_t pte)
{
	/*
	 * We have already checked that the virtual and physical addresses are
	 * correctly aligned for a contpte mapping in contpte_try_fold() so the
	 * remaining checks are to ensure that the contpte range is fully
	 * covered by a single folio, and ensure that all the ptes are valid
	 * with contiguous PFNs and matching prots. We ignore the state of the
	 * access and dirty bits for the purpose of deciding if it's a
	 * contiguous range; the folding process will generate a single contpte
	 * entry which has a single access and dirty bit. Those 2 bits are the
	 * logical OR of their respective bits in the constituent pte entries.
	 * In order to ensure the contpte range is covered by a single folio,
	 * we must recover the folio from the pfn, but special mappings don't
	 * have a folio backing them. Fortunately contpte_try_fold() already
	 * checked that the pte is not special - we never try to fold special
	 * mappings. Note we can't use vm_normal_page() for this since we don't
	 * have the vma.
	 */

	unsigned long folio_start, folio_end;
	unsigned long cont_start, cont_end;
	pte_t expected_pte, subpte;
	struct folio *folio;
	struct page *page;
	unsigned long pfn;
	pte_t *orig_ptep;
	pgprot_t prot;

	int i;

	if (!mm_is_user(mm))
		return;

	page = pte_page(pte);
	folio = page_folio(page);
	folio_start = addr - (page - &folio->page) * PAGE_SIZE;
	folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE;
	cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE);
	cont_end = cont_start + CONT_PTE_SIZE;

	if (folio_start > cont_start || folio_end < cont_end)
		return;

	pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES);
	prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
	expected_pte = pfn_pte(pfn, prot);
	orig_ptep = ptep;
	ptep = contpte_align_down(ptep);

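	/*
	 * Fold only if every pte in the block maps the expected next pfn with
	 * an identical prot (access/dirty ignored); bail out on the first
	 * mismatch.
	 */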
	for (i = 0; i < CONT_PTES; i++) {
		subpte = pte_mkold(pte_mkclean(__ptep_get(ptep)));
		if (!pte_same(subpte, expected_pte))
			return;
		expected_pte = pte_advance_pfn(expected_pte, 1);
		ptep++;
	}

	pte = pte_mkcont(pte);
	contpte_convert(mm, addr, orig_ptep, pte);
}
EXPORT_SYMBOL_GPL(__contpte_try_fold);

void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
			  pte_t *ptep, pte_t pte)
{
	/*
	 * We have already checked that the ptes are contiguous in
	 * contpte_try_unfold(), so just check that the mm is user space.
	 */
	if (!mm_is_user(mm))
		return;

	pte = pte_mknoncont(pte);
	contpte_convert(mm, addr, ptep, pte);
}
EXPORT_SYMBOL_GPL(__contpte_try_unfold);

pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
{
	/*
	 * Gather access/dirty bits, which may be populated in any of the ptes
	 * of the contig range. We are guaranteed to be holding the PTL, so any
	 * contiguous range cannot be unfolded or otherwise modified under our
	 * feet.
	 */

	pte_t pte;
	int i;

	ptep = contpte_align_down(ptep);

	for (i = 0; i < CONT_PTES; i++, ptep++) {
		pte = __ptep_get(ptep);

		if (pte_dirty(pte))
			orig_pte = pte_mkdirty(orig_pte);

		if (pte_young(pte))
			orig_pte = pte_mkyoung(orig_pte);
	}

	return orig_pte;
}
EXPORT_SYMBOL_GPL(contpte_ptep_get);

pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
{
	/*
	 * The ptep_get_lockless() API requires us to read and return *orig_ptep
	 * so that it is self-consistent, without the PTL held, so we may be
	 * racing with other threads modifying the pte. Usually a READ_ONCE()
	 * would suffice, but for the contpte case, we also need to gather the
	 * access and dirty bits from across all ptes in the contiguous block,
	 * and we can't read all of those neighbouring ptes atomically, so any
	 * contiguous range may be unfolded/modified/refolded under our feet.
	 * Therefore we ensure we read a _consistent_ contpte range by checking
	 * that all ptes in the range are valid and have CONT_PTE set, that all
	 * pfns are contiguous and that all pgprots are the same (ignoring
	 * access/dirty). If we find a pte that is not consistent, then we must
	 * be racing with an update so start again. If the target pte does not
	 * have CONT_PTE set then that is considered consistent on its own
	 * because it is not part of a contpte range.
	 */

	pgprot_t orig_prot;
	unsigned long pfn;
	pte_t orig_pte;
	pgprot_t prot;
	pte_t *ptep;
	pte_t pte;
	int i;

retry:
	orig_pte = __ptep_get(orig_ptep);

	if (!pte_valid_cont(orig_pte))
		return orig_pte;

	orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte)));
	ptep = contpte_align_down(orig_ptep);
	pfn = pte_pfn(orig_pte) - (orig_ptep - ptep);

	for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
		pte = __ptep_get(ptep);
		prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));

		if (!pte_valid_cont(pte) ||
		    pte_pfn(pte) != pfn ||
		    pgprot_val(prot) != pgprot_val(orig_prot))
			goto retry;

		if (pte_dirty(pte))
			orig_pte = pte_mkdirty(orig_pte);

		if (pte_young(pte))
			orig_pte = pte_mkyoung(orig_pte);
	}

	return orig_pte;
}
EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless);

void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
			pte_t *ptep, pte_t pte, unsigned int nr)
{
	unsigned long next;
	unsigned long end;
	unsigned long pfn;
	pgprot_t prot;

	/*
	 * The set_ptes() spec guarantees that when nr > 1, the initial state of
	 * all ptes is not-present. Therefore we never need to unfold or
	 * otherwise invalidate a range before we set the new ptes.
	 * contpte_set_ptes() should never be called for nr < 2.
	 */
	VM_WARN_ON(nr == 1);

	if (!mm_is_user(mm))
		return __set_ptes(mm, addr, ptep, pte, nr);

	end = addr + (nr << PAGE_SHIFT);
	pfn = pte_pfn(pte);
	prot = pte_pgprot(pte);

	do {
		next = pte_cont_addr_end(addr, end);
		nr = (next - addr) >> PAGE_SHIFT;
		pte = pfn_pte(pfn, prot);

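		/*
		 * Use the contig bit only when this span covers a whole,
		 * naturally aligned contpte block: addr, the span end and the
		 * physical address must all be CONT_PTE_SIZE aligned.
		 */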
		if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0)
			pte = pte_mkcont(pte);
		else
			pte = pte_mknoncont(pte);

		__set_ptes(mm, addr, ptep, pte, nr);

		addr = next;
		ptep += nr;
		pfn += nr;

	} while (addr != end);
}
EXPORT_SYMBOL_GPL(contpte_set_ptes);

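/*
 * For the two batched clear helpers below, fully covered contpte blocks can
 * be cleared while still folded; only partially covered blocks at either end
 * of the range need to be unfolded first.
 */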
void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, unsigned int nr, int full)
{
	contpte_try_unfold_partial(mm, addr, ptep, nr);
	__clear_full_ptes(mm, addr, ptep, nr, full);
}
EXPORT_SYMBOL_GPL(contpte_clear_full_ptes);

pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
				unsigned long addr, pte_t *ptep,
				unsigned int nr, int full)
{
	contpte_try_unfold_partial(mm, addr, ptep, nr);
	return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
}
EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);

int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep)
{
	/*
	 * ptep_clear_flush_young() technically requires us to clear the access
	 * flag for a _single_ pte. However, the core-mm code actually tracks
	 * access/dirty per folio, not per page. And since we only create a
	 * contig range when the range is covered by a single folio, we can get
	 * away with clearing young for the whole contig range here, so we avoid
	 * having to unfold.
	 */

	int young = 0;
	int i;

	ptep = contpte_align_down(ptep);
	addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);

	for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
		young |= __ptep_test_and_clear_young(vma, addr, ptep);

	return young;
}
EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);

int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep)
{
	int young;

	young = contpte_ptep_test_and_clear_young(vma, addr, ptep);

	if (young) {
		/*
		 * See comment in __ptep_clear_flush_young(); same rationale for
		 * eliding the trailing DSB applies here.
		 */
		addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
		__flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
					 PAGE_SIZE, true, 3);
	}

	return young;
}
EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);

void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, unsigned int nr)
{
	/*
	 * If wrprotecting an entire contig range, we can avoid unfolding. Just
	 * set wrprotect and wait for the later mmu_gather flush to invalidate
	 * the tlb. Until the flush, the page may or may not be wrprotected.
	 * After the flush, it is guaranteed wrprotected. If it's a partial
	 * range though, we must unfold, because we can't have a case where
	 * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this
	 * would cause it to continue to be unpredictable after the flush.
	 */

	contpte_try_unfold_partial(mm, addr, ptep, nr);
	__wrprotect_ptes(mm, addr, ptep, nr);
}
EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes);

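/*
 * ptep_set_access_flags() for a pte that may be part of a contpte block:
 * access/dirty upgrades can be applied across the whole block without
 * unfolding, but a change to the write bit requires unfolding first.
 */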
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
					unsigned long addr, pte_t *ptep,
					pte_t entry, int dirty)
{
	unsigned long start_addr;
	pte_t orig_pte;
	int i;

	/*
	 * Gather the access/dirty bits for the contiguous range. If nothing has
	 * changed, it's a noop.
	 */
	orig_pte = pte_mknoncont(ptep_get(ptep));
	if (pte_val(orig_pte) == pte_val(entry))
		return 0;

	/*
	 * We can fix up access/dirty bits without having to unfold the contig
	 * range. But if the write bit is changing, we must unfold.
	 */
	if (pte_write(orig_pte) == pte_write(entry)) {
		/*
		 * For HW access management, we technically only need to update
		 * the flag on a single pte in the range. But for SW access
		 * management, we need to update all the ptes to prevent extra
		 * faults. Avoid per-page tlb flush in __ptep_set_access_flags()
		 * and instead flush the whole range at the end.
		 */
		ptep = contpte_align_down(ptep);
		start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);

		for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
			__ptep_set_access_flags(vma, addr, ptep, entry, 0);

		if (dirty)
			__flush_tlb_range(vma, start_addr, addr,
					  PAGE_SIZE, true, 3);
	} else {
		__contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
		__ptep_set_access_flags(vma, addr, ptep, entry, dirty);
	}

	return 1;
}
EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags);