1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * Copyright (c) 2014 Andrew Turner 13 * All rights reserved. 14 * Copyright (c) 2014-2016 The FreeBSD Foundation 15 * All rights reserved. 16 * 17 * This code is derived from software contributed to Berkeley by 18 * the Systems Programming Group of the University of Utah Computer 19 * Science Department and William Jolitz of UUNET Technologies Inc. 20 * 21 * This software was developed by Andrew Turner under sponsorship from 22 * the FreeBSD Foundation. 23 * 24 * Redistribution and use in source and binary forms, with or without 25 * modification, are permitted provided that the following conditions 26 * are met: 27 * 1. Redistributions of source code must retain the above copyright 28 * notice, this list of conditions and the following disclaimer. 29 * 2. Redistributions in binary form must reproduce the above copyright 30 * notice, this list of conditions and the following disclaimer in the 31 * documentation and/or other materials provided with the distribution. 32 * 3. All advertising materials mentioning features or use of this software 33 * must display the following acknowledgement: 34 * This product includes software developed by the University of 35 * California, Berkeley and its contributors. 36 * 4. Neither the name of the University nor the names of its contributors 37 * may be used to endorse or promote products derived from this software 38 * without specific prior written permission. 39 * 40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 50 * SUCH DAMAGE. 51 */ 52/*- 53 * Copyright (c) 2003 Networks Associates Technology, Inc. 54 * All rights reserved. 55 * 56 * This software was developed for the FreeBSD Project by Jake Burkholder, 57 * Safeport Network Services, and Network Associates Laboratories, the 58 * Security Research Division of Network Associates, Inc. under 59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 60 * CHATS research program. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 
70 * 71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 81 * SUCH DAMAGE. 82 */ 83 84#include <sys/cdefs.h> 85/* 86 * Manages physical address maps. 87 * 88 * Since the information managed by this module is 89 * also stored by the logical address mapping module, 90 * this module may throw away valid virtual-to-physical 91 * mappings at almost any time. However, invalidations 92 * of virtual-to-physical mappings must be done as 93 * requested. 94 * 95 * In order to cope with hardware architectures which 96 * make virtual-to-physical map invalidates expensive, 97 * this module may delay invalidate or reduced protection 98 * operations until such time as they are actually 99 * necessary. This module is given full information as 100 * to which processors are currently using which maps, 101 * and to when physical maps must be made correct. 102 */ 103 104#include "opt_vm.h" 105 106#include <sys/param.h> 107#include <sys/asan.h> 108#include <sys/bitstring.h> 109#include <sys/bus.h> 110#include <sys/systm.h> 111#include <sys/kernel.h> 112#include <sys/ktr.h> 113#include <sys/limits.h> 114#include <sys/lock.h> 115#include <sys/malloc.h> 116#include <sys/mman.h> 117#include <sys/msan.h> 118#include <sys/msgbuf.h> 119#include <sys/mutex.h> 120#include <sys/physmem.h> 121#include <sys/proc.h> 122#include <sys/rangeset.h> 123#include <sys/rwlock.h> 124#include <sys/sbuf.h> 125#include <sys/sx.h> 126#include <sys/vmem.h> 127#include <sys/vmmeter.h> 128#include <sys/sched.h> 129#include <sys/sysctl.h> 130#include <sys/_unrhdr.h> 131#include <sys/smp.h> 132 133#include <vm/vm.h> 134#include <vm/vm_param.h> 135#include <vm/vm_kern.h> 136#include <vm/vm_page.h> 137#include <vm/vm_map.h> 138#include <vm/vm_object.h> 139#include <vm/vm_extern.h> 140#include <vm/vm_pageout.h> 141#include <vm/vm_pager.h> 142#include <vm/vm_phys.h> 143#include <vm/vm_radix.h> 144#include <vm/vm_reserv.h> 145#include <vm/vm_dumpset.h> 146#include <vm/uma.h> 147 148#include <machine/asan.h> 149#include <machine/machdep.h> 150#include <machine/md_var.h> 151#include <machine/pcb.h> 152 153#ifdef NUMA 154#define PMAP_MEMDOM MAXMEMDOM 155#else 156#define PMAP_MEMDOM 1 157#endif 158 159#define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1) 160#define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2) 161 162#define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) 163#define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) 164#define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) 165#define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) 166 167#define NUL0E L0_ENTRIES 168#define NUL1E (NUL0E * NL1PG) 169#define NUL2E (NUL1E * NL2PG) 170 171#ifdef PV_STATS 172#define PV_STAT(x) do { x ; } while (0) 173#define __pvused 174#else 175#define PV_STAT(x) do { } while (0) 176#define __pvused __unused 177#endif 178 179#define 
pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT)) 180#define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) 181#define pmap_l2_pindex(v) ((v) >> L2_SHIFT) 182 183#ifdef __ARM_FEATURE_BTI_DEFAULT 184#define ATTR_KERN_GP ATTR_S1_GP 185#else 186#define ATTR_KERN_GP 0 187#endif 188#define PMAP_SAN_PTE_BITS (ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \ 189 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW)) 190 191struct pmap_large_md_page { 192 struct rwlock pv_lock; 193 struct md_page pv_page; 194 /* Pad to a power of 2, see pmap_init_pv_table(). */ 195 int pv_pad[2]; 196}; 197 198__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; 199#define pv_dummy pv_dummy_large.pv_page 200__read_mostly static struct pmap_large_md_page *pv_table; 201 202static struct pmap_large_md_page * 203_pa_to_pmdp(vm_paddr_t pa) 204{ 205 struct vm_phys_seg *seg; 206 207 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL) 208 return ((struct pmap_large_md_page *)seg->md_first + 209 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start)); 210 return (NULL); 211} 212 213static struct pmap_large_md_page * 214pa_to_pmdp(vm_paddr_t pa) 215{ 216 struct pmap_large_md_page *pvd; 217 218 pvd = _pa_to_pmdp(pa); 219 if (pvd == NULL) 220 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa); 221 return (pvd); 222} 223 224static struct pmap_large_md_page * 225page_to_pmdp(vm_page_t m) 226{ 227 struct vm_phys_seg *seg; 228 229 seg = &vm_phys_segs[m->segind]; 230 return ((struct pmap_large_md_page *)seg->md_first + 231 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start)); 232} 233 234#define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) 235#define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page)) 236 237#define PHYS_TO_PV_LIST_LOCK(pa) ({ \ 238 struct pmap_large_md_page *_pvd; \ 239 struct rwlock *_lock; \ 240 _pvd = _pa_to_pmdp(pa); \ 241 if (__predict_false(_pvd == NULL)) \ 242 _lock = &pv_dummy_large.pv_lock; \ 243 else \ 244 _lock = &(_pvd->pv_lock); \ 245 _lock; \ 246}) 247 248static struct rwlock * 249VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m) 250{ 251 if ((m->flags & PG_FICTITIOUS) == 0) 252 return (&page_to_pmdp(m)->pv_lock); 253 else 254 return (&pv_dummy_large.pv_lock); 255} 256 257#define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \ 258 struct rwlock **_lockp = (lockp); \ 259 struct rwlock *_new_lock = (new_lock); \ 260 \ 261 if (_new_lock != *_lockp) { \ 262 if (*_lockp != NULL) \ 263 rw_wunlock(*_lockp); \ 264 *_lockp = _new_lock; \ 265 rw_wlock(*_lockp); \ 266 } \ 267} while (0) 268 269#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \ 270 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa)) 271 272#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 273 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m)) 274 275#define RELEASE_PV_LIST_LOCK(lockp) do { \ 276 struct rwlock **_lockp = (lockp); \ 277 \ 278 if (*_lockp != NULL) { \ 279 rw_wunlock(*_lockp); \ 280 *_lockp = NULL; \ 281 } \ 282} while (0) 283 284#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte)) 285#define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m)) 286 287/* 288 * The presence of this flag indicates that the mapping is writeable. 289 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise 290 * it is dirty. This flag may only be set on managed mappings. 291 * 292 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it 293 * as a software managed bit. 
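 *
 * For example, combining the two bits, pmap_pte_dirty() below treats a
 * stage 1 mapping as dirty only when it is writeable in hardware and has
 * the software bit set:
 *
 *	(pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
 *	    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)
 *
 * whereas ATTR_S1_AP(ATTR_S1_AP_RO) together with ATTR_SW_DBM denotes a
 * writeable mapping that is currently clean.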
294 */ 295#define ATTR_SW_DBM ATTR_DBM 296 297struct pmap kernel_pmap_store; 298 299/* Used for mapping ACPI memory before VM is initialized */ 300#define PMAP_PREINIT_MAPPING_COUNT 32 301#define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) 302static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ 303static int vm_initialized = 0; /* No need to use pre-init maps when set */ 304 305/* 306 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. 307 * Always map entire L2 block for simplicity. 308 * VA of L2 block = preinit_map_va + i * L2_SIZE 309 */ 310static struct pmap_preinit_mapping { 311 vm_paddr_t pa; 312 vm_offset_t va; 313 vm_size_t size; 314} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 315 316vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 317vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 318vm_offset_t kernel_vm_end = 0; 319 320/* 321 * Data for the pv entry allocation mechanism. 322 */ 323#ifdef NUMA 324static __inline int 325pc_to_domain(struct pv_chunk *pc) 326{ 327 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); 328} 329#else 330static __inline int 331pc_to_domain(struct pv_chunk *pc __unused) 332{ 333 return (0); 334} 335#endif 336 337struct pv_chunks_list { 338 struct mtx pvc_lock; 339 TAILQ_HEAD(pch, pv_chunk) pvc_list; 340 int active_reclaims; 341} __aligned(CACHE_LINE_SIZE); 342 343struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; 344 345vm_paddr_t dmap_phys_base; /* The start of the dmap region */ 346vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ 347vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ 348 349extern pt_entry_t pagetable_l0_ttbr1[]; 350 351#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) 352static vm_paddr_t physmap[PHYSMAP_SIZE]; 353static u_int physmap_idx; 354 355static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 356 "VM/pmap parameters"); 357 358#if PAGE_SIZE == PAGE_SIZE_4K 359#define L1_BLOCKS_SUPPORTED 1 360#else 361/* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */ 362#define L1_BLOCKS_SUPPORTED 0 363#endif 364 365#define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED) 366 367/* 368 * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs 369 * that it has currently allocated to a pmap, a cursor ("asid_next") to 370 * optimize its search for a free ASID in the bit vector, and an epoch number 371 * ("asid_epoch") to indicate when it has reclaimed all previously allocated 372 * ASIDs that are not currently active on a processor. 373 * 374 * The current epoch number is always in the range [0, INT_MAX). Negative 375 * numbers and INT_MAX are reserved for special cases that are described 376 * below. 
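 *
 * As a concrete illustration of the cookie encoding defined below, the
 * ASID occupies the low 32 bits of the cookie and the epoch the high 32
 * bits, so COOKIE_TO_ASID(COOKIE_FROM(5, 7)) == 5 and
 * COOKIE_TO_EPOCH(COOKIE_FROM(5, 7)) == 7.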
 */
struct asid_set {
	int asid_bits;
	bitstr_t *asid_set;
	int asid_set_size;
	int asid_next;
	int asid_epoch;
	struct mtx asid_set_mutex;
};

static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_invalidate_vpipt_icache)(void);
void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
void (*pmap_stage2_invalidate_all)(uint64_t);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) | \
	    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))

#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
#define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)

static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * True when Branch Target Identification should be used by userspace.  This
 * allows pmap to mark pages as guarded with ATTR_S1_GP.
 */
__read_mostly static bool pmap_bti_support = false;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings.
*/ 450 451TAILQ_HEAD(pv_chunklist, pv_chunk); 452 453static void free_pv_chunk(struct pv_chunk *pc); 454static void free_pv_chunk_batch(struct pv_chunklist *batch); 455static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 456static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 457static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 458static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 459static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 460 vm_offset_t va); 461 462static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); 463static bool pmap_activate_int(pmap_t pmap); 464static void pmap_alloc_asid(pmap_t pmap); 465static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, 466 vm_prot_t prot, int mode, bool skip_unmapped); 467static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 468 pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp); 469static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); 470static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, 471 vm_offset_t va, struct rwlock **lockp); 472static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 473static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va); 474static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 475 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 476static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, 477 u_int flags, vm_page_t m, struct rwlock **lockp); 478static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, 479 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp); 480static bool pmap_every_pte_zero(vm_paddr_t pa); 481static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 482 bool all_l3e_AF_set); 483static pt_entry_t pmap_load_l3c(pt_entry_t *l3p); 484static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 485 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits); 486static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, 487 struct rwlock **lockp); 488static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); 489static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 490 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); 491static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, 492 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); 493static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 494 vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free, 495 struct rwlock **lockp); 496static void pmap_reset_asid_set(pmap_t pmap); 497static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 498 vm_page_t m, struct rwlock **lockp); 499 500static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, 501 struct rwlock **lockp); 502 503static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, 504 struct spglist *free); 505static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 506static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, 507 vm_offset_t va, vm_size_t size); 508static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 509 510static uma_zone_t pmap_bti_ranges_zone; 511static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 512static 
pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va); 513static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); 514static void *bti_dup_range(void *ctx, void *data); 515static void bti_free_range(void *ctx, void *node); 516static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap); 517static void pmap_bti_deassign_all(pmap_t pmap); 518 519/* 520 * These load the old table data and store the new value. 521 * They need to be atomic as the System MMU may write to the table at 522 * the same time as the CPU. 523 */ 524#define pmap_clear(table) atomic_store_64(table, 0) 525#define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) 526#define pmap_load(table) (*table) 527#define pmap_load_clear(table) atomic_swap_64(table, 0) 528#define pmap_load_store(table, entry) atomic_swap_64(table, entry) 529#define pmap_set_bits(table, bits) atomic_set_64(table, bits) 530#define pmap_store(table, entry) atomic_store_64(table, entry) 531 532/********************/ 533/* Inline functions */ 534/********************/ 535 536static __inline void 537pagecopy(void *s, void *d) 538{ 539 540 memcpy(d, s, PAGE_SIZE); 541} 542 543static __inline pd_entry_t * 544pmap_l0(pmap_t pmap, vm_offset_t va) 545{ 546 547 return (&pmap->pm_l0[pmap_l0_index(va)]); 548} 549 550static __inline pd_entry_t * 551pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) 552{ 553 pd_entry_t *l1; 554 555 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 556 return (&l1[pmap_l1_index(va)]); 557} 558 559static __inline pd_entry_t * 560pmap_l1(pmap_t pmap, vm_offset_t va) 561{ 562 pd_entry_t *l0; 563 564 l0 = pmap_l0(pmap, va); 565 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) 566 return (NULL); 567 568 return (pmap_l0_to_l1(l0, va)); 569} 570 571static __inline pd_entry_t * 572pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va) 573{ 574 pd_entry_t l1, *l2p; 575 576 l1 = pmap_load(l1p); 577 578 KASSERT(ADDR_IS_CANONICAL(va), 579 ("%s: Address not in canonical form: %lx", __func__, va)); 580 /* 581 * The valid bit may be clear if pmap_update_entry() is concurrently 582 * modifying the entry, so for KVA only the entry type may be checked. 583 */ 584 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0, 585 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va)); 586 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 587 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va)); 588 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1)); 589 return (&l2p[pmap_l2_index(va)]); 590} 591 592static __inline pd_entry_t * 593pmap_l2(pmap_t pmap, vm_offset_t va) 594{ 595 pd_entry_t *l1; 596 597 l1 = pmap_l1(pmap, va); 598 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) 599 return (NULL); 600 601 return (pmap_l1_to_l2(l1, va)); 602} 603 604static __inline pt_entry_t * 605pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va) 606{ 607 pd_entry_t l2; 608 pt_entry_t *l3p; 609 610 l2 = pmap_load(l2p); 611 612 KASSERT(ADDR_IS_CANONICAL(va), 613 ("%s: Address not in canonical form: %lx", __func__, va)); 614 /* 615 * The valid bit may be clear if pmap_update_entry() is concurrently 616 * modifying the entry, so for KVA only the entry type may be checked. 
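 * (pmap_klookup() relies on the same property: during such a sequence a
 * non-zero entry is still classified by ATTR_DESCR_TYPE_MASK alone.)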
617 */ 618 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0, 619 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va)); 620 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE, 621 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va)); 622 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2)); 623 return (&l3p[pmap_l3_index(va)]); 624} 625 626/* 627 * Returns the lowest valid pde for a given virtual address. 628 * The next level may or may not point to a valid page or block. 629 */ 630static __inline pd_entry_t * 631pmap_pde(pmap_t pmap, vm_offset_t va, int *level) 632{ 633 pd_entry_t *l0, *l1, *l2, desc; 634 635 l0 = pmap_l0(pmap, va); 636 desc = pmap_load(l0) & ATTR_DESCR_MASK; 637 if (desc != L0_TABLE) { 638 *level = -1; 639 return (NULL); 640 } 641 642 l1 = pmap_l0_to_l1(l0, va); 643 desc = pmap_load(l1) & ATTR_DESCR_MASK; 644 if (desc != L1_TABLE) { 645 *level = 0; 646 return (l0); 647 } 648 649 l2 = pmap_l1_to_l2(l1, va); 650 desc = pmap_load(l2) & ATTR_DESCR_MASK; 651 if (desc != L2_TABLE) { 652 *level = 1; 653 return (l1); 654 } 655 656 *level = 2; 657 return (l2); 658} 659 660/* 661 * Returns the lowest valid pte block or table entry for a given virtual 662 * address. If there are no valid entries return NULL and set the level to 663 * the first invalid level. 664 */ 665static __inline pt_entry_t * 666pmap_pte(pmap_t pmap, vm_offset_t va, int *level) 667{ 668 pd_entry_t *l1, *l2, desc; 669 pt_entry_t *l3; 670 671 l1 = pmap_l1(pmap, va); 672 if (l1 == NULL) { 673 *level = 0; 674 return (NULL); 675 } 676 desc = pmap_load(l1) & ATTR_DESCR_MASK; 677 if (desc == L1_BLOCK) { 678 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 679 *level = 1; 680 return (l1); 681 } 682 683 if (desc != L1_TABLE) { 684 *level = 1; 685 return (NULL); 686 } 687 688 l2 = pmap_l1_to_l2(l1, va); 689 desc = pmap_load(l2) & ATTR_DESCR_MASK; 690 if (desc == L2_BLOCK) { 691 *level = 2; 692 return (l2); 693 } 694 695 if (desc != L2_TABLE) { 696 *level = 2; 697 return (NULL); 698 } 699 700 *level = 3; 701 l3 = pmap_l2_to_l3(l2, va); 702 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) 703 return (NULL); 704 705 return (l3); 706} 707 708/* 709 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified 710 * level that maps the specified virtual address, then a pointer to that entry 711 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled 712 * and a diagnostic message is provided, in which case this function panics. 
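 *
 * In sketch form, a caller that requires a 4KB mapping to exist passes a
 * diagnostic string and may use the result directly, while a caller that
 * is merely probing passes NULL and checks the return value:
 *
 *	l3p = pmap_pte_exists(pmap, va, 3, __func__);	// must exist
 *
 *	l3p = pmap_pte_exists(pmap, va, 3, NULL);	// may be absent
 *	if (l3p == NULL)
 *		return;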
713 */ 714static __always_inline pt_entry_t * 715pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag) 716{ 717 pd_entry_t *l0p, *l1p, *l2p; 718 pt_entry_t desc, *l3p; 719 int walk_level __diagused; 720 721 KASSERT(level >= 0 && level < 4, 722 ("%s: %s passed an out-of-range level (%d)", __func__, diag, 723 level)); 724 l0p = pmap_l0(pmap, va); 725 desc = pmap_load(l0p) & ATTR_DESCR_MASK; 726 if (desc == L0_TABLE && level > 0) { 727 l1p = pmap_l0_to_l1(l0p, va); 728 desc = pmap_load(l1p) & ATTR_DESCR_MASK; 729 if (desc == L1_BLOCK && level == 1) { 730 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 731 return (l1p); 732 } 733 if (desc == L1_TABLE && level > 1) { 734 l2p = pmap_l1_to_l2(l1p, va); 735 desc = pmap_load(l2p) & ATTR_DESCR_MASK; 736 if (desc == L2_BLOCK && level == 2) 737 return (l2p); 738 else if (desc == L2_TABLE && level > 2) { 739 l3p = pmap_l2_to_l3(l2p, va); 740 desc = pmap_load(l3p) & ATTR_DESCR_MASK; 741 if (desc == L3_PAGE && level == 3) 742 return (l3p); 743 else 744 walk_level = 3; 745 } else 746 walk_level = 2; 747 } else 748 walk_level = 1; 749 } else 750 walk_level = 0; 751 KASSERT(diag == NULL, 752 ("%s: va %#lx not mapped at level %d, desc %ld at level %d", 753 diag, va, level, desc, walk_level)); 754 return (NULL); 755} 756 757bool 758pmap_ps_enabled(pmap_t pmap) 759{ 760 /* 761 * Promotion requires a hypervisor call when the kernel is running 762 * in EL1. To stop this disable superpage support on non-stage 1 763 * pmaps for now. 764 */ 765 if (pmap->pm_stage != PM_STAGE1) 766 return (false); 767 768#ifdef KMSAN 769 /* 770 * The break-before-make in pmap_update_entry() results in a situation 771 * where a CPU may call into the KMSAN runtime while the entry is 772 * invalid. If the entry is used to map the current thread structure, 773 * then the runtime will attempt to access unmapped memory. Avoid this 774 * by simply disabling superpage promotion for the kernel map. 
775 */ 776 if (pmap == kernel_pmap) 777 return (false); 778#endif 779 780 return (superpages_enabled != 0); 781} 782 783bool 784pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, 785 pd_entry_t **l2, pt_entry_t **l3) 786{ 787 pd_entry_t *l0p, *l1p, *l2p; 788 789 if (pmap->pm_l0 == NULL) 790 return (false); 791 792 l0p = pmap_l0(pmap, va); 793 *l0 = l0p; 794 795 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) 796 return (false); 797 798 l1p = pmap_l0_to_l1(l0p, va); 799 *l1 = l1p; 800 801 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { 802 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 803 *l2 = NULL; 804 *l3 = NULL; 805 return (true); 806 } 807 808 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) 809 return (false); 810 811 l2p = pmap_l1_to_l2(l1p, va); 812 *l2 = l2p; 813 814 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { 815 *l3 = NULL; 816 return (true); 817 } 818 819 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) 820 return (false); 821 822 *l3 = pmap_l2_to_l3(l2p, va); 823 824 return (true); 825} 826 827static __inline int 828pmap_l3_valid(pt_entry_t l3) 829{ 830 831 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); 832} 833 834CTASSERT(L1_BLOCK == L2_BLOCK); 835 836static pt_entry_t 837pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr) 838{ 839 pt_entry_t val; 840 841 if (pmap->pm_stage == PM_STAGE1) { 842 val = ATTR_S1_IDX(memattr); 843 if (memattr == VM_MEMATTR_DEVICE) 844 val |= ATTR_S1_XN; 845 return (val); 846 } 847 848 val = 0; 849 850 switch (memattr) { 851 case VM_MEMATTR_DEVICE: 852 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) | 853 ATTR_S2_XN(ATTR_S2_XN_ALL)); 854 case VM_MEMATTR_UNCACHEABLE: 855 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC)); 856 case VM_MEMATTR_WRITE_BACK: 857 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB)); 858 case VM_MEMATTR_WRITE_THROUGH: 859 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT)); 860 default: 861 panic("%s: invalid memory attribute %x", __func__, memattr); 862 } 863} 864 865static pt_entry_t 866pmap_pte_prot(pmap_t pmap, vm_prot_t prot) 867{ 868 pt_entry_t val; 869 870 val = 0; 871 if (pmap->pm_stage == PM_STAGE1) { 872 if ((prot & VM_PROT_EXECUTE) == 0) 873 val |= ATTR_S1_XN; 874 if ((prot & VM_PROT_WRITE) == 0) 875 val |= ATTR_S1_AP(ATTR_S1_AP_RO); 876 } else { 877 if ((prot & VM_PROT_WRITE) != 0) 878 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 879 if ((prot & VM_PROT_READ) != 0) 880 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ); 881 if ((prot & VM_PROT_EXECUTE) == 0) 882 val |= ATTR_S2_XN(ATTR_S2_XN_ALL); 883 } 884 885 return (val); 886} 887 888/* 889 * Checks if the PTE is dirty. 
890 */ 891static inline int 892pmap_pte_dirty(pmap_t pmap, pt_entry_t pte) 893{ 894 895 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); 896 897 if (pmap->pm_stage == PM_STAGE1) { 898 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0, 899 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); 900 901 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 902 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)); 903 } 904 905 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 906 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)); 907} 908 909static __inline void 910pmap_resident_count_inc(pmap_t pmap, int count) 911{ 912 913 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 914 pmap->pm_stats.resident_count += count; 915} 916 917static __inline void 918pmap_resident_count_dec(pmap_t pmap, int count) 919{ 920 921 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 922 KASSERT(pmap->pm_stats.resident_count >= count, 923 ("pmap %p resident count underflow %ld %d", pmap, 924 pmap->pm_stats.resident_count, count)); 925 pmap->pm_stats.resident_count -= count; 926} 927 928static vm_paddr_t 929pmap_early_vtophys(vm_offset_t va) 930{ 931 vm_paddr_t pa_page; 932 933 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK; 934 return (pa_page | (va & PAR_LOW_MASK)); 935} 936 937/* State of the bootstrapped DMAP page tables */ 938struct pmap_bootstrap_state { 939 pt_entry_t *l1; 940 pt_entry_t *l2; 941 pt_entry_t *l3; 942 vm_offset_t freemempos; 943 vm_offset_t va; 944 vm_paddr_t pa; 945 pt_entry_t table_attrs; 946 u_int l0_slot; 947 u_int l1_slot; 948 u_int l2_slot; 949 bool dmap_valid; 950}; 951 952/* The bootstrap state */ 953static struct pmap_bootstrap_state bs_state = { 954 .l1 = NULL, 955 .l2 = NULL, 956 .l3 = NULL, 957 .table_attrs = TATTR_PXN_TABLE, 958 .l0_slot = L0_ENTRIES, 959 .l1_slot = Ln_ENTRIES, 960 .l2_slot = Ln_ENTRIES, 961 .dmap_valid = false, 962}; 963 964static void 965pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state) 966{ 967 vm_paddr_t l1_pa; 968 pd_entry_t l0e; 969 u_int l0_slot; 970 971 /* Link the level 0 table to a level 1 table */ 972 l0_slot = pmap_l0_index(state->va); 973 if (l0_slot != state->l0_slot) { 974 /* 975 * Make sure we move from a low address to high address 976 * before the DMAP region is ready. This ensures we never 977 * modify an existing mapping until we can map from a 978 * physical address to a virtual address. 
979 */ 980 MPASS(state->l0_slot < l0_slot || 981 state->l0_slot == L0_ENTRIES || 982 state->dmap_valid); 983 984 /* Reset lower levels */ 985 state->l2 = NULL; 986 state->l3 = NULL; 987 state->l1_slot = Ln_ENTRIES; 988 state->l2_slot = Ln_ENTRIES; 989 990 /* Check the existing L0 entry */ 991 state->l0_slot = l0_slot; 992 if (state->dmap_valid) { 993 l0e = pagetable_l0_ttbr1[l0_slot]; 994 if ((l0e & ATTR_DESCR_VALID) != 0) { 995 MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE); 996 l1_pa = PTE_TO_PHYS(l0e); 997 state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa); 998 return; 999 } 1000 } 1001 1002 /* Create a new L0 table entry */ 1003 state->l1 = (pt_entry_t *)state->freemempos; 1004 memset(state->l1, 0, PAGE_SIZE); 1005 state->freemempos += PAGE_SIZE; 1006 1007 l1_pa = pmap_early_vtophys((vm_offset_t)state->l1); 1008 MPASS((l1_pa & Ln_TABLE_MASK) == 0); 1009 MPASS(pagetable_l0_ttbr1[l0_slot] == 0); 1010 pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) | 1011 TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE); 1012 } 1013 KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__)); 1014} 1015 1016static void 1017pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state) 1018{ 1019 vm_paddr_t l2_pa; 1020 pd_entry_t l1e; 1021 u_int l1_slot; 1022 1023 /* Make sure there is a valid L0 -> L1 table */ 1024 pmap_bootstrap_l0_table(state); 1025 1026 /* Link the level 1 table to a level 2 table */ 1027 l1_slot = pmap_l1_index(state->va); 1028 if (l1_slot != state->l1_slot) { 1029 /* See pmap_bootstrap_l0_table for a description */ 1030 MPASS(state->l1_slot < l1_slot || 1031 state->l1_slot == Ln_ENTRIES || 1032 state->dmap_valid); 1033 1034 /* Reset lower levels */ 1035 state->l3 = NULL; 1036 state->l2_slot = Ln_ENTRIES; 1037 1038 /* Check the existing L1 entry */ 1039 state->l1_slot = l1_slot; 1040 if (state->dmap_valid) { 1041 l1e = state->l1[l1_slot]; 1042 if ((l1e & ATTR_DESCR_VALID) != 0) { 1043 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE); 1044 l2_pa = PTE_TO_PHYS(l1e); 1045 state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa); 1046 return; 1047 } 1048 } 1049 1050 /* Create a new L1 table entry */ 1051 state->l2 = (pt_entry_t *)state->freemempos; 1052 memset(state->l2, 0, PAGE_SIZE); 1053 state->freemempos += PAGE_SIZE; 1054 1055 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2); 1056 MPASS((l2_pa & Ln_TABLE_MASK) == 0); 1057 MPASS(state->l1[l1_slot] == 0); 1058 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) | 1059 state->table_attrs | L1_TABLE); 1060 } 1061 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__)); 1062} 1063 1064static void 1065pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state) 1066{ 1067 vm_paddr_t l3_pa; 1068 pd_entry_t l2e; 1069 u_int l2_slot; 1070 1071 /* Make sure there is a valid L1 -> L2 table */ 1072 pmap_bootstrap_l1_table(state); 1073 1074 /* Link the level 2 table to a level 3 table */ 1075 l2_slot = pmap_l2_index(state->va); 1076 if (l2_slot != state->l2_slot) { 1077 /* See pmap_bootstrap_l0_table for a description */ 1078 MPASS(state->l2_slot < l2_slot || 1079 state->l2_slot == Ln_ENTRIES || 1080 state->dmap_valid); 1081 1082 /* Check the existing L2 entry */ 1083 state->l2_slot = l2_slot; 1084 if (state->dmap_valid) { 1085 l2e = state->l2[l2_slot]; 1086 if ((l2e & ATTR_DESCR_VALID) != 0) { 1087 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE); 1088 l3_pa = PTE_TO_PHYS(l2e); 1089 state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa); 1090 return; 1091 } 1092 } 1093 1094 /* Create a new L2 table entry */ 1095 state->l3 = (pt_entry_t *)state->freemempos; 1096 memset(state->l3, 0, 
PAGE_SIZE); 1097 state->freemempos += PAGE_SIZE; 1098 1099 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3); 1100 MPASS((l3_pa & Ln_TABLE_MASK) == 0); 1101 MPASS(state->l2[l2_slot] == 0); 1102 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) | 1103 state->table_attrs | L2_TABLE); 1104 } 1105 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__)); 1106} 1107 1108static void 1109pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i) 1110{ 1111 u_int l2_slot; 1112 bool first; 1113 1114 if ((physmap[i + 1] - state->pa) < L2_SIZE) 1115 return; 1116 1117 /* Make sure there is a valid L1 table */ 1118 pmap_bootstrap_l1_table(state); 1119 1120 MPASS((state->va & L2_OFFSET) == 0); 1121 for (first = true; 1122 state->va < DMAP_MAX_ADDRESS && 1123 (physmap[i + 1] - state->pa) >= L2_SIZE; 1124 state->va += L2_SIZE, state->pa += L2_SIZE) { 1125 /* 1126 * Stop if we are about to walk off the end of what the 1127 * current L1 slot can address. 1128 */ 1129 if (!first && (state->pa & L1_OFFSET) == 0) 1130 break; 1131 1132 first = false; 1133 l2_slot = pmap_l2_index(state->va); 1134 MPASS((state->pa & L2_OFFSET) == 0); 1135 MPASS(state->l2[l2_slot] == 0); 1136 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) | 1137 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | 1138 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK); 1139 } 1140 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS)); 1141} 1142 1143static void 1144pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i) 1145{ 1146 pt_entry_t contig; 1147 u_int l3_slot; 1148 bool first; 1149 1150 if (physmap[i + 1] - state->pa < L3_SIZE) 1151 return; 1152 1153 /* Make sure there is a valid L2 table */ 1154 pmap_bootstrap_l2_table(state); 1155 1156 MPASS((state->va & L3_OFFSET) == 0); 1157 for (first = true, contig = 0; 1158 state->va < DMAP_MAX_ADDRESS && 1159 physmap[i + 1] - state->pa >= L3_SIZE; 1160 state->va += L3_SIZE, state->pa += L3_SIZE) { 1161 /* 1162 * Stop if we are about to walk off the end of what the 1163 * current L2 slot can address. 1164 */ 1165 if (!first && (state->pa & L2_OFFSET) == 0) 1166 break; 1167 1168 /* 1169 * If we have an aligned, contiguous chunk of L3C_ENTRIES 1170 * L3 pages, set the contiguous bit within each PTE so that 1171 * the chunk can be cached using only one TLB entry. 
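 * (With 4KB base pages an L3C chunk is 16 L3 entries, i.e. L3C_SIZE is
 * 64KB, and both the physical and the virtual address must be 64KB
 * aligned; the (state->pa & L3C_OFFSET) == 0 check below enforces this,
 * since va and pa advance in lockstep.)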
1172 */ 1173 if ((state->pa & L3C_OFFSET) == 0) { 1174 if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS && 1175 physmap[i + 1] - state->pa >= L3C_SIZE) { 1176 contig = ATTR_CONTIGUOUS; 1177 } else { 1178 contig = 0; 1179 } 1180 } 1181 1182 first = false; 1183 l3_slot = pmap_l3_index(state->va); 1184 MPASS((state->pa & L3_OFFSET) == 0); 1185 MPASS(state->l3[l3_slot] == 0); 1186 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) | 1187 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | 1188 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE); 1189 } 1190 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS)); 1191} 1192 1193static void 1194pmap_bootstrap_dmap(vm_paddr_t min_pa) 1195{ 1196 int i; 1197 1198 dmap_phys_base = min_pa & ~L1_OFFSET; 1199 dmap_phys_max = 0; 1200 dmap_max_addr = 0; 1201 1202 for (i = 0; i < (physmap_idx * 2); i += 2) { 1203 bs_state.pa = physmap[i] & ~L3_OFFSET; 1204 bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS; 1205 1206 /* Create L3 mappings at the start of the region */ 1207 if ((bs_state.pa & L2_OFFSET) != 0) 1208 pmap_bootstrap_l3_page(&bs_state, i); 1209 MPASS(bs_state.pa <= physmap[i + 1]); 1210 1211 if (L1_BLOCKS_SUPPORTED) { 1212 /* Create L2 mappings at the start of the region */ 1213 if ((bs_state.pa & L1_OFFSET) != 0) 1214 pmap_bootstrap_l2_block(&bs_state, i); 1215 MPASS(bs_state.pa <= physmap[i + 1]); 1216 1217 /* Create the main L1 block mappings */ 1218 for (; bs_state.va < DMAP_MAX_ADDRESS && 1219 (physmap[i + 1] - bs_state.pa) >= L1_SIZE; 1220 bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) { 1221 /* Make sure there is a valid L1 table */ 1222 pmap_bootstrap_l0_table(&bs_state); 1223 MPASS((bs_state.pa & L1_OFFSET) == 0); 1224 pmap_store( 1225 &bs_state.l1[pmap_l1_index(bs_state.va)], 1226 PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT | 1227 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 1228 ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK); 1229 } 1230 MPASS(bs_state.pa <= physmap[i + 1]); 1231 1232 /* Create L2 mappings at the end of the region */ 1233 pmap_bootstrap_l2_block(&bs_state, i); 1234 } else { 1235 while (bs_state.va < DMAP_MAX_ADDRESS && 1236 (physmap[i + 1] - bs_state.pa) >= L2_SIZE) { 1237 pmap_bootstrap_l2_block(&bs_state, i); 1238 } 1239 } 1240 MPASS(bs_state.pa <= physmap[i + 1]); 1241 1242 /* Create L3 mappings at the end of the region */ 1243 pmap_bootstrap_l3_page(&bs_state, i); 1244 MPASS(bs_state.pa == physmap[i + 1]); 1245 1246 if (bs_state.pa > dmap_phys_max) { 1247 dmap_phys_max = bs_state.pa; 1248 dmap_max_addr = bs_state.va; 1249 } 1250 } 1251 1252 cpu_tlb_flushID(); 1253} 1254 1255static void 1256pmap_bootstrap_l2(vm_offset_t va) 1257{ 1258 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); 1259 1260 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/ 1261 bs_state.va = va; 1262 1263 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE) 1264 pmap_bootstrap_l1_table(&bs_state); 1265} 1266 1267static void 1268pmap_bootstrap_l3(vm_offset_t va) 1269{ 1270 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); 1271 1272 /* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/ 1273 bs_state.va = va; 1274 1275 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE) 1276 pmap_bootstrap_l2_table(&bs_state); 1277} 1278 1279/* 1280 * Bootstrap the system enough to run with virtual memory. 
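 *
 * In outline, pmap_bootstrap() below:
 * - points kernel_pmap at the existing TTBR1 L0 table (pagetable_l0_ttbr1),
 * - builds the direct map so physical memory can be addressed by VA,
 * - preallocates the L2 tables covering KVA up to VM_MAX_KERNEL_ADDRESS
 *   and the L3 tables for the early devmap,
 * - carves out the per-CPU area, the message buffer and the pre-init
 *   (ACPI) mapping window, and
 * - excludes the memory it consumed from the physical memory that remains
 *   available to the page allocator.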
1281 */ 1282void 1283pmap_bootstrap(vm_size_t kernlen) 1284{ 1285 vm_offset_t dpcpu, msgbufpv; 1286 vm_paddr_t start_pa, pa, min_pa; 1287 int i; 1288 1289 /* Verify that the ASID is set through TTBR0. */ 1290 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0, 1291 ("pmap_bootstrap: TCR_EL1.A1 != 0")); 1292 1293 /* Set this early so we can use the pagetable walking functions */ 1294 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1; 1295 PMAP_LOCK_INIT(kernel_pmap); 1296 kernel_pmap->pm_l0_paddr = 1297 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0); 1298 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 1299 vm_radix_init(&kernel_pmap->pm_root); 1300 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN); 1301 kernel_pmap->pm_stage = PM_STAGE1; 1302 kernel_pmap->pm_levels = 4; 1303 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr; 1304 kernel_pmap->pm_asid_set = &asids; 1305 1306 /* Assume the address we were loaded to is a valid physical address */ 1307 min_pa = pmap_early_vtophys(KERNBASE); 1308 1309 physmap_idx = physmem_avail(physmap, nitems(physmap)); 1310 physmap_idx /= 2; 1311 1312 /* 1313 * Find the minimum physical address. physmap is sorted, 1314 * but may contain empty ranges. 1315 */ 1316 for (i = 0; i < physmap_idx * 2; i += 2) { 1317 if (physmap[i] == physmap[i + 1]) 1318 continue; 1319 if (physmap[i] <= min_pa) 1320 min_pa = physmap[i]; 1321 } 1322 1323 bs_state.freemempos = KERNBASE + kernlen; 1324 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE); 1325 1326 /* Create a direct map region early so we can use it for pa -> va */ 1327 pmap_bootstrap_dmap(min_pa); 1328 bs_state.dmap_valid = true; 1329 /* 1330 * We only use PXN when we know nothing will be executed from it, e.g. 1331 * the DMAP region. 1332 */ 1333 bs_state.table_attrs &= ~TATTR_PXN_TABLE; 1334 1335 start_pa = pa = pmap_early_vtophys(KERNBASE); 1336 1337 /* 1338 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the 1339 * loader allocated the first and only l2 page table page used to map 1340 * the kernel, preloaded files and module metadata. 1341 */ 1342 pmap_bootstrap_l2(KERNBASE + L1_SIZE); 1343 /* And the l3 tables for the early devmap */ 1344 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE)); 1345 1346 cpu_tlb_flushID(); 1347 1348#define alloc_pages(var, np) \ 1349 (var) = bs_state.freemempos; \ 1350 bs_state.freemempos += (np * PAGE_SIZE); \ 1351 memset((char *)(var), 0, ((np) * PAGE_SIZE)); 1352 1353 /* Allocate dynamic per-cpu area. */ 1354 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); 1355 dpcpu_init((void *)dpcpu, 0); 1356 1357 /* Allocate memory for the msgbuf, e.g. 
for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(bs_state.freemempos);

	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);

	cpu_tlb_flushID();
}

#if defined(KASAN) || defined(KMSAN)
static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
    vm_offset_t *vap, vm_offset_t eva)
{
	vm_paddr_t pa;
	vm_offset_t va;
	pd_entry_t *l2;

	va = *vap;
	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
		l2 = pmap_l2(kernel_pmap, va);

		/*
		 * KASAN stack checking results in us having already allocated
		 * part of our shadow map, so we can just skip those segments.
		 */
		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
			pa += L2_SIZE;
			continue;
		}

		bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
	}
	*vap = va;
}

/*
 * Finish constructing the initial shadow map:
 * - Count how many pages from KERNBASE to virtual_avail (scaled for
 *   shadow map)
 * - Map that entire range using L2 superpages.
 */
static void
pmap_bootstrap_san1(vm_offset_t va, int scale)
{
	vm_offset_t eva;
	vm_paddr_t kernstart;
	int i;

	kernstart = pmap_early_vtophys(KERNBASE);

	/*
	 * Rebuild physmap one more time; we may have excluded more regions
	 * from allocation since pmap_bootstrap().
	 */
	bzero(physmap, sizeof(physmap));
	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;

	/*
	 * Find a slot in the physmap large enough for what we need.  We try
	 * to put the shadow map as high up as we can to avoid depleting the
	 * lower 4GB in case it is needed for, e.g., an xhci controller that
	 * can only do 32-bit DMA.
	 */
	for (i = (physmap_idx * 2) - 2; i >= 0; i -= 2) {
		vm_paddr_t plow, phigh;

		/* L2 mappings must be backed by memory that is L2-aligned */
		plow = roundup2(physmap[i], L2_SIZE);
		phigh = physmap[i + 1];
		if (plow >= phigh)
			continue;
		if (kernstart >= plow && kernstart < phigh)
			phigh = kernstart;
		if (phigh - plow >= L2_SIZE) {
			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
			if (va >= eva)
				break;
		}
	}
	if (i < 0)
		panic("Could not find phys region for shadow map");

	/*
	 * Done.  We should now have a valid shadow address mapped for all KVA
	 * that has been mapped so far, i.e., KERNBASE to virtual_avail.  Thus,
	 * shadow accesses by the sanitizer runtime will succeed for this
	 * range.  When the kernel virtual address range is later expanded, as
	 * will happen in vm_mem_init(), the shadow map will be grown as well.
	 * This is handled by pmap_san_enter().
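	 * (Each shadow byte covers "scale" bytes of KVA, which is why the
	 * end of the range mapped above is computed as
	 * va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale.)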
1462 */ 1463} 1464 1465void 1466pmap_bootstrap_san(void) 1467{ 1468#ifdef KASAN 1469 pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE); 1470#else 1471 static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE); 1472 static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE); 1473 pd_entry_t *l0, *l1; 1474 1475 if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE) 1476 panic("initial kernel map is too large"); 1477 1478 l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS); 1479 pmap_store(l0, L0_TABLE | PHYS_TO_PTE( 1480 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp))); 1481 l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS); 1482 pmap_store(l1, L1_TABLE | PHYS_TO_PTE( 1483 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE))); 1484 pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1); 1485 1486 l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS); 1487 pmap_store(l0, L0_TABLE | PHYS_TO_PTE( 1488 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp))); 1489 l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS); 1490 pmap_store(l1, L1_TABLE | PHYS_TO_PTE( 1491 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE))); 1492 pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1); 1493#endif 1494} 1495#endif 1496 1497/* 1498 * Initialize a vm_page's machine-dependent fields. 1499 */ 1500void 1501pmap_page_init(vm_page_t m) 1502{ 1503 1504 TAILQ_INIT(&m->md.pv_list); 1505 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; 1506} 1507 1508static void 1509pmap_init_asids(struct asid_set *set, int bits) 1510{ 1511 int i; 1512 1513 set->asid_bits = bits; 1514 1515 /* 1516 * We may be too early in the overall initialization process to use 1517 * bit_alloc(). 1518 */ 1519 set->asid_set_size = 1 << set->asid_bits; 1520 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size), 1521 M_WAITOK | M_ZERO); 1522 for (i = 0; i < ASID_FIRST_AVAILABLE; i++) 1523 bit_set(set->asid_set, i); 1524 set->asid_next = ASID_FIRST_AVAILABLE; 1525 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN); 1526} 1527 1528static void 1529pmap_init_pv_table(void) 1530{ 1531 struct vm_phys_seg *seg, *next_seg; 1532 struct pmap_large_md_page *pvd; 1533 vm_size_t s; 1534 int domain, i, j, pages; 1535 1536 /* 1537 * We strongly depend on the size being a power of two, so the assert 1538 * is overzealous. However, should the struct be resized to a 1539 * different power of two, the code below needs to be revisited. 1540 */ 1541 CTASSERT((sizeof(*pvd) == 64)); 1542 1543 /* 1544 * Calculate the size of the array. 1545 */ 1546 s = 0; 1547 for (i = 0; i < vm_phys_nsegs; i++) { 1548 seg = &vm_phys_segs[i]; 1549 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1550 pmap_l2_pindex(seg->start); 1551 s += round_page(pages * sizeof(*pvd)); 1552 } 1553 pv_table = (struct pmap_large_md_page *)kva_alloc(s); 1554 if (pv_table == NULL) 1555 panic("%s: kva_alloc failed\n", __func__); 1556 1557 /* 1558 * Iterate physical segments to allocate domain-local memory for PV 1559 * list headers. 
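 *
 * Once this table is set up, a lookup is a constant-time index
 * computation; pa_to_pmdp() above locates the entry for a physical
 * address as
 *
 *	seg->md_first + pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start)
 *
 * i.e. one pmap_large_md_page (PV list head plus its lock) per L2_SIZE
 * of each physical segment.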
1560 */ 1561 pvd = pv_table; 1562 for (i = 0; i < vm_phys_nsegs; i++) { 1563 seg = &vm_phys_segs[i]; 1564 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1565 pmap_l2_pindex(seg->start); 1566 domain = seg->domain; 1567 1568 s = round_page(pages * sizeof(*pvd)); 1569 1570 for (j = 0; j < s; j += PAGE_SIZE) { 1571 vm_page_t m = vm_page_alloc_noobj_domain(domain, 1572 VM_ALLOC_ZERO); 1573 if (m == NULL) 1574 panic("failed to allocate PV table page"); 1575 pmap_qenter((vm_offset_t)pvd + j, &m, 1); 1576 } 1577 1578 for (j = 0; j < s / sizeof(*pvd); j++) { 1579 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); 1580 TAILQ_INIT(&pvd->pv_page.pv_list); 1581 pvd++; 1582 } 1583 } 1584 pvd = &pv_dummy_large; 1585 memset(pvd, 0, sizeof(*pvd)); 1586 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); 1587 TAILQ_INIT(&pvd->pv_page.pv_list); 1588 1589 /* 1590 * Set pointers from vm_phys_segs to pv_table. 1591 */ 1592 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) { 1593 seg = &vm_phys_segs[i]; 1594 seg->md_first = pvd; 1595 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1596 pmap_l2_pindex(seg->start); 1597 1598 /* 1599 * If there is a following segment, and the final 1600 * superpage of this segment and the initial superpage 1601 * of the next segment are the same then adjust the 1602 * pv_table entry for that next segment down by one so 1603 * that the pv_table entries will be shared. 1604 */ 1605 if (i + 1 < vm_phys_nsegs) { 1606 next_seg = &vm_phys_segs[i + 1]; 1607 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 == 1608 pmap_l2_pindex(next_seg->start)) { 1609 pvd--; 1610 } 1611 } 1612 } 1613} 1614 1615/* 1616 * Initialize the pmap module. 1617 * Called by vm_init, to initialize any structures that the pmap 1618 * system needs to map virtual memory. 1619 */ 1620void 1621pmap_init(void) 1622{ 1623 uint64_t mmfr1; 1624 int i, vmid_bits; 1625 1626 /* 1627 * Are large page mappings enabled? 1628 */ 1629 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); 1630 if (superpages_enabled) { 1631 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1632 ("pmap_init: can't assign to pagesizes[1]")); 1633 pagesizes[1] = L2_SIZE; 1634 if (L1_BLOCKS_SUPPORTED) { 1635 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0, 1636 ("pmap_init: can't assign to pagesizes[2]")); 1637 pagesizes[2] = L1_SIZE; 1638 } 1639 } 1640 1641 /* 1642 * Initialize the ASID allocator. 1643 */ 1644 pmap_init_asids(&asids, 1645 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8); 1646 1647 if (has_hyp()) { 1648 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); 1649 vmid_bits = 8; 1650 1651 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) == 1652 ID_AA64MMFR1_VMIDBits_16) 1653 vmid_bits = 16; 1654 pmap_init_asids(&vmids, vmid_bits); 1655 } 1656 1657 /* 1658 * Initialize pv chunk lists. 
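 * There is one list and lock per memory domain (PMAP_MEMDOM is MAXMEMDOM
 * on NUMA kernels and 1 otherwise), so pv chunk allocation and
 * reclamation can be kept domain-local; pc_to_domain() above maps a
 * chunk back to its domain through the DMAP.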
1659 */ 1660 for (i = 0; i < PMAP_MEMDOM; i++) { 1661 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, 1662 MTX_DEF); 1663 TAILQ_INIT(&pv_chunks[i].pvc_list); 1664 } 1665 pmap_init_pv_table(); 1666 1667 vm_initialized = 1; 1668} 1669 1670static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1671 "2MB page mapping counters"); 1672 1673static u_long pmap_l2_demotions; 1674SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, 1675 &pmap_l2_demotions, 0, "2MB page demotions"); 1676 1677static u_long pmap_l2_mappings; 1678SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, 1679 &pmap_l2_mappings, 0, "2MB page mappings"); 1680 1681static u_long pmap_l2_p_failures; 1682SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, 1683 &pmap_l2_p_failures, 0, "2MB page promotion failures"); 1684 1685static u_long pmap_l2_promotions; 1686SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, 1687 &pmap_l2_promotions, 0, "2MB page promotions"); 1688 1689static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 1690 "L3C (64KB/2MB) page mapping counters"); 1691 1692static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions); 1693SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD, 1694 &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions"); 1695 1696static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings); 1697SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD, 1698 &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings"); 1699 1700static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures); 1701SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD, 1702 &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures"); 1703 1704static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions); 1705SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD, 1706 &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions"); 1707 1708/* 1709 * If the given value for "final_only" is false, then any cached intermediate- 1710 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to 1711 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry. 1712 * Otherwise, just the cached final-level entry is invalidated. 1713 */ 1714static __inline void 1715pmap_s1_invalidate_kernel(uint64_t r, bool final_only) 1716{ 1717 if (final_only) 1718 __asm __volatile("tlbi vaale1is, %0" : : "r" (r)); 1719 else 1720 __asm __volatile("tlbi vaae1is, %0" : : "r" (r)); 1721} 1722 1723static __inline void 1724pmap_s1_invalidate_user(uint64_t r, bool final_only) 1725{ 1726 if (final_only) 1727 __asm __volatile("tlbi vale1is, %0" : : "r" (r)); 1728 else 1729 __asm __volatile("tlbi vae1is, %0" : : "r" (r)); 1730} 1731 1732/* 1733 * Invalidates any cached final- and optionally intermediate-level TLB entries 1734 * for the specified virtual address in the given virtual address space. 
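 * The operand for the TLBI instructions is built with TLBI_VA(), which
 * places VA bits [55:12] in operand bits [43:0]; for a user pmap the ASID
 * taken from pm_cookie is merged into the upper operand bits via
 * ASID_TO_OPERAND().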
1735 */ 1736static __inline void 1737pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1738{ 1739 uint64_t r; 1740 1741 PMAP_ASSERT_STAGE1(pmap); 1742 1743 dsb(ishst); 1744 r = TLBI_VA(va); 1745 if (pmap == kernel_pmap) { 1746 pmap_s1_invalidate_kernel(r, final_only); 1747 } else { 1748 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1749 pmap_s1_invalidate_user(r, final_only); 1750 } 1751 dsb(ish); 1752 isb(); 1753} 1754 1755static __inline void 1756pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1757{ 1758 PMAP_ASSERT_STAGE2(pmap); 1759 MPASS(pmap_stage2_invalidate_range != NULL); 1760 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE, 1761 final_only); 1762} 1763 1764static __inline void 1765pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) 1766{ 1767 if (pmap->pm_stage == PM_STAGE1) 1768 pmap_s1_invalidate_page(pmap, va, final_only); 1769 else 1770 pmap_s2_invalidate_page(pmap, va, final_only); 1771} 1772 1773/* 1774 * Invalidates any cached final- and optionally intermediate-level TLB entries 1775 * for the specified virtual address range in the given virtual address space. 1776 */ 1777static __inline void 1778pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1779 bool final_only) 1780{ 1781 uint64_t end, r, start; 1782 1783 PMAP_ASSERT_STAGE1(pmap); 1784 1785 dsb(ishst); 1786 if (pmap == kernel_pmap) { 1787 start = TLBI_VA(sva); 1788 end = TLBI_VA(eva); 1789 for (r = start; r < end; r += TLBI_VA_L3_INCR) 1790 pmap_s1_invalidate_kernel(r, final_only); 1791 } else { 1792 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1793 start |= TLBI_VA(sva); 1794 end |= TLBI_VA(eva); 1795 for (r = start; r < end; r += TLBI_VA_L3_INCR) 1796 pmap_s1_invalidate_user(r, final_only); 1797 } 1798 dsb(ish); 1799 isb(); 1800} 1801 1802static __inline void 1803pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1804 bool final_only) 1805{ 1806 PMAP_ASSERT_STAGE2(pmap); 1807 MPASS(pmap_stage2_invalidate_range != NULL); 1808 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only); 1809} 1810 1811static __inline void 1812pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, 1813 bool final_only) 1814{ 1815 if (pmap->pm_stage == PM_STAGE1) 1816 pmap_s1_invalidate_range(pmap, sva, eva, final_only); 1817 else 1818 pmap_s2_invalidate_range(pmap, sva, eva, final_only); 1819} 1820 1821/* 1822 * Invalidates all cached intermediate- and final-level TLB entries for the 1823 * given virtual address space. 
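 * Kernel mappings are global rather than ASID-tagged, so the kernel pmap
 * must use "tlbi vmalle1is"; a user pmap can be flushed more selectively
 * by ASID with "tlbi aside1is".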
1824 */ 1825static __inline void 1826pmap_s1_invalidate_all(pmap_t pmap) 1827{ 1828 uint64_t r; 1829 1830 PMAP_ASSERT_STAGE1(pmap); 1831 1832 dsb(ishst); 1833 if (pmap == kernel_pmap) { 1834 __asm __volatile("tlbi vmalle1is"); 1835 } else { 1836 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 1837 __asm __volatile("tlbi aside1is, %0" : : "r" (r)); 1838 } 1839 dsb(ish); 1840 isb(); 1841} 1842 1843static __inline void 1844pmap_s2_invalidate_all(pmap_t pmap) 1845{ 1846 PMAP_ASSERT_STAGE2(pmap); 1847 MPASS(pmap_stage2_invalidate_all != NULL); 1848 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap)); 1849} 1850 1851static __inline void 1852pmap_invalidate_all(pmap_t pmap) 1853{ 1854 if (pmap->pm_stage == PM_STAGE1) 1855 pmap_s1_invalidate_all(pmap); 1856 else 1857 pmap_s2_invalidate_all(pmap); 1858} 1859 1860/* 1861 * Routine: pmap_extract 1862 * Function: 1863 * Extract the physical page address associated 1864 * with the given map/virtual_address pair. 1865 */ 1866vm_paddr_t 1867pmap_extract(pmap_t pmap, vm_offset_t va) 1868{ 1869 pt_entry_t *pte, tpte; 1870 vm_paddr_t pa; 1871 int lvl; 1872 1873 pa = 0; 1874 PMAP_LOCK(pmap); 1875 /* 1876 * Find the block or page map for this virtual address. pmap_pte 1877 * will return either a valid block/page entry, or NULL. 1878 */ 1879 pte = pmap_pte(pmap, va, &lvl); 1880 if (pte != NULL) { 1881 tpte = pmap_load(pte); 1882 pa = PTE_TO_PHYS(tpte); 1883 switch(lvl) { 1884 case 1: 1885 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 1886 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, 1887 ("pmap_extract: Invalid L1 pte found: %lx", 1888 tpte & ATTR_DESCR_MASK)); 1889 pa |= (va & L1_OFFSET); 1890 break; 1891 case 2: 1892 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, 1893 ("pmap_extract: Invalid L2 pte found: %lx", 1894 tpte & ATTR_DESCR_MASK)); 1895 pa |= (va & L2_OFFSET); 1896 break; 1897 case 3: 1898 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, 1899 ("pmap_extract: Invalid L3 pte found: %lx", 1900 tpte & ATTR_DESCR_MASK)); 1901 pa |= (va & L3_OFFSET); 1902 break; 1903 } 1904 } 1905 PMAP_UNLOCK(pmap); 1906 return (pa); 1907} 1908 1909/* 1910 * Routine: pmap_extract_and_hold 1911 * Function: 1912 * Atomically extract and hold the physical page 1913 * with the given pmap and virtual address pair 1914 * if that mapping permits the given protection. 1915 */ 1916vm_page_t 1917pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1918{ 1919 pt_entry_t *pte, tpte; 1920 vm_offset_t off; 1921 vm_page_t m; 1922 int lvl; 1923 bool use; 1924 1925 m = NULL; 1926 PMAP_LOCK(pmap); 1927 pte = pmap_pte(pmap, va, &lvl); 1928 if (pte != NULL) { 1929 tpte = pmap_load(pte); 1930 1931 KASSERT(lvl > 0 && lvl <= 3, 1932 ("pmap_extract_and_hold: Invalid level %d", lvl)); 1933 /* 1934 * Check that the pte is either a L3 page, or a L1 or L2 block 1935 * entry. We can assume L1_BLOCK == L2_BLOCK. 
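 * Both macros encode the same block descriptor type bits, so a single
 * comparison against L1_BLOCK covers a block mapping at either level.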
1936 */ 1937 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || 1938 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), 1939 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, 1940 tpte & ATTR_DESCR_MASK)); 1941 1942 use = false; 1943 if ((prot & VM_PROT_WRITE) == 0) 1944 use = true; 1945 else if (pmap->pm_stage == PM_STAGE1 && 1946 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)) 1947 use = true; 1948 else if (pmap->pm_stage == PM_STAGE2 && 1949 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) == 1950 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE))) 1951 use = true; 1952 1953 if (use) { 1954 switch (lvl) { 1955 case 1: 1956 off = va & L1_OFFSET; 1957 break; 1958 case 2: 1959 off = va & L2_OFFSET; 1960 break; 1961 case 3: 1962 default: 1963 off = 0; 1964 } 1965 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off); 1966 if (m != NULL && !vm_page_wire_mapped(m)) 1967 m = NULL; 1968 } 1969 } 1970 PMAP_UNLOCK(pmap); 1971 return (m); 1972} 1973 1974/* 1975 * Walks the page tables to translate a kernel virtual address to a 1976 * physical address. Returns true if the kva is valid and stores the 1977 * physical address in pa if it is not NULL. 1978 * 1979 * See the comment above data_abort() for the rationale for specifying 1980 * NO_PERTHREAD_SSP here. 1981 */ 1982bool NO_PERTHREAD_SSP 1983pmap_klookup(vm_offset_t va, vm_paddr_t *pa) 1984{ 1985 pt_entry_t *pte, tpte; 1986 register_t intr; 1987 uint64_t par; 1988 1989 /* 1990 * Disable interrupts so we don't get interrupted between asking 1991 * for address translation, and getting the result back. 1992 */ 1993 intr = intr_disable(); 1994 par = arm64_address_translate_s1e1r(va); 1995 intr_restore(intr); 1996 1997 if (PAR_SUCCESS(par)) { 1998 if (pa != NULL) 1999 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK); 2000 return (true); 2001 } 2002 2003 /* 2004 * Fall back to walking the page table. The address translation 2005 * instruction may fail when the page is in a break-before-make 2006 * sequence. As we only clear the valid bit in said sequence we 2007 * can walk the page table to find the physical address. 2008 */ 2009 2010 pte = pmap_l1(kernel_pmap, va); 2011 if (pte == NULL) 2012 return (false); 2013 2014 /* 2015 * A concurrent pmap_update_entry() will clear the entry's valid bit 2016 * but leave the rest of the entry unchanged. Therefore, we treat a 2017 * non-zero entry as being valid, and we ignore the valid bit when 2018 * determining whether the entry maps a block, page, or table. 2019 */ 2020 tpte = pmap_load(pte); 2021 if (tpte == 0) 2022 return (false); 2023 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2024 if (pa != NULL) 2025 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET); 2026 return (true); 2027 } 2028 pte = pmap_l1_to_l2(&tpte, va); 2029 tpte = pmap_load(pte); 2030 if (tpte == 0) 2031 return (false); 2032 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 2033 if (pa != NULL) 2034 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET); 2035 return (true); 2036 } 2037 pte = pmap_l2_to_l3(&tpte, va); 2038 tpte = pmap_load(pte); 2039 if (tpte == 0) 2040 return (false); 2041 if (pa != NULL) 2042 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET); 2043 return (true); 2044} 2045 2046/* 2047 * Routine: pmap_kextract 2048 * Function: 2049 * Extract the physical page address associated with the given kernel 2050 * virtual address. 
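 *
 * Addresses within the direct map are translated arithmetically via
 * DMAP_TO_PHYS(); any other kernel virtual address is resolved with
 * pmap_klookup(), and 0 is returned if no valid mapping exists.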
2051 */ 2052vm_paddr_t 2053pmap_kextract(vm_offset_t va) 2054{ 2055 vm_paddr_t pa; 2056 2057 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 2058 return (DMAP_TO_PHYS(va)); 2059 2060 if (pmap_klookup(va, &pa) == false) 2061 return (0); 2062 return (pa); 2063} 2064 2065/*************************************************** 2066 * Low level mapping routines..... 2067 ***************************************************/ 2068 2069void 2070pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) 2071{ 2072 pd_entry_t *pde; 2073 pt_entry_t attr, old_l3e, *pte; 2074 vm_offset_t va; 2075 vm_page_t mpte; 2076 int error, lvl; 2077 2078 KASSERT((pa & L3_OFFSET) == 0, 2079 ("pmap_kenter: Invalid physical address")); 2080 KASSERT((sva & L3_OFFSET) == 0, 2081 ("pmap_kenter: Invalid virtual address")); 2082 KASSERT((size & PAGE_MASK) == 0, 2083 ("pmap_kenter: Mapping is not page-sized")); 2084 2085 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 2086 ATTR_KERN_GP | ATTR_S1_IDX(mode); 2087 old_l3e = 0; 2088 va = sva; 2089 while (size != 0) { 2090 pde = pmap_pde(kernel_pmap, va, &lvl); 2091 KASSERT(pde != NULL, 2092 ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); 2093 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); 2094 2095 /* 2096 * If we have an aligned, contiguous chunk of L2_SIZE, try 2097 * to create an L2_BLOCK mapping. 2098 */ 2099 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE && 2100 (pa & L2_OFFSET) == 0 && vm_initialized) { 2101 mpte = PTE_TO_VM_PAGE(pmap_load(pde)); 2102 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)), 2103 ("pmap_kenter: Unexpected mapping")); 2104 PMAP_LOCK(kernel_pmap); 2105 error = pmap_insert_pt_page(kernel_pmap, mpte, false, 2106 false); 2107 if (error == 0) { 2108 attr &= ~ATTR_CONTIGUOUS; 2109 2110 /* 2111 * Although the page table page "mpte" should 2112 * be devoid of mappings, the TLB might hold 2113 * intermediate entries that reference it, so 2114 * we perform a single-page invalidation. 2115 */ 2116 pmap_update_entry(kernel_pmap, pde, 2117 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va, 2118 PAGE_SIZE); 2119 } 2120 PMAP_UNLOCK(kernel_pmap); 2121 if (error == 0) { 2122 va += L2_SIZE; 2123 pa += L2_SIZE; 2124 size -= L2_SIZE; 2125 continue; 2126 } 2127 } 2128 2129 /* 2130 * If we have an aligned, contiguous chunk of L3C_ENTRIES 2131 * L3 pages, set the contiguous bit within each PTE so that 2132 * the chunk can be cached using only one TLB entry. 2133 */ 2134 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) { 2135 if (size >= L3C_SIZE) 2136 attr |= ATTR_CONTIGUOUS; 2137 else 2138 attr &= ~ATTR_CONTIGUOUS; 2139 } 2140 2141 pte = pmap_l2_to_l3(pde, va); 2142 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr | 2143 L3_PAGE); 2144 2145 va += PAGE_SIZE; 2146 pa += PAGE_SIZE; 2147 size -= PAGE_SIZE; 2148 } 2149 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2150 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2151 else { 2152 /* 2153 * Because the old entries were invalid and the new mappings 2154 * are not executable, an isb is not required. 2155 */ 2156 dsb(ishst); 2157 } 2158} 2159 2160void 2161pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) 2162{ 2163 2164 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE); 2165} 2166 2167/* 2168 * Remove a page from the kernel pagetables. 
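 *
 * The address must be mapped by an ordinary 4KB (L3) page that is not part
 * of an ATTR_CONTIGUOUS range; the entry is cleared and a single final-level
 * TLB invalidation is performed.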
2169 */ 2170void 2171pmap_kremove(vm_offset_t va) 2172{ 2173 pt_entry_t *pte; 2174 2175 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__); 2176 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0, 2177 ("pmap_kremove: unexpected ATTR_CONTIGUOUS")); 2178 pmap_clear(pte); 2179 pmap_s1_invalidate_page(kernel_pmap, va, true); 2180} 2181 2182/* 2183 * Remove the specified range of mappings from the kernel address space. 2184 * 2185 * Should only be applied to mappings that were created by pmap_kenter() or 2186 * pmap_kenter_device(). Nothing about this function is actually specific 2187 * to device mappings. 2188 */ 2189void 2190pmap_kremove_device(vm_offset_t sva, vm_size_t size) 2191{ 2192 pt_entry_t *ptep, *ptep_end; 2193 vm_offset_t va; 2194 int lvl; 2195 2196 KASSERT((sva & L3_OFFSET) == 0, 2197 ("pmap_kremove_device: Invalid virtual address")); 2198 KASSERT((size & PAGE_MASK) == 0, 2199 ("pmap_kremove_device: Mapping is not page-sized")); 2200 2201 va = sva; 2202 while (size != 0) { 2203 ptep = pmap_pte(kernel_pmap, va, &lvl); 2204 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va)); 2205 switch (lvl) { 2206 case 2: 2207 KASSERT((va & L2_OFFSET) == 0, 2208 ("Unaligned virtual address")); 2209 KASSERT(size >= L2_SIZE, ("Insufficient size")); 2210 2211 if (va != sva) { 2212 pmap_s1_invalidate_range(kernel_pmap, sva, va, 2213 true); 2214 } 2215 pmap_clear(ptep); 2216 pmap_s1_invalidate_page(kernel_pmap, va, true); 2217 PMAP_LOCK(kernel_pmap); 2218 pmap_remove_kernel_l2(kernel_pmap, ptep, va); 2219 PMAP_UNLOCK(kernel_pmap); 2220 2221 va += L2_SIZE; 2222 sva = va; 2223 size -= L2_SIZE; 2224 break; 2225 case 3: 2226 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) { 2227 KASSERT((va & L3C_OFFSET) == 0, 2228 ("Unaligned L3C virtual address")); 2229 KASSERT(size >= L3C_SIZE, 2230 ("Insufficient L3C size")); 2231 2232 ptep_end = ptep + L3C_ENTRIES; 2233 for (; ptep < ptep_end; ptep++) 2234 pmap_clear(ptep); 2235 2236 va += L3C_SIZE; 2237 size -= L3C_SIZE; 2238 break; 2239 } 2240 pmap_clear(ptep); 2241 2242 va += PAGE_SIZE; 2243 size -= PAGE_SIZE; 2244 break; 2245 default: 2246 __assert_unreachable(); 2247 break; 2248 } 2249 } 2250 if (va != sva) 2251 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2252} 2253 2254/* 2255 * Used to map a range of physical addresses into kernel 2256 * virtual address space. 2257 * 2258 * The value passed in '*virt' is a suggested virtual address for 2259 * the mapping. Architectures which can support a direct-mapped 2260 * physical to virtual region can return the appropriate address 2261 * within that region, leaving '*virt' unchanged. Other 2262 * architectures should map the pages starting at '*virt' and 2263 * update '*virt' with the first usable address after the mapped 2264 * region. 2265 */ 2266vm_offset_t 2267pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 2268{ 2269 return PHYS_TO_DMAP(start); 2270} 2271 2272/* 2273 * Add a list of wired pages to the kva 2274 * this routine is only used for temporary 2275 * kernel mappings that do not need to have 2276 * page modification or references recorded. 2277 * Note that old mappings are simply written 2278 * over. The page *must* be wired. 2279 * Note: SMP coherent. Uses a ranged shootdown IPI. 
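 * On arm64 the "shootdown" is performed with broadcast TLBI instructions
 * rather than an IPI; see pmap_s1_invalidate_range().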
2280 */ 2281void 2282pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 2283{ 2284 pd_entry_t *pde; 2285 pt_entry_t attr, old_l3e, *pte; 2286 vm_offset_t va; 2287 vm_page_t m; 2288 int i, lvl; 2289 2290 old_l3e = 0; 2291 va = sva; 2292 for (i = 0; i < count; i++) { 2293 pde = pmap_pde(kernel_pmap, va, &lvl); 2294 KASSERT(pde != NULL, 2295 ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); 2296 KASSERT(lvl == 2, 2297 ("pmap_qenter: Invalid level %d", lvl)); 2298 2299 m = ma[i]; 2300 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN | 2301 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE; 2302 pte = pmap_l2_to_l3(pde, va); 2303 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr); 2304 2305 va += L3_SIZE; 2306 } 2307 if ((old_l3e & ATTR_DESCR_VALID) != 0) 2308 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2309 else { 2310 /* 2311 * Because the old entries were invalid and the new mappings 2312 * are not executable, an isb is not required. 2313 */ 2314 dsb(ishst); 2315 } 2316} 2317 2318/* 2319 * This routine tears out page mappings from the 2320 * kernel -- it is meant only for temporary mappings. 2321 */ 2322void 2323pmap_qremove(vm_offset_t sva, int count) 2324{ 2325 pt_entry_t *pte; 2326 vm_offset_t va; 2327 2328 KASSERT(ADDR_IS_CANONICAL(sva), 2329 ("%s: Address not in canonical form: %lx", __func__, sva)); 2330 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva)); 2331 2332 va = sva; 2333 while (count-- > 0) { 2334 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL); 2335 if (pte != NULL) { 2336 pmap_clear(pte); 2337 } 2338 2339 va += PAGE_SIZE; 2340 } 2341 pmap_s1_invalidate_range(kernel_pmap, sva, va, true); 2342} 2343 2344/*************************************************** 2345 * Page table page management routines..... 2346 ***************************************************/ 2347/* 2348 * Schedule the specified unused page table page to be freed. Specifically, 2349 * add the page to the specified list of pages that will be released to the 2350 * physical memory manager after the TLB has been updated. 2351 */ 2352static __inline void 2353pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO) 2354{ 2355 2356 if (set_PG_ZERO) 2357 m->flags |= PG_ZERO; 2358 else 2359 m->flags &= ~PG_ZERO; 2360 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2361} 2362 2363/* 2364 * Decrements a page table page's reference count, which is used to record the 2365 * number of valid page table entries within the page. If the reference count 2366 * drops to zero, then the page table page is unmapped. Returns true if the 2367 * page table page was unmapped and false otherwise. 
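 *
 * An unmapped page table page is not freed here; it is queued on "free" by
 * pmap_add_delayed_free_list(), and the caller releases it to the physical
 * memory manager only after the TLB invalidation has completed.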
2368 */ 2369static inline bool 2370pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2371{ 2372 2373 --m->ref_count; 2374 if (m->ref_count == 0) { 2375 _pmap_unwire_l3(pmap, va, m, free); 2376 return (true); 2377 } else 2378 return (false); 2379} 2380 2381static void 2382_pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2383{ 2384 2385 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2386 /* 2387 * unmap the page table page 2388 */ 2389 if (m->pindex >= (NUL2E + NUL1E)) { 2390 /* l1 page */ 2391 pd_entry_t *l0; 2392 2393 l0 = pmap_l0(pmap, va); 2394 pmap_clear(l0); 2395 } else if (m->pindex >= NUL2E) { 2396 /* l2 page */ 2397 pd_entry_t *l1; 2398 2399 l1 = pmap_l1(pmap, va); 2400 pmap_clear(l1); 2401 } else { 2402 /* l3 page */ 2403 pd_entry_t *l2; 2404 2405 l2 = pmap_l2(pmap, va); 2406 pmap_clear(l2); 2407 } 2408 pmap_resident_count_dec(pmap, 1); 2409 if (m->pindex < NUL2E) { 2410 /* We just released an l3, unhold the matching l2 */ 2411 pd_entry_t *l1, tl1; 2412 vm_page_t l2pg; 2413 2414 l1 = pmap_l1(pmap, va); 2415 tl1 = pmap_load(l1); 2416 l2pg = PTE_TO_VM_PAGE(tl1); 2417 pmap_unwire_l3(pmap, va, l2pg, free); 2418 } else if (m->pindex < (NUL2E + NUL1E)) { 2419 /* We just released an l2, unhold the matching l1 */ 2420 pd_entry_t *l0, tl0; 2421 vm_page_t l1pg; 2422 2423 l0 = pmap_l0(pmap, va); 2424 tl0 = pmap_load(l0); 2425 l1pg = PTE_TO_VM_PAGE(tl0); 2426 pmap_unwire_l3(pmap, va, l1pg, free); 2427 } 2428 pmap_invalidate_page(pmap, va, false); 2429 2430 /* 2431 * Put page on a list so that it is released after 2432 * *ALL* TLB shootdown is done 2433 */ 2434 pmap_add_delayed_free_list(m, free, true); 2435} 2436 2437/* 2438 * After removing a page table entry, this routine is used to 2439 * conditionally free the page, and manage the reference count. 2440 */ 2441static int 2442pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2443 struct spglist *free) 2444{ 2445 vm_page_t mpte; 2446 2447 KASSERT(ADDR_IS_CANONICAL(va), 2448 ("%s: Address not in canonical form: %lx", __func__, va)); 2449 if (ADDR_IS_KERNEL(va)) 2450 return (0); 2451 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2452 mpte = PTE_TO_VM_PAGE(ptepde); 2453 return (pmap_unwire_l3(pmap, va, mpte, free)); 2454} 2455 2456/* 2457 * Release a page table page reference after a failed attempt to create a 2458 * mapping. 
2459 */ 2460static void 2461pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 2462{ 2463 struct spglist free; 2464 2465 SLIST_INIT(&free); 2466 if (pmap_unwire_l3(pmap, va, mpte, &free)) 2467 vm_page_free_pages_toq(&free, true); 2468} 2469 2470void 2471pmap_pinit0(pmap_t pmap) 2472{ 2473 2474 PMAP_LOCK_INIT(pmap); 2475 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2476 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1); 2477 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2478 TAILQ_INIT(&pmap->pm_pvchunk); 2479 vm_radix_init(&pmap->pm_root); 2480 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN); 2481 pmap->pm_stage = PM_STAGE1; 2482 pmap->pm_levels = 4; 2483 pmap->pm_ttbr = pmap->pm_l0_paddr; 2484 pmap->pm_asid_set = &asids; 2485 pmap->pm_bti = NULL; 2486 2487 PCPU_SET(curpmap, pmap); 2488} 2489 2490int 2491pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels) 2492{ 2493 vm_page_t m; 2494 2495 /* 2496 * allocate the l0 page 2497 */ 2498 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED | 2499 VM_ALLOC_ZERO); 2500 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m); 2501 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr); 2502 2503 TAILQ_INIT(&pmap->pm_pvchunk); 2504 vm_radix_init(&pmap->pm_root); 2505 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); 2506 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX); 2507 2508 MPASS(levels == 3 || levels == 4); 2509 pmap->pm_levels = levels; 2510 pmap->pm_stage = stage; 2511 pmap->pm_bti = NULL; 2512 switch (stage) { 2513 case PM_STAGE1: 2514 pmap->pm_asid_set = &asids; 2515 if (pmap_bti_support) { 2516 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF, 2517 M_ZERO | M_WAITOK); 2518 rangeset_init(pmap->pm_bti, bti_dup_range, 2519 bti_free_range, pmap, M_NOWAIT); 2520 } 2521 break; 2522 case PM_STAGE2: 2523 pmap->pm_asid_set = &vmids; 2524 break; 2525 default: 2526 panic("%s: Invalid pmap type %d", __func__, stage); 2527 break; 2528 } 2529 2530 /* XXX Temporarily disable deferred ASID allocation. */ 2531 pmap_alloc_asid(pmap); 2532 2533 /* 2534 * Allocate the level 1 entry to use as the root. This will increase 2535 * the refcount on the level 1 page so it won't be removed until 2536 * pmap_release() is called. 2537 */ 2538 if (pmap->pm_levels == 3) { 2539 PMAP_LOCK(pmap); 2540 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL); 2541 PMAP_UNLOCK(pmap); 2542 } 2543 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m); 2544 2545 return (1); 2546} 2547 2548int 2549pmap_pinit(pmap_t pmap) 2550{ 2551 2552 return (pmap_pinit_stage(pmap, PM_STAGE1, 4)); 2553} 2554 2555/* 2556 * This routine is called if the desired page table page does not exist. 2557 * 2558 * If page table page allocation fails, this routine may sleep before 2559 * returning NULL. It sleeps only if a lock pointer was given. 2560 * 2561 * Note: If a page allocation fails at page table level two or three, 2562 * one or two pages may be held during the wait, only to be released 2563 * afterwards. This conservative approach is easily argued to avoid 2564 * race conditions. 2565 */ 2566static vm_page_t 2567_pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2568{ 2569 vm_page_t m, l1pg, l2pg; 2570 2571 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2572 2573 /* 2574 * Allocate a page table page. 2575 */ 2576 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2577 if (lockp != NULL) { 2578 RELEASE_PV_LIST_LOCK(lockp); 2579 PMAP_UNLOCK(pmap); 2580 vm_wait(NULL); 2581 PMAP_LOCK(pmap); 2582 } 2583 2584 /* 2585 * Indicate the need to retry. 
While waiting, the page table 2586 * page may have been allocated. 2587 */ 2588 return (NULL); 2589 } 2590 m->pindex = ptepindex; 2591 2592 /* 2593 * Because of AArch64's weak memory consistency model, we must have a 2594 * barrier here to ensure that the stores for zeroing "m", whether by 2595 * pmap_zero_page() or an earlier function, are visible before adding 2596 * "m" to the page table. Otherwise, a page table walk by another 2597 * processor's MMU could see the mapping to "m" and a stale, non-zero 2598 * PTE within "m". 2599 */ 2600 dmb(ishst); 2601 2602 /* 2603 * Map the pagetable page into the process address space, if 2604 * it isn't already there. 2605 */ 2606 2607 if (ptepindex >= (NUL2E + NUL1E)) { 2608 pd_entry_t *l0p, l0e; 2609 vm_pindex_t l0index; 2610 2611 l0index = ptepindex - (NUL2E + NUL1E); 2612 l0p = &pmap->pm_l0[l0index]; 2613 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0, 2614 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p))); 2615 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE; 2616 2617 /* 2618 * Mark all kernel memory as not accessible from userspace 2619 * and userspace memory as not executable from the kernel. 2620 * This has been done for the bootstrap L0 entries in 2621 * locore.S. 2622 */ 2623 if (pmap == kernel_pmap) 2624 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0; 2625 else 2626 l0e |= TATTR_PXN_TABLE; 2627 pmap_store(l0p, l0e); 2628 } else if (ptepindex >= NUL2E) { 2629 vm_pindex_t l0index, l1index; 2630 pd_entry_t *l0, *l1; 2631 pd_entry_t tl0; 2632 2633 l1index = ptepindex - NUL2E; 2634 l0index = l1index >> Ln_ENTRIES_SHIFT; 2635 2636 l0 = &pmap->pm_l0[l0index]; 2637 tl0 = pmap_load(l0); 2638 if (tl0 == 0) { 2639 /* recurse for allocating page dir */ 2640 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, 2641 lockp) == NULL) { 2642 vm_page_unwire_noq(m); 2643 vm_page_free_zero(m); 2644 return (NULL); 2645 } 2646 } else { 2647 l1pg = PTE_TO_VM_PAGE(tl0); 2648 l1pg->ref_count++; 2649 } 2650 2651 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0))); 2652 l1 = &l1[ptepindex & Ln_ADDR_MASK]; 2653 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0, 2654 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); 2655 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE); 2656 } else { 2657 vm_pindex_t l0index, l1index; 2658 pd_entry_t *l0, *l1, *l2; 2659 pd_entry_t tl0, tl1; 2660 2661 l1index = ptepindex >> Ln_ENTRIES_SHIFT; 2662 l0index = l1index >> Ln_ENTRIES_SHIFT; 2663 2664 l0 = &pmap->pm_l0[l0index]; 2665 tl0 = pmap_load(l0); 2666 if (tl0 == 0) { 2667 /* recurse for allocating page dir */ 2668 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2669 lockp) == NULL) { 2670 vm_page_unwire_noq(m); 2671 vm_page_free_zero(m); 2672 return (NULL); 2673 } 2674 tl0 = pmap_load(l0); 2675 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 2676 l1 = &l1[l1index & Ln_ADDR_MASK]; 2677 } else { 2678 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0)); 2679 l1 = &l1[l1index & Ln_ADDR_MASK]; 2680 tl1 = pmap_load(l1); 2681 if (tl1 == 0) { 2682 /* recurse for allocating page dir */ 2683 if (_pmap_alloc_l3(pmap, NUL2E + l1index, 2684 lockp) == NULL) { 2685 vm_page_unwire_noq(m); 2686 vm_page_free_zero(m); 2687 return (NULL); 2688 } 2689 } else { 2690 l2pg = PTE_TO_VM_PAGE(tl1); 2691 l2pg->ref_count++; 2692 } 2693 } 2694 2695 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1))); 2696 l2 = &l2[ptepindex & Ln_ADDR_MASK]; 2697 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0, 2698 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); 2699 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE); 
2700 } 2701 2702 pmap_resident_count_inc(pmap, 1); 2703 2704 return (m); 2705} 2706 2707static pd_entry_t * 2708pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp, 2709 struct rwlock **lockp) 2710{ 2711 pd_entry_t *l1, *l2; 2712 vm_page_t l2pg; 2713 vm_pindex_t l2pindex; 2714 2715 KASSERT(ADDR_IS_CANONICAL(va), 2716 ("%s: Address not in canonical form: %lx", __func__, va)); 2717 2718retry: 2719 l1 = pmap_l1(pmap, va); 2720 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { 2721 l2 = pmap_l1_to_l2(l1, va); 2722 if (!ADDR_IS_KERNEL(va)) { 2723 /* Add a reference to the L2 page. */ 2724 l2pg = PTE_TO_VM_PAGE(pmap_load(l1)); 2725 l2pg->ref_count++; 2726 } else 2727 l2pg = NULL; 2728 } else if (!ADDR_IS_KERNEL(va)) { 2729 /* Allocate a L2 page. */ 2730 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; 2731 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); 2732 if (l2pg == NULL) { 2733 if (lockp != NULL) 2734 goto retry; 2735 else 2736 return (NULL); 2737 } 2738 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); 2739 l2 = &l2[pmap_l2_index(va)]; 2740 } else 2741 panic("pmap_alloc_l2: missing page table page for va %#lx", 2742 va); 2743 *l2pgp = l2pg; 2744 return (l2); 2745} 2746 2747static vm_page_t 2748pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2749{ 2750 vm_pindex_t ptepindex; 2751 pd_entry_t *pde, tpde; 2752#ifdef INVARIANTS 2753 pt_entry_t *pte; 2754#endif 2755 vm_page_t m; 2756 int lvl; 2757 2758 /* 2759 * Calculate pagetable page index 2760 */ 2761 ptepindex = pmap_l2_pindex(va); 2762retry: 2763 /* 2764 * Get the page directory entry 2765 */ 2766 pde = pmap_pde(pmap, va, &lvl); 2767 2768 /* 2769 * If the page table page is mapped, we just increment the hold count, 2770 * and activate it. If we get a level 2 pde it will point to a level 3 2771 * table. 2772 */ 2773 switch (lvl) { 2774 case -1: 2775 break; 2776 case 0: 2777#ifdef INVARIANTS 2778 pte = pmap_l0_to_l1(pde, va); 2779 KASSERT(pmap_load(pte) == 0, 2780 ("pmap_alloc_l3: TODO: l0 superpages")); 2781#endif 2782 break; 2783 case 1: 2784#ifdef INVARIANTS 2785 pte = pmap_l1_to_l2(pde, va); 2786 KASSERT(pmap_load(pte) == 0, 2787 ("pmap_alloc_l3: TODO: l1 superpages")); 2788#endif 2789 break; 2790 case 2: 2791 tpde = pmap_load(pde); 2792 if (tpde != 0) { 2793 m = PTE_TO_VM_PAGE(tpde); 2794 m->ref_count++; 2795 return (m); 2796 } 2797 break; 2798 default: 2799 panic("pmap_alloc_l3: Invalid level %d", lvl); 2800 } 2801 2802 /* 2803 * Here if the pte page isn't mapped, or if it has been deallocated. 2804 */ 2805 m = _pmap_alloc_l3(pmap, ptepindex, lockp); 2806 if (m == NULL && lockp != NULL) 2807 goto retry; 2808 2809 return (m); 2810} 2811 2812/*************************************************** 2813 * Pmap allocation/deallocation routines. 2814 ***************************************************/ 2815 2816/* 2817 * Release any resources held by the given physical map. 2818 * Called when a pmap initialized by pmap_pinit is being released. 2819 * Should only be called if the map contains no valid mappings. 
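 *
 * For a 3-level (stage 2) pmap, the extra root page table page wired by
 * pmap_pinit_stage() is unwired first.  The pmap's ASID is then returned to
 * its set (stage 2 VMIDs are not reused within a generation and are only
 * reclaimed when the VMID generation is updated), and finally the L0 page
 * is freed.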
2820 */ 2821void 2822pmap_release(pmap_t pmap) 2823{ 2824 bool rv __diagused; 2825 struct spglist freelist; 2826 struct asid_set *set; 2827 vm_page_t m; 2828 int asid; 2829 2830 if (pmap->pm_levels != 4) { 2831 PMAP_ASSERT_STAGE2(pmap); 2832 KASSERT(pmap->pm_stats.resident_count == 1, 2833 ("pmap_release: pmap resident count %ld != 0", 2834 pmap->pm_stats.resident_count)); 2835 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID, 2836 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0])); 2837 2838 SLIST_INIT(&freelist); 2839 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr); 2840 PMAP_LOCK(pmap); 2841 rv = pmap_unwire_l3(pmap, 0, m, &freelist); 2842 PMAP_UNLOCK(pmap); 2843 MPASS(rv == true); 2844 vm_page_free_pages_toq(&freelist, true); 2845 } 2846 2847 KASSERT(pmap->pm_stats.resident_count == 0, 2848 ("pmap_release: pmap resident count %ld != 0", 2849 pmap->pm_stats.resident_count)); 2850 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2851 ("pmap_release: pmap has reserved page table page(s)")); 2852 2853 set = pmap->pm_asid_set; 2854 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 2855 2856 /* 2857 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate 2858 * the entries when removing them so rely on a later tlb invalidation. 2859 * this will happen when updating the VMID generation. Because of this 2860 * we don't reuse VMIDs within a generation. 2861 */ 2862 if (pmap->pm_stage == PM_STAGE1) { 2863 mtx_lock_spin(&set->asid_set_mutex); 2864 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) { 2865 asid = COOKIE_TO_ASID(pmap->pm_cookie); 2866 KASSERT(asid >= ASID_FIRST_AVAILABLE && 2867 asid < set->asid_set_size, 2868 ("pmap_release: pmap cookie has out-of-range asid")); 2869 bit_clear(set->asid_set, asid); 2870 } 2871 mtx_unlock_spin(&set->asid_set_mutex); 2872 2873 if (pmap->pm_bti != NULL) { 2874 rangeset_fini(pmap->pm_bti); 2875 free(pmap->pm_bti, M_DEVBUF); 2876 } 2877 } 2878 2879 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr); 2880 vm_page_unwire_noq(m); 2881 vm_page_free_zero(m); 2882} 2883 2884static int 2885kvm_size(SYSCTL_HANDLER_ARGS) 2886{ 2887 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2888 2889 return sysctl_handle_long(oidp, &ksize, 0, req); 2890} 2891SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 2892 0, 0, kvm_size, "LU", 2893 "Size of KVM"); 2894 2895static int 2896kvm_free(SYSCTL_HANDLER_ARGS) 2897{ 2898 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2899 2900 return sysctl_handle_long(oidp, &kfree, 0, req); 2901} 2902SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 2903 0, 0, kvm_free, "LU", 2904 "Amount of KVM free"); 2905 2906/* 2907 * grow the number of kernel page table entries, if needed 2908 */ 2909void 2910pmap_growkernel(vm_offset_t addr) 2911{ 2912 vm_page_t nkpg; 2913 pd_entry_t *l0, *l1, *l2; 2914 2915 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2916 2917 addr = roundup2(addr, L2_SIZE); 2918 if (addr - 1 >= vm_map_max(kernel_map)) 2919 addr = vm_map_max(kernel_map); 2920 if (kernel_vm_end < addr) { 2921 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 2922 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end); 2923 } 2924 while (kernel_vm_end < addr) { 2925 l0 = pmap_l0(kernel_pmap, kernel_vm_end); 2926 KASSERT(pmap_load(l0) != 0, 2927 ("pmap_growkernel: No level 0 kernel entry")); 2928 2929 l1 = pmap_l0_to_l1(l0, kernel_vm_end); 2930 if (pmap_load(l1) == 0) { 2931 /* We need a new PDP entry */ 2932 nkpg = 
vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | 2933 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2934 if (nkpg == NULL) 2935 panic("pmap_growkernel: no memory to grow kernel"); 2936 nkpg->pindex = kernel_vm_end >> L1_SHIFT; 2937 /* See the dmb() in _pmap_alloc_l3(). */ 2938 dmb(ishst); 2939 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE); 2940 continue; /* try again */ 2941 } 2942 l2 = pmap_l1_to_l2(l1, kernel_vm_end); 2943 if (pmap_load(l2) != 0) { 2944 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2945 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2946 kernel_vm_end = vm_map_max(kernel_map); 2947 break; 2948 } 2949 continue; 2950 } 2951 2952 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 2953 VM_ALLOC_ZERO); 2954 if (nkpg == NULL) 2955 panic("pmap_growkernel: no memory to grow kernel"); 2956 nkpg->pindex = kernel_vm_end >> L2_SHIFT; 2957 /* See the dmb() in _pmap_alloc_l3(). */ 2958 dmb(ishst); 2959 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE); 2960 2961 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; 2962 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2963 kernel_vm_end = vm_map_max(kernel_map); 2964 break; 2965 } 2966 } 2967} 2968 2969/*************************************************** 2970 * page management routines. 2971 ***************************************************/ 2972 2973static const uint64_t pc_freemask[_NPCM] = { 2974 [0 ... _NPCM - 2] = PC_FREEN, 2975 [_NPCM - 1] = PC_FREEL 2976}; 2977 2978#ifdef PV_STATS 2979static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2980 2981SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2982 "Current number of pv entry chunks"); 2983SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2984 "Current number of pv entry chunks allocated"); 2985SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2986 "Current number of pv entry chunks frees"); 2987SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2988 "Number of times tried to get a chunk page but failed."); 2989 2990static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2991static int pv_entry_spare; 2992 2993SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2994 "Current number of pv entry frees"); 2995SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2996 "Current number of pv entry allocs"); 2997SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2998 "Current number of pv entries"); 2999SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 3000 "Current number of spare pv entries"); 3001#endif 3002 3003/* 3004 * We are in a serious low memory condition. Resort to 3005 * drastic measures to free some pages so we can allocate 3006 * another pv entry chunk. 3007 * 3008 * Returns NULL if PV entries were reclaimed from the specified pmap. 3009 * 3010 * We do not, however, unmap 2mpages because subsequent accesses will 3011 * allocate per-page pv entries until repromotion occurs, thereby 3012 * exacerbating the shortage of free pv entries. 
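 *
 * Two marker chunks are inserted into the per-domain LRU list to bracket the
 * scan, so that pvc_lock can be dropped and reacquired while other threads
 * concurrently add and remove chunks.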
3013 */ 3014static vm_page_t 3015reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) 3016{ 3017 struct pv_chunks_list *pvc; 3018 struct pv_chunk *pc, *pc_marker, *pc_marker_end; 3019 struct pv_chunk_header pc_marker_b, pc_marker_end_b; 3020 struct md_page *pvh; 3021 pd_entry_t *pde; 3022 pmap_t next_pmap, pmap; 3023 pt_entry_t *pte, tpte; 3024 pv_entry_t pv; 3025 vm_offset_t va; 3026 vm_page_t m, m_pc; 3027 struct spglist free; 3028 uint64_t inuse; 3029 int bit, field, freed, lvl; 3030 3031 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 3032 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 3033 3034 pmap = NULL; 3035 m_pc = NULL; 3036 SLIST_INIT(&free); 3037 bzero(&pc_marker_b, sizeof(pc_marker_b)); 3038 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); 3039 pc_marker = (struct pv_chunk *)&pc_marker_b; 3040 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; 3041 3042 pvc = &pv_chunks[domain]; 3043 mtx_lock(&pvc->pvc_lock); 3044 pvc->active_reclaims++; 3045 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); 3046 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); 3047 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && 3048 SLIST_EMPTY(&free)) { 3049 next_pmap = pc->pc_pmap; 3050 if (next_pmap == NULL) { 3051 /* 3052 * The next chunk is a marker. However, it is 3053 * not our marker, so active_reclaims must be 3054 * > 1. Consequently, the next_chunk code 3055 * will not rotate the pv_chunks list. 3056 */ 3057 goto next_chunk; 3058 } 3059 mtx_unlock(&pvc->pvc_lock); 3060 3061 /* 3062 * A pv_chunk can only be removed from the pc_lru list 3063 * when both pvc->pvc_lock is owned and the 3064 * corresponding pmap is locked. 3065 */ 3066 if (pmap != next_pmap) { 3067 if (pmap != NULL && pmap != locked_pmap) 3068 PMAP_UNLOCK(pmap); 3069 pmap = next_pmap; 3070 /* Avoid deadlock and lock recursion. */ 3071 if (pmap > locked_pmap) { 3072 RELEASE_PV_LIST_LOCK(lockp); 3073 PMAP_LOCK(pmap); 3074 mtx_lock(&pvc->pvc_lock); 3075 continue; 3076 } else if (pmap != locked_pmap) { 3077 if (PMAP_TRYLOCK(pmap)) { 3078 mtx_lock(&pvc->pvc_lock); 3079 continue; 3080 } else { 3081 pmap = NULL; /* pmap is not locked */ 3082 mtx_lock(&pvc->pvc_lock); 3083 pc = TAILQ_NEXT(pc_marker, pc_lru); 3084 if (pc == NULL || 3085 pc->pc_pmap != next_pmap) 3086 continue; 3087 goto next_chunk; 3088 } 3089 } 3090 } 3091 3092 /* 3093 * Destroy every non-wired, 4 KB page mapping in the chunk. 
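 * Wired mappings are skipped, and a mapping that is part of an
 * ATTR_CONTIGUOUS (L3C) range is first demoted to ordinary 4 KB mappings
 * before being destroyed.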
3094 */ 3095 freed = 0; 3096 for (field = 0; field < _NPCM; field++) { 3097 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 3098 inuse != 0; inuse &= ~(1UL << bit)) { 3099 bit = ffsl(inuse) - 1; 3100 pv = &pc->pc_pventry[field * 64 + bit]; 3101 va = pv->pv_va; 3102 pde = pmap_pde(pmap, va, &lvl); 3103 if (lvl != 2) 3104 continue; 3105 pte = pmap_l2_to_l3(pde, va); 3106 tpte = pmap_load(pte); 3107 if ((tpte & ATTR_SW_WIRED) != 0) 3108 continue; 3109 if ((tpte & ATTR_CONTIGUOUS) != 0) 3110 (void)pmap_demote_l3c(pmap, pte, va); 3111 tpte = pmap_load_clear(pte); 3112 m = PTE_TO_VM_PAGE(tpte); 3113 if (pmap_pte_dirty(pmap, tpte)) 3114 vm_page_dirty(m); 3115 if ((tpte & ATTR_AF) != 0) { 3116 pmap_s1_invalidate_page(pmap, va, true); 3117 vm_page_aflag_set(m, PGA_REFERENCED); 3118 } 3119 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3120 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3121 m->md.pv_gen++; 3122 if (TAILQ_EMPTY(&m->md.pv_list) && 3123 (m->flags & PG_FICTITIOUS) == 0) { 3124 pvh = page_to_pvh(m); 3125 if (TAILQ_EMPTY(&pvh->pv_list)) { 3126 vm_page_aflag_clear(m, 3127 PGA_WRITEABLE); 3128 } 3129 } 3130 pc->pc_map[field] |= 1UL << bit; 3131 pmap_unuse_pt(pmap, va, pmap_load(pde), &free); 3132 freed++; 3133 } 3134 } 3135 if (freed == 0) { 3136 mtx_lock(&pvc->pvc_lock); 3137 goto next_chunk; 3138 } 3139 /* Every freed mapping is for a 4 KB page. */ 3140 pmap_resident_count_dec(pmap, freed); 3141 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 3142 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 3143 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 3144 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3145 if (pc_is_free(pc)) { 3146 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3147 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3148 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3149 /* Entire chunk is free; return it. */ 3150 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3151 dump_drop_page(m_pc->phys_addr); 3152 mtx_lock(&pvc->pvc_lock); 3153 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3154 break; 3155 } 3156 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3157 mtx_lock(&pvc->pvc_lock); 3158 /* One freed pv entry in locked_pmap is sufficient. */ 3159 if (pmap == locked_pmap) 3160 break; 3161 3162next_chunk: 3163 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3164 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); 3165 if (pvc->active_reclaims == 1 && pmap != NULL) { 3166 /* 3167 * Rotate the pv chunks list so that we do not 3168 * scan the same pv chunks that could not be 3169 * freed (because they contained a wired 3170 * and/or superpage mapping) on every 3171 * invocation of reclaim_pv_chunk(). 3172 */ 3173 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){ 3174 MPASS(pc->pc_pmap != NULL); 3175 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3176 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3177 } 3178 } 3179 } 3180 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); 3181 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); 3182 pvc->active_reclaims--; 3183 mtx_unlock(&pvc->pvc_lock); 3184 if (pmap != NULL && pmap != locked_pmap) 3185 PMAP_UNLOCK(pmap); 3186 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 3187 m_pc = SLIST_FIRST(&free); 3188 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 3189 /* Recycle a freed page table page. 
*/ 3190 m_pc->ref_count = 1; 3191 } 3192 vm_page_free_pages_toq(&free, true); 3193 return (m_pc); 3194} 3195 3196static vm_page_t 3197reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 3198{ 3199 vm_page_t m; 3200 int i, domain; 3201 3202 domain = PCPU_GET(domain); 3203 for (i = 0; i < vm_ndomains; i++) { 3204 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); 3205 if (m != NULL) 3206 break; 3207 domain = (domain + 1) % vm_ndomains; 3208 } 3209 3210 return (m); 3211} 3212 3213/* 3214 * free the pv_entry back to the free list 3215 */ 3216static void 3217free_pv_entry(pmap_t pmap, pv_entry_t pv) 3218{ 3219 struct pv_chunk *pc; 3220 int idx, field, bit; 3221 3222 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3223 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 3224 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 3225 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 3226 pc = pv_to_chunk(pv); 3227 idx = pv - &pc->pc_pventry[0]; 3228 field = idx / 64; 3229 bit = idx % 64; 3230 pc->pc_map[field] |= 1ul << bit; 3231 if (!pc_is_free(pc)) { 3232 /* 98% of the time, pc is already at the head of the list. */ 3233 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 3234 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3235 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3236 } 3237 return; 3238 } 3239 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3240 free_pv_chunk(pc); 3241} 3242 3243static void 3244free_pv_chunk_dequeued(struct pv_chunk *pc) 3245{ 3246 vm_page_t m; 3247 3248 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 3249 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 3250 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 3251 /* entire chunk is free, return it */ 3252 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 3253 dump_drop_page(m->phys_addr); 3254 vm_page_unwire_noq(m); 3255 vm_page_free(m); 3256} 3257 3258static void 3259free_pv_chunk(struct pv_chunk *pc) 3260{ 3261 struct pv_chunks_list *pvc; 3262 3263 pvc = &pv_chunks[pc_to_domain(pc)]; 3264 mtx_lock(&pvc->pvc_lock); 3265 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3266 mtx_unlock(&pvc->pvc_lock); 3267 free_pv_chunk_dequeued(pc); 3268} 3269 3270static void 3271free_pv_chunk_batch(struct pv_chunklist *batch) 3272{ 3273 struct pv_chunks_list *pvc; 3274 struct pv_chunk *pc, *npc; 3275 int i; 3276 3277 for (i = 0; i < vm_ndomains; i++) { 3278 if (TAILQ_EMPTY(&batch[i])) 3279 continue; 3280 pvc = &pv_chunks[i]; 3281 mtx_lock(&pvc->pvc_lock); 3282 TAILQ_FOREACH(pc, &batch[i], pc_list) { 3283 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); 3284 } 3285 mtx_unlock(&pvc->pvc_lock); 3286 } 3287 3288 for (i = 0; i < vm_ndomains; i++) { 3289 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { 3290 free_pv_chunk_dequeued(pc); 3291 } 3292 } 3293} 3294 3295/* 3296 * Returns a new PV entry, allocating a new PV chunk from the system when 3297 * needed. If this PV chunk allocation fails and a PV list lock pointer was 3298 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 3299 * returned. 3300 * 3301 * The given PV list lock may be released. 
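 *
 * If reclaim_pv_chunk() runs but does not return a page, it may still have
 * freed entries within one of this pmap's existing chunks, so the
 * allocation is retried from the beginning.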
3302 */ 3303static pv_entry_t 3304get_pv_entry(pmap_t pmap, struct rwlock **lockp) 3305{ 3306 struct pv_chunks_list *pvc; 3307 int bit, field; 3308 pv_entry_t pv; 3309 struct pv_chunk *pc; 3310 vm_page_t m; 3311 3312 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3313 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 3314retry: 3315 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3316 if (pc != NULL) { 3317 for (field = 0; field < _NPCM; field++) { 3318 if (pc->pc_map[field]) { 3319 bit = ffsl(pc->pc_map[field]) - 1; 3320 break; 3321 } 3322 } 3323 if (field < _NPCM) { 3324 pv = &pc->pc_pventry[field * 64 + bit]; 3325 pc->pc_map[field] &= ~(1ul << bit); 3326 /* If this was the last item, move it to tail */ 3327 if (pc_is_full(pc)) { 3328 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3329 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 3330 pc_list); 3331 } 3332 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3333 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 3334 return (pv); 3335 } 3336 } 3337 /* No free items, allocate another chunk */ 3338 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3339 if (m == NULL) { 3340 if (lockp == NULL) { 3341 PV_STAT(pc_chunk_tryfail++); 3342 return (NULL); 3343 } 3344 m = reclaim_pv_chunk(pmap, lockp); 3345 if (m == NULL) 3346 goto retry; 3347 } 3348 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3349 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3350 dump_add_page(m->phys_addr); 3351 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3352 pc->pc_pmap = pmap; 3353 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3354 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */ 3355 pvc = &pv_chunks[vm_page_domain(m)]; 3356 mtx_lock(&pvc->pvc_lock); 3357 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); 3358 mtx_unlock(&pvc->pvc_lock); 3359 pv = &pc->pc_pventry[0]; 3360 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3361 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 3362 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 3363 return (pv); 3364} 3365 3366/* 3367 * Ensure that the number of spare PV entries in the specified pmap meets or 3368 * exceeds the given count, "needed". 3369 * 3370 * The given PV list lock may be released. 3371 */ 3372static void 3373reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 3374{ 3375 struct pv_chunks_list *pvc; 3376 struct pch new_tail[PMAP_MEMDOM]; 3377 struct pv_chunk *pc; 3378 vm_page_t m; 3379 int avail, free, i; 3380 bool reclaimed; 3381 3382 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3383 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 3384 3385 /* 3386 * Newly allocated PV chunks must be stored in a private list until 3387 * the required number of PV chunks have been allocated. Otherwise, 3388 * reclaim_pv_chunk() could recycle one of these chunks. In 3389 * contrast, these chunks must be added to the pmap upon allocation. 
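 * Concretely, each new chunk is linked into the pmap's pm_pvchunk list as
 * soon as it is allocated, but it is kept on the local "new_tail" lists and
 * only appended to the global per-domain pv_chunks LRU lists once the
 * reservation is complete.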
3390 */ 3391 for (i = 0; i < PMAP_MEMDOM; i++) 3392 TAILQ_INIT(&new_tail[i]); 3393retry: 3394 avail = 0; 3395 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3396 bit_count((bitstr_t *)pc->pc_map, 0, 3397 sizeof(pc->pc_map) * NBBY, &free); 3398 if (free == 0) 3399 break; 3400 avail += free; 3401 if (avail >= needed) 3402 break; 3403 } 3404 for (reclaimed = false; avail < needed; avail += _NPCPV) { 3405 m = vm_page_alloc_noobj(VM_ALLOC_WIRED); 3406 if (m == NULL) { 3407 m = reclaim_pv_chunk(pmap, lockp); 3408 if (m == NULL) 3409 goto retry; 3410 reclaimed = true; 3411 } 3412 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3413 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3414 dump_add_page(m->phys_addr); 3415 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3416 pc->pc_pmap = pmap; 3417 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask)); 3418 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3419 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru); 3420 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3421 3422 /* 3423 * The reclaim might have freed a chunk from the current pmap. 3424 * If that chunk contained available entries, we need to 3425 * re-count the number of available entries. 3426 */ 3427 if (reclaimed) 3428 goto retry; 3429 } 3430 for (i = 0; i < vm_ndomains; i++) { 3431 if (TAILQ_EMPTY(&new_tail[i])) 3432 continue; 3433 pvc = &pv_chunks[i]; 3434 mtx_lock(&pvc->pvc_lock); 3435 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); 3436 mtx_unlock(&pvc->pvc_lock); 3437 } 3438} 3439 3440/* 3441 * First find and then remove the pv entry for the specified pmap and virtual 3442 * address from the specified pv list. Returns the pv entry if found and NULL 3443 * otherwise. This operation can be performed on pv lists for either 4KB or 3444 * 2MB page mappings. 3445 */ 3446static __inline pv_entry_t 3447pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3448{ 3449 pv_entry_t pv; 3450 3451 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3452 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3453 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3454 pvh->pv_gen++; 3455 break; 3456 } 3457 } 3458 return (pv); 3459} 3460 3461/* 3462 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3463 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3464 * entries for each of the 4KB page mappings. 3465 */ 3466static void 3467pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3468 struct rwlock **lockp) 3469{ 3470 struct md_page *pvh; 3471 struct pv_chunk *pc; 3472 pv_entry_t pv; 3473 vm_offset_t va_last; 3474 vm_page_t m; 3475 int bit, field; 3476 3477 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3478 KASSERT((va & L2_OFFSET) == 0, 3479 ("pmap_pv_demote_l2: va is not 2mpage aligned")); 3480 KASSERT((pa & L2_OFFSET) == 0, 3481 ("pmap_pv_demote_l2: pa is not 2mpage aligned")); 3482 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3483 3484 /* 3485 * Transfer the 2mpage's pv entry for this mapping to the first 3486 * page's pv list. Once this transfer begins, the pv list lock 3487 * must not be released until the last pv entry is reinstantiated. 3488 */ 3489 pvh = pa_to_pvh(pa); 3490 pv = pmap_pvh_remove(pvh, pmap, va); 3491 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); 3492 m = PHYS_TO_VM_PAGE(pa); 3493 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3494 m->md.pv_gen++; 3495 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. 
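 * These are taken directly from the pmap's existing pv chunks rather than
 * via get_pv_entry(); the KASSERT below assumes that a chunk with a free
 * entry is always available, i.e., that sufficient spares were reserved
 * beforehand.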
*/ 3496 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); 3497 va_last = va + L2_SIZE - PAGE_SIZE; 3498 for (;;) { 3499 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3500 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare")); 3501 for (field = 0; field < _NPCM; field++) { 3502 while (pc->pc_map[field]) { 3503 bit = ffsl(pc->pc_map[field]) - 1; 3504 pc->pc_map[field] &= ~(1ul << bit); 3505 pv = &pc->pc_pventry[field * 64 + bit]; 3506 va += PAGE_SIZE; 3507 pv->pv_va = va; 3508 m++; 3509 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3510 ("pmap_pv_demote_l2: page %p is not managed", m)); 3511 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3512 m->md.pv_gen++; 3513 if (va == va_last) 3514 goto out; 3515 } 3516 } 3517 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3518 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3519 } 3520out: 3521 if (pc_is_full(pc)) { 3522 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3523 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3524 } 3525 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); 3526 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); 3527} 3528 3529/* 3530 * First find and then destroy the pv entry for the specified pmap and virtual 3531 * address. This operation can be performed on pv lists for either 4KB or 2MB 3532 * page mappings. 3533 */ 3534static void 3535pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3536{ 3537 pv_entry_t pv; 3538 3539 pv = pmap_pvh_remove(pvh, pmap, va); 3540 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3541 free_pv_entry(pmap, pv); 3542} 3543 3544/* 3545 * Conditionally create the PV entry for a 4KB page mapping if the required 3546 * memory can be allocated without resorting to reclamation. 3547 */ 3548static bool 3549pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3550 struct rwlock **lockp) 3551{ 3552 pv_entry_t pv; 3553 3554 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3555 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3556 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3557 pv->pv_va = va; 3558 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3559 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3560 m->md.pv_gen++; 3561 return (true); 3562 } else 3563 return (false); 3564} 3565 3566/* 3567 * Create the PV entry for a 2MB page mapping. Always returns true unless the 3568 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns 3569 * false if the PV entry cannot be allocated without resorting to reclamation. 3570 */ 3571static bool 3572pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, 3573 struct rwlock **lockp) 3574{ 3575 struct md_page *pvh; 3576 pv_entry_t pv; 3577 vm_paddr_t pa; 3578 3579 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3580 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3581 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? 3582 NULL : lockp)) == NULL) 3583 return (false); 3584 pv->pv_va = va; 3585 pa = PTE_TO_PHYS(l2e); 3586 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3587 pvh = pa_to_pvh(pa); 3588 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3589 pvh->pv_gen++; 3590 return (true); 3591} 3592 3593/* 3594 * Conditionally creates the PV entries for a L3C superpage mapping if 3595 * the required memory can be allocated without resorting to reclamation. 
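 * If an allocation fails partway through, the PV entries already created
 * for the preceding pages are removed again and false is returned.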
3596 */ 3597static bool 3598pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m, 3599 struct rwlock **lockp) 3600{ 3601 pv_entry_t pv; 3602 vm_offset_t tva; 3603 vm_paddr_t pa __diagused; 3604 vm_page_t mt; 3605 3606 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3607 KASSERT((va & L3C_OFFSET) == 0, 3608 ("pmap_pv_insert_l3c: va is not aligned")); 3609 pa = VM_PAGE_TO_PHYS(m); 3610 KASSERT((pa & L3C_OFFSET) == 0, 3611 ("pmap_pv_insert_l3c: pa is not aligned")); 3612 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3613 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) { 3614 /* Pass NULL instead of lockp to disable reclamation. */ 3615 pv = get_pv_entry(pmap, NULL); 3616 if (__predict_false(pv == NULL)) { 3617 while (tva > va) { 3618 mt--; 3619 tva -= L3_SIZE; 3620 pmap_pvh_free(&mt->md, pmap, tva); 3621 } 3622 return (false); 3623 } 3624 pv->pv_va = tva; 3625 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next); 3626 mt->md.pv_gen++; 3627 } 3628 return (true); 3629} 3630 3631static void 3632pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 3633{ 3634 pt_entry_t newl2, oldl2 __diagused; 3635 vm_page_t ml3; 3636 vm_paddr_t ml3pa; 3637 3638 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); 3639 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3640 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3641 3642 ml3 = pmap_remove_pt_page(pmap, va); 3643 if (ml3 == NULL) 3644 panic("pmap_remove_kernel_l2: Missing pt page"); 3645 3646 ml3pa = VM_PAGE_TO_PHYS(ml3); 3647 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE; 3648 3649 /* 3650 * If this page table page was unmapped by a promotion, then it 3651 * contains valid mappings. Zero it to invalidate those mappings. 3652 */ 3653 if (vm_page_any_valid(ml3)) 3654 pagezero((void *)PHYS_TO_DMAP(ml3pa)); 3655 3656 /* 3657 * Demote the mapping. The caller must have already invalidated the 3658 * mapping (i.e., the "break" in break-before-make). 3659 */ 3660 oldl2 = pmap_load_store(l2, newl2); 3661 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", 3662 __func__, l2, oldl2)); 3663} 3664 3665/* 3666 * pmap_remove_l2: Do the things to unmap a level 2 superpage. 3667 */ 3668static int 3669pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, 3670 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) 3671{ 3672 struct md_page *pvh; 3673 pt_entry_t old_l2; 3674 vm_page_t m, ml3, mt; 3675 3676 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3677 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); 3678 old_l2 = pmap_load_clear(l2); 3679 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 3680 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); 3681 3682 /* 3683 * Since a promotion must break the 4KB page mappings before making 3684 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 
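 * That is, the TLB cannot hold both 4KB entries for this range and the 2MB
 * block entry, so invalidating the single final-level entry for "sva" is
 * sufficient.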
3685 */ 3686 pmap_s1_invalidate_page(pmap, sva, true); 3687 3688 if (old_l2 & ATTR_SW_WIRED) 3689 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; 3690 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); 3691 if (old_l2 & ATTR_SW_MANAGED) { 3692 m = PTE_TO_VM_PAGE(old_l2); 3693 pvh = page_to_pvh(m); 3694 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3695 pmap_pvh_free(pvh, pmap, sva); 3696 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) { 3697 if (pmap_pte_dirty(pmap, old_l2)) 3698 vm_page_dirty(mt); 3699 if (old_l2 & ATTR_AF) 3700 vm_page_aflag_set(mt, PGA_REFERENCED); 3701 if (TAILQ_EMPTY(&mt->md.pv_list) && 3702 TAILQ_EMPTY(&pvh->pv_list)) 3703 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3704 } 3705 } 3706 if (pmap == kernel_pmap) { 3707 pmap_remove_kernel_l2(pmap, l2, sva); 3708 } else { 3709 ml3 = pmap_remove_pt_page(pmap, sva); 3710 if (ml3 != NULL) { 3711 KASSERT(vm_page_any_valid(ml3), 3712 ("pmap_remove_l2: l3 page not promoted")); 3713 pmap_resident_count_dec(pmap, 1); 3714 KASSERT(ml3->ref_count == NL3PG, 3715 ("pmap_remove_l2: l3 page ref count error")); 3716 ml3->ref_count = 0; 3717 pmap_add_delayed_free_list(ml3, free, false); 3718 } 3719 } 3720 return (pmap_unuse_pt(pmap, sva, l1e, free)); 3721} 3722 3723/* 3724 * pmap_remove_l3: do the things to unmap a page in a process 3725 */ 3726static int 3727pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, 3728 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) 3729{ 3730 struct md_page *pvh; 3731 pt_entry_t old_l3; 3732 vm_page_t m; 3733 3734 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3735 old_l3 = pmap_load(l3); 3736 if ((old_l3 & ATTR_CONTIGUOUS) != 0) 3737 (void)pmap_demote_l3c(pmap, l3, va); 3738 old_l3 = pmap_load_clear(l3); 3739 pmap_s1_invalidate_page(pmap, va, true); 3740 if (old_l3 & ATTR_SW_WIRED) 3741 pmap->pm_stats.wired_count -= 1; 3742 pmap_resident_count_dec(pmap, 1); 3743 if (old_l3 & ATTR_SW_MANAGED) { 3744 m = PTE_TO_VM_PAGE(old_l3); 3745 if (pmap_pte_dirty(pmap, old_l3)) 3746 vm_page_dirty(m); 3747 if (old_l3 & ATTR_AF) 3748 vm_page_aflag_set(m, PGA_REFERENCED); 3749 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3750 pmap_pvh_free(&m->md, pmap, va); 3751 if (TAILQ_EMPTY(&m->md.pv_list) && 3752 (m->flags & PG_FICTITIOUS) == 0) { 3753 pvh = page_to_pvh(m); 3754 if (TAILQ_EMPTY(&pvh->pv_list)) 3755 vm_page_aflag_clear(m, PGA_WRITEABLE); 3756 } 3757 } 3758 return (pmap_unuse_pt(pmap, va, l2e, free)); 3759} 3760 3761/* 3762 * Removes the specified L3C superpage mapping. Requests TLB invalidations 3763 * to be performed by the caller through the returned "*vap". Returns true 3764 * if the level 3 table "ml3" was unmapped and added to the spglist "free". 3765 * Otherwise, returns false. 3766 */ 3767static bool 3768pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap, 3769 vm_offset_t va_next, vm_page_t ml3, struct spglist *free, 3770 struct rwlock **lockp) 3771{ 3772 struct md_page *pvh; 3773 struct rwlock *new_lock; 3774 pt_entry_t first_l3e, l3e, *tl3p; 3775 vm_offset_t tva; 3776 vm_page_t m, mt; 3777 3778 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3779 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) == 3780 0, ("pmap_remove_l3c: l3p is not aligned")); 3781 KASSERT((va & L3C_OFFSET) == 0, 3782 ("pmap_remove_l3c: va is not aligned")); 3783 3784 /* 3785 * Hardware accessed and dirty bit maintenance might only update a 3786 * single L3 entry, so we must combine the accessed and dirty bits 3787 * from this entire set of contiguous L3 entries. 
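 * The AF bits of all entries are OR'ed together, and the combined entry is
 * made to appear dirty if any constituent entry was dirty, so that the
 * vm_page_dirty() and PGA_REFERENCED handling below reflects the state of
 * the whole mapping.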
3788 */ 3789 first_l3e = pmap_load_clear(l3p); 3790 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 3791 l3e = pmap_load_clear(tl3p); 3792 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 3793 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS")); 3794 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) == 3795 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW))) 3796 first_l3e &= ~ATTR_S1_AP_RW_BIT; 3797 first_l3e |= l3e & ATTR_AF; 3798 } 3799 if ((first_l3e & ATTR_SW_WIRED) != 0) 3800 pmap->pm_stats.wired_count -= L3C_ENTRIES; 3801 pmap_resident_count_dec(pmap, L3C_ENTRIES); 3802 if ((first_l3e & ATTR_SW_MANAGED) != 0) { 3803 m = PTE_TO_VM_PAGE(first_l3e); 3804 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3805 if (new_lock != *lockp) { 3806 if (*lockp != NULL) { 3807 /* 3808 * Pending TLB invalidations must be 3809 * performed before the PV list lock is 3810 * released. Otherwise, a concurrent 3811 * pmap_remove_all() on a physical page 3812 * could return while a stale TLB entry 3813 * still provides access to that page. 3814 */ 3815 if (*vap != va_next) { 3816 pmap_invalidate_range(pmap, *vap, va, 3817 true); 3818 *vap = va_next; 3819 } 3820 rw_wunlock(*lockp); 3821 } 3822 *lockp = new_lock; 3823 rw_wlock(*lockp); 3824 } 3825 pvh = page_to_pvh(m); 3826 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += 3827 L3_SIZE) { 3828 if (pmap_pte_dirty(pmap, first_l3e)) 3829 vm_page_dirty(mt); 3830 if ((first_l3e & ATTR_AF) != 0) 3831 vm_page_aflag_set(mt, PGA_REFERENCED); 3832 pmap_pvh_free(&mt->md, pmap, tva); 3833 if (TAILQ_EMPTY(&mt->md.pv_list) && 3834 TAILQ_EMPTY(&pvh->pv_list)) 3835 vm_page_aflag_clear(mt, PGA_WRITEABLE); 3836 } 3837 } 3838 if (*vap == va_next) 3839 *vap = va; 3840 if (ml3 != NULL) { 3841 ml3->ref_count -= L3C_ENTRIES; 3842 if (ml3->ref_count == 0) { 3843 _pmap_unwire_l3(pmap, va, ml3, free); 3844 return (true); 3845 } 3846 } 3847 return (false); 3848} 3849 3850/* 3851 * Remove the specified range of addresses from the L3 page table that is 3852 * identified by the given L2 entry. 3853 */ 3854static void 3855pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, 3856 vm_offset_t eva, struct spglist *free, struct rwlock **lockp) 3857{ 3858 struct md_page *pvh; 3859 struct rwlock *new_lock; 3860 pt_entry_t *l3, old_l3; 3861 vm_offset_t va; 3862 vm_page_t l3pg, m; 3863 3864 KASSERT(ADDR_IS_CANONICAL(sva), 3865 ("%s: Start address not in canonical form: %lx", __func__, sva)); 3866 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS, 3867 ("%s: End address not in canonical form: %lx", __func__, eva)); 3868 3869 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3870 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), 3871 ("pmap_remove_l3_range: range crosses an L3 page table boundary")); 3872 l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL; 3873 va = eva; 3874 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { 3875 old_l3 = pmap_load(l3); 3876 if (!pmap_l3_valid(old_l3)) { 3877 if (va != eva) { 3878 pmap_invalidate_range(pmap, va, sva, true); 3879 va = eva; 3880 } 3881 continue; 3882 } 3883 if ((old_l3 & ATTR_CONTIGUOUS) != 0) { 3884 /* 3885 * Is this entire set of contiguous L3 entries being 3886 * removed? Handle the possibility that "eva" is zero 3887 * because of address wraparound. 3888 */ 3889 if ((sva & L3C_OFFSET) == 0 && 3890 sva + L3C_OFFSET <= eva - 1) { 3891 if (pmap_remove_l3c(pmap, l3, sva, &va, eva, 3892 l3pg, free, lockp)) { 3893 /* The L3 table was unmapped. 
*/ 3894 sva += L3C_SIZE; 3895 break; 3896 } 3897 l3 += L3C_ENTRIES - 1; 3898 sva += L3C_SIZE - L3_SIZE; 3899 continue; 3900 } 3901 3902 (void)pmap_demote_l3c(pmap, l3, sva); 3903 } 3904 old_l3 = pmap_load_clear(l3); 3905 if ((old_l3 & ATTR_SW_WIRED) != 0) 3906 pmap->pm_stats.wired_count--; 3907 pmap_resident_count_dec(pmap, 1); 3908 if ((old_l3 & ATTR_SW_MANAGED) != 0) { 3909 m = PTE_TO_VM_PAGE(old_l3); 3910 if (pmap_pte_dirty(pmap, old_l3)) 3911 vm_page_dirty(m); 3912 if ((old_l3 & ATTR_AF) != 0) 3913 vm_page_aflag_set(m, PGA_REFERENCED); 3914 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m); 3915 if (new_lock != *lockp) { 3916 if (*lockp != NULL) { 3917 /* 3918 * Pending TLB invalidations must be 3919 * performed before the PV list lock is 3920 * released. Otherwise, a concurrent 3921 * pmap_remove_all() on a physical page 3922 * could return while a stale TLB entry 3923 * still provides access to that page. 3924 */ 3925 if (va != eva) { 3926 pmap_invalidate_range(pmap, va, 3927 sva, true); 3928 va = eva; 3929 } 3930 rw_wunlock(*lockp); 3931 } 3932 *lockp = new_lock; 3933 rw_wlock(*lockp); 3934 } 3935 pmap_pvh_free(&m->md, pmap, sva); 3936 if (TAILQ_EMPTY(&m->md.pv_list) && 3937 (m->flags & PG_FICTITIOUS) == 0) { 3938 pvh = page_to_pvh(m); 3939 if (TAILQ_EMPTY(&pvh->pv_list)) 3940 vm_page_aflag_clear(m, PGA_WRITEABLE); 3941 } 3942 } 3943 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) { 3944 /* 3945 * _pmap_unwire_l3() has already invalidated the TLB 3946 * entries at all levels for "sva". So, we need not 3947 * perform "sva += L3_SIZE;" here. Moreover, we need 3948 * not perform "va = sva;" if "sva" is at the start 3949 * of a new valid range consisting of a single page. 3950 */ 3951 break; 3952 } 3953 if (va == eva) 3954 va = sva; 3955 } 3956 if (va != eva) 3957 pmap_invalidate_range(pmap, va, sva, true); 3958} 3959 3960static void 3961pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete) 3962{ 3963 struct rwlock *lock; 3964 vm_offset_t va_next; 3965 pd_entry_t *l0, *l1, *l2; 3966 pt_entry_t l3_paddr; 3967 struct spglist free; 3968 3969 /* 3970 * Perform an unsynchronized read. This is, however, safe. 3971 */ 3972 if (pmap->pm_stats.resident_count == 0) 3973 return; 3974 3975 SLIST_INIT(&free); 3976 3977 PMAP_LOCK(pmap); 3978 if (map_delete) 3979 pmap_bti_on_remove(pmap, sva, eva); 3980 3981 lock = NULL; 3982 for (; sva < eva; sva = va_next) { 3983 if (pmap->pm_stats.resident_count == 0) 3984 break; 3985 3986 l0 = pmap_l0(pmap, sva); 3987 if (pmap_load(l0) == 0) { 3988 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 3989 if (va_next < sva) 3990 va_next = eva; 3991 continue; 3992 } 3993 3994 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 3995 if (va_next < sva) 3996 va_next = eva; 3997 l1 = pmap_l0_to_l1(l0, sva); 3998 if (pmap_load(l1) == 0) 3999 continue; 4000 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4001 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4002 KASSERT(va_next <= eva, 4003 ("partial update of non-transparent 1G page " 4004 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4005 pmap_load(l1), sva, eva, va_next)); 4006 MPASS(pmap != kernel_pmap); 4007 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 4008 pmap_clear(l1); 4009 pmap_s1_invalidate_page(pmap, sva, true); 4010 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE); 4011 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free); 4012 continue; 4013 } 4014 4015 /* 4016 * Calculate index for next page table. 
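 * For example, with 4KB base pages L2_SIZE is 2MB, so "va_next" advances
 * to the next 2MB boundary; if the addition wraps past the end of the
 * address space, "va_next" is clamped to "eva" instead.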
4017 */ 4018 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4019 if (va_next < sva) 4020 va_next = eva; 4021 4022 l2 = pmap_l1_to_l2(l1, sva); 4023 if (l2 == NULL) 4024 continue; 4025 4026 l3_paddr = pmap_load(l2); 4027 4028 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { 4029 if (sva + L2_SIZE == va_next && eva >= va_next) { 4030 pmap_remove_l2(pmap, l2, sva, pmap_load(l1), 4031 &free, &lock); 4032 continue; 4033 } else if (pmap_demote_l2_locked(pmap, l2, sva, 4034 &lock) == NULL) 4035 continue; 4036 l3_paddr = pmap_load(l2); 4037 } 4038 4039 /* 4040 * Weed out invalid mappings. 4041 */ 4042 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) 4043 continue; 4044 4045 /* 4046 * Limit our scan to either the end of the va represented 4047 * by the current page table page, or to the end of the 4048 * range being removed. 4049 */ 4050 if (va_next > eva) 4051 va_next = eva; 4052 4053 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, 4054 &lock); 4055 } 4056 if (lock != NULL) 4057 rw_wunlock(lock); 4058 PMAP_UNLOCK(pmap); 4059 vm_page_free_pages_toq(&free, true); 4060} 4061 4062/* 4063 * Remove the given range of addresses from the specified map. 4064 * 4065 * It is assumed that the start and end are properly 4066 * rounded to the page size. 4067 */ 4068void 4069pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4070{ 4071 pmap_remove1(pmap, sva, eva, false); 4072} 4073 4074/* 4075 * Remove the given range of addresses as part of a logical unmap 4076 * operation. This has the effect of calling pmap_remove(), but 4077 * also clears any metadata that should persist for the lifetime 4078 * of a logical mapping. 4079 */ 4080void 4081pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4082{ 4083 pmap_remove1(pmap, sva, eva, true); 4084} 4085 4086/* 4087 * Routine: pmap_remove_all 4088 * Function: 4089 * Removes this physical page from 4090 * all physical maps in which it resides. 4091 * Reflects back modify bits to the pager. 4092 * 4093 * Notes: 4094 * Original versions of this routine were very 4095 * inefficient because they iteratively called 4096 * pmap_remove (slow...) 4097 */ 4098 4099void 4100pmap_remove_all(vm_page_t m) 4101{ 4102 struct md_page *pvh; 4103 pv_entry_t pv; 4104 pmap_t pmap; 4105 struct rwlock *lock; 4106 pd_entry_t *pde, tpde; 4107 pt_entry_t *pte, tpte; 4108 vm_offset_t va; 4109 struct spglist free; 4110 int lvl, pvh_gen, md_gen; 4111 4112 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4113 ("pmap_remove_all: page %p is not managed", m)); 4114 SLIST_INIT(&free); 4115 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4116 pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : page_to_pvh(m); 4117 rw_wlock(lock); 4118retry: 4119 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 4120 pmap = PV_PMAP(pv); 4121 if (!PMAP_TRYLOCK(pmap)) { 4122 pvh_gen = pvh->pv_gen; 4123 rw_wunlock(lock); 4124 PMAP_LOCK(pmap); 4125 rw_wlock(lock); 4126 if (pvh_gen != pvh->pv_gen) { 4127 PMAP_UNLOCK(pmap); 4128 goto retry; 4129 } 4130 } 4131 va = pv->pv_va; 4132 pte = pmap_pte_exists(pmap, va, 2, __func__); 4133 pmap_demote_l2_locked(pmap, pte, va, &lock); 4134 PMAP_UNLOCK(pmap); 4135 } 4136 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4137 pmap = PV_PMAP(pv); 4138 if (!PMAP_TRYLOCK(pmap)) { 4139 pvh_gen = pvh->pv_gen; 4140 md_gen = m->md.pv_gen; 4141 rw_wunlock(lock); 4142 PMAP_LOCK(pmap); 4143 rw_wlock(lock); 4144 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 4145 PMAP_UNLOCK(pmap); 4146 goto retry; 4147 } 4148 } 4149 pmap_resident_count_dec(pmap, 1); 4150 4151 pde = pmap_pde(pmap, pv->pv_va, &lvl); 4152 KASSERT(pde != NULL, 4153 ("pmap_remove_all: no page directory entry found")); 4154 KASSERT(lvl == 2, 4155 ("pmap_remove_all: invalid pde level %d", lvl)); 4156 tpde = pmap_load(pde); 4157 4158 pte = pmap_l2_to_l3(pde, pv->pv_va); 4159 tpte = pmap_load(pte); 4160 if ((tpte & ATTR_CONTIGUOUS) != 0) 4161 (void)pmap_demote_l3c(pmap, pte, pv->pv_va); 4162 tpte = pmap_load_clear(pte); 4163 if (tpte & ATTR_SW_WIRED) 4164 pmap->pm_stats.wired_count--; 4165 if ((tpte & ATTR_AF) != 0) { 4166 pmap_invalidate_page(pmap, pv->pv_va, true); 4167 vm_page_aflag_set(m, PGA_REFERENCED); 4168 } 4169 4170 /* 4171 * Update the vm_page_t clean and reference bits. 4172 */ 4173 if (pmap_pte_dirty(pmap, tpte)) 4174 vm_page_dirty(m); 4175 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); 4176 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4177 m->md.pv_gen++; 4178 free_pv_entry(pmap, pv); 4179 PMAP_UNLOCK(pmap); 4180 } 4181 vm_page_aflag_clear(m, PGA_WRITEABLE); 4182 rw_wunlock(lock); 4183 vm_page_free_pages_toq(&free, true); 4184} 4185 4186/* 4187 * Masks and sets bits in a level 2 page table entries in the specified pmap 4188 */ 4189static void 4190pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, 4191 pt_entry_t nbits) 4192{ 4193 pd_entry_t old_l2; 4194 vm_page_t m, mt; 4195 4196 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4197 PMAP_ASSERT_STAGE1(pmap); 4198 KASSERT((sva & L2_OFFSET) == 0, 4199 ("pmap_protect_l2: sva is not 2mpage aligned")); 4200 old_l2 = pmap_load(l2); 4201 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, 4202 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); 4203 4204 /* 4205 * Return if the L2 entry already has the desired access restrictions 4206 * in place. 4207 */ 4208 if ((old_l2 & mask) == nbits) 4209 return; 4210 4211 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) 4212 cpu_spinwait(); 4213 4214 /* 4215 * When a dirty read/write superpage mapping is write protected, 4216 * update the dirty field of each of the superpage's constituent 4KB 4217 * pages. 4218 */ 4219 if ((old_l2 & ATTR_SW_MANAGED) != 0 && 4220 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4221 pmap_pte_dirty(pmap, old_l2)) { 4222 m = PTE_TO_VM_PAGE(old_l2); 4223 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 4224 vm_page_dirty(mt); 4225 } 4226 4227 /* 4228 * Since a promotion must break the 4KB page mappings before making 4229 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices. 4230 */ 4231 pmap_s1_invalidate_page(pmap, sva, true); 4232} 4233 4234/* 4235 * Masks and sets bits in the specified L3C superpage mapping. 
4236 * 4237 * Requests TLB invalidations to be performed by the caller through the 4238 * returned "*vap". 4239 */ 4240static void 4241pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, 4242 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits) 4243{ 4244 pt_entry_t l3e, *tl3p; 4245 vm_page_t m, mt; 4246 bool dirty; 4247 4248 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4249 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) == 4250 0, ("pmap_mask_set_l3c: l3p is not aligned")); 4251 KASSERT((va & L3C_OFFSET) == 0, 4252 ("pmap_mask_set_l3c: va is not aligned")); 4253 dirty = false; 4254 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 4255 l3e = pmap_load(tl3p); 4256 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 4257 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS")); 4258 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits)) 4259 cpu_spinwait(); 4260 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) == 4261 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW))) 4262 dirty = true; 4263 } 4264 4265 /* 4266 * When a dirty read/write superpage mapping is write protected, 4267 * update the dirty field of each of the superpage's constituent 4KB 4268 * pages. 4269 */ 4270 if ((l3e & ATTR_SW_MANAGED) != 0 && 4271 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4272 dirty) { 4273 m = PTE_TO_VM_PAGE(pmap_load(l3p)); 4274 for (mt = m; mt < &m[L3C_ENTRIES]; mt++) 4275 vm_page_dirty(mt); 4276 } 4277 4278 if (*vap == va_next) 4279 *vap = va; 4280} 4281 4282/* 4283 * Masks and sets bits in last level page table entries in the specified 4284 * pmap and range 4285 */ 4286static void 4287pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 4288 pt_entry_t nbits, bool invalidate) 4289{ 4290 vm_offset_t va, va_next; 4291 pd_entry_t *l0, *l1, *l2; 4292 pt_entry_t *l3p, l3; 4293 4294 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4295 for (; sva < eva; sva = va_next) { 4296 l0 = pmap_l0(pmap, sva); 4297 if (pmap_load(l0) == 0) { 4298 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 4299 if (va_next < sva) 4300 va_next = eva; 4301 continue; 4302 } 4303 4304 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 4305 if (va_next < sva) 4306 va_next = eva; 4307 l1 = pmap_l0_to_l1(l0, sva); 4308 if (pmap_load(l1) == 0) 4309 continue; 4310 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 4311 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4312 KASSERT(va_next <= eva, 4313 ("partial update of non-transparent 1G page " 4314 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 4315 pmap_load(l1), sva, eva, va_next)); 4316 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0); 4317 if ((pmap_load(l1) & mask) != nbits) { 4318 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits); 4319 if (invalidate) 4320 pmap_s1_invalidate_page(pmap, sva, true); 4321 } 4322 continue; 4323 } 4324 4325 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 4326 if (va_next < sva) 4327 va_next = eva; 4328 4329 l2 = pmap_l1_to_l2(l1, sva); 4330 if (pmap_load(l2) == 0) 4331 continue; 4332 4333 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 4334 if (sva + L2_SIZE == va_next && eva >= va_next) { 4335 pmap_protect_l2(pmap, l2, sva, mask, nbits); 4336 continue; 4337 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 4338 continue; 4339 } 4340 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 4341 ("pmap_protect: Invalid L2 entry after demotion")); 4342 4343 if (va_next > eva) 4344 va_next = eva; 4345 4346 va = va_next; 4347 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, 4348 sva += L3_SIZE) { 4349 l3 = pmap_load(l3p); 4350 4351 /* 4352 * Go to the 
next L3 entry if the current one is 4353 * invalid or already has the desired access 4354 * restrictions in place. (The latter case occurs 4355 * frequently. For example, in a "buildworld" 4356 * workload, almost 1 out of 4 L3 entries already 4357 * have the desired restrictions.) 4358 */ 4359 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { 4360 if (va != va_next) { 4361 if (invalidate) 4362 pmap_s1_invalidate_range(pmap, 4363 va, sva, true); 4364 va = va_next; 4365 } 4366 if ((l3 & ATTR_CONTIGUOUS) != 0) { 4367 l3p += L3C_ENTRIES - 1; 4368 sva += L3C_SIZE - L3_SIZE; 4369 } 4370 continue; 4371 } 4372 4373 if ((l3 & ATTR_CONTIGUOUS) != 0) { 4374 /* 4375 * Is this entire set of contiguous L3 entries 4376 * being protected? Handle the possibility 4377 * that "va_next" is zero because of address 4378 * wraparound. 4379 */ 4380 if ((sva & L3C_OFFSET) == 0 && 4381 sva + L3C_OFFSET <= va_next - 1) { 4382 pmap_mask_set_l3c(pmap, l3p, sva, &va, 4383 va_next, mask, nbits); 4384 l3p += L3C_ENTRIES - 1; 4385 sva += L3C_SIZE - L3_SIZE; 4386 continue; 4387 } 4388 4389 (void)pmap_demote_l3c(pmap, l3p, sva); 4390 4391 /* 4392 * The L3 entry's accessed bit may have changed. 4393 */ 4394 l3 = pmap_load(l3p); 4395 } 4396 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | 4397 nbits)) 4398 cpu_spinwait(); 4399 4400 /* 4401 * When a dirty read/write mapping is write protected, 4402 * update the page's dirty field. 4403 */ 4404 if ((l3 & ATTR_SW_MANAGED) != 0 && 4405 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 && 4406 pmap_pte_dirty(pmap, l3)) 4407 vm_page_dirty(PTE_TO_VM_PAGE(l3)); 4408 4409 if (va == va_next) 4410 va = sva; 4411 } 4412 if (va != va_next && invalidate) 4413 pmap_s1_invalidate_range(pmap, va, sva, true); 4414 } 4415} 4416 4417static void 4418pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask, 4419 pt_entry_t nbits, bool invalidate) 4420{ 4421 PMAP_LOCK(pmap); 4422 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate); 4423 PMAP_UNLOCK(pmap); 4424} 4425 4426/* 4427 * Set the physical protection on the 4428 * specified range of this map as requested. 4429 */ 4430void 4431pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 4432{ 4433 pt_entry_t mask, nbits; 4434 4435 PMAP_ASSERT_STAGE1(pmap); 4436 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 4437 if (prot == VM_PROT_NONE) { 4438 pmap_remove(pmap, sva, eva); 4439 return; 4440 } 4441 4442 mask = nbits = 0; 4443 if ((prot & VM_PROT_WRITE) == 0) { 4444 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM; 4445 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO); 4446 } 4447 if ((prot & VM_PROT_EXECUTE) == 0) { 4448 mask |= ATTR_S1_XN; 4449 nbits |= ATTR_S1_XN; 4450 } 4451 if (pmap == kernel_pmap) { 4452 mask |= ATTR_KERN_GP; 4453 nbits |= ATTR_KERN_GP; 4454 } 4455 if (mask == 0) 4456 return; 4457 4458 pmap_mask_set(pmap, sva, eva, mask, nbits, true); 4459} 4460 4461void 4462pmap_disable_promotion(vm_offset_t sva, vm_size_t size) 4463{ 4464 4465 MPASS((sva & L3_OFFSET) == 0); 4466 MPASS(((sva + size) & L3_OFFSET) == 0); 4467 4468 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE, 4469 ATTR_SW_NO_PROMOTE, false); 4470} 4471 4472/* 4473 * Inserts the specified page table page into the specified pmap's collection 4474 * of idle page table pages. Each of a pmap's page table pages is responsible 4475 * for mapping a distinct range of virtual addresses. The pmap's collection is 4476 * ordered by this virtual address range. 
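 * (The collection is the pmap's "pm_root" radix trie, keyed by the page
 * table page's pindex, e.g., pmap_l2_pindex(va) for an L3 page table
 * page.)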
4477 * 4478 * If "promoted" is false, then the page table page "mpte" must be zero filled; 4479 * "mpte"'s valid field will be set to 0. 4480 * 4481 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must 4482 * contain valid mappings with identical attributes except for ATTR_AF; 4483 * "mpte"'s valid field will be set to 1. 4484 * 4485 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain 4486 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid 4487 * field will be set to VM_PAGE_BITS_ALL. 4488 */ 4489static __inline int 4490pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted, 4491 bool all_l3e_AF_set) 4492{ 4493 4494 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4495 KASSERT(promoted || !all_l3e_AF_set, 4496 ("a zero-filled PTP can't have ATTR_AF set in every PTE")); 4497 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0; 4498 return (vm_radix_insert(&pmap->pm_root, mpte)); 4499} 4500 4501/* 4502 * Removes the page table page mapping the specified virtual address from the 4503 * specified pmap's collection of idle page table pages, and returns it. 4504 * Otherwise, returns NULL if there is no page table page corresponding to the 4505 * specified virtual address. 4506 */ 4507static __inline vm_page_t 4508pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 4509{ 4510 4511 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4512 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); 4513} 4514 4515/* 4516 * Performs a break-before-make update of a pmap entry. This is needed when 4517 * either promoting or demoting pages to ensure the TLB doesn't get into an 4518 * inconsistent state. 4519 */ 4520static void 4521pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte, 4522 vm_offset_t va, vm_size_t size) 4523{ 4524 pd_entry_t *lip, *ptep_end; 4525 register_t intr; 4526 4527 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4528 4529 if ((newpte & ATTR_SW_NO_PROMOTE) != 0) 4530 panic("%s: Updating non-promote pte", __func__); 4531 4532 if (size == L3C_SIZE) 4533 ptep_end = ptep + L3C_ENTRIES; 4534 else 4535 ptep_end = ptep + 1; 4536 4537 /* 4538 * Ensure we don't get switched out with the page table in an 4539 * inconsistent state. We also need to ensure no interrupts fire 4540 * as they may make use of an address we are about to invalidate. 4541 */ 4542 intr = intr_disable(); 4543 4544 /* 4545 * Clear the old mapping's valid bit, but leave the rest of the entry 4546 * unchanged, so that a lockless, concurrent pmap_kextract() can still 4547 * lookup the physical address. 4548 */ 4549 for (lip = ptep; lip < ptep_end; lip++) 4550 pmap_clear_bits(lip, ATTR_DESCR_VALID); 4551 4552 /* 4553 * When promoting, the L{1,2}_TABLE entry that is being replaced might 4554 * be cached, so we invalidate intermediate entries as well as final 4555 * entries. 4556 */ 4557 pmap_s1_invalidate_range(pmap, va, va + size, size == L3C_SIZE); 4558 4559 /* Create the new mapping */ 4560 for (lip = ptep; lip < ptep_end; lip++) { 4561 pmap_store(lip, newpte); 4562 newpte += PAGE_SIZE; 4563 } 4564 dsb(ishst); 4565 4566 intr_restore(intr); 4567} 4568 4569#if VM_NRESERVLEVEL > 0 4570/* 4571 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 4572 * replace the many pv entries for the 4KB page mappings by a single pv entry 4573 * for the 2MB page mapping. 
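 * With 4KB base pages there are L2_SIZE / PAGE_SIZE == 512 such 4KB
 * mappings, so one pv entry is moved to the 2MB page's pv list and the
 * remaining 511 are freed.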
4574 */ 4575static void 4576pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 4577 struct rwlock **lockp) 4578{ 4579 struct md_page *pvh; 4580 pv_entry_t pv; 4581 vm_offset_t va_last; 4582 vm_page_t m; 4583 4584 KASSERT((pa & L2_OFFSET) == 0, 4585 ("pmap_pv_promote_l2: pa is not 2mpage aligned")); 4586 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 4587 4588 /* 4589 * Transfer the first page's pv entry for this mapping to the 2mpage's 4590 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 4591 * a transfer avoids the possibility that get_pv_entry() calls 4592 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 4593 * mappings that is being promoted. 4594 */ 4595 m = PHYS_TO_VM_PAGE(pa); 4596 va = va & ~L2_OFFSET; 4597 pv = pmap_pvh_remove(&m->md, pmap, va); 4598 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); 4599 pvh = page_to_pvh(m); 4600 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4601 pvh->pv_gen++; 4602 /* Free the remaining NPTEPG - 1 pv entries. */ 4603 va_last = va + L2_SIZE - PAGE_SIZE; 4604 do { 4605 m++; 4606 va += PAGE_SIZE; 4607 pmap_pvh_free(&m->md, pmap, va); 4608 } while (va < va_last); 4609} 4610 4611/* 4612 * Tries to promote the 512, contiguous 4KB page mappings that are within a 4613 * single level 2 table entry to a single 2MB page mapping. For promotion 4614 * to occur, two conditions must be met: (1) the 4KB page mappings must map 4615 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 4616 * identical characteristics. 4617 */ 4618static bool 4619pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte, 4620 struct rwlock **lockp) 4621{ 4622 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa; 4623 4624 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4625 4626 /* 4627 * Currently, this function only supports promotion on stage 1 pmaps 4628 * because it tests stage 1 specific fields and performs a break- 4629 * before-make sequence that is incorrect for stage 2 pmaps. 4630 */ 4631 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 4632 return (false); 4633 4634 /* 4635 * Examine the first L3E in the specified PTP. Abort if this L3E is 4636 * ineligible for promotion... 4637 */ 4638 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); 4639 newl2 = pmap_load(firstl3); 4640 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0) 4641 return (false); 4642 /* ... is not the first physical page within an L2 block */ 4643 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 || 4644 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */ 4645 atomic_add_long(&pmap_l2_p_failures, 1); 4646 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4647 " in pmap %p", va, pmap); 4648 return (false); 4649 } 4650 4651 /* 4652 * Both here and in the below "for" loop, to allow for repromotion 4653 * after MADV_FREE, conditionally write protect a clean L3E before 4654 * possibly aborting the promotion due to other L3E attributes. Why? 4655 * Suppose that MADV_FREE is applied to a part of a superpage, the 4656 * address range [S, E). pmap_advise() will demote the superpage 4657 * mapping, destroy the 4KB page mapping at the end of [S, E), and 4658 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later, 4659 * imagine that the memory in [S, E) is recycled, but the last 4KB 4660 * page in [S, E) is not the last to be rewritten, or simply accessed. 4661 * In other words, there is still a 4KB page in [S, E), call it P, 4662 * that is writeable but AP_RO is set and AF is clear in P's L3E. 
4663 * Unless we write protect P before aborting the promotion, if and 4664 * when P is finally rewritten, there won't be a page fault to trigger 4665 * repromotion. 4666 */ 4667setl2: 4668 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4669 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4670 /* 4671 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 4672 * ATTR_SW_DBM can be cleared without a TLB invalidation. 4673 */ 4674 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM)) 4675 goto setl2; 4676 newl2 &= ~ATTR_SW_DBM; 4677 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx" 4678 " in pmap %p", va & ~L2_OFFSET, pmap); 4679 } 4680 4681 /* 4682 * Examine each of the other L3Es in the specified PTP. Abort if this 4683 * L3E maps an unexpected 4KB physical page or does not have identical 4684 * characteristics to the first L3E. If ATTR_AF is not set in every 4685 * PTE, then request that the PTP be refilled on demotion. 4686 */ 4687 all_l3e_AF = newl2 & ATTR_AF; 4688 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK)) 4689 + L2_SIZE - PAGE_SIZE; 4690 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { 4691 oldl3 = pmap_load(l3); 4692 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 4693 atomic_add_long(&pmap_l2_p_failures, 1); 4694 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4695 " in pmap %p", va, pmap); 4696 return (false); 4697 } 4698setl3: 4699 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4700 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4701 /* 4702 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 4703 * set, ATTR_SW_DBM can be cleared without a TLB 4704 * invalidation. 4705 */ 4706 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 4707 ~ATTR_SW_DBM)) 4708 goto setl3; 4709 oldl3 &= ~ATTR_SW_DBM; 4710 } 4711 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) { 4712 atomic_add_long(&pmap_l2_p_failures, 1); 4713 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" 4714 " in pmap %p", va, pmap); 4715 return (false); 4716 } 4717 all_l3e_AF &= oldl3; 4718 pa -= PAGE_SIZE; 4719 } 4720 4721 /* 4722 * Unless all PTEs have ATTR_AF set, clear it from the superpage 4723 * mapping, so that promotions triggered by speculative mappings, 4724 * such as pmap_enter_quick(), don't automatically mark the 4725 * underlying pages as referenced. 4726 */ 4727 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF; 4728 4729 /* 4730 * Save the page table page in its current state until the L2 4731 * mapping the superpage is demoted by pmap_demote_l2() or 4732 * destroyed by pmap_remove_l3(). 
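 * The page is stashed in the pmap's collection of idle page table pages
 * by pmap_insert_pt_page() below, so a later demotion can retrieve it
 * instead of allocating a new page table page.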
4733 */ 4734 if (mpte == NULL) 4735 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 4736 KASSERT(mpte >= vm_page_array && 4737 mpte < &vm_page_array[vm_page_array_size], 4738 ("pmap_promote_l2: page table page is out of range")); 4739 KASSERT(mpte->pindex == pmap_l2_pindex(va), 4740 ("pmap_promote_l2: page table page's pindex is wrong")); 4741 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) { 4742 atomic_add_long(&pmap_l2_p_failures, 1); 4743 CTR2(KTR_PMAP, 4744 "pmap_promote_l2: failure for va %#lx in pmap %p", va, 4745 pmap); 4746 return (false); 4747 } 4748 4749 if ((newl2 & ATTR_SW_MANAGED) != 0) 4750 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp); 4751 4752 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE); 4753 4754 atomic_add_long(&pmap_l2_promotions, 1); 4755 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, 4756 pmap); 4757 return (true); 4758} 4759 4760/* 4761 * Tries to promote an aligned, contiguous set of base page mappings to a 4762 * single L3C page mapping. For promotion to occur, two conditions must be 4763 * met: (1) the base page mappings must map aligned, contiguous physical 4764 * memory and (2) the base page mappings must have identical characteristics 4765 * except for the accessed flag. 4766 */ 4767static bool 4768pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va) 4769{ 4770 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa; 4771 4772 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4773 4774 /* 4775 * Currently, this function only supports promotion on stage 1 pmaps 4776 * because it tests stage 1 specific fields and performs a break- 4777 * before-make sequence that is incorrect for stage 2 pmaps. 4778 */ 4779 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap)) 4780 return (false); 4781 4782 /* 4783 * Compute the address of the first L3 entry in the superpage 4784 * candidate. 4785 */ 4786 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 4787 sizeof(pt_entry_t)) - 1)); 4788 4789 firstl3c = pmap_load(l3p); 4790 4791 /* 4792 * Examine the first L3 entry. Abort if this L3E is ineligible for 4793 * promotion... 4794 */ 4795 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0) 4796 return (false); 4797 /* ...is not properly aligned... */ 4798 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 || 4799 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */ 4800 counter_u64_add(pmap_l3c_p_failures, 1); 4801 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 4802 " in pmap %p", va, pmap); 4803 return (false); 4804 } 4805 4806 /* 4807 * If the first L3 entry is a clean read-write mapping, convert it 4808 * to a read-only mapping. See pmap_promote_l2() for the rationale. 4809 */ 4810set_first: 4811 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4812 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4813 /* 4814 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set, 4815 * ATTR_SW_DBM can be cleared without a TLB invalidation. 4816 */ 4817 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM)) 4818 goto set_first; 4819 firstl3c &= ~ATTR_SW_DBM; 4820 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 4821 " in pmap %p", va & ~L3C_OFFSET, pmap); 4822 } 4823 4824 /* 4825 * Check that the rest of the L3 entries are compatible with the first, 4826 * and convert clean read-write mappings to read-only mappings. 
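 * The loop below walks the remaining entries from last to first,
 * verifying that each maps the expected physical address and has the
 * same ATTR_PROMOTE attributes as the first entry, while ANDing the
 * accessed flags together in "all_l3e_AF".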
4827 */ 4828 all_l3e_AF = firstl3c & ATTR_AF; 4829 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) + 4830 L3C_SIZE - PAGE_SIZE; 4831 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) { 4832 oldl3 = pmap_load(l3); 4833 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) { 4834 counter_u64_add(pmap_l3c_p_failures, 1); 4835 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 4836 " in pmap %p", va, pmap); 4837 return (false); 4838 } 4839set_l3: 4840 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 4841 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) { 4842 /* 4843 * When the mapping is clean, i.e., ATTR_S1_AP_RO is 4844 * set, ATTR_SW_DBM can be cleared without a TLB 4845 * invalidation. 4846 */ 4847 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & 4848 ~ATTR_SW_DBM)) 4849 goto set_l3; 4850 oldl3 &= ~ATTR_SW_DBM; 4851 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx" 4852 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) | 4853 (va & ~L3C_OFFSET), pmap); 4854 } 4855 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) { 4856 counter_u64_add(pmap_l3c_p_failures, 1); 4857 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx" 4858 " in pmap %p", va, pmap); 4859 return (false); 4860 } 4861 all_l3e_AF &= oldl3; 4862 pa -= PAGE_SIZE; 4863 } 4864 4865 /* 4866 * Unless all PTEs have ATTR_AF set, clear it from the superpage 4867 * mapping, so that promotions triggered by speculative mappings, 4868 * such as pmap_enter_quick(), don't automatically mark the 4869 * underlying pages as referenced. 4870 */ 4871 firstl3c &= ~ATTR_AF | all_l3e_AF; 4872 4873 /* 4874 * Remake the mappings with the contiguous bit set. 4875 */ 4876 pmap_update_entry(pmap, l3p, firstl3c | ATTR_CONTIGUOUS, va & 4877 ~L3C_OFFSET, L3C_SIZE); 4878 4879 counter_u64_add(pmap_l3c_promotions, 1); 4880 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va, 4881 pmap); 4882 return (true); 4883} 4884#endif /* VM_NRESERVLEVEL > 0 */ 4885 4886static int 4887pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags, 4888 int psind) 4889{ 4890 pd_entry_t *l0p, *l1p, *l2p, origpte; 4891 vm_page_t mp; 4892 4893 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4894 KASSERT(psind > 0 && psind < MAXPAGESIZES, 4895 ("psind %d unexpected", psind)); 4896 KASSERT((PTE_TO_PHYS(newpte) & (pagesizes[psind] - 1)) == 0, 4897 ("unaligned phys address %#lx newpte %#lx psind %d", 4898 PTE_TO_PHYS(newpte), newpte, psind)); 4899 4900restart: 4901 if (!pmap_bti_same(pmap, va, va + pagesizes[psind])) 4902 return (KERN_PROTECTION_FAILURE); 4903 if (psind == 2) { 4904 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 4905 4906 l0p = pmap_l0(pmap, va); 4907 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) { 4908 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL); 4909 if (mp == NULL) { 4910 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 4911 return (KERN_RESOURCE_SHORTAGE); 4912 PMAP_UNLOCK(pmap); 4913 vm_wait(NULL); 4914 PMAP_LOCK(pmap); 4915 goto restart; 4916 } 4917 l1p = pmap_l0_to_l1(l0p, va); 4918 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 4919 origpte = pmap_load(l1p); 4920 } else { 4921 l1p = pmap_l0_to_l1(l0p, va); 4922 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va)); 4923 origpte = pmap_load(l1p); 4924 if ((origpte & ATTR_DESCR_VALID) == 0) { 4925 mp = PTE_TO_VM_PAGE(pmap_load(l0p)); 4926 mp->ref_count++; 4927 } 4928 } 4929 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) && 4930 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) || 4931 (origpte & ATTR_DESCR_VALID) == 0, 4932 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx", 
4933 va, origpte, newpte)); 4934 pmap_store(l1p, newpte); 4935 } else /* (psind == 1) */ { 4936 l2p = pmap_l2(pmap, va); 4937 if (l2p == NULL) { 4938 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL); 4939 if (mp == NULL) { 4940 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 4941 return (KERN_RESOURCE_SHORTAGE); 4942 PMAP_UNLOCK(pmap); 4943 vm_wait(NULL); 4944 PMAP_LOCK(pmap); 4945 goto restart; 4946 } 4947 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp)); 4948 l2p = &l2p[pmap_l2_index(va)]; 4949 origpte = pmap_load(l2p); 4950 } else { 4951 l1p = pmap_l1(pmap, va); 4952 origpte = pmap_load(l2p); 4953 if ((origpte & ATTR_DESCR_VALID) == 0) { 4954 mp = PTE_TO_VM_PAGE(pmap_load(l1p)); 4955 mp->ref_count++; 4956 } 4957 } 4958 KASSERT((origpte & ATTR_DESCR_VALID) == 0 || 4959 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK && 4960 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)), 4961 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx", 4962 va, origpte, newpte)); 4963 pmap_store(l2p, newpte); 4964 } 4965 dsb(ishst); 4966 4967 if ((origpte & ATTR_DESCR_VALID) == 0) 4968 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE); 4969 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0) 4970 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE; 4971 else if ((newpte & ATTR_SW_WIRED) == 0 && 4972 (origpte & ATTR_SW_WIRED) != 0) 4973 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE; 4974 4975 return (KERN_SUCCESS); 4976} 4977 4978/* 4979 * Insert the given physical page (p) at 4980 * the specified virtual address (v) in the 4981 * target physical map with the protection requested. 4982 * 4983 * If specified, the page will be wired down, meaning 4984 * that the related pte can not be reclaimed. 4985 * 4986 * NB: This is the only routine which MAY NOT lazy-evaluate 4987 * or lose information. That is, this routine must actually 4988 * insert this page into the given map NOW. 4989 */ 4990int 4991pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4992 u_int flags, int8_t psind) 4993{ 4994 struct rwlock *lock; 4995 pd_entry_t *pde; 4996 pt_entry_t new_l3, orig_l3; 4997 pt_entry_t *l2, *l3; 4998 pv_entry_t pv; 4999 vm_paddr_t opa, pa; 5000 vm_page_t mpte, om; 5001 bool nosleep; 5002 int lvl, rv; 5003 5004 KASSERT(ADDR_IS_CANONICAL(va), 5005 ("%s: Address not in canonical form: %lx", __func__, va)); 5006 5007 va = trunc_page(va); 5008 if ((m->oflags & VPO_UNMANAGED) == 0) 5009 VM_PAGE_OBJECT_BUSY_ASSERT(m); 5010 pa = VM_PAGE_TO_PHYS(m); 5011 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE); 5012 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr); 5013 new_l3 |= pmap_pte_prot(pmap, prot); 5014 if ((flags & PMAP_ENTER_WIRED) != 0) 5015 new_l3 |= ATTR_SW_WIRED; 5016 if (pmap->pm_stage == PM_STAGE1) { 5017 if (!ADDR_IS_KERNEL(va)) 5018 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5019 else 5020 new_l3 |= ATTR_S1_UXN; 5021 if (pmap != kernel_pmap) 5022 new_l3 |= ATTR_S1_nG; 5023 } else { 5024 /* 5025 * Clear the access flag on executable mappings, this will be 5026 * set later when the page is accessed. The fault handler is 5027 * required to invalidate the I-cache. 5028 * 5029 * TODO: Switch to the valid flag to allow hardware management 5030 * of the access flag. Much of the pmap code assumes the 5031 * valid flag is set and fails to destroy the old page tables 5032 * correctly if it is clear. 
5033 */ 5034 if (prot & VM_PROT_EXECUTE) 5035 new_l3 &= ~ATTR_AF; 5036 } 5037 if ((m->oflags & VPO_UNMANAGED) == 0) { 5038 new_l3 |= ATTR_SW_MANAGED; 5039 if ((prot & VM_PROT_WRITE) != 0) { 5040 new_l3 |= ATTR_SW_DBM; 5041 if ((flags & VM_PROT_WRITE) == 0) { 5042 if (pmap->pm_stage == PM_STAGE1) 5043 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO); 5044 else 5045 new_l3 &= 5046 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 5047 } 5048 } 5049 } 5050 5051 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); 5052 5053 lock = NULL; 5054 PMAP_LOCK(pmap); 5055 /* Wait until we lock the pmap to protect the bti rangeset */ 5056 new_l3 |= pmap_pte_bti(pmap, va); 5057 5058 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) { 5059 KASSERT((m->oflags & VPO_UNMANAGED) != 0, 5060 ("managed largepage va %#lx flags %#x", va, flags)); 5061 new_l3 &= ~L3_PAGE; 5062 if (psind == 2) { 5063 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 5064 new_l3 |= L1_BLOCK; 5065 } else /* (psind == 1) */ 5066 new_l3 |= L2_BLOCK; 5067 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind); 5068 goto out; 5069 } 5070 if (psind == 1) { 5071 /* Assert the required virtual and physical alignment. */ 5072 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); 5073 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); 5074 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, 5075 flags, m, &lock); 5076 goto out; 5077 } 5078 mpte = NULL; 5079 5080 /* 5081 * In the case that a page table page is not 5082 * resident, we are creating it here. 5083 */ 5084retry: 5085 pde = pmap_pde(pmap, va, &lvl); 5086 if (pde != NULL && lvl == 2) { 5087 l3 = pmap_l2_to_l3(pde, va); 5088 if (!ADDR_IS_KERNEL(va) && mpte == NULL) { 5089 mpte = PTE_TO_VM_PAGE(pmap_load(pde)); 5090 mpte->ref_count++; 5091 } 5092 goto havel3; 5093 } else if (pde != NULL && lvl == 1) { 5094 l2 = pmap_l1_to_l2(pde, va); 5095 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && 5096 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { 5097 l3 = &l3[pmap_l3_index(va)]; 5098 if (!ADDR_IS_KERNEL(va)) { 5099 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 5100 mpte->ref_count++; 5101 } 5102 goto havel3; 5103 } 5104 /* We need to allocate an L3 table. */ 5105 } 5106 if (!ADDR_IS_KERNEL(va)) { 5107 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; 5108 5109 /* 5110 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order 5111 * to handle the possibility that a superpage mapping for "va" 5112 * was created while we slept. 5113 */ 5114 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), 5115 nosleep ? NULL : &lock); 5116 if (mpte == NULL && nosleep) { 5117 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); 5118 rv = KERN_RESOURCE_SHORTAGE; 5119 goto out; 5120 } 5121 goto retry; 5122 } else 5123 panic("pmap_enter: missing L3 table for kernel va %#lx", va); 5124 5125havel3: 5126 orig_l3 = pmap_load(l3); 5127 opa = PTE_TO_PHYS(orig_l3); 5128 pv = NULL; 5129 5130 /* 5131 * Is the specified virtual address already mapped? 5132 */ 5133 if (pmap_l3_valid(orig_l3)) { 5134 /* 5135 * Wiring change, just update stats. We don't worry about 5136 * wiring PT pages as they remain resident as long as there 5137 * are valid mappings in them. Hence, if a user page is wired, 5138 * the PT page will be also. 5139 */ 5140 if ((flags & PMAP_ENTER_WIRED) != 0 && 5141 (orig_l3 & ATTR_SW_WIRED) == 0) 5142 pmap->pm_stats.wired_count++; 5143 else if ((flags & PMAP_ENTER_WIRED) == 0 && 5144 (orig_l3 & ATTR_SW_WIRED) != 0) 5145 pmap->pm_stats.wired_count--; 5146 5147 /* 5148 * Remove the extra PT page reference. 
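 * The existing mapping already accounts for a reference on the page
 * table page, so the reference taken when "mpte" was looked up or
 * allocated above would otherwise be counted twice.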
5149 */ 5150 if (mpte != NULL) { 5151 mpte->ref_count--; 5152 KASSERT(mpte->ref_count > 0, 5153 ("pmap_enter: missing reference to page table page," 5154 " va: 0x%lx", va)); 5155 } 5156 5157 /* 5158 * Has the physical page changed? 5159 */ 5160 if (opa == pa) { 5161 /* 5162 * No, might be a protection or wiring change. 5163 */ 5164 if ((orig_l3 & ATTR_SW_MANAGED) != 0 && 5165 (new_l3 & ATTR_SW_DBM) != 0) 5166 vm_page_aflag_set(m, PGA_WRITEABLE); 5167 goto validate; 5168 } 5169 5170 /* 5171 * The physical page has changed. Temporarily invalidate 5172 * the mapping. 5173 */ 5174 if ((orig_l3 & ATTR_CONTIGUOUS) != 0) 5175 (void)pmap_demote_l3c(pmap, l3, va); 5176 orig_l3 = pmap_load_clear(l3); 5177 KASSERT(PTE_TO_PHYS(orig_l3) == opa, 5178 ("pmap_enter: unexpected pa update for %#lx", va)); 5179 if ((orig_l3 & ATTR_SW_MANAGED) != 0) { 5180 om = PHYS_TO_VM_PAGE(opa); 5181 5182 /* 5183 * The pmap lock is sufficient to synchronize with 5184 * concurrent calls to pmap_page_test_mappings() and 5185 * pmap_ts_referenced(). 5186 */ 5187 if (pmap_pte_dirty(pmap, orig_l3)) 5188 vm_page_dirty(om); 5189 if ((orig_l3 & ATTR_AF) != 0) { 5190 pmap_invalidate_page(pmap, va, true); 5191 vm_page_aflag_set(om, PGA_REFERENCED); 5192 } 5193 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om); 5194 pv = pmap_pvh_remove(&om->md, pmap, va); 5195 if ((m->oflags & VPO_UNMANAGED) != 0) 5196 free_pv_entry(pmap, pv); 5197 if ((om->a.flags & PGA_WRITEABLE) != 0 && 5198 TAILQ_EMPTY(&om->md.pv_list) && 5199 ((om->flags & PG_FICTITIOUS) != 0 || 5200 TAILQ_EMPTY(&page_to_pvh(om)->pv_list))) 5201 vm_page_aflag_clear(om, PGA_WRITEABLE); 5202 } else { 5203 KASSERT((orig_l3 & ATTR_AF) != 0, 5204 ("pmap_enter: unmanaged mapping lacks ATTR_AF")); 5205 pmap_invalidate_page(pmap, va, true); 5206 } 5207 orig_l3 = 0; 5208 } else { 5209 /* 5210 * Increment the counters. 5211 */ 5212 if ((new_l3 & ATTR_SW_WIRED) != 0) 5213 pmap->pm_stats.wired_count++; 5214 pmap_resident_count_inc(pmap, 1); 5215 } 5216 /* 5217 * Enter on the PV list if part of our managed memory. 5218 */ 5219 if ((m->oflags & VPO_UNMANAGED) == 0) { 5220 if (pv == NULL) { 5221 pv = get_pv_entry(pmap, &lock); 5222 pv->pv_va = va; 5223 } 5224 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5225 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5226 m->md.pv_gen++; 5227 if ((new_l3 & ATTR_SW_DBM) != 0) 5228 vm_page_aflag_set(m, PGA_WRITEABLE); 5229 } 5230 5231validate: 5232 if (pmap->pm_stage == PM_STAGE1) { 5233 /* 5234 * Sync icache if exec permission and attribute 5235 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping 5236 * is stored and made valid for hardware table walk. If done 5237 * later, then other can access this page before caches are 5238 * properly synced. Don't do it for kernel memory which is 5239 * mapped with exec permission even if the memory isn't going 5240 * to hold executable code. The only time when icache sync is 5241 * needed is after kernel module is loaded and the relocation 5242 * info is processed. And it's done in elf_cpu_load_file(). 
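 * Note that the condition below only syncs when the physical page
 * changed or the old mapping was non-executable, so re-entering an
 * unchanged executable mapping does not pay for a redundant sync.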
5243 */
5244 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5245 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5246 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5247 PMAP_ASSERT_STAGE1(pmap);
5248 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5249 PAGE_SIZE);
5250 }
5251 } else {
5252 cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5253 }
5254
5255 /*
5256 * Update the L3 entry
5257 */
5258 if (pmap_l3_valid(orig_l3)) {
5259 KASSERT(opa == pa, ("pmap_enter: invalid update"));
5260 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5261 /* same PA, different attributes */
5262 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5263 (void)pmap_demote_l3c(pmap, l3, va);
5264 orig_l3 = pmap_load_store(l3, new_l3);
5265 pmap_invalidate_page(pmap, va, true);
5266 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5267 pmap_pte_dirty(pmap, orig_l3))
5268 vm_page_dirty(m);
5269 } else {
5270 /*
5271 * orig_l3 == new_l3
5272 * This can happen if multiple threads simultaneously
5273 * access a not yet mapped page. It is bad for
5274 * performance because it can cause a full
5275 * demotion-NOP-promotion cycle.
5276 * Other possible reasons are:
5277 * - the VM and pmap memory layouts have diverged
5278 * - a TLB flush is missing somewhere and the CPU does
5279 * not see the actual mapping.
5280 */
5281 CTR4(KTR_PMAP, "%s: already mapped page - "
5282 "pmap %p va 0x%#lx pte 0x%lx",
5283 __func__, pmap, va, new_l3);
5284 }
5285 } else {
5286 /* New mapping */
5287 pmap_store(l3, new_l3);
5288 dsb(ishst);
5289 }
5290
5291#if VM_NRESERVLEVEL > 0
5292 /*
5293 * First, attempt L3C promotion, if the virtual and physical addresses
5294 * are aligned with each other and an underlying reservation has the
5295 * neighboring L3 pages allocated. The first condition is simply an
5296 * optimization that recognizes some eventual promotion failures early
5297 * at a lower run-time cost. Then, if both the page table page and
5298 * the reservation are fully populated, attempt L2 promotion.
5299 */
5300 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5301 (m->flags & PG_FICTITIOUS) == 0 &&
5302 vm_reserv_is_populated(m, L3C_ENTRIES) &&
5303 pmap_promote_l3c(pmap, l3, va) &&
5304 (mpte == NULL || mpte->ref_count == NL3PG) &&
5305 vm_reserv_level_iffullpop(m) == 0)
5306 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5307#endif
5308
5309 rv = KERN_SUCCESS;
5310out:
5311 if (lock != NULL)
5312 rw_wunlock(lock);
5313 PMAP_UNLOCK(pmap);
5314 return (rv);
5315}
5316
5317/*
5318 * Tries to create a read- and/or execute-only L2 page mapping. Returns
5319 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5320 * value. See pmap_enter_l2() for the possible error values when "no sleep",
5321 * "no replace", and "no reclaim" are specified.
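 * This variant is used by pmap_enter_object() for speculative superpage
 * mappings, which is why "no sleep", "no replace", and "no reclaim" are
 * passed: on failure, the caller simply falls back to smaller mappings.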
5322 */ 5323static int 5324pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 5325 struct rwlock **lockp) 5326{ 5327 pd_entry_t new_l2; 5328 5329 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5330 PMAP_ASSERT_STAGE1(pmap); 5331 KASSERT(ADDR_IS_CANONICAL(va), 5332 ("%s: Address not in canonical form: %lx", __func__, va)); 5333 5334 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT | 5335 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 5336 L2_BLOCK); 5337 new_l2 |= pmap_pte_bti(pmap, va); 5338 if ((m->oflags & VPO_UNMANAGED) == 0) { 5339 new_l2 |= ATTR_SW_MANAGED; 5340 new_l2 &= ~ATTR_AF; 5341 } 5342 if ((prot & VM_PROT_EXECUTE) == 0 || 5343 m->md.pv_memattr == VM_MEMATTR_DEVICE) 5344 new_l2 |= ATTR_S1_XN; 5345 if (!ADDR_IS_KERNEL(va)) 5346 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5347 else 5348 new_l2 |= ATTR_S1_UXN; 5349 if (pmap != kernel_pmap) 5350 new_l2 |= ATTR_S1_nG; 5351 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | 5352 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp)); 5353} 5354 5355/* 5356 * Returns true if every page table entry in the specified page table is 5357 * zero. 5358 */ 5359static bool 5360pmap_every_pte_zero(vm_paddr_t pa) 5361{ 5362 pt_entry_t *pt_end, *pte; 5363 5364 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned")); 5365 pte = (pt_entry_t *)PHYS_TO_DMAP(pa); 5366 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) { 5367 if (*pte != 0) 5368 return (false); 5369 } 5370 return (true); 5371} 5372 5373/* 5374 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if 5375 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or 5376 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if 5377 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists 5378 * within the L2 virtual address range starting at the specified virtual 5379 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a 5380 * L2 page mapping already exists at the specified virtual address. Returns 5381 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a 5382 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified 5383 * and a PV entry allocation failed. 5384 */ 5385static int 5386pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, 5387 vm_page_t m, struct rwlock **lockp) 5388{ 5389 struct spglist free; 5390 pd_entry_t *l2, old_l2; 5391 vm_page_t l2pg, mt; 5392 vm_page_t uwptpg; 5393 5394 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5395 KASSERT(ADDR_IS_CANONICAL(va), 5396 ("%s: Address not in canonical form: %lx", __func__, va)); 5397 5398 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags & 5399 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { 5400 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", 5401 va, pmap); 5402 return (KERN_RESOURCE_SHORTAGE); 5403 } 5404 5405 /* 5406 * If bti is not the same for the whole l2 range, return failure 5407 * and let vm_fault() cope. Check after l2 allocation, since 5408 * it could sleep. 5409 */ 5410 if (!pmap_bti_same(pmap, va, va + L2_SIZE)) { 5411 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP")); 5412 pmap_abort_ptp(pmap, va, l2pg); 5413 return (KERN_PROTECTION_FAILURE); 5414 } 5415 5416 /* 5417 * If there are existing mappings, either abort or remove them. 
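 * Under PMAP_ENTER_NOREPLACE, an existing L2 block mapping results in
 * KERN_NO_SPACE and existing base page mappings within the range result
 * in KERN_FAILURE; otherwise, the existing mappings are removed below.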
5418 */ 5419 if ((old_l2 = pmap_load(l2)) != 0) { 5420 KASSERT(l2pg == NULL || l2pg->ref_count > 1, 5421 ("pmap_enter_l2: l2pg's ref count is too low")); 5422 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5423 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) { 5424 if (l2pg != NULL) 5425 l2pg->ref_count--; 5426 CTR2(KTR_PMAP, 5427 "pmap_enter_l2: no space for va %#lx" 5428 " in pmap %p", va, pmap); 5429 return (KERN_NO_SPACE); 5430 } else if (!ADDR_IS_KERNEL(va) || 5431 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) { 5432 if (l2pg != NULL) 5433 l2pg->ref_count--; 5434 CTR2(KTR_PMAP, 5435 "pmap_enter_l2: failure for va %#lx" 5436 " in pmap %p", va, pmap); 5437 return (KERN_FAILURE); 5438 } 5439 } 5440 SLIST_INIT(&free); 5441 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) 5442 (void)pmap_remove_l2(pmap, l2, va, 5443 pmap_load(pmap_l1(pmap, va)), &free, lockp); 5444 else 5445 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, 5446 &free, lockp); 5447 if (!ADDR_IS_KERNEL(va)) { 5448 vm_page_free_pages_toq(&free, true); 5449 KASSERT(pmap_load(l2) == 0, 5450 ("pmap_enter_l2: non-zero L2 entry %p", l2)); 5451 } else { 5452 KASSERT(SLIST_EMPTY(&free), 5453 ("pmap_enter_l2: freed kernel page table page")); 5454 5455 /* 5456 * Both pmap_remove_l2() and pmap_remove_l3_range() 5457 * will leave the kernel page table page zero filled. 5458 * Nonetheless, the TLB could have an intermediate 5459 * entry for the kernel page table page, so request 5460 * an invalidation at all levels after clearing 5461 * the L2_TABLE entry. 5462 */ 5463 mt = PTE_TO_VM_PAGE(pmap_load(l2)); 5464 if (pmap_insert_pt_page(pmap, mt, false, false)) 5465 panic("pmap_enter_l2: trie insert failed"); 5466 pmap_clear(l2); 5467 pmap_s1_invalidate_page(pmap, va, false); 5468 } 5469 } 5470 5471 /* 5472 * Allocate leaf ptpage for wired userspace pages. 5473 */ 5474 uwptpg = NULL; 5475 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) { 5476 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED); 5477 if (uwptpg == NULL) { 5478 return (KERN_RESOURCE_SHORTAGE); 5479 } 5480 uwptpg->pindex = pmap_l2_pindex(va); 5481 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) { 5482 vm_page_unwire_noq(uwptpg); 5483 vm_page_free(uwptpg); 5484 return (KERN_RESOURCE_SHORTAGE); 5485 } 5486 pmap_resident_count_inc(pmap, 1); 5487 uwptpg->ref_count = NL3PG; 5488 } 5489 if ((new_l2 & ATTR_SW_MANAGED) != 0) { 5490 /* 5491 * Abort this mapping if its PV entry could not be created. 5492 */ 5493 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { 5494 if (l2pg != NULL) 5495 pmap_abort_ptp(pmap, va, l2pg); 5496 if (uwptpg != NULL) { 5497 mt = pmap_remove_pt_page(pmap, va); 5498 KASSERT(mt == uwptpg, 5499 ("removed pt page %p, expected %p", mt, 5500 uwptpg)); 5501 pmap_resident_count_dec(pmap, 1); 5502 uwptpg->ref_count = 1; 5503 vm_page_unwire_noq(uwptpg); 5504 vm_page_free(uwptpg); 5505 } 5506 CTR2(KTR_PMAP, 5507 "pmap_enter_l2: failure for va %#lx in pmap %p", 5508 va, pmap); 5509 return (KERN_RESOURCE_SHORTAGE); 5510 } 5511 if ((new_l2 & ATTR_SW_DBM) != 0) 5512 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 5513 vm_page_aflag_set(mt, PGA_WRITEABLE); 5514 } 5515 5516 /* 5517 * Increment counters. 5518 */ 5519 if ((new_l2 & ATTR_SW_WIRED) != 0) 5520 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; 5521 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; 5522 5523 /* 5524 * Conditionally sync the icache. See pmap_enter() for details. 
5525 */ 5526 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) != 5527 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) && 5528 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) { 5529 cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)), 5530 L2_SIZE); 5531 } 5532 5533 /* 5534 * Map the superpage. 5535 */ 5536 pmap_store(l2, new_l2); 5537 dsb(ishst); 5538 5539 atomic_add_long(&pmap_l2_mappings, 1); 5540 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", 5541 va, pmap); 5542 5543 return (KERN_SUCCESS); 5544} 5545 5546/* 5547 * Tries to create a read- and/or execute-only L3C page mapping. Returns 5548 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error 5549 * value. 5550 */ 5551static int 5552pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p, 5553 vm_prot_t prot, struct rwlock **lockp) 5554{ 5555 pt_entry_t l3e; 5556 5557 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5558 PMAP_ASSERT_STAGE1(pmap); 5559 KASSERT(ADDR_IS_CANONICAL(va), 5560 ("%s: Address not in canonical form: %lx", __func__, va)); 5561 5562 l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT | 5563 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) | 5564 ATTR_CONTIGUOUS | L3_PAGE; 5565 l3e |= pmap_pte_bti(pmap, va); 5566 if ((m->oflags & VPO_UNMANAGED) == 0) { 5567 l3e |= ATTR_SW_MANAGED; 5568 l3e &= ~ATTR_AF; 5569 } 5570 if ((prot & VM_PROT_EXECUTE) == 0 || 5571 m->md.pv_memattr == VM_MEMATTR_DEVICE) 5572 l3e |= ATTR_S1_XN; 5573 if (!ADDR_IS_KERNEL(va)) 5574 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5575 else 5576 l3e |= ATTR_S1_UXN; 5577 if (pmap != kernel_pmap) 5578 l3e |= ATTR_S1_nG; 5579 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP | 5580 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp)); 5581} 5582 5583static int 5584pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags, 5585 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp) 5586{ 5587 pd_entry_t *l2p, *pde; 5588 pt_entry_t *l3p, *tl3p; 5589 vm_page_t mt; 5590 vm_paddr_t pa; 5591 vm_pindex_t l2pindex; 5592 int lvl; 5593 5594 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5595 KASSERT((va & L3C_OFFSET) == 0, 5596 ("pmap_enter_l3c: va is not aligned")); 5597 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0, 5598 ("pmap_enter_l3c: managed mapping within the clean submap")); 5599 5600 /* 5601 * If the L3 PTP is not resident, we attempt to create it here. 5602 */ 5603 if (!ADDR_IS_KERNEL(va)) { 5604 /* 5605 * Were we given the correct L3 PTP? If so, we can simply 5606 * increment its ref count. 5607 */ 5608 l2pindex = pmap_l2_pindex(va); 5609 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) { 5610 (*ml3p)->ref_count += L3C_ENTRIES; 5611 } else { 5612retry: 5613 /* 5614 * Get the L2 entry. 5615 */ 5616 pde = pmap_pde(pmap, va, &lvl); 5617 5618 /* 5619 * If the L2 entry is a superpage, we either abort or 5620 * demote depending on the given flags. 5621 */ 5622 if (lvl == 1) { 5623 l2p = pmap_l1_to_l2(pde, va); 5624 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == 5625 L2_BLOCK) { 5626 if ((flags & PMAP_ENTER_NOREPLACE) != 0) 5627 return (KERN_FAILURE); 5628 l3p = pmap_demote_l2_locked(pmap, l2p, 5629 va, lockp); 5630 if (l3p != NULL) { 5631 *ml3p = PTE_TO_VM_PAGE( 5632 pmap_load(l2p)); 5633 (*ml3p)->ref_count += 5634 L3C_ENTRIES; 5635 goto have_l3p; 5636 } 5637 } 5638 /* We need to allocate an L3 PTP. */ 5639 } 5640 5641 /* 5642 * If the L3 PTP is mapped, we just increment its ref 5643 * count. Otherwise, we attempt to allocate it. 
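 * In either case the reference count is raised by L3C_ENTRIES (e.g., 16
 * with 4KB base pages), because each of the L3 entries being created
 * holds its own reference on the page table page.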
5644 */ 5645 if (lvl == 2 && pmap_load(pde) != 0) { 5646 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde)); 5647 (*ml3p)->ref_count += L3C_ENTRIES; 5648 } else { 5649 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags & 5650 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp); 5651 if (*ml3p == NULL) { 5652 if ((flags & PMAP_ENTER_NOSLEEP) != 0) 5653 return (KERN_FAILURE); 5654 5655 /* 5656 * The page table may have changed 5657 * while we slept. 5658 */ 5659 goto retry; 5660 } 5661 (*ml3p)->ref_count += L3C_ENTRIES - 1; 5662 } 5663 } 5664 l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p)); 5665 5666have_l3p: 5667 /* 5668 * If bti is not the same for the whole L3C range, return 5669 * failure and let vm_fault() cope. Check after L3 allocation, 5670 * since it could sleep. 5671 */ 5672 if (!pmap_bti_same(pmap, va, va + L3C_SIZE)) { 5673 (*ml3p)->ref_count -= L3C_ENTRIES - 1; 5674 pmap_abort_ptp(pmap, va, *ml3p); 5675 *ml3p = NULL; 5676 return (KERN_PROTECTION_FAILURE); 5677 } 5678 } else { 5679 *ml3p = NULL; 5680 5681 /* 5682 * If the L2 entry is a superpage, we either abort or demote 5683 * depending on the given flags. 5684 */ 5685 pde = pmap_pde(kernel_pmap, va, &lvl); 5686 if (lvl == 1) { 5687 l2p = pmap_l1_to_l2(pde, va); 5688 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK, 5689 ("pmap_enter_l3c: missing L2 block")); 5690 if ((flags & PMAP_ENTER_NOREPLACE) != 0) 5691 return (KERN_FAILURE); 5692 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp); 5693 } else { 5694 KASSERT(lvl == 2, 5695 ("pmap_enter_l3c: Invalid level %d", lvl)); 5696 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS( 5697 pmap_load(pde))); 5698 } 5699 } 5700 l3p = &l3p[pmap_l3_index(va)]; 5701 5702 /* 5703 * If there are existing mappings, either abort or remove them. 5704 */ 5705 if ((flags & PMAP_ENTER_NOREPLACE) != 0) { 5706 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 5707 if (pmap_load(tl3p) != 0) { 5708 if (*ml3p != NULL) 5709 (*ml3p)->ref_count -= L3C_ENTRIES; 5710 return (KERN_FAILURE); 5711 } 5712 } 5713 } else { 5714 /* 5715 * Because we increment the L3 page's reference count above, 5716 * it is guaranteed not to be freed here and we can pass NULL 5717 * instead of a valid free list. 5718 */ 5719 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va, 5720 va + L3C_SIZE, NULL, lockp); 5721 } 5722 5723 /* 5724 * Enter on the PV list if part of our managed memory. 5725 */ 5726 if ((l3e & ATTR_SW_MANAGED) != 0) { 5727 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) { 5728 if (*ml3p != NULL) { 5729 (*ml3p)->ref_count -= L3C_ENTRIES - 1; 5730 pmap_abort_ptp(pmap, va, *ml3p); 5731 *ml3p = NULL; 5732 } 5733 return (KERN_RESOURCE_SHORTAGE); 5734 } 5735 if ((l3e & ATTR_SW_DBM) != 0) 5736 for (mt = m; mt < &m[L3C_ENTRIES]; mt++) 5737 vm_page_aflag_set(mt, PGA_WRITEABLE); 5738 } 5739 5740 /* 5741 * Increment counters. 5742 */ 5743 if ((l3e & ATTR_SW_WIRED) != 0) 5744 pmap->pm_stats.wired_count += L3C_ENTRIES; 5745 pmap_resident_count_inc(pmap, L3C_ENTRIES); 5746 5747 pa = VM_PAGE_TO_PHYS(m); 5748 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned")); 5749 5750 /* 5751 * Sync the icache before the mapping is stored. 5752 */ 5753 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap && 5754 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 5755 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE); 5756 5757 /* 5758 * Map the superpage. 
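 *
 * The loop below stores L3C_ENTRIES PTEs that differ only in their
 * output address, each advanced by L3_SIZE.  With 4KB base pages that
 * is 16 * 4KB = 64KB, and because every entry carries ATTR_CONTIGUOUS
 * the TLB may cache the entire range as a single translation.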
5759 */ 5760 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 5761 pmap_store(tl3p, l3e); 5762 l3e += L3_SIZE; 5763 } 5764 dsb(ishst); 5765 5766 counter_u64_add(pmap_l3c_mappings, 1); 5767 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p", 5768 va, pmap); 5769 return (KERN_SUCCESS); 5770} 5771 5772/* 5773 * Maps a sequence of resident pages belonging to the same object. 5774 * The sequence begins with the given page m_start. This page is 5775 * mapped at the given virtual address start. Each subsequent page is 5776 * mapped at a virtual address that is offset from start by the same 5777 * amount as the page is offset from m_start within the object. The 5778 * last page in the sequence is the page with the largest offset from 5779 * m_start that can be mapped at a virtual address less than the given 5780 * virtual address end. Not every virtual page between start and end 5781 * is mapped; only those for which a resident page exists with the 5782 * corresponding offset from m_start are mapped. 5783 */ 5784void 5785pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 5786 vm_page_t m_start, vm_prot_t prot) 5787{ 5788 struct rwlock *lock; 5789 vm_offset_t va; 5790 vm_page_t m, mpte; 5791 vm_pindex_t diff, psize; 5792 int rv; 5793 5794 VM_OBJECT_ASSERT_LOCKED(m_start->object); 5795 5796 psize = atop(end - start); 5797 mpte = NULL; 5798 m = m_start; 5799 lock = NULL; 5800 PMAP_LOCK(pmap); 5801 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 5802 va = start + ptoa(diff); 5803 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && 5804 m->psind == 1 && pmap_ps_enabled(pmap) && 5805 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) == 5806 KERN_SUCCESS || rv == KERN_NO_SPACE)) 5807 m = &m[L2_SIZE / PAGE_SIZE - 1]; 5808 else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end && 5809 (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 && 5810 vm_reserv_is_populated(m, L3C_ENTRIES) && 5811 pmap_ps_enabled(pmap) && 5812 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot, 5813 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE)) 5814 m = &m[L3C_ENTRIES - 1]; 5815 else 5816 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, 5817 &lock); 5818 m = TAILQ_NEXT(m, listq); 5819 } 5820 if (lock != NULL) 5821 rw_wunlock(lock); 5822 PMAP_UNLOCK(pmap); 5823} 5824 5825/* 5826 * this code makes some *MAJOR* assumptions: 5827 * 1. Current pmap & pmap exists. 5828 * 2. Not wired. 5829 * 3. Read access. 5830 * 4. No page table pages. 5831 * but is *MUCH* faster than pmap_enter... 
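 *
 * It is intended for opportunistic prefaulting, e.g. by
 * pmap_enter_object() above via the _locked variant: the mapping is
 * created read-only and, for managed pages, without ATTR_AF, and if a
 * page table page or PV entry cannot be allocated without sleeping
 * the mapping is simply skipped.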
5832 */ 5833 5834void 5835pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 5836{ 5837 struct rwlock *lock; 5838 5839 lock = NULL; 5840 PMAP_LOCK(pmap); 5841 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 5842 if (lock != NULL) 5843 rw_wunlock(lock); 5844 PMAP_UNLOCK(pmap); 5845} 5846 5847static vm_page_t 5848pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 5849 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 5850{ 5851 pd_entry_t *pde; 5852 pt_entry_t *l1, *l2, *l3, l3_val; 5853 vm_paddr_t pa; 5854 int lvl; 5855 5856 KASSERT(!VA_IS_CLEANMAP(va) || 5857 (m->oflags & VPO_UNMANAGED) != 0, 5858 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 5859 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5860 PMAP_ASSERT_STAGE1(pmap); 5861 KASSERT(ADDR_IS_CANONICAL(va), 5862 ("%s: Address not in canonical form: %lx", __func__, va)); 5863 l2 = NULL; 5864 5865 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); 5866 /* 5867 * In the case that a page table page is not 5868 * resident, we are creating it here. 5869 */ 5870 if (!ADDR_IS_KERNEL(va)) { 5871 vm_pindex_t l2pindex; 5872 5873 /* 5874 * Calculate pagetable page index 5875 */ 5876 l2pindex = pmap_l2_pindex(va); 5877 if (mpte && (mpte->pindex == l2pindex)) { 5878 mpte->ref_count++; 5879 } else { 5880 /* 5881 * If the page table page is mapped, we just increment 5882 * the hold count, and activate it. Otherwise, we 5883 * attempt to allocate a page table page, passing NULL 5884 * instead of the PV list lock pointer because we don't 5885 * intend to sleep. If this attempt fails, we don't 5886 * retry. Instead, we give up. 5887 */ 5888 l1 = pmap_l1(pmap, va); 5889 if (l1 != NULL && pmap_load(l1) != 0) { 5890 if ((pmap_load(l1) & ATTR_DESCR_MASK) == 5891 L1_BLOCK) 5892 return (NULL); 5893 l2 = pmap_l1_to_l2(l1, va); 5894 if (pmap_load(l2) != 0) { 5895 if ((pmap_load(l2) & ATTR_DESCR_MASK) == 5896 L2_BLOCK) 5897 return (NULL); 5898 mpte = PTE_TO_VM_PAGE(pmap_load(l2)); 5899 mpte->ref_count++; 5900 } else { 5901 mpte = _pmap_alloc_l3(pmap, l2pindex, 5902 NULL); 5903 if (mpte == NULL) 5904 return (mpte); 5905 } 5906 } else { 5907 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); 5908 if (mpte == NULL) 5909 return (mpte); 5910 } 5911 } 5912 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 5913 l3 = &l3[pmap_l3_index(va)]; 5914 } else { 5915 mpte = NULL; 5916 pde = pmap_pde(kernel_pmap, va, &lvl); 5917 KASSERT(pde != NULL, 5918 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", 5919 va)); 5920 KASSERT(lvl == 2, 5921 ("pmap_enter_quick_locked: Invalid level %d", lvl)); 5922 l3 = pmap_l2_to_l3(pde, va); 5923 } 5924 5925 /* 5926 * Abort if a mapping already exists. 5927 */ 5928 if (pmap_load(l3) != 0) { 5929 if (mpte != NULL) 5930 mpte->ref_count--; 5931 return (NULL); 5932 } 5933 5934 /* 5935 * Enter on the PV list if part of our managed memory. 
5936 */ 5937 if ((m->oflags & VPO_UNMANAGED) == 0 && 5938 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 5939 if (mpte != NULL) 5940 pmap_abort_ptp(pmap, va, mpte); 5941 return (NULL); 5942 } 5943 5944 /* 5945 * Increment counters 5946 */ 5947 pmap_resident_count_inc(pmap, 1); 5948 5949 pa = VM_PAGE_TO_PHYS(m); 5950 l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) | 5951 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE; 5952 l3_val |= pmap_pte_bti(pmap, va); 5953 if ((prot & VM_PROT_EXECUTE) == 0 || 5954 m->md.pv_memattr == VM_MEMATTR_DEVICE) 5955 l3_val |= ATTR_S1_XN; 5956 if (!ADDR_IS_KERNEL(va)) 5957 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN; 5958 else 5959 l3_val |= ATTR_S1_UXN; 5960 if (pmap != kernel_pmap) 5961 l3_val |= ATTR_S1_nG; 5962 5963 /* 5964 * Now validate mapping with RO protection 5965 */ 5966 if ((m->oflags & VPO_UNMANAGED) == 0) { 5967 l3_val |= ATTR_SW_MANAGED; 5968 l3_val &= ~ATTR_AF; 5969 } 5970 5971 /* Sync icache before the mapping is stored to PTE */ 5972 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && 5973 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) 5974 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE); 5975 5976 pmap_store(l3, l3_val); 5977 dsb(ishst); 5978 5979#if VM_NRESERVLEVEL > 0 5980 /* 5981 * If both the PTP and the reservation are fully populated, then 5982 * attempt promotion. 5983 */ 5984 if ((mpte == NULL || mpte->ref_count == NL3PG) && 5985 (m->flags & PG_FICTITIOUS) == 0 && 5986 vm_reserv_level_iffullpop(m) == 0) { 5987 if (l2 == NULL) 5988 l2 = pmap_pde(pmap, va, &lvl); 5989 5990 /* 5991 * If promotion succeeds, then the next call to this function 5992 * should not be given the unmapped PTP as a hint. 5993 */ 5994 if (pmap_promote_l2(pmap, l2, va, mpte, lockp)) 5995 mpte = NULL; 5996 } 5997#endif 5998 5999 return (mpte); 6000} 6001 6002/* 6003 * This code maps large physical mmap regions into the 6004 * processor address space. Note that some shortcuts 6005 * are taken, but the code works. 6006 */ 6007void 6008pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 6009 vm_pindex_t pindex, vm_size_t size) 6010{ 6011 6012 VM_OBJECT_ASSERT_WLOCKED(object); 6013 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 6014 ("pmap_object_init_pt: non-device object")); 6015} 6016 6017/* 6018 * Clear the wired attribute from the mappings for the specified range of 6019 * addresses in the given pmap. Every valid mapping within that range 6020 * must have the wired attribute set. In contrast, invalid mappings 6021 * cannot have the wired attribute set, so they are ignored. 6022 * 6023 * The wired attribute of the page table entry is not a hardware feature, 6024 * so there is no need to invalidate any TLB entries. 
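 *
 * The walk below visits one L1 block, L2 block, or L3/L3C entry at a
 * time; for example, after an L2 entry the next starting address is
 *
 *	va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 *
 * i.e. the first address of the following 2MB-aligned region.
 * Partially unwired L2 blocks and L3C ranges are demoted first so that
 * only whole mappings ever lose ATTR_SW_WIRED.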
6025 */ 6026void 6027pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 6028{ 6029 vm_offset_t va_next; 6030 pd_entry_t *l0, *l1, *l2; 6031 pt_entry_t *l3; 6032 bool partial_l3c; 6033 6034 PMAP_LOCK(pmap); 6035 for (; sva < eva; sva = va_next) { 6036 l0 = pmap_l0(pmap, sva); 6037 if (pmap_load(l0) == 0) { 6038 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 6039 if (va_next < sva) 6040 va_next = eva; 6041 continue; 6042 } 6043 6044 l1 = pmap_l0_to_l1(l0, sva); 6045 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 6046 if (va_next < sva) 6047 va_next = eva; 6048 if (pmap_load(l1) == 0) 6049 continue; 6050 6051 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6052 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6053 KASSERT(va_next <= eva, 6054 ("partial update of non-transparent 1G page " 6055 "l1 %#lx sva %#lx eva %#lx va_next %#lx", 6056 pmap_load(l1), sva, eva, va_next)); 6057 MPASS(pmap != kernel_pmap); 6058 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED | 6059 ATTR_SW_WIRED)) == ATTR_SW_WIRED); 6060 pmap_clear_bits(l1, ATTR_SW_WIRED); 6061 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE; 6062 continue; 6063 } 6064 6065 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 6066 if (va_next < sva) 6067 va_next = eva; 6068 6069 l2 = pmap_l1_to_l2(l1, sva); 6070 if (pmap_load(l2) == 0) 6071 continue; 6072 6073 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { 6074 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) 6075 panic("pmap_unwire: l2 %#jx is missing " 6076 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); 6077 6078 /* 6079 * Are we unwiring the entire large page? If not, 6080 * demote the mapping and fall through. 6081 */ 6082 if (sva + L2_SIZE == va_next && eva >= va_next) { 6083 pmap_clear_bits(l2, ATTR_SW_WIRED); 6084 pmap->pm_stats.wired_count -= L2_SIZE / 6085 PAGE_SIZE; 6086 continue; 6087 } else if (pmap_demote_l2(pmap, l2, sva) == NULL) 6088 panic("pmap_unwire: demotion failed"); 6089 } 6090 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 6091 ("pmap_unwire: Invalid l2 entry after demotion")); 6092 6093 if (va_next > eva) 6094 va_next = eva; 6095 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva); 6096 sva != va_next; l3++, sva += L3_SIZE) { 6097 if (pmap_load(l3) == 0) 6098 continue; 6099 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) { 6100 /* 6101 * Avoid demotion for whole-page unwiring. 6102 */ 6103 if ((sva & L3C_OFFSET) == 0) { 6104 /* 6105 * Handle the possibility that 6106 * "va_next" is zero because of 6107 * address wraparound. 6108 */ 6109 partial_l3c = sva + L3C_OFFSET > 6110 va_next - 1; 6111 } 6112 if (partial_l3c) 6113 (void)pmap_demote_l3c(pmap, l3, sva); 6114 } 6115 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) 6116 panic("pmap_unwire: l3 %#jx is missing " 6117 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); 6118 6119 /* 6120 * ATTR_SW_WIRED must be cleared atomically. Although 6121 * the pmap lock synchronizes access to ATTR_SW_WIRED, 6122 * the System MMU may write to the entry concurrently. 6123 */ 6124 pmap_clear_bits(l3, ATTR_SW_WIRED); 6125 pmap->pm_stats.wired_count--; 6126 } 6127 } 6128 PMAP_UNLOCK(pmap); 6129} 6130 6131/* 6132 * This function requires that the caller has already added one to ml3's 6133 * ref_count in anticipation of creating a 4KB page mapping. 
6134 */ 6135static bool 6136pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e, 6137 vm_page_t ml3, struct rwlock **lockp) 6138{ 6139 pt_entry_t *tl3p; 6140 6141 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6142 KASSERT((va & L3C_OFFSET) == 0, 6143 ("pmap_copy_l3c: va is not aligned")); 6144 KASSERT((l3e & ATTR_SW_MANAGED) != 0, 6145 ("pmap_copy_l3c: l3e is not managed")); 6146 6147 /* 6148 * Abort if a mapping already exists. 6149 */ 6150 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) 6151 if (pmap_load(tl3p) != 0) { 6152 if (ml3 != NULL) 6153 ml3->ref_count--; 6154 return (false); 6155 } 6156 6157 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) { 6158 if (ml3 != NULL) 6159 pmap_abort_ptp(pmap, va, ml3); 6160 return (false); 6161 } 6162 ml3->ref_count += L3C_ENTRIES - 1; 6163 6164 /* 6165 * Clear the wired and accessed bits. However, leave the dirty bit 6166 * unchanged because read/write superpage mappings are required to be 6167 * dirty. 6168 */ 6169 l3e &= ~(ATTR_SW_WIRED | ATTR_AF); 6170 6171 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) { 6172 pmap_store(tl3p, l3e); 6173 l3e += L3_SIZE; 6174 } 6175 pmap_resident_count_inc(pmap, L3C_ENTRIES); 6176 counter_u64_add(pmap_l3c_mappings, 1); 6177 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p", 6178 va, pmap); 6179 return (true); 6180} 6181 6182/* 6183 * Copy the range specified by src_addr/len 6184 * from the source map to the range dst_addr/len 6185 * in the destination map. 6186 * 6187 * This routine is only advisory and need not do anything. 6188 * 6189 * Because the executable mappings created by this routine are copied, 6190 * it should not have to flush the instruction cache. 6191 */ 6192void 6193pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 6194 vm_offset_t src_addr) 6195{ 6196 struct rwlock *lock; 6197 pd_entry_t *l0, *l1, *l2, srcptepaddr; 6198 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; 6199 vm_offset_t addr, end_addr, va_next; 6200 vm_page_t dst_m, dstmpte, srcmpte; 6201 6202 PMAP_ASSERT_STAGE1(dst_pmap); 6203 PMAP_ASSERT_STAGE1(src_pmap); 6204 6205 if (dst_addr != src_addr) 6206 return; 6207 end_addr = src_addr + len; 6208 lock = NULL; 6209 if (dst_pmap < src_pmap) { 6210 PMAP_LOCK(dst_pmap); 6211 PMAP_LOCK(src_pmap); 6212 } else { 6213 PMAP_LOCK(src_pmap); 6214 PMAP_LOCK(dst_pmap); 6215 } 6216 for (addr = src_addr; addr < end_addr; addr = va_next) { 6217 l0 = pmap_l0(src_pmap, addr); 6218 if (pmap_load(l0) == 0) { 6219 va_next = (addr + L0_SIZE) & ~L0_OFFSET; 6220 if (va_next < addr) 6221 va_next = end_addr; 6222 continue; 6223 } 6224 6225 va_next = (addr + L1_SIZE) & ~L1_OFFSET; 6226 if (va_next < addr) 6227 va_next = end_addr; 6228 l1 = pmap_l0_to_l1(l0, addr); 6229 if (pmap_load(l1) == 0) 6230 continue; 6231 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 6232 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 6233 KASSERT(va_next <= end_addr, 6234 ("partial update of non-transparent 1G page " 6235 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx", 6236 pmap_load(l1), addr, end_addr, va_next)); 6237 srcptepaddr = pmap_load(l1); 6238 l1 = pmap_l1(dst_pmap, addr); 6239 if (l1 == NULL) { 6240 if (_pmap_alloc_l3(dst_pmap, 6241 pmap_l0_pindex(addr), NULL) == NULL) 6242 break; 6243 l1 = pmap_l1(dst_pmap, addr); 6244 } else { 6245 l0 = pmap_l0(dst_pmap, addr); 6246 dst_m = PTE_TO_VM_PAGE(pmap_load(l0)); 6247 dst_m->ref_count++; 6248 } 6249 KASSERT(pmap_load(l1) == 0, 6250 ("1G mapping present in dst pmap " 6251 "l1 %#lx addr %#lx end_addr %#lx va_next 
%#lx", 6252 pmap_load(l1), addr, end_addr, va_next)); 6253 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED); 6254 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE); 6255 continue; 6256 } 6257 6258 va_next = (addr + L2_SIZE) & ~L2_OFFSET; 6259 if (va_next < addr) 6260 va_next = end_addr; 6261 l2 = pmap_l1_to_l2(l1, addr); 6262 srcptepaddr = pmap_load(l2); 6263 if (srcptepaddr == 0) 6264 continue; 6265 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { 6266 /* 6267 * We can only virtual copy whole superpages. 6268 */ 6269 if ((addr & L2_OFFSET) != 0 || 6270 addr + L2_SIZE > end_addr) 6271 continue; 6272 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL); 6273 if (l2 == NULL) 6274 break; 6275 if (pmap_load(l2) == 0 && 6276 ((srcptepaddr & ATTR_SW_MANAGED) == 0 || 6277 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, 6278 PMAP_ENTER_NORECLAIM, &lock))) { 6279 /* 6280 * We leave the dirty bit unchanged because 6281 * managed read/write superpage mappings are 6282 * required to be dirty. However, managed 6283 * superpage mappings are not required to 6284 * have their accessed bit set, so we clear 6285 * it because we don't know if this mapping 6286 * will be used. 6287 */ 6288 srcptepaddr &= ~ATTR_SW_WIRED; 6289 if ((srcptepaddr & ATTR_SW_MANAGED) != 0) 6290 srcptepaddr &= ~ATTR_AF; 6291 pmap_store(l2, srcptepaddr); 6292 pmap_resident_count_inc(dst_pmap, L2_SIZE / 6293 PAGE_SIZE); 6294 atomic_add_long(&pmap_l2_mappings, 1); 6295 } else 6296 pmap_abort_ptp(dst_pmap, addr, dst_m); 6297 continue; 6298 } 6299 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, 6300 ("pmap_copy: invalid L2 entry")); 6301 srcmpte = PTE_TO_VM_PAGE(srcptepaddr); 6302 KASSERT(srcmpte->ref_count > 0, 6303 ("pmap_copy: source page table page is unused")); 6304 if (va_next > end_addr) 6305 va_next = end_addr; 6306 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr)); 6307 src_pte = &src_pte[pmap_l3_index(addr)]; 6308 dstmpte = NULL; 6309 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { 6310 ptetemp = pmap_load(src_pte); 6311 6312 /* 6313 * We only virtual copy managed pages. 6314 */ 6315 if ((ptetemp & ATTR_SW_MANAGED) == 0) 6316 continue; 6317 6318 if (dstmpte != NULL) { 6319 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), 6320 ("dstmpte pindex/addr mismatch")); 6321 dstmpte->ref_count++; 6322 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, 6323 NULL)) == NULL) 6324 goto out; 6325 dst_pte = (pt_entry_t *) 6326 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 6327 dst_pte = &dst_pte[pmap_l3_index(addr)]; 6328 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr & 6329 L3C_OFFSET) == 0 && addr + L3C_OFFSET <= 6330 va_next - 1) { 6331 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr, 6332 ptetemp, dstmpte, &lock)) 6333 goto out; 6334 addr += L3C_SIZE - PAGE_SIZE; 6335 src_pte += L3C_ENTRIES - 1; 6336 } else if (pmap_load(dst_pte) == 0 && 6337 pmap_try_insert_pv_entry(dst_pmap, addr, 6338 PTE_TO_VM_PAGE(ptetemp), &lock)) { 6339 /* 6340 * Clear the wired, contiguous, modified, and 6341 * accessed bits from the destination PTE. 6342 * The contiguous bit is cleared because we 6343 * are not copying the entire L3C superpage. 6344 */ 6345 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS | 6346 ATTR_AF; 6347 nbits = 0; 6348 if ((ptetemp & ATTR_SW_DBM) != 0) 6349 nbits |= ATTR_S1_AP_RW_BIT; 6350 pmap_store(dst_pte, (ptetemp & ~mask) | nbits); 6351 pmap_resident_count_inc(dst_pmap, 1); 6352 } else { 6353 pmap_abort_ptp(dst_pmap, addr, dstmpte); 6354 goto out; 6355 } 6356 /* Have we copied all of the valid mappings? 
*/ 6357 if (dstmpte->ref_count >= srcmpte->ref_count) 6358 break; 6359 } 6360 } 6361out: 6362 /* 6363 * XXX This barrier may not be needed because the destination pmap is 6364 * not active. 6365 */ 6366 dsb(ishst); 6367 6368 if (lock != NULL) 6369 rw_wunlock(lock); 6370 PMAP_UNLOCK(src_pmap); 6371 PMAP_UNLOCK(dst_pmap); 6372} 6373 6374int 6375pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) 6376{ 6377 int error; 6378 6379 if (dst_pmap->pm_stage != src_pmap->pm_stage) 6380 return (EINVAL); 6381 6382 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL) 6383 return (0); 6384 6385 for (;;) { 6386 if (dst_pmap < src_pmap) { 6387 PMAP_LOCK(dst_pmap); 6388 PMAP_LOCK(src_pmap); 6389 } else { 6390 PMAP_LOCK(src_pmap); 6391 PMAP_LOCK(dst_pmap); 6392 } 6393 error = pmap_bti_copy(dst_pmap, src_pmap); 6394 /* Clean up partial copy on failure due to no memory. */ 6395 if (error == ENOMEM) 6396 pmap_bti_deassign_all(dst_pmap); 6397 PMAP_UNLOCK(src_pmap); 6398 PMAP_UNLOCK(dst_pmap); 6399 if (error != ENOMEM) 6400 break; 6401 vm_wait(NULL); 6402 } 6403 return (error); 6404} 6405 6406/* 6407 * pmap_zero_page zeros the specified hardware page by mapping 6408 * the page into KVM and using bzero to clear its contents. 6409 */ 6410void 6411pmap_zero_page(vm_page_t m) 6412{ 6413 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6414 6415 pagezero((void *)va); 6416} 6417 6418/* 6419 * pmap_zero_page_area zeros the specified hardware page by mapping 6420 * the page into KVM and using bzero to clear its contents. 6421 * 6422 * off and size may not cover an area beyond a single hardware page. 6423 */ 6424void 6425pmap_zero_page_area(vm_page_t m, int off, int size) 6426{ 6427 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 6428 6429 if (off == 0 && size == PAGE_SIZE) 6430 pagezero((void *)va); 6431 else 6432 bzero((char *)va + off, size); 6433} 6434 6435/* 6436 * pmap_copy_page copies the specified (machine independent) 6437 * page by mapping the page into virtual memory and using 6438 * bcopy to copy the page, one machine dependent page at a 6439 * time. 
6440 */ 6441void 6442pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 6443{ 6444 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 6445 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 6446 6447 pagecopy((void *)src, (void *)dst); 6448} 6449 6450int unmapped_buf_allowed = 1; 6451 6452void 6453pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 6454 vm_offset_t b_offset, int xfersize) 6455{ 6456 void *a_cp, *b_cp; 6457 vm_page_t m_a, m_b; 6458 vm_paddr_t p_a, p_b; 6459 vm_offset_t a_pg_offset, b_pg_offset; 6460 int cnt; 6461 6462 while (xfersize > 0) { 6463 a_pg_offset = a_offset & PAGE_MASK; 6464 m_a = ma[a_offset >> PAGE_SHIFT]; 6465 p_a = m_a->phys_addr; 6466 b_pg_offset = b_offset & PAGE_MASK; 6467 m_b = mb[b_offset >> PAGE_SHIFT]; 6468 p_b = m_b->phys_addr; 6469 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 6470 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 6471 if (__predict_false(!PHYS_IN_DMAP(p_a))) { 6472 panic("!DMAP a %lx", p_a); 6473 } else { 6474 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 6475 } 6476 if (__predict_false(!PHYS_IN_DMAP(p_b))) { 6477 panic("!DMAP b %lx", p_b); 6478 } else { 6479 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 6480 } 6481 bcopy(a_cp, b_cp, cnt); 6482 a_offset += cnt; 6483 b_offset += cnt; 6484 xfersize -= cnt; 6485 } 6486} 6487 6488vm_offset_t 6489pmap_quick_enter_page(vm_page_t m) 6490{ 6491 6492 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); 6493} 6494 6495void 6496pmap_quick_remove_page(vm_offset_t addr) 6497{ 6498} 6499 6500/* 6501 * Returns true if the pmap's pv is one of the first 6502 * 16 pvs linked to from this page. This count may 6503 * be changed upwards or downwards in the future; it 6504 * is only necessary that true be returned for a small 6505 * subset of pmaps for proper page aging. 6506 */ 6507bool 6508pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 6509{ 6510 struct md_page *pvh; 6511 struct rwlock *lock; 6512 pv_entry_t pv; 6513 int loops = 0; 6514 bool rv; 6515 6516 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6517 ("pmap_page_exists_quick: page %p is not managed", m)); 6518 rv = false; 6519 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6520 rw_rlock(lock); 6521 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6522 if (PV_PMAP(pv) == pmap) { 6523 rv = true; 6524 break; 6525 } 6526 loops++; 6527 if (loops >= 16) 6528 break; 6529 } 6530 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 6531 pvh = page_to_pvh(m); 6532 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6533 if (PV_PMAP(pv) == pmap) { 6534 rv = true; 6535 break; 6536 } 6537 loops++; 6538 if (loops >= 16) 6539 break; 6540 } 6541 } 6542 rw_runlock(lock); 6543 return (rv); 6544} 6545 6546/* 6547 * pmap_page_wired_mappings: 6548 * 6549 * Return the number of managed mappings to the given physical page 6550 * that are wired. 
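 *
 * The pmap lock is ordered before the PV list lock, so when
 * PMAP_TRYLOCK() fails below the PV list lock is dropped, the pmap
 * lock is acquired, and the list's generation count (pv_gen) is
 * compared against its saved value to detect concurrent changes; if
 * the list changed while unlocked, the scan restarts.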
6551 */ 6552int 6553pmap_page_wired_mappings(vm_page_t m) 6554{ 6555 struct rwlock *lock; 6556 struct md_page *pvh; 6557 pmap_t pmap; 6558 pt_entry_t *pte; 6559 pv_entry_t pv; 6560 int count, md_gen, pvh_gen; 6561 6562 if ((m->oflags & VPO_UNMANAGED) != 0) 6563 return (0); 6564 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6565 rw_rlock(lock); 6566restart: 6567 count = 0; 6568 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6569 pmap = PV_PMAP(pv); 6570 if (!PMAP_TRYLOCK(pmap)) { 6571 md_gen = m->md.pv_gen; 6572 rw_runlock(lock); 6573 PMAP_LOCK(pmap); 6574 rw_rlock(lock); 6575 if (md_gen != m->md.pv_gen) { 6576 PMAP_UNLOCK(pmap); 6577 goto restart; 6578 } 6579 } 6580 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 6581 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 6582 count++; 6583 PMAP_UNLOCK(pmap); 6584 } 6585 if ((m->flags & PG_FICTITIOUS) == 0) { 6586 pvh = page_to_pvh(m); 6587 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6588 pmap = PV_PMAP(pv); 6589 if (!PMAP_TRYLOCK(pmap)) { 6590 md_gen = m->md.pv_gen; 6591 pvh_gen = pvh->pv_gen; 6592 rw_runlock(lock); 6593 PMAP_LOCK(pmap); 6594 rw_rlock(lock); 6595 if (md_gen != m->md.pv_gen || 6596 pvh_gen != pvh->pv_gen) { 6597 PMAP_UNLOCK(pmap); 6598 goto restart; 6599 } 6600 } 6601 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 6602 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0) 6603 count++; 6604 PMAP_UNLOCK(pmap); 6605 } 6606 } 6607 rw_runlock(lock); 6608 return (count); 6609} 6610 6611/* 6612 * Returns true if the given page is mapped individually or as part of 6613 * a 2mpage. Otherwise, returns false. 6614 */ 6615bool 6616pmap_page_is_mapped(vm_page_t m) 6617{ 6618 struct rwlock *lock; 6619 bool rv; 6620 6621 if ((m->oflags & VPO_UNMANAGED) != 0) 6622 return (false); 6623 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6624 rw_rlock(lock); 6625 rv = !TAILQ_EMPTY(&m->md.pv_list) || 6626 ((m->flags & PG_FICTITIOUS) == 0 && 6627 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list)); 6628 rw_runlock(lock); 6629 return (rv); 6630} 6631 6632/* 6633 * Destroy all managed, non-wired mappings in the given user-space 6634 * pmap. This pmap cannot be active on any processor besides the 6635 * caller. 6636 * 6637 * This function cannot be applied to the kernel pmap. Moreover, it 6638 * is not intended for general use. It is only to be used during 6639 * process termination. Consequently, it can be implemented in ways 6640 * that make it faster than pmap_remove(). First, it can more quickly 6641 * destroy mappings by iterating over the pmap's collection of PV 6642 * entries, rather than searching the page table. Second, it doesn't 6643 * have to test and clear the page table entries atomically, because 6644 * no processor is currently accessing the user address space. In 6645 * particular, a page table entry's dirty bit won't change state once 6646 * this function starts. 
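 *
 * Concretely, the loop below walks the pmap's PV chunks instead of its
 * page tables: within each chunk the allocated entries are found by
 * scanning the inverted free bitmap, roughly
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	bit = ffsl(inuse) - 1;
 *	idx = field * 64 + bit;
 *
 * and each PV entry's pv_va then leads directly to the L2 or L3 entry
 * that is destroyed.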
6647 */ 6648void 6649pmap_remove_pages(pmap_t pmap) 6650{ 6651 pd_entry_t *pde; 6652 pt_entry_t *pte, tpte; 6653 struct spglist free; 6654 struct pv_chunklist free_chunks[PMAP_MEMDOM]; 6655 vm_page_t m, ml3, mt; 6656 pv_entry_t pv; 6657 struct md_page *pvh; 6658 struct pv_chunk *pc, *npc; 6659 struct rwlock *lock; 6660 int64_t bit; 6661 uint64_t inuse, bitmask; 6662 int allfree, field, i, idx, lvl; 6663 int freed __pvused; 6664 vm_paddr_t pa; 6665 6666 lock = NULL; 6667 6668 for (i = 0; i < PMAP_MEMDOM; i++) 6669 TAILQ_INIT(&free_chunks[i]); 6670 SLIST_INIT(&free); 6671 PMAP_LOCK(pmap); 6672 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 6673 allfree = 1; 6674 freed = 0; 6675 for (field = 0; field < _NPCM; field++) { 6676 inuse = ~pc->pc_map[field] & pc_freemask[field]; 6677 while (inuse != 0) { 6678 bit = ffsl(inuse) - 1; 6679 bitmask = 1UL << bit; 6680 idx = field * 64 + bit; 6681 pv = &pc->pc_pventry[idx]; 6682 inuse &= ~bitmask; 6683 6684 pde = pmap_pde(pmap, pv->pv_va, &lvl); 6685 KASSERT(pde != NULL, 6686 ("Attempting to remove an unmapped page")); 6687 6688 switch(lvl) { 6689 case 1: 6690 pte = pmap_l1_to_l2(pde, pv->pv_va); 6691 tpte = pmap_load(pte); 6692 KASSERT((tpte & ATTR_DESCR_MASK) == 6693 L2_BLOCK, 6694 ("Attempting to remove an invalid " 6695 "block: %lx", tpte)); 6696 break; 6697 case 2: 6698 pte = pmap_l2_to_l3(pde, pv->pv_va); 6699 tpte = pmap_load(pte); 6700 KASSERT((tpte & ATTR_DESCR_MASK) == 6701 L3_PAGE, 6702 ("Attempting to remove an invalid " 6703 "page: %lx", tpte)); 6704 break; 6705 default: 6706 panic( 6707 "Invalid page directory level: %d", 6708 lvl); 6709 } 6710 6711 /* 6712 * We cannot remove wired mappings at this time. 6713 * 6714 * For L3C superpages, all of the constituent PTEs 6715 * should have the wired bit set, so we don't 6716 * check for ATTR_CONTIGUOUS here. 6717 */ 6718 if (tpte & ATTR_SW_WIRED) { 6719 allfree = 0; 6720 continue; 6721 } 6722 6723 /* Mark free */ 6724 pc->pc_map[field] |= bitmask; 6725 6726 /* 6727 * Because this pmap is not active on other 6728 * processors, the dirty bit cannot have 6729 * changed state since we last loaded pte. 6730 */ 6731 pmap_clear(pte); 6732 6733 pa = PTE_TO_PHYS(tpte); 6734 6735 m = PHYS_TO_VM_PAGE(pa); 6736 KASSERT(m->phys_addr == pa, 6737 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 6738 m, (uintmax_t)m->phys_addr, 6739 (uintmax_t)tpte)); 6740 6741 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 6742 m < &vm_page_array[vm_page_array_size], 6743 ("pmap_remove_pages: bad pte %#jx", 6744 (uintmax_t)tpte)); 6745 6746 /* 6747 * Update the vm_page_t clean/reference bits. 6748 * 6749 * We don't check for ATTR_CONTIGUOUS here 6750 * because writeable L3C superpages are expected 6751 * to be dirty, i.e., every constituent PTE 6752 * should be dirty. 
6753 */ 6754 if (pmap_pte_dirty(pmap, tpte)) { 6755 switch (lvl) { 6756 case 1: 6757 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 6758 vm_page_dirty(mt); 6759 break; 6760 case 2: 6761 vm_page_dirty(m); 6762 break; 6763 } 6764 } 6765 6766 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 6767 6768 switch (lvl) { 6769 case 1: 6770 pmap_resident_count_dec(pmap, 6771 L2_SIZE / PAGE_SIZE); 6772 pvh = page_to_pvh(m); 6773 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); 6774 pvh->pv_gen++; 6775 if (TAILQ_EMPTY(&pvh->pv_list)) { 6776 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) 6777 if ((mt->a.flags & PGA_WRITEABLE) != 0 && 6778 TAILQ_EMPTY(&mt->md.pv_list)) 6779 vm_page_aflag_clear(mt, PGA_WRITEABLE); 6780 } 6781 ml3 = pmap_remove_pt_page(pmap, 6782 pv->pv_va); 6783 if (ml3 != NULL) { 6784 KASSERT(vm_page_any_valid(ml3), 6785 ("pmap_remove_pages: l3 page not promoted")); 6786 pmap_resident_count_dec(pmap,1); 6787 KASSERT(ml3->ref_count == NL3PG, 6788 ("pmap_remove_pages: l3 page ref count error")); 6789 ml3->ref_count = 0; 6790 pmap_add_delayed_free_list(ml3, 6791 &free, false); 6792 } 6793 break; 6794 case 2: 6795 pmap_resident_count_dec(pmap, 1); 6796 TAILQ_REMOVE(&m->md.pv_list, pv, 6797 pv_next); 6798 m->md.pv_gen++; 6799 if ((m->a.flags & PGA_WRITEABLE) != 0 && 6800 TAILQ_EMPTY(&m->md.pv_list) && 6801 (m->flags & PG_FICTITIOUS) == 0) { 6802 pvh = page_to_pvh(m); 6803 if (TAILQ_EMPTY(&pvh->pv_list)) 6804 vm_page_aflag_clear(m, 6805 PGA_WRITEABLE); 6806 } 6807 break; 6808 } 6809 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), 6810 &free); 6811 freed++; 6812 } 6813 } 6814 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 6815 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 6816 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 6817 if (allfree) { 6818 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 6819 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, 6820 pc_list); 6821 } 6822 } 6823 if (lock != NULL) 6824 rw_wunlock(lock); 6825 pmap_invalidate_all(pmap); 6826 pmap_bti_deassign_all(pmap); 6827 free_pv_chunk_batch(free_chunks); 6828 PMAP_UNLOCK(pmap); 6829 vm_page_free_pages_toq(&free, true); 6830} 6831 6832/* 6833 * This is used to check if a page has been accessed or modified. 
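 *
 * Each mapping is tested against a (mask, value) pair; for example,
 * testing a 4KB mapping for modification alone uses
 *
 *	mask  = ATTR_S1_AP_RW_BIT;
 *	value = ATTR_S1_AP(ATTR_S1_AP_RW);
 *
 * while testing for access additionally requires ATTR_AF and a valid
 * L3_PAGE (or L2_BLOCK) descriptor.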
6834 */ 6835static bool 6836pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified) 6837{ 6838 struct rwlock *lock; 6839 pv_entry_t pv; 6840 struct md_page *pvh; 6841 pt_entry_t l3e, mask, *pte, value; 6842 pmap_t pmap; 6843 int md_gen, pvh_gen; 6844 bool rv; 6845 6846 rv = false; 6847 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6848 rw_rlock(lock); 6849restart: 6850 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6851 pmap = PV_PMAP(pv); 6852 PMAP_ASSERT_STAGE1(pmap); 6853 if (!PMAP_TRYLOCK(pmap)) { 6854 md_gen = m->md.pv_gen; 6855 rw_runlock(lock); 6856 PMAP_LOCK(pmap); 6857 rw_rlock(lock); 6858 if (md_gen != m->md.pv_gen) { 6859 PMAP_UNLOCK(pmap); 6860 goto restart; 6861 } 6862 } 6863 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 6864 mask = 0; 6865 value = 0; 6866 if (modified) { 6867 mask |= ATTR_S1_AP_RW_BIT; 6868 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 6869 } 6870 if (accessed) { 6871 mask |= ATTR_AF | ATTR_DESCR_MASK; 6872 value |= ATTR_AF | L3_PAGE; 6873 } 6874 l3e = pmap_load(pte); 6875 if ((l3e & ATTR_CONTIGUOUS) != 0) 6876 l3e = pmap_load_l3c(pte); 6877 PMAP_UNLOCK(pmap); 6878 rv = (l3e & mask) == value; 6879 if (rv) 6880 goto out; 6881 } 6882 if ((m->flags & PG_FICTITIOUS) == 0) { 6883 pvh = page_to_pvh(m); 6884 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 6885 pmap = PV_PMAP(pv); 6886 PMAP_ASSERT_STAGE1(pmap); 6887 if (!PMAP_TRYLOCK(pmap)) { 6888 md_gen = m->md.pv_gen; 6889 pvh_gen = pvh->pv_gen; 6890 rw_runlock(lock); 6891 PMAP_LOCK(pmap); 6892 rw_rlock(lock); 6893 if (md_gen != m->md.pv_gen || 6894 pvh_gen != pvh->pv_gen) { 6895 PMAP_UNLOCK(pmap); 6896 goto restart; 6897 } 6898 } 6899 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__); 6900 mask = 0; 6901 value = 0; 6902 if (modified) { 6903 mask |= ATTR_S1_AP_RW_BIT; 6904 value |= ATTR_S1_AP(ATTR_S1_AP_RW); 6905 } 6906 if (accessed) { 6907 mask |= ATTR_AF | ATTR_DESCR_MASK; 6908 value |= ATTR_AF | L2_BLOCK; 6909 } 6910 rv = (pmap_load(pte) & mask) == value; 6911 PMAP_UNLOCK(pmap); 6912 if (rv) 6913 goto out; 6914 } 6915 } 6916out: 6917 rw_runlock(lock); 6918 return (rv); 6919} 6920 6921/* 6922 * pmap_is_modified: 6923 * 6924 * Return whether or not the specified physical page was modified 6925 * in any physical maps. 6926 */ 6927bool 6928pmap_is_modified(vm_page_t m) 6929{ 6930 6931 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6932 ("pmap_is_modified: page %p is not managed", m)); 6933 6934 /* 6935 * If the page is not busied then this check is racy. 6936 */ 6937 if (!pmap_page_is_write_mapped(m)) 6938 return (false); 6939 return (pmap_page_test_mappings(m, false, true)); 6940} 6941 6942/* 6943 * pmap_is_prefaultable: 6944 * 6945 * Return whether or not the specified virtual address is eligible 6946 * for prefault. 6947 */ 6948bool 6949pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 6950{ 6951 pd_entry_t *pde; 6952 pt_entry_t *pte; 6953 bool rv; 6954 int lvl; 6955 6956 /* 6957 * Return true if and only if the L3 entry for the specified virtual 6958 * address is allocated but invalid. 6959 */ 6960 rv = false; 6961 PMAP_LOCK(pmap); 6962 pde = pmap_pde(pmap, addr, &lvl); 6963 if (pde != NULL && lvl == 2) { 6964 pte = pmap_l2_to_l3(pde, addr); 6965 rv = pmap_load(pte) == 0; 6966 } 6967 PMAP_UNLOCK(pmap); 6968 return (rv); 6969} 6970 6971/* 6972 * pmap_is_referenced: 6973 * 6974 * Return whether or not the specified physical page was referenced 6975 * in any physical maps. 
6976 */ 6977bool 6978pmap_is_referenced(vm_page_t m) 6979{ 6980 6981 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6982 ("pmap_is_referenced: page %p is not managed", m)); 6983 return (pmap_page_test_mappings(m, true, false)); 6984} 6985 6986/* 6987 * Clear the write and modified bits in each of the given page's mappings. 6988 */ 6989void 6990pmap_remove_write(vm_page_t m) 6991{ 6992 struct md_page *pvh; 6993 pmap_t pmap; 6994 struct rwlock *lock; 6995 pv_entry_t next_pv, pv; 6996 pt_entry_t oldpte, *pte, set, clear, mask, val; 6997 vm_offset_t va; 6998 int md_gen, pvh_gen; 6999 7000 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7001 ("pmap_remove_write: page %p is not managed", m)); 7002 vm_page_assert_busied(m); 7003 7004 if (!pmap_page_is_write_mapped(m)) 7005 return; 7006 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7007 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 7008 rw_wlock(lock); 7009retry: 7010 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 7011 pmap = PV_PMAP(pv); 7012 PMAP_ASSERT_STAGE1(pmap); 7013 if (!PMAP_TRYLOCK(pmap)) { 7014 pvh_gen = pvh->pv_gen; 7015 rw_wunlock(lock); 7016 PMAP_LOCK(pmap); 7017 rw_wlock(lock); 7018 if (pvh_gen != pvh->pv_gen) { 7019 PMAP_UNLOCK(pmap); 7020 goto retry; 7021 } 7022 } 7023 va = pv->pv_va; 7024 pte = pmap_pte_exists(pmap, va, 2, __func__); 7025 if ((pmap_load(pte) & ATTR_SW_DBM) != 0) 7026 (void)pmap_demote_l2_locked(pmap, pte, va, &lock); 7027 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 7028 ("inconsistent pv lock %p %p for page %p", 7029 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 7030 PMAP_UNLOCK(pmap); 7031 } 7032 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7033 pmap = PV_PMAP(pv); 7034 if (!PMAP_TRYLOCK(pmap)) { 7035 pvh_gen = pvh->pv_gen; 7036 md_gen = m->md.pv_gen; 7037 rw_wunlock(lock); 7038 PMAP_LOCK(pmap); 7039 rw_wlock(lock); 7040 if (pvh_gen != pvh->pv_gen || 7041 md_gen != m->md.pv_gen) { 7042 PMAP_UNLOCK(pmap); 7043 goto retry; 7044 } 7045 } 7046 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 7047 oldpte = pmap_load(pte); 7048 if ((oldpte & ATTR_SW_DBM) != 0) { 7049 if ((oldpte & ATTR_CONTIGUOUS) != 0) { 7050 (void)pmap_demote_l3c(pmap, pte, pv->pv_va); 7051 7052 /* 7053 * The L3 entry's accessed bit may have 7054 * changed. 7055 */ 7056 oldpte = pmap_load(pte); 7057 } 7058 if (pmap->pm_stage == PM_STAGE1) { 7059 set = ATTR_S1_AP_RW_BIT; 7060 clear = 0; 7061 mask = ATTR_S1_AP_RW_BIT; 7062 val = ATTR_S1_AP(ATTR_S1_AP_RW); 7063 } else { 7064 set = 0; 7065 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 7066 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 7067 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE); 7068 } 7069 clear |= ATTR_SW_DBM; 7070 while (!atomic_fcmpset_64(pte, &oldpte, 7071 (oldpte | set) & ~clear)) 7072 cpu_spinwait(); 7073 7074 if ((oldpte & mask) == val) 7075 vm_page_dirty(m); 7076 pmap_invalidate_page(pmap, pv->pv_va, true); 7077 } 7078 PMAP_UNLOCK(pmap); 7079 } 7080 rw_wunlock(lock); 7081 vm_page_aflag_clear(m, PGA_WRITEABLE); 7082} 7083 7084/* 7085 * pmap_ts_referenced: 7086 * 7087 * Return a count of reference bits for a page, clearing those bits. 7088 * It is not necessary for every reference bit to be cleared, but it 7089 * is necessary that 0 only be returned when there are truly no 7090 * reference bits set. 7091 * 7092 * As an optimization, update the page's dirty field if a modified bit is 7093 * found while counting reference bits. This opportunistic update can be 7094 * performed at low cost and can eliminate the need for some future calls 7095 * to pmap_is_modified(). 
However, since this function stops after 7096 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 7097 * dirty pages. Those dirty pages will only be detected by a future call 7098 * to pmap_is_modified(). 7099 */ 7100int 7101pmap_ts_referenced(vm_page_t m) 7102{ 7103 struct md_page *pvh; 7104 pv_entry_t pv, pvf; 7105 pmap_t pmap; 7106 struct rwlock *lock; 7107 pt_entry_t *pte, tpte; 7108 vm_offset_t va; 7109 vm_paddr_t pa; 7110 int cleared, md_gen, not_cleared, pvh_gen; 7111 struct spglist free; 7112 7113 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7114 ("pmap_ts_referenced: page %p is not managed", m)); 7115 SLIST_INIT(&free); 7116 cleared = 0; 7117 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 7118 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7119 rw_wlock(lock); 7120retry: 7121 not_cleared = 0; 7122 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 7123 goto small_mappings; 7124 pv = pvf; 7125 do { 7126 if (pvf == NULL) 7127 pvf = pv; 7128 pmap = PV_PMAP(pv); 7129 if (!PMAP_TRYLOCK(pmap)) { 7130 pvh_gen = pvh->pv_gen; 7131 rw_wunlock(lock); 7132 PMAP_LOCK(pmap); 7133 rw_wlock(lock); 7134 if (pvh_gen != pvh->pv_gen) { 7135 PMAP_UNLOCK(pmap); 7136 goto retry; 7137 } 7138 } 7139 va = pv->pv_va; 7140 pte = pmap_pte_exists(pmap, va, 2, __func__); 7141 tpte = pmap_load(pte); 7142 if (pmap_pte_dirty(pmap, tpte)) { 7143 /* 7144 * Although "tpte" is mapping a 2MB page, because 7145 * this function is called at a 4KB page granularity, 7146 * we only update the 4KB page under test. 7147 */ 7148 vm_page_dirty(m); 7149 } 7150 if ((tpte & ATTR_AF) != 0) { 7151 pa = VM_PAGE_TO_PHYS(m); 7152 7153 /* 7154 * Since this reference bit is shared by 512 4KB pages, 7155 * it should not be cleared every time it is tested. 7156 * Apply a simple "hash" function on the physical page 7157 * number, the virtual superpage number, and the pmap 7158 * address to select one 4KB page out of the 512 on 7159 * which testing the reference bit will result in 7160 * clearing that reference bit. This function is 7161 * designed to avoid the selection of the same 4KB page 7162 * for every 2MB page mapping. 7163 * 7164 * On demotion, a mapping that hasn't been referenced 7165 * is simply destroyed. To avoid the possibility of a 7166 * subsequent page fault on a demoted wired mapping, 7167 * always leave its reference bit set. Moreover, 7168 * since the superpage is wired, the current state of 7169 * its reference bit won't affect page replacement. 7170 */ 7171 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^ 7172 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && 7173 (tpte & ATTR_SW_WIRED) == 0) { 7174 pmap_clear_bits(pte, ATTR_AF); 7175 pmap_invalidate_page(pmap, va, true); 7176 cleared++; 7177 } else 7178 not_cleared++; 7179 } 7180 PMAP_UNLOCK(pmap); 7181 /* Rotate the PV list if it has more than one entry. 
*/ 7182 if (TAILQ_NEXT(pv, pv_next) != NULL) { 7183 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 7184 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 7185 pvh->pv_gen++; 7186 } 7187 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 7188 goto out; 7189 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 7190small_mappings: 7191 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 7192 goto out; 7193 pv = pvf; 7194 do { 7195 if (pvf == NULL) 7196 pvf = pv; 7197 pmap = PV_PMAP(pv); 7198 if (!PMAP_TRYLOCK(pmap)) { 7199 pvh_gen = pvh->pv_gen; 7200 md_gen = m->md.pv_gen; 7201 rw_wunlock(lock); 7202 PMAP_LOCK(pmap); 7203 rw_wlock(lock); 7204 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 7205 PMAP_UNLOCK(pmap); 7206 goto retry; 7207 } 7208 } 7209 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__); 7210 tpte = pmap_load(pte); 7211 if (pmap_pte_dirty(pmap, tpte)) 7212 vm_page_dirty(m); 7213 if ((tpte & ATTR_AF) != 0) { 7214 if ((tpte & ATTR_SW_WIRED) == 0) { 7215 /* 7216 * Clear the accessed bit in this L3 entry 7217 * regardless of the contiguous bit. 7218 */ 7219 pmap_clear_bits(pte, ATTR_AF); 7220 pmap_invalidate_page(pmap, pv->pv_va, true); 7221 cleared++; 7222 } else 7223 not_cleared++; 7224 } else if ((tpte & ATTR_CONTIGUOUS) != 0 && 7225 (pmap_load_l3c(pte) & ATTR_AF) != 0) { 7226 /* 7227 * An L3C superpage mapping is regarded as accessed 7228 * until the accessed bit has been cleared in all 7229 * of its constituent entries. 7230 */ 7231 not_cleared++; 7232 } 7233 PMAP_UNLOCK(pmap); 7234 /* Rotate the PV list if it has more than one entry. */ 7235 if (TAILQ_NEXT(pv, pv_next) != NULL) { 7236 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 7237 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 7238 m->md.pv_gen++; 7239 } 7240 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 7241 not_cleared < PMAP_TS_REFERENCED_MAX); 7242out: 7243 rw_wunlock(lock); 7244 vm_page_free_pages_toq(&free, true); 7245 return (cleared + not_cleared); 7246} 7247 7248/* 7249 * Apply the given advice to the specified range of addresses within the 7250 * given pmap. Depending on the advice, clear the referenced and/or 7251 * modified flags in each mapping and set the mapped page's dirty field. 
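 *
 * Only MADV_DONTNEED and MADV_FREE are handled.  For MADV_DONTNEED, a
 * dirty mapping's state is first transferred to the vm_page via
 * vm_page_dirty() so that future calls to pmap_is_modified() can be
 * avoided; for MADV_FREE it is not.  In both cases the mapping is then
 * made clean and unreferenced, i.e. write permission and ATTR_AF are
 * cleared.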
7252 */ 7253void 7254pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 7255{ 7256 struct rwlock *lock; 7257 vm_offset_t va, va_next, dva; 7258 vm_page_t m; 7259 pd_entry_t *l0, *l1, *l2, oldl2; 7260 pt_entry_t *l3, *dl3, oldl3; 7261 7262 PMAP_ASSERT_STAGE1(pmap); 7263 7264 if (advice != MADV_DONTNEED && advice != MADV_FREE) 7265 return; 7266 7267 PMAP_LOCK(pmap); 7268 for (; sva < eva; sva = va_next) { 7269 l0 = pmap_l0(pmap, sva); 7270 if (pmap_load(l0) == 0) { 7271 va_next = (sva + L0_SIZE) & ~L0_OFFSET; 7272 if (va_next < sva) 7273 va_next = eva; 7274 continue; 7275 } 7276 7277 va_next = (sva + L1_SIZE) & ~L1_OFFSET; 7278 if (va_next < sva) 7279 va_next = eva; 7280 l1 = pmap_l0_to_l1(l0, sva); 7281 if (pmap_load(l1) == 0) 7282 continue; 7283 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) { 7284 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 7285 continue; 7286 } 7287 7288 va_next = (sva + L2_SIZE) & ~L2_OFFSET; 7289 if (va_next < sva) 7290 va_next = eva; 7291 l2 = pmap_l1_to_l2(l1, sva); 7292 oldl2 = pmap_load(l2); 7293 if (oldl2 == 0) 7294 continue; 7295 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { 7296 if ((oldl2 & ATTR_SW_MANAGED) == 0) 7297 continue; 7298 lock = NULL; 7299 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { 7300 if (lock != NULL) 7301 rw_wunlock(lock); 7302 7303 /* 7304 * The 2MB page mapping was destroyed. 7305 */ 7306 continue; 7307 } 7308 7309 /* 7310 * Unless the page mappings are wired, remove the 7311 * mapping to a single page so that a subsequent 7312 * access may repromote. Choosing the last page 7313 * within the address range [sva, min(va_next, eva)) 7314 * generally results in more repromotions. Since the 7315 * underlying page table page is fully populated, this 7316 * removal never frees a page table page. 7317 */ 7318 if ((oldl2 & ATTR_SW_WIRED) == 0) { 7319 va = eva; 7320 if (va > va_next) 7321 va = va_next; 7322 va -= PAGE_SIZE; 7323 KASSERT(va >= sva, 7324 ("pmap_advise: no address gap")); 7325 l3 = pmap_l2_to_l3(l2, va); 7326 KASSERT(pmap_load(l3) != 0, 7327 ("pmap_advise: invalid PTE")); 7328 pmap_remove_l3(pmap, l3, va, pmap_load(l2), 7329 NULL, &lock); 7330 } 7331 if (lock != NULL) 7332 rw_wunlock(lock); 7333 } 7334 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, 7335 ("pmap_advise: invalid L2 entry after demotion")); 7336 if (va_next > eva) 7337 va_next = eva; 7338 va = va_next; 7339 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, 7340 sva += L3_SIZE) { 7341 oldl3 = pmap_load(l3); 7342 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != 7343 (ATTR_SW_MANAGED | L3_PAGE)) 7344 goto maybe_invlrng; 7345 else if (pmap_pte_dirty(pmap, oldl3)) { 7346 if (advice == MADV_DONTNEED) { 7347 /* 7348 * Future calls to pmap_is_modified() 7349 * can be avoided by making the page 7350 * dirty now. 7351 */ 7352 m = PTE_TO_VM_PAGE(oldl3); 7353 vm_page_dirty(m); 7354 } 7355 if ((oldl3 & ATTR_CONTIGUOUS) != 0) { 7356 /* 7357 * Unconditionally demote the L3C 7358 * superpage because we do not allow 7359 * writeable, clean superpages. 7360 */ 7361 (void)pmap_demote_l3c(pmap, l3, sva); 7362 7363 /* 7364 * Destroy the final mapping before the 7365 * next L3C boundary or va_next, 7366 * whichever comes first, so that a 7367 * subsequent access may act as a 7368 * repromotion trigger. 
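 *
 * With 4KB base pages an L3C range spans 64KB, so the page chosen
 * below is
 *
 *	dva = MIN((sva & ~L3C_OFFSET) + L3C_SIZE - PAGE_SIZE,
 *	    va_next - PAGE_SIZE);
 *
 * i.e. the last 4KB page of the current 64KB-aligned window, clamped
 * to the end of the range being processed.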
*/ 7370 if ((oldl3 & ATTR_SW_WIRED) == 0) { 7371 dva = MIN((sva & ~L3C_OFFSET) + 7372 L3C_SIZE - PAGE_SIZE, 7373 va_next - PAGE_SIZE); 7374 dl3 = pmap_l2_to_l3(l2, dva); 7375 KASSERT(pmap_load(dl3) != 0, 7376 ("pmap_advise: invalid PTE")); 7377 lock = NULL; 7378 pmap_remove_l3(pmap, dl3, dva, 7379 pmap_load(l2), NULL, &lock); 7380 if (lock != NULL) 7381 rw_wunlock(lock); 7382 } 7383 7384 /* 7385 * The L3 entry's accessed bit may have 7386 * changed. 7387 */ 7388 oldl3 = pmap_load(l3); 7389 } 7390 7391 /* 7392 * Check that we did not just destroy this entry so 7393 * we avoid corrupting the page table. 7394 */ 7395 if (oldl3 != 0) { 7396 while (!atomic_fcmpset_long(l3, &oldl3, 7397 (oldl3 & ~ATTR_AF) | 7398 ATTR_S1_AP(ATTR_S1_AP_RO))) 7399 cpu_spinwait(); 7400 } 7401 } else if ((oldl3 & ATTR_AF) != 0) { 7402 /* 7403 * Clear the accessed bit in this L3 entry 7404 * regardless of the contiguous bit. 7405 */ 7406 pmap_clear_bits(l3, ATTR_AF); 7407 } else 7408 goto maybe_invlrng; 7409 if (va == va_next) 7410 va = sva; 7411 continue; 7412maybe_invlrng: 7413 if (va != va_next) { 7414 pmap_s1_invalidate_range(pmap, va, sva, true); 7415 va = va_next; 7416 } 7417 } 7418 if (va != va_next) 7419 pmap_s1_invalidate_range(pmap, va, sva, true); 7420 } 7421 PMAP_UNLOCK(pmap); 7422} 7423 7424/* 7425 * Clear the modify bits on the specified physical page. 7426 */ 7427void 7428pmap_clear_modify(vm_page_t m) 7429{ 7430 struct md_page *pvh; 7431 struct rwlock *lock; 7432 pmap_t pmap; 7433 pv_entry_t next_pv, pv; 7434 pd_entry_t *l2, oldl2; 7435 pt_entry_t *l3, oldl3; 7436 vm_offset_t va; 7437 int md_gen, pvh_gen; 7438 7439 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 7440 ("pmap_clear_modify: page %p is not managed", m)); 7441 vm_page_assert_busied(m); 7442 7443 if (!pmap_page_is_write_mapped(m)) 7444 return; 7445 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m); 7446 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 7447 rw_wlock(lock); 7448restart: 7449 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 7450 pmap = PV_PMAP(pv); 7451 PMAP_ASSERT_STAGE1(pmap); 7452 if (!PMAP_TRYLOCK(pmap)) { 7453 pvh_gen = pvh->pv_gen; 7454 rw_wunlock(lock); 7455 PMAP_LOCK(pmap); 7456 rw_wlock(lock); 7457 if (pvh_gen != pvh->pv_gen) { 7458 PMAP_UNLOCK(pmap); 7459 goto restart; 7460 } 7461 } 7462 va = pv->pv_va; 7463 l2 = pmap_l2(pmap, va); 7464 oldl2 = pmap_load(l2); 7465 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ 7466 if ((oldl2 & ATTR_SW_DBM) != 0 && 7467 pmap_demote_l2_locked(pmap, l2, va, &lock) && 7468 (oldl2 & ATTR_SW_WIRED) == 0) { 7469 /* 7470 * Write protect the mapping to a single page so that 7471 * a subsequent write access may repromote.
7472 */ 7473 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); 7474 l3 = pmap_l2_to_l3(l2, va); 7475 oldl3 = pmap_load(l3); 7476 while (!atomic_fcmpset_long(l3, &oldl3, 7477 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO))) 7478 cpu_spinwait(); 7479 vm_page_dirty(m); 7480 pmap_s1_invalidate_page(pmap, va, true); 7481 } 7482 PMAP_UNLOCK(pmap); 7483 } 7484 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 7485 pmap = PV_PMAP(pv); 7486 PMAP_ASSERT_STAGE1(pmap); 7487 if (!PMAP_TRYLOCK(pmap)) { 7488 md_gen = m->md.pv_gen; 7489 pvh_gen = pvh->pv_gen; 7490 rw_wunlock(lock); 7491 PMAP_LOCK(pmap); 7492 rw_wlock(lock); 7493 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 7494 PMAP_UNLOCK(pmap); 7495 goto restart; 7496 } 7497 } 7498 l2 = pmap_l2(pmap, pv->pv_va); 7499 l3 = pmap_l2_to_l3(l2, pv->pv_va); 7500 oldl3 = pmap_load(l3); 7501 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 || 7502 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) != 7503 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)), 7504 ("writeable L3C superpage not dirty")); 7505 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { 7506 if ((oldl3 & ATTR_CONTIGUOUS) != 0) 7507 (void)pmap_demote_l3c(pmap, l3, pv->pv_va); 7508 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO)); 7509 pmap_s1_invalidate_page(pmap, pv->pv_va, true); 7510 } 7511 PMAP_UNLOCK(pmap); 7512 } 7513 rw_wunlock(lock); 7514} 7515 7516void * 7517pmap_mapbios(vm_paddr_t pa, vm_size_t size) 7518{ 7519 struct pmap_preinit_mapping *ppim; 7520 vm_offset_t va, offset; 7521 pd_entry_t old_l2e, *pde; 7522 pt_entry_t *l2; 7523 int i, lvl, l2_blocks, free_l2_count, start_idx; 7524 7525 if (!vm_initialized) { 7526 /* 7527 * No L3 ptables so map entire L2 blocks where start VA is: 7528 * preinit_map_va + start_idx * L2_SIZE 7529 * There may be duplicate mappings (multiple VA -> same PA) but 7530 * ARM64 dcache is always PIPT so that's acceptable. 
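 *
 * The number of blocks is derived from the 2MB-aligned span covering
 * [pa, pa + size); for example, a 1MB request that begins 1.5MB into
 * an L2 block crosses a block boundary and therefore consumes two of
 * the PMAP_PREINIT_MAPPING_COUNT slots below.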
7531 */ 7532 if (size == 0) 7533 return (NULL); 7534 7535 /* Calculate how many L2 blocks are needed for the mapping */ 7536 l2_blocks = (roundup2(pa + size, L2_SIZE) - 7537 rounddown2(pa, L2_SIZE)) >> L2_SHIFT; 7538 7539 offset = pa & L2_OFFSET; 7540 7541 if (preinit_map_va == 0) 7542 return (NULL); 7543 7544 /* Map 2MiB L2 blocks from reserved VA space */ 7545 7546 free_l2_count = 0; 7547 start_idx = -1; 7548 /* Find enough free contiguous VA space */ 7549 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7550 ppim = pmap_preinit_mapping + i; 7551 if (free_l2_count > 0 && ppim->pa != 0) { 7552 /* Not enough space here */ 7553 free_l2_count = 0; 7554 start_idx = -1; 7555 continue; 7556 } 7557 7558 if (ppim->pa == 0) { 7559 /* Free L2 block */ 7560 if (start_idx == -1) 7561 start_idx = i; 7562 free_l2_count++; 7563 if (free_l2_count == l2_blocks) 7564 break; 7565 } 7566 } 7567 if (free_l2_count != l2_blocks) 7568 panic("%s: too many preinit mappings", __func__); 7569 7570 va = preinit_map_va + (start_idx * L2_SIZE); 7571 for (i = start_idx; i < start_idx + l2_blocks; i++) { 7572 /* Mark entries as allocated */ 7573 ppim = pmap_preinit_mapping + i; 7574 ppim->pa = pa; 7575 ppim->va = va + offset; 7576 ppim->size = size; 7577 } 7578 7579 /* Map L2 blocks */ 7580 pa = rounddown2(pa, L2_SIZE); 7581 old_l2e = 0; 7582 for (i = 0; i < l2_blocks; i++) { 7583 pde = pmap_pde(kernel_pmap, va, &lvl); 7584 KASSERT(pde != NULL, 7585 ("pmap_mapbios: Invalid page entry, va: 0x%lx", 7586 va)); 7587 KASSERT(lvl == 1, 7588 ("pmap_mapbios: Invalid level %d", lvl)); 7589 7590 /* Insert L2_BLOCK */ 7591 l2 = pmap_l1_to_l2(pde, va); 7592 old_l2e |= pmap_load_store(l2, 7593 PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN | 7594 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | 7595 L2_BLOCK); 7596 7597 va += L2_SIZE; 7598 pa += L2_SIZE; 7599 } 7600 if ((old_l2e & ATTR_DESCR_VALID) != 0) 7601 pmap_s1_invalidate_all(kernel_pmap); 7602 else { 7603 /* 7604 * Because the old entries were invalid and the new 7605 * mappings are not executable, an isb is not required. 
7606 */ 7607 dsb(ishst); 7608 } 7609 7610 va = preinit_map_va + (start_idx * L2_SIZE); 7611 7612 } else { 7613 /* kva_alloc may be used to map the pages */ 7614 offset = pa & PAGE_MASK; 7615 size = round_page(offset + size); 7616 7617 va = kva_alloc(size); 7618 if (va == 0) 7619 panic("%s: Couldn't allocate KVA", __func__); 7620 7621 pde = pmap_pde(kernel_pmap, va, &lvl); 7622 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); 7623 7624 /* L3 table is linked */ 7625 va = trunc_page(va); 7626 pa = trunc_page(pa); 7627 pmap_kenter(va, size, pa, memory_mapping_mode(pa)); 7628 } 7629 7630 return ((void *)(va + offset)); 7631} 7632 7633void 7634pmap_unmapbios(void *p, vm_size_t size) 7635{ 7636 struct pmap_preinit_mapping *ppim; 7637 vm_offset_t offset, va, va_trunc; 7638 pd_entry_t *pde; 7639 pt_entry_t *l2; 7640 int i, lvl, l2_blocks, block; 7641 bool preinit_map; 7642 7643 va = (vm_offset_t)p; 7644 l2_blocks = 7645 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; 7646 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); 7647 7648 /* Remove preinit mapping */ 7649 preinit_map = false; 7650 block = 0; 7651 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 7652 ppim = pmap_preinit_mapping + i; 7653 if (ppim->va == va) { 7654 KASSERT(ppim->size == size, 7655 ("pmap_unmapbios: size mismatch")); 7656 ppim->va = 0; 7657 ppim->pa = 0; 7658 ppim->size = 0; 7659 preinit_map = true; 7660 offset = block * L2_SIZE; 7661 va_trunc = rounddown2(va, L2_SIZE) + offset; 7662 7663 /* Remove L2_BLOCK */ 7664 pde = pmap_pde(kernel_pmap, va_trunc, &lvl); 7665 KASSERT(pde != NULL, 7666 ("pmap_unmapbios: Invalid page entry, va: 0x%lx", 7667 va_trunc)); 7668 l2 = pmap_l1_to_l2(pde, va_trunc); 7669 pmap_clear(l2); 7670 7671 if (block == (l2_blocks - 1)) 7672 break; 7673 block++; 7674 } 7675 } 7676 if (preinit_map) { 7677 pmap_s1_invalidate_all(kernel_pmap); 7678 return; 7679 } 7680 7681 /* Unmap the pages reserved with kva_alloc. */ 7682 if (vm_initialized) { 7683 offset = va & PAGE_MASK; 7684 size = round_page(offset + size); 7685 va = trunc_page(va); 7686 7687 /* Unmap and invalidate the pages */ 7688 pmap_kremove_device(va, size); 7689 7690 kva_free(va, size); 7691 } 7692} 7693 7694/* 7695 * Sets the memory attribute for the specified page. 7696 */ 7697void 7698pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 7699{ 7700 7701 m->md.pv_memattr = ma; 7702 7703 /* 7704 * If "m" is a normal page, update its direct mapping. This update 7705 * can be relied upon to perform any cache operations that are 7706 * required for data coherence. 7707 */ 7708 if ((m->flags & PG_FICTITIOUS) == 0 && 7709 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 7710 m->md.pv_memattr) != 0) 7711 panic("memory attribute change on the direct map failed"); 7712} 7713 7714/* 7715 * Changes the specified virtual address range's memory type to that given by 7716 * the parameter "mode". The specified virtual address range must be 7717 * completely contained within either the direct map or the kernel map. If 7718 * the virtual address range is contained within the kernel map, then the 7719 * memory type for each of the corresponding ranges of the direct map is also 7720 * changed. (The corresponding ranges of the direct map are those ranges that 7721 * map the same physical pages as the specified virtual address range.) 
These 7722 * changes to the direct map are necessary because Intel describes the 7723 * behavior of their processors as "undefined" if two or more mappings to the 7724 * same physical page have different memory types. 7725 * 7726 * Returns zero if the change completed successfully, and either EINVAL or 7727 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 7728 * of the virtual address range was not mapped, and ENOMEM is returned if 7729 * there was insufficient memory available to complete the change. In the 7730 * latter case, the memory type may have been changed on some part of the 7731 * virtual address range or the direct map. 7732 */ 7733int 7734pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 7735{ 7736 int error; 7737 7738 PMAP_LOCK(kernel_pmap); 7739 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false); 7740 PMAP_UNLOCK(kernel_pmap); 7741 return (error); 7742} 7743 7744/* 7745 * Changes the specified virtual address range's protections to those 7746 * specified by "prot". Like pmap_change_attr(), protections for aliases 7747 * in the direct map are updated as well. Protections on aliasing mappings may 7748 * be a subset of the requested protections; for example, mappings in the direct 7749 * map are never executable. 7750 */ 7751int 7752pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot) 7753{ 7754 int error; 7755 7756 /* Only supported within the kernel map. */ 7757 if (va < VM_MIN_KERNEL_ADDRESS) 7758 return (EINVAL); 7759 7760 PMAP_LOCK(kernel_pmap); 7761 error = pmap_change_props_locked(va, size, prot, -1, false); 7762 PMAP_UNLOCK(kernel_pmap); 7763 return (error); 7764} 7765 7766static int 7767pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, 7768 int mode, bool skip_unmapped) 7769{ 7770 vm_offset_t base, offset, tmpva; 7771 vm_size_t pte_size; 7772 vm_paddr_t pa; 7773 pt_entry_t pte, *ptep, *newpte; 7774 pt_entry_t bits, mask; 7775 int lvl, rv; 7776 7777 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 7778 base = trunc_page(va); 7779 offset = va & PAGE_MASK; 7780 size = round_page(offset + size); 7781 7782 if (!VIRT_IN_DMAP(base) && 7783 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS)) 7784 return (EINVAL); 7785 7786 bits = 0; 7787 mask = 0; 7788 if (mode != -1) { 7789 bits = ATTR_S1_IDX(mode); 7790 mask = ATTR_S1_IDX_MASK; 7791 if (mode == VM_MEMATTR_DEVICE) { 7792 mask |= ATTR_S1_XN; 7793 bits |= ATTR_S1_XN; 7794 } 7795 } 7796 if (prot != VM_PROT_NONE) { 7797 /* Don't mark the DMAP as executable. It never is on arm64. */ 7798 if (VIRT_IN_DMAP(base)) { 7799 prot &= ~VM_PROT_EXECUTE; 7800 /* 7801 * XXX Mark the DMAP as writable for now. We rely 7802 * on this in ddb & dtrace to insert breakpoint 7803 * instructions. 7804 */ 7805 prot |= VM_PROT_WRITE; 7806 } 7807 7808 if ((prot & VM_PROT_WRITE) == 0) { 7809 bits |= ATTR_S1_AP(ATTR_S1_AP_RO); 7810 } 7811 if ((prot & VM_PROT_EXECUTE) == 0) { 7812 bits |= ATTR_S1_PXN; 7813 } 7814 bits |= ATTR_S1_UXN; 7815 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN; 7816 } 7817 7818 for (tmpva = base; tmpva < base + size; ) { 7819 ptep = pmap_pte(kernel_pmap, tmpva, &lvl); 7820 if (ptep == NULL && !skip_unmapped) { 7821 return (EINVAL); 7822 } else if ((ptep == NULL && skip_unmapped) || 7823 (pmap_load(ptep) & mask) == bits) { 7824 /* 7825 * We already have the correct attribute or there 7826 * is no memory mapped at this address and we are 7827 * skipping unmapped memory. 
7828 */ 7829 switch (lvl) { 7830 default: 7831 panic("Invalid DMAP table level: %d\n", lvl); 7832 case 1: 7833 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; 7834 break; 7835 case 2: 7836 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; 7837 break; 7838 case 3: 7839 tmpva += PAGE_SIZE; 7840 break; 7841 } 7842 } else { 7843 /* We can't demote/promote this entry */ 7844 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0); 7845 7846 /* 7847 * Split the entry into a level 3 table, then 7848 * set the new attribute. 7849 */ 7850 switch (lvl) { 7851 default: 7852 panic("Invalid DMAP table level: %d\n", lvl); 7853 case 1: 7854 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 7855 if ((tmpva & L1_OFFSET) == 0 && 7856 (base + size - tmpva) >= L1_SIZE) { 7857 pte_size = L1_SIZE; 7858 break; 7859 } 7860 newpte = pmap_demote_l1(kernel_pmap, ptep, 7861 tmpva & ~L1_OFFSET); 7862 if (newpte == NULL) 7863 return (EINVAL); 7864 ptep = pmap_l1_to_l2(ptep, tmpva); 7865 /* FALLTHROUGH */ 7866 case 2: 7867 if ((tmpva & L2_OFFSET) == 0 && 7868 (base + size - tmpva) >= L2_SIZE) { 7869 pte_size = L2_SIZE; 7870 break; 7871 } 7872 newpte = pmap_demote_l2(kernel_pmap, ptep, 7873 tmpva); 7874 if (newpte == NULL) 7875 return (EINVAL); 7876 ptep = pmap_l2_to_l3(ptep, tmpva); 7877 /* FALLTHROUGH */ 7878 case 3: 7879 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) { 7880 if ((tmpva & L3C_OFFSET) == 0 && 7881 (base + size - tmpva) >= L3C_SIZE) { 7882 pte_size = L3C_SIZE; 7883 break; 7884 } 7885 if (!pmap_demote_l3c(kernel_pmap, ptep, 7886 tmpva)) 7887 return (EINVAL); 7888 } 7889 pte_size = PAGE_SIZE; 7890 break; 7891 } 7892 7893 /* Update the entry */ 7894 pte = pmap_load(ptep); 7895 pte &= ~mask; 7896 pte |= bits; 7897 7898 pmap_update_entry(kernel_pmap, ptep, pte, tmpva, 7899 pte_size); 7900 7901 pa = PTE_TO_PHYS(pte); 7902 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) { 7903 /* 7904 * Keep the DMAP memory in sync. 7905 */ 7906 rv = pmap_change_props_locked( 7907 PHYS_TO_DMAP(pa), pte_size, 7908 prot, mode, true); 7909 if (rv != 0) 7910 return (rv); 7911 } 7912 7913 /* 7914 * If moving to a non-cacheable entry, flush 7915 * the cache. 7916 */ 7917 if (mode == VM_MEMATTR_UNCACHEABLE) 7918 cpu_dcache_wbinv_range((void *)tmpva, pte_size); 7919 tmpva += pte_size; 7920 } 7921 } 7922 7923 return (0); 7924 } 7925 7926 /* 7927 * Create an L2 table to map all addresses within an L1 mapping.
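 * That is, the L1 block entry is replaced by a link to a newly allocated L2
 * table whose block entries map the same physical range with the same
 * attributes.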
7928 */ 7929 static pt_entry_t * 7930 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) 7931 { 7932 pt_entry_t *l2, newl2, oldl1; 7933 vm_offset_t tmpl1; 7934 vm_paddr_t l2phys, phys; 7935 vm_page_t ml2; 7936 int i; 7937 7938 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 7939 oldl1 = pmap_load(l1); 7940 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 7941 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, 7942 ("pmap_demote_l1: Demoting a non-block entry")); 7943 KASSERT((va & L1_OFFSET) == 0, 7944 ("pmap_demote_l1: Invalid virtual address %#lx", va)); 7945 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, 7946 ("pmap_demote_l1: Level 1 table shouldn't be managed")); 7947 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0, 7948 ("pmap_demote_l1: Demoting entry with no-demote flag set")); 7949 7950 tmpl1 = 0; 7951 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { 7952 tmpl1 = kva_alloc(PAGE_SIZE); 7953 if (tmpl1 == 0) 7954 return (NULL); 7955 } 7956 7957 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) == 7958 NULL) { 7959 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" 7960 " in pmap %p", va, pmap); 7961 l2 = NULL; 7962 goto fail; 7963 } 7964 7965 l2phys = VM_PAGE_TO_PHYS(ml2); 7966 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); 7967 7968 /* The physical address that the range points at */ 7969 phys = PTE_TO_PHYS(oldl1); 7970 /* The attributes from the old l1 entry to be copied */ 7971 newl2 = oldl1 & ATTR_MASK; 7972 7973 /* Create the new entries */ 7974 for (i = 0; i < Ln_ENTRIES; i++) { 7975 l2[i] = newl2 | phys; 7976 phys += L2_SIZE; 7977 } 7978 KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), 7979 ("Invalid l2 page (%lx != %lx)", l2[0], 7980 (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); 7981 7982 if (tmpl1 != 0) { 7983 pmap_kenter(tmpl1, PAGE_SIZE, 7984 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, 7985 VM_MEMATTR_WRITE_BACK); 7986 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); 7987 } 7988 7989 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); 7990 7991 fail: 7992 if (tmpl1 != 0) { 7993 pmap_kremove(tmpl1); 7994 kva_free(tmpl1, PAGE_SIZE); 7995 } 7996 7997 return (l2); 7998 } 7999 8000 static void 8001 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) 8002 { 8003 pt_entry_t *l3; 8004 8005 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { 8006 *l3 = newl3; 8007 newl3 += L3_SIZE; 8008 } 8009 } 8010 8011 static void 8012 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused) 8013 { 8014 #ifdef INVARIANTS 8015 #ifdef DIAGNOSTIC 8016 pt_entry_t *xl3p, *yl3p; 8017 8018 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES; 8019 xl3p++, newl3e += PAGE_SIZE) { 8020 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) { 8021 printf("pmap_demote_l2: xl3e %zd and newl3e map " 8022 "different pages: found %#lx, expected %#lx\n", 8023 xl3p - firstl3p, pmap_load(xl3p), newl3e); 8024 printf("page table dump\n"); 8025 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES; 8026 yl3p++) { 8027 printf("%zd %#lx\n", yl3p - firstl3p, 8028 pmap_load(yl3p)); 8029 } 8030 panic("firstpte"); 8031 } 8032 } 8033 #else 8034 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e), 8035 ("pmap_demote_l2: firstl3 and newl3e map different physical" 8036 " addresses")); 8037 #endif 8038 #endif 8039 } 8040 8041 static void 8042 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, 8043 struct rwlock **lockp) 8044 { 8045 struct spglist free; 8046 8047 SLIST_INIT(&free); 8048 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, 8049 lockp); 8050
vm_page_free_pages_toq(&free, true); 8051} 8052 8053/* 8054 * Create an L3 table to map all addresses within an L2 mapping. 8055 */ 8056static pt_entry_t * 8057pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, 8058 struct rwlock **lockp) 8059{ 8060 pt_entry_t *l3, newl3, oldl2; 8061 vm_offset_t tmpl2; 8062 vm_paddr_t l3phys; 8063 vm_page_t ml3; 8064 8065 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8066 PMAP_ASSERT_STAGE1(pmap); 8067 KASSERT(ADDR_IS_CANONICAL(va), 8068 ("%s: Address not in canonical form: %lx", __func__, va)); 8069 8070 l3 = NULL; 8071 oldl2 = pmap_load(l2); 8072 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, 8073 ("pmap_demote_l2: Demoting a non-block entry")); 8074 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0, 8075 ("pmap_demote_l2: Demoting entry with no-demote flag set")); 8076 va &= ~L2_OFFSET; 8077 8078 tmpl2 = 0; 8079 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { 8080 tmpl2 = kva_alloc(PAGE_SIZE); 8081 if (tmpl2 == 0) 8082 return (NULL); 8083 } 8084 8085 /* 8086 * Invalidate the 2MB page mapping and return "failure" if the 8087 * mapping was never accessed. 8088 */ 8089 if ((oldl2 & ATTR_AF) == 0) { 8090 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 8091 ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); 8092 pmap_demote_l2_abort(pmap, va, l2, lockp); 8093 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", 8094 va, pmap); 8095 goto fail; 8096 } 8097 8098 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { 8099 KASSERT((oldl2 & ATTR_SW_WIRED) == 0, 8100 ("pmap_demote_l2: page table page for a wired mapping" 8101 " is missing")); 8102 8103 /* 8104 * If the page table page is missing and the mapping 8105 * is for a kernel address, the mapping must belong to 8106 * either the direct map or the early kernel memory. 8107 * Page table pages are preallocated for every other 8108 * part of the kernel address space, so the direct map 8109 * region and early kernel memory are the only parts of the 8110 * kernel address space that must be handled here. 8111 */ 8112 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) || 8113 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end), 8114 ("pmap_demote_l2: No saved mpte for va %#lx", va)); 8115 8116 /* 8117 * If the 2MB page mapping belongs to the direct map 8118 * region of the kernel's address space, then the page 8119 * allocation request specifies the highest possible 8120 * priority (VM_ALLOC_INTERRUPT). Otherwise, the 8121 * priority is normal. 8122 */ 8123 ml3 = vm_page_alloc_noobj( 8124 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) | 8125 VM_ALLOC_WIRED); 8126 8127 /* 8128 * If the allocation of the new page table page fails, 8129 * invalidate the 2MB page mapping and return "failure". 8130 */ 8131 if (ml3 == NULL) { 8132 pmap_demote_l2_abort(pmap, va, l2, lockp); 8133 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" 8134 " in pmap %p", va, pmap); 8135 goto fail; 8136 } 8137 ml3->pindex = pmap_l2_pindex(va); 8138 8139 if (!ADDR_IS_KERNEL(va)) { 8140 ml3->ref_count = NL3PG; 8141 pmap_resident_count_inc(pmap, 1); 8142 } 8143 } 8144 l3phys = VM_PAGE_TO_PHYS(ml3); 8145 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); 8146 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; 8147 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 8148 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM), 8149 ("pmap_demote_l2: L2 entry is writeable but not dirty")); 8150 8151 /* 8152 * If the PTP is not leftover from an earlier promotion or it does not 8153 * have ATTR_AF set in every L3E, then fill it. 
The new L3Es will all 8154 * have ATTR_AF set. 8155 * 8156 * When pmap_update_entry() clears the old L2 mapping, it (indirectly) 8157 * performs a dsb(). That dsb() ensures that the stores for filling 8158 * "l3" are visible before "l3" is added to the page table. 8159 */ 8160 if (!vm_page_all_valid(ml3)) 8161 pmap_fill_l3(l3, newl3); 8162 8163 pmap_demote_l2_check(l3, newl3); 8164 8165 /* 8166 * If the mapping has changed attributes, update the L3Es. 8167 */ 8168 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE)) 8169 pmap_fill_l3(l3, newl3); 8170 8171 /* 8172 * Map the temporary page so we don't lose access to the l2 table. 8173 */ 8174 if (tmpl2 != 0) { 8175 pmap_kenter(tmpl2, PAGE_SIZE, 8176 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, 8177 VM_MEMATTR_WRITE_BACK); 8178 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); 8179 } 8180 8181 /* 8182 * The spare PV entries must be reserved prior to demoting the 8183 * mapping, that is, prior to changing the PDE. Otherwise, the state 8184 * of the L2 and the PV lists will be inconsistent, which can result 8185 * in reclaim_pv_chunk() attempting to remove a PV entry from the 8186 * wrong PV list and pmap_pv_demote_l2() failing to find the expected 8187 * PV entry for the 2MB page mapping that is being demoted. 8188 */ 8189 if ((oldl2 & ATTR_SW_MANAGED) != 0) 8190 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); 8191 8192 /* 8193 * Pass PAGE_SIZE so that a single TLB invalidation is performed on 8194 * the 2MB page mapping. 8195 */ 8196 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); 8197 8198 /* 8199 * Demote the PV entry. 8200 */ 8201 if ((oldl2 & ATTR_SW_MANAGED) != 0) 8202 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); 8203 8204 atomic_add_long(&pmap_l2_demotions, 1); 8205 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" 8206 " in pmap %p %lx", va, pmap, l3[0]); 8207 8208fail: 8209 if (tmpl2 != 0) { 8210 pmap_kremove(tmpl2); 8211 kva_free(tmpl2, PAGE_SIZE); 8212 } 8213 8214 return (l3); 8215 8216} 8217 8218static pt_entry_t * 8219pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) 8220{ 8221 struct rwlock *lock; 8222 pt_entry_t *l3; 8223 8224 lock = NULL; 8225 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); 8226 if (lock != NULL) 8227 rw_wunlock(lock); 8228 return (l3); 8229} 8230 8231/* 8232 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings. 8233 */ 8234static bool 8235pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va) 8236{ 8237 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p; 8238 vm_offset_t tmpl3; 8239 register_t intr; 8240 8241 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 8242 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 8243 sizeof(pt_entry_t)) - 1)); 8244 l3c_end = l3c_start + L3C_ENTRIES; 8245 tmpl3 = 0; 8246 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end && 8247 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) { 8248 tmpl3 = kva_alloc(PAGE_SIZE); 8249 if (tmpl3 == 0) 8250 return (false); 8251 pmap_kenter(tmpl3, PAGE_SIZE, 8252 DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET, 8253 VM_MEMATTR_WRITE_BACK); 8254 l3c_start = (pt_entry_t *)(tmpl3 + 8255 ((vm_offset_t)l3c_start & PAGE_MASK)); 8256 l3c_end = (pt_entry_t *)(tmpl3 + 8257 ((vm_offset_t)l3c_end & PAGE_MASK)); 8258 } 8259 mask = 0; 8260 nbits = ATTR_DESCR_VALID; 8261 intr = intr_disable(); 8262 8263 /* 8264 * Break the mappings. 
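	 * Clearing ATTR_DESCR_VALID on every PTE in the set (and invalidating
	 * the TLB below if any of them was ever accessed) provides the
	 * break-before-make step required before the entries are rewritten.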
8265 */ 8266 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 8267 /* 8268 * Clear the mapping's contiguous and valid bits, but leave 8269 * the rest of the entry unchanged, so that a lockless, 8270 * concurrent pmap_kextract() can still lookup the physical 8271 * address. 8272 */ 8273 l3e = pmap_load(tl3p); 8274 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 8275 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS")); 8276 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) != 8277 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)), 8278 ("pmap_demote_l3c: missing ATTR_S1_AP_RW")); 8279 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS | 8280 ATTR_DESCR_VALID))) 8281 cpu_spinwait(); 8282 8283 /* 8284 * Hardware accessed and dirty bit maintenance might only 8285 * update a single L3 entry, so we must combine the accessed 8286 * and dirty bits from this entire set of contiguous L3 8287 * entries. 8288 */ 8289 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 8290 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)) 8291 mask = ATTR_S1_AP_RW_BIT; 8292 nbits |= l3e & ATTR_AF; 8293 } 8294 if ((nbits & ATTR_AF) != 0) { 8295 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) & 8296 ~L3C_OFFSET, true); 8297 } 8298 8299 /* 8300 * Remake the mappings, updating the accessed and dirty bits. 8301 */ 8302 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 8303 l3e = pmap_load(tl3p); 8304 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits)) 8305 cpu_spinwait(); 8306 } 8307 dsb(ishst); 8308 8309 intr_restore(intr); 8310 if (tmpl3 != 0) { 8311 pmap_kremove(tmpl3); 8312 kva_free(tmpl3, PAGE_SIZE); 8313 } 8314 counter_u64_add(pmap_l3c_demotions, 1); 8315 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p", 8316 va, pmap); 8317 return (true); 8318} 8319 8320/* 8321 * Accumulate the accessed and dirty bits within a L3C superpage and 8322 * return the specified PTE with them applied correctly. 8323 */ 8324static pt_entry_t 8325pmap_load_l3c(pt_entry_t *l3p) 8326{ 8327 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p; 8328 8329 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES * 8330 sizeof(pt_entry_t)) - 1)); 8331 l3c_end = l3c_start + L3C_ENTRIES; 8332 mask = 0; 8333 nbits = 0; 8334 /* Iterate over each mapping in the superpage. */ 8335 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) { 8336 l3e = pmap_load(tl3p); 8337 KASSERT((l3e & ATTR_CONTIGUOUS) != 0, 8338 ("pmap_load_l3c: missing ATTR_CONTIGUOUS")); 8339 /* Update mask if the current page has its dirty bit set. */ 8340 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == 8341 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM)) 8342 mask = ATTR_S1_AP_RW_BIT; 8343 /* Update nbits if the accessed bit is set. */ 8344 nbits |= l3e & ATTR_AF; 8345 } 8346 return ((pmap_load(l3p) & ~mask) | nbits); 8347} 8348 8349/* 8350 * Perform the pmap work for mincore(2). If the page is not both referenced and 8351 * modified by this pmap, returns its physical address so that the caller can 8352 * find other mappings. 
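 * The return value is a mask of MINCORE_* flags; for L1 and L2 block
 * mappings it also encodes the superpage level via MINCORE_PSIND().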
8353 */ 8354 int 8355 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap) 8356 { 8357 pt_entry_t *pte, tpte; 8358 vm_paddr_t mask, pa; 8359 int lvl, val; 8360 bool managed; 8361 8362 PMAP_ASSERT_STAGE1(pmap); 8363 PMAP_LOCK(pmap); 8364 pte = pmap_pte(pmap, addr, &lvl); 8365 if (pte != NULL) { 8366 tpte = pmap_load(pte); 8367 8368 switch (lvl) { 8369 case 3: 8370 mask = L3_OFFSET; 8371 break; 8372 case 2: 8373 mask = L2_OFFSET; 8374 break; 8375 case 1: 8376 mask = L1_OFFSET; 8377 break; 8378 default: 8379 panic("pmap_mincore: invalid level %d", lvl); 8380 } 8381 8382 managed = (tpte & ATTR_SW_MANAGED) != 0; 8383 val = MINCORE_INCORE; 8384 if (lvl != 3) 8385 val |= MINCORE_PSIND(3 - lvl); 8386 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed && 8387 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))) 8388 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 8389 if ((tpte & ATTR_AF) == ATTR_AF) 8390 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 8391 8392 pa = PTE_TO_PHYS(tpte) | (addr & mask); 8393 } else { 8394 managed = false; 8395 val = 0; 8396 } 8397 8398 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 8399 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { 8400 *pap = pa; 8401 } 8402 PMAP_UNLOCK(pmap); 8403 return (val); 8404 } 8405 8406 /* 8407 * Garbage collect every ASID that is neither active on a processor nor 8408 * reserved. 8409 */ 8410 static void 8411 pmap_reset_asid_set(pmap_t pmap) 8412 { 8413 pmap_t curpmap; 8414 int asid, cpuid, epoch; 8415 struct asid_set *set; 8416 enum pmap_stage stage; 8417 8418 set = pmap->pm_asid_set; 8419 stage = pmap->pm_stage; 8420 8421 8422 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 8423 mtx_assert(&set->asid_set_mutex, MA_OWNED); 8424 8425 /* 8426 * Ensure that the store to asid_epoch is globally visible before the 8427 * loads from pc_curpmap are performed. 8428 */ 8429 epoch = set->asid_epoch + 1; 8430 if (epoch == INT_MAX) 8431 epoch = 0; 8432 set->asid_epoch = epoch; 8433 dsb(ishst); 8434 if (stage == PM_STAGE1) { 8435 __asm __volatile("tlbi vmalle1is"); 8436 } else { 8437 KASSERT(pmap_clean_stage2_tlbi != NULL, 8438 ("%s: Unset stage 2 tlb invalidation callback\n", 8439 __func__)); 8440 pmap_clean_stage2_tlbi(); 8441 } 8442 dsb(ish); 8443 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE, 8444 set->asid_set_size - 1); 8445 CPU_FOREACH(cpuid) { 8446 if (cpuid == curcpu) 8447 continue; 8448 if (stage == PM_STAGE1) { 8449 curpmap = pcpu_find(cpuid)->pc_curpmap; 8450 PMAP_ASSERT_STAGE1(pmap); 8451 } else { 8452 curpmap = pcpu_find(cpuid)->pc_curvmpmap; 8453 if (curpmap == NULL) 8454 continue; 8455 PMAP_ASSERT_STAGE2(pmap); 8456 } 8457 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set")); 8458 asid = COOKIE_TO_ASID(curpmap->pm_cookie); 8459 if (asid == -1) 8460 continue; 8461 bit_set(set->asid_set, asid); 8462 curpmap->pm_cookie = COOKIE_FROM(asid, epoch); 8463 } 8464 } 8465 8466 /* 8467 * Allocate a new ASID for the specified pmap. 8468 */ 8469 static void 8470 pmap_alloc_asid(pmap_t pmap) 8471 { 8472 struct asid_set *set; 8473 int new_asid; 8474 8475 set = pmap->pm_asid_set; 8476 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 8477 8478 mtx_lock_spin(&set->asid_set_mutex); 8479 8480 /* 8481 * While this processor was waiting to acquire the asid set mutex, 8482 * pmap_reset_asid_set() running on another processor might have 8483 * updated this pmap's cookie to the current epoch, in which case we 8484 * don't need to allocate a new ASID.
8485 */ 8486 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) 8487 goto out; 8488 8489 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size, 8490 &new_asid); 8491 if (new_asid == -1) { 8492 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 8493 set->asid_next, &new_asid); 8494 if (new_asid == -1) { 8495 pmap_reset_asid_set(pmap); 8496 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE, 8497 set->asid_set_size, &new_asid); 8498 KASSERT(new_asid != -1, ("ASID allocation failure")); 8499 } 8500 } 8501 bit_set(set->asid_set, new_asid); 8502 set->asid_next = new_asid + 1; 8503 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch); 8504out: 8505 mtx_unlock_spin(&set->asid_set_mutex); 8506} 8507 8508static uint64_t __read_mostly ttbr_flags; 8509 8510/* 8511 * Compute the value that should be stored in ttbr0 to activate the specified 8512 * pmap. This value may change from time to time. 8513 */ 8514uint64_t 8515pmap_to_ttbr0(pmap_t pmap) 8516{ 8517 uint64_t ttbr; 8518 8519 ttbr = pmap->pm_ttbr; 8520 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); 8521 ttbr |= ttbr_flags; 8522 8523 return (ttbr); 8524} 8525 8526static void 8527pmap_set_cnp(void *arg) 8528{ 8529 uint64_t ttbr0, ttbr1; 8530 u_int cpuid; 8531 8532 cpuid = *(u_int *)arg; 8533 if (cpuid == curcpu) { 8534 /* 8535 * Set the flags while all CPUs are handling the 8536 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls 8537 * to pmap_to_ttbr0 after this will have the CnP flag set. 8538 * The dsb after invalidating the TLB will act as a barrier 8539 * to ensure all CPUs can observe this change. 8540 */ 8541 ttbr_flags |= TTBR_CnP; 8542 } 8543 8544 ttbr0 = READ_SPECIALREG(ttbr0_el1); 8545 ttbr0 |= TTBR_CnP; 8546 8547 ttbr1 = READ_SPECIALREG(ttbr1_el1); 8548 ttbr1 |= TTBR_CnP; 8549 8550 /* Update ttbr{0,1}_el1 with the CnP flag */ 8551 WRITE_SPECIALREG(ttbr0_el1, ttbr0); 8552 WRITE_SPECIALREG(ttbr1_el1, ttbr1); 8553 isb(); 8554 __asm __volatile("tlbi vmalle1is"); 8555 dsb(ish); 8556 isb(); 8557} 8558 8559/* 8560 * Defer enabling some features until we have read the ID registers to know 8561 * if they are supported on all CPUs. 8562 */ 8563static void 8564pmap_init_mp(void *dummy __unused) 8565{ 8566 uint64_t reg; 8567 8568 if (get_kernel_reg(ID_AA64PFR1_EL1, ®)) { 8569 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) { 8570 if (bootverbose) 8571 printf("Enabling BTI\n"); 8572 pmap_bti_support = true; 8573 8574 pmap_bti_ranges_zone = uma_zcreate("BTI ranges", 8575 sizeof(struct rs_el), NULL, NULL, NULL, NULL, 8576 UMA_ALIGN_PTR, 0); 8577 } 8578 } 8579} 8580SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL); 8581 8582/* 8583 * Defer enabling CnP until we have read the ID registers to know if it's 8584 * supported on all CPUs. 
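 * (CnP, "Common not Private", lets CPUs that set the TTBR CnP bit share TLB
 * entries derived from the same translation tables.)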
8585 */ 8586static void 8587pmap_init_cnp(void *dummy __unused) 8588{ 8589 uint64_t reg; 8590 u_int cpuid; 8591 8592 if (!get_kernel_reg(ID_AA64MMFR2_EL1, ®)) 8593 return; 8594 8595 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) { 8596 if (bootverbose) 8597 printf("Enabling CnP\n"); 8598 cpuid = curcpu; 8599 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid); 8600 } 8601 8602} 8603SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL); 8604 8605static bool 8606pmap_activate_int(pmap_t pmap) 8607{ 8608 struct asid_set *set; 8609 int epoch; 8610 8611 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap")); 8612 KASSERT(pmap != kernel_pmap, ("kernel pmap activation")); 8613 8614 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) || 8615 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) { 8616 /* 8617 * Handle the possibility that the old thread was preempted 8618 * after an "ic" or "tlbi" instruction but before it performed 8619 * a "dsb" instruction. If the old thread migrates to a new 8620 * processor, its completion of a "dsb" instruction on that 8621 * new processor does not guarantee that the "ic" or "tlbi" 8622 * instructions performed on the old processor have completed. 8623 */ 8624 dsb(ish); 8625 return (false); 8626 } 8627 8628 set = pmap->pm_asid_set; 8629 KASSERT(set != NULL, ("%s: NULL asid set", __func__)); 8630 8631 /* 8632 * Ensure that the store to curpmap is globally visible before the 8633 * load from asid_epoch is performed. 8634 */ 8635 if (pmap->pm_stage == PM_STAGE1) 8636 PCPU_SET(curpmap, pmap); 8637 else 8638 PCPU_SET(curvmpmap, pmap); 8639 dsb(ish); 8640 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie); 8641 if (epoch >= 0 && epoch != set->asid_epoch) 8642 pmap_alloc_asid(pmap); 8643 8644 if (pmap->pm_stage == PM_STAGE1) { 8645 set_ttbr0(pmap_to_ttbr0(pmap)); 8646 if (PCPU_GET(bcast_tlbi_workaround) != 0) 8647 invalidate_local_icache(); 8648 } 8649 return (true); 8650} 8651 8652void 8653pmap_activate_vm(pmap_t pmap) 8654{ 8655 8656 PMAP_ASSERT_STAGE2(pmap); 8657 8658 (void)pmap_activate_int(pmap); 8659} 8660 8661void 8662pmap_activate(struct thread *td) 8663{ 8664 pmap_t pmap; 8665 8666 pmap = vmspace_pmap(td->td_proc->p_vmspace); 8667 PMAP_ASSERT_STAGE1(pmap); 8668 critical_enter(); 8669 (void)pmap_activate_int(pmap); 8670 critical_exit(); 8671} 8672 8673/* 8674 * Activate the thread we are switching to. 8675 * To simplify the assembly in cpu_throw return the new threads pcb. 8676 */ 8677struct pcb * 8678pmap_switch(struct thread *new) 8679{ 8680 pcpu_bp_harden bp_harden; 8681 struct pcb *pcb; 8682 8683 /* Store the new curthread */ 8684 PCPU_SET(curthread, new); 8685 8686 /* And the new pcb */ 8687 pcb = new->td_pcb; 8688 PCPU_SET(curpcb, pcb); 8689 8690 /* 8691 * TODO: We may need to flush the cache here if switching 8692 * to a user process. 8693 */ 8694 8695 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) { 8696 /* 8697 * Stop userspace from training the branch predictor against 8698 * other processes. This will call into a CPU specific 8699 * function that clears the branch predictor state. 
8700 */ 8701 bp_harden = PCPU_GET(bp_harden); 8702 if (bp_harden != NULL) 8703 bp_harden(); 8704 } 8705 8706 return (pcb); 8707} 8708 8709void 8710pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) 8711{ 8712 8713 PMAP_ASSERT_STAGE1(pmap); 8714 KASSERT(ADDR_IS_CANONICAL(va), 8715 ("%s: Address not in canonical form: %lx", __func__, va)); 8716 8717 if (ADDR_IS_KERNEL(va)) { 8718 cpu_icache_sync_range((void *)va, sz); 8719 } else { 8720 u_int len, offset; 8721 vm_paddr_t pa; 8722 8723 /* Find the length of data in this page to flush */ 8724 offset = va & PAGE_MASK; 8725 len = imin(PAGE_SIZE - offset, sz); 8726 8727 while (sz != 0) { 8728 /* Extract the physical address & find it in the DMAP */ 8729 pa = pmap_extract(pmap, va); 8730 if (pa != 0) 8731 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), 8732 len); 8733 8734 /* Move to the next page */ 8735 sz -= len; 8736 va += len; 8737 /* Set the length for the next iteration */ 8738 len = imin(PAGE_SIZE, sz); 8739 } 8740 } 8741} 8742 8743static int 8744pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far) 8745{ 8746 pd_entry_t *pdep; 8747 pt_entry_t *ptep, pte; 8748 int rv, lvl, dfsc; 8749 8750 PMAP_ASSERT_STAGE2(pmap); 8751 rv = KERN_FAILURE; 8752 8753 /* Data and insn aborts use same encoding for FSC field. */ 8754 dfsc = esr & ISS_DATA_DFSC_MASK; 8755 switch (dfsc) { 8756 case ISS_DATA_DFSC_TF_L0: 8757 case ISS_DATA_DFSC_TF_L1: 8758 case ISS_DATA_DFSC_TF_L2: 8759 case ISS_DATA_DFSC_TF_L3: 8760 PMAP_LOCK(pmap); 8761 pdep = pmap_pde(pmap, far, &lvl); 8762 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) { 8763 PMAP_UNLOCK(pmap); 8764 break; 8765 } 8766 8767 switch (lvl) { 8768 case 0: 8769 ptep = pmap_l0_to_l1(pdep, far); 8770 break; 8771 case 1: 8772 ptep = pmap_l1_to_l2(pdep, far); 8773 break; 8774 case 2: 8775 ptep = pmap_l2_to_l3(pdep, far); 8776 break; 8777 default: 8778 panic("%s: Invalid pde level %d", __func__,lvl); 8779 } 8780 goto fault_exec; 8781 8782 case ISS_DATA_DFSC_AFF_L1: 8783 case ISS_DATA_DFSC_AFF_L2: 8784 case ISS_DATA_DFSC_AFF_L3: 8785 PMAP_LOCK(pmap); 8786 ptep = pmap_pte(pmap, far, &lvl); 8787fault_exec: 8788 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) { 8789 if (icache_vmid) { 8790 pmap_invalidate_vpipt_icache(); 8791 } else { 8792 /* 8793 * If accessing an executable page invalidate 8794 * the I-cache so it will be valid when we 8795 * continue execution in the guest. The D-cache 8796 * is assumed to already be clean to the Point 8797 * of Coherency. 8798 */ 8799 if ((pte & ATTR_S2_XN_MASK) != 8800 ATTR_S2_XN(ATTR_S2_XN_NONE)) { 8801 invalidate_icache(); 8802 } 8803 } 8804 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID); 8805 rv = KERN_SUCCESS; 8806 } 8807 PMAP_UNLOCK(pmap); 8808 break; 8809 } 8810 8811 return (rv); 8812} 8813 8814int 8815pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) 8816{ 8817 pt_entry_t pte, *ptep; 8818 register_t intr; 8819 uint64_t ec, par; 8820 int lvl, rv; 8821 8822 rv = KERN_FAILURE; 8823 8824 ec = ESR_ELx_EXCEPTION(esr); 8825 switch (ec) { 8826 case EXCP_INSN_ABORT_L: 8827 case EXCP_INSN_ABORT: 8828 case EXCP_DATA_ABORT_L: 8829 case EXCP_DATA_ABORT: 8830 break; 8831 default: 8832 return (rv); 8833 } 8834 8835 if (pmap->pm_stage == PM_STAGE2) 8836 return (pmap_stage2_fault(pmap, esr, far)); 8837 8838 /* Data and insn aborts use same encoding for FSC field. 
*/ 8839 switch (esr & ISS_DATA_DFSC_MASK) { 8840 case ISS_DATA_DFSC_AFF_L1: 8841 case ISS_DATA_DFSC_AFF_L2: 8842 case ISS_DATA_DFSC_AFF_L3: 8843 PMAP_LOCK(pmap); 8844 ptep = pmap_pte(pmap, far, &lvl); 8845 if (ptep != NULL) { 8846 pmap_set_bits(ptep, ATTR_AF); 8847 rv = KERN_SUCCESS; 8848 /* 8849 * XXXMJ as an optimization we could mark the entry 8850 * dirty if this is a write fault. 8851 */ 8852 } 8853 PMAP_UNLOCK(pmap); 8854 break; 8855 case ISS_DATA_DFSC_PF_L1: 8856 case ISS_DATA_DFSC_PF_L2: 8857 case ISS_DATA_DFSC_PF_L3: 8858 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || 8859 (esr & ISS_DATA_WnR) == 0) 8860 return (rv); 8861 PMAP_LOCK(pmap); 8862 ptep = pmap_pte(pmap, far, &lvl); 8863 if (ptep != NULL && 8864 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) { 8865 if ((pte & ATTR_S1_AP_RW_BIT) == 8866 ATTR_S1_AP(ATTR_S1_AP_RO)) { 8867 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT); 8868 pmap_s1_invalidate_page(pmap, far, true); 8869 } 8870 rv = KERN_SUCCESS; 8871 } 8872 PMAP_UNLOCK(pmap); 8873 break; 8874 case ISS_DATA_DFSC_TF_L0: 8875 case ISS_DATA_DFSC_TF_L1: 8876 case ISS_DATA_DFSC_TF_L2: 8877 case ISS_DATA_DFSC_TF_L3: 8878 /* 8879 * Retry the translation. A break-before-make sequence can 8880 * produce a transient fault. 8881 */ 8882 if (pmap == kernel_pmap) { 8883 /* 8884 * The translation fault may have occurred within a 8885 * critical section. Therefore, we must check the 8886 * address without acquiring the kernel pmap's lock. 8887 */ 8888 if (pmap_klookup(far, NULL)) 8889 rv = KERN_SUCCESS; 8890 } else { 8891 PMAP_LOCK(pmap); 8892 /* Ask the MMU to check the address. */ 8893 intr = intr_disable(); 8894 par = arm64_address_translate_s1e0r(far); 8895 intr_restore(intr); 8896 PMAP_UNLOCK(pmap); 8897 8898 /* 8899 * If the translation was successful, then we can 8900 * return success to the trap handler. 8901 */ 8902 if (PAR_SUCCESS(par)) 8903 rv = KERN_SUCCESS; 8904 } 8905 break; 8906 } 8907 8908 return (rv); 8909} 8910 8911/* 8912 * Increase the starting virtual address of the given mapping if a 8913 * different alignment might result in more superpage mappings. 8914 */ 8915void 8916pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 8917 vm_offset_t *addr, vm_size_t size) 8918{ 8919 vm_offset_t superpage_offset; 8920 8921 if (size < L2_SIZE) 8922 return; 8923 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 8924 offset += ptoa(object->pg_color); 8925 superpage_offset = offset & L2_OFFSET; 8926 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || 8927 (*addr & L2_OFFSET) == superpage_offset) 8928 return; 8929 if ((*addr & L2_OFFSET) < superpage_offset) 8930 *addr = (*addr & ~L2_OFFSET) + superpage_offset; 8931 else 8932 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; 8933} 8934 8935/** 8936 * Get the kernel virtual address of a set of physical pages. If there are 8937 * physical addresses not covered by the DMAP perform a transient mapping 8938 * that will be removed when calling pmap_unmap_io_transient. 8939 * 8940 * \param page The pages the caller wishes to obtain the virtual 8941 * address on the kernel memory map. 8942 * \param vaddr On return contains the kernel virtual memory address 8943 * of the pages passed in the page parameter. 8944 * \param count Number of pages passed in. 8945 * \param can_fault true if the thread using the mapped pages can take 8946 * page faults, false otherwise. 8947 * 8948 * \returns true if the caller must call pmap_unmap_io_transient when 8949 * finished or false otherwise. 
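 *
 * A typical (illustrative) use, assuming arrays "m" and "va" of "cnt"
 * entries:
 *
 *	mapped = pmap_map_io_transient(m, va, cnt, false);
 *	... access the pages through va[i] ...
 *	if (mapped)
 *		pmap_unmap_io_transient(m, va, cnt, false);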
8950 * 8951 */ 8952bool 8953pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 8954 bool can_fault) 8955{ 8956 vm_paddr_t paddr; 8957 bool needs_mapping; 8958 int error __diagused, i; 8959 8960 /* 8961 * Allocate any KVA space that we need, this is done in a separate 8962 * loop to prevent calling vmem_alloc while pinned. 8963 */ 8964 needs_mapping = false; 8965 for (i = 0; i < count; i++) { 8966 paddr = VM_PAGE_TO_PHYS(page[i]); 8967 if (__predict_false(!PHYS_IN_DMAP(paddr))) { 8968 error = vmem_alloc(kernel_arena, PAGE_SIZE, 8969 M_BESTFIT | M_WAITOK, &vaddr[i]); 8970 KASSERT(error == 0, ("vmem_alloc failed: %d", error)); 8971 needs_mapping = true; 8972 } else { 8973 vaddr[i] = PHYS_TO_DMAP(paddr); 8974 } 8975 } 8976 8977 /* Exit early if everything is covered by the DMAP */ 8978 if (!needs_mapping) 8979 return (false); 8980 8981 if (!can_fault) 8982 sched_pin(); 8983 for (i = 0; i < count; i++) { 8984 paddr = VM_PAGE_TO_PHYS(page[i]); 8985 if (!PHYS_IN_DMAP(paddr)) { 8986 panic( 8987 "pmap_map_io_transient: TODO: Map out of DMAP data"); 8988 } 8989 } 8990 8991 return (needs_mapping); 8992} 8993 8994void 8995pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, 8996 bool can_fault) 8997{ 8998 vm_paddr_t paddr; 8999 int i; 9000 9001 if (!can_fault) 9002 sched_unpin(); 9003 for (i = 0; i < count; i++) { 9004 paddr = VM_PAGE_TO_PHYS(page[i]); 9005 if (!PHYS_IN_DMAP(paddr)) { 9006 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); 9007 } 9008 } 9009} 9010 9011bool 9012pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) 9013{ 9014 9015 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); 9016} 9017 9018static void * 9019bti_dup_range(void *ctx __unused, void *data) 9020{ 9021 struct rs_el *node, *new_node; 9022 9023 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT); 9024 if (new_node == NULL) 9025 return (NULL); 9026 node = data; 9027 memcpy(new_node, node, sizeof(*node)); 9028 return (new_node); 9029} 9030 9031static void 9032bti_free_range(void *ctx __unused, void *node) 9033{ 9034 9035 uma_zfree(pmap_bti_ranges_zone, node); 9036} 9037 9038static int 9039pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9040{ 9041 struct rs_el *rs; 9042 int error; 9043 9044 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9045 PMAP_ASSERT_STAGE1(pmap); 9046 MPASS(pmap->pm_bti != NULL); 9047 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT); 9048 if (rs == NULL) 9049 return (ENOMEM); 9050 error = rangeset_insert(pmap->pm_bti, sva, eva, rs); 9051 if (error != 0) 9052 uma_zfree(pmap_bti_ranges_zone, rs); 9053 return (error); 9054} 9055 9056static void 9057pmap_bti_deassign_all(pmap_t pmap) 9058{ 9059 9060 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9061 if (pmap->pm_bti != NULL) 9062 rangeset_remove_all(pmap->pm_bti); 9063} 9064 9065static bool 9066pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9067{ 9068 struct rs_el *prev_rs, *rs; 9069 vm_offset_t va; 9070 9071 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9072 KASSERT(ADDR_IS_CANONICAL(sva), 9073 ("%s: Start address not in canonical form: %lx", __func__, sva)); 9074 KASSERT(ADDR_IS_CANONICAL(eva), 9075 ("%s: End address not in canonical form: %lx", __func__, eva)); 9076 9077 if (pmap->pm_bti == NULL || ADDR_IS_KERNEL(sva)) 9078 return (true); 9079 MPASS(!ADDR_IS_KERNEL(eva)); 9080 for (va = sva; va < eva; prev_rs = rs) { 9081 rs = rangeset_lookup(pmap->pm_bti, va); 9082 if (va == sva) 9083 prev_rs = rs; 9084 else if ((rs == NULL) ^ (prev_rs == NULL)) 9085 return (false); 9086 if (rs == 
NULL) { 9087 va += PAGE_SIZE; 9088 continue; 9089 } 9090 va = rs->re_end; 9091 } 9092 return (true); 9093} 9094 9095static pt_entry_t 9096pmap_pte_bti(pmap_t pmap, vm_offset_t va) 9097{ 9098 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9099 MPASS(ADDR_IS_CANONICAL(va)); 9100 9101 if (pmap->pm_stage != PM_STAGE1) 9102 return (0); 9103 if (pmap == kernel_pmap) 9104 return (ATTR_KERN_GP); 9105 if (pmap->pm_bti != NULL && rangeset_lookup(pmap->pm_bti, va) != NULL) 9106 return (ATTR_S1_GP); 9107 return (0); 9108} 9109 9110static void 9111pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9112{ 9113 9114 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9115 if (pmap->pm_bti != NULL) 9116 rangeset_remove(pmap->pm_bti, sva, eva); 9117} 9118 9119static int 9120pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap) 9121{ 9122 9123 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); 9124 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); 9125 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage); 9126 MPASS(src_pmap->pm_bti != NULL); 9127 MPASS(dst_pmap->pm_bti != NULL); 9128 if (src_pmap->pm_bti->rs_data_ctx == NULL) 9129 return (0); 9130 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti)); 9131} 9132 9133static void 9134pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set) 9135{ 9136 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 9137 PMAP_ASSERT_STAGE1(pmap); 9138 9139 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0, 9140 true); 9141} 9142 9143int 9144pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 9145{ 9146 int error; 9147 9148 if (pmap->pm_bti == NULL) 9149 return (0); 9150 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva)) 9151 return (EINVAL); 9152 if (pmap->pm_stage != PM_STAGE1) 9153 return (EINVAL); 9154 if (eva <= sva || ADDR_IS_KERNEL(eva)) 9155 return (EFAULT); 9156 9157 sva = trunc_page(sva); 9158 eva = round_page(eva); 9159 for (;;) { 9160 PMAP_LOCK(pmap); 9161 error = pmap_bti_assign(pmap, sva, eva); 9162 if (error == 0) 9163 pmap_bti_update_range(pmap, sva, eva, true); 9164 PMAP_UNLOCK(pmap); 9165 if (error != ENOMEM) 9166 break; 9167 vm_wait(NULL); 9168 } 9169 return (error); 9170} 9171 9172#if defined(KASAN) || defined(KMSAN) 9173static pd_entry_t *pmap_san_early_l2; 9174 9175#define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE) 9176#define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE) 9177static vm_offset_t __nosanitizeaddress 9178pmap_san_enter_bootstrap_alloc_l2(void) 9179{ 9180 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE); 9181 static size_t offset = 0; 9182 vm_offset_t addr; 9183 9184 if (offset + L2_SIZE > sizeof(bootstrap_data)) { 9185 panic("%s: out of memory for the bootstrap shadow map L2 entries", 9186 __func__); 9187 } 9188 9189 addr = (uintptr_t)&bootstrap_data[offset]; 9190 offset += L2_SIZE; 9191 return (addr); 9192} 9193 9194/* 9195 * SAN L1 + L2 pages, maybe L3 entries later? 
9196 */ 9197static vm_offset_t __nosanitizeaddress 9198pmap_san_enter_bootstrap_alloc_pages(int npages) 9199{ 9200 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE); 9201 static size_t offset = 0; 9202 vm_offset_t addr; 9203 9204 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) { 9205 panic("%s: out of memory for the bootstrap shadow map", 9206 __func__); 9207 } 9208 9209 addr = (uintptr_t)&bootstrap_data[offset]; 9210 offset += (npages * PAGE_SIZE); 9211 return (addr); 9212} 9213 9214static void __nosanitizeaddress 9215pmap_san_enter_bootstrap(void) 9216{ 9217 vm_offset_t freemempos; 9218 9219 /* L1, L2 */ 9220 freemempos = pmap_san_enter_bootstrap_alloc_pages(2); 9221 bs_state.freemempos = freemempos; 9222 bs_state.va = KASAN_MIN_ADDRESS; 9223 pmap_bootstrap_l1_table(&bs_state); 9224 pmap_san_early_l2 = bs_state.l2; 9225} 9226 9227static vm_page_t 9228pmap_san_enter_alloc_l3(void) 9229{ 9230 vm_page_t m; 9231 9232 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED | 9233 VM_ALLOC_ZERO); 9234 if (m == NULL) 9235 panic("%s: no memory to grow shadow map", __func__); 9236 return (m); 9237} 9238 9239static vm_page_t 9240pmap_san_enter_alloc_l2(void) 9241{ 9242 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO, 9243 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT)); 9244} 9245 9246void __nosanitizeaddress __nosanitizememory 9247pmap_san_enter(vm_offset_t va) 9248{ 9249 pd_entry_t *l1, *l2; 9250 pt_entry_t *l3; 9251 vm_page_t m; 9252 9253 if (virtual_avail == 0) { 9254 vm_offset_t block; 9255 int slot; 9256 bool first; 9257 9258 /* Temporary shadow map prior to pmap_bootstrap(). */ 9259 first = pmap_san_early_l2 == NULL; 9260 if (first) 9261 pmap_san_enter_bootstrap(); 9262 9263 l2 = pmap_san_early_l2; 9264 slot = pmap_l2_index(va); 9265 9266 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) { 9267 MPASS(first); 9268 block = pmap_san_enter_bootstrap_alloc_l2(); 9269 pmap_store(&l2[slot], 9270 PHYS_TO_PTE(pmap_early_vtophys(block)) | 9271 PMAP_SAN_PTE_BITS | L2_BLOCK); 9272 dmb(ishst); 9273 } 9274 9275 return; 9276 } 9277 9278 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 9279 l1 = pmap_l1(kernel_pmap, va); 9280 MPASS(l1 != NULL); 9281 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) { 9282 m = pmap_san_enter_alloc_l3(); 9283 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE); 9284 } 9285 l2 = pmap_l1_to_l2(l1, va); 9286 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) { 9287 m = pmap_san_enter_alloc_l2(); 9288 if (m != NULL) { 9289 pmap_store(l2, VM_PAGE_TO_PTE(m) | 9290 PMAP_SAN_PTE_BITS | L2_BLOCK); 9291 } else { 9292 m = pmap_san_enter_alloc_l3(); 9293 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE); 9294 } 9295 dmb(ishst); 9296 } 9297 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) 9298 return; 9299 l3 = pmap_l2_to_l3(l2, va); 9300 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0) 9301 return; 9302 m = pmap_san_enter_alloc_l3(); 9303 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE); 9304 dmb(ishst); 9305} 9306#endif /* KASAN || KMSAN */ 9307 9308/* 9309 * Track a range of the kernel's virtual address space that is contiguous 9310 * in various mapping attributes. 
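 * The l1blocks, l2blocks, l3contig and l3pages counters record how many
 * mappings of each size make up the range.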
9311 */ 9312struct pmap_kernel_map_range { 9313 vm_offset_t sva; 9314 pt_entry_t attrs; 9315 int l3pages; 9316 int l3contig; 9317 int l2blocks; 9318 int l1blocks; 9319}; 9320 9321static void 9322sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, 9323 vm_offset_t eva) 9324{ 9325 const char *mode; 9326 int index; 9327 9328 if (eva <= range->sva) 9329 return; 9330 9331 index = range->attrs & ATTR_S1_IDX_MASK; 9332 switch (index) { 9333 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP): 9334 mode = "DEV-NP"; 9335 break; 9336 case ATTR_S1_IDX(VM_MEMATTR_DEVICE): 9337 mode = "DEV"; 9338 break; 9339 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE): 9340 mode = "UC"; 9341 break; 9342 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK): 9343 mode = "WB"; 9344 break; 9345 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH): 9346 mode = "WT"; 9347 break; 9348 default: 9349 printf( 9350 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n", 9351 __func__, index, range->sva, eva); 9352 mode = "??"; 9353 break; 9354 } 9355 9356 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d\n", 9357 range->sva, eva, 9358 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-', 9359 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x', 9360 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X', 9361 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's', 9362 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-', 9363 mode, range->l1blocks, range->l2blocks, range->l3contig, 9364 range->l3pages); 9365 9366 /* Reset to sentinel value. */ 9367 range->sva = 0xfffffffffffffffful; 9368} 9369 9370/* 9371 * Determine whether the attributes specified by a page table entry match those 9372 * being tracked by the current range. 9373 */ 9374static bool 9375sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) 9376{ 9377 9378 return (range->attrs == attrs); 9379} 9380 9381static void 9382sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, 9383 pt_entry_t attrs) 9384{ 9385 9386 memset(range, 0, sizeof(*range)); 9387 range->sva = va; 9388 range->attrs = attrs; 9389} 9390 9391/* Get the block/page attributes that correspond to the table attributes */ 9392static pt_entry_t 9393sysctl_kmaps_table_attrs(pd_entry_t table) 9394{ 9395 pt_entry_t attrs; 9396 9397 attrs = 0; 9398 if ((table & TATTR_UXN_TABLE) != 0) 9399 attrs |= ATTR_S1_UXN; 9400 if ((table & TATTR_PXN_TABLE) != 0) 9401 attrs |= ATTR_S1_PXN; 9402 if ((table & TATTR_AP_TABLE_RO) != 0) 9403 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO); 9404 9405 return (attrs); 9406} 9407 9408/* Read the block/page attributes we care about */ 9409static pt_entry_t 9410sysctl_kmaps_block_attrs(pt_entry_t block) 9411{ 9412 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK | 9413 ATTR_S1_GP)); 9414} 9415 9416/* 9417 * Given a leaf PTE, derive the mapping's attributes. If they do not match 9418 * those of the current run, dump the address range and its attributes, and 9419 * begin a new run. 
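 * Attributes inherited from the table entries (UXN, PXN and read-only) at
 * each level are folded in, since these hierarchical controls restrict the
 * permissions of the final mapping.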
9420 */ 9421static void 9422sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, 9423 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e, 9424 pt_entry_t l3e) 9425{ 9426 pt_entry_t attrs; 9427 9428 attrs = sysctl_kmaps_table_attrs(l0e); 9429 9430 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 9431 attrs |= sysctl_kmaps_block_attrs(l1e); 9432 goto done; 9433 } 9434 attrs |= sysctl_kmaps_table_attrs(l1e); 9435 9436 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) { 9437 attrs |= sysctl_kmaps_block_attrs(l2e); 9438 goto done; 9439 } 9440 attrs |= sysctl_kmaps_table_attrs(l2e); 9441 attrs |= sysctl_kmaps_block_attrs(l3e); 9442 9443done: 9444 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { 9445 sysctl_kmaps_dump(sb, range, va); 9446 sysctl_kmaps_reinit(range, va, attrs); 9447 } 9448} 9449 9450static int 9451sysctl_kmaps(SYSCTL_HANDLER_ARGS) 9452{ 9453 struct pmap_kernel_map_range range; 9454 struct sbuf sbuf, *sb; 9455 pd_entry_t l0e, *l1, l1e, *l2, l2e; 9456 pt_entry_t *l3, l3e; 9457 vm_offset_t sva; 9458 vm_paddr_t pa; 9459 int error, i, j, k, l; 9460 9461 error = sysctl_wire_old_buffer(req, 0); 9462 if (error != 0) 9463 return (error); 9464 sb = &sbuf; 9465 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); 9466 9467 /* Sentinel value. */ 9468 range.sva = 0xfffffffffffffffful; 9469 9470 /* 9471 * Iterate over the kernel page tables without holding the kernel pmap 9472 * lock. Kernel page table pages are never freed, so at worst we will 9473 * observe inconsistencies in the output. 9474 */ 9475 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES; 9476 i++) { 9477 if (i == pmap_l0_index(DMAP_MIN_ADDRESS)) 9478 sbuf_printf(sb, "\nDirect map:\n"); 9479 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS)) 9480 sbuf_printf(sb, "\nKernel map:\n"); 9481#ifdef KASAN 9482 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS)) 9483 sbuf_printf(sb, "\nKASAN shadow map:\n"); 9484#endif 9485#ifdef KMSAN 9486 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS)) 9487 sbuf_printf(sb, "\nKMSAN shadow map:\n"); 9488 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS)) 9489 sbuf_printf(sb, "\nKMSAN origin map:\n"); 9490#endif 9491 9492 l0e = kernel_pmap->pm_l0[i]; 9493 if ((l0e & ATTR_DESCR_VALID) == 0) { 9494 sysctl_kmaps_dump(sb, &range, sva); 9495 sva += L0_SIZE; 9496 continue; 9497 } 9498 pa = PTE_TO_PHYS(l0e); 9499 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa); 9500 9501 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) { 9502 l1e = l1[j]; 9503 if ((l1e & ATTR_DESCR_VALID) == 0) { 9504 sysctl_kmaps_dump(sb, &range, sva); 9505 sva += L1_SIZE; 9506 continue; 9507 } 9508 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) { 9509 PMAP_ASSERT_L1_BLOCKS_SUPPORTED; 9510 sysctl_kmaps_check(sb, &range, sva, l0e, l1e, 9511 0, 0); 9512 range.l1blocks++; 9513 sva += L1_SIZE; 9514 continue; 9515 } 9516 pa = PTE_TO_PHYS(l1e); 9517 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa); 9518 9519 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) { 9520 l2e = l2[k]; 9521 if ((l2e & ATTR_DESCR_VALID) == 0) { 9522 sysctl_kmaps_dump(sb, &range, sva); 9523 sva += L2_SIZE; 9524 continue; 9525 } 9526 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) { 9527 sysctl_kmaps_check(sb, &range, sva, 9528 l0e, l1e, l2e, 0); 9529 range.l2blocks++; 9530 sva += L2_SIZE; 9531 continue; 9532 } 9533 pa = PTE_TO_PHYS(l2e); 9534 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); 9535 9536 for (l = pmap_l3_index(sva); l < Ln_ENTRIES; 9537 l++, sva += L3_SIZE) { 9538 l3e = l3[l]; 9539 if ((l3e & ATTR_DESCR_VALID) == 0) { 9540 
sysctl_kmaps_dump(sb, &range, 9541 sva); 9542 continue; 9543 } 9544 sysctl_kmaps_check(sb, &range, sva, 9545 l0e, l1e, l2e, l3e); 9546 if ((l3e & ATTR_CONTIGUOUS) != 0) 9547 range.l3contig += 9548 l % L3C_ENTRIES == 0 ? 9549 1 : 0; 9550 else 9551 range.l3pages++; 9552 } 9553 } 9554 } 9555 } 9556 9557 error = sbuf_finish(sb); 9558 sbuf_delete(sb); 9559 return (error); 9560} 9561SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, 9562 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP, 9563 NULL, 0, sysctl_kmaps, "A", 9564 "Dump kernel address layout"); 9565
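/*
 * The layout dump produced above is exported as the vm.pmap.kernel_maps
 * sysctl node.  It is marked CTLFLAG_SKIP, so it is not listed by default,
 * but it can still be queried by name, e.g. "sysctl vm.pmap.kernel_maps".
 */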