1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28#include <vm/pmap.h>
29#include <vm/vm_map.h>
30#include <kern/ledger.h>
31#include <i386/pmap_internal.h>
32
33void		pmap_remove_range(
34			pmap_t		pmap,
35			vm_map_offset_t	va,
36			pt_entry_t	*spte,
37			pt_entry_t	*epte);
38
39void		pmap_remove_range_options(
40			pmap_t		pmap,
41			vm_map_offset_t	va,
42			pt_entry_t	*spte,
43			pt_entry_t	*epte,
44			int		options);
45
46void		pmap_reusable_range(
47			pmap_t		pmap,
48			vm_map_offset_t	va,
49			pt_entry_t	*spte,
50			pt_entry_t	*epte,
51			boolean_t	reusable);
52
53uint32_t pmap_update_clear_pte_count;
54
55/*
56 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
57 * on a NBPDE boundary.
58 */
59
60/* These symbols may be referenced directly by VM */
61uint64_t pmap_nesting_size_min = NBPDE;
62uint64_t pmap_nesting_size_max = 0 - (uint64_t)NBPDE;
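
/*
 * A minimal user-space sketch (not part of the kernel build) of the arithmetic
 * behind the two symbols above, assuming the usual x86_64 value of NBPDE (2MiB);
 * "0 - NBPDE" wraps modulo 2^64, so pmap_nesting_size_max is effectively the
 * whole address space minus one PDE's worth.
 */
#if 0	/* illustrative sketch only; the constant below is an assumption, not NBPDE itself */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_NBPDE	(2ULL * 1024 * 1024)	/* assumed 2MiB, one PDE's span */

int
main(void)
{
	uint64_t nesting_min = SKETCH_NBPDE;
	uint64_t nesting_max = 0 - SKETCH_NBPDE;	/* wraps to 2^64 - 2MiB */

	printf("min 0x%llx max 0x%llx\n",
	    (unsigned long long)nesting_min, (unsigned long long)nesting_max);
	return 0;
}
#endif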
63
64/*
 *	kern_return_t pmap_nest(grand, subord, va_start, nstart, size)
 *
 *	grand  = the pmap that we will nest subord into
 *	subord = the pmap that is nested into grand
 *	va_start  = start of the range in grand to be nested
 *	nstart  = start of the corresponding range in the nested (subord) pmap
 *	size   = size of the nested area (up to 16TB)
 *
 *	Inserts a pmap into another.  This is used to implement shared segments.
 *
 *	Note that we depend upon higher level VM locks to ensure that things don't change while
 *	we are doing this.  For example, the VM should not be doing any pmap enters while it is
 *	nesting, nor performing two nests at once.
78 */
79
80/*
81 * This routine can nest subtrees either at the PDPT level (1GiB) or at the
82 * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
83 * container and the "grand" parent. A minor optimization to consider for the
84 * future: make the "subord" truly a container rather than a full-fledged
85 * pagetable hierarchy which can be unnecessarily sparse (DRK).
86 */
87
88kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t nstart, uint64_t size) {
89	vm_map_offset_t	vaddr, nvaddr;
90	pd_entry_t	*pde,*npde;
91	unsigned int	i;
92	uint64_t	num_pde;
93
94	if ((size & (pmap_nesting_size_min-1)) ||
95	    (va_start & (pmap_nesting_size_min-1)) ||
96	    (nstart & (pmap_nesting_size_min-1)) ||
97	    ((size >> 28) > 65536))	/* Max size we can nest is 16TB */
98		return KERN_INVALID_VALUE;
99
100	if(size == 0) {
101		panic("pmap_nest: size is invalid - %016llX\n", size);
102	}
103
104	if (va_start != nstart)
105		panic("pmap_nest: va_start(0x%llx) != nstart(0x%llx)\n", va_start, nstart);
106
107	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
108	(uintptr_t) grand, (uintptr_t) subord,
109	    (uintptr_t) (va_start>>32), (uintptr_t) va_start, 0);
110
111	nvaddr = (vm_map_offset_t)nstart;
112	num_pde = size >> PDESHIFT;
113
114	PMAP_LOCK(subord);
115
116	subord->pm_shared = TRUE;
117
118	for (i = 0; i < num_pde;) {
119		if (((nvaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG && cpu_64bit) {
120
121			npde = pmap64_pdpt(subord, nvaddr);
122
123			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
124				PMAP_UNLOCK(subord);
125				pmap_expand_pdpt(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
126				PMAP_LOCK(subord);
127				npde = pmap64_pdpt(subord, nvaddr);
128			}
129			*npde |= INTEL_PDPTE_NESTED;
130			nvaddr += NBPDPT;
131			i += (uint32_t)NPDEPG;
132		}
133		else {
134			npde = pmap_pde(subord, nvaddr);
135
136			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
137				PMAP_UNLOCK(subord);
138				pmap_expand(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
139				PMAP_LOCK(subord);
140				npde = pmap_pde(subord, nvaddr);
141			}
142			nvaddr += NBPDE;
143			i++;
144		}
145	}
146
147	PMAP_UNLOCK(subord);
148
149	vaddr = (vm_map_offset_t)va_start;
150
151	PMAP_LOCK(grand);
152
153	for (i = 0;i < num_pde;) {
154		pd_entry_t tpde;
155
156		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG) && cpu_64bit) {
157			npde = pmap64_pdpt(subord, vaddr);
158			if (npde == 0)
159				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
160			tpde = *npde;
161			pde = pmap64_pdpt(grand, vaddr);
162			if (0 == pde) {
163				PMAP_UNLOCK(grand);
164				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
165				PMAP_LOCK(grand);
166				pde = pmap64_pdpt(grand, vaddr);
167			}
168			if (pde == 0)
169				panic("pmap_nest: no PDPT, grand  %p vaddr 0x%llx", grand, vaddr);
170			pmap_store_pte(pde, tpde);
171			vaddr += NBPDPT;
172			i += (uint32_t) NPDEPG;
173		}
174		else {
175			npde = pmap_pde(subord, nstart);
176			if (npde == 0)
177				panic("pmap_nest: no npde, subord %p nstart 0x%llx", subord, nstart);
178			tpde = *npde;
179			nstart += NBPDE;
180			pde = pmap_pde(grand, vaddr);
181			if ((0 == pde) && cpu_64bit) {
182				PMAP_UNLOCK(grand);
183				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
184				PMAP_LOCK(grand);
185				pde = pmap_pde(grand, vaddr);
186			}
187
188			if (pde == 0)
189				panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
190			vaddr += NBPDE;
191			pmap_store_pte(pde, tpde);
192			i++;
193		}
194	}
195
196	PMAP_UNLOCK(grand);
197
198	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
199
200	return KERN_SUCCESS;
201}
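
/*
 * A hedged user-space sketch of the validation performed at the top of
 * pmap_nest() above: both starts and the size must be NBPDE-aligned, the two
 * starts must match, and the size is capped at 65536 x 256MiB = 16TB.  The
 * constant is a local assumption; the kernel's panics are folded into a
 * boolean result here.
 */
#if 0	/* illustrative sketch only; not kernel code */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_NBPDE	(2ULL * 1024 * 1024)	/* assumed 2MiB PDE span */

static bool
nest_args_valid(uint64_t va_start, uint64_t nstart, uint64_t size)
{
	/* everything must sit on a 2MiB (PDE) boundary */
	if ((size & (SKETCH_NBPDE - 1)) ||
	    (va_start & (SKETCH_NBPDE - 1)) ||
	    (nstart & (SKETCH_NBPDE - 1)))
		return false;

	/* (size >> 28) counts 256MiB units; more than 65536 of them exceeds 16TB */
	if ((size >> 28) > 65536)
		return false;

	/* the kernel panics on these; the sketch just reports them as invalid */
	return (size != 0 && va_start == nstart);
}
#endif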
202
203/*
 *	kern_return_t pmap_unnest(grand, vaddr, size)
 *
 *	grand  = the pmap from which we will un-nest the nested pmap
 *	vaddr  = start of the range in grand to be unnested
 *	size   = size of the range to be unnested
 *
 *	Removes a pmap from another.  This is used to implement shared segments.
 */
211
212kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) {
213
214	pd_entry_t *pde;
215	unsigned int i;
216	uint64_t num_pde;
217	addr64_t va_start, va_end;
218	uint64_t npdpt = PMAP_INVALID_PDPTNUM;
219
220	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
221	    (uintptr_t) grand,
222	    (uintptr_t) (vaddr>>32), (uintptr_t) vaddr, 0, 0);
223
224	if ((size & (pmap_nesting_size_min-1)) ||
225	    (vaddr & (pmap_nesting_size_min-1))) {
226		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...\n",
227		    grand, vaddr, size);
228	}
229
230	/* align everything to PDE boundaries */
231	va_start = vaddr & ~(NBPDE-1);
232	va_end = (vaddr + size + NBPDE - 1) & ~(NBPDE-1);
233	size = va_end - va_start;
234
235	PMAP_LOCK(grand);
236
237	num_pde = size >> PDESHIFT;
238	vaddr = va_start;
239
240	for (i = 0; i < num_pde; ) {
241		if ((pdptnum(grand, vaddr) != npdpt) && cpu_64bit) {
242			npdpt = pdptnum(grand, vaddr);
243			pde = pmap64_pdpt(grand, vaddr);
244			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
245				pmap_store_pte(pde, (pd_entry_t)0);
246				i += (uint32_t) NPDEPG;
247				vaddr += NBPDPT;
248				continue;
249			}
250		}
251		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
252		if (pde == 0)
253			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx\n", grand, vaddr);
254		pmap_store_pte(pde, (pd_entry_t)0);
255		i++;
256		vaddr += NBPDE;
257	}
258
259	PMAP_UPDATE_TLBS(grand, va_start, va_end);
260
261	PMAP_UNLOCK(grand);
262
263	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
264
265	return KERN_SUCCESS;
266}
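
/*
 * A small sketch of the rounding pmap_unnest() applies above: the start is
 * rounded down and the end rounded up to PDE boundaries before computing how
 * many PDE slots to clear.  The shift/size values are assumptions matching the
 * usual x86_64 layout (2MiB, shift of 21), not the kernel's definitions.
 */
#if 0	/* illustrative sketch only; not kernel code */
#include <stdint.h>

#define SKETCH_PDESHIFT	21
#define SKETCH_NBPDE	(1ULL << SKETCH_PDESHIFT)	/* 2MiB */

static uint64_t
unnest_pde_count(uint64_t vaddr, uint64_t size)
{
	uint64_t va_start = vaddr & ~(SKETCH_NBPDE - 1);	/* round start down */
	uint64_t va_end = (vaddr + size + SKETCH_NBPDE - 1) &
	    ~(SKETCH_NBPDE - 1);				/* round end up */

	return (va_end - va_start) >> SKETCH_PDESHIFT;	/* PDE slots to clear */
}
#endif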
267
268/* Invoked by the Mach VM to determine the platform specific unnest region */
269
270boolean_t pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e) {
271	pd_entry_t *pdpte;
272	boolean_t rval = FALSE;
273
274	if (!cpu_64bit)
275		return rval;
276
277	PMAP_LOCK(p);
278
279	pdpte = pmap64_pdpt(p, *s);
280	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
281		*s &= ~(NBPDPT -1);
282		rval = TRUE;
283	}
284
285	pdpte = pmap64_pdpt(p, *e);
286	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
287		*e = ((*e + NBPDPT) & ~(NBPDPT -1));
288		rval = TRUE;
289	}
290
291	PMAP_UNLOCK(p);
292
293	return rval;
294}
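
/*
 * A sketch of the widening done above when either end of an unnest range lands
 * inside a nested PDPT entry: the start is truncated down and the end advanced
 * to the next PDPT boundary.  The 1GiB span is an assumption matching a PDPT
 * entry on x86_64.
 */
#if 0	/* illustrative sketch only; not kernel code */
#include <stdint.h>

#define SKETCH_NBPDPT	(1ULL << 30)	/* assumed 1GiB span of one PDPT entry */

static void
widen_to_pdpt(uint64_t *s, uint64_t *e)
{
	*s &= ~(SKETCH_NBPDPT - 1);				/* round start down */
	*e = (*e + SKETCH_NBPDPT) & ~(SKETCH_NBPDPT - 1);	/* advance end to the next boundary */
}
#endif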
295
296/*
297 * pmap_find_phys returns the (4K) physical page number containing a
298 * given virtual address in a given pmap.
 * Note that pmap_pte may return a pde if this virtual address is
 * mapped by a large page; this is taken into account so that the
 * correct page number is returned in that case.
302 */
303ppnum_t
304pmap_find_phys(pmap_t pmap, addr64_t va)
305{
306	pt_entry_t	*ptp;
307	pd_entry_t	*pdep;
308	ppnum_t		ppn = 0;
309	pd_entry_t	pde;
310	pt_entry_t	pte;
311
312	mp_disable_preemption();
313
314	/* This refcount test is a band-aid--several infrastructural changes
315	 * are necessary to eliminate invocation of this routine from arbitrary
316	 * contexts.
317	 */
318
319	if (!pmap->ref_count)
320		goto pfp_exit;
321
322	pdep = pmap_pde(pmap, va);
323
324	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & INTEL_PTE_VALID)) {
325		if (pde & INTEL_PTE_PS) {
326			ppn = (ppnum_t) i386_btop(pte_to_pa(pde));
327			ppn += (ppnum_t) ptenum(va);
328		}
329		else {
330			ptp = pmap_pte(pmap, va);
331			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & INTEL_PTE_VALID) != 0)) {
332				ppn = (ppnum_t) i386_btop(pte_to_pa(pte));
333			}
334		}
335	}
336pfp_exit:
337	mp_enable_preemption();
338
339        return ppn;
340}
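
/*
 * A user-space sketch of the 2MiB-page arithmetic used above: when the PDE maps
 * a superpage, the 4K page number is the superpage's base frame plus the index
 * of the 4K page within it.  The shift and mask are assumptions matching the
 * usual x86_64 layout, not the kernel's i386_btop()/ptenum() macros.
 */
#if 0	/* illustrative sketch only; not kernel code */
#include <stdint.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_PTES_PER_2MB	512	/* 4K pages within one 2MiB mapping */

static uint64_t
large_page_ppn(uint64_t pde_phys_base, uint64_t va)
{
	uint64_t ppn = pde_phys_base >> SKETCH_PAGE_SHIFT;	/* base frame of the superpage */

	/* which 4K page within the 2MiB mapping */
	ppn += (va >> SKETCH_PAGE_SHIFT) & (SKETCH_PTES_PER_2MB - 1);
	return ppn;
}
#endif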
341
342/*
343 * Update cache attributes for all extant managed mappings.
344 * Assumes PV for this page is locked, and that the page
345 * is managed.
346 */
347
348void
349pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) {
350	pv_rooted_entry_t	pv_h, pv_e;
351	pv_hashed_entry_t       pvh_e, nexth;
352	vm_map_offset_t vaddr;
353	pmap_t	pmap;
354	pt_entry_t	*ptep;
355
356	assert(IS_MANAGED_PAGE(pn));
357
358	pv_h = pai_to_pvh(pn);
	/* TODO: translate the PHYS_* bits to PTE bits; while they're
	 * currently identical, they may not remain so.
	 * Potential optimizations (here and in page_protect):
	 * parallel shootdowns, and checking for redundant
	 * attribute modifications.
	 */
365
366	/*
367	 * Alter attributes on all mappings
368	 */
369	if (pv_h->pmap != PMAP_NULL) {
370		pv_e = pv_h;
371		pvh_e = (pv_hashed_entry_t)pv_e;
372
373		do {
374			pmap = pv_e->pmap;
375			vaddr = pv_e->va;
376			ptep = pmap_pte(pmap, vaddr);
377
378			if (0 == ptep)
379				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
380
381			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
382			pmap_update_pte(ptep, PHYS_CACHEABILITY_MASK, attributes);
383			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
384			pvh_e = nexth;
385		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
386	}
387}
388
389void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) {
390	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
391
392	if (dofilter) {
393		CPU_CR3_MARK_INACTIVE();
394	} else {
395		CPU_CR3_MARK_ACTIVE();
396		mfence();
397		if (current_cpu_datap()->cpu_tlb_invalid)
398			process_pmap_updates();
399	}
400}
401
402
403/*
404 *	Insert the given physical page (p) at
405 *	the specified virtual address (v) in the
406 *	target physical map with the protection requested.
407 *
408 *	If specified, the page will be wired down, meaning
409 *	that the related pte cannot be reclaimed.
410 *
411 *	NB:  This is the only routine which MAY NOT lazy-evaluate
412 *	or lose information.  That is, this routine must actually
413 *	insert this page into the given map NOW.
414 */
415
416void
417pmap_enter(
418	register pmap_t		pmap,
419 	vm_map_offset_t		vaddr,
420	ppnum_t                 pn,
421	vm_prot_t		prot,
422	vm_prot_t		fault_type,
423	unsigned int 		flags,
424	boolean_t		wired)
425{
426	(void) pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL);
427}
428
429
430kern_return_t
431pmap_enter_options(
432	register pmap_t		pmap,
433 	vm_map_offset_t		vaddr,
434	ppnum_t                 pn,
435	vm_prot_t		prot,
436	__unused vm_prot_t	fault_type,
437	unsigned int 		flags,
438	boolean_t		wired,
439	unsigned int		options,
440	void			*arg)
441{
442	pt_entry_t		*pte;
443	pv_rooted_entry_t	pv_h;
444	ppnum_t			pai;
445	pv_hashed_entry_t	pvh_e;
446	pv_hashed_entry_t	pvh_new;
447	pt_entry_t		template;
448	pmap_paddr_t		old_pa;
449	pmap_paddr_t		pa = (pmap_paddr_t) i386_ptob(pn);
450	boolean_t		need_tlbflush = FALSE;
451	boolean_t		set_NX;
452	char			oattr;
453	boolean_t		old_pa_locked;
454	/* 2MiB mappings are confined to x86_64 by VM */
455	boolean_t		superpage = flags & VM_MEM_SUPERPAGE;
456	vm_object_t		delpage_pm_obj = NULL;
457	uint64_t		delpage_pde_index = 0;
458	pt_entry_t		old_pte;
459	kern_return_t		kr_expand;
460
461	pmap_intr_assert();
462
463	if (pmap == PMAP_NULL)
464		return KERN_INVALID_ARGUMENT;
465
	/* N.B. We can be supplied a zero page frame in the NOENTER case; it's an
	 * unused value for that scenario.
	 */
469	assert(pn != vm_page_fictitious_addr);
470
471	if (pn == vm_page_guard_addr)
472		return KERN_INVALID_ARGUMENT;
473
474	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
475	    pmap,
476	    (uint32_t) (vaddr >> 32), (uint32_t) vaddr,
477	    pn, prot);
478
479	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
480		set_NX = FALSE;
481	else
482		set_NX = TRUE;
483
484	if (__improbable(set_NX && (pmap == kernel_pmap) && ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) || (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
485		set_NX = FALSE;
486	}
487
488	/*
489	 *	Must allocate a new pvlist entry while we're unlocked;
490	 *	zalloc may cause pageout (which will lock the pmap system).
491	 *	If we determine we need a pvlist entry, we will unlock
	 *	and allocate one.  Then we will retry, throwing away
493	 *	the allocated entry later (if we no longer need it).
494	 */
495
496	pvh_new = PV_HASHED_ENTRY_NULL;
497Retry:
498	pvh_e = PV_HASHED_ENTRY_NULL;
499
500	PMAP_LOCK(pmap);
501
502	/*
503	 *	Expand pmap to include this pte.  Assume that
504	 *	pmap is always expanded to include enough hardware
505	 *	pages to map one VM page.
506	 */
507	 if(superpage) {
508	 	while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
509			/* need room for another pde entry */
510			PMAP_UNLOCK(pmap);
511			kr_expand = pmap_expand_pdpt(pmap, vaddr, options);
512			if (kr_expand != KERN_SUCCESS)
513				return kr_expand;
514			PMAP_LOCK(pmap);
515		}
516	} else {
517		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
			/*
			 * Must unlock to expand the pmap;
			 * we're going to grow pde-level page(s).
			 */
522			PMAP_UNLOCK(pmap);
523			kr_expand = pmap_expand(pmap, vaddr, options);
524			if (kr_expand != KERN_SUCCESS)
525				return kr_expand;
526			PMAP_LOCK(pmap);
527		}
528	}
529	if (options & PMAP_EXPAND_OPTIONS_NOENTER) {
530		PMAP_UNLOCK(pmap);
531		return KERN_SUCCESS;
532	}
533
534	if (superpage && *pte && !(*pte & INTEL_PTE_PS)) {
535		/*
536		 * There is still an empty page table mapped that
537		 * was used for a previous base page mapping.
538		 * Remember the PDE and the PDE index, so that we
539		 * can free the page at the end of this function.
540		 */
541		delpage_pde_index = pdeidx(pmap, vaddr);
542		delpage_pm_obj = pmap->pm_obj;
543		*pte = 0;
544	}
545
546	old_pa = pte_to_pa(*pte);
547	pai = pa_index(old_pa);
548	old_pa_locked = FALSE;
549
550	if (old_pa == 0 &&
551	    (*pte & INTEL_PTE_COMPRESSED)) {
552		/* one less "compressed" */
553		OSAddAtomic64(-1, &pmap->stats.compressed);
554		/* marker will be cleared below */
555	}
556
	/*
	 * If we have a previous managed page, lock the pv entry now.  After
	 * we lock it, check to see if someone beat us to the lock and, if so,
	 * drop the lock.
	 */
562	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
563		LOCK_PVH(pai);
564		old_pa_locked = TRUE;
565		old_pa = pte_to_pa(*pte);
566		if (0 == old_pa) {
567			UNLOCK_PVH(pai);	/* another path beat us to it */
568			old_pa_locked = FALSE;
569		}
570	}
571
572	/*
573	 *	Special case if the incoming physical page is already mapped
574	 *	at this address.
575	 */
576	if (old_pa == pa) {
577		pt_entry_t old_attributes =
578		    *pte & ~(INTEL_PTE_REF | INTEL_PTE_MOD);
579
580		/*
581	         *	May be changing its wired attribute or protection
582	         */
583
584		template = pa_to_pte(pa) | INTEL_PTE_VALID;
585		template |= pmap_get_cache_attributes(pa_index(pa));
586
587		if (VM_MEM_NOT_CACHEABLE ==
588		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
589			if (!(flags & VM_MEM_GUARDED))
590				template |= INTEL_PTE_PTA;
591			template |= INTEL_PTE_NCACHE;
592		}
593		if (pmap != kernel_pmap)
594			template |= INTEL_PTE_USER;
595		if (prot & VM_PROT_WRITE) {
596			template |= INTEL_PTE_WRITE;
597		}
598
599		if (set_NX)
600			template |= INTEL_PTE_NX;
601
602		if (wired) {
603			template |= INTEL_PTE_WIRED;
604			if (!iswired(old_attributes))  {
605				OSAddAtomic(+1, &pmap->stats.wired_count);
606				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
607			}
608		} else {
609			if (iswired(old_attributes)) {
610				assert(pmap->stats.wired_count >= 1);
611				OSAddAtomic(-1, &pmap->stats.wired_count);
612				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
613			}
614		}
615		if (superpage)		/* this path can not be used */
616			template |= INTEL_PTE_PS;	/* to change the page size! */
617
618		if (old_attributes == template)
619			goto dont_update_pte;
620
621		/* Determine delta, PV locked */
622		need_tlbflush =
623		    ((old_attributes ^ template) != INTEL_PTE_WIRED);
624
625		if (need_tlbflush == TRUE && !(old_attributes & INTEL_PTE_WRITE)) {
626			if ((old_attributes ^ template) == INTEL_PTE_WRITE)
627				need_tlbflush = FALSE;
628		}
629
630		/* store modified PTE and preserve RC bits */
		pt_entry_t npte, opte;
632		do {
633			opte = *pte;
634			npte = template | (opte & (INTEL_PTE_REF | INTEL_PTE_MOD));
635		} while (!pmap_cmpx_pte(pte, opte, npte));
636dont_update_pte:
637		if (old_pa_locked) {
638			UNLOCK_PVH(pai);
639			old_pa_locked = FALSE;
640		}
641		goto Done;
642	}
643
644	/*
645	 *	Outline of code from here:
646	 *	   1) If va was mapped, update TLBs, remove the mapping
647	 *	      and remove old pvlist entry.
648	 *	   2) Add pvlist entry for new mapping
649	 *	   3) Enter new mapping.
650	 *
651	 *	If the old physical page is not managed step 1) is skipped
652	 *	(except for updating the TLBs), and the mapping is
653	 *	overwritten at step 3).  If the new physical page is not
654	 *	managed, step 2) is skipped.
655	 */
656
657	if (old_pa != (pmap_paddr_t) 0) {
658
659		/*
660	         *	Don't do anything to pages outside valid memory here.
661	         *	Instead convince the code that enters a new mapping
662	         *	to overwrite the old one.
663	         */
664
665		/* invalidate the PTE */
666		pmap_update_pte(pte, INTEL_PTE_VALID, 0);
667		/* propagate invalidate everywhere */
668		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
669		/* remember reference and change */
670		old_pte	= *pte;
671		oattr = (char) (old_pte & (PHYS_MODIFIED | PHYS_REFERENCED));
672		/* completely invalidate the PTE */
673		pmap_store_pte(pte, 0);
674
675		if (IS_MANAGED_PAGE(pai)) {
676			pmap_assert(old_pa_locked == TRUE);
677			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
678			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
679			assert(pmap->stats.resident_count >= 1);
680			OSAddAtomic(-1, &pmap->stats.resident_count);
681			if (pmap != kernel_pmap) {
682				if (IS_REUSABLE_PAGE(pai)) {
683					assert(pmap->stats.reusable > 0);
684					OSAddAtomic(-1, &pmap->stats.reusable);
685				} else if (IS_INTERNAL_PAGE(pai)) {
686					assert(pmap->stats.internal > 0);
687					OSAddAtomic(-1, &pmap->stats.internal);
688				} else {
689					assert(pmap->stats.external > 0);
690					OSAddAtomic(-1, &pmap->stats.external);
691				}
692			}
693			if (iswired(*pte)) {
694				assert(pmap->stats.wired_count >= 1);
695				OSAddAtomic(-1, &pmap->stats.wired_count);
696				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
697				    PAGE_SIZE);
698			}
699			pmap_phys_attributes[pai] |= oattr;
700
701			/*
702			 *	Remove the mapping from the pvlist for
703			 *	this physical page.
704			 *      We'll end up with either a rooted pv or a
705			 *      hashed pv
706			 */
707			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte);
708
709		} else {
710
711			/*
712			 *	old_pa is not managed.
713			 *	Do removal part of accounting.
714			 */
715
716			if (pmap != kernel_pmap) {
717#if 00
718				assert(pmap->stats.device > 0);
719				OSAddAtomic(-1, &pmap->stats.device);
720#endif
721			}
722			if (iswired(*pte)) {
723				assert(pmap->stats.wired_count >= 1);
724				OSAddAtomic(-1, &pmap->stats.wired_count);
725				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
726			}
727		}
728	}
729
730	/*
	 * If we had a previously managed page locked, unlock it now.
732	 */
733	if (old_pa_locked) {
734		UNLOCK_PVH(pai);
735		old_pa_locked = FALSE;
736	}
737
738	pai = pa_index(pa);	/* now working with new incoming phys page */
739	if (IS_MANAGED_PAGE(pai)) {
740
741		/*
742	         *	Step 2) Enter the mapping in the PV list for this
743	         *	physical page.
744	         */
745		pv_h = pai_to_pvh(pai);
746
747		LOCK_PVH(pai);
748
749		if (pv_h->pmap == PMAP_NULL) {
750			/*
751			 *	No mappings yet, use rooted pv
752			 */
753			pv_h->va = vaddr;
754			pv_h->pmap = pmap;
755			queue_init(&pv_h->qlink);
756
757			if (options & PMAP_OPTIONS_INTERNAL) {
758				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
759			} else {
760				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
761			}
762			if (options & PMAP_OPTIONS_REUSABLE) {
763				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
764			} else {
765				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
766			}
767		} else {
768			/*
769			 *	Add new pv_hashed_entry after header.
770			 */
771			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
772				pvh_e = pvh_new;
773				pvh_new = PV_HASHED_ENTRY_NULL;
774			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
775				PV_HASHED_ALLOC(&pvh_e);
776				if (PV_HASHED_ENTRY_NULL == pvh_e) {
					/*
					 * The pv_e free list is empty.  If we
					 * are on the kernel pmap, we'll use one
					 * of the special private kernel pv_e's;
					 * otherwise, we need to unlock
					 * everything, zalloc a pv_e, and
					 * restart, bringing the pv_e in with
					 * us.
					 */
786					if (kernel_pmap == pmap) {
787						PV_HASHED_KERN_ALLOC(&pvh_e);
788					} else {
789						UNLOCK_PVH(pai);
790						PMAP_UNLOCK(pmap);
791						pmap_pv_throttle(pmap);
792						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
793						goto Retry;
794					}
795				}
796			}
797
798			if (PV_HASHED_ENTRY_NULL == pvh_e)
799				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
800
801			pvh_e->va = vaddr;
802			pvh_e->pmap = pmap;
803			pvh_e->ppn = pn;
804			pv_hash_add(pvh_e, pv_h);
805
806			/*
807			 *	Remember that we used the pvlist entry.
808			 */
809			pvh_e = PV_HASHED_ENTRY_NULL;
810		}
811
812		/*
813	         * only count the mapping
814	         * for 'managed memory'
815	         */
816		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
817		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
818		OSAddAtomic(+1,  &pmap->stats.resident_count);
819		if (pmap->stats.resident_count > pmap->stats.resident_max) {
820			pmap->stats.resident_max = pmap->stats.resident_count;
821		}
822		if (pmap != kernel_pmap) {
823			if (IS_REUSABLE_PAGE(pai)) {
824				OSAddAtomic(+1, &pmap->stats.reusable);
825				PMAP_STATS_PEAK(pmap->stats.reusable);
826			} else if (IS_INTERNAL_PAGE(pai)) {
827				OSAddAtomic(+1, &pmap->stats.internal);
828				PMAP_STATS_PEAK(pmap->stats.internal);
829			} else {
830				OSAddAtomic(+1, &pmap->stats.external);
831				PMAP_STATS_PEAK(pmap->stats.external);
832			}
833		}
834	} else if (last_managed_page == 0) {
835		/* Account for early mappings created before "managed pages"
836		 * are determined. Consider consulting the available DRAM map.
837		 */
838		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
839		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
840		OSAddAtomic(+1,  &pmap->stats.resident_count);
841		if (pmap != kernel_pmap) {
842#if 00
843			OSAddAtomic(+1, &pmap->stats.device);
844			PMAP_STATS_PEAK(pmap->stats.device);
845#endif
846		}
847	}
848	/*
849	 * Step 3) Enter the mapping.
850	 *
851	 *	Build a template to speed up entering -
852	 *	only the pfn changes.
853	 */
854	template = pa_to_pte(pa) | INTEL_PTE_VALID;
855	/*
856	 * DRK: It may be worth asserting on cache attribute flags that diverge
857	 * from the existing physical page attributes.
858	 */
859
860	template |= pmap_get_cache_attributes(pa_index(pa));
861
862	if (flags & VM_MEM_NOT_CACHEABLE) {
863		if (!(flags & VM_MEM_GUARDED))
864			template |= INTEL_PTE_PTA;
865		template |= INTEL_PTE_NCACHE;
866	}
867	if (pmap != kernel_pmap)
868		template |= INTEL_PTE_USER;
869	if (prot & VM_PROT_WRITE)
870		template |= INTEL_PTE_WRITE;
871	if (set_NX)
872		template |= INTEL_PTE_NX;
873	if (wired) {
874		template |= INTEL_PTE_WIRED;
875		OSAddAtomic(+1,  & pmap->stats.wired_count);
876		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
877	}
878	if (superpage)
879		template |= INTEL_PTE_PS;
880	pmap_store_pte(pte, template);
881
	/*
	 * If this was a managed page, we delayed unlocking the pv until here
	 * to prevent pmap_page_protect et al. from finding it until the pte
	 * has been stored.
	 */
887	if (IS_MANAGED_PAGE(pai)) {
888		UNLOCK_PVH(pai);
889	}
890Done:
891	if (need_tlbflush == TRUE) {
892		if (options & PMAP_OPTIONS_NOFLUSH)
893			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
894		else
895			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
896	}
897	if (pvh_e != PV_HASHED_ENTRY_NULL) {
898		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
899	}
900	if (pvh_new != PV_HASHED_ENTRY_NULL) {
901		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
902	}
903	PMAP_UNLOCK(pmap);
904
905	if (delpage_pm_obj) {
906		vm_page_t m;
907
908		vm_object_lock(delpage_pm_obj);
909		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
910		if (m == VM_PAGE_NULL)
911		    panic("pmap_enter: pte page not in object");
912		vm_object_unlock(delpage_pm_obj);
913		VM_PAGE_FREE(m);
914		OSAddAtomic(-1,  &inuse_ptepages_count);
915		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
916	}
917
918	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
919	return KERN_SUCCESS;
920}
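
/*
 * A hedged sketch of how the PTE template in pmap_enter_options() above is
 * composed from the frame address plus attribute bits.  The bit values below
 * are local stand-ins, not the kernel's INTEL_PTE_* definitions.
 */
#if 0	/* illustrative sketch only; bit positions are assumptions */
#include <stdbool.h>
#include <stdint.h>

#define SK_PTE_VALID	0x001ULL
#define SK_PTE_WRITE	0x002ULL
#define SK_PTE_USER	0x004ULL
#define SK_PTE_PS	0x080ULL
#define SK_PTE_WIRED	0x400ULL		/* software-defined bit in this sketch */
#define SK_PTE_NX	(1ULL << 63)
#define SK_FRAME_MASK	(~0xfffULL)

static uint64_t
make_pte_template(uint64_t pa, bool user, bool writable, bool nx,
    bool wired, bool superpage)
{
	uint64_t t = (pa & SK_FRAME_MASK) | SK_PTE_VALID;	/* pa_to_pte() analogue */

	if (user)
		t |= SK_PTE_USER;
	if (writable)
		t |= SK_PTE_WRITE;
	if (nx)
		t |= SK_PTE_NX;
	if (wired)
		t |= SK_PTE_WIRED;
	if (superpage)
		t |= SK_PTE_PS;
	return t;
}
#endif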
921
922/*
923 *	Remove a range of hardware page-table entries.
924 *	The entries given are the first (inclusive)
925 *	and last (exclusive) entries for the VM pages.
926 *	The virtual address is the va for the first pte.
927 *
928 *	The pmap must be locked.
929 *	If the pmap is not the kernel pmap, the range must lie
930 *	entirely within one pte-page.  This is NOT checked.
931 *	Assumes that the pte-page exists.
932 */
933
934void
935pmap_remove_range(
936	pmap_t			pmap,
937	vm_map_offset_t		start_vaddr,
938	pt_entry_t		*spte,
939	pt_entry_t		*epte)
940{
941	pmap_remove_range_options(pmap, start_vaddr, spte, epte, 0);
942}
943
944void
945pmap_remove_range_options(
946	pmap_t			pmap,
947	vm_map_offset_t		start_vaddr,
948	pt_entry_t		*spte,
949	pt_entry_t		*epte,
950	int			options)
951{
952	pt_entry_t		*cpte;
953	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
954	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
955	pv_hashed_entry_t       pvh_e;
956	int			pvh_cnt = 0;
957	int			num_removed, num_unwired, num_found, num_invalid;
958	int			num_device, num_external, num_internal, num_reusable;
959	uint64_t		num_compressed;
960	ppnum_t			pai;
961	pmap_paddr_t		pa;
962	vm_map_offset_t		vaddr;
963
964	num_removed = 0;
965	num_unwired = 0;
966	num_found   = 0;
967	num_invalid = 0;
968	num_device  = 0;
969	num_external = 0;
970	num_internal = 0;
971	num_reusable = 0;
972	num_compressed = 0;
973	/* invalidate the PTEs first to "freeze" them */
974	for (cpte = spte, vaddr = start_vaddr;
975	     cpte < epte;
976	     cpte++, vaddr += PAGE_SIZE_64) {
977		pt_entry_t p = *cpte;
978
979		pa = pte_to_pa(p);
980		if (pa == 0) {
981			if (pmap != kernel_pmap &&
982			    (options & PMAP_OPTIONS_REMOVE) &&
983			    (p & INTEL_PTE_COMPRESSED)) {
984				/* one less "compressed" */
985				num_compressed++;
986				/* clear marker */
987				/* XXX probably does not need to be atomic! */
988				pmap_update_pte(cpte, INTEL_PTE_COMPRESSED, 0);
989			}
990			continue;
991		}
992		num_found++;
993
994		if (iswired(p))
995			num_unwired++;
996
997		pai = pa_index(pa);
998
999		if (!IS_MANAGED_PAGE(pai)) {
1000			/*
1001			 *	Outside range of managed physical memory.
1002			 *	Just remove the mappings.
1003			 */
1004			pmap_store_pte(cpte, 0);
1005			num_device++;
1006			continue;
1007		}
1008
1009		if ((p & INTEL_PTE_VALID) == 0)
1010			num_invalid++;
1011
1012		/* invalidate the PTE */
1013		pmap_update_pte(cpte, INTEL_PTE_VALID, 0);
1014	}
1015
1016	if (num_found == 0) {
1017		/* nothing was changed: we're done */
1018	        goto update_counts;
1019	}
1020
1021	/* propagate the invalidates to other CPUs */
1022
1023	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1024
1025	for (cpte = spte, vaddr = start_vaddr;
1026	     cpte < epte;
1027	     cpte++, vaddr += PAGE_SIZE_64) {
1028
1029		pa = pte_to_pa(*cpte);
1030		if (pa == 0)
1031			continue;
1032
1033		pai = pa_index(pa);
1034
1035		LOCK_PVH(pai);
1036
1037		pa = pte_to_pa(*cpte);
1038		if (pa == 0) {
1039			UNLOCK_PVH(pai);
1040			continue;
1041		}
1042		num_removed++;
1043		if (IS_REUSABLE_PAGE(pai)) {
1044			num_reusable++;
1045		} else if (IS_INTERNAL_PAGE(pai)) {
1046			num_internal++;
1047		} else {
1048			num_external++;
1049		}
1050
1051		/*
1052	       	 * Get the modify and reference bits, then
1053	       	 * nuke the entry in the page table
1054	       	 */
1055		/* remember reference and change */
1056		pmap_phys_attributes[pai] |=
1057			(char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
1058
1059		/*
1060	      	 * Remove the mapping from the pvlist for this physical page.
1061	         */
1062		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte);
1063
1064		/* completely invalidate the PTE */
1065		pmap_store_pte(cpte, 0);
1066
1067		UNLOCK_PVH(pai);
1068
1069		if (pvh_e != PV_HASHED_ENTRY_NULL) {
1070			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1071			pvh_eh = pvh_e;
1072
1073			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1074				pvh_et = pvh_e;
1075			}
1076			pvh_cnt++;
1077		}
1078	} /* for loop */
1079
1080	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1081		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1082	}
1083update_counts:
1084	/*
1085	 *	Update the counts
1086	 */
1087#if TESTING
1088	if (pmap->stats.resident_count < num_removed)
1089	        panic("pmap_remove_range: resident_count");
1090#endif
1091	pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
1092	pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(num_removed));
1093	assert(pmap->stats.resident_count >= num_removed);
1094	OSAddAtomic(-num_removed,  &pmap->stats.resident_count);
1095
1096	if (pmap != kernel_pmap) {
1097#if 00
1098		assert(pmap->stats.device >= num_device);
1099		if (num_device)
1100			OSAddAtomic(-num_device, &pmap->stats.device);
1101#endif /* 00 */
1102		assert(pmap->stats.external >= num_external);
1103		if (num_external)
1104			OSAddAtomic(-num_external, &pmap->stats.external);
1105		assert(pmap->stats.internal >= num_internal);
1106		if (num_internal)
1107			OSAddAtomic(-num_internal, &pmap->stats.internal);
1108		assert(pmap->stats.reusable >= num_reusable);
1109		if (num_reusable)
1110			OSAddAtomic(-num_reusable, &pmap->stats.reusable);
1111		assert(pmap->stats.compressed >= num_compressed);
1112		if (num_compressed)
1113			OSAddAtomic64(-num_compressed, &pmap->stats.compressed);
1114	}
1115
1116#if TESTING
1117	if (pmap->stats.wired_count < num_unwired)
1118	        panic("pmap_remove_range: wired_count");
1119#endif
1120	assert(pmap->stats.wired_count >= num_unwired);
1121	OSAddAtomic(-num_unwired,  &pmap->stats.wired_count);
1122	pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
1123
1124	return;
1125}
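
/*
 * A condensed user-space sketch of the two-pass structure used above: the first
 * pass clears the valid bit to "freeze" the live entries, the TLBs are flushed
 * once for the whole range, and the second pass does the per-page bookkeeping
 * before zeroing each entry.  The type and flag are local stand-ins.
 */
#if 0	/* illustrative sketch only; not kernel code */
#include <stdint.h>

#define SK_PTE_VALID	0x1ULL

static void sk_flush_tlbs(void) { /* stand-in for PMAP_UPDATE_TLBS() */ }

static unsigned int
remove_range_sketch(uint64_t *spte, uint64_t *epte)
{
	uint64_t *cpte;
	unsigned int removed = 0;

	/* pass 1: clear the valid bit to freeze the entries */
	for (cpte = spte; cpte < epte; cpte++)
		if (*cpte != 0)
			*cpte &= ~SK_PTE_VALID;

	sk_flush_tlbs();	/* one shootdown covers the whole range */

	/* pass 2: per-page accounting, then clear each entry completely */
	for (cpte = spte; cpte < epte; cpte++) {
		if (*cpte == 0)
			continue;
		removed++;
		*cpte = 0;
	}
	return removed;
}
#endif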
1126
1127
1128/*
1129 *	Remove the given range of addresses
1130 *	from the specified map.
1131 *
1132 *	It is assumed that the start and end are properly
1133 *	rounded to the hardware page size.
1134 */
1135void
1136pmap_remove(
1137	pmap_t		map,
1138	addr64_t	s64,
1139	addr64_t	e64)
1140{
1141	pmap_remove_options(map, s64, e64, 0);
1142}
1143
1144void
1145pmap_remove_options(
1146	pmap_t		map,
1147	addr64_t	s64,
1148	addr64_t	e64,
1149	int		options)
1150{
1151	pt_entry_t     *pde;
1152	pt_entry_t     *spte, *epte;
1153	addr64_t        l64;
1154	uint64_t        deadline;
1155
1156	pmap_intr_assert();
1157
1158	if (map == PMAP_NULL || s64 == e64)
1159		return;
1160
1161	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
1162		   map,
1163		   (uint32_t) (s64 >> 32), s64,
1164		   (uint32_t) (e64 >> 32), e64);
1165
1166
1167	PMAP_LOCK(map);
1168
1169#if 0
1170	/*
1171	 * Check that address range in the kernel does not overlap the stacks.
1172	 * We initialize local static min/max variables once to avoid making
1173	 * 2 function calls for every remove. Note also that these functions
1174	 * both return 0 before kernel stacks have been initialized, and hence
1175	 * the panic is not triggered in this case.
1176	 */
1177	if (map == kernel_pmap) {
1178		static vm_offset_t kernel_stack_min = 0;
1179		static vm_offset_t kernel_stack_max = 0;
1180
1181		if (kernel_stack_min == 0) {
1182			kernel_stack_min = min_valid_stack_address();
1183			kernel_stack_max = max_valid_stack_address();
1184		}
1185		if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
1186		    (kernel_stack_min < e64 && e64 <= kernel_stack_max))
1187			panic("pmap_remove() attempted in kernel stack");
1188	}
1189#else
1190
1191	/*
1192	 * The values of kernel_stack_min and kernel_stack_max are no longer
1193	 * relevant now that we allocate kernel stacks in the kernel map,
1194	 * so the old code above no longer applies.  If we wanted to check that
1195	 * we weren't removing a mapping of a page in a kernel stack we'd
1196	 * mark the PTE with an unused bit and check that here.
1197	 */
1198
1199#endif
1200
1201	deadline = rdtsc64() + max_preemption_latency_tsc;
1202
1203	while (s64 < e64) {
1204		l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
1205		if (l64 > e64)
1206			l64 = e64;
1207		pde = pmap_pde(map, s64);
1208
1209		if (pde && (*pde & INTEL_PTE_VALID)) {
1210			if (*pde & INTEL_PTE_PS) {
1211				/*
1212				 * If we're removing a superpage, pmap_remove_range()
1213				 * must work on level 2 instead of level 1; and we're
1214				 * only passing a single level 2 entry instead of a
1215				 * level 1 range.
1216				 */
1217				spte = pde;
1218				epte = spte+1; /* excluded */
1219			} else {
1220				spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
1221				spte = &spte[ptenum(s64)];
1222				epte = &spte[intel_btop(l64 - s64)];
1223			}
1224			pmap_remove_range_options(map, s64, spte, epte,
1225						  options);
1226		}
1227		s64 = l64;
1228
1229		if (s64 < e64 && rdtsc64() >= deadline) {
1230			PMAP_UNLOCK(map)
1231			PMAP_LOCK(map)
1232			deadline = rdtsc64() + max_preemption_latency_tsc;
1233		}
1234	}
1235
1236	PMAP_UNLOCK(map);
1237
1238	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
1239		   map, 0, 0, 0, 0);
1240
1241}
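
/*
 * A sketch of the per-iteration chunking used above: each pass covers at most
 * the remainder of the current PDE-mapped region, so the limit is the next
 * pde_mapped_size boundary, clipped to the caller's end address.  The 2MiB
 * value is an assumption, not the kernel's pde_mapped_size.
 */
#if 0	/* illustrative sketch only; not kernel code */
#include <stdint.h>

#define SKETCH_PDE_MAPPED_SIZE	(2ULL * 1024 * 1024)	/* assumed 2MiB per PDE */

static uint64_t
next_chunk_end(uint64_t s64, uint64_t e64)
{
	/* next PDE boundary strictly above s64 ... */
	uint64_t l64 = (s64 + SKETCH_PDE_MAPPED_SIZE) &
	    ~(SKETCH_PDE_MAPPED_SIZE - 1);

	/* ... clipped to the caller's end address */
	return (l64 > e64) ? e64 : l64;
}
#endif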
1242
1243void
1244pmap_page_protect(
1245        ppnum_t         pn,
1246	vm_prot_t	prot)
1247{
1248	pmap_page_protect_options(pn, prot, 0, NULL);
1249}
1250
1251/*
1252 *	Routine:	pmap_page_protect_options
1253 *
1254 *	Function:
1255 *		Lower the permission for all mappings to a given
1256 *		page.
1257 */
1258void
1259pmap_page_protect_options(
1260        ppnum_t         pn,
1261	vm_prot_t	prot,
1262	unsigned int	options,
1263	void		*arg)
1264{
1265	pv_hashed_entry_t	pvh_eh = PV_HASHED_ENTRY_NULL;
1266	pv_hashed_entry_t	pvh_et = PV_HASHED_ENTRY_NULL;
1267	pv_hashed_entry_t	nexth;
1268	int			pvh_cnt = 0;
1269	pv_rooted_entry_t	pv_h;
1270	pv_rooted_entry_t	pv_e;
1271	pv_hashed_entry_t	pvh_e;
1272	pt_entry_t		*pte;
1273	int			pai;
1274	pmap_t			pmap;
1275	boolean_t		remove;
1276	pt_entry_t		new_pte_value;
1277
1278	pmap_intr_assert();
1279	assert(pn != vm_page_fictitious_addr);
1280	if (pn == vm_page_guard_addr)
1281		return;
1282
1283	pai = ppn_to_pai(pn);
1284
1285	if (!IS_MANAGED_PAGE(pai)) {
1286		/*
1287	         *	Not a managed page.
1288	         */
1289		return;
1290	}
1291	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
1292		   pn, prot, 0, 0, 0);
1293
1294	/*
1295	 * Determine the new protection.
1296	 */
1297	switch (prot) {
1298	case VM_PROT_READ:
1299	case VM_PROT_READ | VM_PROT_EXECUTE:
1300		remove = FALSE;
1301		break;
1302	case VM_PROT_ALL:
1303		return;		/* nothing to do */
1304	default:
1305		remove = TRUE;
1306		break;
1307	}
1308
1309	pv_h = pai_to_pvh(pai);
1310
1311	LOCK_PVH(pai);
1312
1313
1314	/*
1315	 * Walk down PV list, if any, changing or removing all mappings.
1316	 */
1317	if (pv_h->pmap == PMAP_NULL)
1318		goto done;
1319
1320	pv_e = pv_h;
1321	pvh_e = (pv_hashed_entry_t) pv_e;	/* cheat */
1322
1323	do {
1324		vm_map_offset_t vaddr;
1325
1326		pmap = pv_e->pmap;
1327		vaddr = pv_e->va;
		pte = pmap_pte(pmap, vaddr);

		if (0 == pte) {
			panic("pmap_page_protect() "
				"pmap=%p pn=0x%x vaddr=0x%llx\n",
				pmap, pn, vaddr);
		}

		pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
		    "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
1338		nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1339
1340		/*
1341		 * Remove the mapping if new protection is NONE
1342		 */
1343		if (remove) {
1344
1345			/* Remove per-pmap wired count */
1346			if (iswired(*pte)) {
1347				OSAddAtomic(-1, &pmap->stats.wired_count);
1348				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1349			}
1350
1351			if (pmap != kernel_pmap &&
1352			    (options & PMAP_OPTIONS_COMPRESSOR) &&
1353			    IS_INTERNAL_PAGE(pai)) {
1354				/* adjust "reclaimed" stats */
1355				OSAddAtomic64(+1, &pmap->stats.compressed);
1356				PMAP_STATS_PEAK(pmap->stats.compressed);
1357				pmap->stats.compressed_lifetime++;
1358				/* mark this PTE as having been "reclaimed" */
1359				new_pte_value = INTEL_PTE_COMPRESSED;
1360			} else {
1361				new_pte_value = 0;
1362			}
1363
1364			if (options & PMAP_OPTIONS_NOREFMOD) {
1365				pmap_store_pte(pte, new_pte_value);
1366
1367				if (options & PMAP_OPTIONS_NOFLUSH)
1368					PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1369				else
1370					PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1371			} else {
1372				/*
1373				 * Remove the mapping, collecting dirty bits.
1374				 */
1375				pmap_update_pte(pte, INTEL_PTE_VALID, 0);
1376
1377				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1378				pmap_phys_attributes[pai] |=
1379					*pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1380				pmap_store_pte(pte, new_pte_value);
1381			}
1382#if TESTING
1383			if (pmap->stats.resident_count < 1)
1384				panic("pmap_page_protect: resident_count");
1385#endif
1386			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1387			assert(pmap->stats.resident_count >= 1);
1388			OSAddAtomic(-1,  &pmap->stats.resident_count);
1389			if (options & PMAP_OPTIONS_COMPRESSOR) {
1390				/*
1391				 * This removal is only being done so we can send this page to
1392				 * the compressor; therefore it mustn't affect total task footprint.
1393				 */
1394				pmap_ledger_credit(pmap, task_ledgers.phys_compressed, PAGE_SIZE);
1395			} else {
1396				pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1397			}
1398
1399			if (pmap != kernel_pmap) {
1400				if (IS_REUSABLE_PAGE(pai)) {
1401					assert(pmap->stats.reusable > 0);
1402					OSAddAtomic(-1, &pmap->stats.reusable);
1403				} else if (IS_INTERNAL_PAGE(pai)) {
1404					assert(pmap->stats.internal > 0);
1405					OSAddAtomic(-1, &pmap->stats.internal);
1406				} else {
1407					assert(pmap->stats.external > 0);
1408					OSAddAtomic(-1, &pmap->stats.external);
1409				}
1410			}
1411
1412			/*
1413		         * Deal with the pv_rooted_entry.
1414		         */
1415
1416			if (pv_e == pv_h) {
1417				/*
1418				 * Fix up head later.
1419				 */
1420				pv_h->pmap = PMAP_NULL;
1421			} else {
1422				/*
1423				 * Delete this entry.
1424				 */
1425				pv_hash_remove(pvh_e);
1426				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1427				pvh_eh = pvh_e;
1428
1429				if (pvh_et == PV_HASHED_ENTRY_NULL)
1430					pvh_et = pvh_e;
1431				pvh_cnt++;
1432			}
1433		} else {
1434			/*
1435		         * Write-protect, after opportunistic refmod collect
1436		         */
1437			pmap_phys_attributes[pai] |=
1438			    *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1439			pmap_update_pte(pte, INTEL_PTE_WRITE, 0);
1440
1441			if (options & PMAP_OPTIONS_NOFLUSH)
1442				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1443			else
1444				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1445		}
1446		pvh_e = nexth;
1447	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1448
1449
1450	/*
1451	 * If pv_head mapping was removed, fix it up.
1452	 */
1453	if (pv_h->pmap == PMAP_NULL) {
1454		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1455
1456		if (pvh_e != (pv_hashed_entry_t) pv_h) {
1457			pv_hash_remove(pvh_e);
1458			pv_h->pmap = pvh_e->pmap;
1459			pv_h->va = pvh_e->va;
1460			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1461			pvh_eh = pvh_e;
1462
1463			if (pvh_et == PV_HASHED_ENTRY_NULL)
1464				pvh_et = pvh_e;
1465			pvh_cnt++;
1466		}
1467	}
1468	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1469		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1470	}
1471done:
1472	UNLOCK_PVH(pai);
1473
1474	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
1475		   0, 0, 0, 0, 0);
1476}
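
/*
 * A small sketch of the decision made at the top of pmap_page_protect_options()
 * above: read or read/execute downgrades existing mappings to read-only, full
 * access is a no-op, and anything else removes the mappings.  The protection
 * encoding here is a local stand-in for vm_prot_t.
 */
#if 0	/* illustrative sketch only; not kernel code */
enum sk_prot { SK_PROT_READ = 1, SK_PROT_WRITE = 2, SK_PROT_EXECUTE = 4 };
enum sk_action { SK_NOOP, SK_WRITE_PROTECT, SK_REMOVE };

static enum sk_action
page_protect_action(int prot)
{
	switch (prot) {
	case SK_PROT_READ:
	case SK_PROT_READ | SK_PROT_EXECUTE:
		return SK_WRITE_PROTECT;	/* keep mappings, clear write permission */
	case SK_PROT_READ | SK_PROT_WRITE | SK_PROT_EXECUTE:	/* VM_PROT_ALL analogue */
		return SK_NOOP;
	default:
		return SK_REMOVE;		/* e.g. the VM_PROT_NONE analogue */
	}
}
#endif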
1477
1478
1479/*
1480 *	Clear specified attribute bits.
1481 */
1482void
1483phys_attribute_clear(
1484	ppnum_t		pn,
1485	int		bits,
1486	unsigned int	options,
1487	void		*arg)
1488{
1489	pv_rooted_entry_t	pv_h;
1490	pv_hashed_entry_t	pv_e;
1491	pt_entry_t		*pte;
1492	int			pai;
1493	pmap_t			pmap;
1494	char			attributes = 0;
1495
1496	pmap_intr_assert();
1497	assert(pn != vm_page_fictitious_addr);
1498	if (pn == vm_page_guard_addr)
1499		return;
1500
1501	pai = ppn_to_pai(pn);
1502
1503	if (!IS_MANAGED_PAGE(pai)) {
1504		/*
1505		 *	Not a managed page.
1506		 */
1507		return;
1508	}
1509
1510	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
1511		   pn, bits, 0, 0, 0);
1512
1513	pv_h = pai_to_pvh(pai);
1514
1515	LOCK_PVH(pai);
1516
1517	/*
1518	 * Walk down PV list, clearing all modify or reference bits.
1519	 * We do not have to lock the pv_list because we have
1520	 * the per-pmap lock
1521	 */
1522	if (pv_h->pmap != PMAP_NULL) {
1523		/*
1524		 * There are some mappings.
1525		 */
1526
1527		pv_e = (pv_hashed_entry_t)pv_h;
1528
1529		do {
1530			vm_map_offset_t	va;
1531
1532			pmap = pv_e->pmap;
1533			va = pv_e->va;
1534
1535			 /*
1536			  * Clear modify and/or reference bits.
1537			  */
1538			pte = pmap_pte(pmap, va);
1539			attributes |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1540			pmap_update_pte(pte, bits, 0);
1541			/* Ensure all processors using this translation
1542			 * invalidate this TLB entry. The invalidation *must*
1543			 * follow the PTE update, to ensure that the TLB
1544			 * shadow of the 'D' bit (in particular) is
1545			 * synchronized with the updated PTE.
1546			 */
1547			if (options & PMAP_OPTIONS_NOFLUSH) {
1548				if (arg)
1549					PMAP_UPDATE_TLBS_DELAYED(pmap, va, va + PAGE_SIZE, (pmap_flush_context *)arg);
1550			} else
1551				PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
1552
1553			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
1554
1555		} while (pv_e != (pv_hashed_entry_t)pv_h);
1556	}
1557	/* Opportunistic refmod collection, annulled
1558	 * if both REF and MOD are being cleared.
1559	 */
1560
1561	pmap_phys_attributes[pai] |= attributes;
1562	pmap_phys_attributes[pai] &= (~bits);
1563
1564	UNLOCK_PVH(pai);
1565
1566	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
1567		   0, 0, 0, 0, 0);
1568}
1569
1570/*
1571 *	Check specified attribute bits.
1572 */
1573int
1574phys_attribute_test(
1575	ppnum_t		pn,
1576	int		bits)
1577{
1578	pv_rooted_entry_t	pv_h;
1579	pv_hashed_entry_t	pv_e;
1580	pt_entry_t		*pte;
1581	int			pai;
1582	pmap_t			pmap;
1583	int			attributes = 0;
1584
1585	pmap_intr_assert();
1586	assert(pn != vm_page_fictitious_addr);
1587	if (pn == vm_page_guard_addr)
1588		return 0;
1589
1590	pai = ppn_to_pai(pn);
1591
1592	if (!IS_MANAGED_PAGE(pai)) {
1593		/*
1594		 *	Not a managed page.
1595		 */
1596		return 0;
1597	}
1598
	/*
	 * Fast check: if the bits are already collected,
	 * there is no need to take any locks.
	 * If not set, we need to recheck after taking
	 * the lock, in case they got pulled in while
	 * we were waiting for the lock.
	 */
1606	if ((pmap_phys_attributes[pai] & bits) == bits)
1607		return bits;
1608
1609	pv_h = pai_to_pvh(pai);
1610
1611	LOCK_PVH(pai);
1612
1613	attributes = pmap_phys_attributes[pai] & bits;
1614
1615
1616	/*
1617	 * Walk down PV list, checking the mappings until we
1618	 * reach the end or we've found the desired attributes.
1619	 */
1620	if (attributes != bits &&
1621	    pv_h->pmap != PMAP_NULL) {
1622		/*
1623		 * There are some mappings.
1624		 */
1625		pv_e = (pv_hashed_entry_t)pv_h;
1626		do {
1627			vm_map_offset_t va;
1628
1629			pmap = pv_e->pmap;
1630			va = pv_e->va;
1631			/*
1632	 		 * pick up modify and/or reference bits from mapping
1633			 */
1634
1635			pte = pmap_pte(pmap, va);
1636			attributes |= (int)(*pte & bits);
1637
1638			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
1639
1640		} while ((attributes != bits) &&
1641			 (pv_e != (pv_hashed_entry_t)pv_h));
1642	}
1643	pmap_phys_attributes[pai] |= attributes;
1644
1645	UNLOCK_PVH(pai);
1646	return (attributes);
1647}
1648
1649/*
1650 *	Routine:	pmap_change_wiring
1651 *	Function:	Change the wiring attribute for a map/virtual-address
1652 *			pair.
1653 *	In/out conditions:
1654 *			The mapping must already exist in the pmap.
1655 */
1656void
1657pmap_change_wiring(
1658	pmap_t		map,
1659	vm_map_offset_t	vaddr,
1660	boolean_t	wired)
1661{
1662	pt_entry_t	*pte;
1663
1664	PMAP_LOCK(map);
1665
1666	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
1667		panic("pmap_change_wiring: pte missing");
1668
1669	if (wired && !iswired(*pte)) {
1670		/*
1671		 * wiring down mapping
1672		 */
1673		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
1674		OSAddAtomic(+1,  &map->stats.wired_count);
1675		pmap_update_pte(pte, 0, INTEL_PTE_WIRED);
1676	}
1677	else if (!wired && iswired(*pte)) {
1678		/*
1679		 * unwiring mapping
1680		 */
1681		assert(map->stats.wired_count >= 1);
1682		OSAddAtomic(-1,  &map->stats.wired_count);
1683		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
1684		pmap_update_pte(pte, INTEL_PTE_WIRED, 0);
1685	}
1686
1687	PMAP_UNLOCK(map);
1688}
1689
/*
 *	"Backdoor" direct map routine for early mappings.
 *	Useful for mapping memory outside the managed range.
 *	Sets A, D and NC if requested.
 */
1695
1696vm_offset_t
1697pmap_map_bd(
1698	vm_offset_t	virt,
1699	vm_map_offset_t	start_addr,
1700	vm_map_offset_t	end_addr,
1701	vm_prot_t	prot,
1702	unsigned int	flags)
1703{
1704	pt_entry_t	template;
1705	pt_entry_t	*pte;
1706	spl_t           spl;
1707	vm_offset_t	base = virt;
1708	template = pa_to_pte(start_addr)
1709		| INTEL_PTE_REF
1710		| INTEL_PTE_MOD
1711		| INTEL_PTE_WIRED
1712		| INTEL_PTE_VALID;
1713
1714	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
1715		template |= INTEL_PTE_NCACHE;
1716		if (!(flags & (VM_MEM_GUARDED)))
1717			template |= INTEL_PTE_PTA;
1718	}
1719
1720#if    defined(__x86_64__)
1721	if ((prot & VM_PROT_EXECUTE) == 0)
1722		template |= INTEL_PTE_NX;
1723#endif
1724
1725	if (prot & VM_PROT_WRITE)
1726		template |= INTEL_PTE_WRITE;
1727
1728	while (start_addr < end_addr) {
1729	        spl = splhigh();
1730		pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
1731		if (pte == PT_ENTRY_NULL) {
1732			panic("pmap_map_bd: Invalid kernel address\n");
1733		}
1734		pmap_store_pte(pte, template);
1735		splx(spl);
1736		pte_increment_pa(template);
1737		virt += PAGE_SIZE;
1738		start_addr += PAGE_SIZE;
1739	}
1740	flush_tlb_raw();
1741	PMAP_UPDATE_TLBS(kernel_pmap, base, base + end_addr - start_addr);
1742	return(virt);
1743}
1744
1745void
1746pmap_reusable(
1747	pmap_t		pmap,
1748	addr64_t	s64,
1749	addr64_t	e64,
1750	boolean_t	reusable)
1751{
1752	pt_entry_t     *pde;
1753	pt_entry_t     *spte, *epte;
1754	addr64_t        l64;
1755	uint64_t        deadline;
1756
1757	pmap_intr_assert();
1758
1759	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64)
1760		return;
1761
1762	PMAP_TRACE(PMAP_CODE(PMAP__REUSABLE) | DBG_FUNC_START,
1763		   pmap,
1764		   (uint32_t) (s64 >> 32), s64,
1765		   (uint32_t) (e64 >> 32), e64);
1766
1767	PMAP_LOCK(pmap);
1768
1769	deadline = rdtsc64() + max_preemption_latency_tsc;
1770
1771	while (s64 < e64) {
1772		l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
1773		if (l64 > e64)
1774			l64 = e64;
1775		pde = pmap_pde(pmap, s64);
1776
1777		if (pde && (*pde & INTEL_PTE_VALID)) {
1778			if (*pde & INTEL_PTE_PS) {
1779				/* superpage: not supported */
1780			} else {
1781				spte = pmap_pte(pmap,
1782						(s64 & ~(pde_mapped_size - 1)));
1783				spte = &spte[ptenum(s64)];
1784				epte = &spte[intel_btop(l64 - s64)];
1785				pmap_reusable_range(pmap, s64, spte, epte,
1786						    reusable);
1787			}
1788		}
1789		s64 = l64;
1790
1791		if (s64 < e64 && rdtsc64() >= deadline) {
1792			PMAP_UNLOCK(pmap);
1793			PMAP_LOCK(pmap);
1794			deadline = rdtsc64() + max_preemption_latency_tsc;
1795		}
1796	}
1797
1798	PMAP_UNLOCK(pmap);
1799
1800	PMAP_TRACE(PMAP_CODE(PMAP__REUSABLE) | DBG_FUNC_END,
1801		   pmap, reusable, 0, 0, 0);
1802}
1803
1804void
1805pmap_reusable_range(
1806	pmap_t			pmap,
1807	vm_map_offset_t		start_vaddr,
1808	pt_entry_t		*spte,
1809	pt_entry_t		*epte,
1810	boolean_t		reusable)
1811{
1812	pt_entry_t		*cpte;
1813	int			num_external, num_internal, num_reusable;
1814	ppnum_t			pai;
1815	pmap_paddr_t		pa;
1816	vm_map_offset_t		vaddr;
1817
1818	num_external = 0;
1819	num_internal = 0;
1820	num_reusable = 0;
1821
1822	for (cpte = spte, vaddr = start_vaddr;
1823	     cpte < epte;
1824	     cpte++, vaddr += PAGE_SIZE_64) {
1825
1826		pa = pte_to_pa(*cpte);
1827		if (pa == 0)
1828			continue;
1829
1830		pai = pa_index(pa);
1831
1832		LOCK_PVH(pai);
1833
1834		pa = pte_to_pa(*cpte);
1835		if (pa == 0) {
1836			UNLOCK_PVH(pai);
1837			continue;
1838		}
1839		if (reusable) {
1840			/* we want to set "reusable" */
1841			if (IS_REUSABLE_PAGE(pai)) {
1842				/* already reusable: no change */
1843			} else {
1844				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
1845				/* one more "reusable" */
1846				num_reusable++;
1847				if (IS_INTERNAL_PAGE(pai)) {
1848					/* one less "internal" */
1849					num_internal--;
1850				} else {
1851					/* one less "external" */
1852					num_external--;
1853				}
1854			}
1855		} else {
1856			/* we want to clear "reusable" */
1857			if (IS_REUSABLE_PAGE(pai)) {
1858				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
1859				/* one less "reusable" */
1860				num_reusable--;
1861				if (IS_INTERNAL_PAGE(pai)) {
1862					/* one more "internal" */
1863					num_internal++;
1864				} else {
1865					/* one more "external" */
1866					num_external++;
1867				}
1868			} else {
1869				/* already not reusable: no change */
1870			}
1871		}
1872
1873		UNLOCK_PVH(pai);
1874
1875	} /* for loop */
1876
1877	/*
1878	 *	Update the counts
1879	 */
1880	if (pmap != kernel_pmap) {
1881		if (num_external) {
1882			OSAddAtomic(num_external, &pmap->stats.external);
1883			PMAP_STATS_PEAK(pmap->stats.external);
1884		}
1885		assert(pmap->stats.external >= 0);
1886		if (num_internal) {
1887			OSAddAtomic(num_internal, &pmap->stats.internal);
1888			PMAP_STATS_PEAK(pmap->stats.internal);
1889		}
1890		assert(pmap->stats.internal >= 0);
1891		if (num_reusable) {
1892			OSAddAtomic(num_reusable, &pmap->stats.reusable);
1893			PMAP_STATS_PEAK(pmap->stats.reusable);
1894		}
1895		assert(pmap->stats.reusable >= 0);
1896	}
1897
1898	return;
1899}
1900
1901unsigned int
1902pmap_query_resident(
1903	pmap_t		pmap,
1904	addr64_t	s64,
1905	addr64_t	e64)
1906{
1907	pt_entry_t     *pde;
1908	pt_entry_t     *spte, *epte;
1909	addr64_t        l64;
1910	uint64_t        deadline;
1911	unsigned int	result;
1912
1913	pmap_intr_assert();
1914
1915	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64)
1916		return 0;
1917
1918	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
1919		   pmap,
1920		   (uint32_t) (s64 >> 32), s64,
1921		   (uint32_t) (e64 >> 32), e64);
1922
1923	result = 0;
1924
1925	PMAP_LOCK(pmap);
1926
1927	deadline = rdtsc64() + max_preemption_latency_tsc;
1928
1929	while (s64 < e64) {
1930		l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
1931		if (l64 > e64)
1932			l64 = e64;
1933		pde = pmap_pde(pmap, s64);
1934
1935		if (pde && (*pde & INTEL_PTE_VALID)) {
1936			if (*pde & INTEL_PTE_PS) {
1937				/* superpage: not supported */
1938			} else {
1939				spte = pmap_pte(pmap,
1940						(s64 & ~(pde_mapped_size - 1)));
1941				spte = &spte[ptenum(s64)];
1942				epte = &spte[intel_btop(l64 - s64)];
1943
1944				for (; spte < epte; spte++) {
1945					if (pte_to_pa(*spte) != 0) {
1946						result++;
1947					}
1948				}
1949
1950			}
1951		}
1952		s64 = l64;
1953
1954		if (s64 < e64 && rdtsc64() >= deadline) {
1955			PMAP_UNLOCK(pmap);
1956			PMAP_LOCK(pmap);
1957			deadline = rdtsc64() + max_preemption_latency_tsc;
1958		}
1959	}
1960
1961	PMAP_UNLOCK(pmap);
1962
1963	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
1964		   pmap, 0, 0, 0, 0);
1965
1966	return result;
1967}
1968