1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <mach_assert.h>
30
31#include <vm/pmap.h>
32#include <vm/vm_map.h>
33#include <kern/ledger.h>
34#include <i386/pmap_internal.h>
35
36void		pmap_remove_range(
37			pmap_t		pmap,
38			vm_map_offset_t	va,
39			pt_entry_t	*spte,
40			pt_entry_t	*epte);
41
42void		pmap_remove_range_options(
43			pmap_t		pmap,
44			vm_map_offset_t	va,
45			pt_entry_t	*spte,
46			pt_entry_t	*epte,
47			int		options);
48
49void		pmap_reusable_range(
50			pmap_t		pmap,
51			vm_map_offset_t	va,
52			pt_entry_t	*spte,
53			pt_entry_t	*epte,
54			boolean_t	reusable);
55
56uint32_t pmap_update_clear_pte_count;
57
58/*
59 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
60 * on a NBPDE boundary.
61 */
62
63/* These symbols may be referenced directly by VM */
64uint64_t pmap_nesting_size_min = NBPDE;
65uint64_t pmap_nesting_size_max = 0 - (uint64_t)NBPDE;
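/*
 * Note: 0 - NBPDE, evaluated as a uint64_t, is the largest NBPDE-aligned
 * value (2^64 - 2MB), i.e. effectively "no upper bound" beyond alignment.
 */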
66
/*
 *	kern_return_t pmap_nest(grand, subord, va_start, nstart, size)
 *
 *	grand    = the pmap that we will nest subord into
 *	subord   = the pmap that is nested into grand
 *	va_start = start of the range in grand to be nested
 *	nstart   = start of the corresponding range in subord (must equal va_start)
 *	size     = size of the nested area (up to 16TB)
 *
 *	Inserts a pmap into another.  This is used to implement shared segments.
 *
 *	Note that we depend upon higher level VM locks to ensure that things don't change while
 *	we are doing this.  For example, VM should not be doing any pmap enters while it is nesting
 *	or do 2 nests at once.
 */
82
83/*
84 * This routine can nest subtrees either at the PDPT level (1GiB) or at the
85 * PDE level (2MiB). We currently disallow disparate offsets for the "subord"
86 * container and the "grand" parent. A minor optimization to consider for the
87 * future: make the "subord" truly a container rather than a full-fledged
88 * pagetable hierarchy which can be unnecessarily sparse (DRK).
89 */
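
/*
 * Illustrative sketch (hypothetical helper, not part of the build): the
 * alignment and size preconditions that pmap_nest() enforces below, written
 * as a stand-alone predicate.  The function name is invented for clarity.
 */
#if 0
static boolean_t
pmap_nest_args_valid(addr64_t va_start, addr64_t nstart, uint64_t size)
{
	/* start addresses and size must be NBPDE (2MB) aligned */
	if ((size & (pmap_nesting_size_min - 1)) ||
	    (va_start & (pmap_nesting_size_min - 1)) ||
	    (nstart & (pmap_nesting_size_min - 1)))
		return FALSE;
	/* the nested area may not exceed 16TB (65536 * 256MB chunks) */
	if ((size >> 28) > 65536)
		return FALSE;
	/* grand and subord must currently use identical offsets */
	return (boolean_t)(va_start == nstart);
}
#endif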
90
91kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t va_start, addr64_t nstart, uint64_t size) {
92	vm_map_offset_t	vaddr, nvaddr;
93	pd_entry_t	*pde,*npde;
94	unsigned int	i;
95	uint64_t	num_pde;
96
97	if ((size & (pmap_nesting_size_min-1)) ||
98	    (va_start & (pmap_nesting_size_min-1)) ||
99	    (nstart & (pmap_nesting_size_min-1)) ||
100	    ((size >> 28) > 65536))	/* Max size we can nest is 16TB */
101		return KERN_INVALID_VALUE;
102
103	if(size == 0) {
104		panic("pmap_nest: size is invalid - %016llX\n", size);
105	}
106
107	if (va_start != nstart)
108		panic("pmap_nest: va_start(0x%llx) != nstart(0x%llx)\n", va_start, nstart);
109
110	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
111	(uintptr_t) grand, (uintptr_t) subord,
112	    (uintptr_t) (va_start>>32), (uintptr_t) va_start, 0);
113
114	nvaddr = (vm_map_offset_t)nstart;
115	num_pde = size >> PDESHIFT;
116
117	PMAP_LOCK(subord);
118
119	subord->pm_shared = TRUE;
120
121	for (i = 0; i < num_pde;) {
122		if (((nvaddr & PDPTMASK) == 0) && (num_pde - i) >= NPDEPG && cpu_64bit) {
123
124			npde = pmap64_pdpt(subord, nvaddr);
125
126			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
127				PMAP_UNLOCK(subord);
128				pmap_expand_pdpt(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
129				PMAP_LOCK(subord);
130				npde = pmap64_pdpt(subord, nvaddr);
131			}
132			*npde |= INTEL_PDPTE_NESTED;
133			nvaddr += NBPDPT;
134			i += (uint32_t)NPDEPG;
135		}
136		else {
137			npde = pmap_pde(subord, nvaddr);
138
139			while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
140				PMAP_UNLOCK(subord);
141				pmap_expand(subord, nvaddr, PMAP_EXPAND_OPTIONS_NONE);
142				PMAP_LOCK(subord);
143				npde = pmap_pde(subord, nvaddr);
144			}
145			nvaddr += NBPDE;
146			i++;
147		}
148	}
149
150	PMAP_UNLOCK(subord);
151
152	vaddr = (vm_map_offset_t)va_start;
153
154	PMAP_LOCK(grand);
155
	for (i = 0; i < num_pde;) {
157		pd_entry_t tpde;
158
159		if (((vaddr & PDPTMASK) == 0) && ((num_pde - i) >= NPDEPG) && cpu_64bit) {
160			npde = pmap64_pdpt(subord, vaddr);
161			if (npde == 0)
162				panic("pmap_nest: no PDPT, subord %p nstart 0x%llx", subord, vaddr);
163			tpde = *npde;
164			pde = pmap64_pdpt(grand, vaddr);
165			if (0 == pde) {
166				PMAP_UNLOCK(grand);
167				pmap_expand_pml4(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
168				PMAP_LOCK(grand);
169				pde = pmap64_pdpt(grand, vaddr);
170			}
171			if (pde == 0)
172				panic("pmap_nest: no PDPT, grand  %p vaddr 0x%llx", grand, vaddr);
173			pmap_store_pte(pde, tpde);
174			vaddr += NBPDPT;
175			i += (uint32_t) NPDEPG;
176		}
177		else {
178			npde = pmap_pde(subord, nstart);
179			if (npde == 0)
180				panic("pmap_nest: no npde, subord %p nstart 0x%llx", subord, nstart);
181			tpde = *npde;
182			nstart += NBPDE;
183			pde = pmap_pde(grand, vaddr);
184			if ((0 == pde) && cpu_64bit) {
185				PMAP_UNLOCK(grand);
186				pmap_expand_pdpt(grand, vaddr, PMAP_EXPAND_OPTIONS_NONE);
187				PMAP_LOCK(grand);
188				pde = pmap_pde(grand, vaddr);
189			}
190
191			if (pde == 0)
192				panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
193			vaddr += NBPDE;
194			pmap_store_pte(pde, tpde);
195			i++;
196		}
197	}
198
199	PMAP_UNLOCK(grand);
200
201	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
202
203	return KERN_SUCCESS;
204}
205
/*
 *	kern_return_t pmap_unnest(grand, vaddr, size)
 *
 *	grand  = the pmap that we will un-nest the subordinate pmap from
 *	vaddr  = start of the range in grand to be unnested
 *	size   = size of the range to be unnested
 *
 *	Removes a nested pmap from another.  This is used to implement shared segments.
 */
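
/*
 * Worked example (hypothetical values): the routine below widens the request
 * to 2MB (NBPDE) boundaries.  With vaddr == 0x200100000 and size == 0x100000,
 * va_start becomes 0x200000000, va_end becomes 0x200200000, and the loop
 * clears the PDEs covering [0x200000000, 0x200200000).
 */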
214
215kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) {
216
217	pd_entry_t *pde;
218	unsigned int i;
219	uint64_t num_pde;
220	addr64_t va_start, va_end;
221	uint64_t npdpt = PMAP_INVALID_PDPTNUM;
222
223	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
224	    (uintptr_t) grand,
225	    (uintptr_t) (vaddr>>32), (uintptr_t) vaddr, 0, 0);
226
227	if ((size & (pmap_nesting_size_min-1)) ||
228	    (vaddr & (pmap_nesting_size_min-1))) {
229		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...\n",
230		    grand, vaddr, size);
231	}
232
233	/* align everything to PDE boundaries */
234	va_start = vaddr & ~(NBPDE-1);
235	va_end = (vaddr + size + NBPDE - 1) & ~(NBPDE-1);
236	size = va_end - va_start;
237
238	PMAP_LOCK(grand);
239
240	num_pde = size >> PDESHIFT;
241	vaddr = va_start;
242
243	for (i = 0; i < num_pde; ) {
244		if ((pdptnum(grand, vaddr) != npdpt) && cpu_64bit) {
245			npdpt = pdptnum(grand, vaddr);
246			pde = pmap64_pdpt(grand, vaddr);
247			if (pde && (*pde & INTEL_PDPTE_NESTED)) {
248				pmap_store_pte(pde, (pd_entry_t)0);
249				i += (uint32_t) NPDEPG;
250				vaddr += NBPDPT;
251				continue;
252			}
253		}
254		pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
255		if (pde == 0)
256			panic("pmap_unnest: no pde, grand %p vaddr 0x%llx\n", grand, vaddr);
257		pmap_store_pte(pde, (pd_entry_t)0);
258		i++;
259		vaddr += NBPDE;
260	}
261
262	PMAP_UPDATE_TLBS(grand, va_start, va_end);
263
264	PMAP_UNLOCK(grand);
265
266	PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
267
268	return KERN_SUCCESS;
269}
270
/* Invoked by the Mach VM to determine the platform-specific unnest region */
272
273boolean_t pmap_adjust_unnest_parameters(pmap_t p, vm_map_offset_t *s, vm_map_offset_t *e) {
274	pd_entry_t *pdpte;
275	boolean_t rval = FALSE;
276
277	if (!cpu_64bit)
278		return rval;
279
280	PMAP_LOCK(p);
281
282	pdpte = pmap64_pdpt(p, *s);
283	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
284		*s &= ~(NBPDPT -1);
285		rval = TRUE;
286	}
287
288	pdpte = pmap64_pdpt(p, *e);
289	if (pdpte && (*pdpte & INTEL_PDPTE_NESTED)) {
290		*e = ((*e + NBPDPT) & ~(NBPDPT -1));
291		rval = TRUE;
292	}
293
294	PMAP_UNLOCK(p);
295
296	return rval;
297}
298
/*
 * pmap_find_phys returns the (4K) physical page number containing a
 * given virtual address in a given pmap, or 0 if there is no valid mapping.
 * Note that the address may be mapped by a large (2MB) page, in which case
 * the PDE itself describes the mapping; this is taken into account so that
 * the correct 4K page number is returned in that case as well.
 */
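
/*
 * Illustrative sketch (hypothetical caller, not compiled): translating a
 * kernel virtual address to its physical page number; a result of 0 means
 * the address is not currently mapped.  "some_kernel_va" is invented here.
 */
#if 0
	ppnum_t pn = pmap_find_phys(kernel_pmap, (addr64_t) some_kernel_va);
#endif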
306ppnum_t
307pmap_find_phys(pmap_t pmap, addr64_t va)
308{
309	pt_entry_t	*ptp;
310	pd_entry_t	*pdep;
311	ppnum_t		ppn = 0;
312	pd_entry_t	pde;
313	pt_entry_t	pte;
314
315	mp_disable_preemption();
316
317	/* This refcount test is a band-aid--several infrastructural changes
318	 * are necessary to eliminate invocation of this routine from arbitrary
319	 * contexts.
320	 */
321
322	if (!pmap->ref_count)
323		goto pfp_exit;
324
325	pdep = pmap_pde(pmap, va);
326
327	if ((pdep != PD_ENTRY_NULL) && ((pde = *pdep) & INTEL_PTE_VALID)) {
328		if (pde & INTEL_PTE_PS) {
329			ppn = (ppnum_t) i386_btop(pte_to_pa(pde));
330			ppn += (ppnum_t) ptenum(va);
331		}
332		else {
333			ptp = pmap_pte(pmap, va);
334			if ((PT_ENTRY_NULL != ptp) && (((pte = *ptp) & INTEL_PTE_VALID) != 0)) {
335				ppn = (ppnum_t) i386_btop(pte_to_pa(pte));
336			}
337		}
338	}
339pfp_exit:
340	mp_enable_preemption();
341
342        return ppn;
343}
344
345/*
346 * Update cache attributes for all extant managed mappings.
347 * Assumes PV for this page is locked, and that the page
348 * is managed.
349 */
350
351void
352pmap_update_cache_attributes_locked(ppnum_t pn, unsigned attributes) {
353	pv_rooted_entry_t	pv_h, pv_e;
354	pv_hashed_entry_t       pvh_e, nexth;
355	vm_map_offset_t vaddr;
356	pmap_t	pmap;
357	pt_entry_t	*ptep;
358
359	assert(IS_MANAGED_PAGE(pn));
360
361	pv_h = pai_to_pvh(pn);
	/* TODO: translate the PHYS_* bits to PTE bits; while they're
	 * currently identical, they may not remain so.
	 * Potential optimizations (here and in page_protect):
	 * parallel shootdowns, and checking for redundant
	 * attribute modifications.
	 */
368
369	/*
370	 * Alter attributes on all mappings
371	 */
372	if (pv_h->pmap != PMAP_NULL) {
373		pv_e = pv_h;
374		pvh_e = (pv_hashed_entry_t)pv_e;
375
376		do {
377			pmap = pv_e->pmap;
378			vaddr = pv_e->va;
379			ptep = pmap_pte(pmap, vaddr);
380
381			if (0 == ptep)
382				panic("pmap_update_cache_attributes_locked: Missing PTE, pmap: %p, pn: 0x%x vaddr: 0x%llx kernel_pmap: %p", pmap, pn, vaddr, kernel_pmap);
383
384			nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);
385			pmap_update_pte(ptep, PHYS_CACHEABILITY_MASK, attributes);
386			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
387			pvh_e = nexth;
388		} while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
389	}
390}
391
392void x86_filter_TLB_coherency_interrupts(boolean_t dofilter) {
393	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
394
395	if (dofilter) {
396		CPU_CR3_MARK_INACTIVE();
397	} else {
398		CPU_CR3_MARK_ACTIVE();
399		mfence();
400		if (current_cpu_datap()->cpu_tlb_invalid)
401			process_pmap_updates();
402	}
403}
404
405
/*
 *	Insert the given physical page (pn) at
 *	the specified virtual address (vaddr) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte cannot be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */
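
/*
 * Illustrative sketch (hypothetical caller, not part of the build): a typical
 * wired, cacheable, read/write entry.  "va" and "pn" are assumed to have been
 * supplied by the VM layer.
 */
#if 0
	pmap_enter(kernel_pmap, va, pn,
	    VM_PROT_READ | VM_PROT_WRITE,	/* prot */
	    VM_PROT_NONE,			/* fault_type */
	    0,					/* flags: default cacheability */
	    TRUE);				/* wired */
#endif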
418
419void
420pmap_enter(
421	register pmap_t		pmap,
422 	vm_map_offset_t		vaddr,
423	ppnum_t                 pn,
424	vm_prot_t		prot,
425	vm_prot_t		fault_type,
426	unsigned int 		flags,
427	boolean_t		wired)
428{
429	(void) pmap_enter_options(pmap, vaddr, pn, prot, fault_type, flags, wired, PMAP_EXPAND_OPTIONS_NONE, NULL);
430}
431
432
433kern_return_t
434pmap_enter_options(
435	register pmap_t		pmap,
436 	vm_map_offset_t		vaddr,
437	ppnum_t                 pn,
438	vm_prot_t		prot,
439	__unused vm_prot_t	fault_type,
440	unsigned int 		flags,
441	boolean_t		wired,
442	unsigned int		options,
443	void			*arg)
444{
445	pt_entry_t		*pte;
446	pv_rooted_entry_t	pv_h;
447	ppnum_t			pai;
448	pv_hashed_entry_t	pvh_e;
449	pv_hashed_entry_t	pvh_new;
450	pt_entry_t		template;
451	pmap_paddr_t		old_pa;
452	pmap_paddr_t		pa = (pmap_paddr_t) i386_ptob(pn);
453	boolean_t		need_tlbflush = FALSE;
454	boolean_t		set_NX;
455	char			oattr;
456	boolean_t		old_pa_locked;
457	/* 2MiB mappings are confined to x86_64 by VM */
458	boolean_t		superpage = flags & VM_MEM_SUPERPAGE;
459	vm_object_t		delpage_pm_obj = NULL;
460	uint64_t		delpage_pde_index = 0;
461	pt_entry_t		old_pte;
462	kern_return_t		kr_expand;
463
464	pmap_intr_assert();
465
466	if (pmap == PMAP_NULL)
467		return KERN_INVALID_ARGUMENT;
468
469	/* N.B. We can be supplied a zero page frame in the NOENTER case, it's an
470	 * unused value for that scenario.
471	 */
472	assert(pn != vm_page_fictitious_addr);
473
474	if (pn == vm_page_guard_addr)
475		return KERN_INVALID_ARGUMENT;
476
477	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
478	    pmap,
479	    (uint32_t) (vaddr >> 32), (uint32_t) vaddr,
480	    pn, prot);
481
482	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
483		set_NX = FALSE;
484	else
485		set_NX = TRUE;
486
487	if (__improbable(set_NX && (pmap == kernel_pmap) && ((pmap_disable_kstack_nx && (flags & VM_MEM_STACK)) || (pmap_disable_kheap_nx && !(flags & VM_MEM_STACK))))) {
488		set_NX = FALSE;
489	}
490
	/*
	 *	Must allocate a new pvlist entry while we're unlocked;
	 *	zalloc may cause pageout (which will lock the pmap system).
	 *	If we determine we need a pvlist entry, we will unlock
	 *	and allocate one.  Then we will retry, throwing away
	 *	the allocated entry later (if we no longer need it).
	 */
498
499	pvh_new = PV_HASHED_ENTRY_NULL;
500Retry:
501	pvh_e = PV_HASHED_ENTRY_NULL;
502
503	PMAP_LOCK(pmap);
504
505	/*
506	 *	Expand pmap to include this pte.  Assume that
507	 *	pmap is always expanded to include enough hardware
508	 *	pages to map one VM page.
509	 */
	if (superpage) {
		while ((pte = pmap64_pde(pmap, vaddr)) == PD_ENTRY_NULL) {
512			/* need room for another pde entry */
513			PMAP_UNLOCK(pmap);
514			kr_expand = pmap_expand_pdpt(pmap, vaddr, options);
515			if (kr_expand != KERN_SUCCESS)
516				return kr_expand;
517			PMAP_LOCK(pmap);
518		}
519	} else {
520		while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
521			/*
522			 * Must unlock to expand the pmap
523			 * going to grow pde level page(s)
524			 */
525			PMAP_UNLOCK(pmap);
526			kr_expand = pmap_expand(pmap, vaddr, options);
527			if (kr_expand != KERN_SUCCESS)
528				return kr_expand;
529			PMAP_LOCK(pmap);
530		}
531	}
532	if (options & PMAP_EXPAND_OPTIONS_NOENTER) {
533		PMAP_UNLOCK(pmap);
534		return KERN_SUCCESS;
535	}
536
537	if (superpage && *pte && !(*pte & INTEL_PTE_PS)) {
538		/*
539		 * There is still an empty page table mapped that
540		 * was used for a previous base page mapping.
541		 * Remember the PDE and the PDE index, so that we
542		 * can free the page at the end of this function.
543		 */
544		delpage_pde_index = pdeidx(pmap, vaddr);
545		delpage_pm_obj = pmap->pm_obj;
546		*pte = 0;
547	}
548
549	old_pa = pte_to_pa(*pte);
550	pai = pa_index(old_pa);
551	old_pa_locked = FALSE;
552
553	if (old_pa == 0 &&
554	    (*pte & INTEL_PTE_COMPRESSED)) {
555		/* one less "compressed" */
556		OSAddAtomic64(-1, &pmap->stats.compressed);
557		/* marker will be cleared below */
558	}
559
	/*
	 * If we have a previous managed page, lock the pv entry now.  After
	 * we lock it, check to see if someone beat us to the lock and, if so,
	 * drop the lock.
	 */
565	if ((0 != old_pa) && IS_MANAGED_PAGE(pai)) {
566		LOCK_PVH(pai);
567		old_pa_locked = TRUE;
568		old_pa = pte_to_pa(*pte);
569		if (0 == old_pa) {
570			UNLOCK_PVH(pai);	/* another path beat us to it */
571			old_pa_locked = FALSE;
572		}
573	}
574
575	/*
576	 *	Special case if the incoming physical page is already mapped
577	 *	at this address.
578	 */
579	if (old_pa == pa) {
580		pt_entry_t old_attributes =
581		    *pte & ~(INTEL_PTE_REF | INTEL_PTE_MOD);
582
583		/*
584	         *	May be changing its wired attribute or protection
585	         */
586
587		template = pa_to_pte(pa) | INTEL_PTE_VALID;
588		template |= pmap_get_cache_attributes(pa_index(pa));
589
590		if (VM_MEM_NOT_CACHEABLE ==
591		    (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
592			if (!(flags & VM_MEM_GUARDED))
593				template |= INTEL_PTE_PTA;
594			template |= INTEL_PTE_NCACHE;
595		}
596		if (pmap != kernel_pmap)
597			template |= INTEL_PTE_USER;
598		if (prot & VM_PROT_WRITE) {
599			template |= INTEL_PTE_WRITE;
600		}
601
602		if (set_NX)
603			template |= INTEL_PTE_NX;
604
605		if (wired) {
606			template |= INTEL_PTE_WIRED;
607			if (!iswired(old_attributes))  {
608				OSAddAtomic(+1, &pmap->stats.wired_count);
609				pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
610			}
611		} else {
612			if (iswired(old_attributes)) {
613				assert(pmap->stats.wired_count >= 1);
614				OSAddAtomic(-1, &pmap->stats.wired_count);
615				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
616			}
617		}
618		if (superpage)		/* this path can not be used */
619			template |= INTEL_PTE_PS;	/* to change the page size! */
620
621		if (old_attributes == template)
622			goto dont_update_pte;
623
624		/* Determine delta, PV locked */
625		need_tlbflush =
626		    ((old_attributes ^ template) != INTEL_PTE_WIRED);
627
628		if (need_tlbflush == TRUE && !(old_attributes & INTEL_PTE_WRITE)) {
629			if ((old_attributes ^ template) == INTEL_PTE_WRITE)
630				need_tlbflush = FALSE;
631		}
632
633		/* store modified PTE and preserve RC bits */
		pt_entry_t npte, opte;
635		do {
636			opte = *pte;
637			npte = template | (opte & (INTEL_PTE_REF | INTEL_PTE_MOD));
638		} while (!pmap_cmpx_pte(pte, opte, npte));
639dont_update_pte:
640		if (old_pa_locked) {
641			UNLOCK_PVH(pai);
642			old_pa_locked = FALSE;
643		}
644		goto Done;
645	}
646
647	/*
648	 *	Outline of code from here:
649	 *	   1) If va was mapped, update TLBs, remove the mapping
650	 *	      and remove old pvlist entry.
651	 *	   2) Add pvlist entry for new mapping
652	 *	   3) Enter new mapping.
653	 *
654	 *	If the old physical page is not managed step 1) is skipped
655	 *	(except for updating the TLBs), and the mapping is
656	 *	overwritten at step 3).  If the new physical page is not
657	 *	managed, step 2) is skipped.
658	 */
659
660	if (old_pa != (pmap_paddr_t) 0) {
661
662		/*
663	         *	Don't do anything to pages outside valid memory here.
664	         *	Instead convince the code that enters a new mapping
665	         *	to overwrite the old one.
666	         */
667
668		/* invalidate the PTE */
669		pmap_update_pte(pte, INTEL_PTE_VALID, 0);
670		/* propagate invalidate everywhere */
671		PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
672		/* remember reference and change */
673		old_pte	= *pte;
674		oattr = (char) (old_pte & (PHYS_MODIFIED | PHYS_REFERENCED));
675		/* completely invalidate the PTE */
676		pmap_store_pte(pte, 0);
677
678		if (IS_MANAGED_PAGE(pai)) {
679			pmap_assert(old_pa_locked == TRUE);
680			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
681			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
682			assert(pmap->stats.resident_count >= 1);
683			OSAddAtomic(-1, &pmap->stats.resident_count);
684			if (pmap != kernel_pmap) {
685				if (IS_REUSABLE_PAGE(pai)) {
686					assert(pmap->stats.reusable > 0);
687					OSAddAtomic(-1, &pmap->stats.reusable);
688				} else if (IS_INTERNAL_PAGE(pai)) {
689					assert(pmap->stats.internal > 0);
690					OSAddAtomic(-1, &pmap->stats.internal);
691				} else {
692					assert(pmap->stats.external > 0);
693					OSAddAtomic(-1, &pmap->stats.external);
694				}
695			}
696			if (iswired(*pte)) {
697				assert(pmap->stats.wired_count >= 1);
698				OSAddAtomic(-1, &pmap->stats.wired_count);
699				pmap_ledger_debit(pmap, task_ledgers.wired_mem,
700				    PAGE_SIZE);
701			}
702			pmap_phys_attributes[pai] |= oattr;
703
704			/*
705			 *	Remove the mapping from the pvlist for
706			 *	this physical page.
707			 *      We'll end up with either a rooted pv or a
708			 *      hashed pv
709			 */
710			pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, &old_pte);
711
712		} else {
713
714			/*
715			 *	old_pa is not managed.
716			 *	Do removal part of accounting.
717			 */
718
719			if (pmap != kernel_pmap) {
720#if 00
721				assert(pmap->stats.device > 0);
722				OSAddAtomic(-1, &pmap->stats.device);
723#endif
724			}
725			if (iswired(*pte)) {
726				assert(pmap->stats.wired_count >= 1);
727				OSAddAtomic(-1, &pmap->stats.wired_count);
728				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
729			}
730		}
731	}
732
	/*
	 * If we had a previously managed page locked, unlock it now.
	 */
736	if (old_pa_locked) {
737		UNLOCK_PVH(pai);
738		old_pa_locked = FALSE;
739	}
740
741	pai = pa_index(pa);	/* now working with new incoming phys page */
742	if (IS_MANAGED_PAGE(pai)) {
743
744		/*
745	         *	Step 2) Enter the mapping in the PV list for this
746	         *	physical page.
747	         */
748		pv_h = pai_to_pvh(pai);
749
750		LOCK_PVH(pai);
751
752		if (pv_h->pmap == PMAP_NULL) {
753			/*
754			 *	No mappings yet, use rooted pv
755			 */
756			pv_h->va = vaddr;
757			pv_h->pmap = pmap;
758			queue_init(&pv_h->qlink);
759
760			if (options & PMAP_OPTIONS_INTERNAL) {
761				pmap_phys_attributes[pai] |= PHYS_INTERNAL;
762			} else {
763				pmap_phys_attributes[pai] &= ~PHYS_INTERNAL;
764			}
765			if (options & PMAP_OPTIONS_REUSABLE) {
766				pmap_phys_attributes[pai] |= PHYS_REUSABLE;
767			} else {
768				pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
769			}
770		} else {
771			/*
772			 *	Add new pv_hashed_entry after header.
773			 */
774			if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
775				pvh_e = pvh_new;
776				pvh_new = PV_HASHED_ENTRY_NULL;
777			} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
778				PV_HASHED_ALLOC(&pvh_e);
779				if (PV_HASHED_ENTRY_NULL == pvh_e) {
					/*
					 * The pv free list is empty.  If we are on
					 * the kernel pmap, we'll use one of
					 * the special private kernel pv_e's;
					 * otherwise, we need to unlock
					 * everything, zalloc a pv_e, and
					 * restart, bringing the pv_e in
					 * with us.
					 */
789					if (kernel_pmap == pmap) {
790						PV_HASHED_KERN_ALLOC(&pvh_e);
791					} else {
792						UNLOCK_PVH(pai);
793						PMAP_UNLOCK(pmap);
794						pmap_pv_throttle(pmap);
795						pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
796						goto Retry;
797					}
798				}
799			}
800
801			if (PV_HASHED_ENTRY_NULL == pvh_e)
802				panic("Mapping alias chain exhaustion, possibly induced by numerous kernel virtual double mappings");
803
804			pvh_e->va = vaddr;
805			pvh_e->pmap = pmap;
806			pvh_e->ppn = pn;
807			pv_hash_add(pvh_e, pv_h);
808
809			/*
810			 *	Remember that we used the pvlist entry.
811			 */
812			pvh_e = PV_HASHED_ENTRY_NULL;
813		}
814
815		/*
816	         * only count the mapping
817	         * for 'managed memory'
818	         */
819		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
820		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
821		OSAddAtomic(+1,  &pmap->stats.resident_count);
822		if (pmap->stats.resident_count > pmap->stats.resident_max) {
823			pmap->stats.resident_max = pmap->stats.resident_count;
824		}
825		if (pmap != kernel_pmap) {
826			if (IS_REUSABLE_PAGE(pai)) {
827				OSAddAtomic(+1, &pmap->stats.reusable);
828				PMAP_STATS_PEAK(pmap->stats.reusable);
829			} else if (IS_INTERNAL_PAGE(pai)) {
830				OSAddAtomic(+1, &pmap->stats.internal);
831				PMAP_STATS_PEAK(pmap->stats.internal);
832			} else {
833				OSAddAtomic(+1, &pmap->stats.external);
834				PMAP_STATS_PEAK(pmap->stats.external);
835			}
836		}
837	} else if (last_managed_page == 0) {
838		/* Account for early mappings created before "managed pages"
839		 * are determined. Consider consulting the available DRAM map.
840		 */
841		pmap_ledger_credit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
842		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
843		OSAddAtomic(+1,  &pmap->stats.resident_count);
844		if (pmap != kernel_pmap) {
845#if 00
846			OSAddAtomic(+1, &pmap->stats.device);
847			PMAP_STATS_PEAK(pmap->stats.device);
848#endif
849		}
850	}
851	/*
852	 * Step 3) Enter the mapping.
853	 *
854	 *	Build a template to speed up entering -
855	 *	only the pfn changes.
856	 */
857	template = pa_to_pte(pa) | INTEL_PTE_VALID;
858	/*
859	 * DRK: It may be worth asserting on cache attribute flags that diverge
860	 * from the existing physical page attributes.
861	 */
862
863	template |= pmap_get_cache_attributes(pa_index(pa));
864
865	if (flags & VM_MEM_NOT_CACHEABLE) {
866		if (!(flags & VM_MEM_GUARDED))
867			template |= INTEL_PTE_PTA;
868		template |= INTEL_PTE_NCACHE;
869	}
870	if (pmap != kernel_pmap)
871		template |= INTEL_PTE_USER;
872	if (prot & VM_PROT_WRITE)
873		template |= INTEL_PTE_WRITE;
874	if (set_NX)
875		template |= INTEL_PTE_NX;
876	if (wired) {
877		template |= INTEL_PTE_WIRED;
878		OSAddAtomic(+1,  & pmap->stats.wired_count);
879		pmap_ledger_credit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
880	}
881	if (superpage)
882		template |= INTEL_PTE_PS;
883	pmap_store_pte(pte, template);
884
885	/*
886	 * if this was a managed page we delayed unlocking the pv until here
887	 * to prevent pmap_page_protect et al from finding it until the pte
888	 * has been stored
889	 */
890	if (IS_MANAGED_PAGE(pai)) {
891		UNLOCK_PVH(pai);
892	}
893Done:
894	if (need_tlbflush == TRUE) {
895		if (options & PMAP_OPTIONS_NOFLUSH)
896			PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
897		else
898			PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
899	}
900	if (pvh_e != PV_HASHED_ENTRY_NULL) {
901		PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
902	}
903	if (pvh_new != PV_HASHED_ENTRY_NULL) {
904		PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
905	}
906	PMAP_UNLOCK(pmap);
907
908	if (delpage_pm_obj) {
909		vm_page_t m;
910
911		vm_object_lock(delpage_pm_obj);
912		m = vm_page_lookup(delpage_pm_obj, (delpage_pde_index * PAGE_SIZE));
913		if (m == VM_PAGE_NULL)
914		    panic("pmap_enter: pte page not in object");
915		vm_object_unlock(delpage_pm_obj);
916		VM_PAGE_FREE(m);
917		OSAddAtomic(-1,  &inuse_ptepages_count);
918		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
919	}
920
921	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
922	return KERN_SUCCESS;
923}
924
925/*
926 *	Remove a range of hardware page-table entries.
927 *	The entries given are the first (inclusive)
928 *	and last (exclusive) entries for the VM pages.
929 *	The virtual address is the va for the first pte.
930 *
931 *	The pmap must be locked.
932 *	If the pmap is not the kernel pmap, the range must lie
933 *	entirely within one pte-page.  This is NOT checked.
934 *	Assumes that the pte-page exists.
935 */
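
/*
 * Illustrative sketch (mirrors pmap_remove_options() below, hypothetical
 * locals): how a caller derives the [spte, epte) window for one pde-mapped
 * chunk [s64, l64) before handing it to pmap_remove_range().
 */
#if 0
	spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
	spte = &spte[ptenum(s64)];		/* first PTE covering s64 */
	epte = &spte[intel_btop(l64 - s64)];	/* one past the last PTE */
	pmap_remove_range(map, s64, spte, epte);
#endif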
936
937void
938pmap_remove_range(
939	pmap_t			pmap,
940	vm_map_offset_t		start_vaddr,
941	pt_entry_t		*spte,
942	pt_entry_t		*epte)
943{
944	pmap_remove_range_options(pmap, start_vaddr, spte, epte, 0);
945}
946
947void
948pmap_remove_range_options(
949	pmap_t			pmap,
950	vm_map_offset_t		start_vaddr,
951	pt_entry_t		*spte,
952	pt_entry_t		*epte,
953	int			options)
954{
955	pt_entry_t		*cpte;
956	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
957	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
958	pv_hashed_entry_t       pvh_e;
959	int			pvh_cnt = 0;
960	int			num_removed, num_unwired, num_found, num_invalid;
961	int			num_device, num_external, num_internal, num_reusable;
962	uint64_t		num_compressed;
963	ppnum_t			pai;
964	pmap_paddr_t		pa;
965	vm_map_offset_t		vaddr;
966
967	num_removed = 0;
968	num_unwired = 0;
969	num_found   = 0;
970	num_invalid = 0;
971	num_device  = 0;
972	num_external = 0;
973	num_internal = 0;
974	num_reusable = 0;
975	num_compressed = 0;
976	/* invalidate the PTEs first to "freeze" them */
977	for (cpte = spte, vaddr = start_vaddr;
978	     cpte < epte;
979	     cpte++, vaddr += PAGE_SIZE_64) {
980		pt_entry_t p = *cpte;
981
982		pa = pte_to_pa(p);
983		if (pa == 0) {
984			if (pmap != kernel_pmap &&
985			    (options & PMAP_OPTIONS_REMOVE) &&
986			    (p & INTEL_PTE_COMPRESSED)) {
987				/* one less "compressed" */
988				num_compressed++;
989				/* clear marker */
990				/* XXX probably does not need to be atomic! */
991				pmap_update_pte(cpte, INTEL_PTE_COMPRESSED, 0);
992			}
993			continue;
994		}
995		num_found++;
996
997		if (iswired(p))
998			num_unwired++;
999
1000		pai = pa_index(pa);
1001
1002		if (!IS_MANAGED_PAGE(pai)) {
1003			/*
1004			 *	Outside range of managed physical memory.
1005			 *	Just remove the mappings.
1006			 */
1007			pmap_store_pte(cpte, 0);
1008			num_device++;
1009			continue;
1010		}
1011
1012		if ((p & INTEL_PTE_VALID) == 0)
1013			num_invalid++;
1014
1015		/* invalidate the PTE */
1016		pmap_update_pte(cpte, INTEL_PTE_VALID, 0);
1017	}
1018
1019	if (num_found == 0) {
1020		/* nothing was changed: we're done */
1021	        goto update_counts;
1022	}
1023
1024	/* propagate the invalidates to other CPUs */
1025
1026	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1027
1028	for (cpte = spte, vaddr = start_vaddr;
1029	     cpte < epte;
1030	     cpte++, vaddr += PAGE_SIZE_64) {
1031
1032		pa = pte_to_pa(*cpte);
1033		if (pa == 0)
1034			continue;
1035
1036		pai = pa_index(pa);
1037
1038		LOCK_PVH(pai);
1039
1040		pa = pte_to_pa(*cpte);
1041		if (pa == 0) {
1042			UNLOCK_PVH(pai);
1043			continue;
1044		}
1045		num_removed++;
1046		if (IS_REUSABLE_PAGE(pai)) {
1047			num_reusable++;
1048		} else if (IS_INTERNAL_PAGE(pai)) {
1049			num_internal++;
1050		} else {
1051			num_external++;
1052		}
1053
1054		/*
1055	       	 * Get the modify and reference bits, then
1056	       	 * nuke the entry in the page table
1057	       	 */
1058		/* remember reference and change */
1059		pmap_phys_attributes[pai] |=
1060			(char) (*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
1061
1062		/*
1063	      	 * Remove the mapping from the pvlist for this physical page.
1064	         */
1065		pvh_e = pmap_pv_remove(pmap, vaddr, (ppnum_t *) &pai, cpte);
1066
1067		/* completely invalidate the PTE */
1068		pmap_store_pte(cpte, 0);
1069
1070		UNLOCK_PVH(pai);
1071
1072		if (pvh_e != PV_HASHED_ENTRY_NULL) {
1073			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1074			pvh_eh = pvh_e;
1075
1076			if (pvh_et == PV_HASHED_ENTRY_NULL) {
1077				pvh_et = pvh_e;
1078			}
1079			pvh_cnt++;
1080		}
1081	} /* for loop */
1082
1083	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1084		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1085	}
1086update_counts:
1087	/*
1088	 *	Update the counts
1089	 */
1090#if TESTING
1091	if (pmap->stats.resident_count < num_removed)
1092	        panic("pmap_remove_range: resident_count");
1093#endif
1094	pmap_ledger_debit(pmap, task_ledgers.phys_mem, machine_ptob(num_removed));
1095	pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(num_removed));
1096	assert(pmap->stats.resident_count >= num_removed);
1097	OSAddAtomic(-num_removed,  &pmap->stats.resident_count);
1098
1099	if (pmap != kernel_pmap) {
1100#if 00
1101		assert(pmap->stats.device >= num_device);
1102		if (num_device)
1103			OSAddAtomic(-num_device, &pmap->stats.device);
1104#endif /* 00 */
1105		assert(pmap->stats.external >= num_external);
1106		if (num_external)
1107			OSAddAtomic(-num_external, &pmap->stats.external);
1108		assert(pmap->stats.internal >= num_internal);
1109		if (num_internal)
1110			OSAddAtomic(-num_internal, &pmap->stats.internal);
1111		assert(pmap->stats.reusable >= num_reusable);
1112		if (num_reusable)
1113			OSAddAtomic(-num_reusable, &pmap->stats.reusable);
1114		assert(pmap->stats.compressed >= num_compressed);
1115		if (num_compressed)
1116			OSAddAtomic64(-num_compressed, &pmap->stats.compressed);
1117	}
1118
1119#if TESTING
1120	if (pmap->stats.wired_count < num_unwired)
1121	        panic("pmap_remove_range: wired_count");
1122#endif
1123	assert(pmap->stats.wired_count >= num_unwired);
1124	OSAddAtomic(-num_unwired,  &pmap->stats.wired_count);
1125	pmap_ledger_debit(pmap, task_ledgers.wired_mem, machine_ptob(num_unwired));
1126
1127	return;
1128}
1129
1130
1131/*
1132 *	Remove the given range of addresses
1133 *	from the specified map.
1134 *
1135 *	It is assumed that the start and end are properly
1136 *	rounded to the hardware page size.
1137 */
1138void
1139pmap_remove(
1140	pmap_t		map,
1141	addr64_t	s64,
1142	addr64_t	e64)
1143{
1144	pmap_remove_options(map, s64, e64, 0);
1145}
1146
1147void
1148pmap_remove_options(
1149	pmap_t		map,
1150	addr64_t	s64,
1151	addr64_t	e64,
1152	int		options)
1153{
1154	pt_entry_t     *pde;
1155	pt_entry_t     *spte, *epte;
1156	addr64_t        l64;
1157	uint64_t        deadline;
1158
1159	pmap_intr_assert();
1160
1161	if (map == PMAP_NULL || s64 == e64)
1162		return;
1163
1164	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
1165		   map,
1166		   (uint32_t) (s64 >> 32), s64,
1167		   (uint32_t) (e64 >> 32), e64);
1168
1169
1170	PMAP_LOCK(map);
1171
1172#if 0
1173	/*
1174	 * Check that address range in the kernel does not overlap the stacks.
1175	 * We initialize local static min/max variables once to avoid making
1176	 * 2 function calls for every remove. Note also that these functions
1177	 * both return 0 before kernel stacks have been initialized, and hence
1178	 * the panic is not triggered in this case.
1179	 */
1180	if (map == kernel_pmap) {
1181		static vm_offset_t kernel_stack_min = 0;
1182		static vm_offset_t kernel_stack_max = 0;
1183
1184		if (kernel_stack_min == 0) {
1185			kernel_stack_min = min_valid_stack_address();
1186			kernel_stack_max = max_valid_stack_address();
1187		}
1188		if ((kernel_stack_min <= s64 && s64 < kernel_stack_max) ||
1189		    (kernel_stack_min < e64 && e64 <= kernel_stack_max))
1190			panic("pmap_remove() attempted in kernel stack");
1191	}
1192#else
1193
1194	/*
1195	 * The values of kernel_stack_min and kernel_stack_max are no longer
1196	 * relevant now that we allocate kernel stacks in the kernel map,
1197	 * so the old code above no longer applies.  If we wanted to check that
1198	 * we weren't removing a mapping of a page in a kernel stack we'd
1199	 * mark the PTE with an unused bit and check that here.
1200	 */
1201
1202#endif
1203
1204	deadline = rdtsc64() + max_preemption_latency_tsc;
1205
1206	while (s64 < e64) {
1207		l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
1208		if (l64 > e64)
1209			l64 = e64;
1210		pde = pmap_pde(map, s64);
1211
1212		if (pde && (*pde & INTEL_PTE_VALID)) {
1213			if (*pde & INTEL_PTE_PS) {
1214				/*
1215				 * If we're removing a superpage, pmap_remove_range()
1216				 * must work on level 2 instead of level 1; and we're
1217				 * only passing a single level 2 entry instead of a
1218				 * level 1 range.
1219				 */
1220				spte = pde;
1221				epte = spte+1; /* excluded */
1222			} else {
1223				spte = pmap_pte(map, (s64 & ~(pde_mapped_size - 1)));
1224				spte = &spte[ptenum(s64)];
1225				epte = &spte[intel_btop(l64 - s64)];
1226			}
1227			pmap_remove_range_options(map, s64, spte, epte,
1228						  options);
1229		}
1230		s64 = l64;
1231
1232		if (s64 < e64 && rdtsc64() >= deadline) {
			PMAP_UNLOCK(map);
			/* TODO: Rapid release/reacquisition can defeat
			 * the "backoff" intent here; either consider a
			 * fair spinlock, or a scheme whereby each lock
			 * attempt marks the processor as within a spinlock
			 * acquisition, and scan CPUs here to determine
			 * if a backoff is necessary, to avoid sacrificing
			 * performance in the common case.
			 */
			PMAP_LOCK(map);
1243			deadline = rdtsc64() + max_preemption_latency_tsc;
1244		}
1245	}
1246
1247	PMAP_UNLOCK(map);
1248
1249	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
1250		   map, 0, 0, 0, 0);
1251
1252}
1253
1254void
1255pmap_page_protect(
1256        ppnum_t         pn,
1257	vm_prot_t	prot)
1258{
1259	pmap_page_protect_options(pn, prot, 0, NULL);
1260}
1261
1262/*
1263 *	Routine:	pmap_page_protect_options
1264 *
1265 *	Function:
1266 *		Lower the permission for all mappings to a given
1267 *		page.
1268 */
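
/*
 * Illustrative sketch (hypothetical callers): VM_PROT_READ write-protects
 * every existing mapping of the page, while VM_PROT_NONE removes the
 * mappings outright.
 */
#if 0
	pmap_page_protect(pn, VM_PROT_READ);	/* downgrade to read-only */
	pmap_page_protect(pn, VM_PROT_NONE);	/* remove all mappings */
#endif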
1269void
1270pmap_page_protect_options(
1271        ppnum_t         pn,
1272	vm_prot_t	prot,
1273	unsigned int	options,
1274	void		*arg)
1275{
1276	pv_hashed_entry_t	pvh_eh = PV_HASHED_ENTRY_NULL;
1277	pv_hashed_entry_t	pvh_et = PV_HASHED_ENTRY_NULL;
1278	pv_hashed_entry_t	nexth;
1279	int			pvh_cnt = 0;
1280	pv_rooted_entry_t	pv_h;
1281	pv_rooted_entry_t	pv_e;
1282	pv_hashed_entry_t	pvh_e;
1283	pt_entry_t		*pte;
1284	int			pai;
1285	pmap_t			pmap;
1286	boolean_t		remove;
1287	pt_entry_t		new_pte_value;
1288
1289	pmap_intr_assert();
1290	assert(pn != vm_page_fictitious_addr);
1291	if (pn == vm_page_guard_addr)
1292		return;
1293
1294	pai = ppn_to_pai(pn);
1295
1296	if (!IS_MANAGED_PAGE(pai)) {
1297		/*
1298	         *	Not a managed page.
1299	         */
1300		return;
1301	}
1302	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
1303		   pn, prot, 0, 0, 0);
1304
1305	/*
1306	 * Determine the new protection.
1307	 */
1308	switch (prot) {
1309	case VM_PROT_READ:
1310	case VM_PROT_READ | VM_PROT_EXECUTE:
1311		remove = FALSE;
1312		break;
1313	case VM_PROT_ALL:
1314		return;		/* nothing to do */
1315	default:
1316		remove = TRUE;
1317		break;
1318	}
1319
1320	pv_h = pai_to_pvh(pai);
1321
1322	LOCK_PVH(pai);
1323
1324
1325	/*
1326	 * Walk down PV list, if any, changing or removing all mappings.
1327	 */
1328	if (pv_h->pmap == PMAP_NULL)
1329		goto done;
1330
1331	pv_e = pv_h;
1332	pvh_e = (pv_hashed_entry_t) pv_e;	/* cheat */
1333
1334	do {
1335		vm_map_offset_t vaddr;
1336
1337		pmap = pv_e->pmap;
1338		vaddr = pv_e->va;
1339		pte = pmap_pte(pmap, vaddr);
1340
		if (0 == pte) {
			panic("pmap_page_protect() "
				"pmap=%p pn=0x%x vaddr=0x%llx\n",
				pmap, pn, vaddr);
		}

		pmap_assert2((pa_index(pte_to_pa(*pte)) == pn),
		    "pmap_page_protect: PTE mismatch, pn: 0x%x, pmap: %p, vaddr: 0x%llx, pte: 0x%llx", pn, pmap, vaddr, *pte);
1349		nexth = (pv_hashed_entry_t) queue_next(&pvh_e->qlink);
1350
1351		/*
1352		 * Remove the mapping if new protection is NONE
1353		 */
1354		if (remove) {
1355
1356			/* Remove per-pmap wired count */
1357			if (iswired(*pte)) {
1358				OSAddAtomic(-1, &pmap->stats.wired_count);
1359				pmap_ledger_debit(pmap, task_ledgers.wired_mem, PAGE_SIZE);
1360			}
1361
1362			if (pmap != kernel_pmap &&
1363			    (options & PMAP_OPTIONS_COMPRESSOR) &&
1364			    IS_INTERNAL_PAGE(pai)) {
1365				/* adjust "reclaimed" stats */
1366				OSAddAtomic64(+1, &pmap->stats.compressed);
1367				PMAP_STATS_PEAK(pmap->stats.compressed);
1368				pmap->stats.compressed_lifetime++;
1369				/* mark this PTE as having been "reclaimed" */
1370				new_pte_value = INTEL_PTE_COMPRESSED;
1371			} else {
1372				new_pte_value = 0;
1373			}
1374
1375			if (options & PMAP_OPTIONS_NOREFMOD) {
1376				pmap_store_pte(pte, new_pte_value);
1377
1378				if (options & PMAP_OPTIONS_NOFLUSH)
1379					PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1380				else
1381					PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
1382			} else {
1383				/*
1384				 * Remove the mapping, collecting dirty bits.
1385				 */
1386				pmap_update_pte(pte, INTEL_PTE_VALID, 0);
1387
1388				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1389				pmap_phys_attributes[pai] |=
1390					*pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1391				pmap_store_pte(pte, new_pte_value);
1392			}
1393#if TESTING
1394			if (pmap->stats.resident_count < 1)
1395				panic("pmap_page_protect: resident_count");
1396#endif
1397			pmap_ledger_debit(pmap, task_ledgers.phys_mem, PAGE_SIZE);
1398			assert(pmap->stats.resident_count >= 1);
1399			OSAddAtomic(-1,  &pmap->stats.resident_count);
1400			if (options & PMAP_OPTIONS_COMPRESSOR) {
1401				/*
1402				 * This removal is only being done so we can send this page to
1403				 * the compressor; therefore it mustn't affect total task footprint.
1404				 */
1405				pmap_ledger_credit(pmap, task_ledgers.internal_compressed, PAGE_SIZE);
1406			} else {
1407				pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
1408			}
1409
1410			if (pmap != kernel_pmap) {
1411				if (IS_REUSABLE_PAGE(pai)) {
1412					assert(pmap->stats.reusable > 0);
1413					OSAddAtomic(-1, &pmap->stats.reusable);
1414				} else if (IS_INTERNAL_PAGE(pai)) {
1415					assert(pmap->stats.internal > 0);
1416					OSAddAtomic(-1, &pmap->stats.internal);
1417				} else {
1418					assert(pmap->stats.external > 0);
1419					OSAddAtomic(-1, &pmap->stats.external);
1420				}
1421			}
1422
1423			/*
1424		         * Deal with the pv_rooted_entry.
1425		         */
1426
1427			if (pv_e == pv_h) {
1428				/*
1429				 * Fix up head later.
1430				 */
1431				pv_h->pmap = PMAP_NULL;
1432			} else {
1433				/*
1434				 * Delete this entry.
1435				 */
1436				pv_hash_remove(pvh_e);
1437				pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1438				pvh_eh = pvh_e;
1439
1440				if (pvh_et == PV_HASHED_ENTRY_NULL)
1441					pvh_et = pvh_e;
1442				pvh_cnt++;
1443			}
1444		} else {
1445			/*
1446		         * Write-protect, after opportunistic refmod collect
1447		         */
1448			pmap_phys_attributes[pai] |=
1449			    *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
1450			pmap_update_pte(pte, INTEL_PTE_WRITE, 0);
1451
1452			if (options & PMAP_OPTIONS_NOFLUSH)
1453				PMAP_UPDATE_TLBS_DELAYED(pmap, vaddr, vaddr + PAGE_SIZE, (pmap_flush_context *)arg);
1454			else
1455				PMAP_UPDATE_TLBS(pmap, vaddr, vaddr+PAGE_SIZE);
1456		}
1457		pvh_e = nexth;
1458	} while ((pv_e = (pv_rooted_entry_t) nexth) != pv_h);
1459
1460
1461	/*
1462	 * If pv_head mapping was removed, fix it up.
1463	 */
1464	if (pv_h->pmap == PMAP_NULL) {
1465		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
1466
1467		if (pvh_e != (pv_hashed_entry_t) pv_h) {
1468			pv_hash_remove(pvh_e);
1469			pv_h->pmap = pvh_e->pmap;
1470			pv_h->va = pvh_e->va;
1471			pvh_e->qlink.next = (queue_entry_t) pvh_eh;
1472			pvh_eh = pvh_e;
1473
1474			if (pvh_et == PV_HASHED_ENTRY_NULL)
1475				pvh_et = pvh_e;
1476			pvh_cnt++;
1477		}
1478	}
1479	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
1480		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
1481	}
1482done:
1483	UNLOCK_PVH(pai);
1484
1485	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
1486		   0, 0, 0, 0, 0);
1487}
1488
1489
1490/*
1491 *	Clear specified attribute bits.
1492 */
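
/*
 * Illustrative sketch (hypothetical caller): clearing the "modified" bit for
 * a page with an immediate TLB flush (options == 0, arg == NULL).
 */
#if 0
	phys_attribute_clear(pn, PHYS_MODIFIED, 0, NULL);
#endif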
1493void
1494phys_attribute_clear(
1495	ppnum_t		pn,
1496	int		bits,
1497	unsigned int	options,
1498	void		*arg)
1499{
1500	pv_rooted_entry_t	pv_h;
1501	pv_hashed_entry_t	pv_e;
1502	pt_entry_t		*pte;
1503	int			pai;
1504	pmap_t			pmap;
1505	char			attributes = 0;
1506	boolean_t		is_internal, is_reusable;
1507
1508	if ((bits & PHYS_MODIFIED) &&
1509	    (options & PMAP_OPTIONS_NOFLUSH) &&
1510	    arg == NULL) {
1511		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p): "
1512		      "should not clear 'modified' without flushing TLBs\n",
1513		      pn, bits, options, arg);
1514	}
1515
1516	pmap_intr_assert();
1517	assert(pn != vm_page_fictitious_addr);
1518	if (pn == vm_page_guard_addr)
1519		return;
1520
1521	pai = ppn_to_pai(pn);
1522
1523	if (!IS_MANAGED_PAGE(pai)) {
1524		/*
1525		 *	Not a managed page.
1526		 */
1527		return;
1528	}
1529
1530	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
1531		   pn, bits, 0, 0, 0);
1532
1533	pv_h = pai_to_pvh(pai);
1534
1535	LOCK_PVH(pai);
1536
1537	/*
1538	 * Walk down PV list, clearing all modify or reference bits.
1539	 * We do not have to lock the pv_list because we have
1540	 * the per-pmap lock
1541	 */
1542	if (pv_h->pmap != PMAP_NULL) {
1543		/*
1544		 * There are some mappings.
1545		 */
1546
1547		is_internal = IS_INTERNAL_PAGE(pai);
1548		is_reusable = IS_REUSABLE_PAGE(pai);
1549
1550		pv_e = (pv_hashed_entry_t)pv_h;
1551
1552		do {
1553			vm_map_offset_t	va;
1554			char pte_bits;
1555
1556			pmap = pv_e->pmap;
1557			va = pv_e->va;
1558			pte_bits = 0;
1559
1560			if (bits) {
1561				pte = pmap_pte(pmap, va);
1562				/* grab ref/mod bits from this PTE */
1563				pte_bits = (*pte & (PHYS_MODIFIED |
1564						    PHYS_REFERENCED));
1565				/* propagate to page's global attributes */
1566				attributes |= pte_bits;
1567				/* which bits to clear for this PTE? */
1568				pte_bits &= bits;
1569			}
1570
1571			 /*
1572			  * Clear modify and/or reference bits.
1573			  */
1574			if (pte_bits) {
1575				pmap_update_pte(pte, bits, 0);
1576
1577				/* Ensure all processors using this translation
1578				 * invalidate this TLB entry. The invalidation
1579				 * *must* follow the PTE update, to ensure that
1580				 * the TLB shadow of the 'D' bit (in particular)
1581				 * is synchronized with the updated PTE.
1582				 */
1583				if (! (options & PMAP_OPTIONS_NOFLUSH)) {
1584					/* flush TLBS now */
1585					PMAP_UPDATE_TLBS(pmap,
1586							 va,
1587							 va + PAGE_SIZE);
1588				} else if (arg) {
1589					/* delayed TLB flush: add "pmap" info */
1590					PMAP_UPDATE_TLBS_DELAYED(
1591						pmap,
1592						va,
1593						va + PAGE_SIZE,
1594						(pmap_flush_context *)arg);
1595				} else {
1596					/* no TLB flushing at all */
1597				}
1598			}
1599
1600			/* update pmap "reusable" stats */
1601			if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
1602			    is_reusable &&
1603			    pmap != kernel_pmap) {
1604				/* one less "reusable" */
1605				assert(pmap->stats.reusable > 0);
1606				OSAddAtomic(-1, &pmap->stats.reusable);
1607				if (is_internal) {
1608					/* one more "internal" */
1609					OSAddAtomic(+1, &pmap->stats.internal);
1610					PMAP_STATS_PEAK(pmap->stats.internal);
1611				} else {
1612					/* one more "external" */
1613					OSAddAtomic(+1, &pmap->stats.external);
1614					PMAP_STATS_PEAK(pmap->stats.external);
1615				}
1616			} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
1617				   !is_reusable &&
1618				   pmap != kernel_pmap) {
1619				/* one more "reusable" */
1620				OSAddAtomic(+1, &pmap->stats.reusable);
1621				PMAP_STATS_PEAK(pmap->stats.reusable);
1622				if (is_internal) {
1623					/* one less "internal" */
1624					assert(pmap->stats.internal > 0);
1625					OSAddAtomic(-1, &pmap->stats.internal);
1626				} else {
1627					/* one less "external" */
1628					assert(pmap->stats.external > 0);
1629					OSAddAtomic(-1, &pmap->stats.external);
1630				}
1631			}
1632
1633			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
1634
1635		} while (pv_e != (pv_hashed_entry_t)pv_h);
1636	}
1637	/* Opportunistic refmod collection, annulled
1638	 * if both REF and MOD are being cleared.
1639	 */
1640
1641	pmap_phys_attributes[pai] |= attributes;
1642	pmap_phys_attributes[pai] &= (~bits);
1643
1644	/* update this page's "reusable" status */
1645	if (options & PMAP_OPTIONS_CLEAR_REUSABLE) {
1646		pmap_phys_attributes[pai] &= ~PHYS_REUSABLE;
1647	} else if (options & PMAP_OPTIONS_SET_REUSABLE) {
1648		pmap_phys_attributes[pai] |= PHYS_REUSABLE;
1649	}
1650
1651	UNLOCK_PVH(pai);
1652
1653	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
1654		   0, 0, 0, 0, 0);
1655}
1656
1657/*
1658 *	Check specified attribute bits.
1659 */
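
/*
 * Illustrative sketch (hypothetical caller): the return value is the subset
 * of "bits" found to be set, so a non-zero result for PHYS_MODIFIED means at
 * least one mapping has dirtied the page.
 */
#if 0
	boolean_t dirty = (phys_attribute_test(pn, PHYS_MODIFIED) != 0);
#endif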
1660int
1661phys_attribute_test(
1662	ppnum_t		pn,
1663	int		bits)
1664{
1665	pv_rooted_entry_t	pv_h;
1666	pv_hashed_entry_t	pv_e;
1667	pt_entry_t		*pte;
1668	int			pai;
1669	pmap_t			pmap;
1670	int			attributes = 0;
1671
1672	pmap_intr_assert();
1673	assert(pn != vm_page_fictitious_addr);
1674	if (pn == vm_page_guard_addr)
1675		return 0;
1676
1677	pai = ppn_to_pai(pn);
1678
1679	if (!IS_MANAGED_PAGE(pai)) {
1680		/*
1681		 *	Not a managed page.
1682		 */
1683		return 0;
1684	}
1685
	/*
	 * Fast check: if the bits have already been collected,
	 * there is no need to take any locks.
	 * If they are not set, we need to recheck after taking
	 * the lock, in case they got pulled in while
	 * we were waiting for it.
	 */
1693	if ((pmap_phys_attributes[pai] & bits) == bits)
1694		return bits;
1695
1696	pv_h = pai_to_pvh(pai);
1697
1698	LOCK_PVH(pai);
1699
1700	attributes = pmap_phys_attributes[pai] & bits;
1701
1702
1703	/*
1704	 * Walk down PV list, checking the mappings until we
1705	 * reach the end or we've found the desired attributes.
1706	 */
1707	if (attributes != bits &&
1708	    pv_h->pmap != PMAP_NULL) {
1709		/*
1710		 * There are some mappings.
1711		 */
1712		pv_e = (pv_hashed_entry_t)pv_h;
1713		do {
1714			vm_map_offset_t va;
1715
1716			pmap = pv_e->pmap;
1717			va = pv_e->va;
1718			/*
1719	 		 * pick up modify and/or reference bits from mapping
1720			 */
1721
1722			pte = pmap_pte(pmap, va);
1723			attributes |= (int)(*pte & bits);
1724
1725			pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
1726
1727		} while ((attributes != bits) &&
1728			 (pv_e != (pv_hashed_entry_t)pv_h));
1729	}
1730	pmap_phys_attributes[pai] |= attributes;
1731
1732	UNLOCK_PVH(pai);
1733	return (attributes);
1734}
1735
1736/*
1737 *	Routine:	pmap_change_wiring
1738 *	Function:	Change the wiring attribute for a map/virtual-address
1739 *			pair.
1740 *	In/out conditions:
1741 *			The mapping must already exist in the pmap.
1742 */
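
/*
 * Illustrative sketch (hypothetical caller): wiring down an existing mapping
 * so that its pte cannot be reclaimed.
 */
#if 0
	pmap_change_wiring(map, vaddr, TRUE);
#endif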
1743void
1744pmap_change_wiring(
1745	pmap_t		map,
1746	vm_map_offset_t	vaddr,
1747	boolean_t	wired)
1748{
1749	pt_entry_t	*pte;
1750
1751	PMAP_LOCK(map);
1752
1753	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
1754		panic("pmap_change_wiring: pte missing");
1755
1756	if (wired && !iswired(*pte)) {
1757		/*
1758		 * wiring down mapping
1759		 */
1760		pmap_ledger_credit(map, task_ledgers.wired_mem, PAGE_SIZE);
1761		OSAddAtomic(+1,  &map->stats.wired_count);
1762		pmap_update_pte(pte, 0, INTEL_PTE_WIRED);
1763	}
1764	else if (!wired && iswired(*pte)) {
1765		/*
1766		 * unwiring mapping
1767		 */
1768		assert(map->stats.wired_count >= 1);
1769		OSAddAtomic(-1,  &map->stats.wired_count);
1770		pmap_ledger_debit(map, task_ledgers.wired_mem, PAGE_SIZE);
1771		pmap_update_pte(pte, INTEL_PTE_WIRED, 0);
1772	}
1773
1774	PMAP_UNLOCK(map);
1775}
1776
/*
 *	"Backdoor" direct map routine for early mappings.
 *	Useful for mapping memory outside the range of managed
 *	physical memory (e.g., device memory).
 *	Always sets the A (accessed) and D (dirty) bits; sets
 *	NC (non-cacheable) if requested.
 */
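
/*
 * Illustrative sketch (hypothetical addresses, not part of the build): an
 * early, non-cacheable, read/write mapping of a device range.  The return
 * value is the first virtual address after the newly mapped region.
 */
#if 0
	vm_offset_t next_virt = pmap_map_bd(virt_base,
	    dev_phys_start, dev_phys_end,
	    VM_PROT_READ | VM_PROT_WRITE,
	    VM_MEM_NOT_CACHEABLE);
#endif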
1782
1783vm_offset_t
1784pmap_map_bd(
1785	vm_offset_t	virt,
1786	vm_map_offset_t	start_addr,
1787	vm_map_offset_t	end_addr,
1788	vm_prot_t	prot,
1789	unsigned int	flags)
1790{
1791	pt_entry_t	template;
1792	pt_entry_t	*pte;
1793	spl_t           spl;
1794	vm_offset_t	base = virt;
1795	template = pa_to_pte(start_addr)
1796		| INTEL_PTE_REF
1797		| INTEL_PTE_MOD
1798		| INTEL_PTE_WIRED
1799		| INTEL_PTE_VALID;
1800
1801	if ((flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) == VM_MEM_NOT_CACHEABLE) {
1802		template |= INTEL_PTE_NCACHE;
1803		if (!(flags & (VM_MEM_GUARDED)))
1804			template |= INTEL_PTE_PTA;
1805	}
1806
1807#if    defined(__x86_64__)
1808	if ((prot & VM_PROT_EXECUTE) == 0)
1809		template |= INTEL_PTE_NX;
1810#endif
1811
1812	if (prot & VM_PROT_WRITE)
1813		template |= INTEL_PTE_WRITE;
1814
1815	while (start_addr < end_addr) {
1816	        spl = splhigh();
1817		pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
1818		if (pte == PT_ENTRY_NULL) {
1819			panic("pmap_map_bd: Invalid kernel address\n");
1820		}
1821		pmap_store_pte(pte, template);
1822		splx(spl);
1823		pte_increment_pa(template);
1824		virt += PAGE_SIZE;
1825		start_addr += PAGE_SIZE;
1826	}
1827	flush_tlb_raw();
	PMAP_UPDATE_TLBS(kernel_pmap, base, virt);	/* virt now points just past the mapped range */
1829	return(virt);
1830}
1831
1832unsigned int
1833pmap_query_resident(
1834	pmap_t		pmap,
1835	addr64_t	s64,
1836	addr64_t	e64)
1837{
1838	pt_entry_t     *pde;
1839	pt_entry_t     *spte, *epte;
1840	addr64_t        l64;
1841	uint64_t        deadline;
1842	unsigned int	result;
1843
1844	pmap_intr_assert();
1845
1846	if (pmap == PMAP_NULL || pmap == kernel_pmap || s64 == e64)
1847		return 0;
1848
1849	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
1850		   pmap,
1851		   (uint32_t) (s64 >> 32), s64,
1852		   (uint32_t) (e64 >> 32), e64);
1853
1854	result = 0;
1855
1856	PMAP_LOCK(pmap);
1857
1858	deadline = rdtsc64() + max_preemption_latency_tsc;
1859
1860	while (s64 < e64) {
1861		l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size - 1);
1862		if (l64 > e64)
1863			l64 = e64;
1864		pde = pmap_pde(pmap, s64);
1865
1866		if (pde && (*pde & INTEL_PTE_VALID)) {
1867			if (*pde & INTEL_PTE_PS) {
1868				/* superpage: not supported */
1869			} else {
1870				spte = pmap_pte(pmap,
1871						(s64 & ~(pde_mapped_size - 1)));
1872				spte = &spte[ptenum(s64)];
1873				epte = &spte[intel_btop(l64 - s64)];
1874
1875				for (; spte < epte; spte++) {
1876					if (pte_to_pa(*spte) != 0) {
1877						result++;
1878					}
1879				}
1880
1881			}
1882		}
1883		s64 = l64;
1884
1885		if (s64 < e64 && rdtsc64() >= deadline) {
1886			PMAP_UNLOCK(pmap);
1887			PMAP_LOCK(pmap);
1888			deadline = rdtsc64() + max_preemption_latency_tsc;
1889		}
1890	}
1891
1892	PMAP_UNLOCK(pmap);
1893
1894	PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
1895		   pmap, 0, 0, 0, 0);
1896
1897	return result;
1898}
1899
1900#if MACH_ASSERT
1901void
1902pmap_set_process(
1903	__unused pmap_t pmap,
1904	__unused int pid,
1905	__unused char *procname)
1906{
1907}
1908#endif /* MACH_ASSERT */
1909