/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#ifndef	_I386_PMAP_INTERNAL_
#define _I386_PMAP_INTERNAL_
#ifdef MACH_KERNEL_PRIVATE

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/ledger.h>

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {			\
	simple_unlock(&(pmap)->lock);		\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)			\
	pmap_flush_tlbs(pmap, s, e, 0, NULL)


#define	PMAP_DELAY_TLB_FLUSH		0x01

#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c)			\
	pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)

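/*
 * Typical usage (illustrative only, not a fixed API contract): PTE updates
 * are made with the pmap locked, then the affected virtual range is flushed
 * on any CPU that may have it cached, e.g.
 *
 *	PMAP_LOCK(pmap);
 *	... modify PTEs covering [vstart, vend) ...
 *	PMAP_UPDATE_TLBS(pmap, vstart, vend);
 *	PMAP_UNLOCK(pmap);
 */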

#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)

#ifdef	PMAP_TRACES
extern	boolean_t	pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)						\
	if (pmap_trace) {						\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);			\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)				\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);				\

kern_return_t	pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int options);

kern_return_t	pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int options);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t pn);

boolean_t	phys_page_exists(
			ppnum_t pn);

void
pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

extern const boolean_t cpu_64bit;

/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *      N.B.  with the new combo rooted/hashed scheme it is
 *      only possible to remove individual non-rooted entries
 *      if they are found via the hashed chains, as there is no
 *      way to unlink the singly linked hashed entries when navigated to
 *      via the queue list off the rooted entries.  Think of it as
 *      hash/walk/pull, keeping track of the prev pointer while walking
 *      the singly linked hash list.  All of this is to save memory and
 *      keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps. In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry	*next;
    pmap_t		pmap;
    vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number. Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown. In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say, 8000 (or even 2000) Apache or
other processes and one or all terminate. The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, the likely cache miss for each node,
and the interrupt disabling needed for MP safety, the system became completely
unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
structures have the same first three fields allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match. The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/

typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if (0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH 4095   /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200

extern volatile uint32_t	mappingrecurse;
extern uint32_t  pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t	*pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry
						 * structures */

extern uint32_t		pv_hashed_free_count;
extern uint32_t		pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */
#define pv_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)  (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */
extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;	/* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void	PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void	PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
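
/*
 * Illustrative allocation pattern (hypothetical helper, not the actual
 * pmap_enter() logic, which also retries and replenishes the pools): try the
 * common free list first, fall back to the kernel reserve for the kernel
 * pmap (which must not block), and throttle then zalloc for user pmaps.
 */
#if 0	/* example only; never compiled */
static pv_hashed_entry_t
example_pv_alloc(pmap_t pmap)
{
	pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;

	PV_HASHED_ALLOC(&pvh_e);
	if (pvh_e == PV_HASHED_ENTRY_NULL) {
		if (pmap == kernel_pmap) {
			PV_HASHED_KERN_ALLOC(&pvh_e);	/* dip into the reserve */
		} else {
			pmap_pv_throttle(pmap);		/* may briefly block */
			pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
		}
	}
	return pvh_e;
}
#endif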

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)			\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)			\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define PHYS_MANAGED	INTEL_PTE_VALID /* page is managed */
#define PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
#define PHYS_INTERNAL	INTEL_PTE_WTHRU	/* page from internal object */
#define PHYS_REUSABLE	INTEL_PTE_WRITE /* page is "reusable" */

extern const boolean_t	pmap_disable_kheap_nx;
extern const boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))
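/* 2 MB on x86_64: one PDE covers a 512-entry page table of 4-KB pages */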


/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *      The system wide pmap lock has been removed. Now, paths take a lock
 *      on the pmap before changing its 'shape' and the reverse order lockers
 *      (coming in by phys ppn) take a lock on the corresponding pv and then
 *      retest to be sure nothing changed during the window before they locked
 *      and can then run up/down the pv lists holding the list lock. This also
 *      lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *      previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
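
/*
 * Illustrative sketch of the "reverse order" locking pattern described above
 * (hypothetical helper, not part of the pmap API): lock the pv list for a
 * physical page, then visit every [pmap, va] mapping recorded on it.  Real
 * callers additionally retest the PTEs after taking the lock, as noted in the
 * locking-protocol comment.
 */
#if 0	/* example only; never compiled */
static void
example_for_each_mapping(ppnum_t pn, void (*actionfn)(pmap_t, vm_map_offset_t))
{
	int			pai = ppn_to_pai(pn);
	pv_rooted_entry_t	pv_h = pai_to_pvh(pai);
	pv_rooted_entry_t	pv_e;

	LOCK_PVH(pai);
	if (pv_h->pmap != PMAP_NULL) {
		pv_e = pv_h;
		do {
			actionfn(pv_e->pmap, pv_e->va);
			pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink);
		} while (pv_e != pv_h);
	}
	UNLOCK_PVH(pai);
}
#endif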

extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern ppnum_t		last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 * when spinning through pmap_remove
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}
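
/*
 * Illustrative sketch of a combined lookup (hypothetical helper, not part of
 * the pmap API): check the rooted entry for the page first, then hash
 * [pmap, va] and walk the hash chain, as described in the PV hashing comment
 * above.  The real lookups also match the ppn and hold the PV hash lock.
 */
#if 0	/* example only; never compiled */
static boolean_t
example_pv_present(pv_rooted_entry_t pv_h, pmap_t pmap, vm_map_offset_t va)
{
	pv_hashed_entry_t pvh_e;

	if (pv_h->pmap == pmap && pv_h->va == va)
		return TRUE;			/* the rooted entry matches */

	for (pvh_e = *pvhash(pvhashidx(pmap, va));
	     pvh_e != PV_HASHED_ENTRY_NULL;
	     pvh_e = pvh_e->nexth) {
		if (pvh_e->pmap == pmap && pvh_e->va == va)
			return TRUE;		/* found on the hash chain */
	}
	return FALSE;
}
#endif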


/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked, does not unlock it
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

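/* TRUE when at most one bit of 'distance' is set, i.e. the two values XORed
 * by the caller differ in no more than a single bit position. */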
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable
 * corruption incidents observed in the field. These can be either software
 * induced (wild stores to the mapwindows where applicable, use-after-free
 * errors (typically of pages addressed physically), mis-directed DMAs, etc.)
 * or due to DRAM/memory-hierarchy/interconnect errors. Given the theoretical
 * rarity of these errors, the recording mechanism is deliberately not MP-safe.
 * The overarching goal is to still assert on potential software races, but to
 * attempt recovery from incidents identifiable as occurring due to issues
 * beyond the control of the pmap module. The latter includes single-bit
 * errors and malformed pagetable entries. We currently limit ourselves to
 * recovery/suppression of one incident per PMAP_PAGETABLE_CORRUPTION_INTERVAL
 * seconds, and details of the incident are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t	pmap;
	vm_map_offset_t vaddr;
	pt_entry_t pte;
	ppnum_t ppn;
	pmap_t pvpmap;
	vm_map_offset_t pvva;
	uint64_t abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t cpte = *ptep;
	ppnum_t	cpn = pa_index(pte_to_pa(cpte));
	ppnum_t	ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t	bitdex;
	pmap_t pvpmap = pv_h->pmap;
	vm_map_offset_t pvva = pv_h->va;
	boolean_t ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) ==  (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	}
	else
	{
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			    pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}


extern int	pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}

extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE  ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ?? */
// XXX


int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits,
			unsigned int	options,
			void		*arg);

//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)					\
	do {								\
		kprintf(fmt, ##args);					\
		printf(fmt, ##args);					\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);


/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t		ret;

	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq %2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
	return ret;
}

extern uint32_t pmap_update_clear_pte_count;

static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	}	while (!pmap_cmpx_pte(mptep, opte, npte));
}
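
/*
 * Typical usage (illustrative only): atomically clear and/or set bits in a
 * live PTE without losing concurrent accessed/dirty-bit updates made by the
 * hardware, e.g.
 *
 *	pmap_update_pte(ptep, INTEL_PTE_WRITE, 0);	-- revoke write access
 *	pmap_update_pte(ptep, 0, INTEL_PTE_WIRED);	-- mark the entry wired
 *
 * Callers then flush the affected range with PMAP_UPDATE_TLBS() as needed.
 */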

#if	defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap. we allocate this page in kernel vm.
 * this returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
		(vaddr < 0xFFFF800000000000ULL))) {
		return (NULL);
	}

#if	PMAP_ASSERT
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}

/*
 * Returns address of requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}
/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;

	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t     *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t     *pde;

	pde = pmap64_pde(m, v);

	return pde;
}


/*
 * return address of mapped pte for vaddr va in pmap pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	assert(pmap);
	pde = pmap64_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
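
/*
 * Illustrative use of the walkers above (hypothetical helper, not part of the
 * pmap API): translate a virtual address to its physical page number via the
 * leaf PTE.  Superpage mappings are ignored for brevity; pmap_pte() would
 * return the PDE itself in that case.
 */
#if 0	/* example only; never compiled */
static ppnum_t
example_pmap_find_ppn(pmap_t pmap, vm_map_offset_t vaddr)
{
	pt_entry_t *ptep = pmap_pte(pmap, vaddr);

	if (ptep == NULL || !(*ptep & INTEL_PTE_VALID))
		return 0;
	return (ppnum_t) pa_index(pte_to_pa(*ptep));
}
#endif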
#endif
#if	DEBUG
#define DPRINTF(x...)	kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */
