/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#ifndef	_I386_PMAP_INTERNAL_
#define _I386_PMAP_INTERNAL_
#ifdef MACH_KERNEL_PRIVATE

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/ledger.h>
#include <kern/simple_lock.h>
#include <i386/bit_routines.h>

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {			\
	simple_unlock(&(pmap)->lock);		\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)			\
	pmap_flush_tlbs(pmap, s, e, 0, NULL)


#define	PMAP_DELAY_TLB_FLUSH		0x01

#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c)			\
	pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)


#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)

#ifdef	PMAP_TRACES
extern	boolean_t	pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)						\
	if (pmap_trace) {						\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);			\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)				\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);

kern_return_t	pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int options);

kern_return_t	pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int options);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t pn);

boolean_t	phys_page_exists(
			ppnum_t pn);

void
pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

extern const boolean_t cpu_64bit;

/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *      N.B. With the new combo rooted/hashed scheme it is
 *      only possible to remove individual non-rooted entries
 *      if they are found via the hashed chains, as there is no
 *      way to unlink the singly linked hashed entries when navigated to
 *      via the queue list off the rooted entries.  Think of it as
 *      hash/walk/pull, keeping track of the prev pointer while walking
 *      the singly linked hash list.  All of this is to save memory and
 *      keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps. In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry      *next;
    pmap_t                pmap;
    vm_map_offset_t       va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number. Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown. In that case,
we basically do this:

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}


The problem arose when we were running, say, 8000 (or even 2000) Apache or
other processes and one or all of them terminated. The list hanging off each
pv array entry could have thousands of entries.  We were continuously linearly
searching each of these lists as we stepped through the address space we were
tearing down.  Because of the locks we hold, the likely cache miss for each
node, and the interrupt disabling for MP issues, the system became completely
unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure, which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
structures have the same first three fields, allowing some simplification in
the code.

They have these shapes:

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match. The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/
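
/*
 * Illustrative sketch only (not part of the pmap interface): locating the
 * pv entry for a given [pmap, vaddr] mapping of physical page ppn under the
 * combined rooted/hashed scheme described above.  Locking and corruption
 * handling are elided; the helpers referenced (pai_to_pvh, pvhashidx,
 * pvhash) are defined later in this header.
 *
 *	pv_rooted_entry_t  pv_h  = pai_to_pvh(ppn_to_pai(ppn));
 *	pv_hashed_entry_t  pvh_e = PV_HASHED_ENTRY_NULL;
 *
 *	if (pv_h->pmap == pmap && pv_h->va == vaddr) {
 *		// hit on the rooted entry; no hash walk needed
 *	} else {
 *		pvh_e = *pvhash(pvhashidx(pmap, vaddr));
 *		while (pvh_e != PV_HASHED_ENTRY_NULL &&
 *		    !(pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn))
 *			pvh_e = pvh_e->nexth;
 *		// pvh_e now points at the hashed entry, or is NULL if no
 *		// such mapping is recorded
 *	}
 */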

typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if (0 == npvhashmask) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASHBUCKETS (4096)
#define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200

extern volatile uint32_t	mappingrecurse;
extern uint32_t  pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhashmask;
extern pv_hashed_entry_t	*pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)
decl_simple_lock_data(extern, phys_backup_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry
						 * structures */

extern uint32_t		pv_hashed_free_count;
extern uint32_t		pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */
#define pv_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)  (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */
extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;	/* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void	PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

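/*
 * Return a chain of pv_cnt hashed entries, headed by pvh_eh and terminated
 * by pvh_et, to the general free list.  The chain is spliced onto the list
 * head under pv_hashed_free_list_lock.
 */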
static inline void	PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

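/*
 * Allocate a pv_hashed_entry_t from the kernel-reserved free list, updating
 * the reserve-allocation statistic.  The replenish thread is woken if the
 * reserve has dropped below its low-water mark.
 */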
static inline void	PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

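/*
 * Return a chain of pv_cnt hashed entries (pvh_eh through pvh_et) to the
 * kernel-reserved free list under pv_hashed_kern_free_list_lock.
 */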
static inline void	PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

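/*
 * Throttle a non-kernel mapping path when the kernel PV reserve has fallen
 * below half of its low-water mark: the caller blocks for up to ~1ms (or
 * until woken on pmap_user_pv_throttle_event) to let the replenish thread
 * catch up.
 */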
static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)			\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)			\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define PHYS_MANAGED	INTEL_PTE_VALID /* page is managed */
#define PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
#define PHYS_INTERNAL	INTEL_PTE_WTHRU	/* page from internal object */
#define PHYS_REUSABLE	INTEL_PTE_WRITE /* page is "reusable" */

extern const boolean_t	pmap_disable_kheap_nx;
extern const boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))


/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *      The system wide pmap lock has been removed. Now, paths take a lock
 *      on the pmap before changing its 'shape' and the reverse order lockers
 *      (coming in by phys ppn) take a lock on the corresponding pv and then
 *      retest to be sure nothing changed during the window before they locked
 *      and can then run up/down the pv lists holding the list lock. This also
 *      lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *      previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}

extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern ppnum_t		last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 * When spinning through pmap_remove,
 * ensure that we don't spend too much
 * time with preemption disabled.
 * The current threshold is set
 * to 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

extern int 		nx_enabled;
extern unsigned int    inuse_ptepages_count;

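/*
 * Hash a [pmap, virtual address] pair to an index in [0, npvhashmask]
 * by XORing the pmap pointer with the virtual page number.
 */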
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
	    ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	    npvhashmask;
	return hashidx;
}


/*
 * Unlink the pv_hashed_entry_t pvh from its singly linked hash chain,
 * properly dealing with the anchor.
 * Must be called with the hash bucket locked; does not unlock it.
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh)
		panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

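/*
 * Insert pvh_e onto the per-page queue rooted at pv_h and onto the head of
 * its hash chain.  Takes and releases the hash-bucket lock.
 */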
static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t       *hashp;
	int                     pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

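/*
 * Remove pvh_e from the per-page queue and from its hash chain.
 * Takes and releases the hash-bucket lock.
 */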
static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int                     pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

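/*
 * TRUE if at most one bit is set in 'distance', i.e. the two values whose
 * XOR produced it differ by a Hamming distance of zero or one.  Used below
 * to detect single-bit flips.
 */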
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable
 * corruption incidents observed in the field.  These can be either software
 * induced (wild stores to the mapwindows where applicable, use-after-free
 * errors (typically of pages addressed physically), mis-directed DMAs, etc.)
 * or due to DRAM/memory hierarchy/interconnect errors.  Given the theoretical
 * rarity of these errors, the recording mechanism is deliberately not MP-safe.
 * The overarching goal is to still assert on potential software races, but to
 * attempt recovery from incidents identifiable as occurring due to issues
 * beyond the control of the pmap module.  The latter includes single-bit
 * errors and malformed pagetable entries.  We currently limit ourselves to
 * recovery/suppression of one incident per PMAP_PAGETABLE_CORRUPTION_INTERVAL
 * seconds, and details of the incident are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t	pmap;
	vm_map_offset_t vaddr;
	pt_entry_t pte;
	ppnum_t ppn;
	pmap_t pvpmap;
	vm_map_offset_t pvva;
	uint64_t abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t 	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

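/*
 * Record the details of a pagetable corruption incident in the circular
 * (deliberately not MP-safe) log and schedule asynchronous reporting via
 * the corruption-log thread call.
 */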
static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

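/*
 * Heuristically classify a suspected pagetable/PV corruption observed at
 * (pmap, vaddr): attempt to repair single-bit flips in either field of the
 * PV entry or in the page frame number, ignore recognizably malformed PTEs,
 * and otherwise assert.  Returns the chosen recovery action; for
 * PMAP_ACTION_RETRY_RELOCK the PVH lock for the original page is dropped
 * and *ppnp is updated to the corrected page number.
 */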
static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t cpte = *ptep;
	ppnum_t	cpn = pa_index(pte_to_pa(cpte));
	ppnum_t	ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t	bitdex;
	pmap_t pvpmap = pv_h->pmap;
	vm_map_offset_t pvva = pv_h->va;
	boolean_t ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	} else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	} else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	} else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		} else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one, we remove it
		 * from the ppn queue and the hash chain,
		 * copy it to the rooted entry, and then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * Not removing the rooted pv.  Find it on the hash chain,
		 * remove it from the ppn queue and the hash chain, and free it.
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			    pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				} else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}


extern int	pt_fake_zone_index;
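
/*
 * Ledger/zone-info accounting helpers: credit or debit 'bytes' of
 * pagetable-related kernel memory against the pmap's ledger (tkm_private
 * for the P* variants, tkm_shared for the S* variants) and, for the P*
 * variants, against the task's counters for the fake page-table zone when
 * that zone index is active.
 */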
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}

extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE  ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ?? */
// XXX


int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits,
			unsigned int	options,
			void		*arg);

//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)					\
	do {								\
		kprintf(fmt, ##args);					\
		printf(fmt, ##args);					\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);


/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t		ret;

	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq %2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
	return ret;
}

extern uint32_t pmap_update_clear_pte_count;

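/*
 * Atomically clear 'pclear_bits' and set 'pset_bits' in the PTE at mptep,
 * retrying the compare-exchange until it succeeds.  A PTE that is observed
 * as zero (already cleared by another thread) is left untouched, and the
 * event is counted in pmap_update_clear_pte_count.
 */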
static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	} while (!pmap_cmpx_pte(mptep, opte, npte));
}

#if	defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel VM.
 * This returns the address of the requested pml4 entry in the top-level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
		(vaddr < 0xFFFF800000000000ULL))) {
		return (NULL);
	}

#if	PMAP_ASSERT
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}

/*
 * Returns the address of the requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}

/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;

	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	pde = pmap64_pde(m, v);

	return pde;
}


/*
 * Return the address of the mapped PTE for virtual address 'vaddr' in 'pmap'.
 *
 * In case the PDE maps a superpage, return the PDE, which in this case
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	assert(pmap);
	pde = pmap64_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
#endif /* defined(__x86_64__) */
#if	DEBUG
#define DPRINTF(x...)	kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */
