/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#ifndef	_I386_PMAP_INTERNAL_
#define _I386_PMAP_INTERNAL_
#ifdef MACH_KERNEL_PRIVATE

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/ledger.h>

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {			\
	simple_unlock(&(pmap)->lock);		\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)					\
	pmap_flush_tlbs(pmap, s, e)
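
/*
 * Usage sketch (illustrative only, not a definition from this file): callers
 * typically hold the pmap lock across a PTE update and flush the TLBs for
 * the affected range before dropping it.
 *
 *	PMAP_LOCK(pmap);
 *	... update PTEs covering [vstart, vend) ...
 *	PMAP_UPDATE_TLBS(pmap, vstart, vend);
 *	PMAP_UNLOCK(pmap);
 */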

#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)

#ifdef	PMAP_TRACES
extern	boolean_t	pmap_trace;
#define PMAP_TRACE(x,a,b,c,d,e)						\
	if (pmap_trace) {						\
		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);			\
	}
#else
#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e)				\
	KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);

kern_return_t	pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int options);

kern_return_t	pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int options);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t pn);

boolean_t	phys_page_exists(
			ppnum_t pn);

void pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

#if CONFIG_YONAH
extern boolean_t cpu_64bit;
#else
extern const boolean_t cpu_64bit;
#endif

/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *	N.B.  with the new combo rooted/hashed scheme it is
 *	only possible to remove individual non-rooted entries
 *	if they are found via the hashed chains, as there is no
 *	way to unlink the singly linked hashed entries when navigated to
 *	via the queue list off the rooted entries.  Think of it as
 *	hash/walk/pull, keeping track of the prev pointer while walking
 *	the singly linked hash list.  All of this is to save memory and
 *	keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps. In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry	*next;
    pmap_t		pmap;
    vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number. Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and so still panic; we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain from keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown. In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say, 8000 (or even 2000) apache or
other processes and one or all terminated. The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, the likely cache miss for each node, and
the interrupt disabling needed for MP safety, the system became completely
unresponsive for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) led to
a redesign of the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure, which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr]. These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated to every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
structures have the same first three fields, allowing some simplification in
the code.

They have these shapes:

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match. The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/

typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
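
/*
 * Lookup sketch (illustrative only): how a pv entry for (pmap, vaddr, ppn)
 * is found under the rooted/hashed scheme described above.  The rooted entry
 * for the page is checked first; on a miss, the hash chain selected by
 * pvhashidx() (defined later in this header) is walked.  Real callers,
 * e.g. pmap_pv_remove() below, additionally take the pv-head and hash locks.
 *
 *	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *	if (pv_h->pmap == pmap && pv_h->va == vaddr)
 *		... found the rooted entry ...
 *	pv_hashed_entry_t pvh_e = *pvhash(pvhashidx(pmap, vaddr));
 *	while (pvh_e != PV_HASHED_ENTRY_NULL) {
 *		if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn)
 *			... found the hashed entry ...
 *		pvh_e = pvh_e->nexth;
 *	}
 */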

//#define PV_DEBUG 1   /* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define NPVHASH 4095   /* MUST BE 2^N - 1 */
#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000
#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000
#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000
#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200

extern volatile uint32_t	mappingrecurse;
extern uint32_t  pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhash;
extern pv_hashed_entry_t	*pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry
						 * structures */

extern uint32_t		pv_hashed_free_count;
extern uint32_t		pv_hashed_kern_free_count;
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */
#define pv_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)  (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */
extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;	/* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void	PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void	PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}
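
/*
 * Allocation sketch (illustrative only): how a pmap_enter()-style path might
 * obtain a pv_hashed_entry_t with the helpers above.  This is a simplification
 * of the real logic; the kernel pmap draws from its dedicated reserve, while
 * user pmaps may be throttled and then fall back to zalloc().
 *
 *	pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;
 *	if (pmap == kernel_pmap) {
 *		PV_HASHED_KERN_ALLOC(&pvh_e);
 *	} else {
 *		PV_HASHED_ALLOC(&pvh_e);
 *		if (pvh_e == PV_HASHED_ENTRY_NULL) {
 *			pmap_pv_throttle(pmap);
 *			pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
 *		}
 *	}
 */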

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define PHYS_MANAGED	INTEL_PTE_VALID /* page is managed */
#define PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)

extern const boolean_t	pmap_disable_kheap_nx;
extern const boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))


/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system-wide pmap lock has been removed. Now, paths take a lock
 *	on the pmap before changing its 'shape', and the reverse-order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv, retest
 *	to be sure nothing changed during the window before they locked, and
 *	can then run up/down the pv lists holding the list lock. This also
 *	lets the pmap layer run (nearly completely) with interrupts enabled,
 *	unlike previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}
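
/*
 * Usage sketch (illustrative only): a physical-page based path (such as
 * pmap_page_protect) locks the pv head for the page, walks the pv list
 * rooted at it, and unlocks before returning.  Preemption stays disabled
 * for the duration, so the walk must be kept short.
 *
 *	int pai = ppn_to_pai(pn);
 *	LOCK_PVH(pai);
 *	... walk the list rooted at pai_to_pvh(pai) ...
 *	UNLOCK_PVH(pai);
 */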

extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern ppnum_t		last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 * When spinning through pmap_remove, ensure that we don't spend too much
 * time with preemption disabled.  I'm setting the current threshold to 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

extern int 		nx_enabled;
extern unsigned int    inuse_ptepages_count;

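/*
 * Hash a (pmap, va) pair into the pv hash table.  npvhash is of the form
 * 2^N - 1 (see NPVHASH above), so the final AND acts as a cheap modulus.
 */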
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}


/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain,
 * properly dealing with the anchor.
 * Must be called with the hash locked; does not unlock it.
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int           		pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh)
		panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t       *hashp;
	int                     pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int                     pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

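/*
 * TRUE if 'distance' has at most one bit set, i.e. the two values whose
 * XOR produced it differ in no more than a single bit position.
 */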
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t	pmap;
	vm_map_offset_t vaddr;
	pt_entry_t pte;
	ppnum_t ppn;
	pmap_t pvpmap;
	vm_map_offset_t pvva;
	uint64_t abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t 	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t cpte = *ptep;
	ppnum_t	cpn = pa_index(pte_to_pa(cpte));
	ppnum_t	ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t	bitdex;
	pmap_t pvpmap = pv_h->pmap;
	vm_map_offset_t pvva = pv_h->va;
	boolean_t ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	} else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	} else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	} else {
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t       pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int                     pvhash_idx;
	uint32_t                pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			    pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}


extern int 	pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}

extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE  ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ?? */
// XXX


int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits);

//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)					\
	do {								\
		kprintf(fmt, ##args);					\
		printf(fmt, ##args);					\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);


/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t		ret;

#ifdef __i386__
	/*
	 * Load the old value into %edx:%eax
	 * Load the new value into %ecx:%ebx
	 * Compare-exchange-8bytes at address entryp (loaded in %edi)
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchg8b (%1)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "D" (entryp),
		  "a" ((uint32_t)old),
		  "d" ((uint32_t)(old >> 32)),
		  "b" ((uint32_t)new),
		  "c" ((uint32_t)(new >> 32))
		: "memory");
#else
	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq %2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
#endif
	return ret;
}

extern uint32_t pmap_update_clear_pte_count;

static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	}	while (!pmap_cmpx_pte(mptep, opte, npte));
}
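
/*
 * Usage sketch (illustrative only): atomically revoke write permission on a
 * PTE while preserving its remaining bits.
 *
 *	pmap_update_pte(ptep, INTEL_PTE_WRITE, 0);
 */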

#if	defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
		(vaddr < 0xFFFF800000000000ULL))) {
		return (NULL);
	}

#if	PMAP_ASSERT
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}

/*
 * Returns address of requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}

/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;

	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t     *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t     *pde;

	pde = pmap64_pde(m, v);

	return pde;
}


/*
 * Return the address of the mapped PTE for vaddr in the given pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	assert(pmap);
	pde = pmap64_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
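
/*
 * Walk sketch (illustrative only): translating a virtual address to the
 * physical page it maps with the helpers above.
 *
 *	pt_entry_t *ptep = pmap_pte(pmap, vaddr);
 *	if (ptep != NULL && (*ptep & INTEL_PTE_VALID)) {
 *		ppnum_t pn = (ppnum_t) i386_btop(pte_to_pa(*ptep));
 *		... pn is the mapped physical page number ...
 *	}
 */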
#endif
#if	DEBUG
#define DPRINTF(x...)	kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */