1/*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58
59/*
60 *	File:	pmap.c
61 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
62 *	(These guys wrote the Vax version)
63 *
64 *	Physical Map management code for Intel i386, i486, and i860.
65 *
66 *	Manages physical address maps.
67 *
68 *	In addition to hardware address maps, this
69 *	module is called upon to provide software-use-only
70 *	maps which may or may not be stored in the same
71 *	form as hardware maps.  These pseudo-maps are
72 *	used to store intermediate results from copy
73 *	operations to and from address spaces.
74 *
75 *	Since the information managed by this module is
76 *	also stored by the logical address mapping module,
77 *	this module may throw away valid virtual-to-physical
78 *	mappings at almost any time.  However, invalidations
79 *	of virtual-to-physical mappings must be done as
80 *	requested.
81 *
82 *	In order to cope with hardware architectures which
83 *	make virtual-to-physical map invalidates expensive,
84 *	this module may delay invalidate or reduced protection
85 *	operations until such time as they are actually
86 *	necessary.  This module is given full information as
87 *	to which processors are currently using which maps,
88 *	and to when physical maps must be made correct.
89 */
90
91#include <string.h>
92#include <norma_vm.h>
93#include <mach_kdb.h>
94#include <mach_ldebug.h>
95
96#include <libkern/OSAtomic.h>
97
98#include <mach/machine/vm_types.h>
99
100#include <mach/boolean.h>
101#include <kern/thread.h>
102#include <kern/zalloc.h>
103#include <kern/queue.h>
104
105#include <kern/lock.h>
106#include <kern/kalloc.h>
107#include <kern/spl.h>
108
109#include <vm/pmap.h>
110#include <vm/vm_map.h>
111#include <vm/vm_kern.h>
112#include <mach/vm_param.h>
113#include <mach/vm_prot.h>
114#include <vm/vm_object.h>
115#include <vm/vm_page.h>
116
117#include <mach/machine/vm_param.h>
118#include <machine/thread.h>
119
120#include <kern/misc_protos.h>			/* prototyping */
121#include <i386/misc_protos.h>
122
123#include <i386/cpuid.h>
124#include <i386/cpu_data.h>
125#include <i386/cpu_number.h>
126#include <i386/machine_cpu.h>
127#include <i386/mp_slave_boot.h>
128#include <i386/seg.h>
129#include <i386/serial_io.h>
130#include <i386/cpu_capabilities.h>
131#include <i386/machine_routines.h>
132#include <i386/proc_reg.h>
133#include <i386/tsc.h>
134
135#if	MACH_KDB
136#include <ddb/db_command.h>
137#include <ddb/db_output.h>
138#include <ddb/db_sym.h>
139#include <ddb/db_print.h>
140#endif	/* MACH_KDB */
141
142#include <vm/vm_protos.h>
143
144#include <i386/mp.h>
145#include <i386/mp_desc.h>
146
147#include <sys/kdebug.h>
148
149/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
150#ifdef DEBUGINTERRUPTS
151#define pmap_intr_assert() {if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);}
152#else
153#define pmap_intr_assert()
154#endif
155
156#ifdef IWANTTODEBUG
157#undef	DEBUG
158#define DEBUG 1
159#define POSTCODE_DELAY 1
160#include <i386/postcode.h>
161#endif /* IWANTTODEBUG */
162
163//#define PMAP_TRACES 1
164#ifdef	PMAP_TRACES
165boolean_t	pmap_trace = FALSE;
166#define PMAP_TRACE(x,a,b,c,d,e)						\
167	if (pmap_trace) {						\
168		KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e);			\
169	}
170#else
171#define PMAP_TRACE(x,a,b,c,d,e)	KERNEL_DEBUG(x,a,b,c,d,e)
172#endif /* PMAP_TRACES */
173
174/*
175 * Forward declarations for internal functions.
176 */
177void		pmap_expand_pml4(
178			pmap_t		map,
179			vm_map_offset_t	v);
180
181void		pmap_expand_pdpt(
182			pmap_t		map,
183			vm_map_offset_t	v);
184
185void	pmap_remove_range(
186			pmap_t		pmap,
187			vm_map_offset_t	va,
188			pt_entry_t	*spte,
189			pt_entry_t	*epte);
190
191void		phys_attribute_clear(
192			ppnum_t		phys,
193			int		bits);
194
195int		phys_attribute_test(
196			ppnum_t		phys,
197			int		bits);
198
199void		phys_attribute_set(
200			ppnum_t		phys,
201			int		bits);
202
203void		pmap_set_reference(
204			ppnum_t pn);
205
206void		pmap_movepage(
207			unsigned long	from,
208			unsigned long	to,
209			vm_size_t	size);
210
211boolean_t	phys_page_exists(
212			ppnum_t pn);
213
214
215#ifdef PMAP_DEBUG
216void dump_pmap(pmap_t);
217void dump_4GB_pdpt(pmap_t p);
218void dump_4GB_pdpt_thread(thread_t tp);
219#endif
220
221#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)
222
223int nx_enabled = 1;			/* enable no-execute protection */
224#ifdef CONFIG_EMBEDDED
225int allow_data_exec  = 0;	/* no exec from data, embedded is hardcore like that */
226#else
227int allow_data_exec  = VM_ABI_32;	/* 32-bit apps may execute data by default, 64-bit apps may not */
228#endif
229int allow_stack_exec = 0;		/* No apps may execute from the stack by default */
230
231int cpu_64bit  = 0;
232
233/*
234 * when spinning through pmap_remove
235 * ensure that we don't spend too much
236 * time with preemption disabled.
237 * I'm setting the current threshold
238 * to 20us
239 */
240#define MAX_PREEMPTION_LATENCY_NS 20000
241
242uint64_t max_preemption_latency_tsc = 0;
243
244
245/*
246 *	Private data structures.
247 */
248
249/*
250 *	For each vm_page_t, there is a list of all currently
251 *	valid virtual mappings of that page.  An entry is
252 *	a pv_rooted_entry_t; the list is the pv_table.
253 *
254 *      N.B.  with the new combo rooted/hashed scheme it is
255 *      only possibly to remove individual non-rooted entries
256 *      if they are found via the hashed chains as there is no
257 *      way to unlink the singly linked hashed entries if navigated to
258 *      via the queue list off the rooted entries.  Think of it as
259 *      hash/walk/pull, keeping track of the prev pointer while walking
260 *      the singly linked hash list.  All of this is to save memory and
261 *      keep both types of pv_entries as small as possible.
262 */
263
264/*
265
266PV HASHING Changes - JK 1/2007
267
268Pve's establish physical to virtual mappings.  These are used for aliasing of a
269physical page to (potentially many) virtual addresses within pmaps. In the previous
270implementation the structure of the pv_entries (each 16 bytes in size) was
271
272typedef struct pv_entry {
273    struct pv_entry_t    next;
274    pmap_t                    pmap;
275    vm_map_offset_t   va;
276} *pv_entry_t;
277
278An initial array of these is created at boot time, one per physical page of memory,
279indexed by the physical page number. Additionally, a pool of entries is created from a
280pv_zone to be used as needed by pmap_enter() when it is creating new mappings.
281Originally, we kept this pool around because the code in pmap_enter() was unable to
282block if it needed an entry and none were available - we'd panic.  Some time ago I
283restructured the pmap_enter() code so that for user pmaps it can block while zalloc'ing
284a pv structure and restart, removing a panic from the code (in the case of the kernel
285pmap we cannot block and still panic, so, we keep a separate hot pool for use only on
286kernel pmaps).  The pool has not been removed since there is a large performance gain
287keeping freed pv's around for reuse and not suffering the overhead of zalloc for every new pv we need.
288
289As pmap_enter() created new mappings it linked the new pve's for them off the fixed
290pv array for that ppn (off the next pointer).  These pve's are accessed for several
291operations, one of them being address space teardown.  In that case, we basically do this
292
293	for (every page/pte in the space) {
294		calc pve_ptr from the ppn in the pte
295		for (every pv in the list for the ppn) {
296			if (this pv is for this pmap/vaddr) {
297				do housekeeping
298				unlink/free the pv
299			}
300		}
301	}
302
303The problem arose when we were running, say 8000 (or even 2000) apache or other processes
304and one or all terminate. The list hanging off each pv array entry could have thousands of
305entries.  We were continuously linearly searching each of these lists as we stepped through
306the address space we were tearing down.  Because of the locks we hold, likely taking a cache
307miss for each node,  and interrupt disabling for MP issues the system became completely
308unresponsive for many seconds while we did this.
309
310Realizing that pve's are accessed in two distinct ways (linearly running the list by ppn
311for operations like pmap_page_protect and finding and modifying/removing a single pve as
312part of pmap_enter processing) has led to modifying the pve structures and databases.
313
314There are now two types of pve structures.  A "rooted" structure which is basically the
315original structure accessed in an array by ppn, and a ''hashed'' structure accessed on a
316hash list via a hash of [pmap, vaddr].  These have been designed with the two goals of
317minimizing wired memory and making the lookup of a ppn faster.  Since a vast majority of
318pages in the system are not aliased and hence represented by a single pv entry I've kept
319the rooted entry size as small as possible because there is one of these dedicated for
320every physical page of memory.  The hashed pve's are larger due to the addition of the hash
321link and the ppn entry needed for matching while running the hash list to find the entry we
322are looking for.  This way, only systems that have lots of aliasing (like 2000+ httpd procs)
323will pay the extra memory price. Both structures have the same first three fields allowing
324some simplification in the code.
325
326They have these shapes
327
328typedef struct pv_rooted_entry {
329        queue_head_t qlink;
330        vm_map_offset_t va;
331        pmap_t          pmap;
332} *pv_rooted_entry_t;
333
334
335typedef struct pv_hashed_entry {
336  queue_head_t qlink;
337  vm_map_offset_t va;
338  pmap_t        pmap;
339  ppnum_t ppn;
340  struct pv_hashed_entry *nexth;
341} *pv_hashed_entry_t;
342
343The main flow difference is that the code is now aware of the rooted entry and the hashed
344entries.  Code that runs the pv list still starts with the rooted entry and then continues
345down the qlink onto the hashed entries.  Code that is looking up a specific pv entry first
346checks the rooted entry and then hashes and runs the hash list for the match. The hash list
347lengths are much smaller than the original pv lists that contained all aliases for the specific ppn.
348
349*/
350
351typedef struct pv_rooted_entry {     /* first three entries must match pv_hashed_entry_t */
352        queue_head_t qlink;
353	vm_map_offset_t	va;		/* virtual address for mapping */
354	pmap_t		pmap;		/* pmap where mapping lies */
355} *pv_rooted_entry_t;
356
357#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)
358
359pv_rooted_entry_t	pv_head_table;		/* array of entries, one per page */
360
361typedef struct pv_hashed_entry {     /* first three entries must match pv_rooted_entry_t */
362  queue_head_t qlink;
363  vm_map_offset_t va;
364  pmap_t        pmap;
365  ppnum_t ppn;
366  struct pv_hashed_entry *nexth;
367} *pv_hashed_entry_t;
368
369#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)
370
371#define NPVHASH 4095   /* MUST BE 2^N - 1 */
372pv_hashed_entry_t     *pv_hash_table;  /* hash lists */
373
374uint32_t npvhash = 0;
375
376/* #define PV_DEBUG 1   uncomment to enable some PV debugging code */
377#ifdef PV_DEBUG
378#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized");
379#else
380#define CHK_NPVHASH()
381#endif
382
383/*
384 *	pv_list entries are kept on a list that can only be accessed
385 *	with the pmap system locked (at SPLVM, not in the cpus_active set).
386 *	The list is refilled from the pv_hashed_list_zone if it becomes empty.
387 */
388pv_rooted_entry_t	pv_free_list = PV_ROOTED_ENTRY_NULL;		/* free list at SPLVM */
389pv_hashed_entry_t	pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
390pv_hashed_entry_t      pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
391decl_simple_lock_data(,pv_hashed_free_list_lock)
392decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
393decl_simple_lock_data(,pv_hash_table_lock)
394
395int pv_free_count = 0;
396int pv_hashed_free_count = 0;
397int pv_kern_free_count = 0;
398int pv_hashed_kern_free_count = 0;
399#define PV_HASHED_LOW_WATER_MARK 5000
400#define PV_HASHED_KERN_LOW_WATER_MARK 100
401#define PV_HASHED_ALLOC_CHUNK 2000
402#define PV_HASHED_KERN_ALLOC_CHUNK 50
403thread_call_t  mapping_adjust_call;
404static thread_call_data_t  mapping_adjust_call_data;
405uint32_t mappingrecurse = 0;
406
407#define	PV_HASHED_ALLOC(pvh_e) { \
408	simple_lock(&pv_hashed_free_list_lock); \
409	if ((pvh_e = pv_hashed_free_list) != 0) { \
410	  pv_hashed_free_list = (pv_hashed_entry_t)pvh_e->qlink.next;	\
411            pv_hashed_free_count--; \
412            if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) \
413              if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
414                thread_call_enter(mapping_adjust_call); \
415	} \
416	simple_unlock(&pv_hashed_free_list_lock); \
417}
418
419#define	PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {	\
420	simple_lock(&pv_hashed_free_list_lock); \
421	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;	\
422	pv_hashed_free_list = pvh_eh; \
423        pv_hashed_free_count += pv_cnt; \
424	simple_unlock(&pv_hashed_free_list_lock); \
425}
426
427#define	PV_HASHED_KERN_ALLOC(pvh_e) { \
428	simple_lock(&pv_hashed_kern_free_list_lock); \
429	if ((pvh_e = pv_hashed_kern_free_list) != 0) { \
430	  pv_hashed_kern_free_list = (pv_hashed_entry_t)pvh_e->qlink.next;	\
431            pv_hashed_kern_free_count--; \
432            if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) \
433              if (hw_compare_and_store(0,1,(u_int *)&mappingrecurse)) \
434                thread_call_enter(mapping_adjust_call); \
435	} \
436	simple_unlock(&pv_hashed_kern_free_list_lock); \
437}
438
439#define	PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt) {	 \
440	simple_lock(&pv_hashed_kern_free_list_lock); \
441	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;	\
442	pv_hashed_kern_free_list = pvh_eh; \
443        pv_hashed_kern_free_count += pv_cnt; \
444	simple_unlock(&pv_hashed_kern_free_list_lock); \
445}
446
447zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */
448
449static zone_t pdpt_zone;
450
451/*
452 *	Each entry in the pv_head_table is locked by a bit in the
453 *	pv_lock_table.  The lock bits are accessed by the physical
454 *	address of the page they lock.
455 */
456
457char	*pv_lock_table;		/* pointer to array of bits */
458#define pv_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
459
460char    *pv_hash_lock_table;
461#define pv_hash_lock_table_size(n)  (((n)+BYTE_SIZE-1)/BYTE_SIZE)
462
463/*
464 *	First and last physical addresses that we maintain any information
465 *	for.  Initialized to zero so that pmap operations done before
466 *	pmap_init won't touch any non-existent structures.
467 */
468boolean_t	pmap_initialized = FALSE;/* Has pmap_init completed? */
469
470static struct vm_object kptobj_object_store;
471static vm_object_t kptobj;
472
473/*
474 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
475 */
476
477#define pa_index(pa)	(i386_btop(pa))
478#define ppn_to_pai(ppn)	((int)ppn)
479
480#define pai_to_pvh(pai)		(&pv_head_table[pai])
481#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
482#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
483
484#define pvhashidx(pmap, va) (((uint32_t)pmap ^ ((uint32_t)((uint64_t)va >> PAGE_SHIFT) & 0xFFFFFFFF)) & npvhash)
485#define pvhash(idx)         (&pv_hash_table[idx])
486
487#define lock_hash_hash(hash)	        bit_lock(hash, (void *)pv_hash_lock_table)
488#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)
489
490/*
491 *	Array of physical page attribites for managed pages.
492 *	One byte per physical page.
493 */
494char	*pmap_phys_attributes;
495unsigned int	last_managed_page = 0;
496
497/*
498 *	Physical page attributes.  Copy bits from PTE definition.
499 */
500#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
501#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
502#define PHYS_MANAGED	INTEL_PTE_VALID /* page is managed */
503
504/*
505 *	Amount of virtual memory mapped by one
506 *	page-directory entry.
507 */
508#define	PDE_MAPPED_SIZE		(pdetova(1))
509uint64_t pde_mapped_size;
510
511/*
512 *	Locking and TLB invalidation
513 */
514
515/*
516 *	Locking Protocols: (changed 2/2007 JK)
517 *
518 *	There are two structures in the pmap module that need locking:
519 *	the pmaps themselves, and the per-page pv_lists (which are locked
520 *	by locking the pv_lock_table entry that corresponds to the pv_head
521 *	for the list in question.)  Most routines want to lock a pmap and
522 *	then do operations in it that require pv_list locking -- however
523 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
524 *	basis and want to do the locking in the reverse order, i.e. lock
525 *	a pv_list and then go through all the pmaps referenced by that list.
526 *
527 *      The system wide pmap lock has been removed. Now, paths take a lock
528 *      on the pmap before changing its 'shape' and the reverse order lockers
529 *      (coming in by phys ppn) take a lock on the corresponding pv and then
530 *      retest to be sure nothing changed during the window before they locked
531 *      and can then run up/down the pv lists holding the list lock. This also
532 *      lets the pmap layer run (nearly completely) interrupt enabled, unlike
533 *      previously.
534 */
535
536/*
537 * pmap locking
538 */
539
540#define PMAP_LOCK(pmap) {	\
541	simple_lock(&(pmap)->lock);	\
542}
543
544#define PMAP_UNLOCK(pmap) {		\
545	simple_unlock(&(pmap)->lock);		\
546}
547
548/*
549 * PV locking
550 */
551
552#define LOCK_PVH(index)		{       \
553    mp_disable_preemption();           \
554    lock_pvh_pai(index);               \
555}
556
557#define UNLOCK_PVH(index)  {      \
558    unlock_pvh_pai(index);        \
559    mp_enable_preemption();       \
560}
561
562/*
563 * PV hash locking
564 */
565
566#define LOCK_PV_HASH(hash)         lock_hash_hash(hash)
567
568#define UNLOCK_PV_HASH(hash)       unlock_hash_hash(hash)
569
570#if	USLOCK_DEBUG
571extern int	max_lock_loops;
572#define LOOP_VAR							\
573	unsigned int	loop_count;					\
574	loop_count = disable_serial_output ? max_lock_loops		\
575					: max_lock_loops*100
576#define LOOP_CHECK(msg, pmap)						\
577	if (--loop_count == 0) {					\
578		mp_disable_preemption();				\
579	    	kprintf("%s: cpu %d pmap %x\n",				\
580			  msg, cpu_number(), pmap);			\
581            	Debugger("deadlock detection");				\
582		mp_enable_preemption();					\
583		loop_count = max_lock_loops;				\
584	}
585#else	/* USLOCK_DEBUG */
586#define LOOP_VAR
587#define LOOP_CHECK(msg, pmap)
588#endif	/* USLOCK_DEBUG */
589
590
591static void pmap_flush_tlbs(pmap_t pmap);
592
593#define PMAP_UPDATE_TLBS(pmap, s, e)					\
594	pmap_flush_tlbs(pmap)
595
596
597#define MAX_TBIS_SIZE	32		/* > this -> TBIA */ /* XXX */
598
599
600pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
601
602/*
603 *	Other useful macros.
604 */
605#define current_pmap()		(vm_map_pmap(current_thread()->map))
606
607struct pmap	kernel_pmap_store;
608pmap_t		kernel_pmap;
609
610pd_entry_t    high_shared_pde;
611pd_entry_t    commpage64_pde;
612
613struct zone	*pmap_zone;		/* zone of pmap structures */
614
615int		pmap_debug = 0;		/* flag for debugging prints */
616
617unsigned int	inuse_ptepages_count = 0;
618
619addr64_t	kernel64_cr3;
620boolean_t	no_shared_cr3 = FALSE;	/* -no_shared_cr3 boot arg */
621
622/*
623 *	Pmap cache.  Cache is threaded through ref_count field of pmap.
624 *	Max will eventually be constant -- variable for experimentation.
625 */
626int		pmap_cache_max = 32;
627int		pmap_alloc_chunk = 8;
628pmap_t		pmap_cache_list;
629int		pmap_cache_count;
630decl_simple_lock_data(,pmap_cache_lock)
631
632extern char end;
633
634static int nkpt;
635extern uint32_t lowGlo;
636
637pt_entry_t     *DMAP1, *DMAP2;
638caddr_t         DADDR1;
639caddr_t         DADDR2;
640
641static inline
642void pmap_pvh_unlink(pv_hashed_entry_t pv);
643
644/*
645 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
646 * properly deals with the anchor.
647 * must be called with the hash locked, does not unlock it
648 */
649
650static inline
651void pmap_pvh_unlink(pv_hashed_entry_t pvh)
652{
653  pv_hashed_entry_t curh;
654  pv_hashed_entry_t *pprevh;
655  int pvhash_idx;
656
657  CHK_NPVHASH();
658  pvhash_idx = pvhashidx(pvh->pmap, pvh->va);
659
660  pprevh = pvhash(pvhash_idx);
661
662#if PV_DEBUG
663  if (NULL == *pprevh) panic("pvh_unlink null anchor"); /* JK DEBUG */
664#endif
665  curh = *pprevh;
666
667  while (PV_HASHED_ENTRY_NULL != curh) {
668    if (pvh == curh)
669      break;
670    pprevh = &curh->nexth;
671    curh = curh->nexth;
672  }
673  if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
674  *pprevh = pvh->nexth;
675  return;
676}
677
678/*
679 * for legacy, returns the address of the pde entry.
680 * for 64 bit, causes the pdpt page containing the pde entry to be mapped,
681 * then returns the mapped address of the pde entry in that page
682 */
683pd_entry_t *
684pmap_pde(pmap_t m, vm_map_offset_t v)
685{
686  pd_entry_t *pde;
687	if (!cpu_64bit || (m == kernel_pmap)) {
688	  pde = (&((m)->dirbase[(vm_offset_t)(v) >> PDESHIFT]));
689	} else {
690	  assert(m);
691	  assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
692	  pde = pmap64_pde(m, v);
693	}
694	return pde;
695}
696
697
698/*
699 * the single pml4 page per pmap is allocated at pmap create time and exists
700 * for the duration of the pmap. we allocate this page in kernel vm (to save us one
701 * level of page table dynamic mapping.
702 * this returns the address of the requested pml4 entry in the top level page.
703 */
704static inline
705pml4_entry_t *
706pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
707{
708  return ((pml4_entry_t *)pmap->pm_hold + ((vm_offset_t)((vaddr>>PML4SHIFT)&(NPML4PG-1))));
709}
710
711/*
712 * maps in the pml4 page, if any, containing the pdpt entry requested
713 * and returns the address of the pdpt entry in that mapped page
714 */
715pdpt_entry_t *
716pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
717{
718  pml4_entry_t newpf;
719  pml4_entry_t *pml4;
720  int i;
721
722  assert(pmap);
723  assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
724  if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
725    return(0);
726  }
727
728  pml4 = pmap64_pml4(pmap, vaddr);
729
730	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
731
732		newpf = *pml4 & PG_FRAME;
733
734
735		for (i=PMAP_PDPT_FIRST_WINDOW; i < PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS; i++) {
736		  if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
737		  return((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
738			 ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
739		  }
740		}
741
742		  current_cpu_datap()->cpu_pmap->pdpt_window_index++;
743		  if (current_cpu_datap()->cpu_pmap->pdpt_window_index > (PMAP_PDPT_FIRST_WINDOW+PMAP_PDPT_NWINDOWS-1))
744		    current_cpu_datap()->cpu_pmap->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
745		  pmap_store_pte(
746				 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CMAP),
747				 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
748		  invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR));
749		  return ((pdpt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pdpt_window_index].prv_CADDR) +
750			  ((vm_offset_t)((vaddr>>PDPTSHIFT)&(NPDPTPG-1))));
751	}
752
753	return (NULL);
754}
755
756/*
757 * maps in the pdpt page, if any, containing the pde entry requested
758 * and returns the address of the pde entry in that mapped page
759 */
760pd_entry_t *
761pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
762{
763  pdpt_entry_t newpf;
764  pdpt_entry_t *pdpt;
765  int i;
766
767  assert(pmap);
768  assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
769  if ((vaddr > 0x00007FFFFFFFFFFFULL) && (vaddr < 0xFFFF800000000000ULL)) {
770    return(0);
771  }
772
773  /*  if (vaddr & (1ULL << 63)) panic("neg addr");*/
774  pdpt = pmap64_pdpt(pmap, vaddr);
775
776	  if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
777
778		newpf = *pdpt & PG_FRAME;
779
780		for (i=PMAP_PDE_FIRST_WINDOW; i < PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS; i++) {
781		  if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
782		  return((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
783			 ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
784		  }
785		}
786
787		  current_cpu_datap()->cpu_pmap->pde_window_index++;
788		  if (current_cpu_datap()->cpu_pmap->pde_window_index > (PMAP_PDE_FIRST_WINDOW+PMAP_PDE_NWINDOWS-1))
789		    current_cpu_datap()->cpu_pmap->pde_window_index = PMAP_PDE_FIRST_WINDOW;
790		  pmap_store_pte(
791				 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CMAP),
792				 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
793		  invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR));
794		  return ((pd_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pde_window_index].prv_CADDR) +
795			  ((vm_offset_t)((vaddr>>PDSHIFT)&(NPDPG-1))));
796	}
797
798	return (NULL);
799}
800
801/*
802 * Because the page tables (top 3 levels) are mapped into per cpu windows,
803 * callers must either disable interrupts or disable preemption before calling
804 * one of the pte mapping routines (e.g. pmap_pte()) as the returned vaddr
805 * is in one of those mapped windows and that cannot be allowed to change until
806 * the caller is done using the returned pte pointer. When done, the caller
807 * restores interrupts or preemption to its previous state after which point the
808 * vaddr for the returned pte can no longer be used
809 */
810
811
812/*
813 * return address of mapped pte for vaddr va in pmap pmap.
814 * must be called with pre-emption or interrupts disabled
815 * if targeted pmap is not the kernel pmap
816 * since we may be passing back a virtual address that is
817 * associated with this cpu... pre-emption or interrupts
818 * must remain disabled until the caller is done using
819 * the pointer that was passed back .
820 *
821 * maps the pde page, if any, containing the pte in and returns
822 * the address of the pte in that mapped page
823 */
824pt_entry_t     *
825pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
826{
827        pd_entry_t     *pde;
828	pd_entry_t     newpf;
829	int i;
830
831	assert(pmap);
832	pde = pmap_pde(pmap,vaddr);
833
834	if (pde && ((*pde & INTEL_PTE_VALID))) {
835	    if (pmap == kernel_pmap)
836	        return (vtopte(vaddr)); /* compat kernel still has pte's mapped */
837#if TESTING
838	    if (ml_get_interrupts_enabled() && get_preemption_level() == 0)
839	        panic("pmap_pte: unsafe call");
840#endif
841	        assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
842
843		newpf = *pde & PG_FRAME;
844
845		for (i=PMAP_PTE_FIRST_WINDOW; i < PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS; i++) {
846		  if (((*(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP)) & PG_FRAME) == newpf) {
847		  return((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR) +
848			 ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
849		  }
850		}
851
852		  current_cpu_datap()->cpu_pmap->pte_window_index++;
853		  if (current_cpu_datap()->cpu_pmap->pte_window_index > (PMAP_PTE_FIRST_WINDOW+PMAP_PTE_NWINDOWS-1))
854		    current_cpu_datap()->cpu_pmap->pte_window_index = PMAP_PTE_FIRST_WINDOW;
855		  pmap_store_pte(
856				 (current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CMAP),
857				 newpf | INTEL_PTE_RW | INTEL_PTE_VALID);
858		  invlpg((u_int)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR));
859		  return ((pt_entry_t *)(current_cpu_datap()->cpu_pmap->mapwindow[current_cpu_datap()->cpu_pmap->pte_window_index].prv_CADDR) +
860			  ((vm_offset_t)i386_btop(vaddr) & (NPTEPG-1)));
861	}
862
863	return(NULL);
864}
865
866
867/*
868 *	Map memory at initialization.  The physical addresses being
869 *	mapped are not managed and are never unmapped.
870 *
871 *	For now, VM is already on, we only need to map the
872 *	specified memory.
873 */
874vm_offset_t
875pmap_map(
876	vm_offset_t	virt,
877	vm_map_offset_t	start_addr,
878	vm_map_offset_t	end_addr,
879	vm_prot_t	prot,
880	unsigned int	flags)
881{
882	int		ps;
883
884	ps = PAGE_SIZE;
885	while (start_addr < end_addr) {
886		pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
887			   (ppnum_t) i386_btop(start_addr), prot, flags, FALSE);
888		virt += ps;
889		start_addr += ps;
890	}
891	return(virt);
892}
893
894/*
895 *	Back-door routine for mapping kernel VM at initialization.
896 * 	Useful for mapping memory outside the range
897 *      Sets no-cache, A, D.
898 *	Otherwise like pmap_map.
899 */
900vm_offset_t
901pmap_map_bd(
902	vm_offset_t	virt,
903	vm_map_offset_t	start_addr,
904	vm_map_offset_t	end_addr,
905	vm_prot_t	prot,
906	unsigned int	flags)
907{
908	pt_entry_t	template;
909	pt_entry_t	*pte;
910	spl_t           spl;
911
912	template = pa_to_pte(start_addr)
913		| INTEL_PTE_REF
914		| INTEL_PTE_MOD
915		| INTEL_PTE_WIRED
916		| INTEL_PTE_VALID;
917
918	if(flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT)) {
919	    template |= INTEL_PTE_NCACHE;
920	    if(!(flags & (VM_MEM_GUARDED | VM_WIMG_USE_DEFAULT)))
921		    template |= INTEL_PTE_PTA;
922	}
923
924	if (prot & VM_PROT_WRITE)
925	    template |= INTEL_PTE_WRITE;
926
927	while (start_addr < end_addr) {
928	        spl = splhigh();
929		pte = pmap_pte(kernel_pmap, (vm_map_offset_t)virt);
930		if (pte == PT_ENTRY_NULL) {
931			panic("pmap_map_bd: Invalid kernel address\n");
932		}
933		pmap_store_pte(pte, template);
934		splx(spl);
935		pte_increment_pa(template);
936		virt += PAGE_SIZE;
937		start_addr += PAGE_SIZE;
938	}
939
940	flush_tlb();
941	return(virt);
942}
943
944extern	char		*first_avail;
945extern	vm_offset_t	virtual_avail, virtual_end;
946extern	pmap_paddr_t	avail_start, avail_end;
947extern  vm_offset_t     etext;
948extern  void            *sectHIBB;
949extern  int             sectSizeHIB;
950
951void
952pmap_cpu_init(void)
953{
954	/*
955	 * Here early in the life of a processor (from cpu_mode_init()).
956	 * If we're not in 64-bit mode, enable the global TLB feature.
957	 * Note: regardless of mode we continue to set the global attribute
958	 * bit in ptes for all (32-bit) global pages such as the commpage.
959	 */
960	if (!cpu_64bit) {
961		set_cr4(get_cr4() | CR4_PGE);
962	}
963
964	/*
965	 * Initialize the per-cpu, TLB-related fields.
966	 */
967	current_cpu_datap()->cpu_active_cr3 = kernel_pmap->pm_cr3;
968	current_cpu_datap()->cpu_tlb_invalid = FALSE;
969}
970
971vm_offset_t
972pmap_high_shared_remap(enum high_fixed_addresses e, vm_offset_t va, int sz)
973{
974  vm_offset_t ve = pmap_index_to_virt(e);
975  pt_entry_t *ptep;
976  pmap_paddr_t pa;
977  int i;
978  spl_t s;
979
980  assert(0 == (va & PAGE_MASK));  /* expecting page aligned */
981  s = splhigh();
982  ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)ve);
983
984  for (i=0; i< sz; i++) {
985    pa = (pmap_paddr_t) kvtophys(va);
986    pmap_store_pte(ptep, (pa & PG_FRAME)
987				| INTEL_PTE_VALID
988		                | INTEL_PTE_GLOBAL
989				| INTEL_PTE_RW
990				| INTEL_PTE_REF
991				| INTEL_PTE_MOD);
992    va+= PAGE_SIZE;
993    ptep++;
994  }
995  splx(s);
996  return ve;
997}
998
999vm_offset_t
1000pmap_cpu_high_shared_remap(int cpu, enum high_cpu_types e, vm_offset_t va, int sz)
1001{
1002  enum high_fixed_addresses	a = e + HIGH_CPU_END * cpu;
1003  return pmap_high_shared_remap(HIGH_FIXED_CPUS_BEGIN + a, va, sz);
1004}
1005
1006void pmap_init_high_shared(void);
1007
1008extern vm_offset_t gdtptr, idtptr;
1009
1010extern uint32_t low_intstack;
1011
1012extern struct fake_descriptor ldt_desc_pattern;
1013extern struct fake_descriptor tss_desc_pattern;
1014
1015extern char hi_remap_text, hi_remap_etext;
1016extern char t_zero_div;
1017
1018pt_entry_t *pte_unique_base;
1019
1020void
1021pmap_init_high_shared(void)
1022{
1023
1024	vm_offset_t haddr;
1025        struct __gdt_desc_struct gdt_desc = {0,0,0};
1026	struct __idt_desc_struct idt_desc = {0,0,0};
1027	spl_t s;
1028#if MACH_KDB
1029	struct i386_tss *ttss;
1030#endif
1031
1032	kprintf("HIGH_MEM_BASE 0x%x fixed per-cpu begin 0x%x\n",
1033		HIGH_MEM_BASE,pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
1034	s = splhigh();
1035	pte_unique_base = pmap_pte(kernel_pmap, (vm_map_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN));
1036	splx(s);
1037
1038	if (i386_btop(&hi_remap_etext - &hi_remap_text + 1) >
1039				HIGH_FIXED_TRAMPS_END - HIGH_FIXED_TRAMPS + 1)
1040		panic("tramps too large");
1041	haddr = pmap_high_shared_remap(HIGH_FIXED_TRAMPS,
1042					(vm_offset_t) &hi_remap_text, 3);
1043	kprintf("tramp: 0x%x, ",haddr);
1044	printf("hi mem tramps at 0x%x\n",haddr);
1045	/* map gdt up high and update ptr for reload */
1046	haddr = pmap_high_shared_remap(HIGH_FIXED_GDT,
1047					(vm_offset_t) master_gdt, 1);
1048	__asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory");
1049	gdt_desc.address = haddr;
1050	kprintf("GDT: 0x%x, ",haddr);
1051	/* map ldt up high */
1052	haddr = pmap_high_shared_remap(HIGH_FIXED_LDT_BEGIN,
1053					(vm_offset_t) master_ldt,
1054					HIGH_FIXED_LDT_END - HIGH_FIXED_LDT_BEGIN + 1);
1055	kprintf("LDT: 0x%x, ",haddr);
1056	/* put new ldt addr into gdt */
1057	master_gdt[sel_idx(KERNEL_LDT)] = ldt_desc_pattern;
1058	master_gdt[sel_idx(KERNEL_LDT)].offset = (vm_offset_t) haddr;
1059	fix_desc(&master_gdt[sel_idx(KERNEL_LDT)], 1);
1060	master_gdt[sel_idx(USER_LDT)] = ldt_desc_pattern;
1061	master_gdt[sel_idx(USER_LDT)].offset = (vm_offset_t) haddr;
1062	fix_desc(&master_gdt[sel_idx(USER_LDT)], 1);
1063
1064	/* map idt up high */
1065	haddr = pmap_high_shared_remap(HIGH_FIXED_IDT,
1066					(vm_offset_t) master_idt, 1);
1067	__asm__ __volatile__("sidt %0" : "=m" (idt_desc));
1068	idt_desc.address = haddr;
1069	kprintf("IDT: 0x%x, ", haddr);
1070	/* remap ktss up high and put new high addr into gdt */
1071	haddr = pmap_high_shared_remap(HIGH_FIXED_KTSS,
1072					(vm_offset_t) &master_ktss, 1);
1073	master_gdt[sel_idx(KERNEL_TSS)] = tss_desc_pattern;
1074	master_gdt[sel_idx(KERNEL_TSS)].offset = (vm_offset_t) haddr;
1075	fix_desc(&master_gdt[sel_idx(KERNEL_TSS)], 1);
1076	kprintf("KTSS: 0x%x, ",haddr);
1077#if MACH_KDB
1078	/* remap dbtss up high and put new high addr into gdt */
1079	haddr = pmap_high_shared_remap(HIGH_FIXED_DBTSS,
1080					(vm_offset_t) &master_dbtss, 1);
1081	master_gdt[sel_idx(DEBUG_TSS)] = tss_desc_pattern;
1082	master_gdt[sel_idx(DEBUG_TSS)].offset = (vm_offset_t) haddr;
1083	fix_desc(&master_gdt[sel_idx(DEBUG_TSS)], 1);
1084	ttss = (struct i386_tss *)haddr;
1085	kprintf("DBTSS: 0x%x, ",haddr);
1086#endif	/* MACH_KDB */
1087
1088	/* remap dftss up high and put new high addr into gdt */
1089	haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1090					(vm_offset_t) &master_dftss, 1);
1091	master_gdt[sel_idx(DF_TSS)] = tss_desc_pattern;
1092	master_gdt[sel_idx(DF_TSS)].offset = (vm_offset_t) haddr;
1093	fix_desc(&master_gdt[sel_idx(DF_TSS)], 1);
1094	kprintf("DFTSS: 0x%x\n",haddr);
1095
1096	/* remap mctss up high and put new high addr into gdt */
1097	haddr = pmap_high_shared_remap(HIGH_FIXED_DFTSS,
1098					(vm_offset_t) &master_mctss, 1);
1099	master_gdt[sel_idx(MC_TSS)] = tss_desc_pattern;
1100	master_gdt[sel_idx(MC_TSS)].offset = (vm_offset_t) haddr;
1101	fix_desc(&master_gdt[sel_idx(MC_TSS)], 1);
1102	kprintf("MCTSS: 0x%x\n",haddr);
1103
1104	__asm__ __volatile__("lgdt %0": "=m" (gdt_desc));
1105	__asm__ __volatile__("lidt %0": "=m" (idt_desc));
1106	kprintf("gdt/idt reloaded, ");
1107	set_tr(KERNEL_TSS);
1108	kprintf("tr reset to KERNEL_TSS\n");
1109}
1110
1111
1112/*
1113 *	Bootstrap the system enough to run with virtual memory.
1114 *	Map the kernel's code and data, and allocate the system page table.
1115 *	Called with mapping OFF.  Page_size must already be set.
1116 *
1117 *	Parameters:
1118 *	load_start:	PA where kernel was loaded
1119 *	avail_start	PA of first available physical page -
1120 *			   after kernel page tables
1121 *	avail_end	PA of last available physical page
1122 *	virtual_avail	VA of first available page -
1123 *			   after kernel page tables
1124 *	virtual_end	VA of last available page -
1125 *			   end of kernel address space
1126 *
1127 *	&start_text	start of kernel text
1128 *	&etext		end of kernel text
1129 */
1130
1131void
1132pmap_bootstrap(
1133	__unused vm_offset_t	load_start,
1134	boolean_t		IA32e)
1135{
1136	vm_offset_t	va;
1137	pt_entry_t	*pte;
1138	int i;
1139	int wpkernel, boot_arg;
1140	pdpt_entry_t *pdpt;
1141	spl_t s;
1142
1143	vm_last_addr = VM_MAX_KERNEL_ADDRESS;	/* Set the highest address
1144						 * known to VM */
1145	/*
1146	 *	The kernel's pmap is statically allocated so we don't
1147	 *	have to use pmap_create, which is unlikely to work
1148	 *	correctly at this part of the boot sequence.
1149	 */
1150
1151
1152	kernel_pmap = &kernel_pmap_store;
1153	kernel_pmap->ref_count = 1;
1154	kernel_pmap->nx_enabled = FALSE;
1155	kernel_pmap->pm_task_map = TASK_MAP_32BIT;
1156	kernel_pmap->pm_obj = (vm_object_t) NULL;
1157	kernel_pmap->dirbase = (pd_entry_t *)((unsigned int)IdlePTD | KERNBASE);
1158	kernel_pmap->pdirbase = (pmap_paddr_t)((int)IdlePTD);
1159	pdpt = (pd_entry_t *)((unsigned int)IdlePDPT | KERNBASE );
1160	kernel_pmap->pm_pdpt = pdpt;
1161	kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePDPT);
1162
1163	va = (vm_offset_t)kernel_pmap->dirbase;
1164	/* setup self referential mapping(s) */
1165	for (i = 0; i< NPGPTD; i++, pdpt++) {
1166	  pmap_paddr_t pa;
1167	  pa = (pmap_paddr_t) kvtophys(va + i386_ptob(i));
1168	  pmap_store_pte(
1169	    (pd_entry_t *) (kernel_pmap->dirbase + PTDPTDI + i),
1170	    (pa & PG_FRAME) | INTEL_PTE_VALID | INTEL_PTE_RW | INTEL_PTE_REF |
1171	      INTEL_PTE_MOD | INTEL_PTE_WIRED) ;
1172	  pmap_store_pte(pdpt, pa | INTEL_PTE_VALID);
1173	}
1174
1175	cpu_64bit = IA32e;
1176
1177	lo_kernel_cr3 = kernel_pmap->pm_cr3;
1178	current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1179
1180	/* save the value we stuff into created pmaps to share the gdts etc */
1181	high_shared_pde = *pmap_pde(kernel_pmap, HIGH_MEM_BASE);
1182	/* make sure G bit is on for high shared pde entry */
1183	high_shared_pde |= INTEL_PTE_GLOBAL;
1184	s = splhigh();
1185	pmap_store_pte(pmap_pde(kernel_pmap, HIGH_MEM_BASE), high_shared_pde);
1186	splx(s);
1187
1188	nkpt = NKPT;
1189	inuse_ptepages_count += NKPT;
1190
1191	virtual_avail = (vm_offset_t)VADDR(KPTDI,0) + (vm_offset_t)first_avail;
1192	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
1193
1194	/*
1195	 * Reserve some special page table entries/VA space for temporary
1196	 * mapping of pages.
1197	 */
1198#define	SYSMAP(c, p, v, n)	\
1199	v = (c)va; va += ((n)*INTEL_PGBYTES); p = pte; pte += (n)
1200
1201	va = virtual_avail;
1202	pte = vtopte(va);
1203
1204        for (i=0; i<PMAP_NWINDOWS; i++) {
1205            SYSMAP(caddr_t,
1206		   (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
1207                   (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
1208		   1);
1209            *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
1210        }
1211
1212	/* DMAP user for debugger */
1213	SYSMAP(caddr_t, DMAP1, DADDR1, 1);
1214	SYSMAP(caddr_t, DMAP2, DADDR2, 1);  /* XXX temporary - can remove */
1215
1216	virtual_avail = va;
1217
1218	if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
1219	  if (0 != ((npvhash+1) & npvhash)) {
1220	    kprintf("invalid hash %d, must be ((2^N)-1), using default %d\n",npvhash,NPVHASH);
1221	    npvhash = NPVHASH;
1222	  }
1223	} else {
1224	  npvhash = NPVHASH;
1225	}
1226	printf("npvhash=%d\n",npvhash);
1227
1228	wpkernel = 1;
1229	if (PE_parse_boot_argn("wpkernel", &boot_arg, sizeof (boot_arg))) {
1230		if (boot_arg == 0)
1231			wpkernel = 0;
1232	}
1233
1234	s = splhigh();
1235
1236	/* Remap kernel text readonly unless the "wpkernel" boot-arg is present
1237 	 * and set to 0.
1238	 */
1239	if (wpkernel)
1240	{
1241		vm_offset_t     myva;
1242		pt_entry_t     *ptep;
1243
1244		for (myva = i386_round_page(MP_BOOT + MP_BOOTSTACK); myva < etext; myva += PAGE_SIZE) {
1245                        if (myva >= (vm_offset_t)sectHIBB && myva < ((vm_offset_t)sectHIBB + sectSizeHIB))
1246                                continue;
1247			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
1248			if (ptep)
1249				pmap_store_pte(ptep, *ptep & ~INTEL_PTE_RW);
1250		}
1251	}
1252
1253	/* no matter what,  kernel page zero is not accessible */
1254	pte = pmap_pte(kernel_pmap, 0);
1255	pmap_store_pte(pte, INTEL_PTE_INVALID);
1256
1257	/* map lowmem global page into fixed addr 0x2000 */
1258	if (0 == (pte = pmap_pte(kernel_pmap,0x2000))) panic("lowmem pte");
1259	assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); /* make sure it is defined on page boundary */
1260	pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)|INTEL_PTE_VALID|INTEL_PTE_REF|INTEL_PTE_MOD|INTEL_PTE_WIRED|INTEL_PTE_RW);
1261	splx(s);
1262	flush_tlb();
1263
1264	simple_lock_init(&kernel_pmap->lock, 0);
1265	simple_lock_init(&pv_hashed_free_list_lock, 0);
1266	simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
1267	simple_lock_init(&pv_hash_table_lock,0);
1268
1269	pmap_init_high_shared();
1270
1271	pde_mapped_size = PDE_MAPPED_SIZE;
1272
1273	if (cpu_64bit) {
1274	  pdpt_entry_t *ppdpt   = (pdpt_entry_t *)IdlePDPT;
1275	  pdpt_entry_t *ppdpt64 = (pdpt_entry_t *)IdlePDPT64;
1276	  pdpt_entry_t *ppml4   = (pdpt_entry_t *)IdlePML4;
1277	  int istate = ml_set_interrupts_enabled(FALSE);
1278
1279	  /*
1280	   * Clone a new 64-bit 3rd-level page table directory, IdlePML4,
1281	   * with page bits set for the correct IA-32e operation and so that
1282	   * the legacy-mode IdlePDPT is retained for slave processor start-up.
1283	   * This is necessary due to the incompatible use of page bits between
1284	   * 64-bit and legacy modes.
1285	   */
1286	  kernel_pmap->pm_cr3 = (pmap_paddr_t)((int)IdlePML4); /* setup in start.s for us */
1287	  kernel_pmap->pm_pml4 = IdlePML4;
1288	  kernel_pmap->pm_pdpt = (pd_entry_t *)
1289					((unsigned int)IdlePDPT64 | KERNBASE );
1290#define PAGE_BITS INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF
1291	  pmap_store_pte(kernel_pmap->pm_pml4,
1292		 	 (uint32_t)IdlePDPT64 | PAGE_BITS);
1293	  pmap_store_pte((ppdpt64+0), *(ppdpt+0) | PAGE_BITS);
1294	  pmap_store_pte((ppdpt64+1), *(ppdpt+1) | PAGE_BITS);
1295	  pmap_store_pte((ppdpt64+2), *(ppdpt+2) | PAGE_BITS);
1296	  pmap_store_pte((ppdpt64+3), *(ppdpt+3) | PAGE_BITS);
1297
1298	  /*
1299	   * The kernel is also mapped in the uber-sapce at the 4GB starting
1300	   * 0xFFFFFF80:00000000. This is the highest entry in the 4th-level.
1301	   */
1302	  pmap_store_pte((ppml4+KERNEL_UBER_PML4_INDEX), *(ppml4+0));
1303
1304	  kernel64_cr3 = (addr64_t) kernel_pmap->pm_cr3;
1305
1306	  /* Re-initialize descriptors and prepare to switch modes */
1307	  cpu_desc_init64(&cpu_data_master, TRUE);
1308	  current_cpu_datap()->cpu_is64bit = TRUE;
1309	  current_cpu_datap()->cpu_active_cr3 = kernel64_cr3;
1310
1311	  pde_mapped_size = 512*4096 ;
1312
1313	  ml_set_interrupts_enabled(istate);
1314	}
1315
1316	/* Set 64-bit mode if required. */
1317	cpu_mode_init(&cpu_data_master);
1318
1319	kernel_pmap->pm_hold = (vm_offset_t)kernel_pmap->pm_pml4;
1320
1321	kprintf("Kernel virtual space from 0x%x to 0x%x.\n",
1322			VADDR(KPTDI,0), virtual_end);
1323	printf("PAE enabled\n");
1324	if (cpu_64bit){
1325	  printf("64 bit mode enabled\n");kprintf("64 bit mode enabled\n"); }
1326
1327	kprintf("Available physical space from 0x%llx to 0x%llx\n",
1328			avail_start, avail_end);
1329
1330	/*
1331	 * By default for 64-bit users loaded at 4GB, share kernel mapping.
1332	 * But this may be overridden by the -no_shared_cr3 boot-arg.
1333	 */
1334	if (PE_parse_boot_argn("-no_shared_cr3", &no_shared_cr3, sizeof (no_shared_cr3))) {
1335		kprintf("Shared kernel address space disabled\n");
1336	}
1337
1338#ifdef	PMAP_TRACES
1339	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
1340		kprintf("Kernel traces for pmap operations enabled\n");
1341	}
1342#endif	/* PMAP_TRACES */
1343}
1344
1345void
1346pmap_virtual_space(
1347	vm_offset_t *startp,
1348	vm_offset_t *endp)
1349{
1350	*startp = virtual_avail;
1351	*endp = virtual_end;
1352}
1353
1354/*
1355 *	Initialize the pmap module.
1356 *	Called by vm_init, to initialize any structures that the pmap
1357 *	system needs to map virtual memory.
1358 */
1359void
1360pmap_init(void)
1361{
1362	register long		npages;
1363	vm_offset_t		addr;
1364	register vm_size_t	s;
1365	vm_map_offset_t		vaddr;
1366	ppnum_t ppn;
1367
1368	/*
1369	 *	Allocate memory for the pv_head_table and its lock bits,
1370	 *	the modify bit array, and the pte_page table.
1371	 */
1372
1373	/*
1374	 * zero bias all these arrays now instead of off avail_start
1375	 * so we cover all memory
1376	 */
1377
1378	npages = i386_btop(avail_end);
1379	s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
1380			 + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1))
1381			 + pv_lock_table_size(npages)
1382			 + pv_hash_lock_table_size((npvhash+1))
1383				+ npages);
1384
1385	s = round_page(s);
1386	if (kmem_alloc_wired(kernel_map, &addr, s) != KERN_SUCCESS)
1387		panic("pmap_init");
1388
1389	memset((char *)addr, 0, s);
1390
1391#if PV_DEBUG
1392	if (0 == npvhash) panic("npvhash not initialized");
1393#endif
1394
1395	/*
1396	 *	Allocate the structures first to preserve word-alignment.
1397	 */
1398	pv_head_table = (pv_rooted_entry_t) addr;
1399	addr = (vm_offset_t) (pv_head_table + npages);
1400
1401	pv_hash_table = (pv_hashed_entry_t *)addr;
1402	addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
1403
1404	pv_lock_table = (char *) addr;
1405	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
1406
1407	pv_hash_lock_table = (char *) addr;
1408	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
1409
1410	pmap_phys_attributes = (char *) addr;
1411	{
1412	        unsigned int i;
1413		unsigned int pn;
1414		ppnum_t  last_pn;
1415		pmap_memory_region_t *pmptr = pmap_memory_regions;
1416
1417		last_pn = i386_btop(avail_end);
1418
1419		for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
1420		        if (pmptr->type == kEfiConventionalMemory) {
1421			        for (pn = pmptr->base; pn <= pmptr->end; pn++) {
1422				        if (pn < last_pn) {
1423					        pmap_phys_attributes[pn] |= PHYS_MANAGED;
1424
1425						if (pn > last_managed_page)
1426						        last_managed_page = pn;
1427					}
1428				}
1429			}
1430		}
1431	}
1432
1433	/*
1434	 *	Create the zone of physical maps,
1435	 *	and of the physical-to-virtual entries.
1436	 */
1437	s = (vm_size_t) sizeof(struct pmap);
1438	pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
1439	s = (vm_size_t) sizeof(struct pv_hashed_entry);
1440	pv_hashed_list_zone = zinit(s, 10000*s, 4096, "pv_list"); /* XXX */
1441	s = 63;
1442	pdpt_zone = zinit(s, 400*s, 4096, "pdpt"); /* XXX */
1443
1444	kptobj = &kptobj_object_store;
1445	_vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG), kptobj);
1446	kernel_pmap->pm_obj = kptobj;
1447
1448	/* create pv entries for kernel pages mapped by low level
1449	   startup code.  these have to exist so we can pmap_remove()
1450	   e.g. kext pages from the middle of our addr space */
1451
1452	vaddr = (vm_map_offset_t)0;
1453	for (ppn = 0; ppn < i386_btop(avail_start) ; ppn++ ) {
1454	  pv_rooted_entry_t	pv_e;
1455
1456	  pv_e = pai_to_pvh(ppn);
1457	  pv_e->va = vaddr;
1458	  vaddr += PAGE_SIZE;
1459	  pv_e->pmap = kernel_pmap;
1460	  queue_init(&pv_e->qlink);
1461	}
1462
1463	pmap_initialized = TRUE;
1464
1465	/*
1466	 *	Initialize pmap cache.
1467	 */
1468	pmap_cache_list = PMAP_NULL;
1469	pmap_cache_count = 0;
1470	simple_lock_init(&pmap_cache_lock, 0);
1471
1472	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
1473
1474}
1475
1476void
1477x86_lowmem_free(void)
1478{
1479	/* free lowmem pages back to the vm system. we had to defer doing this
1480	   until the vm system was fully up.
1481	   the actual pages that are released are determined by which
1482	   pages the memory sizing code puts into the region table */
1483
1484	ml_static_mfree((vm_offset_t) i386_ptob(pmap_memory_regions[0].base),
1485			(vm_size_t) i386_ptob(pmap_memory_regions[0].end - pmap_memory_regions[0].base));
1486}
1487
1488
1489#define managed_page(x) ( (unsigned int)x <= last_managed_page && (pmap_phys_attributes[x] & PHYS_MANAGED) )
1490
1491/*
1492 * this function is only used for debugging fron the vm layer
1493 */
1494boolean_t
1495pmap_verify_free(
1496		 ppnum_t pn)
1497{
1498	pv_rooted_entry_t	pv_h;
1499	int		pai;
1500	boolean_t	result;
1501
1502	assert(pn != vm_page_fictitious_addr);
1503
1504	if (!pmap_initialized)
1505		return(TRUE);
1506
1507	if (pn == vm_page_guard_addr)
1508		return TRUE;
1509
1510	pai = ppn_to_pai(pn);
1511	if (!managed_page(pai))
1512		return(FALSE);
1513	pv_h = pai_to_pvh(pn);
1514	result = (pv_h->pmap == PMAP_NULL);
1515	return(result);
1516}
1517
1518boolean_t
1519pmap_is_empty(
1520       pmap_t          pmap,
1521       vm_map_offset_t vstart,
1522       vm_map_offset_t vend)
1523{
1524	vm_map_offset_t offset;
1525	ppnum_t         phys_page;
1526
1527	if (pmap == PMAP_NULL) {
1528		return TRUE;
1529	}
1530	for (offset = vstart;
1531	     offset < vend;
1532	     offset += PAGE_SIZE_64) {
1533		phys_page = pmap_find_phys(pmap, offset);
1534		if (phys_page) {
1535			if (pmap != kernel_pmap &&
1536			    pmap->pm_task_map == TASK_MAP_32BIT &&
1537			    offset >= HIGH_MEM_BASE) {
1538				/*
1539				 * The "high_shared_pde" is used to share
1540				 * the entire top-most 2MB of address space
1541				 * between the kernel and all 32-bit tasks.
1542				 * So none of this can be removed from 32-bit
1543				 * tasks.
1544				 * Let's pretend there's nothing up
1545				 * there...
1546				 */
1547				return TRUE;
1548			}
1549			kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1550				"page %d at 0x%llx\n",
1551				pmap, vstart, vend, phys_page, offset);
1552			return FALSE;
1553		}
1554	}
1555
1556	return TRUE;
1557}
1558
1559
1560/*
1561 *	Create and return a physical map.
1562 *
1563 *	If the size specified for the map
1564 *	is zero, the map is an actual physical
1565 *	map, and may be referenced by the
1566 *	hardware.
1567 *
1568 *	If the size specified is non-zero,
1569 *	the map will be used in software only, and
1570 *	is bounded by that size.
1571 */
1572pmap_t
1573pmap_create(
1574	    vm_map_size_t	sz,
1575	    boolean_t		is_64bit)
1576{
1577	pmap_t			p;
1578	int		i;
1579	vm_offset_t	va;
1580	vm_size_t	size;
1581	pdpt_entry_t    *pdpt;
1582	pml4_entry_t    *pml4p;
1583	pd_entry_t      *pdp;
1584	int template;
1585	spl_t s;
1586
1587	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1588		   (int) (sz>>32), (int) sz, (int) is_64bit, 0, 0);
1589
1590	size = (vm_size_t) sz;
1591
1592	/*
1593	 *	A software use-only map doesn't even need a map.
1594	 */
1595
1596	if (size != 0) {
1597		return(PMAP_NULL);
1598	}
1599
1600	p = (pmap_t) zalloc(pmap_zone);
1601	if (PMAP_NULL == p)
1602		panic("pmap_create zalloc");
1603
1604	/* init counts now since we'll be bumping some */
1605	simple_lock_init(&p->lock, 0);
1606	p->stats.resident_count = 0;
1607	p->stats.resident_max = 0;
1608	p->stats.wired_count = 0;
1609	p->ref_count = 1;
1610	p->nx_enabled = 1;
1611	p->pm_shared = FALSE;
1612
1613	assert(!is_64bit || cpu_64bit);
1614	p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;;
1615
1616	if (!cpu_64bit) {
1617		/* legacy 32 bit setup */
1618		/* in the legacy case the pdpt layer is hardwired to 4 entries and each
1619		 * entry covers 1GB of addr space */
1620		if (KERN_SUCCESS != kmem_alloc_wired(kernel_map, (vm_offset_t *)(&p->dirbase), NBPTD))
1621			panic("pmap_create kmem_alloc_wired");
1622		p->pm_hold = (vm_offset_t)zalloc(pdpt_zone);
1623		if ((vm_offset_t)NULL == p->pm_hold) {
1624			panic("pdpt zalloc");
1625		}
1626		pdpt = (pdpt_entry_t *) (( p->pm_hold + 31) & ~31);
1627		p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)pdpt);
1628		if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPGPTD*NPTDPG))))
1629			panic("pmap_create vm_object_allocate");
1630
1631		memset((char *)p->dirbase, 0, NBPTD);
1632
1633		va = (vm_offset_t)p->dirbase;
1634		p->pdirbase = kvtophys(va);
1635
1636		template = cpu_64bit ? INTEL_PTE_VALID|INTEL_PTE_RW|INTEL_PTE_USER|INTEL_PTE_REF : INTEL_PTE_VALID;
1637		for (i = 0; i< NPGPTD; i++, pdpt++ ) {
1638			pmap_paddr_t pa;
1639			pa = (pmap_paddr_t) kvtophys(va + i386_ptob(i));
1640			pmap_store_pte(pdpt, pa | template);
1641		}
1642
1643		/* map the high shared pde */
1644		s = splhigh();
1645		pmap_store_pte(pmap_pde(p, HIGH_MEM_BASE), high_shared_pde);
1646		splx(s);
1647
1648	} else {
1649	        /* 64 bit setup  */
1650
1651	        /* alloc the pml4 page in kernel vm */
1652	        if (KERN_SUCCESS != kmem_alloc_wired(kernel_map, (vm_offset_t *)(&p->pm_hold), PAGE_SIZE))
1653		        panic("pmap_create kmem_alloc_wired pml4");
1654
1655	        memset((char *)p->pm_hold, 0, PAGE_SIZE);
1656		p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_hold);
1657
1658	        vm_page_lock_queues();
1659		inuse_ptepages_count++;
1660		vm_page_unlock_queues();
1661
1662		/* allocate the vm_objs to hold the pdpt, pde and pte pages */
1663
1664		if (NULL == (p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS))))
1665			panic("pmap_create pdpt obj");
1666
1667		if (NULL == (p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS))))
1668			panic("pmap_create pdpt obj");
1669
1670		if (NULL == (p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS))))
1671			panic("pmap_create pte obj");
1672
1673		/* uber space points to uber mapped kernel */
1674		s = splhigh();
1675		pml4p = pmap64_pml4(p, 0ULL);
1676		pmap_store_pte((pml4p+KERNEL_UBER_PML4_INDEX),*kernel_pmap->pm_pml4);
1677
1678
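		/*
		 * A 32-bit task on a 64-bit cpu still needs the high shared
		 * pde (commpage, trampolines, GDT) visible in its address
		 * space, so hook it in here as well.
		 */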
1679		if (!is_64bit) {
1680			while ((pdp = pmap64_pde(p, (uint64_t)HIGH_MEM_BASE)) == PD_ENTRY_NULL) {
1681				splx(s);
1682				pmap_expand_pdpt(p, (uint64_t)HIGH_MEM_BASE); /* need room for another pde entry */
1683				s = splhigh();
1684			}
1685			pmap_store_pte(pdp, high_shared_pde);
1686		}
1687		splx(s);
1688	}
1689
	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1691		   (int) p, is_64bit, 0, 0, 0);
1692
1693	return(p);
1694}
1695
1696/*
 * The following routines implement the shared address optimization for 64-bit
1698 * users with a 4GB page zero.
1699 *
1700 * pmap_set_4GB_pagezero()
1701 *	is called in the exec and fork paths to mirror the kernel's
1702 *	mapping in the bottom 4G of the user's pmap. The task mapping changes
1703 *	from TASK_MAP_64BIT to TASK_MAP_64BIT_SHARED. This routine returns
1704 *	without doing anything if the -no_shared_cr3 boot-arg is set.
1705 *
1706 * pmap_clear_4GB_pagezero()
1707 *	is called in the exec/exit paths to undo this mirror. The task mapping
1708 *	reverts to TASK_MAP_64BIT. In addition, we switch to the kernel's
1709 *	CR3 by calling pmap_load_kernel_cr3().
1710 *
1711 * pmap_load_kernel_cr3()
1712 *	loads cr3 with the kernel's page table. In addition to being called
1713 * 	by pmap_clear_4GB_pagezero(), it is used both prior to teardown and
1714 *	when we go idle in the context of a shared map.
1715 *
1716 * Further notes on per-cpu data used:
1717 *
1718 *	cpu_kernel_cr3	is the cr3 for the kernel's pmap.
1719 *			This is loaded in a trampoline on entering the kernel
1720 *			from a 32-bit user (or non-shared-cr3 64-bit user).
1721 *	cpu_task_cr3	is the cr3 for the current thread.
1722 *			This is loaded in a trampoline as we exit the kernel.
1723 *	cpu_active_cr3	reflects the cr3 currently loaded.
1724 *			However, the low order bit is set when the
1725 *			processor is idle or interrupts are disabled
1726 *			while the system pmap lock is held. It is used by
1727 *			tlb shoot-down.
1728 *	cpu_task_map	indicates whether the task cr3 belongs to
1729 *			a 32-bit, a 64-bit or a 64-bit shared map.
 *			The latter allows the cr3 load to be avoided
 *			on kernel entry and exit.
 *	cpu_tlb_invalid	set TRUE when a tlb flush is requested.
 *			If the cr3 is "inactive" (the cpu is idle or the
 *			system-wide pmap lock is held) this is not serviced
 *			by an IPI but deferred until the cr3 becomes "active".
1736 */
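
/*
 * Illustrative sketch only -- the actual checks live in the kernel
 * entry/exit trampolines, not in C:
 *
 *	// on kernel exit
 *	if (current_cpu_datap()->cpu_task_map == TASK_MAP_64BIT_SHARED)
 *		;	// kernel cr3 already maps the task; skip the load
 *	else
 *		set64_cr3(current_cpu_datap()->cpu_task_cr3);
 */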
1737
1738void
1739pmap_set_4GB_pagezero(pmap_t p)
1740{
1741	pdpt_entry_t	*user_pdptp;
1742	pdpt_entry_t	*kern_pdptp;
1743
1744	assert(p->pm_task_map != TASK_MAP_32BIT);
1745
1746	/* Kernel-shared cr3 may be disabled by boot arg. */
1747	if (no_shared_cr3)
1748		return;
1749
1750	/*
	 * Set the bottom 4 3rd-level pte's to be the kernel's.
	 * Each PDPT entry covers 1GB, so this mirrors the kernel's
	 * low 4GB mapping into the user pmap.
1752	 */
1753	PMAP_LOCK(p);
1754	while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
1755		PMAP_UNLOCK(p);
1756		pmap_expand_pml4(p, 0x0);
1757		PMAP_LOCK(p);
1758	}
1759	kern_pdptp = kernel_pmap->pm_pdpt;
1760	pmap_store_pte(user_pdptp+0, *(kern_pdptp+0));
1761	pmap_store_pte(user_pdptp+1, *(kern_pdptp+1));
1762	pmap_store_pte(user_pdptp+2, *(kern_pdptp+2));
1763	pmap_store_pte(user_pdptp+3, *(kern_pdptp+3));
1764	p->pm_task_map = TASK_MAP_64BIT_SHARED;
1765	PMAP_UNLOCK(p);
1766}
1767
1768void
1769pmap_clear_4GB_pagezero(pmap_t p)
1770{
1771	pdpt_entry_t	*user_pdptp;
1772
1773	if (p->pm_task_map != TASK_MAP_64BIT_SHARED)
1774		return;
1775
1776	PMAP_LOCK(p);
1777
1778	p->pm_task_map = TASK_MAP_64BIT;
1779
1780	pmap_load_kernel_cr3();
1781
1782	user_pdptp = pmap64_pdpt(p, 0x0);
1783	pmap_store_pte(user_pdptp+0, 0);
1784	pmap_store_pte(user_pdptp+1, 0);
1785	pmap_store_pte(user_pdptp+2, 0);
1786	pmap_store_pte(user_pdptp+3, 0);
1787
1788	PMAP_UNLOCK(p);
1789}
1790
1791void
1792pmap_load_kernel_cr3(void)
1793{
1794	uint64_t	kernel_cr3;
1795
1796	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
1797
1798	/*
1799	 * Reload cr3 with the true kernel cr3.
1800	 */
1801	kernel_cr3 = current_cpu_datap()->cpu_kernel_cr3;
1802	set64_cr3(kernel_cr3);
1803	current_cpu_datap()->cpu_active_cr3 = kernel_cr3;
1804	current_cpu_datap()->cpu_tlb_invalid = FALSE;
1805	__asm__ volatile("mfence");
1806}
1807
1808/*
1809 *	Retire the given physical map from service.
1810 *	Should only be called if the map contains
1811 *	no valid mappings.
1812 */
1813
1814void
1815pmap_destroy(
1816	register pmap_t	p)
1817{
1818	register int		c;
1819
1820	if (p == PMAP_NULL)
1821		return;
1822
1823	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1824		   (int) p, 0, 0, 0, 0);
1825
1826	PMAP_LOCK(p);
1827
1828	c = --p->ref_count;
1829
1830	if (c == 0) {
1831		/*
1832		 * If some cpu is not using the physical pmap pointer that it
1833		 * is supposed to be (see set_dirbase), we might be using the
1834		 * pmap that is being destroyed! Make sure we are
1835		 * physically on the right pmap:
1836		 */
1837		PMAP_UPDATE_TLBS(p,
1838				 0x0ULL,
1839				 0xFFFFFFFFFFFFF000ULL);
1840	}
1841
1842	PMAP_UNLOCK(p);
1843
1844	if (c != 0) {
1845		PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1846			   (int) p, 1, 0, 0, 0);
1847	        return;	/* still in use */
1848	}
1849
1850	/*
1851	 *	Free the memory maps, then the
1852	 *	pmap structure.
1853	 */
1854	if (!cpu_64bit) {
1855	        vm_page_lock_queues();
1856		inuse_ptepages_count -= p->pm_obj->resident_page_count;
1857		vm_page_unlock_queues();
1858
1859		kmem_free(kernel_map, (vm_offset_t)p->dirbase, NBPTD);
1860		zfree(pdpt_zone, (void *)p->pm_hold);
1861
1862		vm_object_deallocate(p->pm_obj);
1863	} else {
1864	        /* 64 bit */
1865	        int inuse_ptepages = 0;
1866
1867		/* free 64 bit mode structs */
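		/* count the pml4 page (pm_hold) allocated at pmap_create time */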
1868		inuse_ptepages++;
1869		kmem_free(kernel_map, (vm_offset_t)p->pm_hold, PAGE_SIZE);
1870
1871		inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1872		vm_object_deallocate(p->pm_obj_pml4);
1873
1874		inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1875		vm_object_deallocate(p->pm_obj_pdpt);
1876
1877		inuse_ptepages += p->pm_obj->resident_page_count;
1878		vm_object_deallocate(p->pm_obj);
1879
1880		vm_page_lock_queues();
1881		inuse_ptepages_count -= inuse_ptepages;
1882		vm_page_unlock_queues();
1883	}
1884	zfree(pmap_zone, p);
1885
1886	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1887		   0, 0, 0, 0, 0);
1888
1889}
1890
1891/*
1892 *	Add a reference to the specified pmap.
1893 */
1894
1895void
1896pmap_reference(
1897	register pmap_t	p)
1898{
1899
1900	if (p != PMAP_NULL) {
1901	        PMAP_LOCK(p);
1902		p->ref_count++;
		PMAP_UNLOCK(p);
1904	}
1905}
1906
1907/*
1908 *	Remove a range of hardware page-table entries.
1909 *	The entries given are the first (inclusive)
1910 *	and last (exclusive) entries for the VM pages.
1911 *	The virtual address is the va for the first pte.
1912 *
1913 *	The pmap must be locked.
1914 *	If the pmap is not the kernel pmap, the range must lie
1915 *	entirely within one pte-page.  This is NOT checked.
1916 *	Assumes that the pte-page exists.
1917 */
1918
1919void
1920pmap_remove_range(
1921	pmap_t			pmap,
1922	vm_map_offset_t		start_vaddr,
1923	pt_entry_t		*spte,
1924	pt_entry_t		*epte)
1925{
1926	register pt_entry_t	*cpte;
1927	pv_hashed_entry_t       pvh_et = PV_HASHED_ENTRY_NULL;
1928	pv_hashed_entry_t       pvh_eh = PV_HASHED_ENTRY_NULL;
1929	pv_hashed_entry_t       pvh_e;
1930	int			pvh_cnt = 0;
1931	int			num_removed, num_unwired, num_found;
1932	int			pai;
1933	pmap_paddr_t		pa;
1934	vm_map_offset_t		vaddr;
1935	int                     pvhash_idx;
1936	uint32_t                pv_cnt;
1937
1938	num_removed = 0;
1939	num_unwired = 0;
1940	num_found   = 0;
1941
1942	if (pmap != kernel_pmap &&
1943	    pmap->pm_task_map == TASK_MAP_32BIT &&
1944	    start_vaddr >= HIGH_MEM_BASE) {
1945		/*
1946		 * The range is in the "high_shared_pde" which is shared
1947		 * between the kernel and all 32-bit tasks.  It holds
1948		 * the 32-bit commpage but also the trampolines, GDT, etc...
1949		 * so we can't let user tasks remove anything from it.
1950		 */
1951		return;
1952	}
1953
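	/*
	 * This is done in two passes: first clear the valid bit in each
	 * PTE so no new TLB entries can be loaded, then issue a single
	 * TLB shoot-down for the whole range, and only then tear down
	 * the pv list entries for the managed pages.
	 */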
1954	/* invalidate the PTEs first to "freeze" them */
1955	for (cpte = spte, vaddr = start_vaddr;
1956	     cpte < epte;
1957	     cpte++, vaddr += PAGE_SIZE_64) {
1958
1959	    pa = pte_to_pa(*cpte);
1960	    if (pa == 0)
1961		continue;
1962	    num_found++;
1963
1964	    if (iswired(*cpte))
1965		num_unwired++;
1966
1967	    pai = pa_index(pa);
1968
1969	    if (!managed_page(pai)) {
1970		/*
1971		 *	Outside range of managed physical memory.
1972		 *	Just remove the mappings.
1973		 */
1974		pmap_store_pte(cpte, 0);
1975		continue;
1976	    }
1977
1978	    /* invalidate the PTE */
1979	    pmap_update_pte(cpte, *cpte, (*cpte & ~INTEL_PTE_VALID));
1980	}
1981
1982	if (num_found == 0) {
1983		/* nothing was changed: we're done */
1984	        goto update_counts;
1985	}
1986
1987	/* propagate the invalidates to other CPUs */
1988
1989	PMAP_UPDATE_TLBS(pmap, start_vaddr, vaddr);
1990
1991	for (cpte = spte, vaddr = start_vaddr;
1992	     cpte < epte;
1993	     cpte++, vaddr += PAGE_SIZE_64) {
1994
1995	    pa = pte_to_pa(*cpte);
1996	    if (pa == 0)
1997		continue;
1998
1999	    pai = pa_index(pa);
2000
2001	    LOCK_PVH(pai);
2002
2003	    pa = pte_to_pa(*cpte);
2004	    if (pa == 0) {
2005	      UNLOCK_PVH(pai);
2006	      continue;
2007	    }
2008
2009	    num_removed++;
2010
2011	    /*
2012	     *	Get the modify and reference bits, then
2013	     *  nuke the entry in the page table
2014	     */
2015	    /* remember reference and change */
2016	    pmap_phys_attributes[pai] |=
2017		    (char)(*cpte & (PHYS_MODIFIED | PHYS_REFERENCED));
2018	    /* completely invalidate the PTE */
2019	    pmap_store_pte(cpte, 0);
2020
2021	    /*
2022	     *	Remove the mapping from the pvlist for
2023	     *	this physical page.
2024	     */
2025	    {
2026	      pv_rooted_entry_t	pv_h;
2027	      pv_hashed_entry_t *pprevh;
2028	      ppnum_t ppn = (ppnum_t)pai;
2029
2030		pv_h = pai_to_pvh(pai);
2031		pvh_e = PV_HASHED_ENTRY_NULL;
2032		if (pv_h->pmap == PMAP_NULL)
2033		    panic("pmap_remove_range: null pv_list!");
2034
2035		if (pv_h->va == vaddr && pv_h->pmap == pmap) { /* rooted or not */
		    /*
		     * The header is the pv_rooted_entry, which we cannot free.
		     * If another entry is queued after it, remove that entry
		     * from the ppn queue and the hash chain, copy its contents
		     * into the rooted entry, and free the hashed entry instead.
		     */
2042
2043		  pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2044		  if (pv_h != (pv_rooted_entry_t)pvh_e) {  /* any queued after rooted? */
2045		    CHK_NPVHASH();
2046		    pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2047		    LOCK_PV_HASH(pvhash_idx);
2048		    remque(&pvh_e->qlink);
2049		    {
2050		      pprevh = pvhash(pvhash_idx);
2051		      if (PV_HASHED_ENTRY_NULL == *pprevh) {
2052			panic("pmap_remove_range empty hash removing rooted pv");
2053		      }
2054		    }
2055		    pmap_pvh_unlink(pvh_e);
2056		    UNLOCK_PV_HASH(pvhash_idx);
2057		    pv_h->pmap = pvh_e->pmap;
2058		    pv_h->va = pvh_e->va;   /* dispose of pvh_e */
2059		  } else {  /* none queued after rooted */
2060		    pv_h->pmap = PMAP_NULL;
2061		    pvh_e = PV_HASHED_ENTRY_NULL;
2062		  }   /* any queued after rooted */
2063
2064		} else { /* rooted or not */
2065		  /* not removing rooted pv. find it on hash chain, remove from ppn queue and
2066		   * hash chain and free it */
2067		  CHK_NPVHASH();
2068		  pvhash_idx = pvhashidx(pmap,vaddr);
2069		  LOCK_PV_HASH(pvhash_idx);
2070		  pprevh = pvhash(pvhash_idx);
2071		  if (PV_HASHED_ENTRY_NULL == *pprevh) {
2072		    panic("pmap_remove_range empty hash removing hashed pv");
2073		    }
2074		  pvh_e = *pprevh;
2075		  pmap_pv_hashlist_walks++;
2076		  pv_cnt = 0;
2077		  while (PV_HASHED_ENTRY_NULL != pvh_e) {
2078			pv_cnt++;
2079			if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == ppn) break;
2080			pprevh = &pvh_e->nexth;
2081			pvh_e = pvh_e->nexth;
2082		  }
2083		  pmap_pv_hashlist_cnts += pv_cnt;
2084		  if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
2085		  if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_remove_range pv not on hash");
2086		  *pprevh = pvh_e->nexth;
2087		  remque(&pvh_e->qlink);
2088		  UNLOCK_PV_HASH(pvhash_idx);
2089
2090		} /* rooted or not */
2091
2092		UNLOCK_PVH(pai);
2093
2094		if (pvh_e != PV_HASHED_ENTRY_NULL) {
2095		  pvh_e->qlink.next = (queue_entry_t)pvh_eh;
2096		  pvh_eh = pvh_e;
2097
2098		  if (pvh_et == PV_HASHED_ENTRY_NULL) {
2099		    pvh_et = pvh_e;
2100		  }
2101
2102		  pvh_cnt++;
2103		}
2104
2105	    } /* removing mappings for this phy page */
2106	} /* for loop */
2107
2108	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2109	    PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2110	}
2111
2112update_counts:
2113	/*
2114	 *	Update the counts
2115	 */
2116#if TESTING
2117	if (pmap->stats.resident_count < num_removed)
2118	        panic("pmap_remove_range: resident_count");
2119#endif
2120	assert(pmap->stats.resident_count >= num_removed);
2121	OSAddAtomic(-num_removed, (SInt32 *) &pmap->stats.resident_count);
2122
2123#if TESTING
2124	if (pmap->stats.wired_count < num_unwired)
2125	        panic("pmap_remove_range: wired_count");
2126#endif
2127	assert(pmap->stats.wired_count >= num_unwired);
2128	OSAddAtomic(-num_unwired, (SInt32 *) &pmap->stats.wired_count);
2129
2130	return;
2131}
2132
2133/*
2134 *	Remove phys addr if mapped in specified map
2135 *
2136 */
2137void
2138pmap_remove_some_phys(
2139	__unused pmap_t		map,
2140	__unused ppnum_t         pn)
2141{
2142
2143/* Implement to support working set code */
2144
2145}
2146
2147/*
2148 *	Remove the given range of addresses
2149 *	from the specified map.
2150 *
2151 *	It is assumed that the start and end are properly
2152 *	rounded to the hardware page size.
2153 */
2154
2155
2156void
2157pmap_remove(
2158	pmap_t		map,
2159	addr64_t	s64,
2160	addr64_t	e64)
2161{
2162	pt_entry_t	*pde;
2163	pt_entry_t	*spte, *epte;
2164	addr64_t	l64;
2165	addr64_t    	orig_s64;
2166	uint64_t        deadline;
2167
2168	pmap_intr_assert();
2169
2170	if (map == PMAP_NULL || s64 == e64)
2171		return;
2172
2173	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
2174		   (int) map,
2175		   (int) (s64>>32), (int) s64,
2176		   (int) (e64>>32), (int) e64);
2177
2178	PMAP_LOCK(map);
2179
2180#if 0
2181	/*
2182	 * Check that address range in the kernel does not overlap the stacks.
2183	 * We initialize local static min/max variables once to avoid making
2184	 * 2 function calls for every remove. Note also that these functions
2185	 * both return 0 before kernel stacks have been initialized, and hence
2186	 * the panic is not triggered in this case.
2187	 */
2188	if (map == kernel_pmap) {
2189		static vm_offset_t	kernel_stack_min = 0;
2190		static vm_offset_t	kernel_stack_max = 0;
2191
2192		if (kernel_stack_min == 0) {
2193			kernel_stack_min = min_valid_stack_address();
2194			kernel_stack_max = max_valid_stack_address();
2195		}
2196		if  ((kernel_stack_min <= s64 && s64 <  kernel_stack_max) ||
2197		     (kernel_stack_min <  e64 && e64 <= kernel_stack_max))
2198			panic("pmap_remove() attempted in kernel stack");
2199	}
2200#else
2201
2202	/*
2203	 * The values of kernel_stack_min and kernel_stack_max are no longer
2204	 * relevant now that we allocate kernel stacks anywhere in the kernel map,
2205	 * so the old code above no longer applies.  If we wanted to check that
2206	 * we weren't removing a mapping of a page in a kernel stack we'd have to
2207	 * mark the PTE with an unused bit and check that here.
2208	 */
2209
2210#endif
2211
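	/*
	 * Long removals can hold the pmap lock for a while, so the loop
	 * below drops and re-takes the lock whenever this TSC deadline
	 * expires, giving preemption (and other cpus) a chance to get in.
	 */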
2212	deadline = rdtsc64() + max_preemption_latency_tsc;
2213
2214	orig_s64 = s64;
2215
2216	while (s64 < e64) {
2217
2218	    l64 = (s64 + pde_mapped_size) & ~(pde_mapped_size-1);
2219	    if (l64 > e64)
2220		l64 = e64;
2221	    pde = pmap_pde(map, s64);
2222
2223	    if (pde && (*pde & INTEL_PTE_VALID)) {
2224	        spte = (pt_entry_t *)pmap_pte(map, (s64 & ~(pde_mapped_size-1)));
2225		spte = &spte[ptenum(s64)];
2226		epte = &spte[intel_btop(l64-s64)];
2227
2228		pmap_remove_range(map, s64, spte, epte);
2229	    }
2230	    s64 = l64;
2231	    pde++;
2232
2233	    if (s64 < e64 && rdtsc64() >= deadline) {
	      PMAP_UNLOCK(map)
	      PMAP_LOCK(map)
2236
2237	      deadline = rdtsc64() + max_preemption_latency_tsc;
2238	    }
2239
2240	}
2241
2242	PMAP_UNLOCK(map);
2243
2244	PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END,
2245		   (int) map, 0, 0, 0, 0);
2246
2247}
2248
2249/*
2250 *	Routine:	pmap_page_protect
2251 *
2252 *	Function:
2253 *		Lower the permission for all mappings to a given
2254 *		page.
2255 */
2256void
2257pmap_page_protect(
2258        ppnum_t         pn,
2259	vm_prot_t	prot)
2260{
2261	pv_hashed_entry_t		pvh_eh = PV_HASHED_ENTRY_NULL;
2262	pv_hashed_entry_t		pvh_et = PV_HASHED_ENTRY_NULL;
2263	pv_hashed_entry_t       nexth;
2264	int			pvh_cnt = 0;
2265	pv_rooted_entry_t		pv_h;
2266	pv_rooted_entry_t		pv_e;
2267	pv_hashed_entry_t       pvh_e;
2268	pt_entry_t		*pte;
2269	int			pai;
2270	register pmap_t		pmap;
2271	boolean_t		remove;
2272	int                     pvhash_idx;
2273
2274	pmap_intr_assert();
2275	assert(pn != vm_page_fictitious_addr);
2276	if (pn == vm_page_guard_addr)
2277		return;
2278
2279	pai = ppn_to_pai(pn);
2280
2281	if (!managed_page(pai)) {
2282	    /*
2283	     *	Not a managed page.
2284	     */
2285	    return;
2286	}
2287
2288	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START,
2289		   (int) pn, (int) prot, 0, 0, 0);
2290
2291	/*
2292	 * Determine the new protection.
2293	 */
2294	switch (prot) {
2295	    case VM_PROT_READ:
2296	    case VM_PROT_READ|VM_PROT_EXECUTE:
2297		remove = FALSE;
2298		break;
2299	    case VM_PROT_ALL:
2300		return;	/* nothing to do */
2301	    default:
2302		remove = TRUE;
2303		break;
2304	}
2305
2306	pv_h = pai_to_pvh(pai);
2307
2308	LOCK_PVH(pai);
2309
2310	/*
2311	 * Walk down PV list, changing or removing all mappings.
2312	 */
2313	if (pv_h->pmap != PMAP_NULL) {
2314
2315	    pv_e = pv_h;
2316	    pvh_e = (pv_hashed_entry_t)pv_e; /* cheat */
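	    /*
	     * The rooted and hashed pv entries are assumed to share a
	     * common initial layout (qlink, va, pmap), which is what
	     * makes the cast above safe.
	     */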
2317
2318	    do {
2319	        register vm_map_offset_t vaddr;
2320		pmap = pv_e->pmap;
2321
2322		vaddr = pv_e->va;
2323		pte = pmap_pte(pmap, vaddr);
2324
2325		if (0 == pte) {
2326		    kprintf("pmap_page_protect pmap %p pn 0x%x vaddr 0x%llx\n",pmap, pn, vaddr);
2327		    panic("pmap_page_protect");
2328		}
2329
2330		nexth = (pv_hashed_entry_t)queue_next(&pvh_e->qlink);  /* if there is one */
2331
2332		/*
2333		 * Remove the mapping if new protection is NONE
2334		 * or if write-protecting a kernel mapping.
2335		 */
2336		if (remove || pmap == kernel_pmap) {
2337		    /*
2338		     * Remove the mapping, collecting any modify bits.
2339		     */
2340		    pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2341
2342		    PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2343
2344		    pmap_phys_attributes[pai] |= *pte & (PHYS_MODIFIED|PHYS_REFERENCED);
2345
2346		    pmap_store_pte(pte, 0);
2347
2348#if TESTING
2349		    if (pmap->stats.resident_count < 1)
2350		        panic("pmap_page_protect: resident_count");
2351#endif
2352		    assert(pmap->stats.resident_count >= 1);
2353		    OSAddAtomic(-1, (SInt32 *) &pmap->stats.resident_count);
2354
2355		    /*
2356		     * Deal with the pv_rooted_entry.
2357		     */
2358
2359		    if (pv_e == pv_h) {
2360			/*
2361			 * Fix up head later.
2362			 */
2363			pv_h->pmap = PMAP_NULL;
2364		    }
2365		    else {
2366			/*
2367			 * Delete this entry.
2368			 */
2369		      CHK_NPVHASH();
2370		      pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2371		      LOCK_PV_HASH(pvhash_idx);
2372		      remque(&pvh_e->qlink);
2373		      pmap_pvh_unlink(pvh_e);
2374		      UNLOCK_PV_HASH(pvhash_idx);
2375
		      pvh_e->qlink.next = (queue_entry_t)pvh_eh;
		      pvh_eh = pvh_e;

		      if (pvh_et == PV_HASHED_ENTRY_NULL)
			      pvh_et = pvh_e;
		      pvh_cnt++;
2382		    }
2383		} else {
2384		    /*
2385		     * Write-protect.
2386		     */
2387		    pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WRITE));
2388		    PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2389		}
2390
2391		pvh_e = nexth;
2392	    } while ((pv_e = (pv_rooted_entry_t)nexth) != pv_h);
2393
2394	    /*
2395	     * If pv_head mapping was removed, fix it up.
2396	     */
2397
2398	    if (pv_h->pmap == PMAP_NULL) {
2399	      pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2400
2401	      if (pvh_e != (pv_hashed_entry_t)pv_h) {
2402		CHK_NPVHASH();
2403		pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
2404		LOCK_PV_HASH(pvhash_idx);
2405		remque(&pvh_e->qlink);
2406		pmap_pvh_unlink(pvh_e);
2407		UNLOCK_PV_HASH(pvhash_idx);
		pv_h->pmap = pvh_e->pmap;
		pv_h->va = pvh_e->va;
		pvh_e->qlink.next = (queue_entry_t)pvh_eh;
		pvh_eh = pvh_e;

		if (pvh_et == PV_HASHED_ENTRY_NULL)
			pvh_et = pvh_e;
		pvh_cnt++;
2416		}
2417	    }
2418	}
2419	if (pvh_eh != PV_HASHED_ENTRY_NULL) {
2420	    PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pvh_cnt);
2421	}
2422
2423	UNLOCK_PVH(pai);
2424
2425	PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END,
2426		   0, 0, 0, 0, 0);
2427
2428}
2429
2430
2431/*
2432 *	Routine:
2433 *		pmap_disconnect
2434 *
2435 *	Function:
2436 *		Disconnect all mappings for this page and return reference and change status
2437 *		in generic format.
2438 *
2439 */
2440unsigned int pmap_disconnect(
2441	ppnum_t pa)
2442{
2443	pmap_page_protect(pa, 0);			/* disconnect the page */
2444	return (pmap_get_refmod(pa));			/* return ref/chg status */
2445}
2446
2447/*
2448 *	Set the physical protection on the
2449 *	specified range of this map as requested.
2450 *	Will not increase permissions.
2451 */
2452void
2453pmap_protect(
2454	pmap_t		map,
2455	vm_map_offset_t	sva,
2456	vm_map_offset_t	eva,
2457	vm_prot_t	prot)
2458{
2459	register pt_entry_t	*pde;
2460	register pt_entry_t	*spte, *epte;
2461	vm_map_offset_t		lva;
2462	vm_map_offset_t		orig_sva;
2463	boolean_t	set_NX;
2464	int		num_found = 0;
2465
2466	pmap_intr_assert();
2467
2468	if (map == PMAP_NULL)
2469		return;
2470
2471	if (prot == VM_PROT_NONE) {
2472		pmap_remove(map, sva, eva);
2473		return;
2474	}
2475
2476	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
2477		   (int) map,
2478		   (int) (sva>>32), (int) sva,
2479		   (int) (eva>>32), (int) eva);
2480
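	/*
	 * Mark the range non-executable only when execute permission is
	 * not being granted and NX is enabled both globally and for this
	 * pmap.
	 */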
2481	if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled )
2482	        set_NX = FALSE;
2483	else
2484	        set_NX = TRUE;
2485
2486	PMAP_LOCK(map);
2487
2488	orig_sva = sva;
2489	while (sva < eva) {
2490	    lva = (sva + pde_mapped_size) & ~(pde_mapped_size-1);
2491	    if (lva > eva)
2492		lva = eva;
2493	    pde = pmap_pde(map, sva);
2494	    if (pde && (*pde & INTEL_PTE_VALID)) {
2495	        spte = (pt_entry_t *)pmap_pte(map, (sva & ~(pde_mapped_size-1)));
2496		spte = &spte[ptenum(sva)];
2497		epte = &spte[intel_btop(lva-sva)];
2498
2499		while (spte < epte) {
2500
2501		    if (*spte & INTEL_PTE_VALID) {
2502
2503		        if (prot & VM_PROT_WRITE)
2504			    pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_WRITE));
2505			else
2506			    pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_WRITE));
2507
2508			if (set_NX == TRUE)
2509			    pmap_update_pte(spte, *spte, (*spte | INTEL_PTE_NX));
2510			else
2511			    pmap_update_pte(spte, *spte, (*spte & ~INTEL_PTE_NX));
2512
2513			num_found++;
2514		    }
2515		    spte++;
2516		}
2517	    }
2518	    sva = lva;
2519	}
2520	if (num_found)
2521	    PMAP_UPDATE_TLBS(map, orig_sva, eva);
2522
2523	PMAP_UNLOCK(map);
2524
2525	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
2526		   0, 0, 0, 0, 0);
2527
2528}
2529
/* Map a (possibly) autogenned block.  The size argument is in pages. */
2531void
2532pmap_map_block(
2533	pmap_t		pmap,
2534	addr64_t	va,
2535	ppnum_t 	pa,
2536	uint32_t	size,
2537	vm_prot_t	prot,
2538	int		attr,
2539	__unused unsigned int	flags)
2540{
2541    uint32_t page;
2542
2543    for (page = 0; page < size; page++) {
2544	pmap_enter(pmap, va, pa, prot, attr, TRUE);
2545	va += PAGE_SIZE;
2546	pa++;
2547    }
2548}
2549
2550
2551/*
2552 *	Insert the given physical page (p) at
2553 *	the specified virtual address (v) in the
2554 *	target physical map with the protection requested.
2555 *
2556 *	If specified, the page will be wired down, meaning
2557 *	that the related pte cannot be reclaimed.
2558 *
2559 *	NB:  This is the only routine which MAY NOT lazy-evaluate
2560 *	or lose information.  That is, this routine must actually
2561 *	insert this page into the given map NOW.
2562 */
2563void
2564pmap_enter(
2565	register pmap_t		pmap,
2566 	vm_map_offset_t		vaddr,
2567	ppnum_t                 pn,
2568	vm_prot_t		prot,
2569	unsigned int 		flags,
2570	boolean_t		wired)
2571{
2572	register pt_entry_t	*pte;
2573	register pv_rooted_entry_t	pv_h;
2574	register int		pai;
2575	pv_hashed_entry_t		pvh_e;
2576	pv_hashed_entry_t		pvh_new;
2577	pv_hashed_entry_t       *hashp;
2578	pt_entry_t		template;
2579	pmap_paddr_t		old_pa;
2580	pmap_paddr_t             pa = (pmap_paddr_t)i386_ptob(pn);
2581	boolean_t		need_tlbflush = FALSE;
2582	boolean_t		set_NX;
2583	char			oattr;
2584	int                     pvhash_idx;
2585	uint32_t                pv_cnt;
2586	boolean_t               old_pa_locked;
2587
2588	pmap_intr_assert();
2589	assert(pn != vm_page_fictitious_addr);
2590	if (pmap_debug)
2591		printf("pmap(%qx, %x)\n", vaddr, pn);
2592	if (pmap == PMAP_NULL)
2593		return;
2594	if (pn == vm_page_guard_addr)
2595		return;
2596
2597	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
2598		   (int) pmap,
2599		   (int) (vaddr>>32), (int) vaddr,
2600		   (int) pn, prot);
2601
2602	if ( (prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled )
2603	        set_NX = FALSE;
2604	else
2605	        set_NX = TRUE;
2606
2607	/*
2608	 *	Must allocate a new pvlist entry while we're unlocked;
2609	 *	zalloc may cause pageout (which will lock the pmap system).
2610	 *	If we determine we need a pvlist entry, we will unlock
	 *	and allocate one.  Then we will retry, throwing away
2612	 *	the allocated entry later (if we no longer need it).
2613	 */
2614
2615	pvh_new = PV_HASHED_ENTRY_NULL;
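	/*
	 * Retry target: if we have to drop all locks below to zalloc a
	 * pv entry (pvh_new), we come back here and re-validate; any
	 * entry we end up not needing is freed at Done.
	 */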
2616Retry:
2617	pvh_e = PV_HASHED_ENTRY_NULL;
2618
2619	PMAP_LOCK(pmap);
2620
2621	/*
2622	 *	Expand pmap to include this pte.  Assume that
2623	 *	pmap is always expanded to include enough hardware
2624	 *	pages to map one VM page.
2625	 */
2626
2627	while ((pte = pmap_pte(pmap, vaddr)) == PT_ENTRY_NULL) {
2628		/*
2629		 *	Must unlock to expand the pmap.
2630		 */
2631	        PMAP_UNLOCK(pmap);
2632		pmap_expand(pmap, vaddr); /* going to grow pde level page(s) */
2633		PMAP_LOCK(pmap);
2634	}
2635
2636	old_pa = pte_to_pa(*pte);
2637	pai = pa_index(old_pa);
2638	old_pa_locked = FALSE;
2639
2640	/*
2641	 * if we have a previous managed page, lock the pv entry now. after
2642	 * we lock it, check to see if someone beat us to the lock and if so
2643	 * drop the lock
2644	 */
2645
2646	if ((0 != old_pa) && managed_page(pai)) {
2647	  LOCK_PVH(pai);
2648	  old_pa_locked = TRUE;
2649	  old_pa = pte_to_pa(*pte);
2650	  if (0 == old_pa) {
2651	    UNLOCK_PVH(pai);  /* some other path beat us to it */
2652	    old_pa_locked = FALSE;
2653	  }
2654	}
2655
2656
2657	/*
2658	 *	Special case if the incoming physical page is already mapped
2659	 *	at this address.
2660	 */
2661	if (old_pa == pa) {
2662
2663	    /*
2664	     *	May be changing its wired attribute or protection
2665	     */
2666
2667	    template = pa_to_pte(pa) | INTEL_PTE_VALID;
2668
2669	    if(VM_MEM_NOT_CACHEABLE == (flags & (VM_MEM_NOT_CACHEABLE | VM_WIMG_USE_DEFAULT))) {
2670		if(!(flags & VM_MEM_GUARDED))
2671			template |= INTEL_PTE_PTA;
2672		template |= INTEL_PTE_NCACHE;
2673	    }
2674
2675	    if (pmap != kernel_pmap)
2676		template |= INTEL_PTE_USER;
2677	    if (prot & VM_PROT_WRITE)
2678		template |= INTEL_PTE_WRITE;
2679
2680	    if (set_NX == TRUE)
2681		template |= INTEL_PTE_NX;
2682
2683	    if (wired) {
2684		template |= INTEL_PTE_WIRED;
2685		if (!iswired(*pte))
2686		    OSAddAtomic(+1, (SInt32 *) &pmap->stats.wired_count);
2687	    }
2688	    else {
2689		if (iswired(*pte)) {
2690		    assert(pmap->stats.wired_count >= 1);
2691		    OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);
2692		}
2693	    }
2694
2695	    /* store modified PTE and preserve RC bits */
2696	    pmap_update_pte(pte, *pte, template | (*pte & (INTEL_PTE_REF | INTEL_PTE_MOD)));
2697	    if (old_pa_locked) {
2698	      UNLOCK_PVH(pai);
2699	      old_pa_locked = FALSE;
2700	    }
2701	    need_tlbflush = TRUE;
2702	    goto Done;
2703	}
2704
2705	/*
2706	 *	Outline of code from here:
2707	 *	   1) If va was mapped, update TLBs, remove the mapping
2708	 *	      and remove old pvlist entry.
2709	 *	   2) Add pvlist entry for new mapping
2710	 *	   3) Enter new mapping.
2711	 *
2712	 *	If the old physical page is not managed step 1) is skipped
2713	 *	(except for updating the TLBs), and the mapping is
2714	 *	overwritten at step 3).  If the new physical page is not
2715	 *	managed, step 2) is skipped.
2716	 */
2717
2718	if (old_pa != (pmap_paddr_t) 0) {
2719
2720	    /*
2721	     *	Don't do anything to pages outside valid memory here.
2722	     *	Instead convince the code that enters a new mapping
2723	     *	to overwrite the old one.
2724	     */
2725
2726	    /* invalidate the PTE */
2727	    pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_VALID));
2728	    /* propagate invalidate everywhere */
2729	    PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2730	    /* remember reference and change */
2731	    oattr = (char)(*pte & (PHYS_MODIFIED | PHYS_REFERENCED));
2732	    /* completely invalidate the PTE */
2733	    pmap_store_pte(pte, 0);
2734
2735	    if (managed_page(pai)) {
2736
2737#if TESTING
2738	        if (pmap->stats.resident_count < 1)
2739		    panic("pmap_enter: resident_count");
2740#endif
2741		assert(pmap->stats.resident_count >= 1);
2742		OSAddAtomic(-1, (SInt32 *) &pmap->stats.resident_count);
2743
2744	    	if (iswired(*pte)) {
2745
2746#if TESTING
2747	            if (pmap->stats.wired_count < 1)
2748		        panic("pmap_enter: wired_count");
2749#endif
2750		    assert(pmap->stats.wired_count >= 1);
2751		    OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);
2752		}
2753
2754		pmap_phys_attributes[pai] |= oattr;
2755		/*
2756		 *	Remove the mapping from the pvlist for
2757		 *	this physical page.
2758		 *      We'll end up with either a rooted pv or a
2759		 *      hashed pv
2760		 */
2761		{
2762
2763		    pv_h = pai_to_pvh(pai);
2764
2765		    if (pv_h->pmap == PMAP_NULL) {
2766			panic("pmap_enter: null pv_list!");
2767		    }
2768
2769		    if (pv_h->va == vaddr && pv_h->pmap == pmap) {
2770			/*
2771			 * Header is the pv_rooted_entry.
2772			 * If there is a next one, copy it to the
2773			 * header and free the next one (we cannot
2774			 * free the header)
2775			 */
2776		      pvh_e = (pv_hashed_entry_t)queue_next(&pv_h->qlink);
2777		      if (pvh_e != (pv_hashed_entry_t)pv_h) {
2778			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
2779			LOCK_PV_HASH(pvhash_idx);
2780			  remque(&pvh_e->qlink);
2781			  pmap_pvh_unlink(pvh_e);
2782			  UNLOCK_PV_HASH(pvhash_idx);
2783			  pv_h->pmap = pvh_e->pmap;
2784			  pv_h->va = pvh_e->va;
2785			}
2786		      else {
2787			pv_h->pmap = PMAP_NULL;
2788			pvh_e = PV_HASHED_ENTRY_NULL;
2789		      }
2790		    }
2791		    else {
2792		      pv_hashed_entry_t *pprevh;
2793		      ppnum_t old_ppn;
2794		      /* wasn't the rooted pv - hash, find it, and unlink it */
2795		      old_ppn = (ppnum_t)pa_index(old_pa);
2796		      CHK_NPVHASH();
2797		      pvhash_idx = pvhashidx(pmap,vaddr);
2798		      LOCK_PV_HASH(pvhash_idx);
2799		      pprevh = pvhash(pvhash_idx);
2800#if PV_DEBUG
2801		      if (NULL==pprevh)panic("pmap enter 1");
2802#endif
2803		      pvh_e = *pprevh;
2804		      pmap_pv_hashlist_walks++;
2805		      pv_cnt = 0;
2806		      while (PV_HASHED_ENTRY_NULL != pvh_e) {
2807			pv_cnt++;
2808			if (pvh_e->pmap == pmap && pvh_e->va == vaddr && pvh_e->ppn == old_ppn) break;
2809			pprevh = &pvh_e->nexth;
2810			pvh_e = pvh_e->nexth;
2811		      }
2812		      pmap_pv_hashlist_cnts += pv_cnt;
2813		      if (pmap_pv_hashlist_max < pv_cnt) pmap_pv_hashlist_max = pv_cnt;
2814		      if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pmap_enter: pv not in hash list");
2815		      if(NULL==pprevh)panic("pmap enter 2");
2816		      *pprevh = pvh_e->nexth;
2817		      remque(&pvh_e->qlink);
2818		      UNLOCK_PV_HASH(pvhash_idx);
2819		    }
2820		}
2821	    }
2822	    else {
2823
2824		/*
2825		 *	old_pa is not managed.
2826		 *	Do removal part of accounting.
2827		 */
2828
2829		if (iswired(*pte)) {
2830		    assert(pmap->stats.wired_count >= 1);
2831		    OSAddAtomic(-1, (SInt32 *) &pmap->stats.wired_count);
2832		}
2833	    }
2834	}
2835
2836	/*
	 * if we had a previously managed page locked, unlock it now
2838	 */
2839
2840	if (old_pa_locked) {
2841	  UNLOCK_PVH(pai);
2842	  old_pa_locked = FALSE;
2843	}
2844
2845	pai = pa_index(pa);     /* now working with new incoming phys page */
2846	if (managed_page(pai)) {
2847
2848	    /*
2849	     *	Step 2) Enter the mapping in the PV list for this
2850	     *	physical page.
2851	     */
2852	    pv_h = pai_to_pvh(pai);
2853
2854	    LOCK_PVH(pai);
2855
2856	    if (pv_h->pmap == PMAP_NULL) {
2857		/*
		 *	No mappings yet, use the rooted pv
2859		 */
2860		pv_h->va = vaddr;
2861		pv_h->pmap = pmap;
2862		queue_init(&pv_h->qlink);
2863	    }
2864	    else {
2865		/*
2866		 *	Add new pv_hashed_entry after header.
2867		 */
2868		if ((PV_HASHED_ENTRY_NULL == pvh_e) && pvh_new) {
2869		  pvh_e = pvh_new;
2870		  pvh_new = PV_HASHED_ENTRY_NULL;  /* show we used it */
2871		} else if (PV_HASHED_ENTRY_NULL == pvh_e) {
2872		  PV_HASHED_ALLOC(pvh_e);
2873		  if (PV_HASHED_ENTRY_NULL == pvh_e) {
		    /* The pv free list is empty.
		     * If we are on the kernel pmap we'll use one of the special
		     * private kernel pv_e's; otherwise we need to unlock
		     * everything, zalloc a pv_e, and restart, bringing the new
		     * pv_e in with us.
2878		     */
2879		    if (kernel_pmap == pmap) {
2880		      PV_HASHED_KERN_ALLOC(pvh_e);
2881		    } else {
2882		      UNLOCK_PVH(pai);
2883		      PMAP_UNLOCK(pmap);
2884		      pvh_new = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
2885		      goto Retry;
2886		    }
2887		  }
2888		}
2889
2890		if (PV_HASHED_ENTRY_NULL == pvh_e) panic("pvh_e exhaustion");
2891		pvh_e->va = vaddr;
2892		pvh_e->pmap = pmap;
2893		pvh_e->ppn = pn;
2894		CHK_NPVHASH();
2895		pvhash_idx = pvhashidx(pmap,vaddr);
2896		LOCK_PV_HASH(pvhash_idx);
2897		insque(&pvh_e->qlink, &pv_h->qlink);
2898		hashp = pvhash(pvhash_idx);
2899#if PV_DEBUG
2900		if(NULL==hashp)panic("pmap_enter 4");
2901#endif
2902		pvh_e->nexth = *hashp;
2903		*hashp = pvh_e;
2904		UNLOCK_PV_HASH(pvhash_idx);
2905
2906		/*
2907		 *	Remember that we used the pvlist entry.
2908		 */
2909		pvh_e = PV_HASHED_ENTRY_NULL;
2910	    }
2911
2912	    /*
2913	     * only count the mapping
2914	     * for 'managed memory'
2915	     */
2916	    OSAddAtomic(+1, (SInt32 *) &pmap->stats.resident_count);
2917	    if (pmap->stats.resident_count > pmap->stats.resident_max) {
2918		    pmap->stats.resident_max = pmap->stats.resident_count;
2919	    }
2920	}
2921
2922	/*
2923	 * Step 3) Enter the mapping.
2924	 *
2925	 *	Build a template to speed up entering -
2926	 *	only the pfn changes.
2927	 */
2928	template = pa_to_pte(pa) | INTEL_PTE_VALID;
2929
2930	if (flags & VM_MEM_NOT_CACHEABLE) {
2931		if(!(flags & VM_MEM_GUARDED))
2932			template |= INTEL_PTE_PTA;
2933		template |= INTEL_PTE_NCACHE;
2934	}
2935
2936	if (pmap != kernel_pmap)
2937		template |= INTEL_PTE_USER;
2938	if (prot & VM_PROT_WRITE)
2939		template |= INTEL_PTE_WRITE;
2940
2941	if (set_NX == TRUE)
2942		template |= INTEL_PTE_NX;
2943
2944	if (wired) {
2945		template |= INTEL_PTE_WIRED;
2946		OSAddAtomic(+1, (SInt32 *) &pmap->stats.wired_count);
2947	}
2948	pmap_store_pte(pte, template);
2949
2950	/* if this was a managed page we delayed unlocking the pv until here
2951	 * to prevent pmap_page_protect et al from finding it until the pte
2952	 * has been stored */
2953
2954	if (managed_page(pai)) {
2955	  UNLOCK_PVH(pai);
2956	}
2957
2958Done:
2959	if (need_tlbflush == TRUE)
2960	        PMAP_UPDATE_TLBS(pmap, vaddr, vaddr + PAGE_SIZE);
2961
2962	if (pvh_e != PV_HASHED_ENTRY_NULL) {
2963	        PV_HASHED_FREE_LIST(pvh_e, pvh_e, 1);
2964	}
2965
2966	if (pvh_new != PV_HASHED_ENTRY_NULL) {
2967	  PV_HASHED_KERN_FREE_LIST(pvh_new, pvh_new, 1);
2968	}
2969
2970	PMAP_UNLOCK(pmap);
2971	PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, 0, 0, 0, 0, 0);
2972}
2973
2974/*
2975 *	Routine:	pmap_change_wiring
2976 *	Function:	Change the wiring attribute for a map/virtual-address
2977 *			pair.
2978 *	In/out conditions:
2979 *			The mapping must already exist in the pmap.
2980 */
2981void
2982pmap_change_wiring(
2983	register pmap_t	map,
2984	vm_map_offset_t	vaddr,
2985	boolean_t	wired)
2986{
2987	register pt_entry_t	*pte;
2988
2989	/*
2990	 *	We must grab the pmap system lock because we may
2991	 *	change a pte_page queue.
2992	 */
2993	PMAP_LOCK(map);
2994
2995	if ((pte = pmap_pte(map, vaddr)) == PT_ENTRY_NULL)
2996		panic("pmap_change_wiring: pte missing");
2997
2998	if (wired && !iswired(*pte)) {
2999	    /*
3000	     *	wiring down mapping
3001	     */
3002	    OSAddAtomic(+1, (SInt32 *) &map->stats.wired_count);
3003	    pmap_update_pte(pte, *pte, (*pte | INTEL_PTE_WIRED));
3004	}
3005	else if (!wired && iswired(*pte)) {
3006	    /*
3007	     *	unwiring mapping
3008	     */
3009	    assert(map->stats.wired_count >= 1);
3010	    OSAddAtomic(-1, (SInt32 *) &map->stats.wired_count);
3011	    pmap_update_pte(pte, *pte, (*pte & ~INTEL_PTE_WIRED));
3012	}
3013
3014	PMAP_UNLOCK(map);
3015}
3016
3017ppnum_t
3018pmap_find_phys(pmap_t pmap, addr64_t va)
3019{
3020	pt_entry_t     *ptp;
3021	ppnum_t         ppn;
3022
3023	mp_disable_preemption();
3024
3025	ptp = pmap_pte(pmap, va);
3026	if (PT_ENTRY_NULL == ptp) {
3027		ppn = 0;
3028	} else {
3029		ppn = (ppnum_t) i386_btop(pte_to_pa(*ptp));
3030	}
3031	mp_enable_preemption();
3032
3033	return ppn;
3034}
3035
3036/*
3037 *	Routine:	pmap_extract
3038 *	Function:
3039 *		Extract the physical page address associated
3040 *		with the given map/virtual_address pair.
 *     This has been changed to a shim over pmap_find_phys() for backwards
 *     compatibility; it will not work correctly on 64-bit systems.  Some
 *     old drivers that we cannot change still need it.
3044 */
3045
3046vm_offset_t
3047pmap_extract(
3048	register pmap_t	pmap,
3049	vm_map_offset_t	vaddr)
3050{
3051        ppnum_t ppn;
3052	vm_offset_t paddr;
3053
3054	paddr = (vm_offset_t)0;
3055	ppn = pmap_find_phys(pmap, vaddr);
3056
3057	if (ppn) {
3058	        paddr = ((vm_offset_t)i386_ptob(ppn)) | (vaddr & INTEL_OFFMASK);
3059	}
3060	return (paddr);
3061}
3062
3063void
3064pmap_expand_pml4(
3065		 pmap_t map,
3066		 vm_map_offset_t vaddr)
3067{
3068	register vm_page_t	m;
3069	register pmap_paddr_t	pa;
3070	uint64_t                i;
3071	spl_t			spl;
3072	ppnum_t                 pn;
3073	pml4_entry_t            *pml4p;
3074
3075	if (kernel_pmap == map) panic("expand kernel pml4");
3076
3077	spl = splhigh();
3078	pml4p = pmap64_pml4(map, vaddr);
3079	splx(spl);
3080	if (PML4_ENTRY_NULL == pml4p) panic("pmap_expand_pml4 no pml4p");
3081
3082	/*
3083	 *	Allocate a VM page for the pml4 page
3084	 */
3085	while ((m = vm_page_grab()) == VM_PAGE_NULL)
3086		VM_PAGE_WAIT();
3087
3088	/*
3089	 *	put the page into the pmap's obj list so it
3090	 *	can be found later.
3091	 */
3092	pn = m->phys_page;
3093	pa = i386_ptob(pn);
3094	i = pml4idx(map, vaddr);
3095
3096	/*
3097	 *	Zero the page.
3098	 */
3099	pmap_zero_page(pn);
3100
3101	vm_page_lock_queues();
3102	vm_page_wire(m);
3103	inuse_ptepages_count++;
3104	vm_page_unlock_queues();
3105
	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3107	vm_object_lock(map->pm_obj_pml4);
3108
3109	PMAP_LOCK(map);
3110	/*
3111	 *	See if someone else expanded us first
3112	 */
3113	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
3114	        PMAP_UNLOCK(map);
3115		vm_object_unlock(map->pm_obj_pml4);
3116
3117		vm_page_lock_queues();
3118		vm_page_free(m);
3119		inuse_ptepages_count--;
3120		vm_page_unlock_queues();
3121
3122		return;
3123	}
3124
3125#if 0 /* DEBUG */
3126       if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
3127	       panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3128		     map, map->pm_obj_pml4, vaddr, i);
3129       }
3130#endif
3131	vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
3132	vm_object_unlock(map->pm_obj_pml4);
3133
3134	/*
3135	 *	Set the page directory entry for this page table.
3136	 */
3137	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
3138
3139	pmap_store_pte(pml4p, pa_to_pte(pa)
3140				| INTEL_PTE_VALID
3141				| INTEL_PTE_USER
3142				| INTEL_PTE_WRITE);
3143
3144	PMAP_UNLOCK(map);
3145
3146	return;
3147
3148}
3149
3150void
3151pmap_expand_pdpt(
3152		 pmap_t map,
3153		 vm_map_offset_t vaddr)
3154{
3155	register vm_page_t	m;
3156	register pmap_paddr_t	pa;
3157	uint64_t                i;
3158	spl_t			spl;
3159	ppnum_t                 pn;
3160	pdpt_entry_t            *pdptp;
3161
3162	if (kernel_pmap == map) panic("expand kernel pdpt");
3163
3164	spl = splhigh();
3165	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
3166	        splx(spl);
3167		pmap_expand_pml4(map, vaddr); /* need room for another pdpt entry */
3168		spl = splhigh();
3169	}
3170	splx(spl);
3171
3172	/*
3173	 *	Allocate a VM page for the pdpt page
3174	 */
3175	while ((m = vm_page_grab()) == VM_PAGE_NULL)
3176		VM_PAGE_WAIT();
3177
3178	/*
3179	 *	put the page into the pmap's obj list so it
3180	 *	can be found later.
3181	 */
3182	pn = m->phys_page;
3183	pa = i386_ptob(pn);
3184	i = pdptidx(map, vaddr);
3185
3186	/*
3187	 *	Zero the page.
3188	 */
3189	pmap_zero_page(pn);
3190
3191	vm_page_lock_queues();
3192	vm_page_wire(m);
3193	inuse_ptepages_count++;
3194	vm_page_unlock_queues();
3195
	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3197	vm_object_lock(map->pm_obj_pdpt);
3198
3199	PMAP_LOCK(map);
3200	/*
3201	 *	See if someone else expanded us first
3202	 */
3203	if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
3204		PMAP_UNLOCK(map);
3205		vm_object_unlock(map->pm_obj_pdpt);
3206
3207		vm_page_lock_queues();
3208		vm_page_free(m);
3209		inuse_ptepages_count--;
3210		vm_page_unlock_queues();
3211
3212		return;
3213	}
3214
3215#if 0 /* DEBUG */
3216       if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
3217	       panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
3218		     map, map->pm_obj_pdpt, vaddr, i);
3219       }
3220#endif
3221	vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
3222	vm_object_unlock(map->pm_obj_pdpt);
3223
3224	/*
3225	 *	Set the page directory entry for this page table.
3226	 */
3227	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
3228
3229	pmap_store_pte(pdptp, pa_to_pte(pa)
3230				| INTEL_PTE_VALID
3231				| INTEL_PTE_USER
3232				| INTEL_PTE_WRITE);
3233
3234	PMAP_UNLOCK(map);
3235
3236	return;
3237
3238}
3239
3240
3241
3242/*
3243 *	Routine:	pmap_expand
3244 *
3245 *	Expands a pmap to be able to map the specified virtual address.
3246 *
3247 *	Allocates new virtual memory for the P0 or P1 portion of the
3248 *	pmap, then re-maps the physical pages that were in the old
3249 *	pmap to be in the new pmap.
3250 *
3251 *	Must be called with the pmap system and the pmap unlocked,
3252 *	since these must be unlocked to use vm_allocate or vm_deallocate.
3253 *	Thus it must be called in a loop that checks whether the map
3254 *	has been expanded enough.
3255 *	(We won't loop forever, since page tables aren't shrunk.)
3256 */
3257void
3258pmap_expand(
3259	pmap_t		map,
3260	vm_map_offset_t	vaddr)
3261{
3262	pt_entry_t		*pdp;
3263	register vm_page_t	m;
3264	register pmap_paddr_t	pa;
3265	uint64_t                 i;
3266	spl_t			spl;
3267	ppnum_t                 pn;
3268
3269	/*
	 * If this is not the kernel map (while we are still in compatibility
	 * kernel mode) and the cpu is 64-bit, propagate the expansion upwards.
3272	 */
3273
3274	if (cpu_64bit && (map != kernel_pmap)) {
3275	        spl = splhigh();
3276		while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
3277		        splx(spl);
3278			pmap_expand_pdpt(map, vaddr); /* need room for another pde entry */
3279			spl = splhigh();
3280		}
3281		splx(spl);
3282	}
3283
3284	/*
3285	 *	Allocate a VM page for the pde entries.
3286	 */
3287	while ((m = vm_page_grab()) == VM_PAGE_NULL)
3288		VM_PAGE_WAIT();
3289
3290	/*
3291	 *	put the page into the pmap's obj list so it
3292	 *	can be found later.
3293	 */
3294	pn = m->phys_page;
3295	pa = i386_ptob(pn);
3296	i = pdeidx(map, vaddr);
3297
3298	/*
3299	 *	Zero the page.
3300	 */
3301	pmap_zero_page(pn);
3302
3303	vm_page_lock_queues();
3304	vm_page_wire(m);
3305	inuse_ptepages_count++;
3306	vm_page_unlock_queues();
3307
	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
3309	vm_object_lock(map->pm_obj);
3310
3311	PMAP_LOCK(map);
3312	/*
3313	 *	See if someone else expanded us first
3314	 */
3315
3316	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
3317		PMAP_UNLOCK(map);
3318		vm_object_unlock(map->pm_obj);
3319
3320		vm_page_lock_queues();
3321		vm_page_free(m);
3322		inuse_ptepages_count--;
3323		vm_page_unlock_queues();
3324
3325		return;
3326	}
3327
3328#if 0 /* DEBUG */
3329       if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
3330	       panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n",
3331		     map, map->pm_obj, vaddr, i);
3332       }
3333#endif
3334	vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
3335	vm_object_unlock(map->pm_obj);
3336
3337	/*
3338	 * refetch while locked
3339	 */
3340
3341	pdp = pmap_pde(map, vaddr);
3342
3343	/*
3344	 *	Set the page directory entry for this page table.
3345	 */
3346	pmap_store_pte(pdp, pa_to_pte(pa)
3347				| INTEL_PTE_VALID
3348				| INTEL_PTE_USER
3349				| INTEL_PTE_WRITE);
3350
3351	PMAP_UNLOCK(map);
3352
3353	return;
3354}
3355
3356
3357/*
3358 * pmap_sync_page_data_phys(ppnum_t pa)
3359 *
3360 * Invalidates all of the instruction cache on a physical page and
3361 * pushes any dirty data from the data cache for the same physical page
3362 * Not required in i386.
3363 */
3364void
3365pmap_sync_page_data_phys(__unused ppnum_t pa)
3366{
3367	return;
3368}
3369
3370/*
3371 * pmap_sync_page_attributes_phys(ppnum_t pa)
3372 *
3373 * Write back and invalidate all cachelines on a physical page.
3374 */
3375void
3376pmap_sync_page_attributes_phys(ppnum_t pa)
3377{
3378	cache_flush_page_phys(pa);
3379}
3380
3381
3382
3383#ifdef CURRENTLY_UNUSED_AND_UNTESTED
3384
3385int	collect_ref;
3386int	collect_unref;
3387
3388/*
3389 *	Routine:	pmap_collect
3390 *	Function:
3391 *		Garbage collects the physical map system for
3392 *		pages which are no longer used.
3393 *		Success need not be guaranteed -- that is, there
3394 *		may well be pages which are not referenced, but
3395 *		others may be collected.
3396 *	Usage:
3397 *		Called by the pageout daemon when pages are scarce.
3398 */
3399void
3400pmap_collect(
3401	pmap_t 		p)
3402{
3403	register pt_entry_t	*pdp, *ptp;
3404	pt_entry_t		*eptp;
3405	int			wired;
3406
3407	if (p == PMAP_NULL)
3408		return;
3409
3410	if (p == kernel_pmap)
3411		return;
3412
3413	/*
3414	 *	Garbage collect map.
3415	 */
3416	PMAP_LOCK(p);
3417
3418	for (pdp = (pt_entry_t *)p->dirbase;
3419	     pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
3420	     pdp++)
3421	{
3422	   if (*pdp & INTEL_PTE_VALID) {
3423	      if(*pdp & INTEL_PTE_REF) {
3424		pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
3425		collect_ref++;
3426	      } else {
3427		collect_unref++;
3428		ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
3429		eptp = ptp + NPTEPG;
3430
3431		/*
3432		 * If the pte page has any wired mappings, we cannot
3433		 * free it.
3434		 */
3435		wired = 0;
3436		{
3437		    register pt_entry_t *ptep;
3438		    for (ptep = ptp; ptep < eptp; ptep++) {
3439			if (iswired(*ptep)) {
3440			    wired = 1;
3441			    break;
3442			}
3443		    }
3444		}
3445		if (!wired) {
3446		    /*
3447		     * Remove the virtual addresses mapped by this pte page.
3448		     */
3449		    pmap_remove_range(p,
3450				pdetova(pdp - (pt_entry_t *)p->dirbase),
3451				ptp,
3452				eptp);
3453
3454		    /*
3455		     * Invalidate the page directory pointer.
3456		     */
3457		    pmap_store_pte(pdp, 0x0);
3458
3459		    PMAP_UNLOCK(p);
3460
3461		    /*
3462		     * And free the pte page itself.
3463		     */
3464		    {
3465			register vm_page_t m;
3466
3467			vm_object_lock(p->pm_obj);
3468
3469			m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
3470			if (m == VM_PAGE_NULL)
3471			    panic("pmap_collect: pte page not in object");
3472
3473			vm_page_lock_queues();
3474			vm_page_free(m);
3475			inuse_ptepages_count--;
3476			vm_page_unlock_queues();
3477
3478			vm_object_unlock(p->pm_obj);
3479		    }
3480
3481		    PMAP_LOCK(p);
3482		}
3483	      }
3484	   }
3485	}
3486
3487	PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
3488	PMAP_UNLOCK(p);
3489	return;
3490
3491}
3492#endif
3493
3494
3495void
3496pmap_copy_page(ppnum_t src, ppnum_t dst)
3497{
3498  bcopy_phys((addr64_t)i386_ptob(src),
3499	     (addr64_t)i386_ptob(dst),
3500	     PAGE_SIZE);
3501}
3502
3503
3504/*
3505 *	Routine:	pmap_pageable
3506 *	Function:
3507 *		Make the specified pages (by pmap, offset)
3508 *		pageable (or not) as requested.
3509 *
3510 *		A page which is not pageable may not take
3511 *		a fault; therefore, its page table entry
3512 *		must remain valid for the duration.
3513 *
3514 *		This routine is merely advisory; pmap_enter
3515 *		will specify that these pages are to be wired
3516 *		down (or not) as appropriate.
3517 */
3518void
3519pmap_pageable(
3520	__unused pmap_t		pmap,
3521	__unused vm_map_offset_t	start_addr,
3522	__unused vm_map_offset_t	end_addr,
3523	__unused boolean_t	pageable)
3524{
3525#ifdef	lint
3526	pmap++; start_addr++; end_addr++; pageable++;
3527#endif	/* lint */
3528}
3529
3530/*
3531 *	Clear specified attribute bits.
3532 */
3533void
3534phys_attribute_clear(
3535	ppnum_t		pn,
3536	int		bits)
3537{
3538	pv_rooted_entry_t		pv_h;
3539	register pv_hashed_entry_t	pv_e;
3540	register pt_entry_t	*pte;
3541	int			pai;
3542	register pmap_t		pmap;
3543
3544	pmap_intr_assert();
3545	assert(pn != vm_page_fictitious_addr);
3546	if (pn == vm_page_guard_addr)
3547		return;
3548
3549	pai = ppn_to_pai(pn);
3550
3551	if (!managed_page(pai)) {
3552	    /*
3553	     *	Not a managed page.
3554	     */
3555	    return;
3556	}
3557
3558	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START,
3559		   (int) pn, bits, 0, 0, 0);
3560
3561	pv_h = pai_to_pvh(pai);
3562
3563	LOCK_PVH(pai);
3564
3565	/*
3566	 * Walk down PV list, clearing all modify or reference bits.
	 * The pv list for this page is protected by the LOCK_PVH
	 * taken above.
3569	 */
3570	if (pv_h->pmap != PMAP_NULL) {
3571	    /*
3572	     * There are some mappings.
3573	     */
3574
3575	  pv_e = (pv_hashed_entry_t)pv_h;
3576
3577	  do {
3578		pmap = pv_e->pmap;
3579
3580		{
3581		    vm_map_offset_t va;
3582
3583		    va = pv_e->va;
3584
3585		    /*
3586		     * Clear modify and/or reference bits.
3587		     */
3588
3589		    pte = pmap_pte(pmap, va);
3590		    pmap_update_pte(pte, *pte, (*pte & ~bits));
3591		    /* Ensure all processors using this translation
3592		     * invalidate this TLB entry. The invalidation *must* follow
3593		     * the PTE update, to ensure that the TLB shadow of the
3594		     * 'D' bit (in particular) is synchronized with the
3595		     * updated PTE.
3596		     */
3597		    PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3598		}
3599
3600		pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3601
3602	  } while (pv_e != (pv_hashed_entry_t)pv_h);
3603	}
3604	pmap_phys_attributes[pai] &= ~bits;
3605
3606	UNLOCK_PVH(pai);
3607
3608	PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END,
3609		   0, 0, 0, 0, 0);
3610
3611}
3612
3613/*
3614 *	Check specified attribute bits.
3615 */
3616int
3617phys_attribute_test(
3618	ppnum_t		pn,
3619	int		bits)
3620{
3621	pv_rooted_entry_t		pv_h;
3622	register pv_hashed_entry_t	pv_e;
3623	register pt_entry_t	*pte;
3624	int			pai;
3625	register pmap_t		pmap;
3626	int			attributes = 0;
3627
3628	pmap_intr_assert();
3629	assert(pn != vm_page_fictitious_addr);
3630	if (pn == vm_page_guard_addr)
3631		return 0;
3632
3633	pai = ppn_to_pai(pn);
3634
3635	if (!managed_page(pai)) {
3636	    /*
3637	     *	Not a managed page.
3638	     */
3639	    return (0);
3640	}
3641
3642	/*
3643	 * super fast check...  if bits already collected
3644	 * no need to take any locks...
3645	 * if not set, we need to recheck after taking
3646	 * the lock in case they got pulled in while
3647	 * we were waiting for the lock
3648	 */
3649	if ( (pmap_phys_attributes[pai] & bits) == bits)
3650	    return (bits);
3651
3652	pv_h = pai_to_pvh(pai);
3653
3654	LOCK_PVH(pai);
3655
3656	attributes = pmap_phys_attributes[pai] & bits;
3657
3658	/*
3659	 * Walk down PV list, checking the mappings until we
	 * reach the end or we've found the attributes we've asked for.
	 * The pv list for this page is protected by the LOCK_PVH
	 * taken above.
3663	 */
3664	if (pv_h->pmap != PMAP_NULL) {
3665	    /*
3666	     * There are some mappings.
3667	     */
3668	  pv_e = (pv_hashed_entry_t)pv_h;
3669	  if (attributes != bits) do {
3670
3671	        pmap = pv_e->pmap;
3672
3673		{
3674		    vm_map_offset_t va;
3675
3676		    va = pv_e->va;
3677		    /*
3678		     * first make sure any processor actively
3679		     * using this pmap, flushes its TLB state
3680		     */
3681		    PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
3682
3683		    /*
3684		     * pick up modify and/or reference bits from this mapping
3685		     */
3686
3687		    pte = pmap_pte(pmap, va);
3688		    attributes |= *pte & bits;
3689
3690		}
3691
3692		pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink);
3693
3694	    } while ((attributes != bits) && (pv_e != (pv_hashed_entry_t)pv_h));
3695	}
3696
3697	UNLOCK_PVH(pai);
3698	return (attributes);
3699}
3700
3701/*
3702 *	Set specified attribute bits.
3703 */
3704void
3705phys_attribute_set(
3706	ppnum_t		pn,
3707	int		bits)
3708{
3709	int		pai;
3710
3711	pmap_intr_assert();
3712	assert(pn != vm_page_fictitious_addr);
3713	if (pn == vm_page_guard_addr)
3714		return;
3715
3716	pai = ppn_to_pai(pn);
3717
3718	if (!managed_page(pai)) {
3719	    /*
3720	     *	Not a managed page.
3721	     */
3722	    return;
3723	}
3724
3725	LOCK_PVH(pai);
3726
3727	pmap_phys_attributes[pai] |= bits;
3728
3729	UNLOCK_PVH(pai);
3730}
3731
3732/*
3733 *	Set the modify bit on the specified physical page.
3734 */
3735
3736void pmap_set_modify(
3737		     ppnum_t pn)
3738{
3739	phys_attribute_set(pn, PHYS_MODIFIED);
3740}
3741
3742/*
3743 *	Clear the modify bits on the specified physical page.
3744 */
3745
3746void
3747pmap_clear_modify(
3748		  ppnum_t pn)
3749{
3750	phys_attribute_clear(pn, PHYS_MODIFIED);
3751}
3752
3753/*
3754 *	pmap_is_modified:
3755 *
3756 *	Return whether or not the specified physical page is modified
3757 *	by any physical maps.
3758 */
3759
3760boolean_t
3761pmap_is_modified(
3762		 ppnum_t pn)
3763{
3764        if (phys_attribute_test(pn, PHYS_MODIFIED))
3765	        return TRUE;
3766
3767	return FALSE;
3768}
3769
3770/*
3771 *	pmap_clear_reference:
3772 *
3773 *	Clear the reference bit on the specified physical page.
3774 */
3775
3776void
3777pmap_clear_reference(
3778		     ppnum_t pn)
3779{
3780	phys_attribute_clear(pn, PHYS_REFERENCED);
3781}
3782
3783void
3784pmap_set_reference(ppnum_t pn)
3785{
3786	phys_attribute_set(pn, PHYS_REFERENCED);
3787}
3788
3789/*
3790 *	pmap_is_referenced:
3791 *
3792 *	Return whether or not the specified physical page is referenced
3793 *	by any physical maps.
3794 */
3795
3796boolean_t
3797pmap_is_referenced(
3798		   ppnum_t pn)
3799{
3800        if (phys_attribute_test(pn, PHYS_REFERENCED))
3801	        return TRUE;
3802
3803	return FALSE;
3804}
3805
3806/*
3807 * pmap_get_refmod(phys)
3808 *  returns the referenced and modified bits of the specified
3809 *  physical page.
3810 */
3811unsigned int
3812pmap_get_refmod(ppnum_t pa)
3813{
3814        int	refmod;
3815	unsigned int retval = 0;
3816
3817	refmod = phys_attribute_test(pa, PHYS_MODIFIED | PHYS_REFERENCED);
3818
3819	if (refmod & PHYS_MODIFIED)
3820	        retval |= VM_MEM_MODIFIED;
3821	if (refmod & PHYS_REFERENCED)
3822	        retval |= VM_MEM_REFERENCED;
3823
3824	return (retval);
3825}
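
/*
 * Illustrative sketch (not compiled): how a caller might interpret the
 * bits returned by pmap_get_refmod().  "pn" here is a hypothetical
 * managed page number.
 */
#if 0
	unsigned int refmod;

	refmod = pmap_get_refmod(pn);
	if (refmod & VM_MEM_MODIFIED) {
		/* some mapping has written to the page */
	}
	if (refmod & VM_MEM_REFERENCED) {
		/* some mapping has referenced the page */
	}
#endif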
3826
3827/*
3828 * pmap_clear_refmod(phys, mask)
3829 *  clears the referenced and modified bits as specified by the mask
3830 *  of the specified physical page.
3831 */
3832void
3833pmap_clear_refmod(ppnum_t pa, unsigned int mask)
3834{
3835	unsigned int  x86Mask;
3836
3837	x86Mask = (   ((mask &   VM_MEM_MODIFIED)?   PHYS_MODIFIED : 0)
3838	            | ((mask & VM_MEM_REFERENCED)? PHYS_REFERENCED : 0));
3839	phys_attribute_clear(pa, x86Mask);
3840}
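
/*
 * Illustrative sketch (not compiled): clearing only the reference bit,
 * e.g. when aging a page, leaves the modified state intact.  "pn" is a
 * hypothetical managed page number.
 */
#if 0
	pmap_clear_refmod(pn, VM_MEM_REFERENCED);
#endif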
3841
3842void
3843invalidate_icache(__unused vm_offset_t	addr,
3844		  __unused unsigned	cnt,
3845		  __unused int		phys)
3846{
3847	return;
3848}
3849void
3850flush_dcache(__unused vm_offset_t	addr,
3851	     __unused unsigned		count,
3852	     __unused int		phys)
3853{
3854	return;
3855}
3856
3857#if CONFIG_DTRACE
/*
 * Constrain DTrace copyin/copyout actions: refuse the probe if the
 * current map is the kernel map, or if a copy window is already in use
 * on this thread (CopyIOActive).
 */
3861extern kern_return_t dtrace_copyio_preflight(addr64_t);
3862extern kern_return_t dtrace_copyio_postflight(addr64_t);
3863
3864kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
3865{
3866	thread_t thread = current_thread();
3867
3868	if (current_map() == kernel_map)
3869		return KERN_FAILURE;
3870	else if (thread->machine.specFlags & CopyIOActive)
3871		return KERN_FAILURE;
3872	else
3873		return KERN_SUCCESS;
3874}
3875
3876kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
3877{
3878	return KERN_SUCCESS;
3879}
3880#endif /* CONFIG_DTRACE */
3881
3882#if	MACH_KDB
3883
3884/* show phys page mappings and attributes */
3885
3886extern void	db_show_page(pmap_paddr_t pa);
3887
3888#if 0
3889void
3890db_show_page(pmap_paddr_t pa)
3891{
3892	pv_entry_t	pv_h;
3893	int		pai;
3894	char 		attr;
3895
3896	pai = pa_index(pa);
3897	pv_h = pai_to_pvh(pai);
3898
3899	attr = pmap_phys_attributes[pai];
3900	printf("phys page %llx ", pa);
3901	if (attr & PHYS_MODIFIED)
3902		printf("modified, ");
3903	if (attr & PHYS_REFERENCED)
3904		printf("referenced, ");
3905	if (pv_h->pmap || pv_h->next)
3906		printf(" mapped at\n");
3907	else
3908		printf(" not mapped\n");
3909	for (; pv_h; pv_h = pv_h->next)
3910		if (pv_h->pmap)
3911			printf("%llx in pmap %p\n", pv_h->va, pv_h->pmap);
3912}
3913#endif
3914
3915#endif /* MACH_KDB */
3916
3917#if	MACH_KDB
3918#if 0
3919void db_kvtophys(vm_offset_t);
3920void db_show_vaddrs(pt_entry_t  *);
3921
3922/*
3923 *	print out the results of kvtophys(arg)
3924 */
3925void
3926db_kvtophys(
3927	vm_offset_t	vaddr)
3928{
3929	db_printf("0x%qx", kvtophys(vaddr));
3930}
3931
3932/*
 *	Walk the page tables.
3934 */
3935void
3936db_show_vaddrs(
3937	pt_entry_t	*dirbase)
3938{
3939	pt_entry_t	*ptep, *pdep, tmp;
3940	unsigned int	x, y, pdecnt, ptecnt;
3941
3942	if (dirbase == 0) {
3943		dirbase = kernel_pmap->dirbase;
3944	}
3945	if (dirbase == 0) {
3946		db_printf("need a dirbase...\n");
3947		return;
3948	}
3949	dirbase = (pt_entry_t *) (int) ((unsigned long) dirbase & ~INTEL_OFFMASK);
3950
3951	db_printf("dirbase: 0x%x\n", dirbase);
3952
3953	pdecnt = ptecnt = 0;
3954	pdep = &dirbase[0];
3955	for (y = 0; y < NPDEPG; y++, pdep++) {
3956		if (((tmp = *pdep) & INTEL_PTE_VALID) == 0) {
3957			continue;
3958		}
3959		pdecnt++;
3960		ptep = (pt_entry_t *) ((unsigned long)(*pdep) & ~INTEL_OFFMASK);
3961		db_printf("dir[%4d]: 0x%x\n", y, *pdep);
3962		for (x = 0; x < NPTEPG; x++, ptep++) {
3963			if (((tmp = *ptep) & INTEL_PTE_VALID) == 0) {
3964				continue;
3965			}
3966			ptecnt++;
3967			db_printf("   tab[%4d]: 0x%x, va=0x%x, pa=0x%x\n",
3968				x,
3969				*ptep,
3970				(y << 22) | (x << 12),
3971				*ptep & ~INTEL_OFFMASK);
3972		}
3973	}
3974
3975	db_printf("total: %d tables, %d page table entries.\n", pdecnt, ptecnt);
3976
3977}
3978#endif
3979#endif	/* MACH_KDB */
3980
3981#include <mach_vm_debug.h>
3982#if	MACH_VM_DEBUG
3983#include <vm/vm_debug.h>
3984
3985int
3986pmap_list_resident_pages(
3987	__unused pmap_t		pmap,
3988	__unused vm_offset_t	*listp,
3989	__unused int		space)
3990{
3991	return 0;
3992}
3993#endif	/* MACH_VM_DEBUG */
3994
3995
3996
3997/* temporary workaround */
3998boolean_t
3999coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
4000{
4001#if 0
4002	pt_entry_t     *ptep;
4003
4004	ptep = pmap_pte(map->pmap, va);
4005	if (0 == ptep)
4006		return FALSE;
4007	return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
4008#else
4009	return TRUE;
4010#endif
4011}
4012
4013
4014boolean_t
4015phys_page_exists(
4016		 ppnum_t pn)
4017{
4018	assert(pn != vm_page_fictitious_addr);
4019
4020	if (!pmap_initialized)
4021		return (TRUE);
4022
4023	if (pn == vm_page_guard_addr)
4024		return FALSE;
4025
4026	if (!managed_page(ppn_to_pai(pn)))
4027		return (FALSE);
4028
4029	return TRUE;
4030}
4031
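/*
 * mapping_free_prime() pre-allocates pv_hashed entries from
 * pv_hashed_list_zone and chains them onto both the regular and the
 * kernel PV free lists, priming them before they are first needed.
 */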
4032void
4033mapping_free_prime(void)
4034{
4035	int             i;
4036	pv_hashed_entry_t      pvh_e;
4037	pv_hashed_entry_t      pvh_eh;
4038	pv_hashed_entry_t      pvh_et;
4039	int		pv_cnt;
4040
4041	pv_cnt = 0;
4042	pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4043	for (i = 0; i < (5 * PV_HASHED_ALLOC_CHUNK); i++) {
4044		pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4045
4046		pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4047		pvh_eh = pvh_e;
4048
4049		if (pvh_et == PV_HASHED_ENTRY_NULL)
4050		        pvh_et = pvh_e;
4051		pv_cnt++;
4052	}
4053	PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4054
4055	pv_cnt = 0;
4056	pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4057	for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
4058		pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4059
4060		pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4061		pvh_eh = pvh_e;
4062
4063		if (pvh_et == PV_HASHED_ENTRY_NULL)
4064		        pvh_et = pvh_e;
4065		pv_cnt++;
4066	}
4067	PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4068
4069}
4070
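/*
 * mapping_adjust() replenishes the regular and kernel pv_hashed free
 * lists when they fall below their low-water marks; on first invocation
 * it also sets up the thread call used to schedule this work.
 */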
4071void
4072mapping_adjust(void)
4073{
4074	pv_hashed_entry_t      pvh_e;
4075	pv_hashed_entry_t      pvh_eh;
4076	pv_hashed_entry_t      pvh_et;
4077	int		pv_cnt;
4078	int             i;
4079
4080	if (mapping_adjust_call == NULL) {
4081		thread_call_setup(&mapping_adjust_call_data,
4082				  (thread_call_func_t) mapping_adjust,
4083				  (thread_call_param_t) NULL);
4084		mapping_adjust_call = &mapping_adjust_call_data;
4085	}
4086
4087	pv_cnt = 0;
4088	pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4089	if (pv_hashed_kern_free_count < PV_HASHED_KERN_LOW_WATER_MARK) {
4090		for (i = 0; i < PV_HASHED_KERN_ALLOC_CHUNK; i++) {
4091			pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4092
4093			pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4094			pvh_eh = pvh_e;
4095
4096			if (pvh_et == PV_HASHED_ENTRY_NULL)
4097			        pvh_et = pvh_e;
4098			pv_cnt++;
4099		}
4100		PV_HASHED_KERN_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4101	}
4102
4103	pv_cnt = 0;
4104	pvh_eh = pvh_et = PV_HASHED_ENTRY_NULL;
4105	if (pv_hashed_free_count < PV_HASHED_LOW_WATER_MARK) {
4106		for (i = 0; i < PV_HASHED_ALLOC_CHUNK; i++) {
4107			pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
4108
4109			pvh_e->qlink.next = (queue_entry_t)pvh_eh;
4110			pvh_eh = pvh_e;
4111
4112			if (pvh_et == PV_HASHED_ENTRY_NULL)
4113			        pvh_et = pvh_e;
4114			pv_cnt++;
4115		}
4116		PV_HASHED_FREE_LIST(pvh_eh, pvh_et, pv_cnt);
4117	}
4118	mappingrecurse = 0;
4119}
4120
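/*
 * pmap_commpage32_init() aliases the kernel commpage into the 32-bit
 * user commpage range: each kernel PTE is copied, marked user-accessible
 * and global, and forced read-only before being stored at the user
 * commpage address.
 */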
4121void
4122pmap_commpage32_init(vm_offset_t kernel_commpage, vm_offset_t user_commpage, int cnt)
4123{
4124	int i;
4125	pt_entry_t *opte, *npte;
4126	pt_entry_t pte;
4127	spl_t s;
4128
4129	for (i = 0; i < cnt; i++) {
4130	        s = splhigh();
4131		opte = pmap_pte(kernel_pmap, (vm_map_offset_t)kernel_commpage);
4132		if (0 == opte)
4133			panic("kernel_commpage");
4134		pte = *opte | INTEL_PTE_USER|INTEL_PTE_GLOBAL;
4135		pte &= ~INTEL_PTE_WRITE; // ensure read only
4136		npte = pmap_pte(kernel_pmap, (vm_map_offset_t)user_commpage);
4137		if (0 == npte)
4138			panic("user_commpage");
4139		pmap_store_pte(npte, pte);
4140		splx(s);
4141		kernel_commpage += INTEL_PGBYTES;
4142		user_commpage += INTEL_PGBYTES;
4143	}
4144}
4145
4146
4147#define PMAP_COMMPAGE64_CNT  (_COMM_PAGE64_AREA_USED/PAGE_SIZE)
4148pt_entry_t pmap_commpage64_ptes[PMAP_COMMPAGE64_CNT];
4149
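/*
 * pmap_commpage64_init() does not enter any mappings itself; it records
 * read-only, user-accessible copies of the kernel commpage PTEs in
 * pmap_commpage64_ptes[] for later use when populating 64-bit user
 * address spaces.
 */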
4150void
4151pmap_commpage64_init(vm_offset_t kernel_commpage, __unused vm_map_offset_t user_commpage, int cnt)
4152{
4153    int i;
4154    pt_entry_t *kptep;
4155
4156    PMAP_LOCK(kernel_pmap);
4157
4158    for (i = 0; i < cnt; i++) {
4159        kptep = pmap_pte(kernel_pmap, (uint64_t)kernel_commpage + (i*PAGE_SIZE));
4160	if ((0 == kptep) || (0 == (*kptep & INTEL_PTE_VALID)))
4161	    panic("pmap_commpage64_init pte");
4162	pmap_commpage64_ptes[i] = ((*kptep & ~INTEL_PTE_WRITE) | INTEL_PTE_USER);
4163    }
4164    PMAP_UNLOCK(kernel_pmap);
4165}
4166
4167
4168static cpu_pmap_t		cpu_pmap_master;
4169
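/*
 * pmap_cpu_alloc() returns the per-cpu pmap data for a processor.  The
 * boot cpu uses the statically allocated cpu_pmap_master; every other
 * cpu gets a zeroed cpu_pmap_t plus PMAP_NWINDOWS temporary mapping
 * windows carved out of the kernel map (see pmap_get_mapwindow() below).
 */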
4170struct cpu_pmap *
4171pmap_cpu_alloc(boolean_t is_boot_cpu)
4172{
4173	int			ret;
4174	int			i;
4175	cpu_pmap_t		*cp;
4176	vm_offset_t		address;
4177	vm_map_address_t	mapaddr;
4178	vm_map_entry_t		entry;
4179	pt_entry_t		*pte;
4180
4181	if (is_boot_cpu) {
4182		cp = &cpu_pmap_master;
4183	} else {
4184		/*
4185		 * The per-cpu pmap data structure itself.
4186		 */
4187		ret = kmem_alloc(kernel_map,
4188				 (vm_offset_t *) &cp, sizeof(cpu_pmap_t));
4189		if (ret != KERN_SUCCESS) {
4190			printf("pmap_cpu_alloc() failed ret=%d\n", ret);
4191			return NULL;
4192		}
4193		bzero((void *)cp, sizeof(cpu_pmap_t));
4194
4195		/*
4196		 * The temporary windows used for copy/zero - see loose_ends.c
4197		 */
4198		ret = vm_map_find_space(kernel_map,
4199		    &mapaddr, PMAP_NWINDOWS*PAGE_SIZE, (vm_map_offset_t)0, 0, &entry);
4200		if (ret != KERN_SUCCESS) {
4201			printf("pmap_cpu_alloc() "
4202				"vm_map_find_space ret=%d\n", ret);
4203			pmap_cpu_free(cp);
4204			return NULL;
4205		}
4206		address = (vm_offset_t)mapaddr;
4207
4208		for (i = 0; i < PMAP_NWINDOWS; i++, address += PAGE_SIZE) {
4209		  spl_t s;
4210		        s = splhigh();
4211			while ((pte = pmap_pte(kernel_pmap, (vm_map_offset_t)address)) == 0)
4212				pmap_expand(kernel_pmap, (vm_map_offset_t)address);
4213			* (int *) pte = 0;
4214			cp->mapwindow[i].prv_CADDR = (caddr_t) address;
4215			cp->mapwindow[i].prv_CMAP = pte;
4216			splx(s);
4217		}
4218		vm_map_unlock(kernel_map);
4219	}
4220
4221	cp->pdpt_window_index = PMAP_PDPT_FIRST_WINDOW;
4222	cp->pde_window_index = PMAP_PDE_FIRST_WINDOW;
4223	cp->pte_window_index = PMAP_PTE_FIRST_WINDOW;
4224
4225	return cp;
4226}
4227
4228void
4229pmap_cpu_free(struct cpu_pmap *cp)
4230{
4231	if (cp != NULL && cp != &cpu_pmap_master) {
4232		kfree((void *) cp, sizeof(cpu_pmap_t));
4233	}
4234}
4235
4236
4237mapwindow_t *
4238pmap_get_mapwindow(pt_entry_t pentry)
4239{
4240    mapwindow_t *mp;
4241    int i;
4242
4243    assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4244
4245    /*
4246     * Note: 0th map reserved for pmap_pte()
4247     */
4248    for (i = PMAP_NWINDOWS_FIRSTFREE; i < PMAP_NWINDOWS; i++) {
4249            mp = &current_cpu_datap()->cpu_pmap->mapwindow[i];
4250
4251	    if (*mp->prv_CMAP == 0) {
4252	            pmap_store_pte(mp->prv_CMAP, pentry);
4253
4254		    invlpg((uintptr_t)mp->prv_CADDR);
4255
4256		    return (mp);
4257	    }
4258    }
4259    panic("pmap_get_mapwindow: no windows available");
4260
4261    return NULL;
4262}
4263
4264
4265void
4266pmap_put_mapwindow(mapwindow_t *mp)
4267{
4268    pmap_store_pte(mp->prv_CMAP, 0);
4269}
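
/*
 * Illustrative sketch (not compiled) of the map-window protocol: build a
 * PTE for the desired physical page, claim a window, access the page
 * through prv_CADDR, then release the window.  The PTE encoding and the
 * use of i386_ptob()/bzero() below are assumptions for illustration;
 * callers must satisfy the preemption/interrupt assert in
 * pmap_get_mapwindow().
 */
#if 0
	mapwindow_t	*map;

	map = pmap_get_mapwindow((pt_entry_t)(i386_ptob(pn) |
					      INTEL_PTE_VALID |
					      INTEL_PTE_WRITE));
	bzero(map->prv_CADDR, PAGE_SIZE);	/* e.g. zero the physical page */
	pmap_put_mapwindow(map);
#endif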
4270
4271
4272/*
4273 * The Intel platform can nest at the PDE level, so NBPDE (i.e. 2MB) at a time,
4274 * on a NBPDE boundary.
4275 */
4276uint64_t pmap_nesting_size_min = NBPDE;
4277uint64_t pmap_nesting_size_max = 0 - (uint64_t)NBPDE; /* no limit, really... */
4278
4279/*
 *	kern_return_t pmap_nest(grand, subord, vstart, nstart, size)
 *
 *	grand  = the pmap that we will nest subord into
 *	subord = the pmap that goes into the grand
 *	vstart = start of the range in grand to be nested
 *	nstart = start of the corresponding range in the nested (subord) pmap
 *	size   = size of the nested area (up to 16TB)
 *
 *	Inserts a pmap into another.  This is used to implement shared segments.
 *
 *	On x86 this is currently very limited: the range must be exactly one segment.
 *
 *	Note that we depend upon higher level VM locks to ensure that things don't change while
 *	we are doing this.  For example, VM should not be doing any pmap enters while it is nesting,
 *	nor performing two nests at once.
4295 */
4296
4297
4298kern_return_t pmap_nest(pmap_t grand, pmap_t subord, addr64_t vstart, addr64_t nstart, uint64_t size) {
4299
4300        vm_map_offset_t	vaddr, nvaddr;
4301	pd_entry_t	*pde,*npde;
4302	unsigned int	i;
4303	uint64_t	num_pde;
4304
4305	// do validity tests
4306	if (size & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
4307	if(vstart & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
4308	if(nstart & (pmap_nesting_size_min-1)) return KERN_INVALID_VALUE;
4309	if((size >> 28) > 65536)  return KERN_INVALID_VALUE;	/* Max size we can nest is 16TB */
4310	if(size == 0) {
4311		panic("pmap_nest: size is invalid - %016llX\n", size);
4312	}
4313
4314	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
4315		   (int) grand, (int) subord,
4316		   (int) (vstart>>32), (int) vstart, 0);
4317
4318	subord->pm_shared = TRUE;
4319	nvaddr = (vm_map_offset_t)nstart;
4320	num_pde = size >> PDESHIFT;
4321
4322	PMAP_LOCK(subord);
4323	for (i = 0; i < num_pde; i++) {
4324	  npde = pmap_pde(subord, nvaddr);
4325	  while (0 == npde || ((*npde & INTEL_PTE_VALID) == 0)) {
4326	    PMAP_UNLOCK(subord);
4327	    pmap_expand(subord, nvaddr); // pmap_expand handles races
4328	    PMAP_LOCK(subord);
4329	    npde = pmap_pde(subord, nvaddr);
4330	  }
4331	  nvaddr += NBPDE;
4332	}
4333
4334	PMAP_UNLOCK(subord);
4335
4336	vaddr = (vm_map_offset_t)vstart;
4337
4338	PMAP_LOCK(grand);
4339
4340	for (i = 0;i < num_pde; i++) {
4341	  pd_entry_t tpde;
4342
4343	  npde = pmap_pde(subord, nstart);
4344	  if (npde == 0)
4345		  panic("pmap_nest: no npde, subord %p nstart 0x%llx", subord, nstart);
4346	  tpde = *npde;
4347	  nstart += NBPDE;
4348	  pde = pmap_pde(grand, vaddr);
4349/* Legacy mode does not require expansion.
4350 * DRK: consider a debug mode test to verify that no PTEs are extant within
4351 * this range.
4352 */
4353	  if ((0 == pde) && cpu_64bit) {
4354	    PMAP_UNLOCK(grand);
4355	    pmap_expand_pdpt(grand, vaddr);
4356	    PMAP_LOCK(grand);
4357	    pde = pmap_pde(grand, vaddr);
4358	  }
4359
4360	  if (pde == 0)
4361		  panic("pmap_nest: no pde, grand  %p vaddr 0x%llx", grand, vaddr);
4362	  vaddr += NBPDE;
4363	  pmap_store_pte(pde, tpde);
4364	}
4365
4366	/* XXX FBDP: why do we need to flush here ? */
4367	PMAP_UPDATE_TLBS(grand, vstart, vstart + size - 1);
4368
4369	PMAP_UNLOCK(grand);
4370
4371	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
4372
4373	return KERN_SUCCESS;
4374}
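
/*
 * Illustrative sketch (not compiled): a hypothetical caller nesting and
 * later unnesting a shared range.  "task_pmap", "shared_pmap", "base"
 * and "size" are assumptions; both the address and the size must be
 * multiples of pmap_nesting_size_min (NBPDE).
 */
#if 0
	kern_return_t	kr;

	assert((base & (pmap_nesting_size_min - 1)) == 0);
	assert((size & (pmap_nesting_size_min - 1)) == 0);

	kr = pmap_nest(task_pmap, shared_pmap, base, base, size);
	assert(kr == KERN_SUCCESS);

	/* ... the shared range is now visible through task_pmap ... */

	kr = pmap_unnest(task_pmap, base, size);
	assert(kr == KERN_SUCCESS);
#endif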
4375
4376/*
 *	kern_return_t pmap_unnest(grand, vaddr, size)
 *
 *	grand  = the pmap from which the nested range is removed
 *	vaddr  = start of the range in grand to be unnested
 *	size   = size of the range to be unnested
 *
 *	Removes a nested pmap from another.  This is used to implement shared
 *	segments.  On x86 the range is aligned to (and rounded out to a multiple
 *	of) the nesting granule, NBPDE.
4385 */
4386
4387kern_return_t pmap_unnest(pmap_t grand, addr64_t vaddr, uint64_t size) {
4388
4389	pd_entry_t *pde;
4390	unsigned int i;
4391	unsigned int num_pde;
4392	addr64_t vstart, vend;
4393
4394	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
4395		   (int) grand,
4396		   (int) (vaddr>>32), (int) vaddr, 0, 0);
4397
4398	if ((size & (pmap_nesting_size_min-1)) ||
4399	    (vaddr & (pmap_nesting_size_min-1))) {
4400		panic("pmap_unnest(%p,0x%llx,0x%llx): unaligned...\n",
4401		      grand, vaddr, size);
4402	}
4403
4404	/* align everything to PDE boundaries */
4405	vstart = vaddr & ~(NBPDE-1);
4406	vend = (vaddr + size + NBPDE - 1) & ~(NBPDE-1);
4407	size = vend - vstart;
4408
4409	PMAP_LOCK(grand);
4410
4411	// invalidate all pdes for segment at vaddr in pmap grand
4412
4413	num_pde = size >> PDESHIFT;
4414
4415	vaddr = vstart;
	for (i = 0; i < num_pde; i++) {
4417	  pde = pmap_pde(grand, (vm_map_offset_t)vaddr);
4418	  if (pde == 0) panic("pmap_unnest: no pde, grand %p vaddr 0x%llx\n", grand, vaddr);
4419	  pmap_store_pte(pde, (pd_entry_t)0);
4420	  vaddr += NBPDE;
4421	}
4422	PMAP_UPDATE_TLBS(grand, vstart, vend);
4423
4424	PMAP_UNLOCK(grand);
4425
4426	PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, 0, 0, 0, 0, 0);
4427
4428	return KERN_SUCCESS;
4429}
4430
4431void
4432pmap_switch(pmap_t tpmap)
4433{
4434        spl_t	s;
4435  	int	my_cpu;
4436
4437	s = splhigh();		/* Make sure interruptions are disabled */
4438	my_cpu = cpu_number();
4439
4440	set_dirbase(tpmap, my_cpu);
4441
4442	splx(s);
4443}
4444
4445
4446/*
4447 * disable no-execute capability on
4448 * the specified pmap
4449 */
4450void pmap_disable_NX(pmap_t pmap) {
4451
4452        pmap->nx_enabled = 0;
4453}
4454
4455void
4456pt_fake_zone_info(int *count, vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size,
4457		  vm_size_t *alloc_size, int *collectable, int *exhaustable)
4458{
4459        *count      = inuse_ptepages_count;
4460	*cur_size   = PAGE_SIZE * inuse_ptepages_count;
4461	*max_size   = PAGE_SIZE * (inuse_ptepages_count + vm_page_inactive_count + vm_page_active_count + vm_page_free_count);
4462	*elem_size  = PAGE_SIZE;
4463	*alloc_size = PAGE_SIZE;
4464
4465	*collectable = 1;
4466	*exhaustable = 0;
4467}
4468
4469vm_offset_t pmap_cpu_high_map_vaddr(int cpu, enum high_cpu_types e)
4470{
4471  enum high_fixed_addresses a;
4472  a = e + HIGH_CPU_END * cpu;
4473  return pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4474}
4475
4476vm_offset_t pmap_high_map_vaddr(enum high_cpu_types e)
4477{
4478  return pmap_cpu_high_map_vaddr(cpu_number(), e);
4479}
4480
4481vm_offset_t pmap_high_map(pt_entry_t pte, enum high_cpu_types e)
4482{
4483  enum high_fixed_addresses a;
4484  vm_offset_t vaddr;
4485
4486  a = e + HIGH_CPU_END * cpu_number();
4487  vaddr = (vm_offset_t)pmap_index_to_virt(HIGH_FIXED_CPUS_BEGIN + a);
4488  pmap_store_pte(pte_unique_base + a, pte);
4489
  /* TLB flush for this page on this cpu */
4491  invlpg((uintptr_t)vaddr);
4492
4493  return  vaddr;
4494}
4495
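/*
 * pmap_cpuset_NMIPI() sends an NMI to every cpu in cpu_mask and then
 * spins for a fraction of LockTimeOut so the NMIs can land; it is used
 * by pmap_flush_tlbs() just before panicking when cpus fail to
 * acknowledge a TLB flush.
 */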
4496static inline void
4497pmap_cpuset_NMIPI(cpu_set cpu_mask) {
4498	unsigned int cpu, cpu_bit;
4499	uint64_t deadline;
4500
4501	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4502		if (cpu_mask & cpu_bit)
4503			cpu_NMI_interrupt(cpu);
4504	}
4505	deadline = mach_absolute_time() + (LockTimeOut >> 2);
4506	while (mach_absolute_time() < deadline)
4507		cpu_pause();
4508}
4509
4510
4511/*
4512 * Called with pmap locked, we:
4513 *  - scan through per-cpu data to see which other cpus need to flush
4514 *  - send an IPI to each non-idle cpu to be flushed
4515 *  - wait for all to signal back that they are inactive or we see that
4516 *    they are in an interrupt handler or at a safe point
 *  - flush the local tlb if it is active for this pmap
4518 *  - return ... the caller will unlock the pmap
4519 */
4520void
4521pmap_flush_tlbs(pmap_t	pmap)
4522{
4523	unsigned int	cpu;
4524	unsigned int	cpu_bit;
4525	cpu_set		cpus_to_signal;
4526	unsigned int	my_cpu = cpu_number();
4527	pmap_paddr_t	pmap_cr3 = pmap->pm_cr3;
4528	boolean_t	flush_self = FALSE;
4529	uint64_t	deadline;
4530
4531	assert((processor_avail_count < 2) ||
4532	       (ml_get_interrupts_enabled() && get_preemption_level() != 0));
4533
4534	/*
4535	 * Scan other cpus for matching active or task CR3.
4536	 * For idle cpus (with no active map) we mark them invalid but
4537	 * don't signal -- they'll check as they go busy.
4538	 * Note: for the kernel pmap we look for 64-bit shared address maps.
4539	 */
4540	cpus_to_signal = 0;
4541	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4542		if (!cpu_datap(cpu)->cpu_running)
4543			continue;
4544		if ((cpu_datap(cpu)->cpu_task_cr3 == pmap_cr3) ||
4545		    (CPU_GET_ACTIVE_CR3(cpu)      == pmap_cr3) ||
4546		    (pmap->pm_shared) ||
4547		    ((pmap == kernel_pmap) &&
4548		     (!CPU_CR3_IS_ACTIVE(cpu) ||
4549		      cpu_datap(cpu)->cpu_task_map == TASK_MAP_64BIT_SHARED))) {
4550			if (cpu == my_cpu) {
4551				flush_self = TRUE;
4552				continue;
4553			}
4554			cpu_datap(cpu)->cpu_tlb_invalid = TRUE;
4555			__asm__ volatile("mfence");
4556
4557			if (CPU_CR3_IS_ACTIVE(cpu)) {
4558				cpus_to_signal |= cpu_bit;
4559				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
4560			}
4561		}
4562	}
4563
4564	PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
4565		   (int) pmap, cpus_to_signal, flush_self, 0, 0);
4566
4567	if (cpus_to_signal) {
4568		cpu_set	cpus_to_respond = cpus_to_signal;
4569
4570		deadline = mach_absolute_time() + LockTimeOut;
4571		/*
4572		 * Wait for those other cpus to acknowledge
4573		 */
4574		while (cpus_to_respond != 0) {
4575			if (mach_absolute_time() > deadline) {
4576				if (!panic_active()) {
4577					pmap_tlb_flush_timeout = TRUE;
4578					pmap_cpuset_NMIPI(cpus_to_respond);
4579				}
4580				panic("pmap_flush_tlbs() timeout: "
4581				    "cpu(s) failing to respond to interrupts, pmap=%p cpus_to_respond=0x%lx",
4582				    pmap, cpus_to_respond);
4583			}
4584
4585			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
4586				if ((cpus_to_respond & cpu_bit) != 0) {
4587					if (!cpu_datap(cpu)->cpu_running ||
4588					    cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
4589					    !CPU_CR3_IS_ACTIVE(cpu)) {
4590						cpus_to_respond &= ~cpu_bit;
4591					}
4592					cpu_pause();
4593				}
4594				if (cpus_to_respond == 0)
4595					break;
4596			}
4597		}
4598	}
4599
4600	/*
4601	 * Flush local tlb if required.
4602	 * We need this flush even if the pmap being changed
4603	 * is the user map... in case we do a copyin/out
4604	 * before returning to user mode.
4605	 */
4606	if (flush_self)
4607		flush_tlb();
4608
4609	PMAP_TRACE(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
4610		   (int) pmap, cpus_to_signal, flush_self, 0, 0);
4611}
4612
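/*
 * process_pmap_updates() runs on a cpu that has been asked to flush:
 * it invalidates the local TLB and clears cpu_tlb_invalid so that the
 * initiator in pmap_flush_tlbs() stops waiting for this cpu.
 * pmap_update_interrupt() is the entry point reached for the
 * MP_TLB_FLUSH interprocessor interrupt.
 */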
4613void
4614process_pmap_updates(void)
4615{
4616	assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
4617
4618	flush_tlb();
4619
4620	current_cpu_datap()->cpu_tlb_invalid = FALSE;
4621	__asm__ volatile("mfence");
4622}
4623
4624void
4625pmap_update_interrupt(void)
4626{
4627        PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
4628		   0, 0, 0, 0, 0);
4629
4630	process_pmap_updates();
4631
4632        PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
4633		   0, 0, 0, 0, 0);
4634}
4635
4636
4637unsigned int pmap_cache_attributes(ppnum_t pn) {
4638
4639	if (!managed_page(ppn_to_pai(pn)))
4640	        return (VM_WIMG_IO);
4641
4642	return (VM_WIMG_COPYBACK);
4643}
4644
4645#ifdef PMAP_DEBUG
4646void
4647pmap_dump(pmap_t p)
4648{
4649  int i;
4650
  kprintf("pmap %p\n", p);

  kprintf("  pm_cr3 0x%llx\n", p->pm_cr3);
  kprintf("  pm_pml4 %p\n", p->pm_pml4);
  kprintf("  pm_pdpt %p\n", p->pm_pdpt);
4656
4657  kprintf("    pml4[0] 0x%llx\n",*p->pm_pml4);
4658  for (i=0;i<8;i++)
4659    kprintf("    pdpt[%d] 0x%llx\n",i, p->pm_pdpt[i]);
4660}
4661
4662void pmap_dump_wrap(void)
4663{
4664  pmap_dump(current_cpu_datap()->cpu_active_thread->task->map->pmap);
4665}
4666
4667void
4668dump_4GB_pdpt(pmap_t p)
4669{
4670	int		spl;
4671	pdpt_entry_t	*user_pdptp;
4672	pdpt_entry_t	*kern_pdptp;
4673	pdpt_entry_t	*pml4p;
4674
4675	spl = splhigh();
4676	while ((user_pdptp = pmap64_pdpt(p, 0x0)) == PDPT_ENTRY_NULL) {
4677		splx(spl);
4678		pmap_expand_pml4(p, 0x0);
4679		spl = splhigh();
4680	}
4681	kern_pdptp = kernel_pmap->pm_pdpt;
4682	if (kern_pdptp == NULL)
4683		panic("kern_pdptp == NULL");
4684	kprintf("dump_4GB_pdpt(%p)\n"
4685		"kern_pdptp=%p (phys=0x%016llx)\n"
4686		"\t 0x%08x: 0x%016llx\n"
4687		"\t 0x%08x: 0x%016llx\n"
4688		"\t 0x%08x: 0x%016llx\n"
4689		"\t 0x%08x: 0x%016llx\n"
4690		"\t 0x%08x: 0x%016llx\n"
4691		"user_pdptp=%p (phys=0x%016llx)\n"
4692		"\t 0x%08x: 0x%016llx\n"
4693		"\t 0x%08x: 0x%016llx\n"
4694		"\t 0x%08x: 0x%016llx\n"
4695		"\t 0x%08x: 0x%016llx\n"
4696		"\t 0x%08x: 0x%016llx\n",
4697		p, kern_pdptp, kvtophys(kern_pdptp),
4698		kern_pdptp+0, *(kern_pdptp+0),
4699		kern_pdptp+1, *(kern_pdptp+1),
4700		kern_pdptp+2, *(kern_pdptp+2),
4701		kern_pdptp+3, *(kern_pdptp+3),
4702		kern_pdptp+4, *(kern_pdptp+4),
4703		user_pdptp, kvtophys(user_pdptp),
4704		user_pdptp+0, *(user_pdptp+0),
4705		user_pdptp+1, *(user_pdptp+1),
4706		user_pdptp+2, *(user_pdptp+2),
4707		user_pdptp+3, *(user_pdptp+3),
4708		user_pdptp+4, *(user_pdptp+4));
4709	kprintf("user pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4710		p->pm_cr3, p->pm_hold, p->pm_pml4);
4711	pml4p = (pdpt_entry_t *)p->pm_hold;
4712	if (pml4p == NULL)
4713		panic("user pml4p == NULL");
4714	kprintf("\t 0x%08x: 0x%016llx\n"
4715		"\t 0x%08x: 0x%016llx\n",
4716		pml4p+0, *(pml4p),
4717		pml4p+KERNEL_UBER_PML4_INDEX, *(pml4p+KERNEL_UBER_PML4_INDEX));
4718	kprintf("kern pm_cr3=0x%016llx pm_hold=0x%08x pm_pml4=0x%08x\n",
4719		kernel_pmap->pm_cr3, kernel_pmap->pm_hold, kernel_pmap->pm_pml4);
4720	pml4p = (pdpt_entry_t *)kernel_pmap->pm_hold;
4721	if (pml4p == NULL)
4722		panic("kern pml4p == NULL");
4723	kprintf("\t 0x%08x: 0x%016llx\n"
4724		"\t 0x%08x: 0x%016llx\n",
4725		pml4p+0, *(pml4p),
4726		pml4p+511, *(pml4p+511));
4727	splx(spl);
4728}
4729
4730void dump_4GB_pdpt_thread(thread_t tp)
4731{
4732	dump_4GB_pdpt(tp->map->pmap);
4733}
4734
4735
4736#endif
4737