htable.c revision 7240:c4957ab6a78e
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/types.h>
30#include <sys/sysmacros.h>
31#include <sys/kmem.h>
32#include <sys/atomic.h>
33#include <sys/bitmap.h>
34#include <sys/machparam.h>
35#include <sys/machsystm.h>
36#include <sys/mman.h>
37#include <sys/systm.h>
38#include <sys/cpuvar.h>
39#include <sys/thread.h>
40#include <sys/proc.h>
41#include <sys/cpu.h>
42#include <sys/kmem.h>
43#include <sys/disp.h>
44#include <sys/vmem.h>
45#include <sys/vmsystm.h>
46#include <sys/promif.h>
47#include <sys/var.h>
48#include <sys/x86_archext.h>
49#include <sys/archsystm.h>
50#include <sys/bootconf.h>
51#include <sys/dumphdr.h>
52#include <vm/seg_kmem.h>
53#include <vm/seg_kpm.h>
54#include <vm/hat.h>
55#include <vm/hat_i86.h>
56#include <sys/cmn_err.h>
57#include <sys/panic.h>
58
59#ifdef __xpv
60#include <sys/hypervisor.h>
61#include <sys/xpv_panic.h>
62#endif
63
64#include <sys/bootinfo.h>
65#include <vm/kboot_mmu.h>
66
67static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);
68
69kmem_cache_t *htable_cache;
70
71/*
72 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
73 * is used in order to facilitate testing of the htable_steal() code.
74 * By resetting htable_reserve_amount to a lower value, we can force
75 * stealing to occur.  The reserve amount is a guess to get us through boot.
76 */
77#define	HTABLE_RESERVE_AMOUNT	(200)
78uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
79kmutex_t htable_reserve_mutex;
80uint_t htable_reserve_cnt;
81htable_t *htable_reserve_pool;
82
83/*
84 * Used to hand test htable_steal().
85 */
86#ifdef DEBUG
87ulong_t force_steal = 0;
88ulong_t ptable_cnt = 0;
89#endif
90
91/*
92 * This variable exists so that it can be tuned via /etc/system.
93 * Any value works, but a power of two <= mmu.ptes_per_table is best.
94 */
95uint_t htable_steal_passes = 8;
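/*
 * For example, a hypothetical /etc/system entry to reduce the number of
 * stealing passes from the default of 8 to 4 would be:
 *
 *	set htable_steal_passes = 4
 */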
96
97/*
98 * Mutexes protecting the htable hash buckets
99 */
100#define	NUM_HTABLE_MUTEX 128
101kmutex_t htable_mutex[NUM_HTABLE_MUTEX];
102#define	HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1))
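/*
 * Note that the mask in HTABLE_MUTEX_HASH() only works because
 * NUM_HTABLE_MUTEX is a power of two.
 */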
103
104#define	HTABLE_ENTER(h)	mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
105#define	HTABLE_EXIT(h)	mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]);
106
107/*
108 * forward declarations
109 */
110static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr);
111static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr);
112static void htable_free(htable_t *ht);
113static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index);
114static void x86pte_release_pagetable(htable_t *ht);
115static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old,
116	x86pte_t new);
117
118/*
119 * A counter to track if we are stealing or reaping htables. When non-zero
120 * htable_free() will directly free htables (either to the reserve or kmem)
121 * instead of putting them in a hat's htable cache.
122 */
123uint32_t htable_dont_cache = 0;
124
125/*
126 * Track the number of active pagetables, so we can know how many to reap
127 */
128static uint32_t active_ptables = 0;
129
130#ifdef __xpv
131/*
132 * Deal with hypervisor complications.
133 */
134void
135xen_flush_va(caddr_t va)
136{
137	struct mmuext_op t;
138	uint_t count;
139
140	if (IN_XPV_PANIC()) {
141		mmu_tlbflush_entry((caddr_t)va);
142	} else {
143		t.cmd = MMUEXT_INVLPG_LOCAL;
144		t.arg1.linear_addr = (uintptr_t)va;
145		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
146			panic("HYPERVISOR_mmuext_op() failed");
147		ASSERT(count == 1);
148	}
149}
150
151void
152xen_gflush_va(caddr_t va, cpuset_t cpus)
153{
154	struct mmuext_op t;
155	uint_t count;
156
157	if (IN_XPV_PANIC()) {
158		mmu_tlbflush_entry((caddr_t)va);
159		return;
160	}
161
162	t.cmd = MMUEXT_INVLPG_MULTI;
163	t.arg1.linear_addr = (uintptr_t)va;
164	/*LINTED: constant in conditional context*/
165	set_xen_guest_handle(t.arg2.vcpumask, &cpus);
166	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
167		panic("HYPERVISOR_mmuext_op() failed");
168	ASSERT(count == 1);
169}
170
171void
172xen_flush_tlb()
173{
174	struct mmuext_op t;
175	uint_t count;
176
177	if (IN_XPV_PANIC()) {
178		xpv_panic_reload_cr3();
179	} else {
180		t.cmd = MMUEXT_TLB_FLUSH_LOCAL;
181		if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
182			panic("HYPERVISOR_mmuext_op() failed");
183		ASSERT(count == 1);
184	}
185}
186
187void
188xen_gflush_tlb(cpuset_t cpus)
189{
190	struct mmuext_op t;
191	uint_t count;
192
193	ASSERT(!IN_XPV_PANIC());
194	t.cmd = MMUEXT_TLB_FLUSH_MULTI;
195	/*LINTED: constant in conditional context*/
196	set_xen_guest_handle(t.arg2.vcpumask, &cpus);
197	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
198		panic("HYPERVISOR_mmuext_op() failed");
199	ASSERT(count == 1);
200}
201
202/*
203 * Install/Adjust a kpm mapping under the hypervisor.
204 * Value of "how" should be:
205 *	PT_WRITABLE | PT_VALID - regular kpm mapping
206 *	PT_VALID - make mapping read-only
207 *	0	- remove mapping
208 *
209 * returns 0 on success. non-zero for failure.
210 */
211int
212xen_kpm_page(pfn_t pfn, uint_t how)
213{
214	paddr_t pa = mmu_ptob((paddr_t)pfn);
215	x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;
216
217	if (kpm_vbase == NULL)
218		return (0);
219
220	if (how)
221		pte |= pa_to_ma(pa) | how;
222	else
223		pte = 0;
224	return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
225	    pte, UVMF_INVLPG | UVMF_ALL));
226}
227
228void
229xen_pin(pfn_t pfn, level_t lvl)
230{
231	struct mmuext_op t;
232	uint_t count;
233
234	t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
235	t.arg1.mfn = pfn_to_mfn(pfn);
236	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
237		panic("HYPERVISOR_mmuext_op() failed");
238	ASSERT(count == 1);
239}
240
241void
242xen_unpin(pfn_t pfn)
243{
244	struct mmuext_op t;
245	uint_t count;
246
247	t.cmd = MMUEXT_UNPIN_TABLE;
248	t.arg1.mfn = pfn_to_mfn(pfn);
249	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
250		panic("HYPERVISOR_mmuext_op() failed");
251	ASSERT(count == 1);
252}
253
254static void
255xen_map(uint64_t pte, caddr_t va)
256{
257	if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
258	    UVMF_INVLPG | UVMF_LOCAL))
259		panic("HYPERVISOR_update_va_mapping() failed");
260}
261#endif /* __xpv */
262
263/*
264 * Allocate a memory page for a hardware page table.
265 *
266 * A wrapper around page_get_physical(), with some extra checks.
267 */
268static pfn_t
269ptable_alloc(uintptr_t seed)
270{
271	pfn_t pfn;
272	page_t *pp;
273
274	pfn = PFN_INVALID;
275	atomic_add_32(&active_ptables, 1);
276
277	/*
278	 * The first check is to see if there is memory in the system. If we
279	 * drop to throttlefree, then fail the ptable_alloc() and let the
280	 * stealing code kick in. Note that we have to do this test here,
281	 * since the test in page_create_throttle() would let the NOSLEEP
282	 * allocation go through and deplete the page reserves.
283	 *
284	 * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
285	 */
286	if (!NOMEMWAIT() && freemem <= throttlefree + 1)
287		return (PFN_INVALID);
288
289#ifdef DEBUG
290	/*
291	 * This code makes htable_steal() easier to test. By setting
292	 * force_steal we force pagetable allocations to fall
293	 * into the stealing code. Roughly 1 in every "force_steal"
294	 * page table allocations will fail.
295	 */
296	if (proc_pageout != NULL && force_steal > 1 &&
297	    ++ptable_cnt > force_steal) {
298		ptable_cnt = 0;
299		return (PFN_INVALID);
300	}
301#endif /* DEBUG */
302
303	pp = page_get_physical(seed);
304	if (pp == NULL)
305		return (PFN_INVALID);
306	pfn = pp->p_pagenum;
307	page_downgrade(pp);
308	ASSERT(PAGE_SHARED(pp));
309
310	if (pfn == PFN_INVALID)
311		panic("ptable_alloc(): Invalid PFN!!");
312	HATSTAT_INC(hs_ptable_allocs);
313	return (pfn);
314}
315
316/*
317 * Free an htable's associated page table page.  See the comments
318 * for ptable_alloc().
319 */
320static void
321ptable_free(pfn_t pfn)
322{
323	page_t *pp = page_numtopp_nolock(pfn);
324
325	/*
326	 * need to destroy the page used for the pagetable
327	 */
328	ASSERT(pfn != PFN_INVALID);
329	HATSTAT_INC(hs_ptable_frees);
330	atomic_add_32(&active_ptables, -1);
331	if (pp == NULL)
332		panic("ptable_free(): no page for pfn!");
333	ASSERT(PAGE_SHARED(pp));
334	ASSERT(pfn == pp->p_pagenum);
335	ASSERT(!IN_XPV_PANIC());
336
337	/*
338	 * Get an exclusive lock, might have to wait for a kmem reader.
339	 */
340	if (!page_tryupgrade(pp)) {
341		page_unlock(pp);
342		/*
343		 * RFE: we could change this to not loop forever;
344		 * George Cameron had some idea on how to do that.
345		 * For now looping works - it's just like sfmmu.
346		 */
347		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
348			continue;
349	}
350#ifdef __xpv
351	if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
352		panic("failure making kpm r/w pfn=0x%lx", pfn);
353#endif
354	page_free(pp, 1);
355	page_unresv(1);
356}
357
358/*
359 * Put one htable on the reserve list.
360 */
361static void
362htable_put_reserve(htable_t *ht)
363{
364	ht->ht_hat = NULL;		/* no longer tied to a hat */
365	ASSERT(ht->ht_pfn == PFN_INVALID);
366	HATSTAT_INC(hs_htable_rputs);
367	mutex_enter(&htable_reserve_mutex);
368	ht->ht_next = htable_reserve_pool;
369	htable_reserve_pool = ht;
370	++htable_reserve_cnt;
371	mutex_exit(&htable_reserve_mutex);
372}
373
374/*
375 * Take one htable from the reserve.
376 */
377static htable_t *
378htable_get_reserve(void)
379{
380	htable_t *ht = NULL;
381
382	mutex_enter(&htable_reserve_mutex);
383	if (htable_reserve_cnt != 0) {
384		ht = htable_reserve_pool;
385		ASSERT(ht != NULL);
386		ASSERT(ht->ht_pfn == PFN_INVALID);
387		htable_reserve_pool = ht->ht_next;
388		--htable_reserve_cnt;
389		HATSTAT_INC(hs_htable_rgets);
390	}
391	mutex_exit(&htable_reserve_mutex);
392	return (ht);
393}
394
395/*
396 * Allocate initial htables and put them on the reserve list
397 */
398void
399htable_initial_reserve(uint_t count)
400{
401	htable_t *ht;
402
403	count += HTABLE_RESERVE_AMOUNT;
404	while (count > 0) {
405		ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP);
406		ASSERT(ht != NULL);
407
408		ASSERT(use_boot_reserve);
409		ht->ht_pfn = PFN_INVALID;
410		htable_put_reserve(ht);
411		--count;
412	}
413}
414
415/*
416 * Readjust the reserves after a thread finishes using them.
417 */
418void
419htable_adjust_reserve()
420{
421	htable_t *ht;
422
423	/*
424	 * Free any excess htables in the reserve list
425	 */
426	while (htable_reserve_cnt > htable_reserve_amount &&
427	    !USE_HAT_RESERVES()) {
428		ht = htable_get_reserve();
429		if (ht == NULL)
430			return;
431		ASSERT(ht->ht_pfn == PFN_INVALID);
432		kmem_cache_free(htable_cache, ht);
433	}
434}
435
436
437/*
438 * This routine steals htables from user processes for htable_alloc() or
439 * for htable_reap().
440 */
441static htable_t *
442htable_steal(uint_t cnt)
443{
444	hat_t		*hat = kas.a_hat;	/* list starts with khat */
445	htable_t	*list = NULL;
446	htable_t	*ht;
447	htable_t	*higher;
448	uint_t		h;
449	uint_t		h_start;
450	static uint_t	h_seed = 0;
451	uint_t		e;
452	uintptr_t	va;
453	x86pte_t	pte;
454	uint_t		stolen = 0;
455	uint_t		pass;
456	uint_t		threshold;
457
458	/*
459	 * Limit htable_steal_passes to something reasonable
460	 */
461	if (htable_steal_passes == 0)
462		htable_steal_passes = 1;
463	if (htable_steal_passes > mmu.ptes_per_table)
464		htable_steal_passes = mmu.ptes_per_table;
465
466	/*
467	 * Loop through all user hats. The 1st pass takes cached htables that
468	 * aren't in use. The later passes steal by removing mappings, too.
469	 */
470	atomic_add_32(&htable_dont_cache, 1);
471	for (pass = 0; pass <= htable_steal_passes && stolen < cnt; ++pass) {
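		/*
		 * On pass N only pagetables with at most
		 * N * ptes_per_table / htable_steal_passes valid entries are
		 * candidates for stealing (see the ht_valid_cnt check below).
		 */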
472		threshold = pass * mmu.ptes_per_table / htable_steal_passes;
473		hat = kas.a_hat;
474		for (;;) {
475
476			/*
477			 * Clear the victim flag and move to next hat
478			 */
479			mutex_enter(&hat_list_lock);
480			if (hat != kas.a_hat) {
481				hat->hat_flags &= ~HAT_VICTIM;
482				cv_broadcast(&hat_list_cv);
483			}
484			hat = hat->hat_next;
485
486			/*
487			 * Skip any hat that is already being stolen from.
488			 *
489			 * We skip SHARED hats, as these are dummy
490			 * hats that host ISM shared page tables.
491			 *
492			 * We also skip if HAT_FREEING because hat_pte_unmap()
493			 * won't zero out the PTE's. That would lead to hitting
494			 * stale PTEs either here or under hat_unload() when we
495			 * steal and unload the same page table in competing
496			 * threads.
497			 */
498			while (hat != NULL &&
499			    (hat->hat_flags &
500			    (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
501				hat = hat->hat_next;
502
503			if (hat == NULL) {
504				mutex_exit(&hat_list_lock);
505				break;
506			}
507
508			/*
509			 * Are we finished?
510			 */
511			if (stolen == cnt) {
512				/*
513				 * Try to spread the pain of stealing,
514				 * move victim HAT to the end of the HAT list.
515				 */
516				if (pass >= 1 && cnt == 1 &&
517				    kas.a_hat->hat_prev != hat) {
518
519					/* unlink victim hat */
520					if (hat->hat_prev)
521						hat->hat_prev->hat_next =
522						    hat->hat_next;
523					else
524						kas.a_hat->hat_next =
525						    hat->hat_next;
526					if (hat->hat_next)
527						hat->hat_next->hat_prev =
528						    hat->hat_prev;
529					else
530						kas.a_hat->hat_prev =
531						    hat->hat_prev;
532
533
534					/* relink at end of hat list */
535					hat->hat_next = NULL;
536					hat->hat_prev = kas.a_hat->hat_prev;
537					if (hat->hat_prev)
538						hat->hat_prev->hat_next = hat;
539					else
540						kas.a_hat->hat_next = hat;
541					kas.a_hat->hat_prev = hat;
542
543				}
544
545				mutex_exit(&hat_list_lock);
546				break;
547			}
548
549			/*
550			 * Mark the HAT as a stealing victim.
551			 */
552			hat->hat_flags |= HAT_VICTIM;
553			mutex_exit(&hat_list_lock);
554
555			/*
556			 * Take any htables from the hat's cached "free" list.
557			 */
558			hat_enter(hat);
559			while ((ht = hat->hat_ht_cached) != NULL &&
560			    stolen < cnt) {
561				hat->hat_ht_cached = ht->ht_next;
562				ht->ht_next = list;
563				list = ht;
564				++stolen;
565			}
566			hat_exit(hat);
567
568			/*
569			 * Don't steal on first pass.
570			 */
571			if (pass == 0 || stolen == cnt)
572				continue;
573
574			/*
575			 * Search the active htables for one to steal.
576			 * Start at a different hash bucket every time to
577			 * help spread the pain of stealing.
578			 */
579			h = h_start = h_seed++ % hat->hat_num_hash;
580			do {
581				higher = NULL;
582				HTABLE_ENTER(h);
583				for (ht = hat->hat_ht_hash[h]; ht;
584				    ht = ht->ht_next) {
585
586					/*
587					 * Can we rule out reaping?
588					 */
589					if (ht->ht_busy != 0 ||
590					    (ht->ht_flags & HTABLE_SHARED_PFN)||
591					    ht->ht_level > 0 ||
592					    ht->ht_valid_cnt > threshold ||
593					    ht->ht_lock_cnt != 0)
594						continue;
595
596					/*
597					 * Increment busy so the htable can't
598					 * disappear. We drop the htable mutex
599					 * to avoid deadlocks with
600					 * hat_pageunload() and the hment mutex
601					 * while we call hat_pte_unmap()
602					 */
603					++ht->ht_busy;
604					HTABLE_EXIT(h);
605
606					/*
607					 * Try stealing.
608					 * - unload and invalidate all PTEs
609					 */
610					for (e = 0, va = ht->ht_vaddr;
611					    e < HTABLE_NUM_PTES(ht) &&
612					    ht->ht_valid_cnt > 0 &&
613					    ht->ht_busy == 1 &&
614					    ht->ht_lock_cnt == 0;
615					    ++e, va += MMU_PAGESIZE) {
616						pte = x86pte_get(ht, e);
617						if (!PTE_ISVALID(pte))
618							continue;
619						hat_pte_unmap(ht, e,
620						    HAT_UNLOAD, pte, NULL);
621					}
622
623					/*
624					 * Reacquire htable lock. If we didn't
625					 * remove all mappings in the table,
626					 * or another thread added a new mapping
627					 * behind us, give up on this table.
628					 */
629					HTABLE_ENTER(h);
630					if (ht->ht_busy != 1 ||
631					    ht->ht_valid_cnt != 0 ||
632					    ht->ht_lock_cnt != 0) {
633						--ht->ht_busy;
634						continue;
635					}
636
637					/*
638					 * Steal it and unlink the page table.
639					 */
640					higher = ht->ht_parent;
641					unlink_ptp(higher, ht, ht->ht_vaddr);
642
643					/*
644					 * remove from the hash list
645					 */
646					if (ht->ht_next)
647						ht->ht_next->ht_prev =
648						    ht->ht_prev;
649
650					if (ht->ht_prev) {
651						ht->ht_prev->ht_next =
652						    ht->ht_next;
653					} else {
654						ASSERT(hat->hat_ht_hash[h] ==
655						    ht);
656						hat->hat_ht_hash[h] =
657						    ht->ht_next;
658					}
659
660					/*
661					 * Break to outer loop to release the
662					 * higher (ht_parent) pagetable. This
663					 * spreads out the pain caused by
664					 * pagefaults.
665					 */
666					ht->ht_next = list;
667					list = ht;
668					++stolen;
669					break;
670				}
671				HTABLE_EXIT(h);
672				if (higher != NULL)
673					htable_release(higher);
674				if (++h == hat->hat_num_hash)
675					h = 0;
676			} while (stolen < cnt && h != h_start);
677		}
678	}
679	atomic_add_32(&htable_dont_cache, -1);
680	return (list);
681}
682
683
684/*
685 * This is invoked from kmem when the system is low on memory.  We try
686 * to free hments, htables, and ptables to improve the memory situation.
687 */
688/*ARGSUSED*/
689static void
690htable_reap(void *handle)
691{
692	uint_t		reap_cnt;
693	htable_t	*list;
694	htable_t	*ht;
695
696	HATSTAT_INC(hs_reap_attempts);
697	if (!can_steal_post_boot)
698		return;
699
700	/*
701	 * Try to reap 5% of the page tables bounded by a maximum of
702	 * 5% of physmem and a minimum of 10.
703	 */
704	reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10);
705
706	/*
707	 * Let htable_steal() do the work, we just call htable_free()
708	 */
709	XPV_DISALLOW_MIGRATE();
710	list = htable_steal(reap_cnt);
711	XPV_ALLOW_MIGRATE();
712	while ((ht = list) != NULL) {
713		list = ht->ht_next;
714		HATSTAT_INC(hs_reaped);
715		htable_free(ht);
716	}
717
718	/*
719	 * Free up excess reserves
720	 */
721	htable_adjust_reserve();
722	hment_adjust_reserve();
723}
724
725/*
726 * Allocate an htable, stealing one or using the reserve if necessary
727 */
728static htable_t *
729htable_alloc(
730	hat_t		*hat,
731	uintptr_t	vaddr,
732	level_t		level,
733	htable_t	*shared)
734{
735	htable_t	*ht = NULL;
736	uint_t		is_vlp;
737	uint_t		is_bare = 0;
738	uint_t		need_to_zero = 1;
739	int		kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
740
741	if (level < 0 || level > TOP_LEVEL(hat))
742		panic("htable_alloc(): level %d out of range\n", level);
743
744	is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
745	if (is_vlp || shared != NULL)
746		is_bare = 1;
747
748	/*
749	 * First reuse a cached htable from the hat_ht_cached field, this
750	 * avoids unnecessary trips through kmem/page allocators.
751	 */
752	if (hat->hat_ht_cached != NULL && !is_bare) {
753		hat_enter(hat);
754		ht = hat->hat_ht_cached;
755		if (ht != NULL) {
756			hat->hat_ht_cached = ht->ht_next;
757			need_to_zero = 0;
758			/* XX64 ASSERT() they're all zero somehow */
759			ASSERT(ht->ht_pfn != PFN_INVALID);
760		}
761		hat_exit(hat);
762	}
763
764	if (ht == NULL) {
765		/*
766		 * Allocate an htable, possibly refilling the reserves.
767		 */
768		if (USE_HAT_RESERVES()) {
769			ht = htable_get_reserve();
770		} else {
771			/*
772			 * Donate successful htable allocations to the reserve.
773			 */
774			for (;;) {
775				ht = kmem_cache_alloc(htable_cache, kmflags);
776				if (ht == NULL)
777					break;
778				ht->ht_pfn = PFN_INVALID;
779				if (USE_HAT_RESERVES() ||
780				    htable_reserve_cnt >= htable_reserve_amount)
781					break;
782				htable_put_reserve(ht);
783			}
784		}
785
786		/*
787		 * allocate a page for the hardware page table if needed
788		 */
789		if (ht != NULL && !is_bare) {
790			ht->ht_hat = hat;
791			ht->ht_pfn = ptable_alloc((uintptr_t)ht);
792			if (ht->ht_pfn == PFN_INVALID) {
793				if (USE_HAT_RESERVES())
794					htable_put_reserve(ht);
795				else
796					kmem_cache_free(htable_cache, ht);
797				ht = NULL;
798			}
799		}
800	}
801
802	/*
803	 * If allocations failed, kick off a kmem_reap() and resort to
804	 * htable_steal(). We may spin here if the system is very low on
805	 * memory. If the kernel itself has consumed all memory and kmem_reap()
806	 * can't free up anything, then we'll really get stuck here.
807	 * That should only happen in a system where the administrator has
808	 * misconfigured VM parameters via /etc/system.
809	 */
810	while (ht == NULL && can_steal_post_boot) {
811		kmem_reap();
812		ht = htable_steal(1);
813		HATSTAT_INC(hs_steals);
814
815		/*
816		 * If we stole for a bare htable, release the pagetable page.
817		 */
818		if (ht != NULL) {
819			if (is_bare) {
820				ptable_free(ht->ht_pfn);
821				ht->ht_pfn = PFN_INVALID;
822#if defined(__xpv) && defined(__amd64)
823			/*
824			 * make stolen page table writable again in kpm
825			 */
826			} else if (kpm_vbase && xen_kpm_page(ht->ht_pfn,
827			    PT_VALID | PT_WRITABLE) < 0) {
828				panic("failure making kpm r/w pfn=0x%lx",
829				    ht->ht_pfn);
830#endif
831			}
832		}
833	}
834
835	/*
836	 * All attempts to allocate or steal failed. This should only happen
837	 * if we run out of memory during boot, due perhaps to a huge
838	 * boot_archive. At this point there's no way to continue.
839	 */
840	if (ht == NULL)
841		panic("htable_alloc(): couldn't steal\n");
842
843#if defined(__amd64) && defined(__xpv)
844	/*
845	 * Under the 64-bit hypervisor, we have 2 top level page tables.
846	 * If this allocation fails, we'll resort to stealing.
847	 * We use the stolen page indirectly, by freeing the
848	 * stolen htable first.
849	 */
850	if (level == mmu.max_level) {
851		for (;;) {
852			htable_t *stolen;
853
854			hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
855			if (hat->hat_user_ptable != PFN_INVALID)
856				break;
857			stolen = htable_steal(1);
858			if (stolen == NULL)
859				panic("2nd steal ptable failed\n");
860			htable_free(stolen);
861		}
862		block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable),
863		    MMU_PAGESIZE);
864	}
865#endif
866
867	/*
868	 * Shared page tables have all entries locked and entries may not
869	 * be added or deleted.
870	 */
871	ht->ht_flags = 0;
872	if (shared != NULL) {
873		ASSERT(shared->ht_valid_cnt > 0);
874		ht->ht_flags |= HTABLE_SHARED_PFN;
875		ht->ht_pfn = shared->ht_pfn;
876		ht->ht_lock_cnt = 0;
877		ht->ht_valid_cnt = 0;		/* updated in hat_share() */
878		ht->ht_shares = shared;
879		need_to_zero = 0;
880	} else {
881		ht->ht_shares = NULL;
882		ht->ht_lock_cnt = 0;
883		ht->ht_valid_cnt = 0;
884	}
885
886	/*
887	 * setup flags, etc. for VLP htables
888	 */
889	if (is_vlp) {
890		ht->ht_flags |= HTABLE_VLP;
891		ASSERT(ht->ht_pfn == PFN_INVALID);
892		need_to_zero = 0;
893	}
894
895	/*
896	 * fill in the htable
897	 */
898	ht->ht_hat = hat;
899	ht->ht_parent = NULL;
900	ht->ht_vaddr = vaddr;
901	ht->ht_level = level;
902	ht->ht_busy = 1;
903	ht->ht_next = NULL;
904	ht->ht_prev = NULL;
905
906	/*
907	 * Zero out any freshly allocated page table
908	 */
909	if (need_to_zero)
910		x86pte_zero(ht, 0, mmu.ptes_per_table);
911
912#if defined(__amd64) && defined(__xpv)
913	if (!is_bare && kpm_vbase) {
914		(void) xen_kpm_page(ht->ht_pfn, PT_VALID);
915		if (level == mmu.max_level)
916			(void) xen_kpm_page(hat->hat_user_ptable, PT_VALID);
917	}
918#endif
919
920	return (ht);
921}
922
923/*
924 * Free up an htable, either to a hat's cached list, the reserves or
925 * back to kmem.
926 */
927static void
928htable_free(htable_t *ht)
929{
930	hat_t *hat = ht->ht_hat;
931
932	/*
933	 * If the process isn't exiting, cache the free htable in the hat
934	 * structure. We always do this for the boot time reserve. We don't
935	 * do this if the hat is exiting or we are stealing/reaping htables.
936	 */
937	if (hat != NULL &&
938	    !(ht->ht_flags & HTABLE_SHARED_PFN) &&
939	    (use_boot_reserve ||
940	    (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
941		ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
942		ASSERT(ht->ht_pfn != PFN_INVALID);
943		hat_enter(hat);
944		ht->ht_next = hat->hat_ht_cached;
945		hat->hat_ht_cached = ht;
946		hat_exit(hat);
947		return;
948	}
949
950	/*
951	 * If we have a hardware page table, free it.
952	 * We don't free page tables that are accessed by sharing.
953	 */
954	if (ht->ht_flags & HTABLE_SHARED_PFN) {
955		ASSERT(ht->ht_pfn != PFN_INVALID);
956	} else if (!(ht->ht_flags & HTABLE_VLP)) {
957		ptable_free(ht->ht_pfn);
958#if defined(__amd64) && defined(__xpv)
959		if (ht->ht_level == mmu.max_level) {
960			ptable_free(hat->hat_user_ptable);
961			hat->hat_user_ptable = PFN_INVALID;
962		}
963#endif
964	}
965	ht->ht_pfn = PFN_INVALID;
966
967	/*
968	 * Free it or put into reserves.
969	 */
970	if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
971		htable_put_reserve(ht);
972	} else {
973		kmem_cache_free(htable_cache, ht);
974		htable_adjust_reserve();
975	}
976}
977
978
979/*
980 * This is called when a hat is being destroyed or swapped out. We reap all
981 * the remaining htables in the hat cache. If the hat is being destroyed,
982 * all leftover htables are also destroyed.
983 *
984 * We also don't need to invalidate any of the PTPs nor do any demapping.
985 */
986void
987htable_purge_hat(hat_t *hat)
988{
989	htable_t *ht;
990	int h;
991
992	/*
993	 * Purge the htable cache if just reaping.
994	 */
995	if (!(hat->hat_flags & HAT_FREEING)) {
996		atomic_add_32(&htable_dont_cache, 1);
997		for (;;) {
998			hat_enter(hat);
999			ht = hat->hat_ht_cached;
1000			if (ht == NULL) {
1001				hat_exit(hat);
1002				break;
1003			}
1004			hat->hat_ht_cached = ht->ht_next;
1005			hat_exit(hat);
1006			htable_free(ht);
1007		}
1008		atomic_add_32(&htable_dont_cache, -1);
1009		return;
1010	}
1011
1012	/*
1013	 * if freeing, no locking is needed
1014	 */
1015	while ((ht = hat->hat_ht_cached) != NULL) {
1016		hat->hat_ht_cached = ht->ht_next;
1017		htable_free(ht);
1018	}
1019
1020	/*
1021	 * walk thru the htable hash table and free all the htables in it.
1022	 */
1023	for (h = 0; h < hat->hat_num_hash; ++h) {
1024		while ((ht = hat->hat_ht_hash[h]) != NULL) {
1025			if (ht->ht_next)
1026				ht->ht_next->ht_prev = ht->ht_prev;
1027
1028			if (ht->ht_prev) {
1029				ht->ht_prev->ht_next = ht->ht_next;
1030			} else {
1031				ASSERT(hat->hat_ht_hash[h] == ht);
1032				hat->hat_ht_hash[h] = ht->ht_next;
1033			}
1034			htable_free(ht);
1035		}
1036	}
1037}
1038
1039/*
1040 * Unlink an entry for a table at vaddr and level out of the existing table
1041 * one level higher. We always hold the HTABLE_ENTER() mutex when doing this.
1042 */
1043static void
1044unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
1045{
1046	uint_t		entry = htable_va2entry(vaddr, higher);
1047	x86pte_t	expect = MAKEPTP(old->ht_pfn, old->ht_level);
1048	x86pte_t	found;
1049	hat_t		*hat = old->ht_hat;
1050
1051	ASSERT(higher->ht_busy > 0);
1052	ASSERT(higher->ht_valid_cnt > 0);
1053	ASSERT(old->ht_valid_cnt == 0);
1054	found = x86pte_cas(higher, entry, expect, 0);
1055#ifdef __xpv
1056	/*
1057	 * This is weird, but Xen apparently automatically unlinks empty
1058	 * pagetables from the upper page table. So allow PTP to be 0 already.
1059	 */
1060	if (found != expect && found != 0)
1061#else
1062	if (found != expect)
1063#endif
1064		panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
1065		    found, expect);
1066
1067	/*
1068	 * When a top level VLP page table entry changes, we must issue
1069	 * a reload of cr3 on all processors.
1070	 *
1071	 * If we don't need to do that, then we still have to INVLPG against
1072	 * an address covered by the inner page table, as the latest processors
1073	 * have TLB-like caches for non-leaf page table entries.
1074	 */
1075	if (!(hat->hat_flags & HAT_FREEING)) {
1076		hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
1077		    DEMAP_ALL_ADDR : old->ht_vaddr);
1078	}
1079
1080	HTABLE_DEC(higher->ht_valid_cnt);
1081}
1082
1083/*
1084 * Link an entry for a new table at vaddr and level into the existing table
1085 * one level higher. We always hold the HTABLE_ENTER() mutex when doing this.
1086 */
1087static void
1088link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
1089{
1090	uint_t		entry = htable_va2entry(vaddr, higher);
1091	x86pte_t	newptp = MAKEPTP(new->ht_pfn, new->ht_level);
1092	x86pte_t	found;
1093
1094	ASSERT(higher->ht_busy > 0);
1095
1096	ASSERT(new->ht_level != mmu.max_level);
1097
1098	HTABLE_INC(higher->ht_valid_cnt);
1099
1100	found = x86pte_cas(higher, entry, 0, newptp);
1101	if ((found & ~PT_REF) != 0)
1102		panic("HAT: ptp not 0, found=" FMT_PTE, found);
1103
1104	/*
1105	 * When any top level VLP page table entry changes, we must issue
1106	 * a reload of cr3 on all processors using it.
1107	 * We also need to do this for the kernel hat on PAE 32 bit kernel.
1108	 */
1109	if (
1110#ifdef __i386
1111	    (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
1112#endif
1113	    (higher->ht_flags & HTABLE_VLP))
1114		hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
1115}
1116
1117/*
1118 * Release of hold on an htable. If this is the last use and the pagetable
1119 * is empty we may want to free it, then recursively look at the pagetable
1120 * above it. The recursion is handled by the outer while() loop.
1121 *
1122 * On the metal, during process exit, we don't bother unlinking the tables from
1123 * upper level pagetables. They are instead handled in bulk by hat_free_end().
1124 * We can't do this on the hypervisor as we need the page table to be
1125 * implicitly unpinned before it goes to the free page lists. This can't
1126 * happen unless we fully unlink it from the page table hierarchy.
1127 */
1128void
1129htable_release(htable_t *ht)
1130{
1131	uint_t		hashval;
1132	htable_t	*shared;
1133	htable_t	*higher;
1134	hat_t		*hat;
1135	uintptr_t	va;
1136	level_t		level;
1137
1138	while (ht != NULL) {
1139		shared = NULL;
1140		for (;;) {
1141			hat = ht->ht_hat;
1142			va = ht->ht_vaddr;
1143			level = ht->ht_level;
1144			hashval = HTABLE_HASH(hat, va, level);
1145
1146			/*
1147			 * The common case is that this isn't the last use of
1148			 * an htable so we don't want to free the htable.
1149			 */
1150			HTABLE_ENTER(hashval);
1151			ASSERT(ht->ht_valid_cnt >= 0);
1152			ASSERT(ht->ht_busy > 0);
1153			if (ht->ht_valid_cnt > 0)
1154				break;
1155			if (ht->ht_busy > 1)
1156				break;
1157			ASSERT(ht->ht_lock_cnt == 0);
1158
1159#if !defined(__xpv)
1160			/*
1161			 * we always release empty shared htables
1162			 */
1163			if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {
1164
1165				/*
1166				 * don't release if in address space tear down
1167				 */
1168				if (hat->hat_flags & HAT_FREEING)
1169					break;
1170
1171				/*
1172				 * At and above max_page_level, free if it's for
1173				 * a boot-time kernel mapping below kernelbase.
1174				 */
1175				if (level >= mmu.max_page_level &&
1176				    (hat != kas.a_hat || va >= kernelbase))
1177					break;
1178			}
1179#endif /* __xpv */
1180
1181			/*
1182			 * Remember if we destroy an htable that shares its PFN
1183			 * from elsewhere.
1184			 */
1185			if (ht->ht_flags & HTABLE_SHARED_PFN) {
1186				ASSERT(shared == NULL);
1187				shared = ht->ht_shares;
1188				HATSTAT_INC(hs_htable_unshared);
1189			}
1190
1191			/*
1192			 * Handle release of a table and freeing the htable_t.
1193			 * Unlink it from the table higher (ie. ht_parent).
1194			 */
1195			ASSERT(ht->ht_lock_cnt == 0);
1196			higher = ht->ht_parent;
1197			ASSERT(higher != NULL);
1198
1199			/*
1200			 * Unlink the pagetable.
1201			 */
1202			unlink_ptp(higher, ht, va);
1203
1204			/*
1205			 * remove this htable from its hash list
1206			 */
1207			if (ht->ht_next)
1208				ht->ht_next->ht_prev = ht->ht_prev;
1209
1210			if (ht->ht_prev) {
1211				ht->ht_prev->ht_next = ht->ht_next;
1212			} else {
1213				ASSERT(hat->hat_ht_hash[hashval] == ht);
1214				hat->hat_ht_hash[hashval] = ht->ht_next;
1215			}
1216			HTABLE_EXIT(hashval);
1217			htable_free(ht);
1218			ht = higher;
1219		}
1220
1221		ASSERT(ht->ht_busy >= 1);
1222		--ht->ht_busy;
1223		HTABLE_EXIT(hashval);
1224
1225		/*
1226		 * If we released a shared htable, do a release on the htable
1227		 * from which it shared
1228		 */
1229		ht = shared;
1230	}
1231}
1232
1233/*
1234 * Find the htable for the pagetable at the given level for the given address.
1235 * If found acquires a hold that eventually needs to be htable_release()d
1236 */
1237htable_t *
1238htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
1239{
1240	uintptr_t	base;
1241	uint_t		hashval;
1242	htable_t	*ht = NULL;
1243
1244	ASSERT(level >= 0);
1245	ASSERT(level <= TOP_LEVEL(hat));
1246
1247	if (level == TOP_LEVEL(hat)) {
1248#if defined(__amd64)
1249		/*
1250		 * 32 bit address spaces on 64 bit kernels need to check
1251		 * for overflow of the 32 bit address space
1252		 */
1253		if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
1254			return (NULL);
1255#endif
1256		base = 0;
1257	} else {
1258		base = vaddr & LEVEL_MASK(level + 1);
1259	}
1260
1261	hashval = HTABLE_HASH(hat, base, level);
1262	HTABLE_ENTER(hashval);
1263	for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) {
1264		if (ht->ht_hat == hat &&
1265		    ht->ht_vaddr == base &&
1266		    ht->ht_level == level)
1267			break;
1268	}
1269	if (ht)
1270		++ht->ht_busy;
1271
1272	HTABLE_EXIT(hashval);
1273	return (ht);
1274}
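/*
 * A minimal sketch of the intended caller pattern (hypothetical usage,
 * not code from this file):
 *
 *	htable_t *ht = htable_lookup(hat, vaddr, 0);
 *
 *	if (ht != NULL) {
 *		pte = x86pte_get(ht, htable_va2entry(vaddr, ht));
 *		...
 *		htable_release(ht);
 *	}
 */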
1275
1276/*
1277 * Acquires a hold on a known htable (from a locked hment entry).
1278 */
1279void
1280htable_acquire(htable_t *ht)
1281{
1282	hat_t		*hat = ht->ht_hat;
1283	level_t		level = ht->ht_level;
1284	uintptr_t	base = ht->ht_vaddr;
1285	uint_t		hashval = HTABLE_HASH(hat, base, level);
1286
1287	HTABLE_ENTER(hashval);
1288#ifdef DEBUG
1289	/*
1290	 * make sure the htable is there
1291	 */
1292	{
1293		htable_t	*h;
1294
1295		for (h = hat->hat_ht_hash[hashval];
1296		    h && h != ht;
1297		    h = h->ht_next)
1298			;
1299		ASSERT(h == ht);
1300	}
1301#endif /* DEBUG */
1302	++ht->ht_busy;
1303	HTABLE_EXIT(hashval);
1304}
1305
1306/*
1307 * Find the htable for the pagetable at the given level for the given address.
1308 * If found acquires a hold that eventually needs to be htable_release()d
1309 * If not found the table is created.
1310 *
1311 * Since we can't hold a hash table mutex during allocation, we have to
1312 * drop it and redo the search on a create. Then we may have to free the newly
1313 * allocated htable if another thread raced in and created it ahead of us.
1314 */
1315htable_t *
1316htable_create(
1317	hat_t		*hat,
1318	uintptr_t	vaddr,
1319	level_t		level,
1320	htable_t	*shared)
1321{
1322	uint_t		h;
1323	level_t		l;
1324	uintptr_t	base;
1325	htable_t	*ht;
1326	htable_t	*higher = NULL;
1327	htable_t	*new = NULL;
1328
1329	if (level < 0 || level > TOP_LEVEL(hat))
1330		panic("htable_create(): level %d out of range\n", level);
1331
1332	/*
1333	 * Create the page tables in top down order.
1334	 */
1335	for (l = TOP_LEVEL(hat); l >= level; --l) {
1336		new = NULL;
1337		if (l == TOP_LEVEL(hat))
1338			base = 0;
1339		else
1340			base = vaddr & LEVEL_MASK(l + 1);
1341
1342		h = HTABLE_HASH(hat, base, l);
1343try_again:
1344		/*
1345		 * look up the htable at this level
1346		 */
1347		HTABLE_ENTER(h);
1348		if (l == TOP_LEVEL(hat)) {
1349			ht = hat->hat_htable;
1350		} else {
1351			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
1352				ASSERT(ht->ht_hat == hat);
1353				if (ht->ht_vaddr == base &&
1354				    ht->ht_level == l)
1355					break;
1356			}
1357		}
1358
1359		/*
1360		 * if we found the htable, increment its busy cnt
1361		 * and if we had allocated a new htable, free it.
1362		 */
1363		if (ht != NULL) {
1364			/*
1365			 * If we find a pre-existing shared table, it must
1366			 * share from the same place.
1367			 */
1368			if (l == level && shared && ht->ht_shares &&
1369			    ht->ht_shares != shared) {
1370				panic("htable shared from wrong place "
1371				    "found htable=%p shared=%p",
1372				    (void *)ht, (void *)shared);
1373			}
1374			++ht->ht_busy;
1375			HTABLE_EXIT(h);
1376			if (new)
1377				htable_free(new);
1378			if (higher != NULL)
1379				htable_release(higher);
1380			higher = ht;
1381
1382		/*
1383		 * if we didn't find it on the first search
1384		 * allocate a new one and search again
1385		 */
1386		} else if (new == NULL) {
1387			HTABLE_EXIT(h);
1388			new = htable_alloc(hat, base, l,
1389			    l == level ? shared : NULL);
1390			goto try_again;
1391
1392		/*
1393		 * 2nd search and still not there, use "new" table
1394		 * Link new table into higher, when not at top level.
1395		 */
1396		} else {
1397			ht = new;
1398			if (higher != NULL) {
1399				link_ptp(higher, ht, base);
1400				ht->ht_parent = higher;
1401			}
1402			ht->ht_next = hat->hat_ht_hash[h];
1403			ASSERT(ht->ht_prev == NULL);
1404			if (hat->hat_ht_hash[h])
1405				hat->hat_ht_hash[h]->ht_prev = ht;
1406			hat->hat_ht_hash[h] = ht;
1407			HTABLE_EXIT(h);
1408
1409			/*
1410			 * Note we don't do htable_release(higher).
1411			 * That happens recursively when "new" is removed by
1412			 * htable_release() or htable_steal().
1413			 */
1414			higher = ht;
1415
1416			/*
1417			 * If we just created a new shared page table we
1418			 * increment the shared htable's busy count, so that
1419			 * it can't be the victim of a steal even if it's empty.
1420			 */
1421			if (l == level && shared) {
1422				(void) htable_lookup(shared->ht_hat,
1423				    shared->ht_vaddr, shared->ht_level);
1424				HATSTAT_INC(hs_htable_shared);
1425			}
1426		}
1427	}
1428
1429	return (ht);
1430}
1431
1432/*
1433 * Inherit initial pagetables from the boot program. On the 64-bit
1434 * hypervisor we also temporarily mark the p_index field of page table
1435 * pages, so we know not to try making them writable in seg_kpm.
1436 */
1437void
1438htable_attach(
1439	hat_t *hat,
1440	uintptr_t base,
1441	level_t level,
1442	htable_t *parent,
1443	pfn_t pfn)
1444{
1445	htable_t	*ht;
1446	uint_t		h;
1447	uint_t		i;
1448	x86pte_t	pte;
1449	x86pte_t	*ptep;
1450	page_t		*pp;
1451	extern page_t	*boot_claim_page(pfn_t);
1452
1453	ht = htable_get_reserve();
1454	if (level == mmu.max_level)
1455		kas.a_hat->hat_htable = ht;
1456	ht->ht_hat = hat;
1457	ht->ht_parent = parent;
1458	ht->ht_vaddr = base;
1459	ht->ht_level = level;
1460	ht->ht_busy = 1;
1461	ht->ht_next = NULL;
1462	ht->ht_prev = NULL;
1463	ht->ht_flags = 0;
1464	ht->ht_pfn = pfn;
1465	ht->ht_lock_cnt = 0;
1466	ht->ht_valid_cnt = 0;
1467	if (parent != NULL)
1468		++parent->ht_busy;
1469
1470	h = HTABLE_HASH(hat, base, level);
1471	HTABLE_ENTER(h);
1472	ht->ht_next = hat->hat_ht_hash[h];
1473	ASSERT(ht->ht_prev == NULL);
1474	if (hat->hat_ht_hash[h])
1475		hat->hat_ht_hash[h]->ht_prev = ht;
1476	hat->hat_ht_hash[h] = ht;
1477	HTABLE_EXIT(h);
1478
1479	/*
1480	 * make sure the page table physical page is not FREE
1481	 */
1482	if (page_resv(1, KM_NOSLEEP) == 0)
1483		panic("page_resv() failed in ptable alloc");
1484
1485	pp = boot_claim_page(pfn);
1486	ASSERT(pp != NULL);
1487	page_downgrade(pp);
1488#if defined(__xpv) && defined(__amd64)
1489	/*
1490	 * Record in the page_t that is a pagetable for segkpm setup.
1491	 */
1492	if (kpm_vbase)
1493		pp->p_index = 1;
1494#endif
1495
1496	/*
1497	 * Count valid mappings and recursively attach lower level pagetables.
1498	 */
1499	ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1500	for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) {
1501		if (mmu.pae_hat)
1502			pte = ptep[i];
1503		else
1504			pte = ((x86pte32_t *)ptep)[i];
1505		if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) {
1506			++ht->ht_valid_cnt;
1507			if (!PTE_ISPAGE(pte, level)) {
1508				htable_attach(hat, base, level - 1,
1509				    ht, PTE2PFN(pte, level));
1510				ptep = kbm_remap_window(pfn_to_pa(pfn), 0);
1511			}
1512		}
1513		base += LEVEL_SIZE(level);
1514		if (base == mmu.hole_start)
1515			base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK;
1516	}
1517
1518	/*
1519	 * As long as all the mappings we had were below kernel base
1520	 * we can release the htable.
1521	 */
1522	if (base < kernelbase)
1523		htable_release(ht);
1524}
1525
1526/*
1527 * Walk through a given htable looking for the first valid entry.  This
1528 * routine takes both a starting and ending address.  The starting address
1529 * is required to be within the htable provided by the caller, but there is
1530 * no such restriction on the ending address.
1531 *
1532 * If the routine finds a valid entry in the htable (at or beyond the
1533 * starting address), the PTE (and its address) will be returned.
1534 * This PTE may correspond to either a page or a pagetable - it is the
1535 * caller's responsibility to determine which.  If no valid entry is
1536 * found, 0 (an invalid PTE) and the next unexamined address will be
1537 * returned.
1538 *
1539 * The loop has been carefully coded for optimization.
1540 */
1541static x86pte_t
1542htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr)
1543{
1544	uint_t e;
1545	x86pte_t found_pte = (x86pte_t)0;
1546	caddr_t pte_ptr;
1547	caddr_t end_pte_ptr;
1548	int l = ht->ht_level;
1549	uintptr_t va = *vap & LEVEL_MASK(l);
1550	size_t pgsize = LEVEL_SIZE(l);
1551
1552	ASSERT(va >= ht->ht_vaddr);
1553	ASSERT(va <= HTABLE_LAST_PAGE(ht));
1554
1555	/*
1556	 * Compute the starting index and ending virtual address
1557	 */
1558	e = htable_va2entry(va, ht);
1559
1560	/*
1561	 * The following page table scan code knows that the valid
1562	 * bit of a PTE is in the lowest byte AND that x86 is little endian!!
1563	 */
1564	pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0);
1565	end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht));
1566	pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e);
1567	while (!PTE_ISVALID(*pte_ptr)) {
1568		va += pgsize;
1569		if (va >= eaddr)
1570			break;
1571		pte_ptr += mmu.pte_size;
1572		ASSERT(pte_ptr <= end_pte_ptr);
1573		if (pte_ptr == end_pte_ptr)
1574			break;
1575	}
1576
1577	/*
1578	 * if we found a valid PTE, load the entire PTE
1579	 */
1580	if (va < eaddr && pte_ptr != end_pte_ptr)
1581		found_pte = GET_PTE((x86pte_t *)pte_ptr);
1582	x86pte_release_pagetable(ht);
1583
1584#if defined(__amd64)
1585	/*
1586	 * deal with VA hole on amd64
1587	 */
1588	if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end)
1589		va = mmu.hole_end + va - mmu.hole_start;
1590#endif /* __amd64 */
1591
1592	*vap = va;
1593	return (found_pte);
1594}
1595
1596/*
1597 * Find the address and htable for the first populated translation at or
1598 * above the given virtual address.  The caller may also specify an upper
1599 * limit to the address range to search.  Uses level information to quickly
1600 * skip unpopulated sections of virtual address spaces.
1601 *
1602 * If no mapping is found, returns 0 and sets *htp to NULL.  When found,
1603 * returns the PTE, sets *htp and *vaddr, and takes a hold on the htable.
1604 */
1605x86pte_t
1606htable_walk(
1607	struct hat *hat,
1608	htable_t **htp,
1609	uintptr_t *vaddr,
1610	uintptr_t eaddr)
1611{
1612	uintptr_t va = *vaddr;
1613	htable_t *ht;
1614	htable_t *prev = *htp;
1615	level_t l;
1616	level_t max_mapped_level;
1617	x86pte_t pte;
1618
1619	ASSERT(eaddr > va);
1620
1621	/*
1622	 * If this is a user address, then we know we need not look beyond
1623	 * kernelbase.
1624	 */
1625	ASSERT(hat == kas.a_hat || eaddr <= kernelbase ||
1626	    eaddr == HTABLE_WALK_TO_END);
1627	if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END)
1628		eaddr = kernelbase;
1629
1630	/*
1631	 * If we're coming in with a previous page table, search it first
1632	 * without doing an htable_lookup(), this should be frequent.
1633	 */
1634	if (prev) {
1635		ASSERT(prev->ht_busy > 0);
1636		ASSERT(prev->ht_vaddr <= va);
1637		l = prev->ht_level;
1638		if (va <= HTABLE_LAST_PAGE(prev)) {
1639			pte = htable_scan(prev, &va, eaddr);
1640
1641			if (PTE_ISPAGE(pte, l)) {
1642				*vaddr = va;
1643				*htp = prev;
1644				return (pte);
1645			}
1646		}
1647
1648		/*
1649		 * We found nothing in the htable provided by the caller,
1650		 * so fall through and do the full search
1651		 */
1652		htable_release(prev);
1653	}
1654
1655	/*
1656	 * Find the level of the largest pagesize used by this HAT.
1657	 */
1658	if (hat->hat_ism_pgcnt > 0) {
1659		max_mapped_level = mmu.umax_page_level;
1660	} else {
1661		max_mapped_level = 0;
1662		for (l = 1; l <= mmu.max_page_level; ++l)
1663			if (hat->hat_pages_mapped[l] != 0)
1664				max_mapped_level = l;
1665	}
1666
1667	while (va < eaddr && va >= *vaddr) {
1668		ASSERT(!IN_VA_HOLE(va));
1669
1670		/*
1671		 *  Find lowest table with any entry for given address.
1672		 */
1673		for (l = 0; l <= TOP_LEVEL(hat); ++l) {
1674			ht = htable_lookup(hat, va, l);
1675			if (ht != NULL) {
1676				pte = htable_scan(ht, &va, eaddr);
1677				if (PTE_ISPAGE(pte, l)) {
1678					*vaddr = va;
1679					*htp = ht;
1680					return (pte);
1681				}
1682				htable_release(ht);
1683				break;
1684			}
1685
1686			/*
1687			 * No htable at this level for the address. If there
1688			 * is no larger page size that could cover it, we can
1689			 * skip right to the start of the next page table.
1690			 */
1691			ASSERT(l < TOP_LEVEL(hat));
1692			if (l >= max_mapped_level) {
1693				va = NEXT_ENTRY_VA(va, l + 1);
1694				if (va >= eaddr)
1695					break;
1696			}
1697		}
1698	}
1699
1700	*vaddr = 0;
1701	*htp = NULL;
1702	return (0);
1703}
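/*
 * A minimal sketch of a typical walk loop (hypothetical usage, not code
 * from this file):
 *
 *	htable_t *ht = NULL;
 *	x86pte_t pte;
 *
 *	while ((pte = htable_walk(hat, &ht, &va, eaddr)) != 0) {
 *		... pte maps a page of LEVEL_SIZE(ht->ht_level) at va ...
 *		va += LEVEL_SIZE(ht->ht_level);
 *	}
 *	if (ht != NULL)
 *		htable_release(ht);
 */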
1704
1705/*
1706 * Find the htable and page table entry index of the given virtual address
1707 * with pagesize at or below given level.
1708 * If not found returns NULL. When found, returns the htable, sets
1709 * entry, and has a hold on the htable.
1710 */
1711htable_t *
1712htable_getpte(
1713	struct hat *hat,
1714	uintptr_t vaddr,
1715	uint_t *entry,
1716	x86pte_t *pte,
1717	level_t level)
1718{
1719	htable_t	*ht;
1720	level_t		l;
1721	uint_t		e;
1722
1723	ASSERT(level <= mmu.max_page_level);
1724
1725	for (l = 0; l <= level; ++l) {
1726		ht = htable_lookup(hat, vaddr, l);
1727		if (ht == NULL)
1728			continue;
1729		e = htable_va2entry(vaddr, ht);
1730		if (entry != NULL)
1731			*entry = e;
1732		if (pte != NULL)
1733			*pte = x86pte_get(ht, e);
1734		return (ht);
1735	}
1736	return (NULL);
1737}
1738
1739/*
1740 * Find the htable and page table entry index of the given virtual address.
1741 * There must be a valid page mapped at the given address.
1742 * If not found returns NULL. When found, returns the htable, sets
1743 * entry, and has a hold on the htable.
1744 */
1745htable_t *
1746htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry)
1747{
1748	htable_t	*ht;
1749	uint_t		e;
1750	x86pte_t	pte;
1751
1752	ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level);
1753	if (ht == NULL)
1754		return (NULL);
1755
1756	if (entry)
1757		*entry = e;
1758
1759	if (PTE_ISPAGE(pte, ht->ht_level))
1760		return (ht);
1761	htable_release(ht);
1762	return (NULL);
1763}
1764
1765
1766void
1767htable_init()
1768{
1769	/*
1770	 * To save on kernel VA usage, we avoid debug information in 32 bit
1771	 * kernels.
1772	 */
1773#if defined(__amd64)
1774	int	kmem_flags = KMC_NOHASH;
1775#elif defined(__i386)
1776	int	kmem_flags = KMC_NOHASH | KMC_NODEBUG;
1777#endif
1778
1779	/*
1780	 * initialize kmem caches
1781	 */
1782	htable_cache = kmem_cache_create("htable_t",
1783	    sizeof (htable_t), 0, NULL, NULL,
1784	    htable_reap, NULL, hat_memload_arena, kmem_flags);
1785}
1786
1787/*
1788 * get the pte index for the virtual address in the given htable's pagetable
1789 */
1790uint_t
1791htable_va2entry(uintptr_t va, htable_t *ht)
1792{
1793	level_t	l = ht->ht_level;
1794
1795	ASSERT(va >= ht->ht_vaddr);
1796	ASSERT(va <= HTABLE_LAST_PAGE(ht));
1797	return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
1798}
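/*
 * For example, on a 64 bit kernel a level 0 htable has 512 PTEs, so the
 * computation above reduces to (va >> 12) & 0x1ff, i.e. VA bits 20:12.
 */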
1799
1800/*
1801 * Given an htable and the index of a pte in it, return the virtual address
1802 * of the page.
1803 */
1804uintptr_t
1805htable_e2va(htable_t *ht, uint_t entry)
1806{
1807	level_t	l = ht->ht_level;
1808	uintptr_t va;
1809
1810	ASSERT(entry < HTABLE_NUM_PTES(ht));
1811	va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));
1812
1813	/*
1814	 * Need to skip over any VA hole in top level table
1815	 */
1816#if defined(__amd64)
1817	if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
1818		va += ((mmu.hole_end - mmu.hole_start) + 1);
1819#endif
1820
1821	return (va);
1822}
1823
1824/*
1825 * The code uses compare and swap instructions to read/write PTE's to
1826 * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems;
1827 * on 64 bit kernels aligned PTE loads and stores are naturally atomic.
1828 *
1829 * The combination of using kpreempt_disable()/_enable() and the hci_mutex
1830 * are used to ensure that an interrupt won't overwrite a temporary mapping
1831 * while it's in use. If an interrupt thread tries to access a PTE, it will
1832 * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
1833 */
1834void
1835x86pte_cpu_init(cpu_t *cpu)
1836{
1837	struct hat_cpu_info *hci;
1838
1839	hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
1840	mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
1841	cpu->cpu_hat_info = hci;
1842}
1843
1844void
1845x86pte_cpu_fini(cpu_t *cpu)
1846{
1847	struct hat_cpu_info *hci = cpu->cpu_hat_info;
1848
1849	kmem_free(hci, sizeof (*hci));
1850	cpu->cpu_hat_info = NULL;
1851}
1852
1853#ifdef __i386
1854/*
1855 * On 32 bit kernels, loading a 64 bit PTE is a little tricky
1856 */
1857x86pte_t
1858get_pte64(x86pte_t *ptr)
1859{
1860	volatile uint32_t *p = (uint32_t *)ptr;
1861	x86pte_t t;
1862
1863	ASSERT(mmu.pae_hat != 0);
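	/*
	 * Read the low word, then the high word, then re-read the low word;
	 * if the low word is unchanged the two reads form a consistent
	 * 64 bit snapshot of the PTE, otherwise retry.
	 */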
1864	for (;;) {
1865		t = p[0];
1866		t |= (uint64_t)p[1] << 32;
1867		if ((t & 0xffffffff) == p[0])
1868			return (t);
1869	}
1870}
1871#endif /* __i386 */
1872
1873/*
1874 * Disable preemption and establish a mapping to the pagetable with the
1875 * given pfn. This is optimized for the case where it's the same
1876 * pfn we last referenced from this CPU.
1877 */
1878static x86pte_t *
1879x86pte_access_pagetable(htable_t *ht, uint_t index)
1880{
1881	/*
1882	 * VLP pagetables are contained in the hat_t
1883	 */
1884	if (ht->ht_flags & HTABLE_VLP)
1885		return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
1886	return (x86pte_mapin(ht->ht_pfn, index, ht));
1887}
1888
1889/*
1890 * map the given pfn into the page table window.
1891 */
1892/*ARGSUSED*/
1893x86pte_t *
1894x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
1895{
1896	x86pte_t *pteptr;
1897	x86pte_t pte = 0;
1898	x86pte_t newpte;
1899	int x;
1900
1901	ASSERT(pfn != PFN_INVALID);
1902
1903	if (!khat_running) {
1904		caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
1905		return (PT_INDEX_PTR(va, index));
1906	}
1907
1908	/*
1909	 * If kpm is available, use it.
1910	 */
1911	if (kpm_vbase)
1912		return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));
1913
1914	/*
1915	 * Disable preemption and grab the CPU's hci_mutex
1916	 */
1917	kpreempt_disable();
1918	ASSERT(CPU->cpu_hat_info != NULL);
1919	mutex_enter(&CPU->cpu_hat_info->hci_mutex);
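	/*
	 * Each CPU has its own pagetable mapping window; use this CPU's.
	 */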
1920	x = PWIN_TABLE(CPU->cpu_id);
1921	pteptr = (x86pte_t *)PWIN_PTE_VA(x);
1922#ifndef __xpv
1923	if (mmu.pae_hat)
1924		pte = *pteptr;
1925	else
1926		pte = *(x86pte32_t *)pteptr;
1927#endif
1928
1929	newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;
1930
1931	/*
1932	 * For hardware we can use a writable mapping.
1933	 */
1934#ifdef __xpv
1935	if (IN_XPV_PANIC())
1936#endif
1937		newpte |= PT_WRITABLE;
1938
1939	if (!PTE_EQUIV(newpte, pte)) {
1940
1941#ifdef __xpv
1942		if (!IN_XPV_PANIC()) {
1943			xen_map(newpte, PWIN_VA(x));
1944		} else
1945#endif
1946		{
1947			XPV_ALLOW_PAGETABLE_UPDATES();
1948			if (mmu.pae_hat)
1949				*pteptr = newpte;
1950			else
1951				*(x86pte32_t *)pteptr = newpte;
1952			XPV_DISALLOW_PAGETABLE_UPDATES();
1953			mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
1954		}
1955	}
1956	return (PT_INDEX_PTR(PWIN_VA(x), index));
1957}
1958
1959/*
1960 * Release access to a page table.
1961 */
1962static void
1963x86pte_release_pagetable(htable_t *ht)
1964{
1965	/*
1966	 * nothing to do for VLP htables
1967	 */
1968	if (ht->ht_flags & HTABLE_VLP)
1969		return;
1970
1971	x86pte_mapout();
1972}
1973
1974void
1975x86pte_mapout(void)
1976{
1977	if (kpm_vbase != NULL || !khat_running)
1978		return;
1979
1980	/*
1981	 * Drop the CPU's hci_mutex and restore preemption.
1982	 */
1983#ifdef __xpv
1984	if (!IN_XPV_PANIC()) {
1985		uintptr_t va;
1986
1987		/*
1988		 * We need to always clear the mapping in case a page
1989		 * that was once a page table page is ballooned out.
1990		 */
1991		va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
1992		(void) HYPERVISOR_update_va_mapping(va, 0,
1993		    UVMF_INVLPG | UVMF_LOCAL);
1994	}
1995#endif
1996	mutex_exit(&CPU->cpu_hat_info->hci_mutex);
1997	kpreempt_enable();
1998}
1999
2000/*
2001 * Atomic retrieval of a pagetable entry
2002 */
2003x86pte_t
2004x86pte_get(htable_t *ht, uint_t entry)
2005{
2006	x86pte_t	pte;
2007	x86pte_t	*ptep;
2008
2009	/*
2010	 * Be careful that loading PAE entries in 32 bit kernel is atomic.
2011	 */
2012	ASSERT(entry < mmu.ptes_per_table);
2013	ptep = x86pte_access_pagetable(ht, entry);
2014	pte = GET_PTE(ptep);
2015	x86pte_release_pagetable(ht);
2016	return (pte);
2017}
2018
2019/*
2020 * Atomic unconditional set of a page table entry, it returns the previous
2021 * value. For pre-existing mappings if the PFN changes, then we don't care
2022 * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
2023 * the MOD/REF bits unchanged.
2024 *
2025 * If asked to overwrite a link to a lower page table with a large page
2026 * mapping, this routine returns the special value of LPAGE_ERROR. This
2027 * allows the upper HAT layers to retry with a smaller mapping size.
2028 */
2029x86pte_t
2030x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
2031{
2032	x86pte_t	old;
2033	x86pte_t	prev;
2034	x86pte_t	*ptep;
2035	level_t		l = ht->ht_level;
2036	x86pte_t	pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR;
2037	x86pte_t	n;
2038	uintptr_t	addr = htable_e2va(ht, entry);
2039	hat_t		*hat = ht->ht_hat;
2040
2041	ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */
2042	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2043	if (ptr == NULL)
2044		ptep = x86pte_access_pagetable(ht, entry);
2045	else
2046		ptep = ptr;
2047
2048	/*
2049	 * Install the new PTE. If remapping the same PFN, then
2050	 * copy existing REF/MOD bits to new mapping.
2051	 */
2052	do {
2053		prev = GET_PTE(ptep);
2054		n = new;
2055		if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask))
2056			n |= prev & (PT_REF | PT_MOD);
2057
2058		/*
2059		 * Another thread may have installed this mapping already,
2060		 * flush the local TLB and be done.
2061		 */
2062		if (prev == n) {
2063			old = new;
2064#ifdef __xpv
2065			if (!IN_XPV_PANIC())
2066				xen_flush_va((caddr_t)addr);
2067			else
2068#endif
2069				mmu_tlbflush_entry((caddr_t)addr);
2070			goto done;
2071		}
2072
2073		/*
2074		 * Detect if we have a collision of installing a large
2075		 * page mapping where there already is a lower page table.
2076		 */
2077		if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) {
2078			old = LPAGE_ERROR;
2079			goto done;
2080		}
2081
2082		XPV_ALLOW_PAGETABLE_UPDATES();
2083		old = CAS_PTE(ptep, prev, n);
2084		XPV_DISALLOW_PAGETABLE_UPDATES();
2085	} while (old != prev);
2086
2087	/*
2088	 * Do a TLB demap if needed, ie. the old pte was valid.
2089	 *
2090	 * Note that a stale TLB writeback to the PTE here either can't happen
2091	 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST
2092	 * mappings, but they were created with REF and MOD already set, so
2093	 * no stale writeback will happen.
2094	 *
2095	 * Segmap is the only place where remaps happen on the same pfn and for
2096	 * that we want to preserve the stale REF/MOD bits.
2097	 */
2098	if (old & PT_REF)
2099		hat_tlb_inval(hat, addr);
2100
2101done:
2102	if (ptr == NULL)
2103		x86pte_release_pagetable(ht);
2104	return (old);
2105}
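
/*
 * An illustrative sketch, kept under #ifdef notdef so it is never compiled:
 * how an upper HAT layer might call x86pte_set() and back off when
 * LPAGE_ERROR reports that a lower page table already hangs off the entry.
 * The helper name and its return convention are hypothetical.
 */
#ifdef notdef	/* example only, never compiled */
static int
example_install_pte(htable_t *ht, uint_t entry, x86pte_t newpte)
{
	x86pte_t	old;

	/* newpte must be a valid (nonzero) PTE; see the ASSERT above */
	old = x86pte_set(ht, entry, newpte, NULL);
	if (old == LPAGE_ERROR) {
		/* the real code would retry with a smaller mapping size */
		return (-1);
	}
	return (0);
}
#endif	/* notdef */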
2106
2107/*
2108 * Atomic compare and swap of a page table entry. No TLB invalidates are done.
2109 * This is used for links between pagetables of different levels.
2110 * Note we always create these links with dirty/access set, so they should
2111 * never change.
2112 */
2113x86pte_t
2114x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
2115{
2116	x86pte_t	pte;
2117	x86pte_t	*ptep;
2118#ifdef __xpv
2119	/*
2120	 * We can't use writable pagetables for upper level tables, so fake it.
2121	 */
2122	mmu_update_t t[2];
2123	int cnt = 1;
2124	int count;
2125	maddr_t ma;
2126
2127	if (!IN_XPV_PANIC()) {
2128		ASSERT(!(ht->ht_flags & HTABLE_VLP));	/* no VLP yet */
2129		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2130		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2131		t[0].val = new;
2132
2133#if defined(__amd64)
2134		/*
2135		 * On the 64-bit hypervisor we need to maintain the user mode
2136		 * top page table too.
2137		 */
2138		if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
2139			ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
2140			    ht->ht_hat->hat_user_ptable), entry));
2141			t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
2142			t[1].val = new;
2143			++cnt;
2144		}
2145#endif	/* __amd64 */
2146
2147		if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
2148			panic("HYPERVISOR_mmu_update() failed");
2149		ASSERT(count == cnt);
2150		return (old);
2151	}
2152#endif
2153	ptep = x86pte_access_pagetable(ht, entry);
2154	XPV_ALLOW_PAGETABLE_UPDATES();
2155	pte = CAS_PTE(ptep, old, new);
2156	XPV_DISALLOW_PAGETABLE_UPDATES();
2157	x86pte_release_pagetable(ht);
2158	return (pte);
2159}
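
/*
 * An illustrative sketch, kept under #ifdef notdef so it is never compiled:
 * establishing a link from an upper level pagetable to a lower one with
 * x86pte_cas(). The caller supplies the pagetable-pointer value; the CAS
 * only succeeds if the entry was still empty, so a nonzero return means
 * another thread raced us and installed a link first. The helper name is
 * hypothetical.
 */
#ifdef notdef	/* example only, never compiled */
static int
example_link_lower_table(htable_t *higher, uint_t entry, x86pte_t ptp)
{
	x86pte_t	found;

	found = x86pte_cas(higher, entry, 0, ptp);
	return (found == 0 ? 0 : -1);
}
#endif	/* notdef */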
2160
2161/*
2162 * Invalidate a page table entry as long as it currently maps something that
2163 * matches the value given in expect.
2164 *
2165 * Also invalidates any TLB entries and returns the previous value of the PTE.
2166 */
2167x86pte_t
2168x86pte_inval(
2169	htable_t *ht,
2170	uint_t entry,
2171	x86pte_t expect,
2172	x86pte_t *pte_ptr)
2173{
2174	x86pte_t	*ptep;
2175	x86pte_t	oldpte;
2176	x86pte_t	found;
2177
2178	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2179	ASSERT(ht->ht_level <= mmu.max_page_level);
2180
2181	if (pte_ptr != NULL)
2182		ptep = pte_ptr;
2183	else
2184		ptep = x86pte_access_pagetable(ht, entry);
2185
2186#if defined(__xpv)
2187	/*
2188	 * If exit()ing, just use HYPERVISOR_mmu_update(), as we can't be racing
2189	 * with anything else.
2190	 */
2191	if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
2192		int count;
2193		mmu_update_t t[1];
2194		maddr_t ma;
2195
2196		oldpte = GET_PTE(ptep);
2197		if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2198			goto done;
2199		ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
2200		t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
2201		t[0].val = 0;
2202		if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
2203			panic("HYPERVISOR_mmu_update() failed");
2204		ASSERT(count == 1);
2205		goto done;
2206	}
2207#endif /* __xpv */
2208
2209	/*
2210	 * Note that the loop is needed to handle changes due to h/w updating
2211	 * of PT_MOD/PT_REF.
2212	 */
2213	do {
2214		oldpte = GET_PTE(ptep);
2215		if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
2216			goto done;
2217		XPV_ALLOW_PAGETABLE_UPDATES();
2218		found = CAS_PTE(ptep, oldpte, 0);
2219		XPV_DISALLOW_PAGETABLE_UPDATES();
2220	} while (found != oldpte);
2221	if (oldpte & (PT_REF | PT_MOD))
2222		hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2223
2224done:
2225	if (pte_ptr == NULL)
2226		x86pte_release_pagetable(ht);
2227	return (oldpte);
2228}
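
/*
 * An illustrative sketch, kept under #ifdef notdef so it is never compiled:
 * an unmap path that clears an entry only while it still maps the expected
 * PTE value, and reports whether the page had been modified through the
 * mapping. The helper name is hypothetical.
 */
#ifdef notdef	/* example only, never compiled */
static int
example_unmap_entry(htable_t *ht, uint_t entry, x86pte_t expect)
{
	x86pte_t	old;

	/* pass NULL so x86pte_inval() maps and releases the table itself */
	old = x86pte_inval(ht, entry, expect, NULL);
	return ((old & PT_MOD) != 0);
}
#endif	/* notdef */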
2229
2230/*
2231 * Change a page table entry if it currently matches the value in expect.
2232 */
2233x86pte_t
2234x86pte_update(
2235	htable_t *ht,
2236	uint_t entry,
2237	x86pte_t expect,
2238	x86pte_t new)
2239{
2240	x86pte_t	*ptep;
2241	x86pte_t	found;
2242
2243	ASSERT(new != 0);
2244	ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
2245	ASSERT(ht->ht_level <= mmu.max_page_level);
2246
2247	ptep = x86pte_access_pagetable(ht, entry);
2248	XPV_ALLOW_PAGETABLE_UPDATES();
2249	found = CAS_PTE(ptep, expect, new);
2250	XPV_DISALLOW_PAGETABLE_UPDATES();
2251	if (found == expect) {
2252		hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));
2253
2254		/*
2255		 * When removing write permission *and* clearing the
2256		 * MOD bit, check if a write happened via a stale
2257		 * TLB entry before the TLB shootdown finished.
2258		 *
2259		 * If it did happen, simply re-enable write permission and
2260		 * act like the original CAS failed.
2261		 */
2262		if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
2263		    (new & (PT_WRITABLE | PT_MOD)) == 0 &&
2264		    (GET_PTE(ptep) & PT_MOD) != 0) {
2265			do {
2266				found = GET_PTE(ptep);
2267				XPV_ALLOW_PAGETABLE_UPDATES();
2268				found =
2269				    CAS_PTE(ptep, found, found | PT_WRITABLE);
2270				XPV_DISALLOW_PAGETABLE_UPDATES();
2271			} while ((found & PT_WRITABLE) == 0);
2272		}
2273	}
2274	x86pte_release_pagetable(ht);
2275	return (found);
2276}
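
/*
 * An illustrative sketch, kept under #ifdef notdef so it is never compiled:
 * using x86pte_update() to revoke write permission and harvest the MOD bit,
 * retrying whenever the PTE changed underneath us (hardware REF/MOD updates
 * or the stale-write recovery above). The helper name is hypothetical.
 */
#ifdef notdef	/* example only, never compiled */
static int
example_clean_and_protect(htable_t *ht, uint_t entry)
{
	x86pte_t	expect;
	x86pte_t	found;

	for (;;) {
		expect = x86pte_get(ht, entry);
		if (!PTE_ISVALID(expect))
			return (0);
		found = x86pte_update(ht, entry, expect,
		    expect & ~(PT_WRITABLE | PT_MOD));
		if (found == expect)
			return ((expect & PT_MOD) != 0);
	}
}
#endif	/* notdef */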
2277
2278#ifndef __xpv
2279/*
2280 * Copy page tables - this is just a little more complicated than the
2281 * previous routines. Note that it's not atomic! It is also never
2282 * used for VLP pagetables.
2283 */
2284void
2285x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2286{
2287	caddr_t	src_va;
2288	caddr_t dst_va;
2289	size_t size;
2290	x86pte_t *pteptr;
2291	x86pte_t pte;
2292
2293	ASSERT(khat_running);
2294	ASSERT(!(dest->ht_flags & HTABLE_VLP));
2295	ASSERT(!(src->ht_flags & HTABLE_VLP));
2296	ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
2297	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2298
2299	/*
2300	 * Acquire access to the CPU pagetable windows for the dest and source.
2301	 */
2302	dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2303	if (kpm_vbase) {
2304		src_va = (caddr_t)
2305		    PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
2306	} else {
2307		uint_t x = PWIN_SRC(CPU->cpu_id);
2308
2309		/*
2310		 * Finish defining the src pagetable mapping
2311		 */
2312		src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2313		pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
2314		pteptr = (x86pte_t *)PWIN_PTE_VA(x);
2315		if (mmu.pae_hat)
2316			*pteptr = pte;
2317		else
2318			*(x86pte32_t *)pteptr = pte;
2319		mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
2320	}
2321
2322	/*
2323	 * now do the copy
2324	 */
2325	size = count << mmu.pte_size_shift;
2326	bcopy(src_va, dst_va, size);
2327
2328	x86pte_release_pagetable(dest);
2329}
2330
2331#else /* __xpv */
2332
2333/*
2334 * The hypervisor only supports writable pagetables at level 0, so we have
2335 * to install these entries one at a time, the slow way.
2336 */
2337void
2338x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
2339{
2340	caddr_t	src_va;
2341	x86pte_t pte;
2342
2343	ASSERT(!IN_XPV_PANIC());
2344	src_va = (caddr_t)x86pte_access_pagetable(src, entry);
2345	while (count) {
2346		if (mmu.pae_hat)
2347			pte = *(x86pte_t *)src_va;
2348		else
2349			pte = *(x86pte32_t *)src_va;
2350		if (pte != 0) {
2351			set_pteval(pfn_to_pa(dest->ht_pfn), entry,
2352			    dest->ht_level, pte);
2353#ifdef __amd64
2354			if (dest->ht_level == mmu.max_level &&
2355			    htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
2356				set_pteval(
2357				    pfn_to_pa(dest->ht_hat->hat_user_ptable),
2358				    entry, dest->ht_level, pte);
2359#endif
2360		}
2361		--count;
2362		++entry;
2363		src_va += mmu.pte_size;
2364	}
2365	x86pte_release_pagetable(src);
2366}
2367#endif /* __xpv */
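
/*
 * An illustrative sketch, kept under #ifdef notdef so it is never compiled:
 * the kind of call a HAT setup path could make to seed a freshly allocated
 * pagetable with a range of entries from an existing one via x86pte_copy().
 * The starting index and count here are hypothetical placeholders.
 */
#ifdef notdef	/* example only, never compiled */
static void
example_seed_table(htable_t *src, htable_t *dest)
{
	uint_t	start = mmu.ptes_per_table / 2;	/* hypothetical range */
	uint_t	count = mmu.ptes_per_table - start;

	/* both tables must be ordinary (non-VLP, non-shared) pagetables */
	x86pte_copy(src, dest, start, count);
}
#endif	/* notdef */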
2368
2369/*
2370 * Zero page table entries - Note this doesn't use atomic stores!
2371 */
2372static void
2373x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
2374{
2375	caddr_t dst_va;
2376	size_t size;
2377#ifdef __xpv
2378	int x;
2379	x86pte_t newpte;
2380#endif
2381
2382	/*
2383	 * Map in the page table to be zeroed.
2384	 */
2385	ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
2386	ASSERT(!(dest->ht_flags & HTABLE_VLP));
2387
2388	/*
2389	 * On the hypervisor we don't use x86pte_access_pagetable() since
2390	 * in this case the page is not pinned yet.
2391	 */
2392#ifdef __xpv
2393	if (kpm_vbase == NULL) {
2394		kpreempt_disable();
2395		ASSERT(CPU->cpu_hat_info != NULL);
2396		mutex_enter(&CPU->cpu_hat_info->hci_mutex);
2397		x = PWIN_TABLE(CPU->cpu_id);
2398		newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
2399		xen_map(newpte, PWIN_VA(x));
2400		dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
2401	} else
2402#endif
2403		dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
2404
2405	size = count << mmu.pte_size_shift;
2406	ASSERT(size > BLOCKZEROALIGN);
2407#ifdef __i386
2408	if ((x86_feature & X86_SSE2) == 0)
2409		bzero(dst_va, size);
2410	else
2411#endif
2412		block_zero_no_xmm(dst_va, size);
2413
2414#ifdef __xpv
2415	if (kpm_vbase == NULL) {
2416		xen_map(0, PWIN_VA(x));
2417		mutex_exit(&CPU->cpu_hat_info->hci_mutex);
2418		kpreempt_enable();
2419	} else
2420#endif
2421		x86pte_release_pagetable(dest);
2422}
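
/*
 * An illustrative sketch, kept under #ifdef notdef so it is never compiled:
 * zeroing every entry of a pagetable before it gets linked into a higher
 * level table, which is the kind of caller x86pte_zero() serves. Non-atomic
 * stores are fine here because nothing can be mapping through the table yet.
 * The helper name is hypothetical.
 */
#ifdef notdef	/* example only, never compiled */
static void
example_wipe_table(htable_t *ht)
{
	x86pte_zero(ht, 0, mmu.ptes_per_table);
}
#endif	/* notdef */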
2423
2424/*
2425 * Called to ensure that all pagetables are in the system dump
2426 */
2427void
2428hat_dump(void)
2429{
2430	hat_t *hat;
2431	uint_t h;
2432	htable_t *ht;
2433
2434	/*
2435	 * Dump all page tables
2436	 */
2437	for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
2438		for (h = 0; h < hat->hat_num_hash; ++h) {
2439			for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
2440				if ((ht->ht_flags & HTABLE_VLP) == 0)
2441					dump_page(ht->ht_pfn);
2442			}
2443		}
2444	}
2445}
2446