/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <i386/proc_reg.h>
#include <i386/cpuid.h>
#include <i386/tsc.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <i386/pmap_internal.h>
#include <i386/pmap_pcid.h>
#include <mach/branch_predicates.h>

/*
 * PCID (Process context identifier) aka tagged TLB support.
 * On processors with this feature, unless disabled via the -pmap_pcid_disable
 * boot-arg, the following algorithm is in effect:
 * Each processor maintains an array of tag refcounts indexed by tag.
 * Each address space maintains an array of tags indexed by CPU number.
 * Each address space also maintains a coherency vector, indexed by CPU,
 * indicating whether the TLB state for that address space on that CPU has a
 * pending invalidation.
 * On a context switch, a refcounted tag is lazily assigned to the newly
 * dispatched (CPU, address space) tuple.
 * When an inactive address space is invalidated on a remote CPU, it is marked
 * for invalidation upon the next dispatch. Some invalidations are
 * also processed at the user/kernel boundary.
 * Provisions are made for the case where a CPU is overcommitted, i.e.
 * more active address spaces exist than the number of logical tags
 * provided for by the processor architecture (currently 4096).
 * The algorithm assumes the processor remaps the logical tags
 * to physical TLB context IDs in an LRU fashion for efficiency. (DRK '10)
 */
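
/*
 * Illustrative sketch (not compiled) of the bookkeeping described above.
 * Type, struct, and field names here are hypothetical and simplified for
 * exposition, and the array sizes are illustrative; the live state is kept
 * in cpu_data_t and struct pmap.
 */
#if 0
typedef uint16_t example_pcid_t;		/* stand-in for pcid_t */

struct example_cpu_pcid_state {			/* per-processor */
	uint8_t		refcounts[4096];	/* address spaces using each tag on this CPU */
	pmap_t		last_dispatched[4096];	/* last pmap dispatched with each tag */
};

struct example_pmap_pcid_state {		/* per-address-space */
	example_pcid_t	tag_for_cpu[64];	/* lazily assigned tag, indexed by CPU number */
	char		coherency[64];		/* nonzero => TLB invalidation pending on that CPU */
};
#endif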

uint32_t	pmap_pcid_ncpus;
boolean_t	pmap_pcid_disabled = FALSE;

void	pmap_pcid_configure(void) {
	int ccpu = cpu_number();
	uintptr_t cr4 = get_cr4();
	boolean_t pcid_present = FALSE;

	pmap_pcid_log("PCID configure invoked on CPU %d\n", ccpu);
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	pmap_assert(cpu_mode_is64bit());

	if (PE_parse_boot_argn("-pmap_pcid_disable", &pmap_pcid_disabled, sizeof (pmap_pcid_disabled))) {
		pmap_pcid_log("PMAP: PCID feature disabled\n");
		printf("PMAP: PCID feature disabled, %u\n", pmap_pcid_disabled);
		kprintf("PMAP: PCID feature disabled %u\n", pmap_pcid_disabled);
	}
	/* no_shared_cr3+PCID is currently unsupported */
#if	DEBUG
	if (pmap_pcid_disabled == FALSE)
		no_shared_cr3 = FALSE;
	else
		no_shared_cr3 = TRUE;
#else
	if (no_shared_cr3)
		pmap_pcid_disabled = TRUE;
#endif
	if (pmap_pcid_disabled || no_shared_cr3) {
		unsigned i;
		/* Reset PCID status, as we may have picked up
		 * strays if discovered prior to platform
		 * expert initialization.
		 */
		for (i = 0; i < real_ncpus; i++) {
			if (cpu_datap(i)) {
				cpu_datap(i)->cpu_pmap_pcid_enabled = FALSE;
			}
		}
		pmap_pcid_ncpus = 0;
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		return;
	}
	/* DRKTODO: assert if features haven't been discovered yet. Redundant
	 * invocation of cpu_mode_init and descendants masks this for now.
	 */
	if ((cpuid_features() & CPUID_FEATURE_PCID))
		pcid_present = TRUE;
	else {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		pmap_pcid_log("PMAP: PCID not detected CPU %d\n", ccpu);
		return;
	}
	if ((cr4 & (CR4_PCIDE | CR4_PGE)) == (CR4_PCIDE | CR4_PGE)) {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;
		pmap_pcid_log("PMAP: PCID already enabled %d\n", ccpu);
		return;
	}
	if (pcid_present == TRUE) {
		pmap_pcid_log("Pre-PCID: CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, cr4);

		if (cpu_number() >= PMAP_PCID_MAX_CPUS) {
			panic("PMAP_PCID_MAX_CPUS %d\n", cpu_number());
		}
		if ((get_cr4() & CR4_PGE) == 0) {
			set_cr4(get_cr4() | CR4_PGE);
			pmap_pcid_log("Toggled PGE ON (CPU: %d)\n", ccpu);
		}
		set_cr4(get_cr4() | CR4_PCIDE);
		pmap_pcid_log("Post-PCID: CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, get_cr4());
		tlb_flush_global();
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;

		if (OSIncrementAtomic(&pmap_pcid_ncpus) == machine_info.max_cpus) {
			pmap_pcid_log("All PCIDs enabled: real_ncpus: %d, pmap_pcid_ncpus: %d\n", real_ncpus, pmap_pcid_ncpus);
		}
		cpu_datap(ccpu)->cpu_pmap_pcid_coherentp =
		    cpu_datap(ccpu)->cpu_pmap_pcid_coherentp_kernel =
		    &(kernel_pmap->pmap_pcid_coherency_vector[ccpu]);
		cpu_datap(ccpu)->cpu_pcid_refcounts[0] = 1;
	}
}

void pmap_pcid_initialize(pmap_t p) {
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus)/sizeof(pcid_t);

	pmap_assert(nc >= real_ncpus);
	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = PMAP_PCID_INVALID_PCID;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}

void pmap_pcid_initialize_kernel(pmap_t p) {
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus)/sizeof(pcid_t);

	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = 0;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}

pcid_t	pmap_pcid_allocate_pcid(int ccpu) {
	int i;
	pcid_ref_t	cur_min = 0xFF;
	uint32_t	cur_min_index = ~1;
	pcid_ref_t	*cpu_pcid_refcounts = &cpu_datap(ccpu)->cpu_pcid_refcounts[0];
	pcid_ref_t	old_count;

	if ((i = cpu_datap(ccpu)->cpu_pcid_free_hint) != 0) {
		if (cpu_pcid_refcounts[i] == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			cpu_datap(ccpu)->cpu_pcid_free_hint = 0;
			return i;
		}
	}
	/* Linear scan to discover a free slot, with hint. There is room for
	 * optimization, but with intelligent prefetchers this should be
	 * adequately performant, as it is invoked only on the first dispatch
	 * of a new address space onto a given processor.
	 * DRKTODO: use larger loads and zero-byte discovery -- any pattern
	 * != ~1 should signify a free slot (a sketch of such a scan follows
	 * this function).
	 */
	for (i = PMAP_PCID_MIN_PCID; i < PMAP_PCID_MAX_PCID; i++) {
		pcid_ref_t cur_refcount = cpu_pcid_refcounts[i];

		pmap_assert(cur_refcount < PMAP_PCID_MAX_REFCOUNT);

		if (cur_refcount == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			return i;
		}
		else {
			if (cur_refcount < cur_min) {
				cur_min_index = i;
				cur_min = cur_refcount;
			}
		}
	}
	pmap_assert(cur_min_index > 0 && cur_min_index < PMAP_PCID_MAX_PCID);
	/* Consider "rebalancing" tags actively in highly oversubscribed cases,
	 * perhaps selecting tags with lower activity.
	 */

	old_count = __sync_fetch_and_add(&cpu_pcid_refcounts[cur_min_index], 1);
	pmap_assert(old_count < PMAP_PCID_MAX_REFCOUNT);
	return cur_min_index;
}
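
/*
 * Illustrative sketch (not compiled, hypothetical helper name) of the
 * wider-load free-slot probe alluded to in the DRKTODO above: scan the
 * refcount array a 64-bit word at a time and drop to a byte walk only when
 * a word is seen to contain a zero byte. This assumes pcid_ref_t is a
 * single byte and the array is 8-byte aligned; the allocator above uses the
 * simple byte loop.
 */
#if 0
static int
pmap_pcid_scan_free_slot(const pcid_ref_t *refs)
{
	int i;
	for (i = 0; i < PMAP_PCID_MAX_PCID; i += 8) {
		uint64_t w = *(const uint64_t *)(const void *)&refs[i];
		/* Standard "does this word contain a zero byte?" test */
		if ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) {
			int j;
			for (j = 0; j < 8; j++) {
				if ((i + j) >= PMAP_PCID_MIN_PCID && refs[i + j] == 0)
					return (i + j);
			}
		}
	}
	return -1;	/* no free tag; caller falls back to the min-refcount victim */
}
#endif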

void	pmap_pcid_deallocate_pcid(int ccpu, pmap_t tpmap) {
	pcid_t pcid;
	pmap_t lp;
	pcid_ref_t prior_count;

	pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_assert(pcid != PMAP_PCID_INVALID_PCID);
	if (pcid == PMAP_PCID_INVALID_PCID)
		return;

	lp = cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[pcid];
	pmap_assert(pcid > 0 && pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(cpu_datap(ccpu)->cpu_pcid_refcounts[pcid] >= 1);

	if (lp == tpmap)
		(void)__sync_bool_compare_and_swap(&cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[pcid], tpmap, PMAP_INVALID);

	if ((prior_count = __sync_fetch_and_sub(&cpu_datap(ccpu)->cpu_pcid_refcounts[pcid], 1)) == 1) {
		cpu_datap(ccpu)->cpu_pcid_free_hint = pcid;
	}
	pmap_assert(prior_count <= PMAP_PCID_MAX_REFCOUNT);
}

void	pmap_destroy_pcid_sync(pmap_t p) {
	int i;
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	for (i = 0; i < PMAP_PCID_MAX_CPUS; i++)
		if (p->pmap_pcid_cpus[i] != PMAP_PCID_INVALID_PCID)
			pmap_pcid_deallocate_pcid(i, p);
}

pcid_t	pcid_for_pmap_cpu_tuple(pmap_t pmap, int ccpu) {
	return pmap->pmap_pcid_cpus[ccpu];
}
#if PMAP_ASSERT
#define PCID_RECORD_SIZE 128
uint64_t pcid_record_array[PCID_RECORD_SIZE];
#endif

void	pmap_pcid_activate(pmap_t tpmap, int ccpu) {
	pcid_t		new_pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_t		last_pmap;
	boolean_t	pcid_conflict = FALSE, pending_flush = FALSE;

	pmap_assert(cpu_datap(ccpu)->cpu_pmap_pcid_enabled);
	if (__improbable(new_pcid == PMAP_PCID_INVALID_PCID)) {
		new_pcid = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu);
	}
	pmap_assert(new_pcid != PMAP_PCID_INVALID_PCID);
#ifdef	PCID_ASSERT
	cpu_datap(ccpu)->cpu_last_pcid = cpu_datap(ccpu)->cpu_active_pcid;
#endif
	cpu_datap(ccpu)->cpu_active_pcid = new_pcid;

	pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
	if (__probable(pending_flush == FALSE)) {
		last_pmap = cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid];
		pcid_conflict = ((last_pmap != NULL) && (tpmap != last_pmap));
	}
	if (__improbable(pending_flush || pcid_conflict)) {
		pmap_pcid_validate_cpu(tpmap, ccpu);
	}
	/* Consider making this a unique id */
	cpu_datap(ccpu)->cpu_pcid_last_pmap_dispatched[new_pcid] = tpmap;

	pmap_assert(new_pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) || ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0)));
#if	PMAP_ASSERT
	pcid_record_array[ccpu % PCID_RECORD_SIZE] = tpmap->pm_cr3 | new_pcid | (((uint64_t)(!(pending_flush || pcid_conflict))) << 63);
	pml4_entry_t *pml4 = pmap64_pml4(tpmap, 0ULL);
	/* Diagnostic to detect pagetable anchor corruption */
	if (pml4[KERNEL_PML4_INDEX] != kernel_pmap->pm_pml4[KERNEL_PML4_INDEX])
		__asm__ volatile("int3");
#endif	/* PMAP_ASSERT */
	set_cr3_composed(tpmap->pm_cr3, new_pcid, !(pending_flush || pcid_conflict));

	if (!pending_flush) {
		/* We did not previously observe a pending invalidation for this
		 * ASID. However, the load from the coherency vector
		 * could've been reordered ahead of the store to the
		 * active_cr3 field (in the context switch path, our
		 * caller). Re-consult the pending invalidation vector
		 * after the CR3 write. We rely on MOV CR3's documented
		 * serializing property to avoid insertion of an expensive
		 * barrier. (DRK)
		 */
		pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
		if (__improbable(pending_flush != 0)) {
			pmap_pcid_validate_cpu(tpmap, ccpu);
			set_cr3_composed(tpmap->pm_cr3, new_pcid, FALSE);
		}
	}
	cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = &(tpmap->pmap_pcid_coherency_vector[ccpu]);
#if	DEBUG
	KERNEL_DEBUG_CONSTANT(0x9c1d0000, tpmap, new_pcid, pending_flush, pcid_conflict, 0);
#endif
}
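
/*
 * For reference, a minimal sketch (not compiled, hypothetical helper name)
 * of what the set_cr3_composed() calls above amount to; the real helper
 * lives in i386/proc_reg.h. With CR4.PCIDE set, the low 12 bits of the
 * value moved to CR3 select the PCID, and bit 63 ("no-flush") asks the
 * processor to preserve TLB entries tagged with that PCID rather than
 * invalidating them.
 */
#if 0
static inline void
example_set_cr3_composed(uint64_t pm_cr3, uint16_t pcid, boolean_t preserve)
{
	uint64_t cr3 = pm_cr3 | pcid | (((uint64_t)(preserve ? 1 : 0)) << 63);
	__asm__ volatile("mov %0, %%cr3" : : "r" (cr3) : "memory");
}
#endif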