vmm_mem.c revision 221828
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD$
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/param.h>
33#include <sys/lock.h>
34#include <sys/mutex.h>
35#include <sys/linker.h>
36#include <sys/systm.h>
37#include <sys/malloc.h>
38#include <sys/kernel.h>
39
40#include <vm/vm.h>
41#include <vm/pmap.h>
42
43#include <machine/md_var.h>
44#include <machine/metadata.h>
45#include <machine/pc/bios.h>
46#include <machine/vmparam.h>
47#include <machine/pmap.h>
48
49#include "vmm_util.h"
50#include "vmm_mem.h"
51
static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory");

#define	MB		(1024 * 1024)
#define	GB		(1024 * MB)

/* Maximum number of discontiguous physical segments tracked in the pool. */
#define	VMM_MEM_MAXSEGS	64

/* protected by vmm_mem_mtx */
static struct {
	vm_paddr_t	base;	/* starting physical address of the segment */
	vm_size_t	length;	/* size of the segment in bytes */
} vmm_mem_avail[VMM_MEM_MAXSEGS];

/* Number of valid entries in vmm_mem_avail[]; protected by vmm_mem_mtx. */
static int vmm_mem_nsegs;

/* Highest address (exclusive) of any stolen segment; set at boot. */
static vm_paddr_t maxaddr;

/* Serializes all access to vmm_mem_avail[] and vmm_mem_nsegs. */
static struct mtx vmm_mem_mtx;
70
/*
 * Steal any memory that was deliberately hidden from FreeBSD either by
 * the use of MAXMEM kernel config option or the hw.physmem loader tunable.
 *
 * The stolen ranges are recorded in vmm_mem_avail[] / vmm_mem_nsegs and
 * the highest address seen is recorded in 'maxaddr'.  Returns 0 on
 * success or ENOSPC if more than VMM_MEM_MAXSEGS segments are found.
 */
static int
vmm_mem_steal_memory(void)
{
	int nsegs;
	caddr_t kmdp;
	uint32_t smapsize;
	uint64_t base, length;
	struct bios_smap *smapbase, *smap, *smapend;

	/*
	 * Borrowed from hammer_time() and getmemsize() in machdep.c
	 */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");

	/* Locate the BIOS SMAP metadata attached by the loader. */
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
		MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		panic("No BIOS smap info from loader!");

	/* The 32-bit byte size of the SMAP blob precedes the entries. */
	smapsize = *((uint32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	nsegs = 0;
	for (smap = smapbase; smap < smapend; smap++) {
		/*
		 * XXX
		 * Assuming non-overlapping, monotonically increasing
		 * memory segments.
		 */
		if (smap->type != SMAP_TYPE_MEMORY)
			continue;
		if (smap->length == 0)
			break;

		/* Round 'base' up and 'length' down to NBPDR multiples. */
		base = roundup(smap->base, NBPDR);
		length = rounddown(smap->length, NBPDR);

		/* Skip this segment if FreeBSD is using all of it. */
		if (base + length <= ptoa(Maxmem))
			continue;

		/*
		 * If FreeBSD is using part of this segment then adjust
		 * 'base' and 'length' accordingly.
		 */
		if (base < ptoa(Maxmem)) {
			uint64_t used;
			used = roundup(ptoa(Maxmem), NBPDR) - base;
			base += used;
			length -= used;
		}

		/* Nothing left after trimming away FreeBSD's portion. */
		if (length == 0)
			continue;

		vmm_mem_avail[nsegs].base = base;
		vmm_mem_avail[nsegs].length = length;

		/* Track the highest (exclusive) address stolen so far. */
		if (base + length > maxaddr)
			maxaddr = base + length;

		if (0 && bootverbose) {		/* disabled debug output */
			printf("vmm_mem_populate: index %d, base 0x%0lx, "
			       "length %ld\n",
			       nsegs, vmm_mem_avail[nsegs].base,
			       vmm_mem_avail[nsegs].length);
		}

		nsegs++;
		if (nsegs >= VMM_MEM_MAXSEGS) {
			printf("vmm_mem_populate: maximum number of vmm memory "
			       "segments reached!\n");
			return (ENOSPC);
		}
	}

	vmm_mem_nsegs = nsegs;

	return (0);
}
157
/*
 * Map the physical range [start, end) into the kernel's direct map
 * (PHYS_TO_DMAP region) using superpages, allocating page directory
 * pages on demand.  If a mapping already exists for an address it is
 * validated against the expected physical address and attribute bits;
 * a mismatch panics.
 *
 * 'start' and 'end' are expected to be NBPDR-aligned (enforced by the
 * caller, vmm_mem_populate()).
 */
static void
vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t addr, remaining;
	int pdpi, pdi, superpage_size;
	pml4_entry_t *pml4p;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	uint64_t page_attr_bits;

	/* Only addresses below NBPML4 (one PML4 slot) can be mapped. */
	if (end >= NBPML4)
		panic("Cannot map memory beyond %ldGB", NBPML4 / GB);

	/* XXX FreeBSD 8.1 does not use 1G superpages in the direct map */
	if (0 && vmm_supports_1G_pages())
		superpage_size = NBPDP;
	else
		superpage_size = NBPDR;

	/*
	 * Get the page directory pointer page that contains the direct
	 * map address mappings.
	 */
	pml4p = kernel_pmap->pm_pml4;
	pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK);

	/* Writable, valid, superpage, global. */
	page_attr_bits = PG_RW | PG_V | PG_PS | PG_G;
	addr = start;
	while (addr < end) {
		remaining = end - addr;
		pdpi = addr / NBPDP;	/* index into the PDP page */
		if (superpage_size == NBPDP &&
		    remaining >= NBPDP &&
		    addr % NBPDP == 0) {
			/* 1GB page path - disabled by the '0 &&' above. */
			/*
			 * If there isn't a mapping for this address then
			 * create one but if there is one already make sure
			 * it matches what we expect it to be.
			 */
			if (pdp[pdpi] == 0) {
				pdp[pdpi] = addr | page_attr_bits;
				if (0 && bootverbose) {
					printf("vmm_mem_populate: mapping "
					       "0x%lx with 1GB page at "
					       "pdpi %d\n", addr, pdpi);
				}
			} else {
				pdp_entry_t pdpe = pdp[pdpi];
				if ((pdpe & ~PAGE_MASK) != addr ||
				    (pdpe & page_attr_bits) != page_attr_bits) {
					panic("An invalid mapping 0x%016lx "
					      "already exists for 0x%016lx\n",
					      pdpe, addr);
				}
			}
			addr += NBPDP;
		} else {
			/* 2MB page path. */
			if (remaining < NBPDR) {
				panic("vmm_mem_populate: remaining (%ld) must "
				      "be greater than NBPDR (%d)\n",
				      remaining, NBPDR);
			}
			if (pdp[pdpi] == 0) {
				/*
				 * XXX we lose this memory forever because
				 * we do not keep track of the virtual address
				 * that would be required to free this page.
				 */
				pd = malloc(PAGE_SIZE, M_VMM_MEM,
					    M_WAITOK | M_ZERO);
				/*
				 * A PAGE_SIZE allocation is expected to be
				 * page aligned; verify before installing it.
				 */
				if ((uintptr_t)pd & PAGE_MASK) {
					panic("vmm_mem_populate: page directory"
					      "page not aligned on %d "
					      "boundary\n", PAGE_SIZE);
				}
				pdp[pdpi] = vtophys(pd);
				pdp[pdpi] |= PG_RW | PG_V | PG_U;
				if (0 && bootverbose) {
					printf("Creating page directory "
					       "at pdp index %d for 0x%016lx\n",
					       pdpi, addr);
				}
			}
			pdi = (addr % NBPDP) / NBPDR;	/* index into the PD */
			pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK);

			/*
			 * Create a new mapping if one doesn't already exist
			 * or validate it if it does.
			 */
			if (pd[pdi] == 0) {
				pd[pdi] = addr | page_attr_bits;
				if (0 && bootverbose) {
					printf("vmm_mem_populate: mapping "
					       "0x%lx with 2MB page at "
					       "pdpi %d, pdi %d\n",
					       addr, pdpi, pdi);
				}
			} else {
				pd_entry_t pde = pd[pdi];
				if ((pde & ~PAGE_MASK) != addr ||
				    (pde & page_attr_bits) != page_attr_bits) {
					panic("An invalid mapping 0x%016lx "
					      "already exists for 0x%016lx\n",
					      pde, addr);
				}
			}
			addr += NBPDR;
		}
	}
}
269
270static int
271vmm_mem_populate(void)
272{
273	int seg, error;
274	vm_paddr_t start, end;
275
276	/* populate the vmm_mem_avail[] array */
277	error = vmm_mem_steal_memory();
278	if (error)
279		return (error);
280
281	/*
282	 * Now map the memory that was hidden from FreeBSD in
283	 * the direct map VA space.
284	 */
285	for (seg = 0; seg < vmm_mem_nsegs; seg++) {
286		start = vmm_mem_avail[seg].base;
287		end = start + vmm_mem_avail[seg].length;
288		if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) {
289			panic("start (0x%016lx) and end (0x%016lx) must be "
290			      "aligned on a %dMB boundary\n",
291			      start, end, NBPDR / MB);
292		}
293		vmm_mem_direct_map(start, end);
294	}
295
296	return (0);
297}
298
299int
300vmm_mem_init(void)
301{
302	int error;
303
304	mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF);
305
306	error = vmm_mem_populate();
307	if (error)
308		return (error);
309
310	return (0);
311}
312
313vm_paddr_t
314vmm_mem_alloc(size_t size)
315{
316	int i;
317	vm_paddr_t addr;
318
319	if ((size & PDRMASK) != 0) {
320		panic("vmm_mem_alloc: size 0x%0lx must be "
321		      "aligned on a 0x%0x boundary\n", size, NBPDR);
322	}
323
324	addr = 0;
325
326	mtx_lock(&vmm_mem_mtx);
327	for (i = 0; i < vmm_mem_nsegs; i++) {
328		if (vmm_mem_avail[i].length >= size) {
329			addr = vmm_mem_avail[i].base;
330			vmm_mem_avail[i].base += size;
331			vmm_mem_avail[i].length -= size;
332			/* remove a zero length segment */
333			if (vmm_mem_avail[i].length == 0) {
334				memmove(&vmm_mem_avail[i],
335					&vmm_mem_avail[i + 1],
336					(vmm_mem_nsegs - (i + 1)) *
337					 sizeof(vmm_mem_avail[0]));
338				vmm_mem_nsegs--;
339			}
340			break;
341		}
342	}
343	mtx_unlock(&vmm_mem_mtx);
344
345	return (addr);
346}
347
348void
349vmm_mem_free(vm_paddr_t base, size_t length)
350{
351	int i;
352
353	if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) {
354		panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be "
355		      "aligned on a 0x%0x boundary\n", base, length, NBPDR);
356	}
357
358	mtx_lock(&vmm_mem_mtx);
359
360	for (i = 0; i < vmm_mem_nsegs; i++) {
361		if (vmm_mem_avail[i].base > base)
362			break;
363	}
364
365	if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS)
366		panic("vmm_mem_free: cannot free any more segments");
367
368	/* Create a new segment at index 'i' */
369	memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i],
370		(vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0]));
371
372	vmm_mem_avail[i].base = base;
373	vmm_mem_avail[i].length = length;
374
375	vmm_mem_nsegs++;
376
377coalesce_some_more:
378	for (i = 0; i < vmm_mem_nsegs - 1; i++) {
379		if (vmm_mem_avail[i].base + vmm_mem_avail[i].length ==
380		    vmm_mem_avail[i + 1].base) {
381			vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length;
382			memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2],
383			  (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0]));
384			vmm_mem_nsegs--;
385			goto coalesce_some_more;
386		}
387	}
388
389	mtx_unlock(&vmm_mem_mtx);
390}
391
/*
 * Return the highest physical address (exclusive) of any memory stolen
 * from FreeBSD, as recorded by vmm_mem_steal_memory() at boot.
 */
vm_paddr_t
vmm_mem_maxaddr(void)
{

	return (maxaddr);
}
398
399void
400vmm_mem_dump(void)
401{
402	int i;
403	vm_paddr_t base;
404	vm_size_t length;
405
406	mtx_lock(&vmm_mem_mtx);
407	for (i = 0; i < vmm_mem_nsegs; i++) {
408		base = vmm_mem_avail[i].base;
409		length = vmm_mem_avail[i].length;
410		printf("%-4d0x%016lx    0x%016lx\n", i, base, base + length);
411	}
412	mtx_unlock(&vmm_mem_mtx);
413}
414