/* vmm_mem.c -- revision 239700 */
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
2834495Sjdp
2934495Sjdp#include <sys/cdefs.h>
3033975Sjdp__FBSDID("$FreeBSD$");
3133975Sjdp
3234495Sjdp#include <sys/param.h>
3334495Sjdp#include <sys/lock.h>
3434495Sjdp#include <sys/mutex.h>
3534495Sjdp#include <sys/linker.h>
3634495Sjdp#include <sys/systm.h>
3734495Sjdp#include <sys/malloc.h>
3834495Sjdp#include <sys/kernel.h>
3934495Sjdp
40#include <vm/vm.h>
41#include <vm/pmap.h>
42
43#include <machine/md_var.h>
44#include <machine/metadata.h>
45#include <machine/pc/bios.h>
46#include <machine/vmparam.h>
47#include <machine/pmap.h>
48
49#include "vmm_util.h"
50#include "vmm_mem.h"
51
52static MALLOC_DEFINE(M_VMM_MEM, "vmm memory", "vmm memory");
53
54#define	MB		(1024 * 1024)
55#define	GB		(1024 * MB)
56
57#define	VMM_MEM_MAXSEGS	64
58
59/* protected by vmm_mem_mtx */
60static struct {
61	vm_paddr_t	base;
62	vm_size_t	length;
63} vmm_mem_avail[VMM_MEM_MAXSEGS];
64
65static int vmm_mem_nsegs;
66size_t vmm_mem_total_bytes;
67
68static vm_paddr_t maxaddr;
69
70static struct mtx vmm_mem_mtx;
71
72/*
73 * Steal any memory that was deliberately hidden from FreeBSD either by
74 * the use of MAXMEM kernel config option or the hw.physmem loader tunable.
75 */
76static int
77vmm_mem_steal_memory(void)
78{
79	int nsegs;
80	caddr_t kmdp;
81	uint32_t smapsize;
82	uint64_t base, length;
83	struct bios_smap *smapbase, *smap, *smapend;
84
85	/*
86	 * Borrowed from hammer_time() and getmemsize() in machdep.c
87	 */
88	kmdp = preload_search_by_type("elf kernel");
89	if (kmdp == NULL)
90		kmdp = preload_search_by_type("elf64 kernel");
91
92	smapbase = (struct bios_smap *)preload_search_info(kmdp,
93		MODINFO_METADATA | MODINFOMD_SMAP);
94	if (smapbase == NULL)
95		panic("No BIOS smap info from loader!");
96
97	smapsize = *((uint32_t *)smapbase - 1);
98	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
99
100	vmm_mem_total_bytes = 0;
101	nsegs = 0;
102	for (smap = smapbase; smap < smapend; smap++) {
103		/*
104		 * XXX
105		 * Assuming non-overlapping, monotonically increasing
106		 * memory segments.
107		 */
108		if (smap->type != SMAP_TYPE_MEMORY)
109			continue;
110		if (smap->length == 0)
111			break;
112
113		base = roundup(smap->base, NBPDR);
114		length = rounddown(smap->length, NBPDR);
115
116		/* Skip this segment if FreeBSD is using all of it. */
117		if (base + length <= ptoa(Maxmem))
118			continue;
119
120		/*
121		 * If FreeBSD is using part of this segment then adjust
122		 * 'base' and 'length' accordingly.
123		 */
124		if (base < ptoa(Maxmem)) {
125			uint64_t used;
126			used = roundup(ptoa(Maxmem), NBPDR) - base;
127			base += used;
128			length -= used;
129		}
130
131		if (length == 0)
132			continue;
133
134		vmm_mem_avail[nsegs].base = base;
135		vmm_mem_avail[nsegs].length = length;
136		vmm_mem_total_bytes += length;
137
138		if (base + length > maxaddr)
139			maxaddr = base + length;
140
141		if (0 && bootverbose) {
142			printf("vmm_mem_populate: index %d, base 0x%0lx, "
143			       "length %ld\n",
144			       nsegs, vmm_mem_avail[nsegs].base,
145			       vmm_mem_avail[nsegs].length);
146		}
147
148		nsegs++;
149		if (nsegs >= VMM_MEM_MAXSEGS) {
150			printf("vmm_mem_populate: maximum number of vmm memory "
151			       "segments reached!\n");
152			return (ENOSPC);
153		}
154	}
155
156	vmm_mem_nsegs = nsegs;
157
158	return (0);
159}
160
/*
 * Map the physical address range [start, end) into the kernel's direct
 * map (DMAP) region using superpages: 1GB pages when the CPU supports
 * them (vmm_supports_1G_pages()), 2MB pages otherwise.
 *
 * Existing page-table entries are validated rather than overwritten: an
 * entry already covering 'addr' must map exactly 'addr' with the
 * expected attribute bits, or we panic.
 */
static void
vmm_mem_direct_map(vm_paddr_t start, vm_paddr_t end)
{
	vm_paddr_t addr, remaining;
	int pdpi, pdi, superpage_size;
	pml4_entry_t *pml4p;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	uint64_t page_attr_bits;

	/* Only the single DMAP PML4 slot (NBPML4 bytes of PA) is walked. */
	if (end >= NBPML4)
		panic("Cannot map memory beyond %ldGB", NBPML4 / GB);

	if (vmm_supports_1G_pages())
		superpage_size = NBPDP;
	else
		superpage_size = NBPDR;

	/*
	 * Get the page directory pointer page that contains the direct
	 * map address mappings.
	 */
	pml4p = kernel_pmap->pm_pml4;
	pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4p[DMPML4I] & ~PAGE_MASK);

	/* Writable, valid, superpage, global. */
	page_attr_bits = PG_RW | PG_V | PG_PS | PG_G;
	addr = start;
	while (addr < end) {
		remaining = end - addr;
		pdpi = addr / NBPDP;
		/* 1GB page only if addr is 1GB-aligned and a full 1GB fits. */
		if (superpage_size == NBPDP &&
		    remaining >= NBPDP &&
		    addr % NBPDP == 0) {
			/*
			 * If there isn't a mapping for this address then
			 * create one but if there is one already make sure
			 * it matches what we expect it to be.
			 */
			if (pdp[pdpi] == 0) {
				pdp[pdpi] = addr | page_attr_bits;
				if (0 && bootverbose) {
					printf("vmm_mem_populate: mapping "
					       "0x%lx with 1GB page at "
					       "pdpi %d\n", addr, pdpi);
				}
			} else {
				pdp_entry_t pdpe = pdp[pdpi];
				if ((pdpe & ~PAGE_MASK) != addr ||
				    (pdpe & page_attr_bits) != page_attr_bits) {
					panic("An invalid mapping 0x%016lx "
					      "already exists for 0x%016lx\n",
					      pdpe, addr);
				}
			}
			addr += NBPDP;
		} else {
			/*
			 * 2MB path.  Segments are trimmed to NBPDR bounds
			 * before this is called, so a partial 2MB chunk
			 * indicates a caller bug.
			 */
			if (remaining < NBPDR) {
				panic("vmm_mem_populate: remaining (%ld) must "
				      "be greater than NBPDR (%d)\n",
				      remaining, NBPDR);
			}
			if (pdp[pdpi] == 0) {
				/*
				 * XXX we lose this memory forever because
				 * we do not keep track of the virtual address
				 * that would be required to free this page.
				 */
				pd = malloc(PAGE_SIZE, M_VMM_MEM,
					    M_WAITOK | M_ZERO);
				/*
				 * Assumes kernel malloc(PAGE_SIZE) returns a
				 * page-aligned buffer -- verified at runtime.
				 */
				if ((uintptr_t)pd & PAGE_MASK) {
					panic("vmm_mem_populate: page directory"
					      "page not aligned on %d "
					      "boundary\n", PAGE_SIZE);
				}
				pdp[pdpi] = vtophys(pd);
				/*
				 * NOTE(review): PG_U (user) on a kernel page
				 * directory entry looks suspicious -- confirm
				 * it is intended.
				 */
				pdp[pdpi] |= PG_RW | PG_V | PG_U;
				if (0 && bootverbose) {
					printf("Creating page directory "
					       "at pdp index %d for 0x%016lx\n",
					       pdpi, addr);
				}
			}
			pdi = (addr % NBPDP) / NBPDR;
			pd = (pd_entry_t *)PHYS_TO_DMAP(pdp[pdpi] & ~PAGE_MASK);

			/*
			 * Create a new mapping if one doesn't already exist
			 * or validate it if it does.
			 */
			if (pd[pdi] == 0) {
				pd[pdi] = addr | page_attr_bits;
				if (0 && bootverbose) {
					printf("vmm_mem_populate: mapping "
					       "0x%lx with 2MB page at "
					       "pdpi %d, pdi %d\n",
					       addr, pdpi, pdi);
				}
			} else {
				pd_entry_t pde = pd[pdi];
				if ((pde & ~PAGE_MASK) != addr ||
				    (pde & page_attr_bits) != page_attr_bits) {
					panic("An invalid mapping 0x%016lx "
					      "already exists for 0x%016lx\n",
					      pde, addr);
				}
			}
			addr += NBPDR;
		}
	}
}
271
272static int
273vmm_mem_populate(void)
274{
275	int seg, error;
276	vm_paddr_t start, end;
277
278	/* populate the vmm_mem_avail[] array */
279	error = vmm_mem_steal_memory();
280	if (error)
281		return (error);
282
283	/*
284	 * Now map the memory that was hidden from FreeBSD in
285	 * the direct map VA space.
286	 */
287	for (seg = 0; seg < vmm_mem_nsegs; seg++) {
288		start = vmm_mem_avail[seg].base;
289		end = start + vmm_mem_avail[seg].length;
290		if ((start & PDRMASK) != 0 || (end & PDRMASK) != 0) {
291			panic("start (0x%016lx) and end (0x%016lx) must be "
292			      "aligned on a %dMB boundary\n",
293			      start, end, NBPDR / MB);
294		}
295		vmm_mem_direct_map(start, end);
296	}
297
298	return (0);
299}
300
301int
302vmm_mem_init(void)
303{
304	int error;
305
306	mtx_init(&vmm_mem_mtx, "vmm_mem_mtx", NULL, MTX_DEF);
307
308	error = vmm_mem_populate();
309	if (error)
310		return (error);
311
312	return (0);
313}
314
315vm_paddr_t
316vmm_mem_alloc(size_t size)
317{
318	int i;
319	vm_paddr_t addr;
320
321	if ((size & PDRMASK) != 0) {
322		panic("vmm_mem_alloc: size 0x%0lx must be "
323		      "aligned on a 0x%0x boundary\n", size, NBPDR);
324	}
325
326	addr = 0;
327
328	mtx_lock(&vmm_mem_mtx);
329	for (i = 0; i < vmm_mem_nsegs; i++) {
330		if (vmm_mem_avail[i].length >= size) {
331			addr = vmm_mem_avail[i].base;
332			vmm_mem_avail[i].base += size;
333			vmm_mem_avail[i].length -= size;
334			/* remove a zero length segment */
335			if (vmm_mem_avail[i].length == 0) {
336				memmove(&vmm_mem_avail[i],
337					&vmm_mem_avail[i + 1],
338					(vmm_mem_nsegs - (i + 1)) *
339					 sizeof(vmm_mem_avail[0]));
340				vmm_mem_nsegs--;
341			}
342			break;
343		}
344	}
345	mtx_unlock(&vmm_mem_mtx);
346
347	return (addr);
348}
349
350size_t
351vmm_mem_get_mem_total(void)
352{
353	return vmm_mem_total_bytes;
354}
355
356size_t
357vmm_mem_get_mem_free(void)
358{
359	size_t length = 0;
360	int i;
361
362	mtx_lock(&vmm_mem_mtx);
363	for (i = 0; i < vmm_mem_nsegs; i++) {
364		length += vmm_mem_avail[i].length;
365	}
366	mtx_unlock(&vmm_mem_mtx);
367
368	return(length);
369}
370
371void
372vmm_mem_free(vm_paddr_t base, size_t length)
373{
374	int i;
375
376	if ((base & PDRMASK) != 0 || (length & PDRMASK) != 0) {
377		panic("vmm_mem_free: base 0x%0lx and length 0x%0lx must be "
378		      "aligned on a 0x%0x boundary\n", base, length, NBPDR);
379	}
380
381	mtx_lock(&vmm_mem_mtx);
382
383	for (i = 0; i < vmm_mem_nsegs; i++) {
384		if (vmm_mem_avail[i].base > base)
385			break;
386	}
387
388	if (vmm_mem_nsegs >= VMM_MEM_MAXSEGS)
389		panic("vmm_mem_free: cannot free any more segments");
390
391	/* Create a new segment at index 'i' */
392	memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i],
393		(vmm_mem_nsegs - i) * sizeof(vmm_mem_avail[0]));
394
395	vmm_mem_avail[i].base = base;
396	vmm_mem_avail[i].length = length;
397
398	vmm_mem_nsegs++;
399
400coalesce_some_more:
401	for (i = 0; i < vmm_mem_nsegs - 1; i++) {
402		if (vmm_mem_avail[i].base + vmm_mem_avail[i].length ==
403		    vmm_mem_avail[i + 1].base) {
404			vmm_mem_avail[i].length += vmm_mem_avail[i + 1].length;
405			memmove(&vmm_mem_avail[i + 1], &vmm_mem_avail[i + 2],
406			  (vmm_mem_nsegs - (i + 2)) * sizeof(vmm_mem_avail[0]));
407			vmm_mem_nsegs--;
408			goto coalesce_some_more;
409		}
410	}
411
412	mtx_unlock(&vmm_mem_mtx);
413}
414
415vm_paddr_t
416vmm_mem_maxaddr(void)
417{
418
419	return (maxaddr);
420}
421
422void
423vmm_mem_dump(void)
424{
425	int i;
426	vm_paddr_t base;
427	vm_size_t length;
428
429	mtx_lock(&vmm_mem_mtx);
430	for (i = 0; i < vmm_mem_nsegs; i++) {
431		base = vmm_mem_avail[i].base;
432		length = vmm_mem_avail[i].length;
433		printf("%-4d0x%016lx    0x%016lx\n", i, base, base + length);
434	}
435	mtx_unlock(&vmm_mem_mtx);
436}
437