vm_kern.c revision 15722
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_kern.c	8.3 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $Id: vm_kern.c,v 1.23 1996/04/24 04:16:44 dyson Exp $
 */

/*
 *	Kernel memory management.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>

vm_map_t buffer_map;
vm_map_t kernel_map;
vm_map_t kmem_map;
vm_map_t mb_map;
int mb_map_full;
vm_map_t mcl_map;
int mcl_map_full;
vm_map_t io_map;
vm_map_t clean_map;
vm_map_t phys_map;
vm_map_t exec_map;
vm_map_t u_map;

/*
 *	kmem_alloc_pageable:
 *
 *	Allocate pageable memory to the kernel's address map.
 *	"map" must be kernel_map or a submap of kernel_map.
 */

vm_offset_t
kmem_alloc_pageable(map, size)
	vm_map_t map;
	register vm_size_t size;
{
	vm_offset_t addr;
	register int result;

	size = round_page(size);
	addr = vm_map_min(map);
	result = vm_map_find(map, NULL, (vm_offset_t) 0,
	    &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0);
	if (result != KERN_SUCCESS) {
		return (0);
	}
	return (addr);
}
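
/*
 * Example (an illustrative sketch added for exposition, not part of the
 * original source): a caller wanting a page-aligned, pageable scratch
 * region might write
 *
 *	vm_offset_t va;
 *
 *	va = kmem_alloc_pageable(kernel_map, 8192);
 *	if (va == 0)
 *		return (ENOMEM);
 *
 * Only address space is reserved here; backing pages are faulted in on
 * first reference.  The size is rounded up to whole pages internally.
 */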

/*
 *	Allocate wired-down memory in the kernel's address map
 *	or a submap.
 */
vm_offset_t
kmem_alloc(map, size)
	register vm_map_t map;
	register vm_size_t size;
{
	vm_offset_t addr;
	register vm_offset_t offset;
	vm_offset_t i;

	size = round_page(size);

	/*
	 * Use the kernel object for wired-down kernel pages. Assume that no
	 * region of the kernel object is referenced more than once.
	 */

	/*
	 * Locate sufficient space in the map.  This will give us the final
	 * virtual address for the new memory, and thus will tell us the
	 * offset within the kernel map.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, 0, size, &addr)) {
		vm_map_unlock(map);
		return (0);
	}
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(kernel_object);
	vm_map_insert(map, kernel_object, offset, addr, addr + size,
		VM_PROT_ALL, VM_PROT_ALL, 0);
	vm_map_unlock(map);

	/*
	 * Guarantee that there are pages already in this object before
	 * calling vm_map_pageable.  This is to prevent the following
	 * scenario:
	 *
	 * 1) Threads have swapped out, so that there is a pager for the
	 *    kernel_object.
	 * 2) The kmsg zone is empty, and so we are kmem_allocing a new page
	 *    for it.
	 * 3) vm_map_pageable calls vm_fault; there is no page, but there is
	 *    a pager, so we call pager_data_request.  But the kmsg zone is
	 *    empty, so we must kmem_alloc.
	 * 4) goto 1
	 * 5) Even if the kmsg zone is not empty: when we get the data back
	 *    from the pager, it will be (very stale) non-zero data.
	 *    kmem_alloc is defined to return zero-filled memory.
	 *
	 * We're intentionally not activating the pages we allocate to prevent a
	 * race with page-out.  vm_map_pageable will wire the pages.
	 */

	for (i = 0; i < size; i += PAGE_SIZE) {
		vm_page_t mem;

		while ((mem = vm_page_alloc(kernel_object,
			OFF_TO_IDX(offset + i), VM_ALLOC_ZERO)) == NULL) {
			VM_WAIT;
		}
		if ((mem->flags & PG_ZERO) == 0)
			vm_page_zero_fill(mem);
		mem->flags &= ~(PG_BUSY|PG_ZERO);
		mem->valid = VM_PAGE_BITS_ALL;
	}

	/*
	 * And finally, mark the data as non-pageable.
	 */

	(void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE);

	/*
	 * Try to coalesce the map
	 */
	vm_map_simplify(map, addr);

	return (addr);
}

/*
 *	kmem_free:
 *
 *	Release a region of kernel virtual memory allocated
 *	with kmem_alloc, and return the physical pages
 *	associated with that region.
 */
void
kmem_free(map, addr, size)
	vm_map_t map;
	register vm_offset_t addr;
	vm_size_t size;
{
	(void) vm_map_remove(map, trunc_page(addr), round_page(addr + size));
}
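
/*
 * Example (an illustrative sketch added for exposition, not part of the
 * original source): kmem_alloc() and kmem_free() pair up on the same
 * map and size:
 *
 *	vm_offset_t va;
 *
 *	va = kmem_alloc(kernel_map, 2 * PAGE_SIZE);
 *	if (va != 0) {
 *		... use the wired, zero-filled pages at va ...
 *		kmem_free(kernel_map, va, 2 * PAGE_SIZE);
 *	}
 */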

/*
 *	kmem_suballoc:
 *
 *	Allocates a map to manage a subrange
 *	of the kernel virtual address space.
 *
 *	Arguments are as follows:
 *
 *	parent		Map to take range from
 *	min, max	Returned endpoints of map
 *	size		Size of range to find
 *	pageable	Can the region be paged
 */
vm_map_t
kmem_suballoc(parent, min, max, size, pageable)
	register vm_map_t parent;
	vm_offset_t *min, *max;
	register vm_size_t size;
	boolean_t pageable;
{
	register int ret;
	vm_map_t result;

	size = round_page(size);

	*min = (vm_offset_t) vm_map_min(parent);
	ret = vm_map_find(parent, NULL, (vm_offset_t) 0,
	    min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0);
	if (ret != KERN_SUCCESS) {
		printf("kmem_suballoc: bad status return of %d.\n", ret);
		panic("kmem_suballoc");
	}
	*max = *min + size;
	pmap_reference(vm_map_pmap(parent));
	result = vm_map_create(vm_map_pmap(parent), *min, *max, pageable);
	if (result == NULL)
		panic("kmem_suballoc: cannot create submap");
	if ((ret = vm_map_submap(parent, *min, *max, result)) != KERN_SUCCESS)
		panic("kmem_suballoc: unable to change range to submap");
	return (result);
}
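
/*
 * Example (an illustrative sketch added for exposition, not part of the
 * original source): machine-dependent startup code typically carves the
 * special-purpose submaps declared at the top of this file out of
 * kernel_map with this routine, along the lines of
 *
 *	vm_offset_t minaddr, maxaddr;
 *
 *	exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
 *	    16 * PAGE_SIZE, TRUE);
 *
 * The size shown is arbitrary; the real sizes are computed at boot.
 */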

/*
 * Allocate wired-down memory in the kernel's address map for the higher
 * level kernel memory allocator (kern/kern_malloc.c).  We cannot use
 * kmem_alloc() because we may need to allocate memory at interrupt
 * level where we cannot block (canwait == FALSE).
 *
 * This routine has its own private kernel submap (kmem_map) and object
 * (kmem_object).  This, combined with the fact that only malloc uses
 * this routine, ensures that we will never block in map or object waits.
 *
 * Note that this still only works in a uni-processor environment and
 * when called at splhigh().
 *
 * We don't worry about expanding the map (adding entries) since entries
 * for wired maps are statically allocated.
 */
vm_offset_t
kmem_malloc(map, size, waitflag)
	register vm_map_t map;
	register vm_size_t size;
	boolean_t waitflag;
{
	register vm_offset_t offset, i;
	vm_map_entry_t entry;
	vm_offset_t addr;
	vm_page_t m;

	if (map != kmem_map && map != mb_map && map != mcl_map)
		panic("kmem_malloc: map != {kmem,mb,mcl}_map");

	size = round_page(size);
	addr = vm_map_min(map);

	/*
	 * Locate sufficient space in the map.  This will give us the final
	 * virtual address for the new memory, and thus will tell us the
	 * offset within the kernel map.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, 0, size, &addr)) {
		vm_map_unlock(map);
		if (map == mb_map) {
			mb_map_full = TRUE;
			log(LOG_ERR, "Out of mbufs - increase maxusers!\n");
			return (0);
		}
		if (map == mcl_map) {
			mcl_map_full = TRUE;
			log(LOG_ERR,
			    "Out of mbuf clusters - increase maxusers!\n");
			return (0);
		}
		if (waitflag == M_WAITOK)
			panic("kmem_malloc: kmem_map too small");
		return (0);
	}
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(kmem_object);
	vm_map_insert(map, kmem_object, offset, addr, addr + size,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	/*
	 * If we can wait, just mark the range as wired (will fault pages as
	 * necessary).
	 */
	if (waitflag == M_WAITOK) {
		vm_map_unlock(map);
		(void) vm_map_pageable(map, (vm_offset_t) addr, addr + size,
		    FALSE);
		vm_map_simplify(map, addr);
		return (addr);
	}
	/*
	 * If we cannot wait then we must allocate all memory up front,
	 * pulling it off the active queue to prevent pageout.
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i),
			(waitflag == M_NOWAIT) ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM);

		/*
		 * Ran out of space, free everything up and return. Don't need
		 * to lock page queues here as we know that the pages we got
		 * aren't on any queues.
		 */
		if (m == NULL) {
			while (i != 0) {
				i -= PAGE_SIZE;
				m = vm_page_lookup(kmem_object,
					OFF_TO_IDX(offset + i));
				vm_page_free(m);
			}
			vm_map_delete(map, addr, addr + size);
			vm_map_unlock(map);
			return (0);
		}
		m->flags &= ~(PG_BUSY|PG_ZERO);
		m->valid = VM_PAGE_BITS_ALL;
	}

	/*
	 * Mark map entry as non-pageable. Assert: vm_map_insert() will never
	 * be able to extend the previous entry so there will be a new entry
	 * exactly corresponding to this address range and it will have
	 * wired_count == 0.
	 */
	if (!vm_map_lookup_entry(map, addr, &entry) ||
	    entry->start != addr || entry->end != addr + size ||
	    entry->wired_count)
		panic("kmem_malloc: entry not found or misaligned");
	entry->wired_count++;

	/*
	 * Loop thru pages, entering them in the pmap. (We cannot add them to
	 * the wired count without wrapping the vm_page_queue_lock in
	 * splimp...)
	 */
	for (i = 0; i < size; i += PAGE_SIZE) {
		m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i));
		vm_page_wire(m);
		pmap_kenter(addr + i, VM_PAGE_TO_PHYS(m));
	}
	vm_map_unlock(map);

	vm_map_simplify(map, addr);
	return (addr);
}
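
/*
 * Example (an illustrative sketch added for exposition; see
 * kern/kern_malloc.c for the real caller): malloc() refills a bucket
 * with something like
 *
 *	va = kmem_malloc(kmem_map, (vm_size_t)ctob(npg), flags);
 *
 * where npg is the bucket's page count and flags is M_WAITOK or
 * M_NOWAIT; a zero return under M_NOWAIT means the caller must cope
 * with the failure itself.
 */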

/*
 *	kmem_alloc_wait
 *
 *	Allocates pageable memory from a sub-map of the kernel.  If the submap
 *	has no room, the caller sleeps waiting for more memory in the submap.
 */
vm_offset_t
kmem_alloc_wait(map, size)
	vm_map_t map;
	vm_size_t size;
{
	vm_offset_t addr;

	size = round_page(size);

	for (;;) {
		/*
		 * To make this work for more than one map, use the map's lock
		 * to lock out sleepers/wakers.
		 */
		vm_map_lock(map);
		if (vm_map_findspace(map, 0, size, &addr) == 0)
			break;
		/* no space now; see if we can ever get space */
		if (vm_map_max(map) - vm_map_min(map) < size) {
			vm_map_unlock(map);
			return (0);
		}
		vm_map_unlock(map);
		tsleep(map, PVM, "kmaw", 0);
	}
	vm_map_insert(map, NULL, (vm_offset_t) 0,
	    addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
	vm_map_unlock(map);
	return (addr);
}

/*
 *	kmem_free_wakeup
 *
 *	Returns memory to a submap of the kernel, and wakes up any processes
 *	waiting for memory in that map.
 */
void
kmem_free_wakeup(map, addr, size)
	vm_map_t map;
	vm_offset_t addr;
	vm_size_t size;
{
	vm_map_lock(map);
	(void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
	wakeup(map);
	vm_map_unlock(map);
}
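
/*
 * Example (an illustrative sketch added for exposition, not part of the
 * original source): kmem_alloc_wait() and kmem_free_wakeup() cooperate
 * on a submap such as phys_map; the allocation may sleep until a free
 * posts a wakeup:
 *
 *	vm_offset_t kva;
 *
 *	kva = kmem_alloc_wait(phys_map, len);
 *	... use kva ...
 *	kmem_free_wakeup(phys_map, kva, len);
 */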

/*
 * Create the kernel map; insert a mapping covering kernel text, data, bss,
 * and all space allocated thus far (`bootstrap' data).  The new map will thus
 * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and
 * the range between `start' and `end' as free.
 */
void
kmem_init(start, end)
	vm_offset_t start, end;
{
	register vm_map_t m;

	m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end, FALSE);
	vm_map_lock(m);
	/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
	kernel_map = m;
	(void) vm_map_insert(m, NULL, (vm_offset_t) 0,
	    VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, 0);
	/* ... and ending with the completion of the above `insert' */
	vm_map_unlock(m);
}
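
/*
 * Example (an illustrative sketch added for exposition, not part of the
 * original source): VM startup code calls this once, after the
 * bootstrap allocator has finished, with the bounds of the remaining
 * free kernel virtual space, e.g.
 *
 *	kmem_init(virtual_avail, virtual_end);
 *
 * where virtual_avail and virtual_end are the machine-dependent limits
 * left over from early boot.
 */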
47355839Sasmodai