vm_kern.c revision 248084
/*-
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_kern.c	8.3 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	Kernel memory management.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/vm/vm_kern.c 248084 2013-03-09 02:32:23Z attilio $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>		/* for ticks and hz */
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

vm_map_t kernel_map=0;
vm_map_t kmem_map=0;
vm_map_t exec_map=0;
vm_map_t pipe_map;
vm_map_t buffer_map=0;

const void *zero_region;
CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);

SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
    NULL, VM_MIN_KERNEL_ADDRESS, "Min kernel address");

SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
#if defined(__arm__) || defined(__sparc64__)
    &vm_max_kernel_address, 0,
#else
    NULL, VM_MAX_KERNEL_ADDRESS,
#endif
    "Max kernel address");

/*
 *	kmem_alloc_nofault:
 *
 *	Allocate a virtual address range with no underlying object and
 *	no initial mapping to physical memory.  Any mapping from this
 *	range to physical memory must be explicitly created prior to
 *	its use, typically with pmap_qenter().  Any attempt to create
 *	a mapping on demand through vm_fault() will result in a panic.
 */
vm_offset_t
kmem_alloc_nofault(map, size)
	vm_map_t map;
	vm_size_t size;
{
	vm_offset_t addr;
	int result;

	size = round_page(size);
	addr = vm_map_min(map);
	result = vm_map_find(map, NULL, 0, &addr, size, VMFS_ANY_SPACE,
	    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
	if (result != KERN_SUCCESS) {
		return (0);
	}
	return (addr);
}

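/*
 * Example (illustrative sketch only, not compiled): a typical consumer of
 * kmem_alloc_nofault() reserves KVA and then installs physical pages
 * explicitly with pmap_qenter(), much as kmem_init_zero_region() does
 * below.  The single-page allocation policy shown here is an assumption
 * made purely for illustration.
 *
 *	vm_offset_t va;
 *	vm_page_t m;
 *
 *	va = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
 *	if (va == 0)
 *		return (ENOMEM);
 *	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 *	    VM_ALLOC_WIRED);
 *	if (m != NULL)
 *		pmap_qenter(va, &m, 1);
 */
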
/*
 *	kmem_alloc_nofault_space:
 *
 *	Allocate a virtual address range with no underlying object and
 *	no initial mapping to physical memory within the specified
 *	address space.  Any mapping from this range to physical memory
 *	must be explicitly created prior to its use, typically with
 *	pmap_qenter().  Any attempt to create a mapping on demand
 *	through vm_fault() will result in a panic.
 */
vm_offset_t
kmem_alloc_nofault_space(map, size, find_space)
	vm_map_t map;
	vm_size_t size;
	int find_space;
{
	vm_offset_t addr;
	int result;

	size = round_page(size);
	addr = vm_map_min(map);
	result = vm_map_find(map, NULL, 0, &addr, size, find_space,
	    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
	if (result != KERN_SUCCESS) {
		return (0);
	}
	return (addr);
}

/*
 *	Allocate wired-down memory in the kernel's address map
 *	or a submap.
 */
vm_offset_t
kmem_alloc(map, size)
	vm_map_t map;
	vm_size_t size;
{
	vm_offset_t addr;
	vm_offset_t offset;

	size = round_page(size);

	/*
	 * Use the kernel object for wired-down kernel pages. Assume that no
	 * region of the kernel object is referenced more than once.
	 */

	/*
	 * Locate sufficient space in the map.  This will give us the final
	 * virtual address for the new memory, and thus will tell us the
	 * offset within the kernel map.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
		vm_map_unlock(map);
		return (0);
	}
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(kernel_object);
	vm_map_insert(map, kernel_object, offset, addr, addr + size,
		VM_PROT_ALL, VM_PROT_ALL, 0);
	vm_map_unlock(map);

	/*
	 * And finally, mark the data as non-pageable.
	 */
	(void) vm_map_wire(map, addr, addr + size,
	    VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);

	return (addr);
}

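/*
 * Example (illustrative sketch only, not compiled): memory obtained from
 * kmem_alloc() is wired and is later released with kmem_free() on the
 * same map with the same size.  The map and size are assumptions chosen
 * for illustration.
 *
 *	vm_offset_t va;
 *	vm_size_t sz;
 *
 *	sz = 4 * PAGE_SIZE;
 *	va = kmem_alloc(kernel_map, sz);
 *	if (va == 0)
 *		return (ENOMEM);
 *	...
 *	kmem_free(kernel_map, va, sz);
 */
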
/*
 *	Allocates a region from the kernel address map and physical pages
 *	within the specified address range to the kernel object.  Creates a
 *	wired mapping from this region to these pages, and returns the
 *	region's starting virtual address.  The allocated pages are not
 *	necessarily physically contiguous.  If M_ZERO is specified through the
 *	given flags, then the pages are zeroed before they are mapped.
 */
vm_offset_t
kmem_alloc_attr(vm_map_t map, vm_size_t size, int flags, vm_paddr_t low,
    vm_paddr_t high, vm_memattr_t memattr)
{
	vm_object_t object = kernel_object;
	vm_offset_t addr;
	vm_ooffset_t end_offset, offset;
	vm_page_t m;
	int pflags, tries;

	size = round_page(size);
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
		vm_map_unlock(map);
		return (0);
	}
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(object);
	vm_map_insert(map, object, offset, addr, addr + size, VM_PROT_ALL,
	    VM_PROT_ALL, 0);
	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY;
	VM_OBJECT_WLOCK(object);
	end_offset = offset + size;
	for (; offset < end_offset; offset += PAGE_SIZE) {
		tries = 0;
retry:
		m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags, 1,
		    low, high, PAGE_SIZE, 0, memattr);
		if (m == NULL) {
			VM_OBJECT_WUNLOCK(object);
			if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
				vm_map_unlock(map);
				vm_pageout_grow_cache(tries, low, high);
				vm_map_lock(map);
				VM_OBJECT_WLOCK(object);
				tries++;
				goto retry;
			}

			/*
			 * Since the pages that were allocated by any previous
			 * iterations of this loop are not busy, they can be
			 * freed by vm_object_page_remove(), which is called
			 * by vm_map_delete().
			 */
			vm_map_delete(map, addr, addr + size);
			vm_map_unlock(map);
			return (0);
		}
		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
			pmap_zero_page(m);
		m->valid = VM_PAGE_BITS_ALL;
	}
	VM_OBJECT_WUNLOCK(object);
	vm_map_unlock(map);
	vm_map_wire(map, addr, addr + size, VM_MAP_WIRE_SYSTEM |
	    VM_MAP_WIRE_NOHOLES);
	return (addr);
}

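/*
 * Example (illustrative sketch only, not compiled): request zeroed, wired
 * pages whose physical addresses lie below 4GB, as a driver limited to
 * 32-bit DMA addressing might.  The bounds and memory attribute are
 * assumptions chosen for illustration.
 *
 *	vm_offset_t va;
 *
 *	va = kmem_alloc_attr(kernel_map, size, M_WAITOK | M_ZERO,
 *	    0, 0xffffffffUL, VM_MEMATTR_DEFAULT);
 */
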
/*
 *	Allocates a region from the kernel address map and physically
 *	contiguous pages within the specified address range to the kernel
 *	object.  Creates a wired mapping from this region to these pages, and
 *	returns the region's starting virtual address.  If M_ZERO is specified
 *	through the given flags, then the pages are zeroed before they are
 *	mapped.
 */
vm_offset_t
kmem_alloc_contig(vm_map_t map, vm_size_t size, int flags, vm_paddr_t low,
    vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
    vm_memattr_t memattr)
{
	vm_object_t object = kernel_object;
	vm_offset_t addr;
	vm_ooffset_t offset;
	vm_page_t end_m, m;
	int pflags, tries;

	size = round_page(size);
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
		vm_map_unlock(map);
		return (0);
	}
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(object);
	vm_map_insert(map, object, offset, addr, addr + size, VM_PROT_ALL,
	    VM_PROT_ALL, 0);
	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY;
	VM_OBJECT_WLOCK(object);
	tries = 0;
retry:
	m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
	    atop(size), low, high, alignment, boundary, memattr);
	if (m == NULL) {
		VM_OBJECT_WUNLOCK(object);
		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
			vm_map_unlock(map);
			vm_pageout_grow_cache(tries, low, high);
			vm_map_lock(map);
			VM_OBJECT_WLOCK(object);
			tries++;
			goto retry;
		}
		vm_map_delete(map, addr, addr + size);
		vm_map_unlock(map);
		return (0);
	}
	end_m = m + atop(size);
	for (; m < end_m; m++) {
		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
			pmap_zero_page(m);
		m->valid = VM_PAGE_BITS_ALL;
	}
	VM_OBJECT_WUNLOCK(object);
	vm_map_unlock(map);
	vm_map_wire(map, addr, addr + size, VM_MAP_WIRE_SYSTEM |
	    VM_MAP_WIRE_NOHOLES);
	return (addr);
}

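/*
 * Example (illustrative sketch only, not compiled): a physically
 * contiguous 64KB buffer, 64KB-aligned and not crossing a 1MB boundary.
 * The constraints are assumptions chosen only to illustrate the
 * alignment and boundary parameters.
 *
 *	vm_offset_t va;
 *
 *	va = kmem_alloc_contig(kernel_map, 64 * 1024, M_WAITOK | M_ZERO,
 *	    0, ~(vm_paddr_t)0, 64 * 1024, 1024 * 1024, VM_MEMATTR_DEFAULT);
 */
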
/*
 *	kmem_free:
 *
 *	Release a region of kernel virtual memory allocated
 *	with kmem_alloc, and return the physical pages
 *	associated with that region.
 *
 *	This routine may not block on kernel maps.
 */
void
kmem_free(map, addr, size)
	vm_map_t map;
	vm_offset_t addr;
	vm_size_t size;
{

	(void) vm_map_remove(map, trunc_page(addr), round_page(addr + size));
}

/*
 *	kmem_suballoc:
 *
 *	Allocates a map to manage a subrange
 *	of the kernel virtual address space.
 *
 *	Arguments are as follows:
 *
 *	parent		Map to take range from
 *	min, max	Returned endpoints of map
 *	size		Size of range to find
 *	superpage_align	Request that min is superpage aligned
 */
vm_map_t
kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max,
    vm_size_t size, boolean_t superpage_align)
{
	int ret;
	vm_map_t result;

	size = round_page(size);

	*min = vm_map_min(parent);
	ret = vm_map_find(parent, NULL, 0, min, size, superpage_align ?
	    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
	    MAP_ACC_NO_CHARGE);
	if (ret != KERN_SUCCESS)
		panic("kmem_suballoc: bad status return of %d", ret);
	*max = *min + size;
	result = vm_map_create(vm_map_pmap(parent), *min, *max);
	if (result == NULL)
		panic("kmem_suballoc: cannot create submap");
	if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS)
		panic("kmem_suballoc: unable to change range to submap");
	return (result);
}

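/*
 * Example (illustrative sketch only, not compiled): carving a submap out
 * of kernel_map, in the style used at boot for submaps such as exec_map
 * and pipe_map.  The variable names and size are assumptions chosen for
 * illustration.
 *
 *	vm_offset_t minaddr, maxaddr;
 *	vm_map_t submap;
 *
 *	submap = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
 *	    16 * 1024 * 1024, FALSE);
 */
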
/*
 *	kmem_malloc:
 *
 * 	Allocate wired-down memory in the kernel's address map for the higher
 * 	level kernel memory allocator (kern/kern_malloc.c).  We cannot use
 * 	kmem_alloc() because we may need to allocate memory at interrupt
 * 	level where we cannot block (canwait == FALSE).
 *
 * 	This routine has its own private kernel submap (kmem_map) and object
 * 	(kmem_object).  This, combined with the fact that only malloc uses
 * 	this routine, ensures that we will never block in map or object waits.
 *
 * 	We don't worry about expanding the map (adding entries) since entries
 * 	for wired maps are statically allocated.
 *
 *	`map' is ONLY allowed to be kmem_map or one of the mbuf submaps,
 *	from which we never free.
 */
vm_offset_t
kmem_malloc(map, size, flags)
	vm_map_t map;
	vm_size_t size;
	int flags;
{
	vm_offset_t addr;
	int i, rv;

	size = round_page(size);
	addr = vm_map_min(map);

	/*
	 * Locate sufficient space in the map.  This will give us the final
	 * virtual address for the new memory, and thus will tell us the
	 * offset within the kernel map.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
		vm_map_unlock(map);
		if ((flags & M_NOWAIT) == 0) {
			for (i = 0; i < 8; i++) {
				EVENTHANDLER_INVOKE(vm_lowmem, 0);
				uma_reclaim();
				vm_map_lock(map);
				if (vm_map_findspace(map, vm_map_min(map),
				    size, &addr) == 0) {
					break;
				}
				vm_map_unlock(map);
				tsleep(&i, 0, "nokva", (hz / 4) * (i + 1));
			}
			if (i == 8) {
				panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
				    (long)size, (long)map->size);
			}
		} else {
			return (0);
		}
	}

	rv = kmem_back(map, addr, size, flags);
	vm_map_unlock(map);
	return (rv == KERN_SUCCESS ? addr : 0);
}

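/*
 * Example (illustrative sketch only, not compiled): the kernel malloc
 * back end obtains wired pages from kmem_map through this interface; a
 * failed M_NOWAIT request simply yields 0.  The size below is an
 * assumption chosen for illustration.
 *
 *	vm_offset_t va;
 *
 *	va = kmem_malloc(kmem_map, PAGE_SIZE, M_NOWAIT | M_ZERO);
 *	if (va != 0) {
 *		...
 *		kmem_free(kmem_map, va, PAGE_SIZE);
 *	}
 */
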
/*
 *	kmem_back:
 *
 *	Allocate physical pages for the specified virtual address range.
 */
int
kmem_back(vm_map_t map, vm_offset_t addr, vm_size_t size, int flags)
{
	vm_offset_t offset, i;
	vm_map_entry_t entry;
	vm_page_t m;
	int pflags;
	boolean_t found;

	KASSERT(vm_map_locked(map), ("kmem_back: map %p is not locked", map));
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(kmem_object);
	vm_map_insert(map, kmem_object, offset, addr, addr + size,
	    VM_PROT_ALL, VM_PROT_ALL, 0);

	/*
	 * Assert: vm_map_insert() will never be able to extend the
	 * previous entry so vm_map_lookup_entry() will find a new
	 * entry exactly corresponding to this address range and it
	 * will have wired_count == 0.
	 */
	found = vm_map_lookup_entry(map, addr, &entry);
	KASSERT(found && entry->start == addr && entry->end == addr + size &&
	    entry->wired_count == 0 && (entry->eflags & MAP_ENTRY_IN_TRANSITION)
	    == 0, ("kmem_back: entry not found or misaligned"));

	pflags = malloc2vm_flags(flags) | VM_ALLOC_WIRED;

	VM_OBJECT_WLOCK(kmem_object);
	for (i = 0; i < size; i += PAGE_SIZE) {
retry:
		m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), pflags);

		/*
		 * Ran out of space, free everything up and return. Don't need
		 * to lock page queues here as we know that the pages we got
		 * aren't on any queues.
		 */
		if (m == NULL) {
			if ((flags & M_NOWAIT) == 0) {
				VM_OBJECT_WUNLOCK(kmem_object);
				entry->eflags |= MAP_ENTRY_IN_TRANSITION;
				vm_map_unlock(map);
				VM_WAIT;
				vm_map_lock(map);
				KASSERT(
(entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_NEEDS_WAKEUP)) ==
				    MAP_ENTRY_IN_TRANSITION,
				    ("kmem_back: volatile entry"));
				entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
				VM_OBJECT_WLOCK(kmem_object);
				goto retry;
			}
			/*
			 * Free the pages before removing the map entry.
			 * They are already marked busy.  Calling
			 * vm_map_delete before the pages have been freed or
			 * unbusied will cause a deadlock.
			 */
			while (i != 0) {
				i -= PAGE_SIZE;
				m = vm_page_lookup(kmem_object,
						   OFF_TO_IDX(offset + i));
				vm_page_unwire(m, 0);
				vm_page_free(m);
			}
			VM_OBJECT_WUNLOCK(kmem_object);
			vm_map_delete(map, addr, addr + size);
			return (KERN_NO_SPACE);
		}
		if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
			pmap_zero_page(m);
		m->valid = VM_PAGE_BITS_ALL;
		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
		    ("kmem_malloc: page %p is managed", m));
	}
	VM_OBJECT_WUNLOCK(kmem_object);

	/*
	 * Mark map entry as non-pageable.  Repeat the assert.
	 */
	KASSERT(entry->start == addr && entry->end == addr + size &&
	    entry->wired_count == 0,
	    ("kmem_back: entry not found or misaligned after allocation"));
	entry->wired_count = 1;

	/*
	 * At this point, the kmem_object must be unlocked because
	 * vm_map_simplify_entry() calls vm_object_deallocate(), which
	 * locks the kmem_object.
	 */
	vm_map_simplify_entry(map, entry);

	/*
	 * Loop through the pages, entering them in the pmap.
	 */
	VM_OBJECT_WLOCK(kmem_object);
	for (i = 0; i < size; i += PAGE_SIZE) {
		m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i));
		/*
		 * Because this is kernel_pmap, this call will not block.
		 */
		pmap_enter(kernel_pmap, addr + i, VM_PROT_ALL, m, VM_PROT_ALL,
		    TRUE);
		vm_page_wakeup(m);
	}
	VM_OBJECT_WUNLOCK(kmem_object);

	return (KERN_SUCCESS);
}

/*
 *	kmem_alloc_wait:
 *
 *	Allocates pageable memory from a sub-map of the kernel.  If the submap
 *	has no room, the caller sleeps waiting for more memory in the submap.
 *
 *	This routine may block.
 */
vm_offset_t
kmem_alloc_wait(map, size)
	vm_map_t map;
	vm_size_t size;
{
	vm_offset_t addr;

	size = round_page(size);
	if (!swap_reserve(size))
		return (0);

	for (;;) {
		/*
		 * To make this work for more than one map, use the map's lock
		 * to lock out sleepers/wakers.
		 */
		vm_map_lock(map);
		if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0)
			break;
		/* no space now; see if we can ever get space */
		if (vm_map_max(map) - vm_map_min(map) < size) {
			vm_map_unlock(map);
			swap_release(size);
			return (0);
		}
		map->needs_wakeup = TRUE;
		vm_map_unlock_and_wait(map, 0);
	}
	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL,
	    VM_PROT_ALL, MAP_ACC_CHARGED);
	vm_map_unlock(map);
	return (addr);
}

/*
 *	kmem_free_wakeup:
 *
 *	Returns memory to a submap of the kernel, and wakes up any processes
 *	waiting for memory in that map.
 */
void
kmem_free_wakeup(map, addr, size)
	vm_map_t map;
	vm_offset_t addr;
	vm_size_t size;
{

	vm_map_lock(map);
	(void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
	if (map->needs_wakeup) {
		map->needs_wakeup = FALSE;
		vm_map_wakeup(map);
	}
	vm_map_unlock(map);
}

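/*
 * Example (illustrative sketch only, not compiled): kmem_alloc_wait() and
 * kmem_free_wakeup() are used in pairs on a pageable submap; a thread
 * that finds the submap full sleeps in kmem_alloc_wait() until another
 * thread returns space with kmem_free_wakeup().  "argmap" is a
 * hypothetical submap used only for illustration.
 *
 *	vm_offset_t va;
 *
 *	va = kmem_alloc_wait(argmap, size);
 *	if (va != 0) {
 *		...
 *		kmem_free_wakeup(argmap, va, size);
 *	}
 */
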
static void
kmem_init_zero_region(void)
{
	vm_offset_t addr, i;
	vm_page_t m;
	int error;

	/*
	 * Map a single physical page of zeros to a larger virtual range.
	 * This requires less looping in places that want large amounts of
	 * zeros, while consuming only a single page of physical memory.
	 */
	addr = kmem_alloc_nofault(kernel_map, ZERO_REGION_SIZE);
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);
	for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE)
		pmap_qenter(addr + i, &m, 1);
	error = vm_map_protect(kernel_map, addr, addr + ZERO_REGION_SIZE,
	    VM_PROT_READ, TRUE);
	KASSERT(error == 0, ("error=%d", error));

	zero_region = (const void *)addr;
}

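/*
 * Example (illustrative sketch only, not compiled): zero_region provides
 * ZERO_REGION_SIZE bytes of read-only zeros, so "is this buffer zeroed?"
 * checks can compare in large chunks instead of byte by byte.  The helper
 * below is hypothetical.
 *
 *	static int
 *	buf_is_zeroed(const char *buf, size_t len)
 *	{
 *		size_t n;
 *
 *		while (len > 0) {
 *			n = MIN(len, ZERO_REGION_SIZE);
 *			if (memcmp(buf, zero_region, n) != 0)
 *				return (0);
 *			buf += n;
 *			len -= n;
 *		}
 *		return (1);
 *	}
 */
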
/*
 *	kmem_init:
 *
 *	Create the kernel map; insert a mapping covering kernel text,
 *	data, bss, and all space allocated thus far (`bootstrap' data).  The
 *	new map will thus map the range between VM_MIN_KERNEL_ADDRESS and
 *	`start' as allocated, and the range between `start' and `end' as free.
 */
void
kmem_init(start, end)
	vm_offset_t start, end;
{
	vm_map_t m;

	m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end);
	m->system_map = 1;
	vm_map_lock(m);
	/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
	kernel_map = m;
	(void) vm_map_insert(m, NULL, (vm_ooffset_t) 0,
#ifdef __amd64__
	    KERNBASE,
#else
	    VM_MIN_KERNEL_ADDRESS,
#endif
	    start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
	/* ... and ending with the completion of the above `insert' */
	vm_map_unlock(m);

	kmem_init_zero_region();
}

#ifdef DIAGNOSTIC
/*
 * Allow userspace to directly trigger the VM drain routine for testing
 * purposes.
 */
static int
debug_vm_lowmem(SYSCTL_HANDLER_ARGS)
{
	int error, i;

	i = 0;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error)
		return (error);
	if (i)
		EVENTHANDLER_INVOKE(vm_lowmem, 0);
	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
    debug_vm_lowmem, "I", "set to trigger vm_lowmem event");
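/*
 * Example: on a kernel built with DIAGNOSTIC the event can be forced from
 * userland with, e.g., "sysctl debug.vm_lowmem=1"; any non-zero value
 * fires the vm_lowmem handlers.
 */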
#endif
