/*-
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_kern.c	8.3 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	Kernel memory management.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>		/* for ticks and hz */
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

vm_map_t kernel_map;
vm_map_t kmem_map;
vm_map_t exec_map;
vm_map_t pipe_map;
vm_map_t buffer_map;
vm_map_t bio_transient_map;

const void *zero_region;
CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);

/*
 *	kmem_alloc_nofault:
 *
 *	Allocate a virtual address range with no underlying object and
 *	no initial mapping to physical memory.  Any mapping from this
 *	range to physical memory must be explicitly created prior to
 *	its use, typically with pmap_qenter().  Any attempt to create
 *	a mapping on demand through vm_fault() will result in a panic.
 */
vm_offset_t
kmem_alloc_nofault(vm_map_t map, vm_size_t size)
{
	vm_offset_t addr;
	int result;

	size = round_page(size);
	addr = vm_map_min(map);
	result = vm_map_find(map, NULL, 0, &addr, size, VMFS_ANY_SPACE,
	    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
	if (result != KERN_SUCCESS) {
		return (0);
	}
	return (addr);
}
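
/*
 *	Usage sketch (illustrative, not part of this file's interfaces): a
 *	caller backs a nofault range by hand with pmap_qenter(), as described
 *	above.  The page array "pages" and count "npages" are hypothetical.
 *
 *		vm_offset_t va;
 *
 *		va = kmem_alloc_nofault(kernel_map, npages * PAGE_SIZE);
 *		if (va == 0)
 *			return (ENOMEM);
 *		pmap_qenter(va, pages, npages);
 *		... use the mapping at va ...
 *		pmap_qremove(va, npages);
 *		kmem_free(kernel_map, va, npages * PAGE_SIZE);
 */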

/*
 *	kmem_alloc_nofault_space:
 *
 *	Allocate a virtual address range with no underlying object and
 *	no initial mapping to physical memory within the specified
 *	address space.  Any mapping from this range to physical memory
 *	must be explicitly created prior to its use, typically with
 *	pmap_qenter().  Any attempt to create a mapping on demand
 *	through vm_fault() will result in a panic.
 */
vm_offset_t
kmem_alloc_nofault_space(vm_map_t map, vm_size_t size, int find_space)
{
	vm_offset_t addr;
	int result;

	size = round_page(size);
	addr = vm_map_min(map);
	result = vm_map_find(map, NULL, 0, &addr, size, find_space,
	    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
	if (result != KERN_SUCCESS) {
		return (0);
	}
	return (addr);
}

/*
 *	kmem_alloc:
 *
 *	Allocate wired-down memory in the kernel's address map
 *	or a submap.
 */
vm_offset_t
kmem_alloc(vm_map_t map, vm_size_t size)
{
	vm_offset_t addr;
	vm_offset_t offset;

	size = round_page(size);

	/*
	 * Use the kernel object for wired-down kernel pages. Assume that no
	 * region of the kernel object is referenced more than once.
	 */

	/*
	 * Locate sufficient space in the map.  This will give us the final
	 * virtual address for the new memory, and thus will tell us the
	 * offset within the kernel map.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
		vm_map_unlock(map);
		return (0);
	}
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(kernel_object);
	vm_map_insert(map, kernel_object, offset, addr, addr + size,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	vm_map_unlock(map);

	/*
	 * And finally, mark the data as non-pageable.
	 */
	(void) vm_map_wire(map, addr, addr + size,
	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);

	return (addr);
}
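
/*
 *	Usage sketch (illustrative): a wired allocation and its release.  The
 *	size is arbitrary; kmem_alloc() returns 0 on failure, so callers must
 *	check before touching the memory.
 *
 *		vm_offset_t buf;
 *
 *		buf = kmem_alloc(kernel_map, 4 * PAGE_SIZE);
 *		if (buf == 0)
 *			return (ENOMEM);
 *		... use the wired memory at buf ...
 *		kmem_free(kernel_map, buf, 4 * PAGE_SIZE);
 */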

/*
 *	kmem_free:
 *
 *	Release a region of kernel virtual memory allocated
 *	with kmem_alloc, and return the physical pages
 *	associated with that region.
 *
 *	This routine may not block on kernel maps.
 */
void
kmem_free(vm_map_t map, vm_offset_t addr, vm_size_t size)
{

	(void) vm_map_remove(map, trunc_page(addr), round_page(addr + size));
}

/*
 *	kmem_suballoc:
 *
 *	Allocates a map to manage a subrange
 *	of the kernel virtual address space.
 *
 *	Arguments are as follows:
 *
 *	parent		Map to take range from
 *	min, max	Returned endpoints of map
 *	size		Size of range to find
 *	superpage_align	Request that min be superpage aligned
 */
vm_map_t
kmem_suballoc(vm_map_t parent, vm_offset_t *min, vm_offset_t *max,
    vm_size_t size, boolean_t superpage_align)
{
	int ret;
	vm_map_t result;

	size = round_page(size);

	*min = vm_map_min(parent);
	ret = vm_map_find(parent, NULL, 0, min, size, superpage_align ?
	    VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
	    MAP_ACC_NO_CHARGE);
	if (ret != KERN_SUCCESS)
		panic("kmem_suballoc: bad status return of %d", ret);
	*max = *min + size;
	result = vm_map_create(vm_map_pmap(parent), *min, *max);
	if (result == NULL)
		panic("kmem_suballoc: cannot create submap");
	if (vm_map_submap(parent, *min, *max, result) != KERN_SUCCESS)
		panic("kmem_suballoc: unable to change range to submap");
	return (result);
}
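
/*
 *	Usage sketch (illustrative): submaps such as exec_map are carved out
 *	of kernel_map this way during startup; the size expression below is
 *	only representative.
 *
 *		vm_offset_t minaddr, maxaddr;
 *
 *		exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
 *		    16 * round_page(PATH_MAX + ARG_MAX), FALSE);
 */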

/*
 *	kmem_malloc:
 *
 *	Allocate wired-down memory in the kernel's address map for the higher
 *	level kernel memory allocator (kern/kern_malloc.c).  We cannot use
 *	kmem_alloc() because we may need to allocate memory at interrupt
 *	level where we cannot block (M_NOWAIT is set).
 *
 *	This routine has its own private kernel submap (kmem_map) and object
 *	(kmem_object).  This, combined with the fact that only malloc uses
 *	this routine, ensures that we will never block in map or object waits.
 *
 *	We don't worry about expanding the map (adding entries) since entries
 *	for wired maps are statically allocated.
 *
 *	`map' is ONLY allowed to be kmem_map or one of the mbuf submaps, to
 *	which we never free memory.
 */
vm_offset_t
kmem_malloc(vm_map_t map, vm_size_t size, int flags)
{
	vm_offset_t addr;
	int i, rv;

	size = round_page(size);
	addr = vm_map_min(map);

	/*
	 * Locate sufficient space in the map.  This will give us the final
	 * virtual address for the new memory, and thus will tell us the
	 * offset within the kernel map.
	 */
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
		vm_map_unlock(map);
		if ((flags & M_NOWAIT) == 0) {
			for (i = 0; i < 8; i++) {
				EVENTHANDLER_INVOKE(vm_lowmem, 0);
				uma_reclaim();
				vm_map_lock(map);
				if (vm_map_findspace(map, vm_map_min(map),
				    size, &addr) == 0) {
					break;
				}
				vm_map_unlock(map);
				tsleep(&i, 0, "nokva", (hz / 4) * (i + 1));
			}
			if (i == 8) {
				panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
				    (long)size, (long)map->size);
			}
		} else {
			return (0);
		}
	}

	rv = kmem_back(map, addr, size, flags);
	vm_map_unlock(map);
	return (rv == KERN_SUCCESS ? addr : 0);
}
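
/*
 *	Usage sketch (illustrative): this is roughly what the large-allocation
 *	path of malloc(9) does.  "len" is a hypothetical byte count; the flags
 *	follow the malloc(9) M_* convention accepted above.
 *
 *		vm_offset_t va;
 *
 *		va = kmem_malloc(kmem_map, round_page(len), M_NOWAIT | M_ZERO);
 *		if (va == 0)
 *			return (ENOMEM);
 *		... use the wired memory at va ...
 *		kmem_free(kmem_map, va, round_page(len));
 */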

/*
 *	kmem_back:
 *
 *	Allocate physical pages for the specified virtual address range.
 */
int
kmem_back(vm_map_t map, vm_offset_t addr, vm_size_t size, int flags)
{
	vm_offset_t offset, i;
	vm_map_entry_t entry;
	vm_page_t m;
	int pflags;
	boolean_t found;

	KASSERT(vm_map_locked(map), ("kmem_back: map %p is not locked", map));
	offset = addr - VM_MIN_KERNEL_ADDRESS;
	vm_object_reference(kmem_object);
	vm_map_insert(map, kmem_object, offset, addr, addr + size,
	    VM_PROT_ALL, VM_PROT_ALL, 0);

	/*
	 * Assert: vm_map_insert() will never be able to extend the
	 * previous entry so vm_map_lookup_entry() will find a new
	 * entry exactly corresponding to this address range and it
	 * will have wired_count == 0.
	 */
	found = vm_map_lookup_entry(map, addr, &entry);
	KASSERT(found && entry->start == addr && entry->end == addr + size &&
	    entry->wired_count == 0 && (entry->eflags & MAP_ENTRY_IN_TRANSITION)
	    == 0, ("kmem_back: entry not found or misaligned"));

	if ((flags & (M_NOWAIT | M_USE_RESERVE)) == M_NOWAIT)
		pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED;
	else
		pflags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED;

	if (flags & M_ZERO)
		pflags |= VM_ALLOC_ZERO;
	if (flags & M_NODUMP)
		pflags |= VM_ALLOC_NODUMP;

	VM_OBJECT_LOCK(kmem_object);
	for (i = 0; i < size; i += PAGE_SIZE) {
retry:
		m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), pflags);

		/*
		 * Ran out of space, free everything up and return. Don't need
		 * to lock page queues here as we know that the pages we got
		 * aren't on any queues.
		 */
		if (m == NULL) {
			if ((flags & M_NOWAIT) == 0) {
				VM_OBJECT_UNLOCK(kmem_object);
				entry->eflags |= MAP_ENTRY_IN_TRANSITION;
				vm_map_unlock(map);
				VM_WAIT;
				vm_map_lock(map);
				KASSERT((entry->eflags &
				    (MAP_ENTRY_IN_TRANSITION |
				    MAP_ENTRY_NEEDS_WAKEUP)) ==
				    MAP_ENTRY_IN_TRANSITION,
				    ("kmem_back: volatile entry"));
				entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
				VM_OBJECT_LOCK(kmem_object);
				goto retry;
			}
			/*
			 * Free the pages before removing the map entry.
			 * They are already marked busy.  Calling
			 * vm_map_delete() before the pages have been freed
			 * or unbusied will cause a deadlock.
			 */
			while (i != 0) {
				i -= PAGE_SIZE;
				m = vm_page_lookup(kmem_object,
				    OFF_TO_IDX(offset + i));
				vm_page_unwire(m, 0);
				vm_page_free(m);
			}
			VM_OBJECT_UNLOCK(kmem_object);
			vm_map_delete(map, addr, addr + size);
			return (KERN_NO_SPACE);
		}
		if ((flags & M_ZERO) != 0 && (m->flags & PG_ZERO) == 0)
			pmap_zero_page(m);
		m->valid = VM_PAGE_BITS_ALL;
		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
		    ("kmem_back: page %p is managed", m));
	}
	VM_OBJECT_UNLOCK(kmem_object);

	/*
	 * Mark map entry as non-pageable.  Repeat the assert.
	 */
	KASSERT(entry->start == addr && entry->end == addr + size &&
	    entry->wired_count == 0,
	    ("kmem_back: entry not found or misaligned after allocation"));
	entry->wired_count = 1;

	/*
	 * At this point, the kmem_object must be unlocked because
	 * vm_map_simplify_entry() calls vm_object_deallocate(), which
	 * locks the kmem_object.
	 */
	vm_map_simplify_entry(map, entry);

	/*
	 * Loop through the pages, entering them in the pmap.
	 */
	VM_OBJECT_LOCK(kmem_object);
	for (i = 0; i < size; i += PAGE_SIZE) {
		m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i));
		/*
		 * Because this is kernel_pmap, this call will not block.
		 */
		pmap_enter(kernel_pmap, addr + i, VM_PROT_ALL, m, VM_PROT_ALL,
		    TRUE);
		vm_page_wakeup(m);
	}
	VM_OBJECT_UNLOCK(kmem_object);

	return (KERN_SUCCESS);
}

/*
 *	kmem_alloc_wait:
 *
 *	Allocates pageable memory from a sub-map of the kernel.  If the submap
 *	has no room, the caller sleeps waiting for more memory in the submap.
 *
 *	This routine may block.
 */
vm_offset_t
kmem_alloc_wait(vm_map_t map, vm_size_t size)
{
	vm_offset_t addr;

	size = round_page(size);
	if (!swap_reserve(size))
		return (0);

	for (;;) {
		/*
		 * To make this work for more than one map, use the map's lock
		 * to lock out sleepers/wakers.
		 */
		vm_map_lock(map);
		if (vm_map_findspace(map, vm_map_min(map), size, &addr) == 0)
			break;
		/* no space now; see if we can ever get space */
		if (vm_map_max(map) - vm_map_min(map) < size) {
			vm_map_unlock(map);
			swap_release(size);
			return (0);
		}
		map->needs_wakeup = TRUE;
		vm_map_unlock_and_wait(map, 0);
	}
	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL,
	    VM_PROT_ALL, MAP_ACC_CHARGED);
	vm_map_unlock(map);
	return (addr);
}

/*
 *	kmem_free_wakeup:
 *
 *	Returns memory to a submap of the kernel, and wakes up any processes
 *	waiting for memory in that map.
 */
void
kmem_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size)
{

	vm_map_lock(map);
	(void) vm_map_delete(map, trunc_page(addr), round_page(addr + size));
	if (map->needs_wakeup) {
		map->needs_wakeup = FALSE;
		vm_map_wakeup(map);
	}
	vm_map_unlock(map);
}
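
/*
 *	Usage sketch (illustrative): kmem_alloc_wait() and kmem_free_wakeup()
 *	are used as a pair on pageable submaps such as exec_map, where a
 *	caller is willing to sleep for space and must wake waiters when the
 *	space is returned.
 *
 *		vm_offset_t va;
 *		vm_size_t sz;
 *
 *		sz = round_page(PATH_MAX + ARG_MAX);
 *		va = kmem_alloc_wait(exec_map, sz);
 *		if (va == 0)
 *			return (ENOMEM);
 *		... stage data in the pageable buffer at va ...
 *		kmem_free_wakeup(exec_map, va, sz);
 */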

static void
kmem_init_zero_region(void)
{
	vm_offset_t addr, i;
	vm_page_t m;
	int error;

	/*
	 * Map a single physical page of zeros to a larger virtual range.
	 * This requires less looping in places that want large amounts of
	 * zeros, while not using much more physical resources.
	 */
	addr = kmem_alloc_nofault(kernel_map, ZERO_REGION_SIZE);
	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);
	for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE)
		pmap_qenter(addr + i, &m, 1);
	error = vm_map_protect(kernel_map, addr, addr + ZERO_REGION_SIZE,
	    VM_PROT_READ, TRUE);
	KASSERT(error == 0, ("error=%d", error));

	zero_region = (const void *)addr;
}
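
/*
 *	Usage sketch (illustrative): consumers treat zero_region as a
 *	read-only window of ZERO_REGION_SIZE zero bytes, e.g. to copy zeros
 *	out in large chunks instead of clearing a private buffer.  "uio" is a
 *	hypothetical request being satisfied.
 *
 *		while (uio->uio_resid > 0 && error == 0)
 *			error = uiomove(__DECONST(void *, zero_region),
 *			    MIN(uio->uio_resid, ZERO_REGION_SIZE), uio);
 */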

/*
 * 	kmem_init:
 *
 *	Create the kernel map; insert a mapping covering kernel text,
 *	data, bss, and all space allocated thus far (`bootstrap' data).  The
 *	new map will thus map the range between VM_MIN_KERNEL_ADDRESS and
 *	`start' as allocated, and the range between `start' and `end' as free.
 */
void
kmem_init(vm_offset_t start, vm_offset_t end)
{
	vm_map_t m;

	m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end);
	m->system_map = 1;
	vm_map_lock(m);
	/* N.B.: cannot use kgdb to debug, starting with this assignment ... */
	kernel_map = m;
	(void) vm_map_insert(m, NULL, (vm_ooffset_t) 0,
#ifdef __amd64__
	    KERNBASE,
#else
	    VM_MIN_KERNEL_ADDRESS,
#endif
	    start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
	/* ... and ending with the completion of the above `insert' */
	vm_map_unlock(m);

	kmem_init_zero_region();
}
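
/*
 *	Usage sketch (illustrative): kmem_init() runs once during bootstrap,
 *	called from the machine-independent VM startup with the bounds of the
 *	remaining free kernel virtual address space, conventionally along the
 *	lines of:
 *
 *		kmem_init(virtual_avail, virtual_end);
 */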

#ifdef DIAGNOSTIC
/*
 * Allow userspace to directly trigger the VM drain routine for testing
 * purposes.
 */
static int
debug_vm_lowmem(SYSCTL_HANDLER_ARGS)
{
	int error, i;

	i = 0;
	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error)
		return (error);
	if (i)
		EVENTHANDLER_INVOKE(vm_lowmem, 0);
	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
    debug_vm_lowmem, "I", "set to trigger vm_lowmem event");
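
/*
 *	Usage sketch (illustrative): on a kernel built with DIAGNOSTIC, the
 *	handler above can be exercised from userland by writing a non-zero
 *	value to the OID, e.g. "sysctl debug.vm_lowmem=1", which fires the
 *	vm_lowmem event handlers.
 */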
#endif
