/* vm_kern.c revision 25164 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_kern.c	8.3 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53169695Skan * 54169695Skan * Carnegie Mellon requests users of this software to return to 55169695Skan * 56169695Skan * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57169695Skan * School of Computer Science 58169695Skan * Carnegie Mellon University 59169695Skan * Pittsburgh PA 15213-3890 60169695Skan * 61169695Skan * any improvements or extensions that they make and grant Carnegie the 62169695Skan * rights to redistribute these changes. 63169695Skan * 64169695Skan * $Id: vm_kern.c,v 1.34 1997/03/31 11:11:24 davidg Exp $ 65169695Skan */ 66169695Skan 67169695Skan/* 68169695Skan * Kernel memory management. 69169695Skan */ 70169695Skan#include "opt_smp.h" 71169695Skan#include "opt_smp_privpages.h" 72169695Skan 73169695Skan#include <sys/param.h> 74169695Skan#include <sys/systm.h> 75169695Skan#include <sys/kernel.h> 76169695Skan#include <sys/proc.h> 77169695Skan#include <sys/malloc.h> 78169695Skan#include <sys/syslog.h> 79169695Skan#include <sys/queue.h> 80169695Skan#include <sys/vmmeter.h> 81169695Skan 82169695Skan#include <vm/vm.h> 83169695Skan#include <vm/vm_param.h> 84169695Skan#include <vm/vm_prot.h> 85169695Skan#include <sys/lock.h> 86169695Skan#include <vm/pmap.h> 87169695Skan#include <vm/vm_map.h> 88169695Skan#include <vm/vm_object.h> 89169695Skan#include <vm/vm_page.h> 90169695Skan#include <vm/vm_pageout.h> 91169695Skan#include <vm/vm_kern.h> 92169695Skan#include <vm/vm_extern.h> 93169695Skan 94169695Skanvm_map_t kernel_map=0; 95169695Skanvm_map_t kmem_map=0; 96169695Skanvm_map_t exec_map=0; 97169695Skanvm_map_t clean_map=0; 98169695Skanvm_map_t u_map=0; 99169695Skanvm_map_t buffer_map=0; 100169695Skanvm_map_t mb_map=0; 101169695Skanint mb_map_full=0; 102169695Skanvm_map_t io_map=0; 103169695Skanvm_map_t phys_map=0; 104169695Skan#if defined(SMP) && defined(SMP_PRIVPAGES) 105169695Skanvm_map_t ppage_map=0; 106169695Skan#endif 107169695Skan 108169695Skan 109169695Skan/* 110169695Skan * kmem_alloc_pageable: 111169695Skan * 112169695Skan * 
Allocate pageable memory to the kernel's address map. 113169695Skan * "map" must be kernel_map or a submap of kernel_map. 114169695Skan */ 115169695Skan 116169695Skanvm_offset_t 117169695Skankmem_alloc_pageable(map, size) 118169695Skan vm_map_t map; 119169695Skan register vm_size_t size; 120169695Skan{ 121169695Skan vm_offset_t addr; 122169695Skan register int result; 123169695Skan 124169695Skan size = round_page(size); 125169695Skan addr = vm_map_min(map); 126169695Skan result = vm_map_find(map, NULL, (vm_offset_t) 0, 127169695Skan &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); 128169695Skan if (result != KERN_SUCCESS) { 129169695Skan return (0); 130169695Skan } 131169695Skan return (addr); 132169695Skan} 133169695Skan 134169695Skan/* 135169695Skan * Allocate wired-down memory in the kernel's address map 136169695Skan * or a submap. 137169695Skan */ 138169695Skanvm_offset_t 139169695Skankmem_alloc(map, size) 140169695Skan register vm_map_t map; 141169695Skan register vm_size_t size; 142169695Skan{ 143169695Skan vm_offset_t addr; 144169695Skan register vm_offset_t offset; 145169695Skan vm_offset_t i; 146169695Skan 147169695Skan size = round_page(size); 148169695Skan 149169695Skan /* 150169695Skan * Use the kernel object for wired-down kernel pages. Assume that no 151169695Skan * region of the kernel object is referenced more than once. 152169695Skan */ 153169695Skan 154169695Skan /* 155169695Skan * Locate sufficient space in the map. This will give us the final 156169695Skan * virtual address for the new memory, and thus will tell us the 157169695Skan * offset within the kernel map. 
158169695Skan */ 159169695Skan vm_map_lock(map); 160169695Skan if (vm_map_findspace(map, 0, size, &addr)) { 161169695Skan vm_map_unlock(map); 162169695Skan return (0); 163169695Skan } 164169695Skan offset = addr - VM_MIN_KERNEL_ADDRESS; 165169695Skan vm_object_reference(kernel_object); 166169695Skan vm_map_insert(map, kernel_object, offset, addr, addr + size, 167169695Skan VM_PROT_ALL, VM_PROT_ALL, 0); 168169695Skan vm_map_unlock(map); 169169695Skan 170169695Skan /* 171169695Skan * Guarantee that there are pages already in this object before 172169695Skan * calling vm_map_pageable. This is to prevent the following 173169695Skan * scenario: 174169695Skan * 175169695Skan * 1) Threads have swapped out, so that there is a pager for the 176169695Skan * kernel_object. 2) The kmsg zone is empty, and so we are 177169695Skan * kmem_allocing a new page for it. 3) vm_map_pageable calls vm_fault; 178169695Skan * there is no page, but there is a pager, so we call 179169695Skan * pager_data_request. But the kmsg zone is empty, so we must 180169695Skan * kmem_alloc. 4) goto 1 5) Even if the kmsg zone is not empty: when 181169695Skan * we get the data back from the pager, it will be (very stale) 182169695Skan * non-zero data. kmem_alloc is defined to return zero-filled memory. 183169695Skan * 184169695Skan * We're intentionally not activating the pages we allocate to prevent a 185169695Skan * race with page-out. vm_map_pageable will wire the pages. 
186169695Skan */ 187169695Skan 188169695Skan for (i = 0; i < size; i += PAGE_SIZE) { 189169695Skan vm_page_t mem; 190169695Skan 191169695Skan while ((mem = vm_page_alloc(kernel_object, 192169695Skan OFF_TO_IDX(offset + i), VM_ALLOC_ZERO)) == NULL) { 193169695Skan VM_WAIT; 194169695Skan } 195169695Skan if ((mem->flags & PG_ZERO) == 0) 196169695Skan vm_page_zero_fill(mem); 197169695Skan mem->flags &= ~(PG_BUSY|PG_ZERO); 198169695Skan mem->valid = VM_PAGE_BITS_ALL; 199169695Skan } 200169695Skan 201169695Skan /* 202169695Skan * And finally, mark the data as non-pageable. 203169695Skan */ 204169695Skan 205169695Skan (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); 206169695Skan 207169695Skan return (addr); 208169695Skan} 209169695Skan 210169695Skan/* 211169695Skan * kmem_free: 212169695Skan * 213169695Skan * Release a region of kernel virtual memory allocated 214169695Skan * with kmem_alloc, and return the physical pages 215169695Skan * associated with that region. 216169695Skan */ 217169695Skanvoid 218169695Skankmem_free(map, addr, size) 219169695Skan vm_map_t map; 220169695Skan register vm_offset_t addr; 221169695Skan vm_size_t size; 222169695Skan{ 223169695Skan (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size)); 224169695Skan} 225169695Skan 226169695Skan/* 227169695Skan * kmem_suballoc: 228169695Skan * 229169695Skan * Allocates a map to manage a subrange 230169695Skan * of the kernel virtual address space. 
231169695Skan * 232169695Skan * Arguments are as follows: 233169695Skan * 234169695Skan * parent Map to take range from 235169695Skan * size Size of range to find 236169695Skan * min, max Returned endpoints of map 237169695Skan * pageable Can the region be paged 238169695Skan */ 239169695Skanvm_map_t 240169695Skankmem_suballoc(parent, min, max, size, pageable) 241169695Skan register vm_map_t parent; 242169695Skan vm_offset_t *min, *max; 243169695Skan register vm_size_t size; 244169695Skan boolean_t pageable; 245169695Skan{ 246169695Skan register int ret; 247169695Skan vm_map_t result; 248169695Skan 249169695Skan size = round_page(size); 250169695Skan 251169695Skan *min = (vm_offset_t) vm_map_min(parent); 252169695Skan ret = vm_map_find(parent, NULL, (vm_offset_t) 0, 253169695Skan min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); 254169695Skan if (ret != KERN_SUCCESS) { 255169695Skan printf("kmem_suballoc: bad status return of %d.\n", ret); 256169695Skan panic("kmem_suballoc"); 257169695Skan } 258169695Skan *max = *min + size; 259169695Skan pmap_reference(vm_map_pmap(parent)); 260169695Skan result = vm_map_create(vm_map_pmap(parent), *min, *max, pageable); 261169695Skan if (result == NULL) 262169695Skan panic("kmem_suballoc: cannot create submap"); 263169695Skan if ((ret = vm_map_submap(parent, *min, *max, result)) != KERN_SUCCESS) 264169695Skan panic("kmem_suballoc: unable to change range to submap"); 265169695Skan return (result); 266169695Skan} 267169695Skan 268169695Skan/* 269169695Skan * Allocate wired-down memory in the kernel's address map for the higher 270169695Skan * level kernel memory allocator (kern/kern_malloc.c). We cannot use 271169695Skan * kmem_alloc() because we may need to allocate memory at interrupt 272169695Skan * level where we cannot block (canwait == FALSE). 273169695Skan * 274169695Skan * This routine has its own private kernel submap (kmem_map) and object 275169695Skan * (kmem_object). 
This, combined with the fact that only malloc uses 276169695Skan * this routine, ensures that we will never block in map or object waits. 277169695Skan * 278169695Skan * Note that this still only works in a uni-processor environment and 279169695Skan * when called at splhigh(). 280169695Skan * 281169695Skan * We don't worry about expanding the map (adding entries) since entries 282169695Skan * for wired maps are statically allocated. 283169695Skan */ 284169695Skanvm_offset_t 285169695Skankmem_malloc(map, size, waitflag) 286169695Skan register vm_map_t map; 287169695Skan register vm_size_t size; 288169695Skan boolean_t waitflag; 289169695Skan{ 290169695Skan register vm_offset_t offset, i; 291169695Skan vm_map_entry_t entry; 292169695Skan vm_offset_t addr; 293169695Skan vm_page_t m; 294169695Skan 295169695Skan if (map != kmem_map && map != mb_map) 296169695Skan panic("kmem_malloc: map != {kmem,mb}_map"); 297169695Skan 298169695Skan size = round_page(size); 299169695Skan addr = vm_map_min(map); 300169695Skan 301169695Skan /* 302169695Skan * Locate sufficient space in the map. This will give us the final 303169695Skan * virtual address for the new memory, and thus will tell us the 304169695Skan * offset within the kernel map. 
305169695Skan */ 306169695Skan vm_map_lock(map); 307169695Skan if (vm_map_findspace(map, 0, size, &addr)) { 308169695Skan vm_map_unlock(map); 309169695Skan if (map == mb_map) { 310169695Skan mb_map_full = TRUE; 311169695Skan log(LOG_ERR, "Out of mbuf clusters - increase maxusers!\n"); 312169695Skan return (0); 313169695Skan } 314169695Skan if (waitflag == M_WAITOK) 315169695Skan panic("kmem_malloc: kmem_map too small"); 316169695Skan return (0); 317169695Skan } 318169695Skan offset = addr - VM_MIN_KERNEL_ADDRESS; 319169695Skan vm_object_reference(kmem_object); 320169695Skan vm_map_insert(map, kmem_object, offset, addr, addr + size, 321169695Skan VM_PROT_ALL, VM_PROT_ALL, 0); 322169695Skan 323169695Skan for (i = 0; i < size; i += PAGE_SIZE) { 324169695Skanretry: 325169695Skan m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), 326169695Skan (waitflag == M_NOWAIT) ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM); 327169695Skan 328169695Skan /* 329169695Skan * Ran out of space, free everything up and return. Don't need 330169695Skan * to lock page queues here as we know that the pages we got 331169695Skan * aren't on any queues. 332169695Skan */ 333169695Skan if (m == NULL) { 334169695Skan if (waitflag == M_WAITOK) { 335169695Skan VM_WAIT; 336169695Skan goto retry; 337169695Skan } 338169695Skan while (i != 0) { 339169695Skan i -= PAGE_SIZE; 340169695Skan m = vm_page_lookup(kmem_object, 341169695Skan OFF_TO_IDX(offset + i)); 342169695Skan PAGE_WAKEUP(m); 343169695Skan vm_page_free(m); 344169695Skan } 345169695Skan vm_map_delete(map, addr, addr + size); 346169695Skan vm_map_unlock(map); 347169695Skan return (0); 348169695Skan } 349169695Skan m->flags &= ~PG_ZERO; 350169695Skan m->valid = VM_PAGE_BITS_ALL; 351169695Skan } 352169695Skan 353169695Skan /* 354169695Skan * Mark map entry as non-pageable. 
Assert: vm_map_insert() will never 355169695Skan * be able to extend the previous entry so there will be a new entry 356169695Skan * exactly corresponding to this address range and it will have 357169695Skan * wired_count == 0. 358169695Skan */ 359169695Skan if (!vm_map_lookup_entry(map, addr, &entry) || 360169695Skan entry->start != addr || entry->end != addr + size || 361169695Skan entry->wired_count) 362169695Skan panic("kmem_malloc: entry not found or misaligned"); 363169695Skan entry->wired_count++; 364169695Skan 365169695Skan vm_map_simplify_entry(map, entry); 366169695Skan 367169695Skan /* 368169695Skan * Loop thru pages, entering them in the pmap. (We cannot add them to 369169695Skan * the wired count without wrapping the vm_page_queue_lock in 370169695Skan * splimp...) 371169695Skan */ 372169695Skan for (i = 0; i < size; i += PAGE_SIZE) { 373169695Skan m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); 374169695Skan vm_page_wire(m); 375169695Skan PAGE_WAKEUP(m); 376169695Skan pmap_enter(kernel_pmap, addr + i, VM_PAGE_TO_PHYS(m), 377169695Skan VM_PROT_ALL, 1); 378169695Skan m->flags |= PG_MAPPED|PG_WRITEABLE; 379169695Skan } 380169695Skan vm_map_unlock(map); 381169695Skan 382169695Skan return (addr); 383169695Skan} 384169695Skan 385169695Skan/* 386169695Skan * kmem_alloc_wait 387169695Skan * 388169695Skan * Allocates pageable memory from a sub-map of the kernel. If the submap 389169695Skan * has no room, the caller sleeps waiting for more memory in the submap. 390169695Skan * 391169695Skan */ 392169695Skanvm_offset_t 393169695Skankmem_alloc_wait(map, size) 394169695Skan vm_map_t map; 395169695Skan vm_size_t size; 396169695Skan{ 397169695Skan vm_offset_t addr; 398169695Skan 399169695Skan size = round_page(size); 400169695Skan 401169695Skan for (;;) { 402169695Skan /* 403169695Skan * To make this work for more than one map, use the map's lock 404169695Skan * to lock out sleepers/wakers. 
405169695Skan */ 406169695Skan vm_map_lock(map); 407169695Skan if (vm_map_findspace(map, 0, size, &addr) == 0) 408169695Skan break; 409169695Skan /* no space now; see if we can ever get space */ 410169695Skan if (vm_map_max(map) - vm_map_min(map) < size) { 411169695Skan vm_map_unlock(map); 412169695Skan return (0); 413169695Skan } 414169695Skan vm_map_unlock(map); 415169695Skan tsleep(map, PVM, "kmaw", 0); 416169695Skan } 417169695Skan vm_map_insert(map, NULL, (vm_offset_t) 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); 418169695Skan vm_map_unlock(map); 419169695Skan return (addr); 420169695Skan} 421169695Skan 422169695Skan/* 423169695Skan * kmem_free_wakeup 424169695Skan * 425169695Skan * Returns memory to a submap of the kernel, and wakes up any processes 426169695Skan * waiting for memory in that map. 427169695Skan */ 428169695Skanvoid 429169695Skankmem_free_wakeup(map, addr, size) 430169695Skan vm_map_t map; 431169695Skan vm_offset_t addr; 432169695Skan vm_size_t size; 433169695Skan{ 434169695Skan vm_map_lock(map); 435169695Skan (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); 436169695Skan wakeup(map); 437169695Skan vm_map_unlock(map); 438169695Skan} 439169695Skan 440169695Skan/* 441169695Skan * Create the kernel map; insert a mapping covering kernel text, data, bss, 442169695Skan * and all space allocated thus far (`boostrap' data). The new map will thus 443169695Skan * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and 444169695Skan * the range between `start' and `end' as free. 445169695Skan */ 446169695Skanvoid 447169695Skankmem_init(start, end) 448169695Skan vm_offset_t start, end; 449169695Skan{ 450169695Skan register vm_map_t m; 451169695Skan 452169695Skan m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end, FALSE); 453169695Skan vm_map_lock(m); 454169695Skan /* N.B.: cannot use kgdb to debug, starting with this assignment ... 
*/ 455169695Skan kernel_map = m; 456169695Skan (void) vm_map_insert(m, NULL, (vm_offset_t) 0, 457169695Skan VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, 0); 458169695Skan /* ... and ending with the completion of the above `insert' */ 459169695Skan vm_map_unlock(m); 460169695Skan} 461169695Skan