/* kern_malloc.c revision 117878 */
1/* 2 * Copyright (c) 1987, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 34 */ 35 36#include <sys/cdefs.h> 37__FBSDID("$FreeBSD: head/sys/kern/kern_malloc.c 117878 2003-07-22 10:24:41Z phk $"); 38 39#include "opt_vm.h" 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/kernel.h> 44#include <sys/lock.h> 45#include <sys/malloc.h> 46#include <sys/mbuf.h> 47#include <sys/mutex.h> 48#include <sys/vmmeter.h> 49#include <sys/proc.h> 50#include <sys/sysctl.h> 51#include <sys/time.h> 52 53#include <vm/vm.h> 54#include <vm/pmap.h> 55#include <vm/vm_param.h> 56#include <vm/vm_kern.h> 57#include <vm/vm_extern.h> 58#include <vm/vm_map.h> 59#include <vm/vm_page.h> 60#include <vm/uma.h> 61#include <vm/uma_int.h> 62#include <vm/uma_dbg.h> 63 64#if defined(INVARIANTS) && defined(__i386__) 65#include <machine/cpu.h> 66#endif 67 68/* 69 * When realloc() is called, if the new size is sufficiently smaller than 70 * the old size, realloc() will allocate a new, smaller block to avoid 71 * wasting memory. 'Sufficiently smaller' is defined as: newsize <= 72 * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'. 
 */
#ifndef REALLOC_FRACTION
#define	REALLOC_FRACTION	1	/* new block if <= half the size */
#endif

/* Statistics types for general-purpose kernel allocations. */
MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");

MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");

/* Allocator bootstrap, run at SI_SUB_KMEM time. */
static void kmeminit(void *);
SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL)

static MALLOC_DEFINE(M_FREE, "free", "should be on free list");

/* Head of the linked list of all registered malloc_type statistics. */
static struct malloc_type *kmemstatistics;
/* Bounds of the kmem submap created in kmeminit(). */
static char *kmembase;
static char *kmemlimit;

/*
 * Requests up to KMEM_ZMAX bytes are rounded up to a multiple of
 * KMEM_ZBASE and dispatched to a UMA zone through the kmemsize[]
 * size-to-zone-index lookup table.
 */
#define KMEM_ZSHIFT	4
#define KMEM_ZBASE	16
#define KMEM_ZMASK	(KMEM_ZBASE - 1)

#define KMEM_ZMAX	65536
#define KMEM_ZSIZE	(KMEM_ZMAX >> KMEM_ZSHIFT)
/* Maps (rounded size >> KMEM_ZSHIFT) to an index into kmemzones[]. */
static u_int8_t kmemsize[KMEM_ZSIZE + 1];

/* These won't be powers of two for long */
struct {
	int kz_size;		/* allocation size served by this zone */
	char *kz_name;		/* zone name, also shown in malloc stats */
	uma_zone_t kz_zone;	/* backing UMA zone, created in kmeminit() */
} kmemzones[] = {
	{16, "16", NULL},
	{32, "32", NULL},
	{64, "64", NULL},
	{128, "128", NULL},
	{256, "256", NULL},
	{512, "512", NULL},
	{1024, "1024", NULL},
	{2048, "2048", NULL},
	{4096, "4096", NULL},
	{8192, "8192", NULL},
	{16384, "16384", NULL},
	{32768, "32768", NULL},
	{65536, "65536", NULL},
	{0, NULL},
};

/* Size of the kmem submap in bytes, auto-tuned in kmeminit(). */
u_int vm_kmem_size;

/*
 * The malloc_mtx protects the kmemstatistics linked list.
 */

struct mtx malloc_mtx;

#ifdef MALLOC_PROFILE
/* Per-size-bucket request counters, reported by sysctl_kern_mprof(). */
uint64_t krequests[KMEM_ZSIZE + 1];

static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
#endif

static int sysctl_kern_malloc(SYSCTL_HANDLER_ARGS);

/* time_uptime of last malloc(9) failure */
static time_t t_malloc_fail;

#ifdef MALLOC_MAKE_FAILURES
/*
 * Causes malloc failures every (n) mallocs with M_NOWAIT.  If set to 0,
 * doesn't cause failures.
 */
SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
    "Kernel malloc debugging options");

static int malloc_failure_rate;
static int malloc_nowait_count;
static int malloc_failure_count;
SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RW,
    &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail");
TUNABLE_INT("debug.malloc.failure_rate", &malloc_failure_rate);
SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD,
    &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures");
#endif

/*
 * Return the number of seconds of uptime since the last malloc(9)
 * failure was recorded in t_malloc_fail.
 */
int
malloc_last_fail(void)
{

	return (time_uptime - t_malloc_fail);
}

/*
 * malloc:
 *
 *	Allocate a block of memory.
 *
 *	If M_NOWAIT is set, this routine will not block and return NULL if
 *	the allocation fails.
 */
void *
malloc(size, type, flags)
	unsigned long size;
	struct malloc_type *type;
	int flags;
{
	int indx;
	caddr_t va;
	uma_zone_t zone;
#ifdef DIAGNOSTIC
	unsigned long osize = size;	/* remember pre-roundup request size */
#endif
	register struct malloc_type *ksp = type;

#ifdef INVARIANTS
	/*
	 * To make sure that WAITOK or NOWAIT is set, but not more than
	 * one, and check against the API botches that are common.
	 */
	indx = flags & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
	if (indx != M_NOWAIT && indx != M_WAITOK) {
		static struct timeval lasterr;
		static int curerr, once;
		/* Rate-limit the complaint and repair the flags once. */
		if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) {
			printf("Bad malloc flags: %x\n", indx);
			backtrace();
			flags |= M_WAITOK;
			once++;
		}
	}
#endif
#if 0
	if (size == 0)
		Debugger("zero size malloc");
#endif
#ifdef MALLOC_MAKE_FAILURES
	/* Optionally fail every malloc_failure_rate'th M_NOWAIT request. */
	if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
		atomic_add_int(&malloc_nowait_count, 1);
		if ((malloc_nowait_count % malloc_failure_rate) == 0) {
			atomic_add_int(&malloc_failure_count, 1);
			t_malloc_fail = time_uptime;
			return (NULL);
		}
	}
#endif
	if (flags & M_WAITOK)
		KASSERT(curthread->td_intr_nesting_level == 0,
		    ("malloc(M_WAITOK) in interrupt context"));
	if (size <= KMEM_ZMAX) {
		/*
		 * Small request: round up to the next KMEM_ZBASE multiple
		 * and look up the UMA zone serving that size class.
		 */
		if (size & KMEM_ZMASK)
			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
		indx = kmemsize[size >> KMEM_ZSHIFT];
		zone = kmemzones[indx].kz_zone;
#ifdef MALLOC_PROFILE
		krequests[size >> KMEM_ZSHIFT]++;
#endif
		va = uma_zalloc(zone, flags);
		mtx_lock(&ksp->ks_mtx);
		if (va == NULL)
			goto out;

		/* Record which size classes this type has used. */
		ksp->ks_size |= 1 << indx;
		size = zone->uz_size;
	} else {
		/* Large request: page-rounded allocation outside the zones. */
		size = roundup(size, PAGE_SIZE);
		zone = NULL;
		va = uma_large_malloc(size, flags);
		mtx_lock(&ksp->ks_mtx);
		if (va == NULL)
			goto out;
	}
	ksp->ks_memuse += size;
	ksp->ks_inuse++;
out:
	/* Per-type statistics are updated with ks_mtx held, even on failure. */
	ksp->ks_calls++;
	if (ksp->ks_memuse > ksp->ks_maxused)
		ksp->ks_maxused = ksp->ks_memuse;

	mtx_unlock(&ksp->ks_mtx);
	if (flags & M_WAITOK)
		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
	else if (va == NULL)
		t_malloc_fail = time_uptime;
#ifdef	DIAGNOSTIC
	/* Scribble over the buffer to catch callers assuming zeroed memory. */
	if (va != NULL && !(flags & M_ZERO)) {
		memset(va, 0x70, osize);
	}
#endif
	return ((void *) va);
}

/*
 * free:
 *
 *	Free a block of memory
 *	allocated by malloc.
 *
 *	This routine may not block.
 */
void
free(arg, type)
	void const *arg;
	struct malloc_type *type;
{
	register struct malloc_type *ksp = type;
	uma_slab_t slab;
	u_long size;
	void *addr;

	addr = __DECONST(void *, arg);
	/* free(NULL, ...) does nothing */
	if (addr == NULL)
		return;

	KASSERT(ksp->ks_memuse > 0,
		("malloc(9)/free(9) confusion.\n%s",
		 "Probably freeing with wrong type, but maybe not here."));
	size = 0;

	/* Find the slab backing this address; mask off the sub-slab bits. */
	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));

	if (slab == NULL)
		panic("free: address %p(%p) has not been allocated.\n",
		    addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));


	if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
		/* Zone-backed (small) allocation. */
#ifdef INVARIANTS
		struct malloc_type **mtp = addr;
#endif
		size = slab->us_zone->uz_size;
#ifdef INVARIANTS
		/*
		 * Cache a pointer to the malloc_type that most recently freed
		 * this memory here.  This way we know who is most likely to
		 * have stepped on it later.
		 *
		 * This code assumes that size is a multiple of 8 bytes for
		 * 64 bit machines
		 */
		mtp = (struct malloc_type **)
		    ((unsigned long)mtp & ~UMA_ALIGN_PTR);
		mtp += (size - sizeof(struct malloc_type *)) /
		    sizeof(struct malloc_type *);
		*mtp = type;
#endif
		uma_zfree_arg(slab->us_zone, addr, slab);
	} else {
		/* Large allocation made by uma_large_malloc(). */
		size = slab->us_size;
		uma_large_free(slab);
	}
	mtx_lock(&ksp->ks_mtx);
	KASSERT(size <= ksp->ks_memuse,
		("malloc(9)/free(9) confusion.\n%s",
		 "Probably freeing with wrong type, but maybe not here."));
	ksp->ks_memuse -= size;
	ksp->ks_inuse--;
	mtx_unlock(&ksp->ks_mtx);
}

/*
 * realloc: change the size of a memory block
 */
void *
realloc(addr, size, type, flags)
	void *addr;
	unsigned long size;
	struct malloc_type *type;
	int flags;
{
	uma_slab_t slab;
	unsigned long alloc;
	void *newaddr;

	/* realloc(NULL, ...) is equivalent to malloc(...) */
	if (addr == NULL)
		return (malloc(size, type, flags));

	slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));

	/* Sanity check */
	KASSERT(slab != NULL,
	    ("realloc: address %p out of range", (void *)addr));

	/* Get the size of the original block */
	if (slab->us_zone)
		alloc = slab->us_zone->uz_size;
	else
		alloc = slab->us_size;

	/*
	 * Reuse the original block if appropriate: the request still fits
	 * and is not so much smaller (see REALLOC_FRACTION) that a copy
	 * into a smaller block would pay off.
	 */
	if (size <= alloc
	    && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
		return (addr);

	/* Allocate a new, bigger (or smaller) block */
	if ((newaddr = malloc(size, type, flags)) == NULL)
		return (NULL);

	/* Copy over original contents */
	bcopy(addr, newaddr, min(size, alloc));
	free(addr, type);
	return (newaddr);
}

/*
 * reallocf: same as realloc() but free memory on failure.
 */
void *
reallocf(addr, size, type, flags)
	void *addr;
	unsigned long size;
	struct malloc_type *type;
	int flags;
{
	void *mem;

	mem = realloc(addr, size, type, flags);
	/* On failure the original block is released, unlike realloc(). */
	if (mem == NULL)
		free(addr, type);
	return (mem);
}

/*
 * Initialize the kernel memory allocator
 */
/* ARGSUSED*/
static void
kmeminit(dummy)
	void *dummy;
{
	u_int8_t indx;
	u_long npg;
	u_long mem_size;
	int i;

	mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF);

	/*
	 * Try to auto-tune the kernel memory size, so that it is
	 * more applicable for a wider range of machine sizes.
	 * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while
	 * a VM_KMEM_SIZE of 12MB is a fair compromise.  The
	 * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space
	 * available, and on an X86 with a total KVA space of 256MB,
	 * try to keep VM_KMEM_SIZE_MAX at 80MB or below.
	 *
	 * Note that the kmem_map is also used by the zone allocator,
	 * so make sure that there is enough space.
	 */
	vm_kmem_size = VM_KMEM_SIZE;
	mem_size = cnt.v_page_count;

#if defined(VM_KMEM_SIZE_SCALE)
	if ((mem_size / VM_KMEM_SIZE_SCALE) > (vm_kmem_size / PAGE_SIZE))
		vm_kmem_size = (mem_size / VM_KMEM_SIZE_SCALE) * PAGE_SIZE;
#endif

#if defined(VM_KMEM_SIZE_MAX)
	if (vm_kmem_size >= VM_KMEM_SIZE_MAX)
		vm_kmem_size = VM_KMEM_SIZE_MAX;
#endif

	/* Allow final override from the kernel environment */
	TUNABLE_INT_FETCH("kern.vm.kmem.size", &vm_kmem_size);

	/*
	 * Limit kmem virtual size to twice the physical memory.
	 * This allows for kmem map sparseness, but limits the size
	 * to something sane. Be careful to not overflow the 32bit
	 * ints while doing the check.
	 */
	if (((vm_kmem_size / 2) / PAGE_SIZE) > cnt.v_page_count)
		vm_kmem_size = 2 * cnt.v_page_count * PAGE_SIZE;

	/*
	 * Tune settings based on the kernel map's size at this time.
	 */
	init_param3(vm_kmem_size / PAGE_SIZE);

	/*
	 * In mbuf_init(), we set up submaps for mbufs and clusters, in which
	 * case we rounddown() (nmbufs * MSIZE) and (nmbclusters * MCLBYTES),
	 * respectively. Mathematically, this means that what we do here may
	 * amount to slightly more address space than we need for the submaps,
	 * but it never hurts to have an extra page in kmem_map.
	 */
	npg = (nmbufs*MSIZE + nmbclusters*MCLBYTES + vm_kmem_size) / PAGE_SIZE;

	kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
		(vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
	kmem_map->system_map = 1;

	uma_startup2();

	/* Create the malloc zones and fill the size-to-zone lookup table. */
	for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) {
		int size = kmemzones[indx].kz_size;
		char *name = kmemzones[indx].kz_name;

		kmemzones[indx].kz_zone = uma_zcreate(name, size,
#ifdef INVARIANTS
		    mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini,
#else
		    NULL, NULL, NULL, NULL,
#endif
		    UMA_ALIGN_PTR, UMA_ZONE_MALLOC);

		/* All rounded sizes up to this zone's size map to indx. */
		for (;i <= size; i+= KMEM_ZBASE)
			kmemsize[i >> KMEM_ZSHIFT] = indx;

	}
}

/*
 * Register a malloc_type (normally via MALLOC_DEFINE/MALLOC_DECLARE)
 * on the kmemstatistics list and set up its statistics mutex.
 */
void
malloc_init(data)
	void *data;
{
	struct malloc_type *type = (struct malloc_type *)data;

	mtx_lock(&malloc_mtx);
	if (type->ks_magic != M_MAGIC)
		panic("malloc type lacks magic");

	if (cnt.v_page_count == 0)
		panic("malloc_init not allowed before vm init");

	/*
	 * Already on the list; nothing to do.
	 * NOTE(review): this early return appears to leave malloc_mtx
	 * held — looks like a lock leak; confirm against later revisions.
	 */
	if (type->ks_next != NULL)
		return;

	type->ks_next = kmemstatistics;
	kmemstatistics = type;
	mtx_init(&type->ks_mtx, type->ks_shortdesc, "Malloc Stats", MTX_DEF);
	mtx_unlock(&malloc_mtx);
}

/*
 * Unregister a malloc_type: unlink it from the kmemstatistics list
 * and destroy its statistics mutex.
 */
void
malloc_uninit(data)
	void *data;
{
	struct malloc_type *type = (struct malloc_type *)data;
	struct malloc_type *t;

	mtx_lock(&malloc_mtx);
	mtx_lock(&type->ks_mtx);
	if (type->ks_magic != M_MAGIC)
		panic("malloc type lacks magic");

	if (cnt.v_page_count == 0)
		panic("malloc_uninit not allowed before vm init");

	/* Unlink the type; head of the list is a special case. */
	if (type == kmemstatistics)
		kmemstatistics = type->ks_next;
	else {
		for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) {
			if (t->ks_next == type) {
				t->ks_next = type->ks_next;
				break;
			}
		}
	}
	type->ks_next = NULL;
	/* ks_mtx is destroyed while held; presumably valid here — verify. */
	mtx_destroy(&type->ks_mtx);
	mtx_unlock(&malloc_mtx);
}

/*
 * sysctl handler: render per-type malloc statistics as a text table.
 */
static int
sysctl_kern_malloc(SYSCTL_HANDLER_ARGS)
{
	struct malloc_type *type;
	int linesize = 128;
	int curline;
	int bufsize;
	int first;
	int error;
	char *buf;
	char *p;
	int cnt;	/* NB: shadows the global vmmeter 'cnt' */
	int len;
	int i;

	cnt = 0;

	/* Count the registered types to size the output buffer. */
	mtx_lock(&malloc_mtx);
	for (type = kmemstatistics; type != NULL; type = type->ks_next)
		cnt++;

	/*
	 * The buffer is allocated with the mutex dropped (M_WAITOK may
	 * sleep); cnt bounds the subsequent walk in case the list grew.
	 */
	mtx_unlock(&malloc_mtx);
	bufsize = linesize * (cnt + 1);
	p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
	mtx_lock(&malloc_mtx);

	len = snprintf(p, linesize,
	    "\n Type InUse MemUse HighUse Requests Size(s)\n");
	p += len;

	for (type = kmemstatistics; cnt != 0 && type != NULL;
	    type = type->ks_next, cnt--) {
		if (type->ks_calls == 0)
			continue;

		curline = linesize - 2;	/* Leave room for the \n */
		len = snprintf(p, curline, "%13s%6lu%6luK%7luK%9llu",
			type->ks_shortdesc,
			type->ks_inuse,
			(type->ks_memuse + 1023) / 1024,
			(type->ks_maxused + 1023) / 1024,
			(long long unsigned)type->ks_calls);
		curline -= len;
		p += len;

		/* Append the comma-separated list of size classes used. */
		first = 1;
		for (i = 0; i < sizeof(kmemzones) / sizeof(kmemzones[0]) - 1;
		    i++) {
			if (type->ks_size & (1 << i)) {
				if (first)
					len = snprintf(p, curline, " ");
				else
					len = snprintf(p, curline, ",");
				curline -= len;
				p += len;

				len = snprintf(p, curline,
				    "%s", kmemzones[i].kz_name);
				curline -= len;
				p += len;

				first = 0;
			}
		}

		len = snprintf(p, 2, "\n");
		p += len;
	}

	mtx_unlock(&malloc_mtx);
	error = SYSCTL_OUT(req, buf, p - buf);

	free(buf, M_TEMP);
	return (error);
}

SYSCTL_OID(_kern, OID_AUTO, malloc, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_kern_malloc, "A", "Malloc Stats");

#ifdef MALLOC_PROFILE

/*
 * sysctl handler: report per-size-bucket request counts from krequests[]
 * together with totals of memory used and wasted by size-class roundup.
 */
static int
sysctl_kern_mprof(SYSCTL_HANDLER_ARGS)
{
	int linesize = 64;
	uint64_t count;
	uint64_t waste;
	uint64_t mem;
	int bufsize;
	int error;
	char *buf;
	int rsize;
	int size;
	char *p;
	int len;
	int i;

	bufsize = linesize * (KMEM_ZSIZE + 1);
	bufsize += 128;	/* For the stats line */
	bufsize += 128;	/* For the banner line */
	waste = 0;
	mem = 0;

	p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
	len = snprintf(p, bufsize,
	    "\n Size Requests Real Size\n");
	bufsize -= len;
	p += len;

	for (i = 0; i < KMEM_ZSIZE; i++) {
		/* size is what was asked for; rsize what the zone delivers. */
		size = i << KMEM_ZSHIFT;
		rsize = kmemzones[kmemsize[i]].kz_size;
		count = (long long unsigned)krequests[i];

		len = snprintf(p, bufsize, "%6d%28llu%11d\n",
		    size, (unsigned long long)count, rsize);
		bufsize -= len;
		p += len;

		if ((rsize * count) > (size * count))
			waste += (rsize * count) - (size * count);
		mem += (rsize * count);
	}

	len = snprintf(p, bufsize,
	    "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n",
	    (unsigned long long)mem, (unsigned long long)waste);
	p += len;

	error = SYSCTL_OUT(req, buf, p - buf);

	free(buf, M_TEMP);
	return (error);
}

SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling");
#endif /* MALLOC_PROFILE */