/*
 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	kern/zalloc.c
 *	Author:	Avadis Tevanian, Jr.
 *
 *	Zone-based memory allocator.  A zone is a collection of fixed size
 *	data blocks for which quick allocation/deallocation is possible.
 */
#include <zone_debug.h>
#include <zone_alias_addr.h>

#include <mach/mach_types.h>
#include <mach/vm_param.h>
#include <mach/kern_return.h>
#include <mach/mach_host_server.h>
#include <mach/task_server.h>
#include <mach/machine/vm_types.h>
#include <mach_debug/zone_info.h>
#include <mach/vm_map.h>

#include <kern/kern_types.h>
#include <kern/assert.h>
#include <kern/host.h>
#include <kern/macro_help.h>
#include <kern/sched.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>
#include <kern/thread_call.h>
#include <kern/zalloc.h>
#include <kern/kalloc.h>
#include <kern/btlog.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>

#include <pexpert/pexpert.h>

#include <machine/machparam.h>
#include <machine/machine_routines.h> /* ml_cpu_get_info */

#include <libkern/OSDebug.h>
#include <libkern/OSAtomic.h>
#include <sys/kdebug.h>

/*
 * ZONE_ALIAS_ADDR
 *
 * With this option enabled, zones with alloc_size <= PAGE_SIZE allocate
 * a virtual page from the zone_map, but before zcram-ing the allocated memory
 * into the zone, the page is translated to use the alias address of the page
 * in the static kernel region. zone_gc reverses that translation when
 * scanning the freelist to collect free pages so that it can look up the page
 * in the zone_page_table, and free it to kmem_free.
 *
 * The static kernel region is a flat 1:1 mapping of physical memory passed
 * to xnu by the booter. It is mapped to the range:
 * [gVirtBase, gVirtBase + gPhysSize]
 *
 * Accessing memory via the static kernel region is faster due to the
 * entire region being mapped via large pages, cutting down
 * on TLB misses.
 *
 * zinit favors using PAGE_SIZE backing allocations for a zone unless it would
 * waste more than 10% space to use a single page, in order to take advantage
 * of the speed benefit for as many zones as possible.
 *
 * Zones with > PAGE_SIZE allocations can't take advantage of this
 * because kernel_memory_allocate doesn't give out physically contiguous pages.
 *
 * zone_virtual_addr()
 *  - translates an address from the static kernel region to the zone_map
 *  - returns the same address if it's not from the static kernel region
 * It relies on the fact that a physical page mapped to the
 * zone_map is not mapped anywhere else (except the static kernel region).
 *
 * zone_alias_addr()
 *  - translates a virtual memory address from the zone_map to the
 *    corresponding address in the static kernel region
 *
 */

#if !ZONE_ALIAS_ADDR
#define from_zone_map(addr, size) \
	((vm_offset_t)(addr) >= zone_map_min_address && \
	 ((vm_offset_t)(addr) + size - 1) < zone_map_max_address )
#else
#define from_zone_map(addr, size) \
	((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) >= zone_map_min_address && \
	 ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) + size - 1) < zone_map_max_address )
#endif

/*
 * Zone Corruption Debugging
 *
 * We use three techniques to detect modification of a zone element
 * after it's been freed.
 *
 * (1) Check the freelist next pointer for sanity.
 * (2) Store a backup of the next pointer at the end of the element,
 *     and compare it to the primary next pointer when the element is allocated
 *     to detect corruption of the freelist due to use-after-free bugs.
 *     The backup pointer is also XORed with a per-boot random cookie.
 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
 *     and check for that value when the element is being reused to make sure
 *     no part of the element has been modified while it was on the freelist.
 *     This will also help catch read-after-frees, as code will now dereference
 *     0xdeadbeef instead of a valid but freed pointer.
 *
 * (1) and (2) occur for every allocation and free to a zone.
 * This is done to make it slightly more difficult for an attacker to
 * manipulate the freelist to behave in a specific way.
 *
 * Poisoning (3) occurs periodically for every N frees (counted per-zone)
 * and on every free for zones smaller than a cacheline.  If -zp
 * is passed as a boot arg, poisoning occurs for every free.
 *
 * Performance slowdown is inversely proportional to the frequency of poisoning,
 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
 * and higher.  You can expect to find a 100% reproducible bug in an average of
 * N tries, with a standard deviation of about N, but you will want to set
 * "-zp" to always poison every free if you are attempting to reproduce
 * a known bug.
 *
 * For a more heavyweight, but finer-grained method of detecting misuse
 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
 *
 * Zone Corruption Logging
 *
 * You can also track where corruptions come from by using the boot-arguments
 * "zlog=<zone name to log> -zc".  Search for "Zone corruption logging" later
 * in this document for more implementation and usage information.
 *
 * Zone Leak Detection
 *
 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
 * found later in this file via the showtopztrace and showz* macros in kgmacros,
 * or use zlog without the -zc argument.
 *
 */
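/*
 * Illustrative sketch only (not part of the build): how techniques (1)-(3)
 * above fit together on the allocation path.  The names match the helpers
 * defined later in this file; the flow is simplified.
 *
 *	vm_offset_t next   = *(vm_offset_t *) element;
 *	vm_offset_t backup = *get_backup_ptr(zone->elem_size,
 *	                                     (vm_offset_t *) element);
 *
 *	if (!is_sane_zone_element(zone, next))             // technique (1)
 *		backup_ptr_mismatch_panic(...);
 *	if (next == (backup ^ zp_nopoison_cookie)) {       // technique (2)
 *		// clean element, safe to hand out
 *	} else if (next == (backup ^ zp_poisoned_cookie)) {
 *		// element was poisoned on free; verify the
 *		// 0xdeadbeef fill before reuse                // technique (3)
 *	} else {
 *		backup_ptr_mismatch_panic(...);                // corruption
 *	}
 */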
/* Returns TRUE if we rolled over the counter at factor */
static inline boolean_t
sample_counter(volatile uint32_t * count_p, uint32_t factor)
{
	uint32_t old_count, new_count;
	boolean_t rolled_over;

	do {
		new_count = old_count = *count_p;

		if (++new_count >= factor) {
			rolled_over = TRUE;
			new_count = 0;
		} else {
			rolled_over = FALSE;
		}

	} while (!OSCompareAndSwap(old_count, new_count, count_p));

	return rolled_over;
}

#if defined(__LP64__)
#define ZP_POISON	0xdeadbeefdeadbeef
#else
#define ZP_POISON	0xdeadbeef
#endif

#define ZP_DEFAULT_SAMPLING_FACTOR 16
#define ZP_DEFAULT_SCALE_FACTOR 4

/*
 * A zp_factor of 0 indicates zone poisoning is disabled,
 * however, we still poison zones smaller than zp_tiny_zone_limit (a cacheline).
 * Passing the -no-zp boot-arg disables even this behavior.
 * In all cases, we record and check the integrity of a backup pointer.
 */
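/*
 * Illustrative sketch only (not part of the build): a minimal version of
 * the per-free poisoning decision these knobs drive.  The real check lives
 * on the zfree() path and additionally scales zp_factor by the element
 * size using zp_scale.
 *
 *	boolean_t poison = FALSE;
 *
 *	if (zone->elem_size <= zp_tiny_zone_limit)
 *		poison = TRUE;          // tiny zones: always poison
 *	else if (zp_factor != 0 &&
 *	         sample_counter(&zone->zp_count, zp_factor))
 *		poison = TRUE;          // every Nth free rolls the counter over
 *
 *	if (poison)
 *		// fill the element with ZP_POISON words before
 *		// putting it on the freelist
 */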
/* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
uint32_t	zp_factor		= 0;

/* set by zp-scale=N boot arg, scales zp_factor by zone size */
uint32_t	zp_scale		= 0;

/* set in zp_init, zero indicates -no-zp boot-arg */
vm_size_t	zp_tiny_zone_limit	= 0;

/* initialized to a per-boot random value in zp_init */
uintptr_t	zp_poisoned_cookie	= 0;
uintptr_t	zp_nopoison_cookie	= 0;


/*
 * initialize zone poisoning
 * called from zone_bootstrap before any allocations are made from zalloc
 */
static inline void
zp_init(void)
{
	char temp_buf[16];

	/*
	 * Initialize backup pointer random cookie for poisoned elements
	 * Try not to call early_random() back to back, it may return
	 * the same value if mach_absolute_time doesn't have sufficient time
	 * to tick over between calls.  <rdar://problem/11597395>
	 * (This is only a problem on embedded devices)
	 */
	zp_poisoned_cookie = (uintptr_t) early_random();

	/*
	 * Always poison zones smaller than a cacheline,
	 * because it's pretty close to free
	 */
	ml_cpu_info_t cpu_info;
	ml_cpu_get_info(&cpu_info);
	zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;

	zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;
	zp_scale  = ZP_DEFAULT_SCALE_FACTOR;

	//TODO: Bigger permutation?
	/*
	 * Permute the default factor +/- 1 to make it less predictable
	 * This adds or subtracts ~4 poisoned objects per 1000 frees.
	 */
	if (zp_factor != 0) {
		uint32_t rand_bits = early_random() & 0x3;

		if (rand_bits == 0x1)
			zp_factor += 1;
		else if (rand_bits == 0x2)
			zp_factor -= 1;
		/* if 0x0 or 0x3, leave it alone */
	}

	/* -zp: enable poisoning for every alloc and free */
	if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
		zp_factor = 1;
	}

	/* -no-zp: disable poisoning completely even for tiny zones */
	if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
		zp_factor          = 0;
		zp_tiny_zone_limit = 0;
		printf("Zone poisoning disabled\n");
	}

	/* zp-factor=XXXX: override how often to poison freed zone elements */
	if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
		printf("Zone poisoning factor override: %u\n", zp_factor);
	}

	/* zp-scale=XXXX: override how much zone size scales zp-factor by */
	if (PE_parse_boot_argn("zp-scale", &zp_scale, sizeof(zp_scale))) {
		printf("Zone poisoning scale factor override: %u\n", zp_scale);
	}

	/* Initialize backup pointer random cookie for unpoisoned elements */
	zp_nopoison_cookie = (uintptr_t) early_random();

#if MACH_ASSERT
	if (zp_poisoned_cookie == zp_nopoison_cookie)
		panic("early_random() is broken: %p and %p are not random\n",
		      (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
#endif

	/*
	 * Use the last bit in the backup pointer to hint poisoning state
	 * to backup_ptr_mismatch_panic.  Valid zone pointers are aligned, so
	 * the low bits are zero.
	 */
	zp_poisoned_cookie |=   (uintptr_t)0x1ULL;
	zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);

#if defined(__LP64__)
	/*
	 * Make backup pointers more obvious in GDB for 64 bit
	 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
	 * (0xFACADE = 0xFFFFFF ^ 0x053521)
	 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
	 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
	 * by the sanity check, so it's OK for that part of the cookie to be predictable.
	 *
	 * TODO: Use #defines, xors, and shifts
	 */

	zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
	zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */

	zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
	zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
#endif
}

/* zone_map page count for page table structure */
uint64_t zone_map_table_page_count = 0;

/*
 * These macros are used to keep track of the number
 * of pages being used by the zone currently.  The
 * z->page_count is protected by the zone lock.
 */
#define ZONE_PAGE_COUNT_INCR(z, count)		\
{						\
	OSAddAtomic64(count, &(z->page_count));	\
}

#define ZONE_PAGE_COUNT_DECR(z, count)			\
{							\
	OSAddAtomic64(-count, &(z->page_count));	\
}

/* for is_sane_zone_element and garbage collection */

vm_offset_t	zone_map_min_address = 0;  /* initialized in zone_init */
vm_offset_t	zone_map_max_address = 0;

/* Helpful for walking through a zone's free element list. */
struct zone_free_element {
	struct zone_free_element *next;
	/* ... */
	/* void *backup_ptr; */
};

struct zone_page_metadata {
	queue_chain_t			pages;
	struct zone_free_element	*elements;
	zone_t				zone;
	uint16_t			alloc_count;
	uint16_t			free_count;
};

/* The backup pointer is stored in the last pointer-sized location in an element. */
static inline vm_offset_t *
get_backup_ptr(vm_size_t	elem_size,
               vm_offset_t	*element)
{
	return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
}

static inline struct zone_page_metadata *
get_zone_page_metadata(struct zone_free_element *element)
{
	return (struct zone_page_metadata *)(trunc_page((vm_offset_t)element) + PAGE_SIZE - sizeof(struct zone_page_metadata));
}

/*
 * Zone checking helper function.
 * A pointer that satisfies these conditions is OK to be a freelist next pointer
 * A pointer that doesn't satisfy these conditions indicates corruption
 */
static inline boolean_t
is_sane_zone_ptr(zone_t		zone,
                 vm_offset_t	addr,
                 size_t		obj_size)
{
	/* Must be aligned to pointer boundary */
	if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0))
		return FALSE;

	/* Must be a kernel address */
	if (__improbable(!pmap_kernel_va(addr)))
		return FALSE;

	/* Must be from zone map if the zone only uses memory from the zone_map */
	/*
	 * TODO: Remove the zone->collectable check when every
	 * zone using foreign memory is properly tagged with allows_foreign
	 */
	if (zone->collectable && !zone->allows_foreign) {
#if ZONE_ALIAS_ADDR
		/*
		 * If this address is in the static kernel region, it might be
		 * the alias address of a valid zone element.
		 * If we tried to find the zone_virtual_addr() of an invalid
		 * address in the static kernel region, it will panic, so don't
		 * check addresses in this region.
		 *
		 * TODO: Use a safe variant of zone_virtual_addr to
		 *       make this check more accurate
		 *
		 * The static kernel region is mapped at:
		 * [gVirtBase, gVirtBase + gPhysSize]
		 */
		if ((addr - gVirtBase) < gPhysSize)
			return TRUE;
#endif
		/* check if addr is from zone map */
		if (addr >= zone_map_min_address &&
		    (addr + obj_size - 1) < zone_map_max_address)
			return TRUE;

		return FALSE;
	}

	return TRUE;
}

static inline boolean_t
is_sane_zone_page_metadata(zone_t	zone,
                           vm_offset_t	page_meta)
{
	/* NULL page metadata structures are invalid */
	if (page_meta == 0)
		return FALSE;
	return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
}

static inline boolean_t
is_sane_zone_element(zone_t	zone,
                     vm_offset_t addr)
{
	/* NULL is OK because it indicates the tail of the list */
	if (addr == 0)
		return TRUE;
	return is_sane_zone_ptr(zone, addr, zone->elem_size);
}

/* Someone wrote to freed memory. */
static inline void /* noreturn */
zone_element_was_modified_panic(zone_t		zone,
                                vm_offset_t	element,
                                vm_offset_t	found,
                                vm_offset_t	expected,
                                vm_offset_t	offset)
{
	panic("a freed zone element has been modified in zone %s: expected %p but found %p, bits changed %p, at offset %d of %d in element %p, cookies %p %p",
	      zone->zone_name,
	      (void *)   expected,
	      (void *)   found,
	      (void *)   (expected ^ found),
	      (uint32_t) offset,
	      (uint32_t) zone->elem_size,
	      (void *)   element,
	      (void *)   zp_nopoison_cookie,
	      (void *)   zp_poisoned_cookie);
}

/*
 * The primary and backup pointers don't match.
 * Determine which one was likely the corrupted pointer, find out what it
 * probably should have been, and panic.
 * I would like to mark this as noreturn, but panic() isn't marked noreturn.
 */
static void /* noreturn */
backup_ptr_mismatch_panic(zone_t	zone,
                          vm_offset_t	element,
                          vm_offset_t	primary,
                          vm_offset_t	backup)
{
	vm_offset_t likely_backup;

	boolean_t   sane_backup;
	boolean_t   sane_primary = is_sane_zone_element(zone, primary);
	boolean_t   element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;

#if defined(__LP64__)
	/* We can inspect the tag in the upper bits for additional confirmation */
	if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000)
		element_was_poisoned = TRUE;
	else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000)
		element_was_poisoned = FALSE;
#endif

	if (element_was_poisoned) {
		likely_backup = backup ^ zp_poisoned_cookie;
		sane_backup = is_sane_zone_element(zone, likely_backup);
	} else {
		likely_backup = backup ^ zp_nopoison_cookie;
		sane_backup = is_sane_zone_element(zone, likely_backup);
	}

	/* The primary is definitely the corrupted one */
	if (!sane_primary && sane_backup)
		zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);

	/* The backup is definitely the corrupted one */
	if (sane_primary && !sane_backup)
		zone_element_was_modified_panic(zone, element, backup,
		                                (primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
		                                zone->elem_size - sizeof(vm_offset_t));

	/*
	 * Not sure which is the corrupted one.
	 * It's less likely that the backup pointer was overwritten with
	 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
	 * primary pointer has been overwritten with a sane but incorrect address.
	 */
	if (sane_primary && sane_backup)
		zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);

	/* Neither are sane, so just guess. */
	zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);
}
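/*
 * Worked example (illustrative only): why the LP64 tag check above works.
 * Kernel freelist pointers carry 0xFFFFFF in their top three bytes, so for
 * a poisoned element the stored backup is
 *
 *	backup = next ^ zp_poisoned_cookie
 *	       = 0xFFFFFFxxxxxxxxxx ^ 0x053521xxxxxxxxxx
 *	       = 0xFACADExxxxxxxxxx
 *
 * and likewise 0xFFFFFF ^ 0x3f0011 = 0xC0FFEE for the no-poison cookie,
 * which is what the (backup & 0xFFFFFF0000000000) comparisons recover.
 */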
/*
 * Sets the next element of tail to elem.
 * elem can be NULL.
 * Preserves the poisoning state of the element.
 */
static inline void
append_zone_element(zone_t			zone,
                    struct zone_free_element	*tail,
                    struct zone_free_element	*elem)
{
	vm_offset_t *backup = get_backup_ptr(zone->elem_size, (vm_offset_t *) tail);

	vm_offset_t old_backup = *backup;

	vm_offset_t old_next = (vm_offset_t) tail->next;
	vm_offset_t new_next = (vm_offset_t) elem;

	if      (old_next == (old_backup ^ zp_nopoison_cookie))
		*backup = new_next ^ zp_nopoison_cookie;
	else if (old_next == (old_backup ^ zp_poisoned_cookie))
		*backup = new_next ^ zp_poisoned_cookie;
	else
		backup_ptr_mismatch_panic(zone,
		                          (vm_offset_t) tail,
		                          old_next,
		                          old_backup);

	tail->next = elem;
}


/*
 * Insert a linked list of elements (delineated by head and tail) at the head of
 * the zone free list.  Every element in the list being added has already gone
 * through append_zone_element, so their backup pointers are already
 * set properly.
 * Precondition: There should be no elements after tail
 */
static inline void
add_list_to_zone(zone_t			zone,
                 struct zone_free_element *head,
                 struct zone_free_element *tail)
{
	assert(tail->next == NULL);
	assert(!zone->use_page_list);

	append_zone_element(zone, tail, zone->free_elements);

	zone->free_elements = head;
}


/*
 * Adds the element to the head of the zone's free list
 * Keeps a backup next-pointer at the end of the element
 */
static inline void
free_to_zone(zone_t	zone,
             vm_offset_t element,
             boolean_t	poison)
{
	vm_offset_t old_head;
	struct zone_page_metadata *page_meta;

	vm_offset_t *primary = (vm_offset_t *) element;
	vm_offset_t *backup  = get_backup_ptr(zone->elem_size, primary);

	if (zone->use_page_list) {
		page_meta = get_zone_page_metadata((struct zone_free_element *)element);
		assert(page_meta->zone == zone);
		old_head = (vm_offset_t)page_meta->elements;
	} else {
		old_head = (vm_offset_t)zone->free_elements;
	}

#if MACH_ASSERT
	if (__improbable(!is_sane_zone_element(zone, old_head)))
		panic("zfree: invalid head pointer %p for freelist of zone %s\n",
		      (void *) old_head, zone->zone_name);
#endif

	if (__improbable(!is_sane_zone_element(zone, element)))
		panic("zfree: freeing invalid pointer %p to zone %s\n",
		      (void *) element, zone->zone_name);

	/*
	 * Always write a redundant next pointer
	 * So that it is more difficult to forge, xor it with a random cookie
	 * A poisoned element is indicated by using zp_poisoned_cookie
	 * instead of zp_nopoison_cookie
	 */

	*backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);

	/* Insert this element at the head of the free list */
	*primary = old_head;
	if (zone->use_page_list) {
		page_meta->elements = (struct zone_free_element *)element;
		page_meta->free_count++;
		if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
			if (page_meta->free_count == 1) {
				/* first foreign element freed on page, move from all_used */
				remqueue((queue_entry_t)page_meta);
				enqueue_tail(&zone->pages.any_free_foreign, (queue_entry_t)page_meta);
			} else {
				/* no other list transitions */
			}
		} else if (page_meta->free_count == page_meta->alloc_count) {
			/* whether the page was on the intermediate or all_used queue, move it to free */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.all_free, (queue_entry_t)page_meta);
		} else if (page_meta->free_count == 1) {
			/* first free element on page, move from all_used */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta);
		}
	} else {
		zone->free_elements = (struct zone_free_element *)element;
	}
	zone->count--;
	zone->countfree++;
}
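/*
 * Illustrative summary (not part of the build) of the page queue
 * transitions free_to_zone makes for a use_page_list zone as free_count
 * rises:
 *
 *	all_used     -> any_free_foreign  (foreign page, first element freed)
 *	all_used     -> intermediate      (first element freed on page)
 *	intermediate -> all_free          (last element freed on page)
 *
 * try_alloc_from_zone below performs the mirror-image transitions as
 * free_count drops again.
 */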
/*
 * Removes an element from the zone's free list, returning 0 if the free list is empty.
 * Verifies that the next-pointer and backup next-pointer are intact,
 * and verifies that a poisoned element hasn't been modified.
 */
static inline vm_offset_t
try_alloc_from_zone(zone_t zone,
                    boolean_t* check_poison)
{
	vm_offset_t  element;
	struct zone_page_metadata *page_meta;

	*check_poison = FALSE;

	/* if zone is empty, bail */
	if (zone->use_page_list) {
		if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign))
			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
		else if (!queue_empty(&zone->pages.intermediate))
			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
		else if (!queue_empty(&zone->pages.all_free))
			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
		else {
			return 0;
		}

		/* Check if page_meta passes is_sane_zone_element */
		if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta)))
			panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
			      (void *) page_meta, zone->zone_name);
		assert(page_meta->zone == zone);
		element = (vm_offset_t)page_meta->elements;
	} else {
		if (zone->free_elements == NULL)
			return 0;

		element = (vm_offset_t)zone->free_elements;
	}

#if MACH_ASSERT
	if (__improbable(!is_sane_zone_element(zone, element)))
		panic("zalloc: invalid head pointer %p for freelist of zone %s\n",
		      (void *) element, zone->zone_name);
#endif

	vm_offset_t *primary = (vm_offset_t *) element;
	vm_offset_t *backup  = get_backup_ptr(zone->elem_size, primary);

	vm_offset_t  next_element        = *primary;
	vm_offset_t  next_element_backup = *backup;

	/*
	 * backup_ptr_mismatch_panic will determine what next_element
	 * should have been, and print it appropriately
	 */
	if (__improbable(!is_sane_zone_element(zone, next_element)))
		backup_ptr_mismatch_panic(zone, element, next_element, next_element_backup);

	/* Check the backup pointer for the regular cookie */
	if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {

		/* Check for the poisoned cookie instead */
		if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie)))
			/* Neither cookie is valid, corruption has occurred */
			backup_ptr_mismatch_panic(zone, element, next_element, next_element_backup);

		/*
		 * Element was marked as poisoned, so check its integrity before using it.
		 */
		*check_poison = TRUE;
	}

	if (zone->use_page_list) {

		/* Make sure the page_meta is at the correct offset from the start of page */
		if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element)))
			panic("zalloc: metadata located at incorrect location on page of zone %s\n",
			      zone->zone_name);

		/* Make sure next_element belongs to the same page as page_meta */
		if (next_element) {
			if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element)))
				panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
				      (void *)next_element, (void *)element, zone->zone_name);
		}
	}

	/* Remove this element from the free list */
	if (zone->use_page_list) {

		page_meta->elements = (struct zone_free_element *)next_element;
		page_meta->free_count--;

		if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
			if (page_meta->free_count == 0) {
				/* move to all used */
				remqueue((queue_entry_t)page_meta);
				enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta);
			} else {
				/* no other list transitions */
			}
		} else if (page_meta->free_count == 0) {
			/* remove from intermediate or free, move to all_used */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta);
		} else if (page_meta->alloc_count == page_meta->free_count + 1) {
			/* remove from free, move to intermediate */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta);
		}
	} else {
		zone->free_elements = (struct zone_free_element *)next_element;
	}
	zone->countfree--;
	zone->count++;
	zone->sum_count++;

	return element;
}


/*
 * End of zone poisoning
 */
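/*
 * Illustrative sketch only (not part of the build): the contract between
 * try_alloc_from_zone and its caller.  The freelist pointers themselves
 * are verified above; when check_poison comes back TRUE, the caller is
 * expected to verify the poison fill before handing the element out.
 *
 *	boolean_t   check_poison;
 *	vm_offset_t addr = try_alloc_from_zone(zone, &check_poison);
 *
 *	if (addr != 0 && check_poison) {
 *		// every word between the primary and backup pointers
 *		// must still be ZP_POISON, otherwise call
 *		// zone_element_was_modified_panic()
 *	}
 */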
/*
 * Fake zones for things that want to report via zprint but are not actually zones.
 */
struct fake_zone_info {
	const char* name;
	void (*init)(int);
	void (*query)(int *,
		      vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *,
		      uint64_t *, int *, int *, int *);
};

static const struct fake_zone_info fake_zones[] = {
	{
		.name = "kernel_stacks",
		.init = stack_fake_zone_init,
		.query = stack_fake_zone_info,
	},
	{
		.name = "page_tables",
		.init = pt_fake_zone_init,
		.query = pt_fake_zone_info,
	},
	{
		.name = "kalloc.large",
		.init = kalloc_fake_zone_init,
		.query = kalloc_fake_zone_info,
	},
};
static const unsigned int num_fake_zones =
	sizeof (fake_zones) / sizeof (fake_zones[0]);

/*
 * Zone info options
 */
boolean_t zinfo_per_task = FALSE;	/* enabled by -zinfop in boot-args */
#define ZINFO_SLOTS 200			/* for now */
#define ZONES_MAX (ZINFO_SLOTS - num_fake_zones - 1)

/*
 * Support for garbage collection of unused zone pages
 *
 * The kernel virtually allocates the "zone map" submap of the kernel
 * map.  When an individual zone needs more storage, memory is allocated
 * out of the zone map, and the two-level "zone_page_table" is
 * on-demand expanded so that it has entries for those pages.
 * zone_page_init()/zone_page_alloc() initialize "alloc_count"
 * to the number of zone elements that occupy the zone page (which may
 * be a minimum of 1, including if a zone element spans multiple
 * pages).
 *
 * Asynchronously, the zone_gc() logic attempts to walk zone free
 * lists to see if all the elements on a zone page are free.  If
 * "collect_count" (which it increments during the scan) matches
 * "alloc_count", the zone page is a candidate for collection and the
 * physical page is returned to the VM system.  During this process, the
 * first word of the zone page is re-used to maintain a linked list of
 * to-be-collected zone pages.
 */
typedef uint32_t zone_page_index_t;
#define ZONE_PAGE_INDEX_INVALID		((zone_page_index_t)0xFFFFFFFFU)

struct zone_page_table_entry {
	volatile	uint16_t	alloc_count;
	volatile	uint16_t	collect_count;
};

#define ZONE_PAGE_USED		0
#define ZONE_PAGE_UNUSED	0xffff

/* Forwards */
void		zone_page_init(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_alloc(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_free_element(
				zone_page_index_t	*free_page_head,
				zone_page_index_t	*free_page_tail,
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_collect(
				vm_offset_t	addr,
				vm_size_t	size);

boolean_t	zone_page_collectable(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_keep(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_display_zprint(void);

zone_t		zone_find_largest(void);

/*
 * Async allocation of zones
 * This mechanism allows for bootstrapping an empty zone which is setup with
 * non-blocking flags.  The first call to zalloc_noblock() will kick off a thread_call
 * to zalloc_async.  We perform a zalloc() (which may block) and then an immediate free.
 * This will prime the zone for the next use.
 *
 * Currently the thread_call callout function (zalloc_async) will loop through all zones
 * looking for any zone with async_pending set and do the work for it.
 *
 * NOTE: If the calling thread for zalloc_noblock is lower priority than the
 * thread_call, then zalloc_noblock to an empty zone may succeed.
 */
void		zalloc_async(
				thread_call_param_t	p0,
				thread_call_param_t	p1);

static thread_call_data_t call_async_alloc;
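/*
 * Illustrative sketch only (not part of the build): the async priming
 * described above.  zalloc_noblock() on an empty zone sets async_pending
 * and schedules call_async_alloc; the callout then does a blocking
 * alloc/free pair so the zone has a primed element next time:
 *
 *	// in the zalloc_async callout (may block)
 *	elt = zalloc_canblock(z, TRUE);
 *	zfree(z, elt);
 *	z->async_pending = FALSE;
 */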
vm_map_t	zone_map = VM_MAP_NULL;

zone_t		zone_zone = ZONE_NULL;	/* the zone containing other zones */

zone_t		zinfo_zone = ZONE_NULL;	/* zone of per-task zone info */

/*
 *	The VM system gives us an initial chunk of memory.
 *	It has to be big enough to allocate the zone_zone
 *	all the way through the pmap zone.
 */

vm_offset_t	zdata;
vm_size_t	zdata_size;

#define zone_wakeup(zone) thread_wakeup((event_t)(zone))
#define zone_sleep(zone)	\
	(void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN, (event_t)(zone), THREAD_UNINT);

/*
 *	The zone_locks_grp allows for collecting lock statistics.
 *	All locks are associated to this group in zinit.
 *	Look at tools/lockstat for debugging lock contention.
 */

lck_grp_t	zone_locks_grp;
lck_grp_attr_t	zone_locks_grp_attr;

#define lock_zone_init(zone)					\
MACRO_BEGIN							\
	lck_attr_setdefault(&(zone)->lock_attr);		\
	lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext,	\
	    &zone_locks_grp, &(zone)->lock_attr);		\
MACRO_END

#define lock_try_zone(zone)	lck_mtx_try_lock_spin(&zone->lock)

/*
 *	Garbage collection map information
 */
#define ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE (32)
struct zone_page_table_entry * volatile zone_page_table[ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE];
vm_size_t	zone_page_table_used_size;
unsigned int	zone_pages;
unsigned int	zone_page_table_second_level_size;	/* power of 2 */
unsigned int	zone_page_table_second_level_shift_amount;

#define zone_page_table_first_level_slot(x)  ((x) >> zone_page_table_second_level_shift_amount)
#define zone_page_table_second_level_slot(x) ((x) & (zone_page_table_second_level_size - 1))

void	zone_page_table_expand(zone_page_index_t pindex);
struct zone_page_table_entry *zone_page_table_lookup(zone_page_index_t pindex);
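/*
 * Illustrative sketch only (not part of the build): how the two slot
 * macros above combine into a lookup.  The real zone_page_table_lookup
 * must also tolerate a first-level slot whose second-level array has not
 * been expanded yet (i.e. is NULL).
 *
 *	struct zone_page_table_entry *
 *	lookup(zone_page_index_t pindex)
 *	{
 *		struct zone_page_table_entry *second =
 *		    zone_page_table[zone_page_table_first_level_slot(pindex)];
 *		return &second[zone_page_table_second_level_slot(pindex)];
 *	}
 */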
/*
 *	Exclude more than one concurrent garbage collection
 */
decl_lck_mtx_data(, zone_gc_lock)

lck_attr_t      zone_gc_lck_attr;
lck_grp_t       zone_gc_lck_grp;
lck_grp_attr_t  zone_gc_lck_grp_attr;
lck_mtx_ext_t   zone_gc_lck_ext;

/*
 *	Protects first_zone, last_zone, num_zones,
 *	and the next_zone field of zones.
 */
decl_simple_lock_data(, all_zones_lock)
zone_t		first_zone;
zone_t		*last_zone;
unsigned int	num_zones;

boolean_t zone_gc_allowed = TRUE;
boolean_t zone_gc_forced = FALSE;
boolean_t panic_include_zprint = FALSE;
boolean_t zone_gc_allowed_by_time_throttle = TRUE;

#define ZALLOC_DEBUG_ZONEGC	0x00000001
#define ZALLOC_DEBUG_ZCRAM	0x00000002
uint32_t zalloc_debug = 0;

/*
 * Zone leak debugging code
 *
 * When enabled, this code keeps a log to track allocations to a particular zone that have not
 * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
 * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
 * off by default.
 *
 * Enable the logging via the boot-args.  Add the parameter "zlog=<zone>" to boot-args where <zone>
 * is the name of the zone you wish to log.
 *
 * This code only tracks one zone, so you need to identify which one is leaking first.
 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
 * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
 * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
 * See the help in the kgmacros for usage info.
 *
 *
 * Zone corruption logging
 *
 * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
 * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
 * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
 * corrupted to examine its history.  This should lead to the source of the corruption.
 */

static int log_records;	/* size of the log, expressed in number of records */

#define MAX_ZONE_NAME	32	/* max length of a zone name we can take from the boot-args */

static char zone_name_to_log[MAX_ZONE_NAME] = "";	/* the zone name we're logging, if any */

/* Log allocations and frees to help debug a zone element corruption */
boolean_t	corruption_debug_flag = FALSE;	/* enabled by "-zc" boot-arg */

/*
 * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to
 * the number of records you want in the log.  For example, "zrecs=1000" sets it to 1000 records.  Note
 * that the larger the size of the log, the slower the system will run due to linear searching in the log,
 * but one doesn't generally care about performance when tracking down a leak.  The log is capped at 8000
 * records since going much larger than this tends to make the system unresponsive and unbootable on small
 * memory configurations.  The default value is 4000 records.
 */

#if defined(__LP64__)
#define ZRECORDS_MAX		128000		/* Max records allowed in the log */
#else
#define ZRECORDS_MAX		8000		/* Max records allowed in the log */
#endif
#define ZRECORDS_DEFAULT	4000		/* default records in log if zrecs is not specified in boot-args */

/*
 * Each record in the log contains a pointer to the zone element it refers to,
 * and a small array to hold the pc's from the stack trace.  A
 * record is added to the log each time a zalloc() is done in the zone_of_interest.  For leak debugging,
 * the record is cleared when a zfree() is done.  For corruption debugging, the log tracks both allocs and frees.
 * If the log fills, old records are replaced as if it were a circular buffer.
 */


/*
 * Opcodes for the btlog operation field:
 */

#define ZOP_ALLOC	1
#define ZOP_FREE	0

/*
 * The allocation log and all the related variables are protected by the zone lock for the zone_of_interest
 */
static btlog_t *zlog_btlog;		/* the log itself, dynamically allocated when logging is enabled */
static zone_t  zone_of_interest = NULL;	/* the zone being watched; corresponds to zone_name_to_log */

/*
 * Decide if we want to log this zone by doing a string compare between a zone name and the name
 * of the zone to log.  Return true if the strings are equal, false otherwise.  Because it's not
 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
 * match a space in the zone name.
 */

static int
log_this_zone(const char *zonename, const char *logname)
{
	int len;
	const char *zc = zonename;
	const char *lc = logname;

	/*
	 * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
	 */

	for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {

		/*
		 * If the current characters don't match, check for a space in
		 * the zone name and a corresponding period in the log name.
		 * If that's not there, then the strings don't match.
		 */

		if (*zc != *lc && !(*zc == ' ' && *lc == '.'))
			break;

		/*
		 * The strings are equal so far.  If we're at the end, then it's a match.
		 */

		if (*zc == '\0')
			return TRUE;
	}

	return FALSE;
}


/*
 * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in and
 * the buffer for the records has been allocated.
 */

#define DO_LOGGING(z)		(zlog_btlog && (z) == zone_of_interest)

extern boolean_t kmem_alloc_ready;

#if CONFIG_ZLEAKS
#pragma mark -
#pragma mark Zone Leak Detection

/*
 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
 * allocations made by the zone allocator.  Every zleak_sample_factor allocations in each zone, we capture a
 * backtrace.  Every free, we examine the table and determine if the allocation was being tracked,
 * and stop tracking it if it was being tracked.
 *
 * We track the allocations in the zallocations hash table, which stores the address that was returned from
 * the zone allocator.  Each stored entry in the zallocations table points to an entry in the ztraces table, which
 * stores the backtrace associated with that allocation.  This provides uniquing for the relatively large
 * backtraces - we don't store them more than once.
 *
 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
 * a large amount of virtual space.
 */
#define ZLEAK_STATE_ENABLED	0x01	/* Zone leak monitoring should be turned on if zone_map fills up. */
#define ZLEAK_STATE_ACTIVE	0x02	/* We are actively collecting traces. */
#define ZLEAK_STATE_ACTIVATING	0x04	/* Some thread is doing setup; others should move along. */
#define ZLEAK_STATE_FAILED	0x08	/* Attempt to allocate tables failed.  We will not try again. */
uint32_t	zleak_state = 0;	/* State of collection, as above */

boolean_t	panic_include_ztrace = FALSE;		/* Enable zleak logging on panic */
vm_size_t	zleak_global_tracking_threshold;	/* Size of zone map at which to start collecting data */
vm_size_t	zleak_per_zone_tracking_threshold;	/* Size a zone will have before we will collect data on it */
unsigned int	zleak_sample_factor = 1000;		/* Allocations per sample attempt */

/*
 * Counters for allocation statistics.
 */

/* Times two active records want to occupy the same spot */
unsigned int z_alloc_collisions = 0;
unsigned int z_trace_collisions = 0;

/* Times a new record lands on a spot previously occupied by a freed allocation */
unsigned int z_alloc_overwrites = 0;
unsigned int z_trace_overwrites = 0;

/* Times a new alloc or trace is put into the hash table */
unsigned int z_alloc_recorded = 0;
unsigned int z_trace_recorded = 0;

/* Times zleak_log returned false due to not being able to acquire the lock */
unsigned int z_total_conflicts = 0;


#pragma mark struct zallocation
/*
 * Structure for keeping track of an allocation
 * An allocation bucket is in use if its element is not NULL
 */
struct zallocation {
	uintptr_t	za_element;	/* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
	vm_size_t	za_size;	/* how much memory did this allocation take up? */
	uint32_t	za_trace_index;	/* index into ztraces for backtrace associated with allocation */
	/* TODO: #if this out */
	uint32_t	za_hit_count;	/* for determining effectiveness of hash function */
};
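/*
 * Illustrative sketch only (not part of the build): how the two hash
 * tables link up.  Given a live allocation record:
 *
 *	struct zallocation *za =
 *	    &zallocations[hashaddr(addr, zleak_alloc_buckets)];
 *	struct ztrace *zt = &ztraces[za->za_trace_index];
 *
 * zt->zt_size then holds the total outstanding bytes attributed to that
 * backtrace, and the largest such trace is cached in top_ztrace so it can
 * be reported at panic time.
 */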
*/ 1190 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */ 1191 /* TODO: #if this out */ 1192 uint32_t za_hit_count; /* for determining effectiveness of hash function */ 1193}; 1194 1195/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */ 1196uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM; 1197uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM; 1198 1199vm_size_t zleak_max_zonemap_size; 1200 1201/* Hashmaps of allocations and their corresponding traces */ 1202static struct zallocation* zallocations; 1203static struct ztrace* ztraces; 1204 1205/* not static so that panic can see this, see kern/debug.c */ 1206struct ztrace* top_ztrace; 1207 1208/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */ 1209static lck_spin_t zleak_lock; 1210static lck_attr_t zleak_lock_attr; 1211static lck_grp_t zleak_lock_grp; 1212static lck_grp_attr_t zleak_lock_grp_attr; 1213 1214/* 1215 * Initializes the zone leak monitor. Called from zone_init() 1216 */ 1217static void 1218zleak_init(vm_size_t max_zonemap_size) 1219{ 1220 char scratch_buf[16]; 1221 boolean_t zleak_enable_flag = FALSE; 1222 1223 zleak_max_zonemap_size = max_zonemap_size; 1224 zleak_global_tracking_threshold = max_zonemap_size / 2; 1225 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8; 1226 1227 /* -zleakoff (flag to disable zone leak monitor) */ 1228 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) { 1229 zleak_enable_flag = FALSE; 1230 printf("zone leak detection disabled\n"); 1231 } else { 1232 zleak_enable_flag = TRUE; 1233 printf("zone leak detection enabled\n"); 1234 } 1235 1236 /* zfactor=XXXX (override how often to sample the zone allocator) */ 1237 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) { 1238 printf("Zone leak factor override: %u\n", zleak_sample_factor); 1239 } 1240 1241 /* zleak-allocs=XXXX (override number of buckets in zallocations) */ 1242 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) { 1243 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets); 1244 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ 1245 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets-1))) { 1246 printf("Override isn't a power of two, bad things might happen!\n"); 1247 } 1248 } 1249 1250 /* zleak-traces=XXXX (override number of buckets in ztraces) */ 1251 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) { 1252 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets); 1253 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */ 1254 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets-1))) { 1255 printf("Override isn't a power of two, bad things might happen!\n"); 1256 } 1257 } 1258 1259 /* allocate the zleak_lock */ 1260 lck_grp_attr_setdefault(&zleak_lock_grp_attr); 1261 lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr); 1262 lck_attr_setdefault(&zleak_lock_attr); 1263 lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr); 1264 1265 if (zleak_enable_flag) { 1266 zleak_state = ZLEAK_STATE_ENABLED; 1267 } 1268} 1269 1270#if CONFIG_ZLEAKS 1271 1272/* 1273 * Support for kern.zleak.active sysctl - a simplified 1274 * version of the zleak_state variable. 
1275 */ 1276int 1277get_zleak_state(void) 1278{ 1279 if (zleak_state & ZLEAK_STATE_FAILED) 1280 return (-1); 1281 if (zleak_state & ZLEAK_STATE_ACTIVE) 1282 return (1); 1283 return (0); 1284} 1285 1286#endif 1287 1288 1289kern_return_t 1290zleak_activate(void) 1291{ 1292 kern_return_t retval; 1293 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation); 1294 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace); 1295 void *allocations_ptr = NULL; 1296 void *traces_ptr = NULL; 1297 1298 /* Only one thread attempts to activate at a time */ 1299 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { 1300 return KERN_SUCCESS; 1301 } 1302 1303 /* Indicate that we're doing the setup */ 1304 lck_spin_lock(&zleak_lock); 1305 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) { 1306 lck_spin_unlock(&zleak_lock); 1307 return KERN_SUCCESS; 1308 } 1309 1310 zleak_state |= ZLEAK_STATE_ACTIVATING; 1311 lck_spin_unlock(&zleak_lock); 1312 1313 /* Allocate and zero tables */ 1314 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size); 1315 if (retval != KERN_SUCCESS) { 1316 goto fail; 1317 } 1318 1319 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size); 1320 if (retval != KERN_SUCCESS) { 1321 goto fail; 1322 } 1323 1324 bzero(allocations_ptr, z_alloc_size); 1325 bzero(traces_ptr, z_trace_size); 1326 1327 /* Everything's set. Install tables, mark active. */ 1328 zallocations = allocations_ptr; 1329 ztraces = traces_ptr; 1330 1331 /* 1332 * Initialize the top_ztrace to the first entry in ztraces, 1333 * so we don't have to check for null in zleak_log 1334 */ 1335 top_ztrace = &ztraces[0]; 1336 1337 /* 1338 * Note that we do need a barrier between installing 1339 * the tables and setting the active flag, because the zfree() 1340 * path accesses the table without a lock if we're active. 1341 */ 1342 lck_spin_lock(&zleak_lock); 1343 zleak_state |= ZLEAK_STATE_ACTIVE; 1344 zleak_state &= ~ZLEAK_STATE_ACTIVATING; 1345 lck_spin_unlock(&zleak_lock); 1346 1347 return 0; 1348 1349fail: 1350 /* 1351 * If we fail to allocate memory, don't further tax 1352 * the system by trying again. 1353 */ 1354 lck_spin_lock(&zleak_lock); 1355 zleak_state |= ZLEAK_STATE_FAILED; 1356 zleak_state &= ~ZLEAK_STATE_ACTIVATING; 1357 lck_spin_unlock(&zleak_lock); 1358 1359 if (allocations_ptr != NULL) { 1360 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size); 1361 } 1362 1363 if (traces_ptr != NULL) { 1364 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size); 1365 } 1366 1367 return retval; 1368} 1369 1370/* 1371 * TODO: What about allocations that never get deallocated, 1372 * especially ones with unique backtraces? Should we wait to record 1373 * until after boot has completed? 1374 * (How many persistent zallocs are there?) 1375 */ 1376 1377/* 1378 * This function records the allocation in the allocations table, 1379 * and stores the associated backtrace in the traces table 1380 * (or just increments the refcount if the trace is already recorded) 1381 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and 1382 * the associated trace's refcount is decremented. 1383 * If the trace slot is in use, it returns. 1384 * The refcount is incremented by the amount of memory the allocation consumes. 1385 * The return value indicates whether to try again next time. 
1386 */ 1387static boolean_t 1388zleak_log(uintptr_t* bt, 1389 uintptr_t addr, 1390 uint32_t depth, 1391 vm_size_t allocation_size) 1392{ 1393 /* Quit if there's someone else modifying the hash tables */ 1394 if (!lck_spin_try_lock(&zleak_lock)) { 1395 z_total_conflicts++; 1396 return FALSE; 1397 } 1398 1399 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; 1400 1401 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets); 1402 struct ztrace* trace = &ztraces[trace_index]; 1403 1404 allocation->za_hit_count++; 1405 trace->zt_hit_count++; 1406 1407 /* 1408 * If the allocation bucket we want to be in is occupied, and if the occupier 1409 * has the same trace as us, just bail. 1410 */ 1411 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) { 1412 z_alloc_collisions++; 1413 1414 lck_spin_unlock(&zleak_lock); 1415 return TRUE; 1416 } 1417 1418 /* STEP 1: Store the backtrace in the traces array. */ 1419 /* A size of zero indicates that the trace bucket is free. */ 1420 1421 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0 ) { 1422 /* 1423 * Different unique trace with same hash! 1424 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated 1425 * and get out of the way for later chances 1426 */ 1427 trace->zt_collisions++; 1428 z_trace_collisions++; 1429 1430 lck_spin_unlock(&zleak_lock); 1431 return TRUE; 1432 } else if (trace->zt_size > 0) { 1433 /* Same trace, already added, so increment refcount */ 1434 trace->zt_size += allocation_size; 1435 } else { 1436 /* Found an unused trace bucket, record the trace here! */ 1437 if (trace->zt_depth != 0) /* if this slot was previously used but not currently in use */ 1438 z_trace_overwrites++; 1439 1440 z_trace_recorded++; 1441 trace->zt_size = allocation_size; 1442 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)) ); 1443 1444 trace->zt_depth = depth; 1445 trace->zt_collisions = 0; 1446 } 1447 1448 /* STEP 2: Store the allocation record in the allocations array. */ 1449 1450 if (allocation->za_element != (uintptr_t) 0) { 1451 /* 1452 * Straight up replace any allocation record that was there. We don't want to do the work 1453 * to preserve the allocation entries that were there, because we only record a subset of the 1454 * allocations anyways. 1455 */ 1456 1457 z_alloc_collisions++; 1458 1459 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index]; 1460 /* Knock off old allocation's size, not the new allocation */ 1461 associated_trace->zt_size -= allocation->za_size; 1462 } else if (allocation->za_trace_index != 0) { 1463 /* Slot previously used but not currently in use */ 1464 z_alloc_overwrites++; 1465 } 1466 1467 allocation->za_element = addr; 1468 allocation->za_trace_index = trace_index; 1469 allocation->za_size = allocation_size; 1470 1471 z_alloc_recorded++; 1472 1473 if (top_ztrace->zt_size < trace->zt_size) 1474 top_ztrace = trace; 1475 1476 lck_spin_unlock(&zleak_lock); 1477 return TRUE; 1478} 1479 1480/* 1481 * Free the allocation record and release the stacktrace. 1482 * This should be as fast as possible because it will be called for every free. 
1483 */ 1484static void 1485zleak_free(uintptr_t addr, 1486 vm_size_t allocation_size) 1487{ 1488 if (addr == (uintptr_t) 0) 1489 return; 1490 1491 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)]; 1492 1493 /* Double-checked locking: check to find out if we're interested, lock, check to make 1494 * sure it hasn't changed, then modify it, and release the lock. 1495 */ 1496 1497 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { 1498 /* if the allocation was the one, grab the lock, check again, then delete it */ 1499 lck_spin_lock(&zleak_lock); 1500 1501 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) { 1502 struct ztrace *trace; 1503 1504 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */ 1505 if (allocation->za_size != allocation_size) { 1506 panic("Freeing as size %lu memory that was allocated with size %lu\n", 1507 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size); 1508 } 1509 1510 trace = &ztraces[allocation->za_trace_index]; 1511 1512 /* size of 0 indicates trace bucket is unused */ 1513 if (trace->zt_size > 0) { 1514 trace->zt_size -= allocation_size; 1515 } 1516 1517 /* A NULL element means the allocation bucket is unused */ 1518 allocation->za_element = 0; 1519 } 1520 lck_spin_unlock(&zleak_lock); 1521 } 1522} 1523 1524#endif /* CONFIG_ZLEAKS */ 1525 1526/* These functions outside of CONFIG_ZLEAKS because they are also used in 1527 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix. 1528 */ 1529 1530/* 1531 * This function captures a backtrace from the current stack and 1532 * returns the number of frames captured, limited by max_frames. 1533 * It's fast because it does no checking to make sure there isn't bad data. 1534 * Since it's only called from threads that we're going to keep executing, 1535 * if there's bad data we were going to die eventually. 1536 * If this function is inlined, it doesn't record the frame of the function it's inside. 1537 * (because there's no stack frame!) 
1538 */ 1539 1540uint32_t 1541fastbacktrace(uintptr_t* bt, uint32_t max_frames) 1542{ 1543 uintptr_t* frameptr = NULL, *frameptr_next = NULL; 1544 uintptr_t retaddr = 0; 1545 uint32_t frame_index = 0, frames = 0; 1546 uintptr_t kstackb, kstackt; 1547 thread_t cthread = current_thread(); 1548 1549 if (__improbable(cthread == NULL)) 1550 return 0; 1551 1552 kstackb = cthread->kernel_stack; 1553 kstackt = kstackb + kernel_stack_size; 1554 /* Load stack frame pointer (EBP on x86) into frameptr */ 1555 frameptr = __builtin_frame_address(0); 1556 if (((uintptr_t)frameptr > kstackt) || ((uintptr_t)frameptr < kstackb)) 1557 frameptr = NULL; 1558 1559 while (frameptr != NULL && frame_index < max_frames ) { 1560 /* Next frame pointer is pointed to by the previous one */ 1561 frameptr_next = (uintptr_t*) *frameptr; 1562 1563 /* Bail if we see a zero in the stack frame, that means we've reached the top of the stack */ 1564 /* That also means the return address is worthless, so don't record it */ 1565 if (frameptr_next == NULL) 1566 break; 1567 /* Verify thread stack bounds */ 1568 if (((uintptr_t)frameptr_next > kstackt) || ((uintptr_t)frameptr_next < kstackb)) 1569 break; 1570 /* Pull return address from one spot above the frame pointer */ 1571 retaddr = *(frameptr + 1); 1572 1573 /* Store it in the backtrace array */ 1574 bt[frame_index++] = retaddr; 1575 1576 frameptr = frameptr_next; 1577 } 1578 1579 /* Save the number of frames captured for return value */ 1580 frames = frame_index; 1581 1582 /* Fill in the rest of the backtrace with zeros */ 1583 while (frame_index < max_frames) 1584 bt[frame_index++] = 0; 1585 1586 return frames; 1587} 1588 1589/* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */ 1590uintptr_t 1591hash_mix(uintptr_t x) 1592{ 1593#ifndef __LP64__ 1594 x += ~(x << 15); 1595 x ^= (x >> 10); 1596 x += (x << 3 ); 1597 x ^= (x >> 6 ); 1598 x += ~(x << 11); 1599 x ^= (x >> 16); 1600#else 1601 x += ~(x << 32); 1602 x ^= (x >> 22); 1603 x += ~(x << 13); 1604 x ^= (x >> 8 ); 1605 x += (x << 3 ); 1606 x ^= (x >> 15); 1607 x += ~(x << 27); 1608 x ^= (x >> 31); 1609#endif 1610 return x; 1611} 1612 1613uint32_t 1614hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) 1615{ 1616 1617 uintptr_t hash = 0; 1618 uintptr_t mask = max_size - 1; 1619 1620 while (depth) { 1621 hash += bt[--depth]; 1622 } 1623 1624 hash = hash_mix(hash) & mask; 1625 1626 assert(hash < max_size); 1627 1628 return (uint32_t) hash; 1629} 1630 1631/* 1632 * TODO: Determine how well distributed this is 1633 * max_size must be a power of 2. i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask 1634 */ 1635uint32_t 1636hashaddr(uintptr_t pt, uint32_t max_size) 1637{ 1638 uintptr_t hash = 0; 1639 uintptr_t mask = max_size - 1; 1640 1641 hash = hash_mix(pt) & mask; 1642 1643 assert(hash < max_size); 1644 1645 return (uint32_t) hash; 1646} 1647 1648/* End of all leak-detection code */ 1649#pragma mark - 1650 1651/* 1652 * zinit initializes a new zone. The zone data structures themselves 1653 * are stored in a zone, which is initially a static structure that 1654 * is initialized by zone_init. 
/*
 * zinit initializes a new zone.  The zone data structures themselves
 * are stored in a zone, which is initially a static structure that
 * is initialized by zone_init.
 */
zone_t
zinit(
	vm_size_t	size,		/* the size of an element */
	vm_size_t	max,		/* maximum memory to use */
	vm_size_t	alloc,		/* allocation size */
	const char	*name)		/* a name for the zone */
{
	zone_t		z;
	boolean_t	use_page_list = FALSE;

	if (zone_zone == ZONE_NULL) {

		z = (struct zone *)zdata;
		/* special handling in zcram() because the first element is being used */
	} else
		z = (zone_t) zalloc(zone_zone);

	if (z == ZONE_NULL)
		return(ZONE_NULL);

	/* Zone elements must fit both a next pointer and a backup pointer */
	vm_size_t minimum_element_size = sizeof(vm_offset_t) * 2;
	if (size < minimum_element_size)
		size = minimum_element_size;

	/*
	 * Round the element size up to a multiple of sizeof(pointer).
	 * This also guarantees that allocations are aligned on pointer boundaries.
	 */
	size = ((size-1) + sizeof(vm_offset_t)) -
	       ((size-1) % sizeof(vm_offset_t));

	if (alloc == 0)
		alloc = PAGE_SIZE;

	alloc = round_page(alloc);
	max   = round_page(max);

	/*
	 * We look for an allocation size with less than 1% waste
	 * up to 5 pages in size...
	 * Otherwise, we look for an allocation size with least fragmentation
	 * in the range of 1 - 5 pages.
	 * This size will be used unless
	 * the user suggestion is larger AND has less fragmentation
	 */
#if	ZONE_ALIAS_ADDR
	/* Favor PAGE_SIZE allocations unless we waste >10% space */
	if ((size < PAGE_SIZE) && (PAGE_SIZE % size <= PAGE_SIZE / 10))
		alloc = PAGE_SIZE;
	else
#endif
#if	defined(__LP64__)
		if (((alloc % size) != 0) || (alloc > PAGE_SIZE * 8))
#endif
		{
			vm_size_t best, waste; unsigned int i;
			best  = PAGE_SIZE;
			waste = best % size;

			for (i = 1; i <= 5; i++) {
				vm_size_t tsize, twaste;

				tsize = i * PAGE_SIZE;

				if ((tsize % size) < (tsize / 100)) {
					alloc = tsize;
					goto use_this_allocation;
				}
				twaste = tsize % size;
				if (twaste < waste)
					best = tsize, waste = twaste;
			}
			if (alloc <= best || (alloc % size >= waste))
				alloc = best;
		}
use_this_allocation:
	if (max && (max < alloc))
		max = alloc;

	/*
	 * Opt into page list tracking if we can reliably map an allocation
	 * to its page_metadata, and if the wastage in the tail of
	 * the allocation is not too large
	 */
	if (alloc == PAGE_SIZE) {
		if ((PAGE_SIZE % size) >= sizeof(struct zone_page_metadata)) {
			use_page_list = TRUE;
		} else if ((PAGE_SIZE - sizeof(struct zone_page_metadata)) % size <= PAGE_SIZE / 100) {
			use_page_list = TRUE;
		}
	}

	z->free_elements = NULL;
	queue_init(&z->pages.any_free_foreign);
	queue_init(&z->pages.all_free);
	queue_init(&z->pages.intermediate);
	queue_init(&z->pages.all_used);
	z->cur_size = 0;
	z->page_count = 0;
	z->max_size = max;
	z->elem_size = size;
	z->alloc_size = alloc;
	z->zone_name = name;
	z->count = 0;
	z->countfree = 0;
	z->sum_count = 0LL;
	z->doing_alloc = FALSE;
	z->doing_gc = FALSE;
	z->exhaustible = FALSE;
	z->collectable = TRUE;
	z->allows_foreign = FALSE;
	z->expandable = TRUE;
	z->waiting = FALSE;
	z->async_pending = FALSE;
	z->caller_acct = TRUE;
	z->noencrypt = FALSE;
	z->no_callout = FALSE;
	z->async_prio_refill = FALSE;
	z->gzalloc_exempt = FALSE;
	z->alignment_required = FALSE;
	z->use_page_list = use_page_list;
	z->prio_refill_watermark = 0;
	z->zone_replenish_thread = NULL;
	z->zp_count = 0;
#if	CONFIG_ZLEAKS
	z->zleak_capture = 0;
	z->zleak_on = FALSE;
#endif	/* CONFIG_ZLEAKS */

#if	ZONE_DEBUG
	z->active_zones.next = z->active_zones.prev = NULL;
	zone_debug_enable(z);
#endif	/* ZONE_DEBUG */
	lock_zone_init(z);

	/*
	 * Add the zone to the all-zones list.
	 * If we are tracking zone info per task, and we have
	 * already used all the available stat slots, then keep
	 * using the overflow zone slot.
	 */
	z->next_zone = ZONE_NULL;
	simple_lock(&all_zones_lock);
	*last_zone = z;
	last_zone = &z->next_zone;
	z->index = num_zones;
	if (zinfo_per_task) {
		if (num_zones > ZONES_MAX)
			z->index = ZONES_MAX;
	}
	num_zones++;
	simple_unlock(&all_zones_lock);

	/*
	 * Check if we should be logging this zone.  If so, remember the zone pointer.
	 */
	if (log_this_zone(z->zone_name, zone_name_to_log)) {
		zone_of_interest = z;
	}

	/*
	 * If we want to log a zone, see if we need to allocate buffer space for the log.  Some vm related zones are
	 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case.  kmem_alloc_ready is set to
	 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work.  If we want to log one
	 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
	 * later on some other zone.  So note we may be allocating a buffer to log a zone other than the one being initialized
	 * right now.
	 */
	if (zone_of_interest != NULL && zlog_btlog == NULL && kmem_alloc_ready) {
		zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, NULL, NULL, NULL);
		if (zlog_btlog) {
			printf("zone: logging started for zone %s\n", zone_of_interest->zone_name);
		} else {
			printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
			zone_of_interest = NULL;
		}
	}
#if	CONFIG_GZALLOC
	gzalloc_zone_init(z);
#endif
	return(z);
}
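/*
 * Usage sketch for zinit (hypothetical zone, not one created by this file):
 * a subsystem with a fixed-size record type typically creates its zone once
 * at init time and then allocates/frees elements from it.
 *
 *	struct widget { uint64_t w_id; queue_chain_t w_link; };
 *	static zone_t widget_zone;
 *
 *	widget_zone = zinit(sizeof(struct widget),		// element size
 *			    8192 * sizeof(struct widget),	// max memory to use
 *			    PAGE_SIZE,				// allocation chunk; 0 lets zinit pick
 *			    "widgets");				// name shown by zprint
 *
 *	struct widget *w = (struct widget *)zalloc(widget_zone);
 *	...
 *	zfree(widget_zone, w);
 *
 * Note that the element size is rounded up as described above, so zone
 * elements can be larger than sizeof(struct widget).
 */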
unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;

static void zone_replenish_thread(zone_t);

/* High priority VM privileged thread used to asynchronously refill a designated
 * zone, such as the reserved VM map entry zone.
 */
static void zone_replenish_thread(zone_t z) {
	vm_size_t free_size;
	current_thread()->options |= TH_OPT_VMPRIV;

	for (;;) {
		lock_zone(z);
		assert(z->prio_refill_watermark != 0);
		while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
			assert(z->doing_alloc == FALSE);
			assert(z->async_prio_refill == TRUE);

			unlock_zone(z);
			int	zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
			vm_offset_t space, alloc_size;
			kern_return_t kr;

			if (vm_pool_low())
				alloc_size = round_page(z->elem_size);
			else
				alloc_size = z->alloc_size;

			if (z->noencrypt)
				zflags |= KMA_NOENCRYPT;

			kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags);

			if (kr == KERN_SUCCESS) {
#if	ZONE_ALIAS_ADDR
				if (alloc_size == PAGE_SIZE)
					space = zone_alias_addr(space);
#endif
				ZONE_PAGE_COUNT_INCR(z, (alloc_size / PAGE_SIZE));
				zcram(z, space, alloc_size);
			} else if (kr == KERN_RESOURCE_SHORTAGE) {
				VM_PAGE_WAIT();
			} else if (kr == KERN_NO_SPACE) {
				kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags);
				if (kr == KERN_SUCCESS) {
#if	ZONE_ALIAS_ADDR
					if (alloc_size == PAGE_SIZE)
						space = zone_alias_addr(space);
#endif
					zcram(z, space, alloc_size);
				} else {
					assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
					thread_block(THREAD_CONTINUE_NULL);
				}
			}

			lock_zone(z);
			zone_replenish_loops++;
		}

		unlock_zone(z);
		/* Signal any potential throttled consumers, terminating
		 * their timer-bounded waits.
		 */
		thread_wakeup(z);

		assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
		thread_block(THREAD_CONTINUE_NULL);
		zone_replenish_wakeups++;
	}
}

void
zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark) {
	z->prio_refill_watermark = low_water_mark;

	z->async_prio_refill = TRUE;
	OSMemoryBarrier();
	kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);

	if (tres != KERN_SUCCESS) {
		panic("zone_prio_refill_configure, thread create: 0x%x", tres);
	}

	thread_deallocate(z->zone_replenish_thread);
}
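/*
 * Usage sketch for the refill machinery (hypothetical zone and watermark):
 * a caller that must be able to zalloc without blocking, e.g. deep inside
 * the VM, opts a zone into asynchronous refill right after zinit:
 *
 *	reserved_zone = zinit(sizeof(struct reserved_elem), max_mem, 0, "reserved.elems");
 *	zone_prio_refill_configure(reserved_zone, 100);	// keep >= 100 free elements
 *
 * From then on, zalloc_internal (below) wakes z->zone_replenish_thread
 * whenever the free count dips under the watermark, and throttles
 * non-VM-privileged consumers below half the watermark so the refill
 * thread can catch up.
 */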
"" : "[F]", (unsigned long)size); 1951 1952 if (from_zm && !zone->use_page_list) 1953 zone_page_init(newmem, size); 1954 1955 lock_zone(zone); 1956 1957 if (zone->use_page_list) { 1958 struct zone_page_metadata *page_metadata; 1959 1960 assert((newmem & PAGE_MASK) == 0); 1961 assert((size & PAGE_MASK) == 0); 1962 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) { 1963 1964 vm_size_t pos_in_page; 1965 page_metadata = (struct zone_page_metadata *)(newmem + PAGE_SIZE - sizeof(struct zone_page_metadata)); 1966 1967 page_metadata->pages.next = NULL; 1968 page_metadata->pages.prev = NULL; 1969 page_metadata->elements = NULL; 1970 page_metadata->zone = zone; 1971 page_metadata->alloc_count = 0; 1972 page_metadata->free_count = 0; 1973 1974 enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_metadata); 1975 1976 for (pos_in_page = 0; (newmem + pos_in_page + elem_size) < (vm_offset_t)page_metadata; pos_in_page += elem_size) { 1977 page_metadata->alloc_count++; 1978 zone->count++; /* compensate for free_to_zone */ 1979 if ((newmem + pos_in_page) == (vm_offset_t)zone) { 1980 /* 1981 * special case for the "zone_zone" zone, which is using the first 1982 * allocation of its pmap_steal_memory()-ed allocation for 1983 * the "zone_zone" variable already. 1984 */ 1985 } else { 1986 free_to_zone(zone, newmem + pos_in_page, FALSE); 1987 } 1988 zone->cur_size += elem_size; 1989 } 1990 } 1991 } else { 1992 while (size >= elem_size) { 1993 zone->count++; /* compensate for free_to_zone */ 1994 if (newmem == (vm_offset_t)zone) { 1995 /* Don't free zone_zone zone */ 1996 } else { 1997 free_to_zone(zone, newmem, FALSE); 1998 } 1999 if (from_zm) 2000 zone_page_alloc(newmem, elem_size); 2001 size -= elem_size; 2002 newmem += elem_size; 2003 zone->cur_size += elem_size; 2004 } 2005 } 2006 unlock_zone(zone); 2007} 2008 2009 2010/* 2011 * Steal memory for the zone package. Called from 2012 * vm_page_bootstrap(). 2013 */ 2014void 2015zone_steal_memory(void) 2016{ 2017#if CONFIG_GZALLOC 2018 gzalloc_configure(); 2019#endif 2020 /* Request enough early memory to get to the pmap zone */ 2021 zdata_size = 12 * sizeof(struct zone); 2022 zdata_size = round_page(zdata_size); 2023 zdata = (vm_offset_t)pmap_steal_memory(zdata_size); 2024} 2025 2026 2027/* 2028 * Fill a zone with enough memory to contain at least nelem elements. 2029 * Memory is obtained with kmem_alloc_kobject from the kernel_map. 2030 * Return the number of elements actually put into the zone, which may 2031 * be more than the caller asked for since the memory allocation is 2032 * rounded up to a full page. 2033 */ 2034int 2035zfill( 2036 zone_t zone, 2037 int nelem) 2038{ 2039 kern_return_t kr; 2040 vm_size_t size; 2041 vm_offset_t memory; 2042 int nalloc; 2043 2044 assert(nelem > 0); 2045 if (nelem <= 0) 2046 return 0; 2047 size = nelem * zone->elem_size; 2048 size = round_page(size); 2049 kr = kmem_alloc_kobject(kernel_map, &memory, size); 2050 if (kr != KERN_SUCCESS) 2051 return 0; 2052 2053 zone_change(zone, Z_FOREIGN, TRUE); 2054 ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE)); 2055 zcram(zone, memory, size); 2056 nalloc = (int)(size / zone->elem_size); 2057 assert(nalloc >= nelem); 2058 2059 return nalloc; 2060} 2061 2062/* 2063 * Initialize the "zone of zones" which uses fixed memory allocated 2064 * earlier in memory initialization. zone_bootstrap is called 2065 * before zone_init. 
/*
 * Initialize the "zone of zones" which uses fixed memory allocated
 * earlier in memory initialization.  zone_bootstrap is called
 * before zone_init.
 */
void
zone_bootstrap(void)
{
	char temp_buf[16];

	if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof(temp_buf))) {
		zinfo_per_task = TRUE;
	}

	if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug)))
		zalloc_debug = 0;

	/* Set up zone element poisoning */
	zp_init();

	/* should zlog log to debug zone corruption instead of leaks? */
	if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
		corruption_debug_flag = TRUE;
	}

	/*
	 * Check for and set up zone leak detection if requested via boot-args.  We recognize two
	 * boot-args:
	 *
	 *	zlog=<zone_to_log>
	 *	zrecs=<num_records_in_log>
	 *
	 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
	 * control the size of the log.  If zrecs is not specified, a default value is used.
	 */
	if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
		if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {

			/*
			 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
			 * This prevents accidentally hogging too much kernel memory and making the system
			 * unusable.
			 */
			log_records = MIN(ZRECORDS_MAX, log_records);

		} else {
			log_records = ZRECORDS_DEFAULT;
		}
	}

	simple_lock_init(&all_zones_lock, 0);

	first_zone = ZONE_NULL;
	last_zone = &first_zone;
	num_zones = 0;
	thread_call_setup(&call_async_alloc, zalloc_async, NULL);

	/* assertion: nobody else called zinit before us */
	assert(zone_zone == ZONE_NULL);

	/* initializing global lock group for zones */
	lck_grp_attr_setdefault(&zone_locks_grp_attr);
	lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);

	zone_zone = zinit(sizeof(struct zone), 128 * sizeof(struct zone),
			  sizeof(struct zone), "zones");
	zone_change(zone_zone, Z_COLLECT, FALSE);
	zone_change(zone_zone, Z_CALLERACCT, FALSE);
	zone_change(zone_zone, Z_NOENCRYPT, TRUE);

	zcram(zone_zone, zdata, zdata_size);

	/* initialize fake zones and zone info if tracking by task */
	if (zinfo_per_task) {
		vm_size_t zisize = sizeof(zinfo_usage_store_t) * ZINFO_SLOTS;
		unsigned int i;

		for (i = 0; i < num_fake_zones; i++)
			fake_zones[i].init(ZINFO_SLOTS - num_fake_zones + i);
		zinfo_zone = zinit(zisize, zisize * CONFIG_TASK_MAX,
				   zisize, "per task zinfo");
		zone_change(zinfo_zone, Z_CALLERACCT, FALSE);
	}
}
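/*
 * Example boot-args for the logging knobs parsed above (the zone name is
 * just an illustration; any name a zone was zinit'ed with works):
 *
 *	zlog=zones zrecs=8000	# log the "zones" zone with 8000 records
 *	zlog=zones -zc		# same zone, corruption-style logging
 *
 * Regardless of what is requested, zrecs is clamped to ZRECORDS_MAX.
 */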
void
zinfo_task_init(task_t task)
{
	if (zinfo_per_task) {
		task->tkm_zinfo = zalloc(zinfo_zone);
		memset(task->tkm_zinfo, 0, sizeof(zinfo_usage_store_t) * ZINFO_SLOTS);
	} else {
		task->tkm_zinfo = NULL;
	}
}

void
zinfo_task_free(task_t task)
{
	assert(task != kernel_task);
	if (task->tkm_zinfo != NULL) {
		zfree(zinfo_zone, task->tkm_zinfo);
		task->tkm_zinfo = NULL;
	}
}

/* Global initialization of Zone Allocator.
 * Runs after zone_bootstrap.
 */
void
zone_init(
	vm_size_t max_zonemap_size)
{
	kern_return_t	retval;
	vm_offset_t	zone_min;
	vm_offset_t	zone_max;

	retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size,
			       FALSE, VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT,
			       &zone_map);

	if (retval != KERN_SUCCESS)
		panic("zone_init: kmem_suballoc failed");
	zone_max = zone_min + round_page(max_zonemap_size);
#if	CONFIG_GZALLOC
	gzalloc_init(max_zonemap_size);
#endif
	/*
	 * Set up garbage collection information:
	 */
	zone_map_min_address = zone_min;
	zone_map_max_address = zone_max;

#if defined(__LP64__)
	/*
	 * ensure that any vm_page_t that gets created from
	 * the vm_page zone can be packed properly (see vm_page.h
	 * for the packing requirements)
	 */
	if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_min_address)) != (vm_page_t)zone_map_min_address)
		panic("VM_PAGE_PACK_PTR failed on zone_map_min_address - %p", (void *)zone_map_min_address);

	if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address)) != (vm_page_t)zone_map_max_address)
		panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address);
#endif

	zone_pages = (unsigned int)atop_kernel(zone_max - zone_min);
	zone_page_table_used_size = sizeof(zone_page_table);

	zone_page_table_second_level_size = 1;
	zone_page_table_second_level_shift_amount = 0;

	/*
	 * Find the power of 2 for the second level that allows
	 * the first level to fit in ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE
	 * slots.
	 */
	while ((zone_page_table_first_level_slot(zone_pages-1)) >= ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE) {
		zone_page_table_second_level_size <<= 1;
		zone_page_table_second_level_shift_amount++;
	}

	lck_grp_attr_setdefault(&zone_gc_lck_grp_attr);
	lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr);
	lck_attr_setdefault(&zone_gc_lck_attr);
	lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr);

#if CONFIG_ZLEAKS
	/*
	 * Initialize the zone leak monitor
	 */
	zleak_init(max_zonemap_size);
#endif /* CONFIG_ZLEAKS */
}
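/*
 * The page table sized above is a two-level radix tree over zone_map page
 * indices: a page index splits into a first-level slot (high bits) and a
 * second-level slot (low bits).  A minimal sketch of the split, assuming
 * the shift/size variables computed in zone_init and that the two slot
 * helpers implement exactly this arithmetic:
 *
 *	first  = pindex >> zone_page_table_second_level_shift_amount;
 *	second = pindex & (zone_page_table_second_level_size - 1);
 *	entry  = &zone_page_table[first][second];
 *
 * Second-level arrays are only materialized on demand, by
 * zone_page_table_expand() below.
 */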
void
zone_page_table_expand(zone_page_index_t pindex)
{
	unsigned int first_index;
	struct zone_page_table_entry * volatile * first_level_ptr;

	assert(pindex < zone_pages);

	first_index = zone_page_table_first_level_slot(pindex);
	first_level_ptr = &zone_page_table[first_index];

	if (*first_level_ptr == NULL) {
		/*
		 * We were able to verify the old first-level slot
		 * had NULL, so attempt to populate it.
		 */
		vm_offset_t second_level_array = 0;
		vm_size_t second_level_size = round_page(zone_page_table_second_level_size * sizeof(struct zone_page_table_entry));
		zone_page_index_t i;
		struct zone_page_table_entry *entry_array;

		if (kmem_alloc_kobject(zone_map, &second_level_array,
				       second_level_size) != KERN_SUCCESS) {
			panic("zone_page_table_expand");
		}
		zone_map_table_page_count += (second_level_size / PAGE_SIZE);

		/*
		 * zone_gc() may scan the "zone_page_table" directly,
		 * so make sure any slots have a valid unused state.
		 */
		entry_array = (struct zone_page_table_entry *)second_level_array;
		for (i = 0; i < zone_page_table_second_level_size; i++) {
			entry_array[i].alloc_count = ZONE_PAGE_UNUSED;
			entry_array[i].collect_count = 0;
		}

		if (OSCompareAndSwapPtr(NULL, entry_array, first_level_ptr)) {
			/* Old slot was NULL, replaced with expanded level */
			OSAddAtomicLong(second_level_size, &zone_page_table_used_size);
		} else {
			/* Old slot was not NULL, someone else expanded first */
			kmem_free(zone_map, second_level_array, second_level_size);
			zone_map_table_page_count -= (second_level_size / PAGE_SIZE);
		}
	} else {
		/* Old slot was not NULL, already been expanded */
	}
}

struct zone_page_table_entry *
zone_page_table_lookup(zone_page_index_t pindex)
{
	unsigned int first_index = zone_page_table_first_level_slot(pindex);
	struct zone_page_table_entry *second_level = zone_page_table[first_index];

	if (second_level) {
		return &second_level[zone_page_table_second_level_slot(pindex)];
	}

	return NULL;
}

extern volatile SInt32 kfree_nop_count;

#pragma mark -
#pragma mark zalloc_canblock

/*
 * zalloc returns an element from the specified zone.
 */
static void *
zalloc_internal(
	zone_t	zone,
	boolean_t canblock,
	boolean_t nopagewait)
{
	vm_offset_t	addr = 0;
	kern_return_t	retval;
	uintptr_t	zbt[MAX_ZTRACE_DEPTH];	/* used in zone leak logging and zone leak detection */
	int		numsaved = 0;
	boolean_t	zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE;
#if	CONFIG_GZALLOC || ZONE_DEBUG
	boolean_t	did_gzalloc = FALSE;
#endif
	thread_t	thr = current_thread();
	boolean_t	check_poison = FALSE;

#if CONFIG_ZLEAKS
	uint32_t	zleak_tracedepth = 0;	/* log this allocation if nonzero */
#endif /* CONFIG_ZLEAKS */

	assert(zone != ZONE_NULL);

#if	CONFIG_GZALLOC
	addr = gzalloc_alloc(zone, canblock);
	did_gzalloc = (addr != 0);
#endif

	/*
	 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
	 */
	if (__improbable(DO_LOGGING(zone)))
		numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);

#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: capture a backtrace every zleak_sample_factor
	 * allocations in this zone.
	 */
	if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
		/* Avoid backtracing twice if zone logging is on */
		if (numsaved == 0)
			zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH);
		else
			zleak_tracedepth = numsaved;
	}
#endif /* CONFIG_ZLEAKS */

	lock_zone(zone);

	if (zone->async_prio_refill && zone->zone_replenish_thread) {
		do {
			vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size));
			vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size;
			zone_replenish_wakeup = (zfreec < zrefillwm);
			zone_alloc_throttle = (zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0);

			if (zone_replenish_wakeup) {
				zone_replenish_wakeups_initiated++;
				unlock_zone(zone);
				/* Signal the potentially waiting
				 * refill thread.
				 */
				thread_wakeup(&zone->zone_replenish_thread);

				/* Scheduling latencies etc. may prevent
				 * the refill thread from keeping up
				 * with demand.  Throttle consumers
				 * when we fall below half the
				 * watermark, unless VM privileged.
				 */
				if (zone_alloc_throttle) {
					zone_replenish_throttle_count++;
					assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
					thread_block(THREAD_CONTINUE_NULL);
				}
				lock_zone(zone);
			}
		} while (zone_alloc_throttle == TRUE);
	}

	if (__probable(addr == 0))
		addr = try_alloc_from_zone(zone, &check_poison);


	while ((addr == 0) && canblock) {
		/*
		 * If nothing was there, try to get more
		 */
		if (zone->doing_alloc) {
			/*
			 * Someone is allocating memory for this zone.
			 * Wait for it to show up, then try again.
			 */
			zone->waiting = TRUE;
			zone_sleep(zone);
		} else if (zone->doing_gc) {
			/* zone_gc() is running.  Since we need an element
			 * from the free list that is currently being
			 * collected, set the waiting bit and try to
			 * interrupt the GC process, and try again
			 * when we obtain the lock.
			 */
			zone->waiting = TRUE;
			zone_sleep(zone);
		} else {
			vm_offset_t space;
			vm_size_t alloc_size;
			int retry = 0;

			if ((zone->cur_size + zone->elem_size) >
			    zone->max_size) {
				if (zone->exhaustible)
					break;
				if (zone->expandable) {
					/*
					 * We're willing to overflow certain
					 * zones, but not without complaining.
					 *
					 * This is best used in conjunction
					 * with the collectable flag. What we
					 * want is an assurance we can get the
					 * memory back, assuming there's no
					 * leak.
					 */
					zone->max_size += (zone->max_size >> 1);
				} else {
					unlock_zone(zone);

					panic_include_zprint = TRUE;
#if CONFIG_ZLEAKS
					if (zleak_state & ZLEAK_STATE_ACTIVE)
						panic_include_ztrace = TRUE;
#endif /* CONFIG_ZLEAKS */
					panic("zalloc: zone \"%s\" empty.", zone->zone_name);
				}
			}
			zone->doing_alloc = TRUE;
			unlock_zone(zone);

			for (;;) {
				int	zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;

				if (vm_pool_low() || retry >= 1)
					alloc_size =
						round_page(zone->elem_size);
				else
					alloc_size = zone->alloc_size;

				if (zone->noencrypt)
					zflags |= KMA_NOENCRYPT;

				retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags);
				if (retval == KERN_SUCCESS) {
#if	ZONE_ALIAS_ADDR
					if (alloc_size == PAGE_SIZE)
						space = zone_alias_addr(space);
#endif

#if CONFIG_ZLEAKS
					if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
						if (zone_map->size >= zleak_global_tracking_threshold) {
							kern_return_t kr;

							kr = zleak_activate();
							if (kr != KERN_SUCCESS) {
								printf("Failed to activate live zone leak debugging (%d).\n", kr);
							}
						}
					}

					if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
						if (zone->cur_size > zleak_per_zone_tracking_threshold) {
							zone->zleak_on = TRUE;
						}
					}
#endif /* CONFIG_ZLEAKS */
					ZONE_PAGE_COUNT_INCR(zone, (alloc_size / PAGE_SIZE));
					zcram(zone, space, alloc_size);

					break;
				} else if (retval != KERN_RESOURCE_SHORTAGE) {
					retry++;

					if (retry == 2) {
						zone_gc(TRUE);
						printf("zalloc did gc\n");
						zone_display_zprint();
					}
					if (retry == 3) {
						panic_include_zprint = TRUE;
#if CONFIG_ZLEAKS
						if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
							panic_include_ztrace = TRUE;
						}
#endif /* CONFIG_ZLEAKS */
						if (retval == KERN_NO_SPACE) {
							zone_t zone_largest = zone_find_largest();
							panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)",
								zone->zone_name, zone_largest->zone_name,
								(unsigned long)zone_largest->cur_size, zone_largest->count);
						}
						panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count);
					}
				} else {
					break;
				}
			}
			lock_zone(zone);
			zone->doing_alloc = FALSE;
			if (zone->waiting) {
				zone->waiting = FALSE;
				zone_wakeup(zone);
			}
			addr = try_alloc_from_zone(zone, &check_poison);
			if (addr == 0 &&
			    retval == KERN_RESOURCE_SHORTAGE) {
				if (nopagewait == TRUE)
					break;	/* out of the main while loop */
				unlock_zone(zone);

				VM_PAGE_WAIT();
				lock_zone(zone);
			}
		}
		if (addr == 0)
			addr = try_alloc_from_zone(zone, &check_poison);
	}

#if CONFIG_ZLEAKS
	/* Zone leak detection:
	 * If we're sampling this allocation, add it to the zleaks hash table.
	 */
	if (addr && zleak_tracedepth > 0)  {
		/* Sampling can fail if another sample is happening at the same time in a different zone. */
		if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
			/* If it failed, roll back the counter so we sample the next allocation instead. */
			zone->zleak_capture = zleak_sample_factor;
		}
	}
#endif /* CONFIG_ZLEAKS */


	if ((addr == 0) && (!canblock || nopagewait) && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
		zone->async_pending = TRUE;
		unlock_zone(zone);
		thread_call_enter(&call_async_alloc);
		lock_zone(zone);
		addr = try_alloc_from_zone(zone, &check_poison);
	}

	/*
	 * See if we should be logging allocations in this zone.  Logging is rarely done except when a leak is
	 * suspected, so this code rarely executes.  We need to do this code while still holding the zone lock
	 * since it protects the various log related data structures.
	 */
	if (__improbable(DO_LOGGING(zone) && addr)) {
		btlog_add_entry(zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
	}

	vm_offset_t inner_size = zone->elem_size;

#if	ZONE_DEBUG
	if (!did_gzalloc && addr && zone_debug_enabled(zone)) {
		enqueue_tail(&zone->active_zones, (queue_entry_t)addr);
		addr += ZONE_DEBUG_OFFSET;
		inner_size -= ZONE_DEBUG_OFFSET;
	}
#endif

	unlock_zone(zone);

	if (__improbable(check_poison && addr)) {
		vm_offset_t *element_cursor  = ((vm_offset_t *) addr) + 1;
		vm_offset_t *backup  = get_backup_ptr(inner_size, (vm_offset_t *) addr);

		for ( ; element_cursor < backup ; element_cursor++)
			if (__improbable(*element_cursor != ZP_POISON))
				zone_element_was_modified_panic(zone,
								addr,
								*element_cursor,
								ZP_POISON,
								((vm_offset_t)element_cursor) - addr);
	}

	if (addr) {
		/*
		 * Clear out the old next pointer and backup to avoid leaking the cookie
		 * and so that only values on the freelist have a valid cookie
		 */
		vm_offset_t *primary  = (vm_offset_t *) addr;
		vm_offset_t *backup   = get_backup_ptr(inner_size, primary);

		*primary = ZP_POISON;
		*backup  = ZP_POISON;
	}

	TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);

	if (addr) {
		task_t task;
		zinfo_usage_t zinfo;
		vm_size_t sz = zone->elem_size;

		if (zone->caller_acct)
			ledger_credit(thr->t_ledger, task_ledgers.tkm_private, sz);
		else
			ledger_credit(thr->t_ledger, task_ledgers.tkm_shared, sz);

		if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
			OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].alloc);
	}
	return((void *)addr);
}


void *
zalloc(zone_t zone)
{
	return (zalloc_internal(zone, TRUE, FALSE));
}

void *
zalloc_noblock(zone_t zone)
{
	return (zalloc_internal(zone, FALSE, FALSE));
}

void *
zalloc_nopagewait(zone_t zone)
{
	return (zalloc_internal(zone, TRUE, TRUE));
}

void *
zalloc_canblock(zone_t zone, boolean_t canblock)
{
	return (zalloc_internal(zone, canblock, FALSE));
}


void
zalloc_async(
	__unused thread_call_param_t	p0,
	__unused thread_call_param_t	p1)
{
	zone_t current_z = NULL, head_z;
	unsigned int max_zones, i;
	void *elt = NULL;
	boolean_t pending = FALSE;

	simple_lock(&all_zones_lock);
	head_z = first_zone;
	max_zones = num_zones;
	simple_unlock(&all_zones_lock);
	current_z = head_z;
	for (i = 0; i < max_zones; i++) {
		lock_zone(current_z);
		if (current_z->async_pending == TRUE) {
			current_z->async_pending = FALSE;
			pending = TRUE;
		}
		unlock_zone(current_z);

		if (pending == TRUE) {
			elt = zalloc_canblock(current_z, TRUE);
			zfree(current_z, elt);
			pending = FALSE;
		}
		/*
		 * This is based on the assumption that zones never get
		 * freed once allocated and linked.
		 * Hence a read outside of the lock is OK.
		 */
		current_z = current_z->next_zone;
	}
}
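/*
 * The wrappers above just pin down the (canblock, nopagewait) pair for
 * zalloc_internal.  Illustrative call sites (hypothetical zone):
 *
 *	p = zalloc(widget_zone);		// may block and wait for pages
 *	p = zalloc_noblock(widget_zone);	// never blocks; may return NULL
 *	p = zalloc_nopagewait(widget_zone);	// may block on the zone, but
 *						// won't wait for free pages
 *
 * Callers of the non-blocking variants must handle a NULL return.
 */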
/*
 * zget returns an element from the specified zone
 * and immediately returns nothing if there is nothing there.
 *
 * This form should be used when you can not block (like when
 * processing an interrupt).
 *
 * XXX: It seems like only vm_page_grab_fictitious_common uses this, and its
 *  friend vm_page_more_fictitious can block, so it doesn't seem like
 *  this is used for interrupts any more....
 */
void *
zget(
	register zone_t	zone)
{
	vm_offset_t	addr;
	boolean_t	check_poison = FALSE;

#if CONFIG_ZLEAKS
	uintptr_t	zbt[MAX_ZTRACE_DEPTH];	/* used for zone leak detection */
	uint32_t	zleak_tracedepth = 0;	/* log this allocation if nonzero */
#endif /* CONFIG_ZLEAKS */

	assert( zone != ZONE_NULL );

#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: capture a backtrace
	 */
	if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
		zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH);
	}
#endif /* CONFIG_ZLEAKS */

	if (!lock_try_zone(zone))
		return NULL;

	addr = try_alloc_from_zone(zone, &check_poison);

	vm_offset_t inner_size = zone->elem_size;

#if	ZONE_DEBUG
	if (addr && zone_debug_enabled(zone)) {
		enqueue_tail(&zone->active_zones, (queue_entry_t)addr);
		addr += ZONE_DEBUG_OFFSET;
		inner_size -= ZONE_DEBUG_OFFSET;
	}
#endif	/* ZONE_DEBUG */

#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: record the allocation
	 */
	if (zone->zleak_on && zleak_tracedepth > 0 && addr) {
		/* Sampling can fail if another sample is happening at the same time in a different zone. */
		if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
			/* If it failed, roll back the counter so we sample the next allocation instead. */
			zone->zleak_capture = zleak_sample_factor;
		}
	}
#endif /* CONFIG_ZLEAKS */

	unlock_zone(zone);

	if (__improbable(check_poison && addr)) {
		vm_offset_t *element_cursor  = ((vm_offset_t *) addr) + 1;
		vm_offset_t *backup  = get_backup_ptr(inner_size, (vm_offset_t *) addr);

		for ( ; element_cursor < backup ; element_cursor++)
			if (__improbable(*element_cursor != ZP_POISON))
				zone_element_was_modified_panic(zone,
								addr,
								*element_cursor,
								ZP_POISON,
								((vm_offset_t)element_cursor) - addr);
	}

	if (addr) {
		/*
		 * Clear out the old next pointer and backup to avoid leaking the cookie
		 * and so that only values on the freelist have a valid cookie
		 */
		vm_offset_t *primary  = (vm_offset_t *) addr;
		vm_offset_t *backup   = get_backup_ptr(inner_size, primary);

		*primary = ZP_POISON;
		*backup  = ZP_POISON;
	}

	return((void *) addr);
}
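/*
 * Note that zget can fail for two independent reasons: the zone lock is
 * contended (lock_try_zone fails) or the freelist is empty, and a NULL
 * return does not say which.  Illustrative call site (hypothetical zone):
 *
 *	struct widget *w = (struct widget *)zget(widget_zone);
 *	if (w == NULL)
 *		return KERN_RESOURCE_SHORTAGE;	// caller must cope, never spin
 */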
/* Keep this FALSE by default.  Large memory machines run orders of magnitude
   slower in debug mode when it is TRUE.  Use the debugger to enable if needed */
/* static */ boolean_t zone_check = FALSE;

static void zone_check_freelist(zone_t zone, vm_offset_t elem)
{
	struct zone_free_element *this;
	struct zone_page_metadata *thispage;

	if (zone->use_page_list) {
		if (zone->allows_foreign) {
			for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
			     !queue_end(&zone->pages.any_free_foreign, (queue_entry_t)thispage);
			     thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
				for (this = thispage->elements;
				     this != NULL;
				     this = this->next) {
					if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
						panic("zone_check_freelist");
				}
			}
		}
		for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
		     !queue_end(&zone->pages.all_free, (queue_entry_t)thispage);
		     thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
			for (this = thispage->elements;
			     this != NULL;
			     this = this->next) {
				if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
					panic("zone_check_freelist");
			}
		}
		for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
		     !queue_end(&zone->pages.intermediate, (queue_entry_t)thispage);
		     thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
			for (this = thispage->elements;
			     this != NULL;
			     this = this->next) {
				if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
					panic("zone_check_freelist");
			}
		}
	} else {
		for (this = zone->free_elements;
		     this != NULL;
		     this = this->next) {
			if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
				panic("zone_check_freelist");
		}
	}
}

static zone_t zone_last_bogus_zone = ZONE_NULL;
static vm_offset_t zone_last_bogus_elem = 0;

void
zfree(
	register zone_t	zone,
	void		*addr)
{
	vm_offset_t	elem = (vm_offset_t) addr;
	uintptr_t	zbt[MAX_ZTRACE_DEPTH];	/* only used if zone logging is enabled via boot-args */
	int		numsaved = 0;
	boolean_t	gzfreed = FALSE;
	boolean_t	poison = FALSE;

	assert(zone != ZONE_NULL);

#if 1
	if (zone->use_page_list) {
		struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr);
		if (zone != page_meta->zone) {
			/*
			 * Something bad has happened. Someone tried to zfree a pointer but the metadata says it is from
			 * a different zone (or maybe it's from a zone that doesn't use page free lists at all). We can repair
			 * some cases of this, if:
			 * 1) The specified zone had use_page_list, and the true zone also has use_page_list set. In that case
			 *    we can swap the zone_t
			 * 2) The specified zone had use_page_list, but the true zone does not. In this case page_meta is garbage,
			 *    and dereferencing page_meta->zone might panic.
			 * To distinguish the two, we enumerate the zone list to match it up.
			 * We do not handle the case where an incorrect zone is passed that does not have use_page_list set,
			 * even if the true zone did have this set.
			 */
			zone_t fixed_zone = NULL;
			int fixed_i, max_zones;

			simple_lock(&all_zones_lock);
			max_zones = num_zones;
			fixed_zone = first_zone;
			simple_unlock(&all_zones_lock);

			for (fixed_i = 0; fixed_i < max_zones; fixed_i++, fixed_zone = fixed_zone->next_zone) {
				if (fixed_zone == page_meta->zone && fixed_zone->use_page_list) {
					/* we can fix this */
					printf("Fixing incorrect zfree from zone %s to zone %s\n", zone->zone_name, fixed_zone->zone_name);
					zone = fixed_zone;
					break;
				}
			}
		}
	}
#endif

	/*
	 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
	 */
	if (__improbable(DO_LOGGING(zone) && corruption_debug_flag))
		numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH);

#if MACH_ASSERT
	/* Basic sanity checks */
	if (zone == ZONE_NULL || elem == (vm_offset_t)0)
		panic("zfree: NULL");
	/* zone_gc assumes zones are never freed */
	if (zone == zone_zone)
		panic("zfree: freeing to zone_zone breaks zone_gc!");
#endif

#if	CONFIG_GZALLOC
	gzfreed = gzalloc_free(zone, addr);
#endif

	TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);

	if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign &&
		!from_zone_map(elem, zone->elem_size))) {
#if MACH_ASSERT
		panic("zfree: non-allocated memory in collectable zone!");
#endif
		zone_last_bogus_zone = zone;
		zone_last_bogus_elem = elem;
		return;
	}

	if ((zp_factor != 0 || zp_tiny_zone_limit != 0) && !gzfreed) {
		/*
		 * Poison the memory before it ends up on the freelist to catch
		 * use-after-free and use of uninitialized memory
		 *
		 * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
		 * Also poison larger elements periodically
		 */
		vm_offset_t inner_size = zone->elem_size;

#if	ZONE_DEBUG
		if (!gzfreed && zone_debug_enabled(zone)) {
			inner_size -= ZONE_DEBUG_OFFSET;
		}
#endif
		uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);

		if (inner_size <= zp_tiny_zone_limit)
			poison = TRUE;
		else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE)
			poison = TRUE;

		if (__improbable(poison)) {

			/* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
			/* Poison everything but primary and backup */
			vm_offset_t *element_cursor  = ((vm_offset_t *) elem) + 1;
			vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *)elem);

			for ( ; element_cursor < backup; element_cursor++)
				*element_cursor = ZP_POISON;
		}
	}

	lock_zone(zone);

	/*
	 * See if we're doing logging on this zone.  There are two styles of logging used depending on
	 * whether we're trying to catch a leak or corruption.  See comments above in zalloc for details.
	 */
	if (__improbable(DO_LOGGING(zone))) {
		if (corruption_debug_flag) {
			/*
			 * We're logging to catch a corruption.  Add a record of this zfree operation
			 * to the log.
			 */
			btlog_add_entry(zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved);
		} else {
			/*
			 * We're logging to catch a leak.  Remove any record we might have for this
			 * element since it's being freed.  Note that we may not find it if the buffer
			 * overflowed and that's OK.  Since the log is of a limited size, old records
			 * get overwritten if there are more zallocs than zfrees.
			 */
			btlog_remove_entries_for_element(zlog_btlog, (void *)addr);
		}
	}
#if	ZONE_DEBUG
	if (!gzfreed && zone_debug_enabled(zone)) {
		queue_t tmp_elem;

		elem -= ZONE_DEBUG_OFFSET;
		if (zone_check) {
			/* check the zone's consistency */

			for (tmp_elem  = queue_first(&zone->active_zones);
			     !queue_end(tmp_elem, &zone->active_zones);
			     tmp_elem  = queue_next(tmp_elem))
				if (elem == (vm_offset_t)tmp_elem)
					break;
			if (elem != (vm_offset_t)tmp_elem)
				panic("zfree()ing element from wrong zone");
		}
		remqueue((queue_t) elem);
	}
#endif	/* ZONE_DEBUG */
	if (zone_check) {
		zone_check_freelist(zone, elem);
	}

	if (__probable(!gzfreed))
		free_to_zone(zone, elem, poison);

#if MACH_ASSERT
	if (zone->count < 0)
		panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
		      zone->zone_name, addr);
#endif


#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: un-track the allocation
	 */
	if (zone->zleak_on) {
		zleak_free(elem, zone->elem_size);
	}
#endif /* CONFIG_ZLEAKS */

	/*
	 * If elements have one or more pages, and memory is low,
	 * request to run the garbage collection in the zone the next
	 * time the pageout thread runs.
	 */
	if (zone->elem_size >= PAGE_SIZE &&
	    vm_pool_low()) {
		zone_gc_forced = TRUE;
	}
	unlock_zone(zone);

	{
		thread_t thr = current_thread();
		task_t task;
		zinfo_usage_t zinfo;
		vm_size_t sz = zone->elem_size;

		if (zone->caller_acct)
			ledger_debit(thr->t_ledger, task_ledgers.tkm_private, sz);
		else
			ledger_debit(thr->t_ledger, task_ledgers.tkm_shared, sz);

		if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
			OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].free);
	}
}


/* Change a zone's flags.
 * This routine must be called immediately after zinit.
 */
void
zone_change(
	zone_t		zone,
	unsigned int	item,
	boolean_t	value)
{
	assert( zone != ZONE_NULL );
	assert( value == TRUE || value == FALSE );

	switch(item){
	case Z_NOENCRYPT:
		zone->noencrypt = value;
		break;
	case Z_EXHAUST:
		zone->exhaustible = value;
		break;
	case Z_COLLECT:
		zone->collectable = value;
		break;
	case Z_EXPAND:
		zone->expandable = value;
		break;
	case Z_FOREIGN:
		zone->allows_foreign = value;
		break;
	case Z_CALLERACCT:
		zone->caller_acct = value;
		break;
	case Z_NOCALLOUT:
		zone->no_callout = value;
		break;
	case Z_GZALLOC_EXEMPT:
		zone->gzalloc_exempt = value;
#if	CONFIG_GZALLOC
		gzalloc_reconfigure(zone);
#endif
		break;
	case Z_ALIGNMENT_REQUIRED:
		zone->alignment_required = value;
#if	ZONE_DEBUG
		zone_debug_disable(zone);
#endif
#if	CONFIG_GZALLOC
		gzalloc_reconfigure(zone);
#endif
		break;
	default:
		panic("Zone_change: Wrong Item Type!");
		/* break; */
	}
}
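/*
 * Usage sketch for zone_change (hypothetical zone): flags are set one at a
 * time, immediately after zinit and before the zone is used, e.g.
 *
 *	widget_zone = zinit(sizeof(struct widget), max_mem, 0, "widgets");
 *	zone_change(widget_zone, Z_NOENCRYPT, TRUE);	// skip hibernation encryption
 *	zone_change(widget_zone, Z_EXPAND, FALSE);	// enforce max_size strictly
 *
 * zone_bootstrap above uses exactly this pattern on zone_zone.
 */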
The correct way to stuff memory 3109 * into a zone is by zcram. 3110 */ 3111 3112integer_t 3113zone_free_count(zone_t zone) 3114{ 3115 integer_t free_count; 3116 3117 lock_zone(zone); 3118 free_count = zone->countfree; 3119 unlock_zone(zone); 3120 3121 assert(free_count >= 0); 3122 3123 return(free_count); 3124} 3125 3126/* 3127 * Zone garbage collection subroutines 3128 */ 3129 3130boolean_t 3131zone_page_collectable( 3132 vm_offset_t addr, 3133 vm_size_t size) 3134{ 3135 struct zone_page_table_entry *zp; 3136 zone_page_index_t i, j; 3137 3138#if ZONE_ALIAS_ADDR 3139 addr = zone_virtual_addr(addr); 3140#endif 3141#if MACH_ASSERT 3142 if (!from_zone_map(addr, size)) 3143 panic("zone_page_collectable"); 3144#endif 3145 3146 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3147 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3148 3149 for (; i <= j; i++) { 3150 zp = zone_page_table_lookup(i); 3151 if (zp->collect_count == zp->alloc_count) 3152 return (TRUE); 3153 } 3154 3155 return (FALSE); 3156} 3157 3158void 3159zone_page_keep( 3160 vm_offset_t addr, 3161 vm_size_t size) 3162{ 3163 struct zone_page_table_entry *zp; 3164 zone_page_index_t i, j; 3165 3166#if ZONE_ALIAS_ADDR 3167 addr = zone_virtual_addr(addr); 3168#endif 3169#if MACH_ASSERT 3170 if (!from_zone_map(addr, size)) 3171 panic("zone_page_keep"); 3172#endif 3173 3174 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3175 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3176 3177 for (; i <= j; i++) { 3178 zp = zone_page_table_lookup(i); 3179 zp->collect_count = 0; 3180 } 3181} 3182 3183void 3184zone_page_collect( 3185 vm_offset_t addr, 3186 vm_size_t size) 3187{ 3188 struct zone_page_table_entry *zp; 3189 zone_page_index_t i, j; 3190 3191#if ZONE_ALIAS_ADDR 3192 addr = zone_virtual_addr(addr); 3193#endif 3194#if MACH_ASSERT 3195 if (!from_zone_map(addr, size)) 3196 panic("zone_page_collect"); 3197#endif 3198 3199 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3200 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3201 3202 for (; i <= j; i++) { 3203 zp = zone_page_table_lookup(i); 3204 ++zp->collect_count; 3205 } 3206} 3207 3208void 3209zone_page_init( 3210 vm_offset_t addr, 3211 vm_size_t size) 3212{ 3213 struct zone_page_table_entry *zp; 3214 zone_page_index_t i, j; 3215 3216#if ZONE_ALIAS_ADDR 3217 addr = zone_virtual_addr(addr); 3218#endif 3219#if MACH_ASSERT 3220 if (!from_zone_map(addr, size)) 3221 panic("zone_page_init"); 3222#endif 3223 3224 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3225 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3226 3227 for (; i <= j; i++) { 3228 /* make sure entry exists before marking unused */ 3229 zone_page_table_expand(i); 3230 3231 zp = zone_page_table_lookup(i); 3232 assert(zp); 3233 zp->alloc_count = ZONE_PAGE_UNUSED; 3234 zp->collect_count = 0; 3235 } 3236} 3237 3238void 3239zone_page_alloc( 3240 vm_offset_t addr, 3241 vm_size_t size) 3242{ 3243 struct zone_page_table_entry *zp; 3244 zone_page_index_t i, j; 3245 3246#if ZONE_ALIAS_ADDR 3247 addr = zone_virtual_addr(addr); 3248#endif 3249#if MACH_ASSERT 3250 if (!from_zone_map(addr, size)) 3251 panic("zone_page_alloc"); 3252#endif 3253 3254 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3255 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3256 3257 for (; i <= j; i++) { 3258 zp = zone_page_table_lookup(i); 3259 assert(zp); 3260 3261 /* 
void
zone_page_init(
	vm_offset_t	addr,
	vm_size_t	size)
{
	struct zone_page_table_entry	*zp;
	zone_page_index_t i, j;

#if	ZONE_ALIAS_ADDR
	addr = zone_virtual_addr(addr);
#endif
#if MACH_ASSERT
	if (!from_zone_map(addr, size))
		panic("zone_page_init");
#endif

	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);

	for (; i <= j; i++) {
		/* make sure entry exists before marking unused */
		zone_page_table_expand(i);

		zp = zone_page_table_lookup(i);
		assert(zp);
		zp->alloc_count = ZONE_PAGE_UNUSED;
		zp->collect_count = 0;
	}
}

void
zone_page_alloc(
	vm_offset_t	addr,
	vm_size_t	size)
{
	struct zone_page_table_entry	*zp;
	zone_page_index_t i, j;

#if	ZONE_ALIAS_ADDR
	addr = zone_virtual_addr(addr);
#endif
#if MACH_ASSERT
	if (!from_zone_map(addr, size))
		panic("zone_page_alloc");
#endif

	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);

	for (; i <= j; i++) {
		zp = zone_page_table_lookup(i);
		assert(zp);

		/*
		 * Set alloc_count to ZONE_PAGE_USED if
		 * it was previously set to ZONE_PAGE_UNUSED.
		 */
		if (zp->alloc_count == ZONE_PAGE_UNUSED)
			zp->alloc_count = ZONE_PAGE_USED;

		++zp->alloc_count;
	}
}

void
zone_page_free_element(
	zone_page_index_t	*free_page_head,
	zone_page_index_t	*free_page_tail,
	vm_offset_t	addr,
	vm_size_t	size)
{
	struct zone_page_table_entry	*zp;
	zone_page_index_t i, j;

#if	ZONE_ALIAS_ADDR
	addr = zone_virtual_addr(addr);
#endif
#if MACH_ASSERT
	if (!from_zone_map(addr, size))
		panic("zone_page_free_element");
#endif

	/* Clear out the old next and backup pointers */
	vm_offset_t *primary  = (vm_offset_t *) addr;
	vm_offset_t *backup   = get_backup_ptr(size, primary);

	*primary = ZP_POISON;
	*backup  = ZP_POISON;

	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);

	for (; i <= j; i++) {
		zp = zone_page_table_lookup(i);

		if (zp->collect_count > 0)
			--zp->collect_count;
		if (--zp->alloc_count == 0) {
			vm_address_t	free_page_address;
			vm_address_t	prev_free_page_address;

			zp->alloc_count  = ZONE_PAGE_UNUSED;
			zp->collect_count = 0;


			/*
			 * This element was the last one on this page, re-use the page's
			 * storage for a page freelist
			 */
			free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)i);
			*(zone_page_index_t *)free_page_address = ZONE_PAGE_INDEX_INVALID;

			if (*free_page_head == ZONE_PAGE_INDEX_INVALID) {
				*free_page_head = i;
				*free_page_tail = i;
			} else {
				prev_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)(*free_page_tail));
				*(zone_page_index_t *)prev_free_page_address = i;
				*free_page_tail = i;
			}
		}
	}
}




struct {
	uint64_t	zgc_invoked;
	uint64_t	zgc_bailed;
	uint32_t	pgs_freed;

	uint32_t	elems_collected,
			elems_freed,
			elems_kept;
} zgc_stats;

/* Zone garbage collection
 *
 * zone_gc will walk through all the free elements in all the
 * zones that are marked collectable looking for reclaimable
 * pages.  zone_gc is called by consider_zone_gc when the system
 * begins to run out of memory.
 */
void
zone_gc(boolean_t all_zones)
{
	unsigned int	max_zones;
	zone_t		z;
	unsigned int	i;
	uint32_t	old_pgs_freed;
	zone_page_index_t zone_free_page_head;
	zone_page_index_t zone_free_page_tail;
	thread_t	mythread = current_thread();

	lck_mtx_lock(&zone_gc_lock);

	zgc_stats.zgc_invoked++;
	old_pgs_freed = zgc_stats.pgs_freed;

	simple_lock(&all_zones_lock);
	max_zones = num_zones;
	z = first_zone;
	simple_unlock(&all_zones_lock);

	if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
		kprintf("zone_gc(all_zones=%s) starting...\n", all_zones ? "TRUE" : "FALSE");
"TRUE" : "FALSE"); 3375 3376 /* 3377 * it's ok to allow eager kernel preemption while 3378 * while holding a zone lock since it's taken 3379 * as a spin lock (which prevents preemption) 3380 */ 3381 thread_set_eager_preempt(mythread); 3382 3383#if MACH_ASSERT 3384 for (i = 0; i < zone_pages; i++) { 3385 struct zone_page_table_entry *zp; 3386 3387 zp = zone_page_table_lookup(i); 3388 assert(!zp || (zp->collect_count == 0)); 3389 } 3390#endif /* MACH_ASSERT */ 3391 3392 for (i = 0; i < max_zones; i++, z = z->next_zone) { 3393 unsigned int n, m; 3394 vm_size_t elt_size, size_freed; 3395 struct zone_free_element *elt, *base_elt, *base_prev, *prev, *scan, *keep, *tail; 3396 int kmem_frees = 0, total_freed_pages = 0; 3397 struct zone_page_metadata *page_meta; 3398 queue_head_t page_meta_head; 3399 3400 assert(z != ZONE_NULL); 3401 3402 if (!z->collectable) 3403 continue; 3404 3405 if (all_zones == FALSE && z->elem_size < PAGE_SIZE && !z->use_page_list) 3406 continue; 3407 3408 lock_zone(z); 3409 3410 elt_size = z->elem_size; 3411 3412 /* 3413 * Do a quick feasibility check before we scan the zone: 3414 * skip unless there is likelihood of getting pages back 3415 * (i.e we need a whole allocation block's worth of free 3416 * elements before we can garbage collect) and 3417 * the zone has more than 10 percent of it's elements free 3418 * or the element size is a multiple of the PAGE_SIZE 3419 */ 3420 if ((elt_size & PAGE_MASK) && 3421 !z->use_page_list && 3422 (((z->cur_size - z->count * elt_size) <= (2 * z->alloc_size)) || 3423 ((z->cur_size - z->count * elt_size) <= (z->cur_size / 10)))) { 3424 unlock_zone(z); 3425 continue; 3426 } 3427 3428 z->doing_gc = TRUE; 3429 3430 /* 3431 * Snatch all of the free elements away from the zone. 3432 */ 3433 3434 if (z->use_page_list) { 3435 queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages); 3436 queue_init(&z->pages.all_free); 3437 } else { 3438 scan = (void *)z->free_elements; 3439 z->free_elements = 0; 3440 } 3441 3442 unlock_zone(z); 3443 3444 if (z->use_page_list) { 3445 /* 3446 * For zones that maintain page lists (which in turn 3447 * track free elements on those pages), zone_gc() 3448 * is incredibly easy, and we bypass all the logic 3449 * for scanning elements and mapping them to 3450 * collectable pages 3451 */ 3452 3453 size_freed = 0; 3454 3455 queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) { 3456 assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */ 3457 3458 zgc_stats.elems_freed += page_meta->free_count; 3459 size_freed += elt_size * page_meta->free_count; 3460 zgc_stats.elems_collected += page_meta->free_count; 3461 } 3462 3463 lock_zone(z); 3464 3465 if (size_freed > 0) { 3466 z->cur_size -= size_freed; 3467 z->countfree -= size_freed/elt_size; 3468 } 3469 3470 z->doing_gc = FALSE; 3471 if (z->waiting) { 3472 z->waiting = FALSE; 3473 zone_wakeup(z); 3474 } 3475 3476 unlock_zone(z); 3477 3478 if (queue_empty(&page_meta_head)) 3479 continue; 3480 3481 thread_clear_eager_preempt(mythread); 3482 3483 while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) { 3484 vm_address_t free_page_address; 3485 3486 free_page_address = trunc_page((vm_address_t)page_meta); 3487#if ZONE_ALIAS_ADDR 3488 free_page_address = zone_virtual_addr(free_page_address); 3489#endif 3490 kmem_free(zone_map, free_page_address, PAGE_SIZE); 3491 ZONE_PAGE_COUNT_DECR(z, 1); 3492 total_freed_pages++; 3493 
				zgc_stats.pgs_freed += 1;

				if (++kmem_frees == 32) {
					thread_yield_internal(1);
					kmem_frees = 0;
				}
			}

			if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
				kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages);

			thread_set_eager_preempt(mythread);
			continue; /* go to next zone */
		}

		/*
		 * Pass 1:
		 *
		 * Determine which elements we can attempt to collect
		 * and count them up in the page table.  Foreign elements
		 * are returned to the zone.
		 */
		prev = (void *)&scan;
		elt = scan;
		n = 0; tail = keep = NULL;

		zone_free_page_head = ZONE_PAGE_INDEX_INVALID;
		zone_free_page_tail = ZONE_PAGE_INDEX_INVALID;


		while (elt != NULL) {
			if (from_zone_map(elt, elt_size)) {
				zone_page_collect((vm_offset_t)elt, elt_size);

				prev = elt;
				elt = elt->next;

				++zgc_stats.elems_collected;
			}
			else {
				if (keep == NULL)
					keep = tail = elt;
				else {
					append_zone_element(z, tail, elt);
					tail = elt;
				}

				append_zone_element(z, prev, elt->next);
				elt = elt->next;
				append_zone_element(z, tail, NULL);
			}

			/*
			 * Dribble back the elements we are keeping.
			 * If there are none, give some elements that we haven't looked at yet
			 * back to the freelist so that others waiting on the zone don't get stuck
			 * for too long.  This might prevent us from recovering some memory,
			 * but allows us to avoid having to allocate new memory to serve requests
			 * while zone_gc has all the free memory tied up.
			 * <rdar://problem/3893406>
			 */
			if (++n >= 50) {
				if (z->waiting == TRUE) {
					/* z->waiting checked without lock held, rechecked below after locking */
					lock_zone(z);

					if (keep != NULL) {
						add_list_to_zone(z, keep, tail);
						tail = keep = NULL;
					} else {
						m = 0;
						base_elt = elt;
						base_prev = prev;
						while ((elt != NULL) && (++m < 50)) {
							prev = elt;
							elt = elt->next;
						}
						if (m != 0) {
							/* Extract the elements from the list and
							 * give them back */
							append_zone_element(z, prev, NULL);
							add_list_to_zone(z, base_elt, prev);
							append_zone_element(z, base_prev, elt);
							prev = base_prev;
						}
					}

					if (z->waiting) {
						z->waiting = FALSE;
						zone_wakeup(z);
					}

					unlock_zone(z);
				}
				n = 0;
			}
		}

		/*
		 * Return any remaining elements.
		 */
		if (keep != NULL) {
			lock_zone(z);

			add_list_to_zone(z, keep, tail);

			if (z->waiting) {
				z->waiting = FALSE;
				zone_wakeup(z);
			}

			unlock_zone(z);
		}

		/*
		 * Pass 2:
		 *
		 * Determine which pages we can reclaim and
		 * free those elements.
		 */
		size_freed = 0;
		elt = scan;
		n = 0; tail = keep = NULL;

		while (elt != NULL) {
			if (zone_page_collectable((vm_offset_t)elt, elt_size)) {
				struct zone_free_element *next_elt = elt->next;

				size_freed += elt_size;

				/*
				 * If this is the last allocation on the page(s),
				 * we may use their storage to maintain the linked
				 * list of free-able pages.  So store elt->next because
				 * "elt" may be scribbled over.
				 */
		/*
		 * Pass 2:
		 *
		 * Determine which pages we can reclaim and
		 * free those elements.
		 */

		size_freed = 0;
		elt = scan;
		n = 0; tail = keep = NULL;

		while (elt != NULL) {
			if (zone_page_collectable((vm_offset_t)elt, elt_size)) {
				struct zone_free_element *next_elt = elt->next;

				size_freed += elt_size;

				/*
				 * If this is the last allocation on the page(s),
				 * we may use their storage to maintain the linked
				 * list of free-able pages.  So store elt->next because
				 * "elt" may be scribbled over.
				 */
				zone_page_free_element(&zone_free_page_head, &zone_free_page_tail, (vm_offset_t)elt, elt_size);

				elt = next_elt;

				++zgc_stats.elems_freed;
			}
			else {
				zone_page_keep((vm_offset_t)elt, elt_size);

				if (keep == NULL)
					keep = tail = elt;
				else {
					append_zone_element(z, tail, elt);
					tail = elt;
				}

				elt = elt->next;
				append_zone_element(z, tail, NULL);

				++zgc_stats.elems_kept;
			}

			/*
			 * Dribble back the elements we are keeping,
			 * and update the zone size info.
			 */

			if (++n >= 50) {
				lock_zone(z);

				z->cur_size -= size_freed;
				z->countfree -= size_freed/elt_size;
				size_freed = 0;

				if (keep != NULL) {
					add_list_to_zone(z, keep, tail);
				}

				if (z->waiting) {
					z->waiting = FALSE;
					zone_wakeup(z);
				}

				unlock_zone(z);

				n = 0; tail = keep = NULL;
			}
		}

		/*
		 * Return any remaining elements, and update
		 * the zone size info.
		 */

		lock_zone(z);

		if (size_freed > 0 || keep != NULL) {

			z->cur_size -= size_freed;
			z->countfree -= size_freed/elt_size;

			if (keep != NULL) {
				add_list_to_zone(z, keep, tail);
			}

		}

		z->doing_gc = FALSE;
		if (z->waiting) {
			z->waiting = FALSE;
			zone_wakeup(z);
		}
		unlock_zone(z);

		if (zone_free_page_head == ZONE_PAGE_INDEX_INVALID)
			continue;

		/*
		 * We don't want to allow eager kernel preemption while holding
		 * the various locks taken in the kmem_free path of execution.
		 */
		thread_clear_eager_preempt(mythread);

		/*
		 * This loop counts the number of pages that should be freed
		 * by the reclaim loop below, which coalesces runs of adjacent
		 * free pages into single kmem_free() calls where it can.
		 */
		uint32_t pages_to_free_count = 0;
		vm_address_t fpa;
		zone_page_index_t index;
		for (index = zone_free_page_head; index != ZONE_PAGE_INDEX_INVALID;) {
			pages_to_free_count++;
			fpa = zone_map_min_address + PAGE_SIZE * ((vm_size_t)index);
			index = *(zone_page_index_t *)fpa;
		}
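		/*
		 * The pages queued for reclaim are threaded through their own
		 * storage: a collectable page contains nothing but free
		 * elements, so its first word can safely hold the
		 * zone_page_index_t of the next free page (an index relative
		 * to zone_map_min_address), terminated by
		 * ZONE_PAGE_INDEX_INVALID.  Both the counting loop above and
		 * the reclaim loop below walk that chain.
		 */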
		/*
		 * Reclaim the pages we are freeing.
		 */
		while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) {
			zone_page_index_t	zind = zone_free_page_head;
			vm_address_t		free_page_address;
			int			page_count;

			/*
			 * Use the first word of the page about to be freed to find the next free page
			 */
			free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)zind);
			zone_free_page_head = *(zone_page_index_t *)free_page_address;

			page_count = 1;
			total_freed_pages++;

			/*
			 * Coalesce: a chain entry one page below the current
			 * base extends the run downward; an entry exactly at
			 * the top of the run extends it upward; anything else
			 * ends the run.
			 */
			while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) {
				zone_page_index_t	next_zind = zone_free_page_head;
				vm_address_t		next_free_page_address;

				next_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)next_zind);

				if (next_free_page_address == (free_page_address - PAGE_SIZE)) {
					free_page_address = next_free_page_address;
				} else if (next_free_page_address != (free_page_address + (PAGE_SIZE * page_count)))
					break;

				zone_free_page_head = *(zone_page_index_t *)next_free_page_address;
				page_count++;
				total_freed_pages++;
			}
			kmem_free(zone_map, free_page_address, page_count * PAGE_SIZE);
			ZONE_PAGE_COUNT_DECR(z, page_count);
			zgc_stats.pgs_freed += page_count;
			pages_to_free_count -= page_count;

			if (++kmem_frees == 32) {
				thread_yield_internal(1);
				kmem_frees = 0;
			}
		}

		/* Check that we actually freed the exact number of pages we were supposed to */
		assert(pages_to_free_count == 0);

		if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
			kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages);

		thread_set_eager_preempt(mythread);
	}

	if (old_pgs_freed == zgc_stats.pgs_freed)
		zgc_stats.zgc_bailed++;

	thread_clear_eager_preempt(mythread);

	lck_mtx_unlock(&zone_gc_lock);
}

extern vm_offset_t kmapoff_kaddr;
extern unsigned int kmapoff_pgcnt;

/*
 *	consider_zone_gc:
 *
 *	Called by the pageout daemon when the system needs more free pages.
 */

void
consider_zone_gc(boolean_t force)
{
	boolean_t all_zones = FALSE;

	if (kmapoff_kaddr != 0) {
		/*
		 * One-time reclaim of kernel_map resources we allocated in
		 * early boot.
		 */
		(void) vm_deallocate(kernel_map,
		    kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
		kmapoff_kaddr = 0;
	}

	if (zone_gc_allowed &&
	    (zone_gc_allowed_by_time_throttle ||
	     zone_gc_forced ||
	     force)) {
		if (zone_gc_allowed_by_time_throttle == TRUE) {
			zone_gc_allowed_by_time_throttle = FALSE;
			all_zones = TRUE;
		}
		zone_gc_forced = FALSE;

		zone_gc(all_zones);
	}
}
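/*
 * Illustrative only: mach_zone_force_gc() below passes force == TRUE, which
 * guarantees a collection runs now, bypassing the time throttle:
 *
 *	consider_zone_gc(TRUE);
 *
 * Note that only expiry of the time throttle widens the sweep to all
 * collectable zones; a forced collection still runs the cheaper
 * all_zones == FALSE pass, which skips small zones that don't use
 * page lists.
 */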
/*
 * By default, don't attempt zone GC more frequently
 * than once per minute.
 */
void
compute_zone_gc_throttle(void *arg __unused)
{
	zone_gc_allowed_by_time_throttle = TRUE;
}


#if CONFIG_TASK_ZONE_INFO

kern_return_t
task_zone_info(
	task_t			task,
	mach_zone_name_array_t	*namesp,
	mach_msg_type_number_t  *namesCntp,
	task_zone_info_array_t	*infop,
	mach_msg_type_number_t  *infoCntp)
{
	mach_zone_name_t	*names;
	vm_offset_t		names_addr;
	vm_size_t		names_size;
	task_zone_info_t	*info;
	vm_offset_t		info_addr;
	vm_size_t		info_size;
	unsigned int		max_zones, i;
	zone_t			z;
	mach_zone_name_t	*zn;
	task_zone_info_t	*zi;
	kern_return_t		kr;

	vm_size_t		used;
	vm_map_copy_t		copy;


	if (task == TASK_NULL)
		return KERN_INVALID_TASK;

	/*
	 *	We assume that zones aren't freed once allocated.
	 *	We won't pick up any zones that are allocated later.
	 */

	simple_lock(&all_zones_lock);
	max_zones = (unsigned int)(num_zones + num_fake_zones);
	z = first_zone;
	simple_unlock(&all_zones_lock);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &names_addr, names_size);
	if (kr != KERN_SUCCESS)
		return kr;
	names = (mach_zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &info_addr, info_size);
	if (kr != KERN_SUCCESS) {
		kmem_free(ipc_kernel_map,
			  names_addr, names_size);
		return kr;
	}

	info = (task_zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	for (i = 0; i < max_zones - num_fake_zones; i++) {
		struct zone zcopy;

		assert(z != ZONE_NULL);

		lock_zone(z);
		zcopy = *z;
		unlock_zone(z);

		simple_lock(&all_zones_lock);
		z = z->next_zone;
		simple_unlock(&all_zones_lock);

		/* assuming here the name data is static */
		(void) strncpy(zn->mzn_name, zcopy.zone_name,
			       sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';

		zi->tzi_count = (uint64_t)zcopy.count;
		zi->tzi_cur_size = (uint64_t)zcopy.cur_size;
		zi->tzi_max_size = (uint64_t)zcopy.max_size;
		zi->tzi_elem_size = (uint64_t)zcopy.elem_size;
		zi->tzi_alloc_size = (uint64_t)zcopy.alloc_size;
		zi->tzi_sum_size = zcopy.sum_count * zcopy.elem_size;
		zi->tzi_exhaustible = (uint64_t)zcopy.exhaustible;
		zi->tzi_collectable = (uint64_t)zcopy.collectable;
		zi->tzi_caller_acct = (uint64_t)zcopy.caller_acct;
		if (task->tkm_zinfo != NULL) {
			zi->tzi_task_alloc = task->tkm_zinfo[zcopy.index].alloc;
			zi->tzi_task_free = task->tkm_zinfo[zcopy.index].free;
		} else {
			zi->tzi_task_alloc = 0;
			zi->tzi_task_free = 0;
		}
		zn++;
		zi++;
	}
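	/*
	 * Each zone is snapshotted under its own lock (and the list is
	 * advanced under all_zones_lock), so the numbers reported for any
	 * one zone are internally consistent, but the table as a whole is
	 * not a single point-in-time picture of the system.
	 */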
	/*
	 * Loop through the fake zones and fill them using the specialized
	 * functions.
	 */
	for (i = 0; i < num_fake_zones; i++) {
		int count, collectable, exhaustible, caller_acct, index;
		vm_size_t cur_size, max_size, elem_size, alloc_size;
		uint64_t sum_size;

		strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
		fake_zones[i].query(&count, &cur_size,
				    &max_size, &elem_size,
				    &alloc_size, &sum_size,
				    &collectable, &exhaustible, &caller_acct);
		zi->tzi_count = (uint64_t)count;
		zi->tzi_cur_size = (uint64_t)cur_size;
		zi->tzi_max_size = (uint64_t)max_size;
		zi->tzi_elem_size = (uint64_t)elem_size;
		zi->tzi_alloc_size = (uint64_t)alloc_size;
		zi->tzi_sum_size = sum_size;
		zi->tzi_collectable = (uint64_t)collectable;
		zi->tzi_exhaustible = (uint64_t)exhaustible;
		zi->tzi_caller_acct = (uint64_t)caller_acct;
		if (task->tkm_zinfo != NULL) {
			index = ZINFO_SLOTS - num_fake_zones + i;
			zi->tzi_task_alloc = task->tkm_zinfo[index].alloc;
			zi->tzi_task_free = task->tkm_zinfo[index].free;
		} else {
			zi->tzi_task_alloc = 0;
			zi->tzi_task_free = 0;
		}
		zn++;
		zi++;
	}

	used = max_zones * sizeof *names;
	if (used != names_size)
		bzero((char *) (names_addr + used), names_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
			   (vm_map_size_t)names_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*namesp = (mach_zone_name_t *) copy;
	*namesCntp = max_zones;

	used = max_zones * sizeof *info;

	if (used != info_size)
		bzero((char *) (info_addr + used), info_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
			   (vm_map_size_t)info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*infop = (task_zone_info_t *) copy;
	*infoCntp = max_zones;

	return KERN_SUCCESS;
}

#else	/* CONFIG_TASK_ZONE_INFO */

kern_return_t
task_zone_info(
	__unused task_t			task,
	__unused mach_zone_name_array_t	*namesp,
	__unused mach_msg_type_number_t *namesCntp,
	__unused task_zone_info_array_t	*infop,
	__unused mach_msg_type_number_t *infoCntp)
{
	return KERN_FAILURE;
}

#endif	/* CONFIG_TASK_ZONE_INFO */

kern_return_t
mach_zone_info(
	host_priv_t		host,
	mach_zone_name_array_t	*namesp,
	mach_msg_type_number_t  *namesCntp,
	mach_zone_info_array_t	*infop,
	mach_msg_type_number_t  *infoCntp)
{
	mach_zone_name_t	*names;
	vm_offset_t		names_addr;
	vm_size_t		names_size;
	mach_zone_info_t	*info;
	vm_offset_t		info_addr;
	vm_size_t		info_size;
	unsigned int		max_zones, i;
	zone_t			z;
	mach_zone_name_t	*zn;
	mach_zone_info_t	*zi;
	kern_return_t		kr;

	vm_size_t		used;
	vm_map_copy_t		copy;


	if (host == HOST_NULL)
		return KERN_INVALID_HOST;
#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!PE_i_can_has_debugger(NULL))
		return KERN_INVALID_HOST;
#endif
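	/*
	 * The reply buffers below follow the usual out-of-line pattern:
	 * allocate pageable memory in ipc_kernel_map, fill it in, zero the
	 * round_page() slack so no stale kernel data leaks out, then hand
	 * the range to vm_map_copyin() with src_destroy == TRUE so that the
	 * copy object consumes the allocation and no explicit kmem_free()
	 * is needed on the success path.
	 */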
	/*
	 *	We assume that zones aren't freed once allocated.
	 *	We won't pick up any zones that are allocated later.
	 */

	simple_lock(&all_zones_lock);
	max_zones = (unsigned int)(num_zones + num_fake_zones);
	z = first_zone;
	simple_unlock(&all_zones_lock);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &names_addr, names_size);
	if (kr != KERN_SUCCESS)
		return kr;
	names = (mach_zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &info_addr, info_size);
	if (kr != KERN_SUCCESS) {
		kmem_free(ipc_kernel_map,
			  names_addr, names_size);
		return kr;
	}

	info = (mach_zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	for (i = 0; i < max_zones - num_fake_zones; i++) {
		struct zone zcopy;

		assert(z != ZONE_NULL);

		lock_zone(z);
		zcopy = *z;
		unlock_zone(z);

		simple_lock(&all_zones_lock);
		z = z->next_zone;
		simple_unlock(&all_zones_lock);

		/* assuming here the name data is static */
		(void) strncpy(zn->mzn_name, zcopy.zone_name,
			       sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';

		zi->mzi_count = (uint64_t)zcopy.count;
		zi->mzi_cur_size = (uint64_t)zcopy.cur_size;
		zi->mzi_max_size = (uint64_t)zcopy.max_size;
		zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
		zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
		zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
		zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
		zi->mzi_collectable = (uint64_t)zcopy.collectable;
		zn++;
		zi++;
	}

	/*
	 * Loop through the fake zones and fill them using the specialized
	 * functions.
	 */
	for (i = 0; i < num_fake_zones; i++) {
		int count, collectable, exhaustible, caller_acct;
		vm_size_t cur_size, max_size, elem_size, alloc_size;
		uint64_t sum_size;

		strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
		fake_zones[i].query(&count, &cur_size,
				    &max_size, &elem_size,
				    &alloc_size, &sum_size,
				    &collectable, &exhaustible, &caller_acct);
		zi->mzi_count = (uint64_t)count;
		zi->mzi_cur_size = (uint64_t)cur_size;
		zi->mzi_max_size = (uint64_t)max_size;
		zi->mzi_elem_size = (uint64_t)elem_size;
		zi->mzi_alloc_size = (uint64_t)alloc_size;
		zi->mzi_sum_size = sum_size;
		zi->mzi_collectable = (uint64_t)collectable;
		zi->mzi_exhaustible = (uint64_t)exhaustible;

		zn++;
		zi++;
	}

	used = max_zones * sizeof *names;
	if (used != names_size)
		bzero((char *) (names_addr + used), names_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
			   (vm_map_size_t)names_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*namesp = (mach_zone_name_t *) copy;
	*namesCntp = max_zones;

	used = max_zones * sizeof *info;

	if (used != info_size)
		bzero((char *) (info_addr + used), info_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
			   (vm_map_size_t)info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*infop = (mach_zone_info_t *) copy;
	*infoCntp = max_zones;

	return KERN_SUCCESS;
}
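/*
 * Illustrative only -- a user-space consumer of the interface above,
 * roughly what the zprint(1) tool does (mach_host_self() and the MIG
 * stub come from the user-level mach headers, not from this file):
 *
 *	mach_zone_name_array_t names;
 *	mach_zone_info_array_t info;
 *	mach_msg_type_number_t nameCnt, infoCnt;
 *
 *	if (mach_zone_info(mach_host_self(), &names, &nameCnt,
 *			   &info, &infoCnt) == KERN_SUCCESS) {
 *		for (unsigned i = 0; i < nameCnt; i++)
 *			printf("%s: %llu elements\n", names[i].mzn_name,
 *			    (unsigned long long)info[i].mzi_count);
 *	}
 *
 * The two arrays are returned out-of-line and parallel: names[i]
 * describes info[i].
 */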
/*
 *	host_zone_info - LEGACY user interface for Mach zone information
 *			 Should use mach_zone_info() instead!
 */
kern_return_t
host_zone_info(
	host_priv_t		host,
	zone_name_array_t	*namesp,
	mach_msg_type_number_t  *namesCntp,
	zone_info_array_t	*infop,
	mach_msg_type_number_t  *infoCntp)
{
	zone_name_t	*names;
	vm_offset_t	names_addr;
	vm_size_t	names_size;
	zone_info_t	*info;
	vm_offset_t	info_addr;
	vm_size_t	info_size;
	unsigned int	max_zones, i;
	zone_t		z;
	zone_name_t	*zn;
	zone_info_t	*zi;
	kern_return_t	kr;

	vm_size_t	used;
	vm_map_copy_t	copy;


	if (host == HOST_NULL)
		return KERN_INVALID_HOST;
#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!PE_i_can_has_debugger(NULL))
		return KERN_INVALID_HOST;
#endif

	/*
	 * The legacy zone_info_t is laid out with natural-width fields, so
	 * its size differs between 32-bit and 64-bit user processes; refuse
	 * callers whose word size doesn't match the kernel's rather than
	 * hand back a reply they would misparse.
	 */
#if defined(__LP64__)
	if (!thread_is_64bit(current_thread()))
		return KERN_NOT_SUPPORTED;
#else
	if (thread_is_64bit(current_thread()))
		return KERN_NOT_SUPPORTED;
#endif

	/*
	 *	We assume that zones aren't freed once allocated.
	 *	We won't pick up any zones that are allocated later.
	 */

	simple_lock(&all_zones_lock);
	max_zones = (unsigned int)(num_zones + num_fake_zones);
	z = first_zone;
	simple_unlock(&all_zones_lock);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &names_addr, names_size);
	if (kr != KERN_SUCCESS)
		return kr;
	names = (zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &info_addr, info_size);
	if (kr != KERN_SUCCESS) {
		kmem_free(ipc_kernel_map,
			  names_addr, names_size);
		return kr;
	}

	info = (zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	for (i = 0; i < max_zones - num_fake_zones; i++) {
		struct zone zcopy;

		assert(z != ZONE_NULL);

		lock_zone(z);
		zcopy = *z;
		unlock_zone(z);

		simple_lock(&all_zones_lock);
		z = z->next_zone;
		simple_unlock(&all_zones_lock);

		/* assuming here the name data is static */
		(void) strncpy(zn->zn_name, zcopy.zone_name,
			       sizeof zn->zn_name);
		zn->zn_name[sizeof zn->zn_name - 1] = '\0';

		zi->zi_count = zcopy.count;
		zi->zi_cur_size = zcopy.cur_size;
		zi->zi_max_size = zcopy.max_size;
		zi->zi_elem_size = zcopy.elem_size;
		zi->zi_alloc_size = zcopy.alloc_size;
		zi->zi_exhaustible = zcopy.exhaustible;
		zi->zi_collectable = zcopy.collectable;

		zn++;
		zi++;
	}

	/*
	 * Loop through the fake zones and fill them using the specialized
	 * functions.
	 */
	for (i = 0; i < num_fake_zones; i++) {
		int caller_acct;
		uint64_t sum_space;
		strncpy(zn->zn_name, fake_zones[i].name, sizeof zn->zn_name);
		zn->zn_name[sizeof zn->zn_name - 1] = '\0';
		fake_zones[i].query(&zi->zi_count, &zi->zi_cur_size,
				    &zi->zi_max_size, &zi->zi_elem_size,
				    &zi->zi_alloc_size, &sum_space,
				    &zi->zi_collectable, &zi->zi_exhaustible, &caller_acct);
		zn++;
		zi++;
	}
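	/*
	 * Unlike mach_zone_info() above, the query callback can fill the
	 * legacy zone_info_t fields directly: they are declared with the
	 * natural-width types the callback takes, so no 64-bit widening
	 * temporaries are needed.  sum_space and caller_acct are simply
	 * discarded -- zone_info_t has no fields to report them.
	 */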
	used = max_zones * sizeof *names;
	if (used != names_size)
		bzero((char *) (names_addr + used), names_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
			   (vm_map_size_t)names_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*namesp = (zone_name_t *) copy;
	*namesCntp = max_zones;

	used = max_zones * sizeof *info;
	if (used != info_size)
		bzero((char *) (info_addr + used), info_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
			   (vm_map_size_t)info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*infop = (zone_info_t *) copy;
	*infoCntp = max_zones;

	return KERN_SUCCESS;
}

kern_return_t
mach_zone_force_gc(
	host_t host)
{

	if (host == HOST_NULL)
		return KERN_INVALID_HOST;

	consider_zone_gc(TRUE);

	return (KERN_SUCCESS);
}

extern unsigned int stack_total;
extern unsigned long long stack_allocs;

#if defined(__i386__) || defined (__x86_64__)
extern unsigned int inuse_ptepages_count;
extern long long alloc_ptepages_count;
#endif

void zone_display_zprint(void)
{
	unsigned int	i;
	zone_t		the_zone;

	if (first_zone != NULL) {
		the_zone = first_zone;
		for (i = 0; i < num_zones; i++) {
			if (the_zone->cur_size > (1024 * 1024)) {
				printf("%.20s:\t%lu\n", the_zone->zone_name, (uintptr_t)the_zone->cur_size);
			}

			if (the_zone->next_zone == NULL) {
				break;
			}

			the_zone = the_zone->next_zone;
		}
	}

	printf("Kernel Stacks:\t%lu\n", (uintptr_t)(kernel_stack_size * stack_total));

#if defined(__i386__) || defined (__x86_64__)
	printf("PageTables:\t%lu\n", (uintptr_t)(PAGE_SIZE * inuse_ptepages_count));
#endif

	printf("Kalloc.Large:\t%lu\n", (uintptr_t)kalloc_large_total);
}

zone_t
zone_find_largest(void)
{
	unsigned int	i;
	unsigned int	max_zones;
	zone_t		the_zone;
	zone_t		zone_largest;

	simple_lock(&all_zones_lock);
	the_zone = first_zone;
	max_zones = num_zones;
	simple_unlock(&all_zones_lock);

	zone_largest = the_zone;
	for (i = 0; i < max_zones; i++) {
		if (the_zone->cur_size > zone_largest->cur_size) {
			zone_largest = the_zone;
		}

		if (the_zone->next_zone == NULL) {
			break;
		}

		the_zone = the_zone->next_zone;
	}
	return zone_largest;
}

#if	ZONE_DEBUG

/* should we care about locks here ? */

#define zone_in_use(z)	( z->count || z->free_elements \
			  || !queue_empty(&z->pages.all_free) \
			  || !queue_empty(&z->pages.intermediate) \
			  || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign)))

/*
 * Enabling zone debugging grows each element by ZONE_DEBUG_OFFSET to make
 * room for per-element linkage onto z->active_zones.  It can only be done
 * (and undone) while the zone is completely unused, since any live
 * elements were handed out at the old element size.
 */
void
zone_debug_enable(
	zone_t		z)
{
	if (zone_debug_enabled(z) || zone_in_use(z) ||
	    z->alloc_size < (z->elem_size + ZONE_DEBUG_OFFSET))
		return;
	queue_init(&z->active_zones);
	z->elem_size += ZONE_DEBUG_OFFSET;
}

void
zone_debug_disable(
	zone_t		z)
{
	if (!zone_debug_enabled(z) || zone_in_use(z))
		return;
	z->elem_size -= ZONE_DEBUG_OFFSET;
	z->active_zones.next = z->active_zones.prev = NULL;
}


#endif	/* ZONE_DEBUG */