/*
 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	kern/zalloc.c
 *	Author:	Avadis Tevanian, Jr.
 *
 *	Zone-based memory allocator.  A zone is a collection of fixed size
 *	data blocks for which quick allocation/deallocation is possible.
 */
#include <zone_debug.h>
#include <zone_alias_addr.h>

#include <mach/mach_types.h>
#include <mach/vm_param.h>
#include <mach/kern_return.h>
#include <mach/mach_host_server.h>
#include <mach/task_server.h>
#include <mach/machine/vm_types.h>
#include <mach_debug/zone_info.h>
#include <mach/vm_map.h>

#include <kern/kern_types.h>
#include <kern/assert.h>
#include <kern/host.h>
#include <kern/macro_help.h>
#include <kern/sched.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>
#include <kern/thread_call.h>
#include <kern/zalloc.h>
#include <kern/kalloc.h>
#include <kern/btlog.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>

#include <pexpert/pexpert.h>

#include <machine/machparam.h>
#include <machine/machine_routines.h>	/* ml_cpu_get_info */

#include <libkern/OSDebug.h>
#include <libkern/OSAtomic.h>
#include <sys/kdebug.h>

/*
 * ZONE_ALIAS_ADDR
 *
 * With this option enabled, zones with alloc_size <= PAGE_SIZE allocate
 * a virtual page from the zone_map, but before zcram-ing the allocated memory
 * into the zone, the page is translated to use the alias address of the page
 * in the static kernel region.  zone_gc reverses that translation when
 * scanning the freelist to collect free pages so that it can look up the page
 * in the zone_page_table, and free it to kmem_free.
 *
 * The static kernel region is a flat 1:1 mapping of physical memory passed
 * to xnu by the booter.  It is mapped to the range:
 * [gVirtBase, gVirtBase + gPhysSize]
 *
 * Accessing memory via the static kernel region is faster due to the
 * entire region being mapped via large pages, cutting down
 * on TLB misses.
 *
 * zinit favors using PAGE_SIZE backing allocations for a zone unless it would
 * waste more than 10% space to use a single page, in order to take advantage
 * of the speed benefit for as many zones as possible.
 *
 * Zones with > PAGE_SIZE allocations can't take advantage of this
 * because kernel_memory_allocate doesn't give out physically contiguous pages.
 *
 * zone_virtual_addr()
 *  - translates an address from the static kernel region to the zone_map
 *  - returns the same address if it's not from the static kernel region
 * It relies on the fact that a physical page mapped to the
 * zone_map is not mapped anywhere else (except the static kernel region).
 *
 * zone_alias_addr()
 *  - translates a virtual memory address from the zone_map to the
 *    corresponding address in the static kernel region
 *
 */

#if	!ZONE_ALIAS_ADDR
#define from_zone_map(addr, size) \
	((vm_offset_t)(addr) >= zone_map_min_address && \
	 ((vm_offset_t)(addr) + size - 1) < zone_map_max_address )
#else
#define from_zone_map(addr, size) \
	((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) >= zone_map_min_address && \
	 ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) + size - 1) < zone_map_max_address )
#endif

/*
 * Zone Corruption Debugging
 *
 * We use three techniques to detect modification of a zone element
 * after it's been freed.
 *
 * (1) Check the freelist next pointer for sanity.
 * (2) Store a backup of the next pointer at the end of the element,
 *     and compare it to the primary next pointer when the element is allocated
 *     to detect corruption of the freelist due to use-after-free bugs.
 *     The backup pointer is also XORed with a per-boot random cookie.
 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
 *     and check for that value when the element is being reused to make sure
 *     no part of the element has been modified while it was on the freelist.
 *     This will also help catch read-after-frees, as code will now dereference
 *     0xdeadbeef instead of a valid but freed pointer.
 *
 * (1) and (2) occur for every allocation and free to a zone.
 *     This is done to make it slightly more difficult for an attacker to
 *     manipulate the freelist to behave in a specific way.
 *
 * Poisoning (3) occurs periodically for every N frees (counted per-zone)
 * and on every free for zones smaller than a cacheline.  If -zp
 * is passed as a boot arg, poisoning occurs for every free.
 *
 * Performance slowdown is inversely proportional to the frequency of poisoning,
 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
 * and higher.  You can expect to find a 100% reproducible bug in an average of
 * N tries, with a standard deviation of about N, but you will want to set
 * "-zp" to always poison every free if you are attempting to reproduce
 * a known bug.
 *
 * For a more heavyweight, but finer-grained method of detecting misuse
 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
 *
 * Zone Corruption Logging
 *
 * You can also track where corruptions come from by using the boot-arguments
 * "zlog=<zone name to log> -zc".  Search for "Zone corruption logging" later
 * in this document for more implementation and usage information.
 *
 * Zone Leak Detection
 *
 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
 * found later in this file via the showtopztrace and showz* macros in kgmacros,
 * or use zlog without the -zc argument.
 *
 */


#if defined(__LP64__)
#define ZP_POISON	0xdeadbeefdeadbeef
#else
#define ZP_POISON	0xdeadbeef
#endif

#define ZP_DEFAULT_SAMPLING_FACTOR 16

/*
 *  A zp_factor of 0 indicates zone poisoning is disabled,
 *  however, we still poison zones smaller than zp_tiny_zone_limit (a cacheline).
 *  Passing the -no-zp boot-arg disables even this behavior.
 *  In all cases, we record and check the integrity of a backup pointer.
 */

/* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
uint32_t	zp_factor		= 0;

/* set in zp_init, zero indicates -no-zp boot-arg */
vm_size_t	zp_tiny_zone_limit	= 0;

/* initialized to a per-boot random value in zp_init */
uintptr_t	zp_poisoned_cookie	= 0;
uintptr_t	zp_nopoison_cookie	= 0;
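
/*
 * Illustrative sketch (hypothetical values, mirroring the checks that
 * free_to_zone/try_alloc_from_zone implement below): with "next" being
 * the freelist next pointer of an element,
 *
 *	free:	*backup = next ^ zp_nopoison_cookie;
 *		(or ^ zp_poisoned_cookie if the element was poisoned)
 *	alloc:	if (*primary != (*backup ^ zp_nopoison_cookie) &&
 *		    *primary != (*backup ^ zp_poisoned_cookie))
 *			backup_ptr_mismatch_panic(...);
 *
 * An attacker who overwrites the element does not know the per-boot
 * cookies, so a forged primary pointer matches neither decoding of the
 * backup and the corruption is caught before the pointer is used.
 */
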
/*
 * initialize zone poisoning
 * called from zone_bootstrap before any allocations are made from zalloc
 */
static inline void
zp_init(void)
{
	char temp_buf[16];

	/*
	 * Initialize backup pointer random cookie for poisoned elements
	 * Try not to call early_random() back to back, it may return
	 * the same value if mach_absolute_time doesn't have sufficient time
	 * to tick over between calls.  <rdar://problem/11597395>
	 * (This is only a problem on embedded devices)
	 */
	zp_poisoned_cookie = (uintptr_t) early_random();

	/*
	 * Always poison zones smaller than a cacheline,
	 * because it's pretty close to free
	 */
	ml_cpu_info_t cpu_info;
	ml_cpu_get_info(&cpu_info);
	zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;

	zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;

	//TODO: Bigger permutation?
	/*
	 * Permute the default factor +/- 1 to make it less predictable
	 * This adds or subtracts ~4 poisoned objects per 1000 frees.
	 */
	if (zp_factor != 0) {
		uint32_t rand_bits = early_random() & 0x3;

		if (rand_bits == 0x1)
			zp_factor += 1;
		else if (rand_bits == 0x2)
			zp_factor -= 1;
		/* if 0x0 or 0x3, leave it alone */
	}

	/* -zp: enable poisoning for every alloc and free */
	if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
		zp_factor = 1;
	}

	/* -no-zp: disable poisoning completely even for tiny zones */
	if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
		zp_factor          = 0;
		zp_tiny_zone_limit = 0;
		printf("Zone poisoning disabled\n");
	}

	/* zp-factor=XXXX: override how often to poison freed zone elements */
	if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
		printf("Zone poisoning factor override: %u\n", zp_factor);
	}

	/* Initialize backup pointer random cookie for unpoisoned elements */
	zp_nopoison_cookie = (uintptr_t) early_random();

#if MACH_ASSERT
	if (zp_poisoned_cookie == zp_nopoison_cookie)
		panic("early_random() is broken: %p and %p are not random\n",
		      (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
#endif

	/*
	 * Use the last bit in the backup pointer to hint poisoning state
	 * to backup_ptr_mismatch_panic.  Valid zone pointers are aligned, so
	 * the low bits are zero.
	 */
	zp_poisoned_cookie |=   (uintptr_t)0x1ULL;
	zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);

#if defined(__LP64__)
	/*
	 * Make backup pointers more obvious in GDB for 64 bit
	 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
	 * (0xFACADE = 0xFFFFFF ^ 0x053521)
	 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
	 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
	 * by the sanity check, so it's OK for that part of the cookie to be predictable.
	 *
	 * TODO: Use #defines, xors, and shifts
	 */

	zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
	zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */

	zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
	zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
#endif
}

/* zone_map page count for page table structure */
uint64_t zone_map_table_page_count = 0;

/*
 * These macros are used to keep track of the number
 * of pages being used by the zone currently.  The
 * z->page_count is protected by the zone lock.
 */
#define ZONE_PAGE_COUNT_INCR(z, count)		\
{						\
	OSAddAtomic64(count, &(z->page_count));	\
}

#define ZONE_PAGE_COUNT_DECR(z, count)			\
{							\
	OSAddAtomic64(-count, &(z->page_count));	\
}

/* for is_sane_zone_element and garbage collection */

vm_offset_t	zone_map_min_address = 0;  /* initialized in zone_init */
vm_offset_t	zone_map_max_address = 0;

/* Helpful for walking through a zone's free element list. */
struct zone_free_element {
	struct zone_free_element *next;
	/* ... */
	/* void *backup_ptr; */
};

struct zone_page_metadata {
	queue_chain_t		pages;
	struct zone_free_element *elements;
	zone_t			zone;
	uint16_t		alloc_count;
	uint16_t		free_count;
};

/* The backup pointer is stored in the last pointer-sized location in an element. */
static inline vm_offset_t *
get_backup_ptr(vm_size_t	elem_size,
	       vm_offset_t	*element)
{
	return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
}

static inline struct zone_page_metadata *
get_zone_page_metadata(struct zone_free_element *element)
{
	return (struct zone_page_metadata *)(trunc_page((vm_offset_t)element) + PAGE_SIZE - sizeof(struct zone_page_metadata));
}
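
/*
 * Layout sketch (illustrative, assuming a 4K page and LP64): every
 * element on a page shares the metadata stored in the page's tail:
 *
 *	element  = 0xffffff8012345080	(any element on the page)
 *	page     = trunc_page(element)	= 0xffffff8012345000
 *	metadata = page + PAGE_SIZE - sizeof(struct zone_page_metadata)
 *
 * Likewise, get_backup_ptr() returns the last pointer-sized slot of the
 * element itself, e.g. offset 40 of a 48-byte element when
 * sizeof(vm_offset_t) == 8.
 */
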
/*
 * Zone checking helper function.
 * A pointer that satisfies these conditions is OK to be a freelist next pointer
 * A pointer that doesn't satisfy these conditions indicates corruption
 */
static inline boolean_t
is_sane_zone_ptr(zone_t		zone,
		 vm_offset_t	addr,
		 size_t		obj_size)
{
	/* Must be aligned to pointer boundary */
	if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0))
		return FALSE;

	/* Must be a kernel address */
	if (__improbable(!pmap_kernel_va(addr)))
		return FALSE;

	/* Must be from zone map if the zone only uses memory from the zone_map */
	/*
	 * TODO: Remove the zone->collectable check when every
	 * zone using foreign memory is properly tagged with allows_foreign
	 */
	if (zone->collectable && !zone->allows_foreign) {
#if ZONE_ALIAS_ADDR
		/*
		 * If this address is in the static kernel region, it might be
		 * the alias address of a valid zone element.
		 * If we tried to find the zone_virtual_addr() of an invalid
		 * address in the static kernel region, it will panic, so don't
		 * check addresses in this region.
		 *
		 * TODO: Use a safe variant of zone_virtual_addr to
		 *       make this check more accurate
		 *
		 * The static kernel region is mapped at:
		 * [gVirtBase, gVirtBase + gPhysSize]
		 */
		if ((addr - gVirtBase) < gPhysSize)
			return TRUE;
#endif
		/* check if addr is from zone map */
		if (addr >= zone_map_min_address &&
		   (addr + obj_size - 1) < zone_map_max_address)
			return TRUE;

		return FALSE;
	}

	return TRUE;
}

static inline boolean_t
is_sane_zone_page_metadata(zone_t	zone,
			   vm_offset_t	page_meta)
{
	/* NULL page metadata structures are invalid */
	if (page_meta == 0)
		return FALSE;
	return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
}

static inline boolean_t
is_sane_zone_element(zone_t	zone,
		     vm_offset_t addr)
{
	/* NULL is OK because it indicates the tail of the list */
	if (addr == 0)
		return TRUE;
	return is_sane_zone_ptr(zone, addr, zone->elem_size);
}

/* Someone wrote to freed memory. */
static inline void /* noreturn */
zone_element_was_modified_panic(zone_t	      zone,
				vm_offset_t   found,
				vm_offset_t   expected,
				vm_offset_t   offset)
{
	panic("a freed zone element has been modified: expected %p but found %p, bits changed %p, at offset %d of %d in zone: %s",
	      (void *)   expected,
	      (void *)   found,
	      (void *)   (expected ^ found),
	      (uint32_t) offset,
	      (uint32_t) zone->elem_size,
	      zone->zone_name);
}

/*
 * The primary and backup pointers don't match.
 * Determine which one was likely the corrupted pointer, find out what it
 * probably should have been, and panic.
 * I would like to mark this as noreturn, but panic() isn't marked noreturn.
 */
static void /* noreturn */
backup_ptr_mismatch_panic(zone_t	zone,
			  vm_offset_t	primary,
			  vm_offset_t	backup)
{
	vm_offset_t likely_backup;

	boolean_t   sane_backup;
	boolean_t   sane_primary = is_sane_zone_element(zone, primary);
	boolean_t   element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;

	if (element_was_poisoned) {
		likely_backup = backup ^ zp_poisoned_cookie;
		sane_backup = is_sane_zone_element(zone, likely_backup);
	} else {
		likely_backup = backup ^ zp_nopoison_cookie;
		sane_backup = is_sane_zone_element(zone, likely_backup);
	}

	/* The primary is definitely the corrupted one */
	if (!sane_primary && sane_backup)
		zone_element_was_modified_panic(zone, primary, likely_backup, 0);

	/* The backup is definitely the corrupted one */
	if (sane_primary && !sane_backup)
		zone_element_was_modified_panic(zone, backup, primary,
						zone->elem_size - sizeof(vm_offset_t));

	/*
	 * Not sure which is the corrupted one.
	 * It's less likely that the backup pointer was overwritten with
	 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
	 * primary pointer has been overwritten with a sane but incorrect address.
	 */
	if (sane_primary && sane_backup)
		zone_element_was_modified_panic(zone, primary, likely_backup, 0);

	/* Neither are sane, so just guess. */
	zone_element_was_modified_panic(zone, primary, likely_backup, 0);
}


/*
 * Sets the next element of tail to elem.
 * elem can be NULL.
 * Preserves the poisoning state of the element.
 */
static inline void
append_zone_element(zone_t	zone,
		    struct zone_free_element *tail,
		    struct zone_free_element *elem)
{
	vm_offset_t *backup = get_backup_ptr(zone->elem_size, (vm_offset_t *) tail);

	vm_offset_t old_backup = *backup;

	vm_offset_t old_next = (vm_offset_t) tail->next;
	vm_offset_t new_next = (vm_offset_t) elem;

	if      (old_next == (old_backup ^ zp_nopoison_cookie))
		*backup = new_next ^ zp_nopoison_cookie;
	else if (old_next == (old_backup ^ zp_poisoned_cookie))
		*backup = new_next ^ zp_poisoned_cookie;
	else
		backup_ptr_mismatch_panic(zone,
					  old_next,
					  old_backup);

	tail->next = elem;
}


/*
 * Insert a linked list of elements (delineated by head and tail) at the head of
 * the zone free list.  Every element in the list being added has already gone
 * through append_zone_element, so their backup pointers are already
 * set properly.
 * Precondition: There should be no elements after tail
 */
static inline void
add_list_to_zone(zone_t zone,
		 struct zone_free_element *head,
		 struct zone_free_element *tail)
{
	assert(tail->next == NULL);
	assert(!zone->use_page_list);

	append_zone_element(zone, tail, zone->free_elements);

	zone->free_elements = head;
}


/*
 * Adds the element to the head of the zone's free list
 * Keeps a backup next-pointer at the end of the element
 * Poisons the element with ZP_POISON every zp_factor frees
 */
static inline void
free_to_zone(zone_t	zone,
	     vm_offset_t element)
{
	vm_offset_t old_head;
	struct zone_page_metadata *page_meta;

	vm_offset_t *primary  = (vm_offset_t *) element;
	vm_offset_t *backup   = get_backup_ptr(zone->elem_size, primary);

	if (zone->use_page_list) {
		page_meta = get_zone_page_metadata((struct zone_free_element *)element);
		assert(page_meta->zone == zone);
		old_head = (vm_offset_t)page_meta->elements;
	} else {
		old_head = (vm_offset_t)zone->free_elements;
	}

#if MACH_ASSERT
	if (__improbable(!is_sane_zone_element(zone, old_head)))
		panic("zfree: invalid head pointer %p for freelist of zone %s\n",
		      (void *) old_head, zone->zone_name);
#endif

	if (__improbable(!is_sane_zone_element(zone, element)))
		panic("zfree: freeing invalid pointer %p to zone %s\n",
		      (void *) element, zone->zone_name);

	boolean_t poison = FALSE;

	/* Always poison tiny zones' elements (limit is 0 if -no-zp is set) */
	if (zone->elem_size <= zp_tiny_zone_limit)
		poison = TRUE;
	else if (zp_factor != 0 && ++zone->zp_count >= zp_factor) {
		/* Poison zone elements periodically */
		zone->zp_count = 0;
		poison = TRUE;
	}

	if (poison) {
		/* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
		vm_offset_t *element_cursor = primary + 1;

		for ( ; element_cursor < backup; element_cursor++)
			*element_cursor = ZP_POISON;
	}

	/*
	 * Always write a redundant next pointer
	 * So that it is more difficult to forge, xor it with a random cookie
	 * A poisoned element is indicated by using zp_poisoned_cookie
	 * instead of zp_nopoison_cookie
	 */
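
	/*
	 * For example (hypothetical values): if old_head is
	 * 0xffffff80099aa000 and this free is not a poisoning one, the
	 * backup slot receives old_head ^ zp_nopoison_cookie; on the next
	 * allocation, try_alloc_from_zone() XORs the backup with the same
	 * cookie and requires the result to equal the primary pointer.
	 */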

	*backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);

	/* Insert this element at the head of the free list */
	*primary = old_head;
	if (zone->use_page_list) {
		page_meta->elements = (struct zone_free_element *)element;
		page_meta->free_count++;
		if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
			if (page_meta->free_count == 1) {
				/* first foreign element freed on page, move from all_used */
				remqueue((queue_entry_t)page_meta);
				enqueue_tail(&zone->pages.any_free_foreign, (queue_entry_t)page_meta);
			} else {
				/* no other list transitions */
			}
		} else if (page_meta->free_count == page_meta->alloc_count) {
			/* whether the page was on the intermediate or all_used queue, move it to free */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.all_free, (queue_entry_t)page_meta);
		} else if (page_meta->free_count == 1) {
			/* first free element on page, move from all_used */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta);
		}
	} else {
		zone->free_elements = (struct zone_free_element *)element;
	}
	zone->count--;
	zone->countfree++;
}


/*
 * Removes an element from the zone's free list, returning 0 if the free list is empty.
 * Verifies that the next-pointer and backup next-pointer are intact,
 * and verifies that a poisoned element hasn't been modified.
 */
static inline vm_offset_t
try_alloc_from_zone(zone_t zone)
{
	vm_offset_t  element;
	struct zone_page_metadata *page_meta;

	/* if zone is empty, bail */
	if (zone->use_page_list) {
		if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign))
			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
		else if (!queue_empty(&zone->pages.intermediate))
			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
		else if (!queue_empty(&zone->pages.all_free))
			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
		else {
			return 0;
		}

		/* Check if page_meta passes is_sane_zone_element */
		if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta)))
			panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
			      (void *) page_meta, zone->zone_name);
		assert(page_meta->zone == zone);
		element = (vm_offset_t)page_meta->elements;
	} else {
		if (zone->free_elements == NULL)
			return 0;

		element = (vm_offset_t)zone->free_elements;
	}

#if MACH_ASSERT
	if (__improbable(!is_sane_zone_element(zone, element)))
		panic("zalloc: invalid head pointer %p for freelist of zone %s\n",
		      (void *) element, zone->zone_name);
#endif

	vm_offset_t *primary = (vm_offset_t *) element;
	vm_offset_t *backup  = get_backup_ptr(zone->elem_size, primary);

	vm_offset_t  next_element        = *primary;
	vm_offset_t  next_element_backup = *backup;

	/*
	 * backup_ptr_mismatch_panic will determine what next_element
	 * should have been, and print it appropriately
	 */
	if (__improbable(!is_sane_zone_element(zone, next_element)))
		backup_ptr_mismatch_panic(zone, next_element, next_element_backup);

	/* Check the backup pointer for the regular cookie */
	if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {

		/* Check for the poisoned cookie instead */
		if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie)))
			/* Neither cookie is valid, corruption has occurred */
			backup_ptr_mismatch_panic(zone, next_element, next_element_backup);

		/*
		 * Element was marked as poisoned, so check its integrity,
		 * skipping the primary and backup pointers at the beginning and end.
		 */
		vm_offset_t *element_cursor = primary + 1;

		for ( ; element_cursor < backup ; element_cursor++)
			if (__improbable(*element_cursor != ZP_POISON))
				zone_element_was_modified_panic(zone,
								*element_cursor,
								ZP_POISON,
								((vm_offset_t)element_cursor) - element);
	}

	if (zone->use_page_list) {

		/* Make sure the page_meta is at the correct offset from the start of page */
		if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element)))
			panic("zalloc: metadata located at incorrect location on page of zone %s\n",
			      zone->zone_name);

		/* Make sure next_element belongs to the same page as page_meta */
		if (next_element) {
			if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element)))
				panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
				      (void *)next_element, (void *)element, zone->zone_name);
		}
	}

	/*
	 * Clear out the old next pointer and backup to avoid leaking the cookie
	 * and so that only values on the freelist have a valid cookie
	 */
	*primary = ZP_POISON;
	*backup  = ZP_POISON;

	/* Remove this element from the free list */
	if (zone->use_page_list) {

		page_meta->elements = (struct zone_free_element *)next_element;
		page_meta->free_count--;

		if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
			if (page_meta->free_count == 0) {
				/* move to all used */
				remqueue((queue_entry_t)page_meta);
				enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta);
			} else {
				/* no other list transitions */
			}
		} else if (page_meta->free_count == 0) {
			/* remove from intermediate or free, move to all_used */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta);
		} else if (page_meta->alloc_count == page_meta->free_count + 1) {
			/* remove from free, move to intermediate */
			remqueue((queue_entry_t)page_meta);
			enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta);
		}
	} else {
		zone->free_elements = (struct zone_free_element *)next_element;
	}
	zone->countfree--;
	zone->count++;
	zone->sum_count++;

	return element;
}


/*
 * End of zone poisoning
 */

/*
 * Fake zones for things that want to report via zprint but are not actually zones.
 */
struct fake_zone_info {
	const char* name;
	void (*init)(int);
	void (*query)(int *,
		      vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *,
		      uint64_t *, int *, int *, int *);
};

static const struct fake_zone_info fake_zones[] = {
	{
		.name = "kernel_stacks",
		.init = stack_fake_zone_init,
		.query = stack_fake_zone_info,
	},
	{
		.name = "page_tables",
		.init = pt_fake_zone_init,
		.query = pt_fake_zone_info,
	},
	{
		.name = "kalloc.large",
		.init = kalloc_fake_zone_init,
		.query = kalloc_fake_zone_info,
	},
};
static const unsigned int num_fake_zones =
	sizeof (fake_zones) / sizeof (fake_zones[0]);

/*
 * Zone info options
 */
boolean_t zinfo_per_task = FALSE;	/* enabled by -zinfop in boot-args */
#define ZINFO_SLOTS 200			/* for now */
#define ZONES_MAX (ZINFO_SLOTS - num_fake_zones - 1)

/*
 * Support for garbage collection of unused zone pages
 *
 * The kernel virtually allocates the "zone map" submap of the kernel
 * map.  When an individual zone needs more storage, memory is allocated
 * out of the zone map, and the two-level "zone_page_table" is
 * on-demand expanded so that it has entries for those pages.
 * zone_page_init()/zone_page_alloc() initialize "alloc_count"
 * to the number of zone elements that occupy the zone page (which may
 * be a minimum of 1, including if a zone element spans multiple
 * pages).
 *
 * Asynchronously, the zone_gc() logic attempts to walk zone free
 * lists to see if all the elements on a zone page are free.  If
 * "collect_count" (which it increments during the scan) matches
 * "alloc_count", the zone page is a candidate for collection and the
 * physical page is returned to the VM system.  During this process, the
 * first word of the zone page is re-used to maintain a linked list of
 * to-be-collected zone pages.
 */
typedef uint32_t zone_page_index_t;
#define ZONE_PAGE_INDEX_INVALID		((zone_page_index_t)0xFFFFFFFFU)

struct zone_page_table_entry {
	volatile	uint16_t	alloc_count;
	volatile	uint16_t	collect_count;
};

#define	ZONE_PAGE_USED		0
#define ZONE_PAGE_UNUSED	0xffff

/* Forwards */
void		zone_page_init(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_alloc(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_free_element(
				zone_page_index_t	*free_page_head,
				zone_page_index_t	*free_page_tail,
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_collect(
				vm_offset_t	addr,
				vm_size_t	size);

boolean_t	zone_page_collectable(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_keep(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_display_zprint(void);

zone_t		zone_find_largest(void);

/*
 * Async allocation of zones
 * This mechanism allows for bootstrapping an empty zone which is setup with
 * non-blocking flags.  The first call to zalloc_noblock() will kick off a thread_call
 * to zalloc_async.  We perform a zalloc() (which may block) and then an immediate free.
 * This will prime the zone for the next use.
 *
 * Currently the thread_call callout (zalloc_async) will loop through all zones
 * looking for any zone with async_pending set and do the work for it.
 *
 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
 * then zalloc_noblock to an empty zone may succeed.
 */
void		zalloc_async(
				thread_call_param_t	p0,
				thread_call_param_t	p1);

static thread_call_data_t call_async_alloc;

vm_map_t	zone_map = VM_MAP_NULL;

zone_t		zone_zone = ZONE_NULL;	/* the zone containing other zones */

zone_t		zinfo_zone = ZONE_NULL; /* zone of per-task zone info */

/*
 *	The VM system gives us an initial chunk of memory.
 *	It has to be big enough to allocate the zone_zone
 *	all the way through the pmap zone.
 */

vm_offset_t	zdata;
vm_size_t	zdata_size;

#define zone_wakeup(zone) thread_wakeup((event_t)(zone))
#define zone_sleep(zone)				\
	(void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN, (event_t)(zone), THREAD_UNINT);

/*
 *	The zone_locks_grp allows for collecting lock statistics.
 *	All locks are associated to this group in zinit.
 *	Look at tools/lockstat for debugging lock contention.
 */

lck_grp_t	zone_locks_grp;
lck_grp_attr_t	zone_locks_grp_attr;

#define lock_zone_init(zone)					\
MACRO_BEGIN							\
	lck_attr_setdefault(&(zone)->lock_attr);		\
	lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext,	\
	    &zone_locks_grp, &(zone)->lock_attr);		\
MACRO_END

#define lock_try_zone(zone)	lck_mtx_try_lock_spin(&zone->lock)

/*
 *	Garbage collection map information
 */
#define ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE (32)
struct zone_page_table_entry * volatile zone_page_table[ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE];
vm_size_t			zone_page_table_used_size;
unsigned int			zone_pages;
unsigned int			zone_page_table_second_level_size;	/* power of 2 */
unsigned int			zone_page_table_second_level_shift_amount;

#define zone_page_table_first_level_slot(x)  ((x) >> zone_page_table_second_level_shift_amount)
#define zone_page_table_second_level_slot(x) ((x) & (zone_page_table_second_level_size - 1))

void   zone_page_table_expand(zone_page_index_t pindex);
struct zone_page_table_entry *zone_page_table_lookup(zone_page_index_t pindex);
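
/*
 * Sketch of how a lookup composes the two macros above, consistent with
 * the two-level design described earlier (pindex is a zone page index):
 *
 *	struct zone_page_table_entry *second_level =
 *	    zone_page_table[zone_page_table_first_level_slot(pindex)];
 *	struct zone_page_table_entry *entry = (second_level != NULL) ?
 *	    &second_level[zone_page_table_second_level_slot(pindex)] : NULL;
 *
 * The first level is a fixed 32-slot array; each slot points to an
 * on-demand allocated second-level array whose size is a power of 2,
 * so both slot macros reduce to a shift and a mask.
 */
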
/*
 *	Exclude more than one concurrent garbage collection
 */
decl_lck_mtx_data(, zone_gc_lock)

lck_attr_t      zone_gc_lck_attr;
lck_grp_t       zone_gc_lck_grp;
lck_grp_attr_t  zone_gc_lck_grp_attr;
lck_mtx_ext_t   zone_gc_lck_ext;

/*
 *	Protects first_zone, last_zone, num_zones,
 *	and the next_zone field of zones.
 */
decl_simple_lock_data(, all_zones_lock)
zone_t			first_zone;
zone_t			*last_zone;
unsigned int		num_zones;

boolean_t zone_gc_allowed = TRUE;
boolean_t zone_gc_forced = FALSE;
boolean_t panic_include_zprint = FALSE;
boolean_t zone_gc_allowed_by_time_throttle = TRUE;

#define ZALLOC_DEBUG_ZONEGC		0x00000001
#define ZALLOC_DEBUG_ZCRAM		0x00000002
uint32_t zalloc_debug = 0;

/*
 * Zone leak debugging code
 *
 * When enabled, this code keeps a log to track allocations to a particular zone that have not
 * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
 * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
 * off by default.
 *
 * Enable the logging via the boot-args.  Add the parameter "zlog=<zone>" to boot-args where <zone>
 * is the name of the zone you wish to log.
 *
 * This code only tracks one zone, so you need to identify which one is leaking first.
 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
 * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
 * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
 * See the help in the kgmacros for usage info.
 *
 *
 * Zone corruption logging
 *
 * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
 * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
 * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
 * corrupted to examine its history.  This should lead to the source of the corruption.
 */

static int log_records;	/* size of the log, expressed in number of records */

#define MAX_ZONE_NAME	32	/* max length of a zone name we can take from the boot-args */

static char zone_name_to_log[MAX_ZONE_NAME] = "";	/* the zone name we're logging, if any */

/* Log allocations and frees to help debug a zone element corruption */
boolean_t       corruption_debug_flag    = FALSE;    /* enabled by "-zc" boot-arg */

/*
 * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to
 * the number of records you want in the log.  For example, "zrecs=1000" sets it to 1000 records.  Note
 * that the larger the size of the log, the slower the system will run due to linear searching in the log,
 * but one doesn't generally care about performance when tracking down a leak.  The log is capped at
 * ZRECORDS_MAX records since going much larger than this tends to make the system unresponsive and
 * unbootable on small memory configurations.  The default value is 4000 records.
 */

#if	defined(__LP64__)
#define ZRECORDS_MAX 		128000		/* Max records allowed in the log */
#else
#define ZRECORDS_MAX 		8000		/* Max records allowed in the log */
#endif
#define ZRECORDS_DEFAULT	4000		/* default records in log if zrecs is not specified in boot-args */

/*
 * Each record in the log contains a pointer to the zone element it refers to,
 * and a small array to hold the pc's from the stack trace.  A
 * record is added to the log each time a zalloc() is done in the zone_of_interest.  For leak debugging,
 * the record is cleared when a zfree() is done.  For corruption debugging, the log tracks both allocs and frees.
 * If the log fills, old records are replaced as if it were a circular buffer.
 */
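
/*
 * Example boot-args line combining the knobs above (the zone name
 * "kalloc.256" is only an illustration; pick the suspect zone with
 * zprint first):
 *
 *	zlog=kalloc.256 zrecs=8000 -zc
 *
 * This tracks both allocations and frees (-zc) for kalloc.256 in an
 * 8000-record log.
 */
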
/*
 * Opcodes for the btlog operation field:
 */

#define ZOP_ALLOC	1
#define ZOP_FREE	0

/*
 * The allocation log and all the related variables are protected by the zone lock for the zone_of_interest
 */
static btlog_t *zlog_btlog;		/* the log itself, dynamically allocated when logging is enabled */
static zone_t  zone_of_interest = NULL;	/* the zone being watched; corresponds to zone_name_to_log */

/*
 * Decide if we want to log this zone by doing a string compare between a zone name and the name
 * of the zone to log.  Return true if the strings are equal, false otherwise.  Because it's not
 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
 * match a space in the zone name.
 */

static int
log_this_zone(const char *zonename, const char *logname)
{
	int len;
	const char *zc = zonename;
	const char *lc = logname;

	/*
	 * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
	 */

	for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {

		/*
		 * If the current characters don't match, check for a space
		 * in the zone name and a corresponding period in the log name.
		 * If that's not there, then the strings don't match.
		 */

		if (*zc != *lc && !(*zc == ' ' && *lc == '.'))
			break;

		/*
		 * The strings are equal so far.  If we're at the end, then it's a match.
		 */

		if (*zc == '\0')
			return TRUE;
	}

	return FALSE;
}
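
/*
 * For example (hypothetical zone name): a zone named "vm map entries"
 * contains spaces, which can't appear in boot-args, so it would be
 * selected with "zlog=vm.map.entries":
 *
 *	log_this_zone("vm map entries", "vm.map.entries")	-> TRUE
 *	log_this_zone("vm map entries", "vm.map")		-> FALSE
 */
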
/*
 * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in and
 * the buffer for the records has been allocated.
 */

#define DO_LOGGING(z)		(zlog_btlog && (z) == zone_of_interest)

extern boolean_t kmem_alloc_ready;

#if CONFIG_ZLEAKS
#pragma mark -
#pragma mark Zone Leak Detection

/*
 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
 * allocations made by the zone allocator.  Every zleak_sample_factor allocations in each zone, we capture a
 * backtrace.  Every free, we examine the table and determine if the allocation was being tracked,
 * and stop tracking it if it was being tracked.
 *
 * We track the allocations in the zallocations hash table, which stores the address that was returned from
 * the zone allocator.  Each stored entry in the zallocations table points to an entry in the ztraces table, which
 * stores the backtrace associated with that allocation.  This provides uniquing for the relatively large
 * backtraces - we don't store them more than once.
 *
 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
 * a large amount of virtual space.
 */
#define ZLEAK_STATE_ENABLED		0x01	/* Zone leak monitoring should be turned on if zone_map fills up. */
#define ZLEAK_STATE_ACTIVE 		0x02	/* We are actively collecting traces. */
#define ZLEAK_STATE_ACTIVATING 		0x04	/* Some thread is doing setup; others should move along. */
#define ZLEAK_STATE_FAILED		0x08	/* Attempt to allocate tables failed.  We will not try again. */
uint32_t	zleak_state = 0;		/* State of collection, as above */

boolean_t	panic_include_ztrace	= FALSE;  	/* Enable zleak logging on panic */
vm_size_t 	zleak_global_tracking_threshold;	/* Size of zone map at which to start collecting data */
vm_size_t 	zleak_per_zone_tracking_threshold;	/* Size a zone will have before we will collect data on it */
unsigned int 	zleak_sample_factor	= 1000;		/* Allocations per sample attempt */

/*
 * Counters for allocation statistics.
 */

/* Times two active records want to occupy the same spot */
unsigned int z_alloc_collisions = 0;
unsigned int z_trace_collisions = 0;

/* Times a new record lands on a spot previously occupied by a freed allocation */
unsigned int z_alloc_overwrites = 0;
unsigned int z_trace_overwrites = 0;

/* Times a new alloc or trace is put into the hash table */
unsigned int z_alloc_recorded	= 0;
unsigned int z_trace_recorded	= 0;

/* Times zleak_log returned false due to not being able to acquire the lock */
unsigned int z_total_conflicts	= 0;


#pragma mark struct zallocation
/*
 * Structure for keeping track of an allocation
 * An allocation bucket is in use if its element is not NULL
 */
struct zallocation {
	uintptr_t		za_element;		/* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
	vm_size_t		za_size;		/* how much memory did this allocation take up? */
	uint32_t		za_trace_index;		/* index into ztraces for backtrace associated with allocation */
	/* TODO: #if this out */
	uint32_t		za_hit_count;		/* for determining effectiveness of hash function */
};

/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;

vm_size_t zleak_max_zonemap_size;

/* Hashmaps of allocations and their corresponding traces */
static struct zallocation*	zallocations;
static struct ztrace*		ztraces;

/* not static so that panic can see this, see kern/debug.c */
struct ztrace*			top_ztrace;

/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
static lck_spin_t		zleak_lock;
static lck_attr_t		zleak_lock_attr;
static lck_grp_t		zleak_lock_grp;
static lck_grp_attr_t		zleak_lock_grp_attr;

/*
 * Initializes the zone leak monitor.  Called from zone_init()
 */
static void
zleak_init(vm_size_t max_zonemap_size)
{
	char			scratch_buf[16];
	boolean_t		zleak_enable_flag = FALSE;

	zleak_max_zonemap_size = max_zonemap_size;
	zleak_global_tracking_threshold = max_zonemap_size / 2;
	zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;

	/* -zleakoff (flag to disable zone leak monitor) */
	if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
		zleak_enable_flag = FALSE;
		printf("zone leak detection disabled\n");
	} else {
		zleak_enable_flag = TRUE;
		printf("zone leak detection enabled\n");
	}

	/* zfactor=XXXX (override how often to sample the zone allocator) */
	if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
		printf("Zone leak factor override: %u\n", zleak_sample_factor);
	}

	/* zleak-allocs=XXXX (override number of buckets in zallocations) */
	if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
		printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
		if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets-1))) {
			printf("Override isn't a power of two, bad things might happen!\n");
		}
	}

	/* zleak-traces=XXXX (override number of buckets in ztraces) */
	if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
		printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
		if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets-1))) {
			printf("Override isn't a power of two, bad things might happen!\n");
		}
	}

	/* allocate the zleak_lock */
	lck_grp_attr_setdefault(&zleak_lock_grp_attr);
	lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
	lck_attr_setdefault(&zleak_lock_attr);
	lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);

	if (zleak_enable_flag) {
		zleak_state = ZLEAK_STATE_ENABLED;
	}
}
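
/*
 * Example boot-args line exercising the overrides parsed above (bucket
 * counts are illustrative and must be powers of two):
 *
 *	zfactor=500 zleak-allocs=32768 zleak-traces=16384
 *
 * This samples every 500th allocation in each zone and sizes the two
 * hash tables explicitly.
 */
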
#if CONFIG_ZLEAKS

/*
 * Support for kern.zleak.active sysctl - a simplified
 * version of the zleak_state variable.
 */
int
get_zleak_state(void)
{
	if (zleak_state & ZLEAK_STATE_FAILED)
		return (-1);
	if (zleak_state & ZLEAK_STATE_ACTIVE)
		return (1);
	return (0);
}

#endif


kern_return_t
zleak_activate(void)
{
	kern_return_t retval;
	vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
	vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
	void *allocations_ptr = NULL;
	void *traces_ptr = NULL;

	/* Only one thread attempts to activate at a time */
	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
		return KERN_SUCCESS;
	}

	/* Indicate that we're doing the setup */
	lck_spin_lock(&zleak_lock);
	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
		lck_spin_unlock(&zleak_lock);
		return KERN_SUCCESS;
	}

	zleak_state |= ZLEAK_STATE_ACTIVATING;
	lck_spin_unlock(&zleak_lock);

	/* Allocate and zero tables */
	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size);
	if (retval != KERN_SUCCESS) {
		goto fail;
	}

	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size);
	if (retval != KERN_SUCCESS) {
		goto fail;
	}

	bzero(allocations_ptr, z_alloc_size);
	bzero(traces_ptr, z_trace_size);

	/* Everything's set.  Install tables, mark active. */
	zallocations = allocations_ptr;
	ztraces = traces_ptr;

	/*
	 * Initialize the top_ztrace to the first entry in ztraces,
	 * so we don't have to check for null in zleak_log
	 */
	top_ztrace = &ztraces[0];

	/*
	 * Note that we do need a barrier between installing
	 * the tables and setting the active flag, because the zfree()
	 * path accesses the table without a lock if we're active.
	 */
	lck_spin_lock(&zleak_lock);
	zleak_state |= ZLEAK_STATE_ACTIVE;
	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
	lck_spin_unlock(&zleak_lock);

	return 0;

fail:
	/*
	 * If we fail to allocate memory, don't further tax
	 * the system by trying again.
	 */
	lck_spin_lock(&zleak_lock);
	zleak_state |= ZLEAK_STATE_FAILED;
	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
	lck_spin_unlock(&zleak_lock);

	if (allocations_ptr != NULL) {
		kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
	}

	if (traces_ptr != NULL) {
		kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
	}

	return retval;
}

/*
 * TODO: What about allocations that never get deallocated,
 * especially ones with unique backtraces?  Should we wait to record
 * until after boot has completed?
 * (How many persistent zallocs are there?)
 */

/*
 * This function records the allocation in the allocations table,
 * and stores the associated backtrace in the traces table
 * (or just increments the refcount if the trace is already recorded)
 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
 * the associated trace's refcount is decremented.
 * If the trace slot is in use, it returns.
 * The refcount is incremented by the amount of memory the allocation consumes.
 * The return value indicates whether to try again next time.
 */
static boolean_t
zleak_log(uintptr_t* bt,
	  uintptr_t addr,
	  uint32_t depth,
	  vm_size_t allocation_size)
{
	/* Quit if there's someone else modifying the hash tables */
	if (!lck_spin_try_lock(&zleak_lock)) {
		z_total_conflicts++;
		return FALSE;
	}

	struct zallocation* allocation	= &zallocations[hashaddr(addr, zleak_alloc_buckets)];

	uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
	struct ztrace* trace = &ztraces[trace_index];

	allocation->za_hit_count++;
	trace->zt_hit_count++;

	/*
	 * If the allocation bucket we want to be in is occupied, and if the occupier
	 * has the same trace as us, just bail.
	 */
	if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
		z_alloc_collisions++;

		lck_spin_unlock(&zleak_lock);
		return TRUE;
	}

	/* STEP 1: Store the backtrace in the traces array. */
	/* A size of zero indicates that the trace bucket is free. */

	if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0 ) {
		/*
		 * Different unique trace with same hash!
		 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
		 * and get out of the way for later chances
		 */
		trace->zt_collisions++;
		z_trace_collisions++;

		lck_spin_unlock(&zleak_lock);
		return TRUE;
	} else if (trace->zt_size > 0) {
		/* Same trace, already added, so increment refcount */
		trace->zt_size += allocation_size;
	} else {
		/* Found an unused trace bucket, record the trace here! */
		if (trace->zt_depth != 0) /* if this slot was previously used but not currently in use */
			z_trace_overwrites++;

		z_trace_recorded++;
		trace->zt_size			= allocation_size;
		memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)) );

		trace->zt_depth		= depth;
		trace->zt_collisions	= 0;
	}

	/* STEP 2: Store the allocation record in the allocations array. */

	if (allocation->za_element != (uintptr_t) 0) {
		/*
		 * Straight up replace any allocation record that was there.  We don't want to do the work
		 * to preserve the allocation entries that were there, because we only record a subset of the
		 * allocations anyways.
		 */

		z_alloc_collisions++;

		struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
		/* Knock off old allocation's size, not the new allocation */
		associated_trace->zt_size -= allocation->za_size;
	} else if (allocation->za_trace_index != 0) {
		/* Slot previously used but not currently in use */
		z_alloc_overwrites++;
	}

	allocation->za_element		= addr;
	allocation->za_trace_index	= trace_index;
	allocation->za_size		= allocation_size;

	z_alloc_recorded++;

	if (top_ztrace->zt_size < trace->zt_size)
		top_ztrace = trace;

	lck_spin_unlock(&zleak_lock);
	return TRUE;
}

/*
 * Free the allocation record and release the stacktrace.
 * This should be as fast as possible because it will be called for every free.
 */
static void
zleak_free(uintptr_t addr,
	   vm_size_t allocation_size)
{
	if (addr == (uintptr_t) 0)
		return;

	struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];

	/* Double-checked locking: check to find out if we're interested, lock, check to make
	 * sure it hasn't changed, then modify it, and release the lock.
	 */

	if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
		/* if the allocation was the one, grab the lock, check again, then delete it */
		lck_spin_lock(&zleak_lock);

		if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
			struct ztrace *trace;

			/* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
			if (allocation->za_size != allocation_size) {
				panic("Freeing as size %lu memory that was allocated with size %lu\n",
					(uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
			}

			trace = &ztraces[allocation->za_trace_index];

			/* size of 0 indicates trace bucket is unused */
			if (trace->zt_size > 0) {
				trace->zt_size -= allocation_size;
			}

			/* A NULL element means the allocation bucket is unused */
			allocation->za_element = 0;
		}
		lck_spin_unlock(&zleak_lock);
	}
}

#endif /* CONFIG_ZLEAKS */

/* These functions outside of CONFIG_ZLEAKS because they are also used in
 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
 */

/*
 * This function captures a backtrace from the current stack and
 * returns the number of frames captured, limited by max_frames.
 * It's fast because it does no checking to make sure there isn't bad data.
 * Since it's only called from threads that we're going to keep executing,
 * if there's bad data we were going to die eventually.
 * If this function is inlined, it doesn't record the frame of the function it's inside.
 * (because there's no stack frame!)
 */

uint32_t
fastbacktrace(uintptr_t* bt, uint32_t max_frames)
{
	uintptr_t* frameptr = NULL, *frameptr_next = NULL;
	uintptr_t retaddr = 0;
	uint32_t frame_index = 0, frames = 0;
	uintptr_t kstackb, kstackt;
	thread_t cthread = current_thread();

	if (__improbable(cthread == NULL))
		return 0;

	kstackb = cthread->kernel_stack;
	kstackt = kstackb + kernel_stack_size;
	/* Load stack frame pointer (EBP on x86) into frameptr */
	frameptr = __builtin_frame_address(0);
	if (((uintptr_t)frameptr > kstackt) || ((uintptr_t)frameptr < kstackb))
		frameptr = NULL;

	while (frameptr != NULL && frame_index < max_frames ) {
		/* Next frame pointer is pointed to by the previous one */
		frameptr_next = (uintptr_t*) *frameptr;

		/* Bail if we see a zero in the stack frame, that means we've reached the top of the stack */
		/* That also means the return address is worthless, so don't record it */
		if (frameptr_next == NULL)
			break;
		/* Verify thread stack bounds */
		if (((uintptr_t)frameptr_next > kstackt) || ((uintptr_t)frameptr_next < kstackb))
			break;
		/* Pull return address from one spot above the frame pointer */
		retaddr = *(frameptr + 1);

		/* Store it in the backtrace array */
		bt[frame_index++] = retaddr;

		frameptr = frameptr_next;
	}

	/* Save the number of frames captured for return value */
	frames = frame_index;

	/* Fill in the rest of the backtrace with zeros */
	while (frame_index < max_frames)
		bt[frame_index++] = 0;

	return frames;
}
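
/*
 * Typical use, as in the zleak sampling path (the buffer length just
 * has to match max_frames; MAX_ZTRACE_DEPTH is defined elsewhere):
 *
 *	uintptr_t bt[MAX_ZTRACE_DEPTH];
 *	uint32_t  depth = fastbacktrace(bt, MAX_ZTRACE_DEPTH);
 *
 * On return, bt[0..depth-1] hold return addresses and the remaining
 * slots are zero-filled.
 */
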
1523 */ 1524 1525uint32_t 1526fastbacktrace(uintptr_t* bt, uint32_t max_frames) 1527{ 1528 uintptr_t* frameptr = NULL, *frameptr_next = NULL; 1529 uintptr_t retaddr = 0; 1530 uint32_t frame_index = 0, frames = 0; 1531 uintptr_t kstackb, kstackt; 1532 thread_t cthread = current_thread(); 1533 1534 if (__improbable(cthread == NULL)) 1535 return 0; 1536 1537 kstackb = cthread->kernel_stack; 1538 kstackt = kstackb + kernel_stack_size; 1539 /* Load stack frame pointer (EBP on x86) into frameptr */ 1540 frameptr = __builtin_frame_address(0); 1541 if (((uintptr_t)frameptr > kstackt) || ((uintptr_t)frameptr < kstackb)) 1542 frameptr = NULL; 1543 1544 while (frameptr != NULL && frame_index < max_frames ) { 1545 /* Next frame pointer is pointed to by the previous one */ 1546 frameptr_next = (uintptr_t*) *frameptr; 1547 1548 /* Bail if we see a zero in the stack frame, that means we've reached the top of the stack */ 1549 /* That also means the return address is worthless, so don't record it */ 1550 if (frameptr_next == NULL) 1551 break; 1552 /* Verify thread stack bounds */ 1553 if (((uintptr_t)frameptr_next > kstackt) || ((uintptr_t)frameptr_next < kstackb)) 1554 break; 1555 /* Pull return address from one spot above the frame pointer */ 1556 retaddr = *(frameptr + 1); 1557 1558 /* Store it in the backtrace array */ 1559 bt[frame_index++] = retaddr; 1560 1561 frameptr = frameptr_next; 1562 } 1563 1564 /* Save the number of frames captured for return value */ 1565 frames = frame_index; 1566 1567 /* Fill in the rest of the backtrace with zeros */ 1568 while (frame_index < max_frames) 1569 bt[frame_index++] = 0; 1570 1571 return frames; 1572} 1573 1574/* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */ 1575uintptr_t 1576hash_mix(uintptr_t x) 1577{ 1578#ifndef __LP64__ 1579 x += ~(x << 15); 1580 x ^= (x >> 10); 1581 x += (x << 3 ); 1582 x ^= (x >> 6 ); 1583 x += ~(x << 11); 1584 x ^= (x >> 16); 1585#else 1586 x += ~(x << 32); 1587 x ^= (x >> 22); 1588 x += ~(x << 13); 1589 x ^= (x >> 8 ); 1590 x += (x << 3 ); 1591 x ^= (x >> 15); 1592 x += ~(x << 27); 1593 x ^= (x >> 31); 1594#endif 1595 return x; 1596} 1597 1598uint32_t 1599hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size) 1600{ 1601 1602 uintptr_t hash = 0; 1603 uintptr_t mask = max_size - 1; 1604 1605 while (depth) { 1606 hash += bt[--depth]; 1607 } 1608 1609 hash = hash_mix(hash) & mask; 1610 1611 assert(hash < max_size); 1612 1613 return (uint32_t) hash; 1614} 1615 1616/* 1617 * TODO: Determine how well distributed this is 1618 * max_size must be a power of 2. i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask 1619 */ 1620uint32_t 1621hashaddr(uintptr_t pt, uint32_t max_size) 1622{ 1623 uintptr_t hash = 0; 1624 uintptr_t mask = max_size - 1; 1625 1626 hash = hash_mix(pt) & mask; 1627 1628 assert(hash < max_size); 1629 1630 return (uint32_t) hash; 1631} 1632 1633/* End of all leak-detection code */ 1634#pragma mark - 1635 1636/* 1637 * zinit initializes a new zone. The zone data structures themselves 1638 * are stored in a zone, which is initially a static structure that 1639 * is initialized by zone_init. 
1640 */ 1641zone_t 1642zinit( 1643 vm_size_t size, /* the size of an element */ 1644 vm_size_t max, /* maximum memory to use */ 1645 vm_size_t alloc, /* allocation size */ 1646 const char *name) /* a name for the zone */ 1647{ 1648 zone_t z; 1649 boolean_t use_page_list = FALSE; 1650 1651 if (zone_zone == ZONE_NULL) { 1652 1653 z = (struct zone *)zdata; 1654 /* special handling in zcram() because the first element is being used */ 1655 } else 1656 z = (zone_t) zalloc(zone_zone); 1657 1658 if (z == ZONE_NULL) 1659 return(ZONE_NULL); 1660 1661 /* Zone elements must fit both a next pointer and a backup pointer */ 1662 vm_size_t minimum_element_size = sizeof(vm_offset_t) * 2; 1663 if (size < minimum_element_size) 1664 size = minimum_element_size; 1665 1666 /* 1667 * Round element size to a multiple of sizeof(pointer) 1668 * This also enforces that allocations will be aligned on pointer boundaries 1669 */ 1670 size = ((size-1) + sizeof(vm_offset_t)) - 1671 ((size-1) % sizeof(vm_offset_t)); 1672 1673 if (alloc == 0) 1674 alloc = PAGE_SIZE; 1675 1676 alloc = round_page(alloc); 1677 max = round_page(max); 1678 1679 /* 1680 * we look for an allocation size with less than 1% waste 1681 * up to 5 pages in size... 1682 * otherwise, we look for an allocation size with least fragmentation 1683 * in the range of 1 - 5 pages 1684 * This size will be used unless 1685 * the user suggestion is larger AND has less fragmentation 1686 */ 1687#if ZONE_ALIAS_ADDR 1688 /* Favor PAGE_SIZE allocations unless we waste >10% space */ 1689 if ((size < PAGE_SIZE) && (PAGE_SIZE % size <= PAGE_SIZE / 10)) 1690 alloc = PAGE_SIZE; 1691 else 1692#endif 1693#if defined(__LP64__) 1694 if (((alloc % size) != 0) || (alloc > PAGE_SIZE * 8)) 1695#endif 1696 { 1697 vm_size_t best, waste; unsigned int i; 1698 best = PAGE_SIZE; 1699 waste = best % size; 1700 1701 for (i = 1; i <= 5; i++) { 1702 vm_size_t tsize, twaste; 1703 1704 tsize = i * PAGE_SIZE; 1705 1706 if ((tsize % size) < (tsize / 100)) { 1707 alloc = tsize; 1708 goto use_this_allocation; 1709 } 1710 twaste = tsize % size; 1711 if (twaste < waste) 1712 best = tsize, waste = twaste; 1713 } 1714 if (alloc <= best || (alloc % size >= waste)) 1715 alloc = best; 1716 } 1717use_this_allocation: 1718 if (max && (max < alloc)) 1719 max = alloc; 1720 1721 /* 1722 * Opt into page list tracking if we can reliably map an allocation 1723 * to its page_metadata, and if the wastage in the tail of 1724 * the allocation is not too large 1725 */ 1726 if (alloc == PAGE_SIZE) { 1727 if ((PAGE_SIZE % size) >= sizeof(struct zone_page_metadata)) { 1728 use_page_list = TRUE; 1729 } else if ((PAGE_SIZE - sizeof(struct zone_page_metadata)) % size <= PAGE_SIZE / 100) { 1730 use_page_list = TRUE; 1731 } 1732 } 1733 1734 z->free_elements = NULL; 1735 queue_init(&z->pages.any_free_foreign); 1736 queue_init(&z->pages.all_free); 1737 queue_init(&z->pages.intermediate); 1738 queue_init(&z->pages.all_used); 1739 z->cur_size = 0; 1740 z->page_count = 0; 1741 z->max_size = max; 1742 z->elem_size = size; 1743 z->alloc_size = alloc; 1744 z->zone_name = name; 1745 z->count = 0; 1746 z->countfree = 0; 1747 z->sum_count = 0LL; 1748 z->doing_alloc = FALSE; 1749 z->doing_gc = FALSE; 1750 z->exhaustible = FALSE; 1751 z->collectable = TRUE; 1752 z->allows_foreign = FALSE; 1753 z->expandable = TRUE; 1754 z->waiting = FALSE; 1755 z->async_pending = FALSE; 1756 z->caller_acct = TRUE; 1757 z->noencrypt = FALSE; 1758 z->no_callout = FALSE; 1759 z->async_prio_refill = FALSE; 1760 z->gzalloc_exempt = FALSE; 1761 
z->alignment_required = FALSE; 1762 z->use_page_list = use_page_list; 1763 z->prio_refill_watermark = 0; 1764 z->zone_replenish_thread = NULL; 1765 z->zp_count = 0; 1766#if CONFIG_ZLEAKS 1767 z->zleak_capture = 0; 1768 z->zleak_on = FALSE; 1769#endif /* CONFIG_ZLEAKS */ 1770 1771#if ZONE_DEBUG 1772 z->active_zones.next = z->active_zones.prev = NULL; 1773 zone_debug_enable(z); 1774#endif /* ZONE_DEBUG */ 1775 lock_zone_init(z); 1776 1777 /* 1778 * Add the zone to the all-zones list. 1779 * If we are tracking zone info per task, and we have 1780 * already used all the available stat slots, then keep 1781 * using the overflow zone slot. 1782 */ 1783 z->next_zone = ZONE_NULL; 1784 simple_lock(&all_zones_lock); 1785 *last_zone = z; 1786 last_zone = &z->next_zone; 1787 z->index = num_zones; 1788 if (zinfo_per_task) { 1789 if (num_zones > ZONES_MAX) 1790 z->index = ZONES_MAX; 1791 } 1792 num_zones++; 1793 simple_unlock(&all_zones_lock); 1794 1795 /* 1796 * Check if we should be logging this zone. If so, remember the zone pointer. 1797 */ 1798 if (log_this_zone(z->zone_name, zone_name_to_log)) { 1799 zone_of_interest = z; 1800 } 1801 1802 /* 1803 * If we want to log a zone, see if we need to allocate buffer space for the log. Some vm related zones are 1804 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. kmem_alloc_ready is set to 1805 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one 1806 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again 1807 * later on some other zone. So note we may be allocating a buffer to log a zone other than the one being initialized 1808 * right now. 1809 */ 1810 if (zone_of_interest != NULL && zlog_btlog == NULL && kmem_alloc_ready) { 1811 zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, NULL, NULL, NULL); 1812 if (zlog_btlog) { 1813 printf("zone: logging started for zone %s\n", zone_of_interest->zone_name); 1814 } else { 1815 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n"); 1816 zone_of_interest = NULL; 1817 } 1818 } 1819#if CONFIG_GZALLOC 1820 gzalloc_zone_init(z); 1821#endif 1822 return(z); 1823} 1824unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count; 1825 1826static void zone_replenish_thread(zone_t); 1827 1828/* High priority VM privileged thread used to asynchronously refill a designated 1829 * zone, such as the reserved VM map entry zone. 
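 *
 * A sketch of how a zone would opt in (hypothetical zone and watermark;
 * zone_prio_refill_configure() below is the actual entry point):
 *
 *	zone_t entry_zone = zinit(sizeof(struct my_entry),
 *				  16 * PAGE_SIZE, PAGE_SIZE, "my entries");
 *	zone_prio_refill_configure(entry_zone, 64);	// refill while fewer
 *							// than 64 elements free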
1830 */ 1831static void zone_replenish_thread(zone_t z) { 1832 vm_size_t free_size; 1833 current_thread()->options |= TH_OPT_VMPRIV; 1834 1835 for (;;) { 1836 lock_zone(z); 1837 assert(z->prio_refill_watermark != 0); 1838 while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) { 1839 assert(z->doing_alloc == FALSE); 1840 assert(z->async_prio_refill == TRUE); 1841 1842 unlock_zone(z); 1843 int zflags = KMA_KOBJECT|KMA_NOPAGEWAIT; 1844 vm_offset_t space, alloc_size; 1845 kern_return_t kr; 1846 1847 if (vm_pool_low()) 1848 alloc_size = round_page(z->elem_size); 1849 else 1850 alloc_size = z->alloc_size; 1851 1852 if (z->noencrypt) 1853 zflags |= KMA_NOENCRYPT; 1854 1855 kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags); 1856 1857 if (kr == KERN_SUCCESS) { 1858#if ZONE_ALIAS_ADDR 1859 if (alloc_size == PAGE_SIZE) 1860 space = zone_alias_addr(space); 1861#endif 1862 ZONE_PAGE_COUNT_INCR(z, (alloc_size / PAGE_SIZE)); 1863 zcram(z, space, alloc_size); 1864 } else if (kr == KERN_RESOURCE_SHORTAGE) { 1865 VM_PAGE_WAIT(); 1866 } else if (kr == KERN_NO_SPACE) { 1867 kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags); 1868 if (kr == KERN_SUCCESS) { 1869#if ZONE_ALIAS_ADDR 1870 if (alloc_size == PAGE_SIZE) 1871 space = zone_alias_addr(space); 1872#endif 1873 zcram(z, space, alloc_size); 1874 } else { 1875 assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC); 1876 thread_block(THREAD_CONTINUE_NULL); 1877 } 1878 } 1879 1880 lock_zone(z); 1881 zone_replenish_loops++; 1882 } 1883 1884 unlock_zone(z); 1885 /* Signal any potential throttled consumers, terminating 1886 * their timer-bounded waits. 1887 */ 1888 thread_wakeup(z); 1889 1890 assert_wait(&z->zone_replenish_thread, THREAD_UNINT); 1891 thread_block(THREAD_CONTINUE_NULL); 1892 zone_replenish_wakeups++; 1893 } 1894} 1895 1896void 1897zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark) { 1898 z->prio_refill_watermark = low_water_mark; 1899 1900 z->async_prio_refill = TRUE; 1901 OSMemoryBarrier(); 1902 kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread); 1903 1904 if (tres != KERN_SUCCESS) { 1905 panic("zone_prio_refill_configure, thread create: 0x%x", tres); 1906 } 1907 1908 thread_deallocate(z->zone_replenish_thread); 1909} 1910 1911/* 1912 * Cram the given memory into the specified zone. 1913 */ 1914void 1915zcram( 1916 zone_t zone, 1917 vm_offset_t newmem, 1918 vm_size_t size) 1919{ 1920 vm_size_t elem_size; 1921 boolean_t from_zm = FALSE; 1922 1923 /* Basic sanity checks */ 1924 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0); 1925 assert(!zone->collectable || zone->allows_foreign 1926 || (from_zone_map(newmem, size))); 1927 1928 elem_size = zone->elem_size; 1929 1930 if (from_zone_map(newmem, size)) 1931 from_zm = TRUE; 1932 1933 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) 1934 kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name, 1935 (unsigned long)newmem, from_zm ? 
"" : "[F]", (unsigned long)size); 1936 1937 if (from_zm && !zone->use_page_list) 1938 zone_page_init(newmem, size); 1939 1940 lock_zone(zone); 1941 1942 if (zone->use_page_list) { 1943 struct zone_page_metadata *page_metadata; 1944 1945 assert((newmem & PAGE_MASK) == 0); 1946 assert((size & PAGE_MASK) == 0); 1947 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) { 1948 1949 vm_size_t pos_in_page; 1950 page_metadata = (struct zone_page_metadata *)(newmem + PAGE_SIZE - sizeof(struct zone_page_metadata)); 1951 1952 page_metadata->pages.next = NULL; 1953 page_metadata->pages.prev = NULL; 1954 page_metadata->elements = NULL; 1955 page_metadata->zone = zone; 1956 page_metadata->alloc_count = 0; 1957 page_metadata->free_count = 0; 1958 1959 enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_metadata); 1960 1961 for (pos_in_page = 0; (newmem + pos_in_page + elem_size) < (vm_offset_t)page_metadata; pos_in_page += elem_size) { 1962 page_metadata->alloc_count++; 1963 zone->count++; /* compensate for free_to_zone */ 1964 if ((newmem + pos_in_page) == (vm_offset_t)zone) { 1965 /* 1966 * special case for the "zone_zone" zone, which is using the first 1967 * allocation of its pmap_steal_memory()-ed allocation for 1968 * the "zone_zone" variable already. 1969 */ 1970 } else { 1971 free_to_zone(zone, newmem + pos_in_page); 1972 } 1973 zone->cur_size += elem_size; 1974 } 1975 } 1976 } else { 1977 while (size >= elem_size) { 1978 zone->count++; /* compensate for free_to_zone */ 1979 if (newmem == (vm_offset_t)zone) { 1980 /* Don't free zone_zone zone */ 1981 } else { 1982 free_to_zone(zone, newmem); 1983 } 1984 if (from_zm) 1985 zone_page_alloc(newmem, elem_size); 1986 size -= elem_size; 1987 newmem += elem_size; 1988 zone->cur_size += elem_size; 1989 } 1990 } 1991 unlock_zone(zone); 1992} 1993 1994 1995/* 1996 * Steal memory for the zone package. Called from 1997 * vm_page_bootstrap(). 1998 */ 1999void 2000zone_steal_memory(void) 2001{ 2002#if CONFIG_GZALLOC 2003 gzalloc_configure(); 2004#endif 2005 /* Request enough early memory to get to the pmap zone */ 2006 zdata_size = 12 * sizeof(struct zone); 2007 zdata_size = round_page(zdata_size); 2008 zdata = (vm_offset_t)pmap_steal_memory(zdata_size); 2009} 2010 2011 2012/* 2013 * Fill a zone with enough memory to contain at least nelem elements. 2014 * Memory is obtained with kmem_alloc_kobject from the kernel_map. 2015 * Return the number of elements actually put into the zone, which may 2016 * be more than the caller asked for since the memory allocation is 2017 * rounded up to a full page. 2018 */ 2019int 2020zfill( 2021 zone_t zone, 2022 int nelem) 2023{ 2024 kern_return_t kr; 2025 vm_size_t size; 2026 vm_offset_t memory; 2027 int nalloc; 2028 2029 assert(nelem > 0); 2030 if (nelem <= 0) 2031 return 0; 2032 size = nelem * zone->elem_size; 2033 size = round_page(size); 2034 kr = kmem_alloc_kobject(kernel_map, &memory, size); 2035 if (kr != KERN_SUCCESS) 2036 return 0; 2037 2038 zone_change(zone, Z_FOREIGN, TRUE); 2039 ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE)); 2040 zcram(zone, memory, size); 2041 nalloc = (int)(size / zone->elem_size); 2042 assert(nalloc >= nelem); 2043 2044 return nalloc; 2045} 2046 2047/* 2048 * Initialize the "zone of zones" which uses fixed memory allocated 2049 * earlier in memory initialization. zone_bootstrap is called 2050 * before zone_init. 
 */
void
zone_bootstrap(void)
{
	char temp_buf[16];

	if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof(temp_buf))) {
		zinfo_per_task = TRUE;
	}

	if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug)))
		zalloc_debug = 0;

	/* Set up zone element poisoning */
	zp_init();

	/* should zlog log to debug zone corruption instead of leaks? */
	if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
		corruption_debug_flag = TRUE;
	}

	/*
	 * Check for and set up zone leak detection if requested via boot-args.  We recognize two
	 * boot-args:
	 *
	 *	zlog=<zone_to_log>
	 *	zrecs=<num_records_in_log>
	 *
	 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
	 * control the size of the log.  If zrecs is not specified, a default value is used.
	 */

	if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
		if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {

			/*
			 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
			 * This prevents accidentally hogging too much kernel memory and making the system
			 * unusable.
			 */

			log_records = MIN(ZRECORDS_MAX, log_records);

		} else {
			log_records = ZRECORDS_DEFAULT;
		}
	}

	simple_lock_init(&all_zones_lock, 0);

	first_zone = ZONE_NULL;
	last_zone = &first_zone;
	num_zones = 0;
	thread_call_setup(&call_async_alloc, zalloc_async, NULL);

	/* assertion: nobody else called zinit before us */
	assert(zone_zone == ZONE_NULL);

	/* initializing global lock group for zones */
	lck_grp_attr_setdefault(&zone_locks_grp_attr);
	lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);

	zone_zone = zinit(sizeof(struct zone), 128 * sizeof(struct zone),
			  sizeof(struct zone), "zones");
	zone_change(zone_zone, Z_COLLECT, FALSE);
	zone_change(zone_zone, Z_CALLERACCT, FALSE);
	zone_change(zone_zone, Z_NOENCRYPT, TRUE);

	zcram(zone_zone, zdata, zdata_size);

	/* initialize fake zones and zone info if tracking by task */
	if (zinfo_per_task) {
		vm_size_t zisize = sizeof(zinfo_usage_store_t) * ZINFO_SLOTS;
		unsigned int i;

		for (i = 0; i < num_fake_zones; i++)
			fake_zones[i].init(ZINFO_SLOTS - num_fake_zones + i);
		zinfo_zone = zinit(zisize, zisize * CONFIG_TASK_MAX,
				   zisize, "per task zinfo");
		zone_change(zinfo_zone, Z_CALLERACCT, FALSE);
	}
}

void
zinfo_task_init(task_t task)
{
	if (zinfo_per_task) {
		task->tkm_zinfo = zalloc(zinfo_zone);
		memset(task->tkm_zinfo, 0, sizeof(zinfo_usage_store_t) * ZINFO_SLOTS);
	} else {
		task->tkm_zinfo = NULL;
	}
}

void
zinfo_task_free(task_t task)
{
	assert(task != kernel_task);
	if (task->tkm_zinfo != NULL) {
		zfree(zinfo_zone, task->tkm_zinfo);
		task->tkm_zinfo = NULL;
	}
}

/* Global initialization of the Zone Allocator.
 * Runs after zone_bootstrap.
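 *
 * Illustrative call only (assuming some measure of physical memory is
 * available, here an xnu-style sane_size global):
 *
 *	zone_init((vm_size_t)(sane_size >> 2));	// zone map sized at ~25%
 *						// of physical memory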
2157 */ 2158void 2159zone_init( 2160 vm_size_t max_zonemap_size) 2161{ 2162 kern_return_t retval; 2163 vm_offset_t zone_min; 2164 vm_offset_t zone_max; 2165 2166 retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size, 2167 FALSE, VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT, 2168 &zone_map); 2169 2170 if (retval != KERN_SUCCESS) 2171 panic("zone_init: kmem_suballoc failed"); 2172 zone_max = zone_min + round_page(max_zonemap_size); 2173#if CONFIG_GZALLOC 2174 gzalloc_init(max_zonemap_size); 2175#endif 2176 /* 2177 * Setup garbage collection information: 2178 */ 2179 zone_map_min_address = zone_min; 2180 zone_map_max_address = zone_max; 2181 2182 zone_pages = (unsigned int)atop_kernel(zone_max - zone_min); 2183 zone_page_table_used_size = sizeof(zone_page_table); 2184 2185 zone_page_table_second_level_size = 1; 2186 zone_page_table_second_level_shift_amount = 0; 2187 2188 /* 2189 * Find the power of 2 for the second level that allows 2190 * the first level to fit in ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE 2191 * slots. 2192 */ 2193 while ((zone_page_table_first_level_slot(zone_pages-1)) >= ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE) { 2194 zone_page_table_second_level_size <<= 1; 2195 zone_page_table_second_level_shift_amount++; 2196 } 2197 2198 lck_grp_attr_setdefault(&zone_gc_lck_grp_attr); 2199 lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr); 2200 lck_attr_setdefault(&zone_gc_lck_attr); 2201 lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr); 2202 2203#if CONFIG_ZLEAKS 2204 /* 2205 * Initialize the zone leak monitor 2206 */ 2207 zleak_init(max_zonemap_size); 2208#endif /* CONFIG_ZLEAKS */ 2209} 2210 2211void 2212zone_page_table_expand(zone_page_index_t pindex) 2213{ 2214 unsigned int first_index; 2215 struct zone_page_table_entry * volatile * first_level_ptr; 2216 2217 assert(pindex < zone_pages); 2218 2219 first_index = zone_page_table_first_level_slot(pindex); 2220 first_level_ptr = &zone_page_table[first_index]; 2221 2222 if (*first_level_ptr == NULL) { 2223 /* 2224 * We were able to verify the old first-level slot 2225 * had NULL, so attempt to populate it. 2226 */ 2227 2228 vm_offset_t second_level_array = 0; 2229 vm_size_t second_level_size = round_page(zone_page_table_second_level_size * sizeof(struct zone_page_table_entry)); 2230 zone_page_index_t i; 2231 struct zone_page_table_entry *entry_array; 2232 2233 if (kmem_alloc_kobject(zone_map, &second_level_array, 2234 second_level_size) != KERN_SUCCESS) { 2235 panic("zone_page_table_expand"); 2236 } 2237 zone_map_table_page_count += (second_level_size / PAGE_SIZE); 2238 2239 /* 2240 * zone_gc() may scan the "zone_page_table" directly, 2241 * so make sure any slots have a valid unused state. 
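 *
 * (For orientation, a page index is split exactly as in
 * zone_page_table_lookup() below: the high bits select the first-level
 * slot and the low zone_page_table_second_level_shift_amount bits select
 * the second-level entry.  With a 1024-entry second level, for example,
 * pindex 0x12345 maps to first-level slot 0x48, second-level slot 0x345.)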
2242 */ 2243 entry_array = (struct zone_page_table_entry *)second_level_array; 2244 for (i=0; i < zone_page_table_second_level_size; i++) { 2245 entry_array[i].alloc_count = ZONE_PAGE_UNUSED; 2246 entry_array[i].collect_count = 0; 2247 } 2248 2249 if (OSCompareAndSwapPtr(NULL, entry_array, first_level_ptr)) { 2250 /* Old slot was NULL, replaced with expanded level */ 2251 OSAddAtomicLong(second_level_size, &zone_page_table_used_size); 2252 } else { 2253 /* Old slot was not NULL, someone else expanded first */ 2254 kmem_free(zone_map, second_level_array, second_level_size); 2255 zone_map_table_page_count -= (second_level_size / PAGE_SIZE); 2256 } 2257 } else { 2258 /* Old slot was not NULL, already been expanded */ 2259 } 2260} 2261 2262struct zone_page_table_entry * 2263zone_page_table_lookup(zone_page_index_t pindex) 2264{ 2265 unsigned int first_index = zone_page_table_first_level_slot(pindex); 2266 struct zone_page_table_entry *second_level = zone_page_table[first_index]; 2267 2268 if (second_level) { 2269 return &second_level[zone_page_table_second_level_slot(pindex)]; 2270 } 2271 2272 return NULL; 2273} 2274 2275extern volatile SInt32 kfree_nop_count; 2276 2277#pragma mark - 2278#pragma mark zalloc_canblock 2279 2280/* 2281 * zalloc returns an element from the specified zone. 2282 */ 2283void * 2284zalloc_canblock( 2285 zone_t zone, 2286 boolean_t canblock) 2287{ 2288 vm_offset_t addr = 0; 2289 kern_return_t retval; 2290 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */ 2291 int numsaved = 0; 2292 boolean_t zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE; 2293#if CONFIG_GZALLOC || ZONE_DEBUG 2294 boolean_t did_gzalloc = FALSE; 2295#endif 2296 thread_t thr = current_thread(); 2297 2298#if CONFIG_ZLEAKS 2299 uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */ 2300#endif /* CONFIG_ZLEAKS */ 2301 2302 assert(zone != ZONE_NULL); 2303 2304#if CONFIG_GZALLOC 2305 addr = gzalloc_alloc(zone, canblock); 2306 did_gzalloc = (addr != 0); 2307#endif 2308 2309 /* 2310 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace. 2311 */ 2312 if (__improbable(DO_LOGGING(zone))) 2313 numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH); 2314 2315 lock_zone(zone); 2316 2317 2318#if CONFIG_ZLEAKS 2319 /* 2320 * Zone leak detection: capture a backtrace every zleak_sample_factor 2321 * allocations in this zone. 2322 */ 2323 if (zone->zleak_on && (++zone->zleak_capture >= zleak_sample_factor)) { 2324 zone->zleak_capture = 0; 2325 2326 /* Avoid backtracing twice if zone logging is on */ 2327 if (numsaved == 0 ) 2328 zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH); 2329 else 2330 zleak_tracedepth = numsaved; 2331 } 2332#endif /* CONFIG_ZLEAKS */ 2333 2334 if (zone->async_prio_refill && zone->zone_replenish_thread) { 2335 do { 2336 vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size)); 2337 vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size; 2338 zone_replenish_wakeup = (zfreec < zrefillwm); 2339 zone_alloc_throttle = (zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0); 2340 2341 if (zone_replenish_wakeup) { 2342 zone_replenish_wakeups_initiated++; 2343 unlock_zone(zone); 2344 /* Signal the potentially waiting 2345 * refill thread. 2346 */ 2347 thread_wakeup(&zone->zone_replenish_thread); 2348 2349 /* Scheduling latencies etc. may prevent 2350 * the refill thread from keeping up 2351 * with demand. 
Throttle consumers 2352 * when we fall below half the 2353 * watermark, unless VM privileged 2354 */ 2355 if (zone_alloc_throttle) { 2356 zone_replenish_throttle_count++; 2357 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC); 2358 thread_block(THREAD_CONTINUE_NULL); 2359 } 2360 lock_zone(zone); 2361 } 2362 } while (zone_alloc_throttle == TRUE); 2363 } 2364 2365 if (__probable(addr == 0)) 2366 addr = try_alloc_from_zone(zone); 2367 2368 2369 while ((addr == 0) && canblock) { 2370 /* 2371 * If nothing was there, try to get more 2372 */ 2373 if (zone->doing_alloc) { 2374 /* 2375 * Someone is allocating memory for this zone. 2376 * Wait for it to show up, then try again. 2377 */ 2378 zone->waiting = TRUE; 2379 zone_sleep(zone); 2380 } else if (zone->doing_gc) { 2381 /* zone_gc() is running. Since we need an element 2382 * from the free list that is currently being 2383 * collected, set the waiting bit and try to 2384 * interrupt the GC process, and try again 2385 * when we obtain the lock. 2386 */ 2387 zone->waiting = TRUE; 2388 zone_sleep(zone); 2389 } else { 2390 vm_offset_t space; 2391 vm_size_t alloc_size; 2392 int retry = 0; 2393 2394 if ((zone->cur_size + zone->elem_size) > 2395 zone->max_size) { 2396 if (zone->exhaustible) 2397 break; 2398 if (zone->expandable) { 2399 /* 2400 * We're willing to overflow certain 2401 * zones, but not without complaining. 2402 * 2403 * This is best used in conjunction 2404 * with the collectable flag. What we 2405 * want is an assurance we can get the 2406 * memory back, assuming there's no 2407 * leak. 2408 */ 2409 zone->max_size += (zone->max_size >> 1); 2410 } else { 2411 unlock_zone(zone); 2412 2413 panic_include_zprint = TRUE; 2414#if CONFIG_ZLEAKS 2415 if (zleak_state & ZLEAK_STATE_ACTIVE) 2416 panic_include_ztrace = TRUE; 2417#endif /* CONFIG_ZLEAKS */ 2418 panic("zalloc: zone \"%s\" empty.", zone->zone_name); 2419 } 2420 } 2421 zone->doing_alloc = TRUE; 2422 unlock_zone(zone); 2423 2424 for (;;) { 2425 int zflags = KMA_KOBJECT|KMA_NOPAGEWAIT; 2426 2427 if (vm_pool_low() || retry >= 1) 2428 alloc_size = 2429 round_page(zone->elem_size); 2430 else 2431 alloc_size = zone->alloc_size; 2432 2433 if (zone->noencrypt) 2434 zflags |= KMA_NOENCRYPT; 2435 2436 retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags); 2437 if (retval == KERN_SUCCESS) { 2438#if ZONE_ALIAS_ADDR 2439 if (alloc_size == PAGE_SIZE) 2440 space = zone_alias_addr(space); 2441#endif 2442 2443#if CONFIG_ZLEAKS 2444 if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) { 2445 if (zone_map->size >= zleak_global_tracking_threshold) { 2446 kern_return_t kr; 2447 2448 kr = zleak_activate(); 2449 if (kr != KERN_SUCCESS) { 2450 printf("Failed to activate live zone leak debugging (%d).\n", kr); 2451 } 2452 } 2453 } 2454 2455 if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) { 2456 if (zone->cur_size > zleak_per_zone_tracking_threshold) { 2457 zone->zleak_on = TRUE; 2458 } 2459 } 2460#endif /* CONFIG_ZLEAKS */ 2461 ZONE_PAGE_COUNT_INCR(zone, (alloc_size / PAGE_SIZE)); 2462 zcram(zone, space, alloc_size); 2463 2464 break; 2465 } else if (retval != KERN_RESOURCE_SHORTAGE) { 2466 retry++; 2467 2468 if (retry == 2) { 2469 zone_gc(TRUE); 2470 printf("zalloc did gc\n"); 2471 zone_display_zprint(); 2472 } 2473 if (retry == 3) { 2474 panic_include_zprint = TRUE; 2475#if CONFIG_ZLEAKS 2476 if ((zleak_state & ZLEAK_STATE_ACTIVE)) { 2477 panic_include_ztrace = TRUE; 2478 } 2479#endif /* CONFIG_ZLEAKS */ 2480 if (retval == KERN_NO_SPACE) 
{ 2481 zone_t zone_largest = zone_find_largest(); 2482 panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)", 2483 zone->zone_name, zone_largest->zone_name, 2484 (unsigned long)zone_largest->cur_size, zone_largest->count); 2485 2486 } 2487 panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count); 2488 } 2489 } else { 2490 break; 2491 } 2492 } 2493 lock_zone(zone); 2494 zone->doing_alloc = FALSE; 2495 if (zone->waiting) { 2496 zone->waiting = FALSE; 2497 zone_wakeup(zone); 2498 } 2499 addr = try_alloc_from_zone(zone); 2500 if (addr == 0 && 2501 retval == KERN_RESOURCE_SHORTAGE) { 2502 unlock_zone(zone); 2503 2504 VM_PAGE_WAIT(); 2505 lock_zone(zone); 2506 } 2507 } 2508 if (addr == 0) 2509 addr = try_alloc_from_zone(zone); 2510 } 2511 2512#if CONFIG_ZLEAKS 2513 /* Zone leak detection: 2514 * If we're sampling this allocation, add it to the zleaks hash table. 2515 */ 2516 if (addr && zleak_tracedepth > 0) { 2517 /* Sampling can fail if another sample is happening at the same time in a different zone. */ 2518 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) { 2519 /* If it failed, roll back the counter so we sample the next allocation instead. */ 2520 zone->zleak_capture = zleak_sample_factor; 2521 } 2522 } 2523#endif /* CONFIG_ZLEAKS */ 2524 2525 2526 if ((addr == 0) && !canblock && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) { 2527 zone->async_pending = TRUE; 2528 unlock_zone(zone); 2529 thread_call_enter(&call_async_alloc); 2530 lock_zone(zone); 2531 addr = try_alloc_from_zone(zone); 2532 } 2533 2534 /* 2535 * See if we should be logging allocations in this zone. Logging is rarely done except when a leak is 2536 * suspected, so this code rarely executes. We need to do this code while still holding the zone lock 2537 * since it protects the various log related data structures. 
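 *
 * (Every ZOP_ALLOC record added here is balanced in zfree(): either a
 * matching ZOP_FREE record is appended, or the element's records are
 * dropped via btlog_remove_entries_for_element(), depending on
 * corruption_debug_flag.)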
2538 */ 2539 2540 if (__improbable(DO_LOGGING(zone) && addr)) { 2541 btlog_add_entry(zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved); 2542 } 2543 2544#if ZONE_DEBUG 2545 if (!did_gzalloc && addr && zone_debug_enabled(zone)) { 2546 enqueue_tail(&zone->active_zones, (queue_entry_t)addr); 2547 addr += ZONE_DEBUG_OFFSET; 2548 } 2549#endif 2550 2551 unlock_zone(zone); 2552 2553 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr); 2554 2555 if (addr) { 2556 task_t task; 2557 zinfo_usage_t zinfo; 2558 vm_size_t sz = zone->elem_size; 2559 2560 if (zone->caller_acct) 2561 ledger_credit(thr->t_ledger, task_ledgers.tkm_private, sz); 2562 else 2563 ledger_credit(thr->t_ledger, task_ledgers.tkm_shared, sz); 2564 2565 if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) 2566 OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].alloc); 2567 } 2568 return((void *)addr); 2569} 2570 2571 2572void * 2573zalloc( 2574 register zone_t zone) 2575{ 2576 return( zalloc_canblock(zone, TRUE) ); 2577} 2578 2579void * 2580zalloc_noblock( 2581 register zone_t zone) 2582{ 2583 return( zalloc_canblock(zone, FALSE) ); 2584} 2585 2586void 2587zalloc_async( 2588 __unused thread_call_param_t p0, 2589 __unused thread_call_param_t p1) 2590{ 2591 zone_t current_z = NULL, head_z; 2592 unsigned int max_zones, i; 2593 void *elt = NULL; 2594 boolean_t pending = FALSE; 2595 2596 simple_lock(&all_zones_lock); 2597 head_z = first_zone; 2598 max_zones = num_zones; 2599 simple_unlock(&all_zones_lock); 2600 current_z = head_z; 2601 for (i = 0; i < max_zones; i++) { 2602 lock_zone(current_z); 2603 if (current_z->async_pending == TRUE) { 2604 current_z->async_pending = FALSE; 2605 pending = TRUE; 2606 } 2607 unlock_zone(current_z); 2608 2609 if (pending == TRUE) { 2610 elt = zalloc_canblock(current_z, TRUE); 2611 zfree(current_z, elt); 2612 pending = FALSE; 2613 } 2614 /* 2615 * This is based on assumption that zones never get 2616 * freed once allocated and linked. 2617 * Hence a read outside of lock is OK. 2618 */ 2619 current_z = current_z->next_zone; 2620 } 2621} 2622 2623/* 2624 * zget returns an element from the specified zone 2625 * and immediately returns nothing if there is nothing there. 2626 * 2627 * This form should be used when you can not block (like when 2628 * processing an interrupt). 2629 * 2630 * XXX: It seems like only vm_page_grab_fictitious_common uses this, and its 2631 * friend vm_page_more_fictitious can block, so it doesn't seem like 2632 * this is used for interrupts any more.... 
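 *
 * Calling convention, with a hypothetical zone:
 *
 *	struct widget *w = (struct widget *) zget(widget_zone);
 *	if (w == NULL) {
 *		// nothing immediately available; caller must cope
 *	}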
 */
void *
zget(
	register zone_t	zone)
{
	vm_offset_t	addr;

#if CONFIG_ZLEAKS
	uintptr_t	zbt[MAX_ZTRACE_DEPTH];		/* used for zone leak detection */
	uint32_t	zleak_tracedepth = 0;		/* log this allocation if nonzero */
#endif /* CONFIG_ZLEAKS */

	assert( zone != ZONE_NULL );

	if (!lock_try_zone(zone))
		return NULL;

#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: capture a backtrace
	 */
	if (zone->zleak_on && (++zone->zleak_capture >= zleak_sample_factor)) {
		zone->zleak_capture = 0;
		zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH);
	}
#endif /* CONFIG_ZLEAKS */

	addr = try_alloc_from_zone(zone);
#if ZONE_DEBUG
	if (addr && zone_debug_enabled(zone)) {
		enqueue_tail(&zone->active_zones, (queue_entry_t)addr);
		addr += ZONE_DEBUG_OFFSET;
	}
#endif /* ZONE_DEBUG */

#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: record the allocation
	 */
	if (zone->zleak_on && zleak_tracedepth > 0 && addr) {
		/* Sampling can fail if another sample is happening at the same time in a different zone. */
		if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
			/* If it failed, roll back the counter so we sample the next allocation instead. */
			zone->zleak_capture = zleak_sample_factor;
		}
	}
#endif /* CONFIG_ZLEAKS */

	unlock_zone(zone);

	return((void *) addr);
}

/* Keep this FALSE by default.  Large memory machines run orders of magnitude
   slower in debug mode when it is TRUE.  Use the debugger to enable it if needed. */
/* static */ boolean_t zone_check = FALSE;

static void zone_check_freelist(zone_t zone, vm_offset_t elem)
{
	struct zone_free_element *this;
	struct zone_page_metadata *thispage;

	if (zone->use_page_list) {
		if (zone->allows_foreign) {
			for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
				 !queue_end(&zone->pages.any_free_foreign, (queue_entry_t)thispage);
				 thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
				for (this = thispage->elements;
					 this != NULL;
					 this = this->next) {
					if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
						panic("zone_check_freelist");
				}
			}
		}
		for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
			 !queue_end(&zone->pages.all_free, (queue_entry_t)thispage);
			 thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
			for (this = thispage->elements;
				 this != NULL;
				 this = this->next) {
				if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
					panic("zone_check_freelist");
			}
		}
		for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
			 !queue_end(&zone->pages.intermediate, (queue_entry_t)thispage);
			 thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
			for (this = thispage->elements;
				 this != NULL;
				 this = this->next) {
				if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
					panic("zone_check_freelist");
			}
		}
	} else {
		for (this = zone->free_elements;
			 this != NULL;
			 this = this->next) {
			if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
				panic("zone_check_freelist");
		}
	}
2736} 2737 2738static zone_t zone_last_bogus_zone = ZONE_NULL; 2739static vm_offset_t zone_last_bogus_elem = 0; 2740 2741void 2742zfree( 2743 register zone_t zone, 2744 void *addr) 2745{ 2746 vm_offset_t elem = (vm_offset_t) addr; 2747 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */ 2748 int numsaved = 0; 2749 boolean_t gzfreed = FALSE; 2750 2751 assert(zone != ZONE_NULL); 2752 2753#if 1 2754 if (zone->use_page_list) { 2755 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr); 2756 if (zone != page_meta->zone) { 2757 /* 2758 * Something bad has happened. Someone tried to zfree a pointer but the metadata says it is from 2759 * a different zone (or maybe it's from a zone that doesn't use page free lists at all). We can repair 2760 * some cases of this, if: 2761 * 1) The specified zone had use_page_list, and the true zone also has use_page_list set. In that case 2762 * we can swap the zone_t 2763 * 2) The specified zone had use_page_list, but the true zone does not. In this case page_meta is garbage, 2764 * and dereferencing page_meta->zone might panic. 2765 * To distinguish the two, we enumerate the zone list to match it up. 2766 * We do not handle the case where an incorrect zone is passed that does not have use_page_list set, 2767 * even if the true zone did have this set. 2768 */ 2769 zone_t fixed_zone = NULL; 2770 int fixed_i, max_zones; 2771 2772 simple_lock(&all_zones_lock); 2773 max_zones = num_zones; 2774 fixed_zone = first_zone; 2775 simple_unlock(&all_zones_lock); 2776 2777 for (fixed_i=0; fixed_i < max_zones; fixed_i++, fixed_zone = fixed_zone->next_zone) { 2778 if (fixed_zone == page_meta->zone && fixed_zone->use_page_list) { 2779 /* we can fix this */ 2780 printf("Fixing incorrect zfree from zone %s to zone %s\n", zone->zone_name, fixed_zone->zone_name); 2781 zone = fixed_zone; 2782 break; 2783 } 2784 } 2785 } 2786 } 2787#endif 2788 2789 /* 2790 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace. 2791 */ 2792 2793 if (__improbable(DO_LOGGING(zone) && corruption_debug_flag)) 2794 numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH); 2795 2796#if MACH_ASSERT 2797 /* Basic sanity checks */ 2798 if (zone == ZONE_NULL || elem == (vm_offset_t)0) 2799 panic("zfree: NULL"); 2800 /* zone_gc assumes zones are never freed */ 2801 if (zone == zone_zone) 2802 panic("zfree: freeing to zone_zone breaks zone_gc!"); 2803#endif 2804 2805#if CONFIG_GZALLOC 2806 gzfreed = gzalloc_free(zone, addr); 2807#endif 2808 2809 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr); 2810 2811 if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign && 2812 !from_zone_map(elem, zone->elem_size))) { 2813#if MACH_ASSERT 2814 panic("zfree: non-allocated memory in collectable zone!"); 2815#endif 2816 zone_last_bogus_zone = zone; 2817 zone_last_bogus_elem = elem; 2818 return; 2819 } 2820 2821 lock_zone(zone); 2822 2823 /* 2824 * See if we're doing logging on this zone. There are two styles of logging used depending on 2825 * whether we're trying to catch a leak or corruption. See comments above in zalloc for details. 2826 */ 2827 2828 if (__improbable(DO_LOGGING(zone))) { 2829 if (corruption_debug_flag) { 2830 /* 2831 * We're logging to catch a corruption. Add a record of this zfree operation 2832 * to log. 2833 */ 2834 btlog_add_entry(zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved); 2835 } else { 2836 /* 2837 * We're logging to catch a leak. 
Remove any record we might have for this 2838 * element since it's being freed. Note that we may not find it if the buffer 2839 * overflowed and that's OK. Since the log is of a limited size, old records 2840 * get overwritten if there are more zallocs than zfrees. 2841 */ 2842 btlog_remove_entries_for_element(zlog_btlog, (void *)addr); 2843 } 2844 } 2845 2846#if ZONE_DEBUG 2847 if (!gzfreed && zone_debug_enabled(zone)) { 2848 queue_t tmp_elem; 2849 2850 elem -= ZONE_DEBUG_OFFSET; 2851 if (zone_check) { 2852 /* check the zone's consistency */ 2853 2854 for (tmp_elem = queue_first(&zone->active_zones); 2855 !queue_end(tmp_elem, &zone->active_zones); 2856 tmp_elem = queue_next(tmp_elem)) 2857 if (elem == (vm_offset_t)tmp_elem) 2858 break; 2859 if (elem != (vm_offset_t)tmp_elem) 2860 panic("zfree()ing element from wrong zone"); 2861 } 2862 remqueue((queue_t) elem); 2863 } 2864#endif /* ZONE_DEBUG */ 2865 if (zone_check) { 2866 zone_check_freelist(zone, elem); 2867 } 2868 2869 if (__probable(!gzfreed)) 2870 free_to_zone(zone, elem); 2871 2872#if MACH_ASSERT 2873 if (zone->count < 0) 2874 panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone", 2875 zone->zone_name, addr); 2876#endif 2877 2878 2879#if CONFIG_ZLEAKS 2880 /* 2881 * Zone leak detection: un-track the allocation 2882 */ 2883 if (zone->zleak_on) { 2884 zleak_free(elem, zone->elem_size); 2885 } 2886#endif /* CONFIG_ZLEAKS */ 2887 2888 /* 2889 * If elements have one or more pages, and memory is low, 2890 * request to run the garbage collection in the zone the next 2891 * time the pageout thread runs. 2892 */ 2893 if (zone->elem_size >= PAGE_SIZE && 2894 vm_pool_low()){ 2895 zone_gc_forced = TRUE; 2896 } 2897 unlock_zone(zone); 2898 2899 { 2900 thread_t thr = current_thread(); 2901 task_t task; 2902 zinfo_usage_t zinfo; 2903 vm_size_t sz = zone->elem_size; 2904 2905 if (zone->caller_acct) 2906 ledger_debit(thr->t_ledger, task_ledgers.tkm_private, sz); 2907 else 2908 ledger_debit(thr->t_ledger, task_ledgers.tkm_shared, sz); 2909 2910 if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) 2911 OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].free); 2912 } 2913} 2914 2915 2916/* Change a zone's flags. 2917 * This routine must be called immediately after zinit. 2918 */ 2919void 2920zone_change( 2921 zone_t zone, 2922 unsigned int item, 2923 boolean_t value) 2924{ 2925 assert( zone != ZONE_NULL ); 2926 assert( value == TRUE || value == FALSE ); 2927 2928 switch(item){ 2929 case Z_NOENCRYPT: 2930 zone->noencrypt = value; 2931 break; 2932 case Z_EXHAUST: 2933 zone->exhaustible = value; 2934 break; 2935 case Z_COLLECT: 2936 zone->collectable = value; 2937 break; 2938 case Z_EXPAND: 2939 zone->expandable = value; 2940 break; 2941 case Z_FOREIGN: 2942 zone->allows_foreign = value; 2943 break; 2944 case Z_CALLERACCT: 2945 zone->caller_acct = value; 2946 break; 2947 case Z_NOCALLOUT: 2948 zone->no_callout = value; 2949 break; 2950 case Z_GZALLOC_EXEMPT: 2951 zone->gzalloc_exempt = value; 2952#if CONFIG_GZALLOC 2953 gzalloc_reconfigure(zone); 2954#endif 2955 break; 2956 case Z_ALIGNMENT_REQUIRED: 2957 zone->alignment_required = value; 2958#if ZONE_DEBUG 2959 zone_debug_disable(zone); 2960#endif 2961#if CONFIG_GZALLOC 2962 gzalloc_reconfigure(zone); 2963#endif 2964 break; 2965 default: 2966 panic("Zone_change: Wrong Item Type!"); 2967 /* break; */ 2968 } 2969} 2970 2971/* 2972 * Return the expected number of free elements in the zone. 
2973 * This calculation will be incorrect if items are zfree'd that 2974 * were never zalloc'd/zget'd. The correct way to stuff memory 2975 * into a zone is by zcram. 2976 */ 2977 2978integer_t 2979zone_free_count(zone_t zone) 2980{ 2981 integer_t free_count; 2982 2983 lock_zone(zone); 2984 free_count = zone->countfree; 2985 unlock_zone(zone); 2986 2987 assert(free_count >= 0); 2988 2989 return(free_count); 2990} 2991 2992/* 2993 * Zone garbage collection subroutines 2994 */ 2995 2996boolean_t 2997zone_page_collectable( 2998 vm_offset_t addr, 2999 vm_size_t size) 3000{ 3001 struct zone_page_table_entry *zp; 3002 zone_page_index_t i, j; 3003 3004#if ZONE_ALIAS_ADDR 3005 addr = zone_virtual_addr(addr); 3006#endif 3007#if MACH_ASSERT 3008 if (!from_zone_map(addr, size)) 3009 panic("zone_page_collectable"); 3010#endif 3011 3012 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3013 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3014 3015 for (; i <= j; i++) { 3016 zp = zone_page_table_lookup(i); 3017 if (zp->collect_count == zp->alloc_count) 3018 return (TRUE); 3019 } 3020 3021 return (FALSE); 3022} 3023 3024void 3025zone_page_keep( 3026 vm_offset_t addr, 3027 vm_size_t size) 3028{ 3029 struct zone_page_table_entry *zp; 3030 zone_page_index_t i, j; 3031 3032#if ZONE_ALIAS_ADDR 3033 addr = zone_virtual_addr(addr); 3034#endif 3035#if MACH_ASSERT 3036 if (!from_zone_map(addr, size)) 3037 panic("zone_page_keep"); 3038#endif 3039 3040 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3041 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3042 3043 for (; i <= j; i++) { 3044 zp = zone_page_table_lookup(i); 3045 zp->collect_count = 0; 3046 } 3047} 3048 3049void 3050zone_page_collect( 3051 vm_offset_t addr, 3052 vm_size_t size) 3053{ 3054 struct zone_page_table_entry *zp; 3055 zone_page_index_t i, j; 3056 3057#if ZONE_ALIAS_ADDR 3058 addr = zone_virtual_addr(addr); 3059#endif 3060#if MACH_ASSERT 3061 if (!from_zone_map(addr, size)) 3062 panic("zone_page_collect"); 3063#endif 3064 3065 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3066 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3067 3068 for (; i <= j; i++) { 3069 zp = zone_page_table_lookup(i); 3070 ++zp->collect_count; 3071 } 3072} 3073 3074void 3075zone_page_init( 3076 vm_offset_t addr, 3077 vm_size_t size) 3078{ 3079 struct zone_page_table_entry *zp; 3080 zone_page_index_t i, j; 3081 3082#if ZONE_ALIAS_ADDR 3083 addr = zone_virtual_addr(addr); 3084#endif 3085#if MACH_ASSERT 3086 if (!from_zone_map(addr, size)) 3087 panic("zone_page_init"); 3088#endif 3089 3090 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3091 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3092 3093 for (; i <= j; i++) { 3094 /* make sure entry exists before marking unused */ 3095 zone_page_table_expand(i); 3096 3097 zp = zone_page_table_lookup(i); 3098 assert(zp); 3099 zp->alloc_count = ZONE_PAGE_UNUSED; 3100 zp->collect_count = 0; 3101 } 3102} 3103 3104void 3105zone_page_alloc( 3106 vm_offset_t addr, 3107 vm_size_t size) 3108{ 3109 struct zone_page_table_entry *zp; 3110 zone_page_index_t i, j; 3111 3112#if ZONE_ALIAS_ADDR 3113 addr = zone_virtual_addr(addr); 3114#endif 3115#if MACH_ASSERT 3116 if (!from_zone_map(addr, size)) 3117 panic("zone_page_alloc"); 3118#endif 3119 3120 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3121 j = (zone_page_index_t)atop_kernel((addr+size-1) - 
zone_map_min_address); 3122 3123 for (; i <= j; i++) { 3124 zp = zone_page_table_lookup(i); 3125 assert(zp); 3126 3127 /* 3128 * Set alloc_count to ZONE_PAGE_USED if 3129 * it was previously set to ZONE_PAGE_UNUSED. 3130 */ 3131 if (zp->alloc_count == ZONE_PAGE_UNUSED) 3132 zp->alloc_count = ZONE_PAGE_USED; 3133 3134 ++zp->alloc_count; 3135 } 3136} 3137 3138void 3139zone_page_free_element( 3140 zone_page_index_t *free_page_head, 3141 zone_page_index_t *free_page_tail, 3142 vm_offset_t addr, 3143 vm_size_t size) 3144{ 3145 struct zone_page_table_entry *zp; 3146 zone_page_index_t i, j; 3147 3148#if ZONE_ALIAS_ADDR 3149 addr = zone_virtual_addr(addr); 3150#endif 3151#if MACH_ASSERT 3152 if (!from_zone_map(addr, size)) 3153 panic("zone_page_free_element"); 3154#endif 3155 3156 /* Clear out the old next and backup pointers */ 3157 vm_offset_t *primary = (vm_offset_t *) addr; 3158 vm_offset_t *backup = get_backup_ptr(size, primary); 3159 3160 *primary = ZP_POISON; 3161 *backup = ZP_POISON; 3162 3163 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address); 3164 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address); 3165 3166 for (; i <= j; i++) { 3167 zp = zone_page_table_lookup(i); 3168 3169 if (zp->collect_count > 0) 3170 --zp->collect_count; 3171 if (--zp->alloc_count == 0) { 3172 vm_address_t free_page_address; 3173 vm_address_t prev_free_page_address; 3174 3175 zp->alloc_count = ZONE_PAGE_UNUSED; 3176 zp->collect_count = 0; 3177 3178 3179 /* 3180 * This element was the last one on this page, re-use the page's 3181 * storage for a page freelist 3182 */ 3183 free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)i); 3184 *(zone_page_index_t *)free_page_address = ZONE_PAGE_INDEX_INVALID; 3185 3186 if (*free_page_head == ZONE_PAGE_INDEX_INVALID) { 3187 *free_page_head = i; 3188 *free_page_tail = i; 3189 } else { 3190 prev_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)(*free_page_tail)); 3191 *(zone_page_index_t *)prev_free_page_address = i; 3192 *free_page_tail = i; 3193 } 3194 } 3195 } 3196} 3197 3198 3199 3200 3201struct { 3202 uint64_t zgc_invoked; 3203 uint64_t zgc_bailed; 3204 uint32_t pgs_freed; 3205 3206 uint32_t elems_collected, 3207 elems_freed, 3208 elems_kept; 3209} zgc_stats; 3210 3211/* Zone garbage collection 3212 * 3213 * zone_gc will walk through all the free elements in all the 3214 * zones that are marked collectable looking for reclaimable 3215 * pages. zone_gc is called by consider_zone_gc when the system 3216 * begins to run out of memory. 3217 */ 3218void 3219zone_gc(boolean_t all_zones) 3220{ 3221 unsigned int max_zones; 3222 zone_t z; 3223 unsigned int i; 3224 uint32_t old_pgs_freed; 3225 zone_page_index_t zone_free_page_head; 3226 zone_page_index_t zone_free_page_tail; 3227 thread_t mythread = current_thread(); 3228 3229 lck_mtx_lock(&zone_gc_lock); 3230 3231 zgc_stats.zgc_invoked++; 3232 old_pgs_freed = zgc_stats.pgs_freed; 3233 3234 simple_lock(&all_zones_lock); 3235 max_zones = num_zones; 3236 z = first_zone; 3237 simple_unlock(&all_zones_lock); 3238 3239 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) 3240 kprintf("zone_gc(all_zones=%s) starting...\n", all_zones ? 
"TRUE" : "FALSE"); 3241 3242 /* 3243 * it's ok to allow eager kernel preemption while 3244 * while holding a zone lock since it's taken 3245 * as a spin lock (which prevents preemption) 3246 */ 3247 thread_set_eager_preempt(mythread); 3248 3249#if MACH_ASSERT 3250 for (i = 0; i < zone_pages; i++) { 3251 struct zone_page_table_entry *zp; 3252 3253 zp = zone_page_table_lookup(i); 3254 assert(!zp || (zp->collect_count == 0)); 3255 } 3256#endif /* MACH_ASSERT */ 3257 3258 for (i = 0; i < max_zones; i++, z = z->next_zone) { 3259 unsigned int n, m; 3260 vm_size_t elt_size, size_freed; 3261 struct zone_free_element *elt, *base_elt, *base_prev, *prev, *scan, *keep, *tail; 3262 int kmem_frees = 0, total_freed_pages = 0; 3263 struct zone_page_metadata *page_meta; 3264 queue_head_t page_meta_head; 3265 3266 assert(z != ZONE_NULL); 3267 3268 if (!z->collectable) 3269 continue; 3270 3271 if (all_zones == FALSE && z->elem_size < PAGE_SIZE && !z->use_page_list) 3272 continue; 3273 3274 lock_zone(z); 3275 3276 elt_size = z->elem_size; 3277 3278 /* 3279 * Do a quick feasibility check before we scan the zone: 3280 * skip unless there is likelihood of getting pages back 3281 * (i.e we need a whole allocation block's worth of free 3282 * elements before we can garbage collect) and 3283 * the zone has more than 10 percent of it's elements free 3284 * or the element size is a multiple of the PAGE_SIZE 3285 */ 3286 if ((elt_size & PAGE_MASK) && 3287 !z->use_page_list && 3288 (((z->cur_size - z->count * elt_size) <= (2 * z->alloc_size)) || 3289 ((z->cur_size - z->count * elt_size) <= (z->cur_size / 10)))) { 3290 unlock_zone(z); 3291 continue; 3292 } 3293 3294 z->doing_gc = TRUE; 3295 3296 /* 3297 * Snatch all of the free elements away from the zone. 3298 */ 3299 3300 if (z->use_page_list) { 3301 queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages); 3302 queue_init(&z->pages.all_free); 3303 } else { 3304 scan = (void *)z->free_elements; 3305 z->free_elements = 0; 3306 } 3307 3308 unlock_zone(z); 3309 3310 if (z->use_page_list) { 3311 /* 3312 * For zones that maintain page lists (which in turn 3313 * track free elements on those pages), zone_gc() 3314 * is incredibly easy, and we bypass all the logic 3315 * for scanning elements and mapping them to 3316 * collectable pages 3317 */ 3318 3319 size_freed = 0; 3320 3321 queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) { 3322 assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */ 3323 3324 zgc_stats.elems_freed += page_meta->free_count; 3325 size_freed += elt_size * page_meta->free_count; 3326 zgc_stats.elems_collected += page_meta->free_count; 3327 } 3328 3329 lock_zone(z); 3330 3331 if (size_freed > 0) { 3332 z->cur_size -= size_freed; 3333 z->countfree -= size_freed/elt_size; 3334 } 3335 3336 z->doing_gc = FALSE; 3337 if (z->waiting) { 3338 z->waiting = FALSE; 3339 zone_wakeup(z); 3340 } 3341 3342 unlock_zone(z); 3343 3344 if (queue_empty(&page_meta_head)) 3345 continue; 3346 3347 thread_clear_eager_preempt(mythread); 3348 3349 while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) { 3350 vm_address_t free_page_address; 3351 3352 free_page_address = trunc_page((vm_address_t)page_meta); 3353#if ZONE_ALIAS_ADDR 3354 free_page_address = zone_virtual_addr(free_page_address); 3355#endif 3356 kmem_free(zone_map, free_page_address, PAGE_SIZE); 3357 ZONE_PAGE_COUNT_DECR(z, 1); 3358 total_freed_pages++; 3359 
zgc_stats.pgs_freed += 1; 3360 3361 if (++kmem_frees == 32) { 3362 thread_yield_internal(1); 3363 kmem_frees = 0; 3364 } 3365 } 3366 3367 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) 3368 kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages); 3369 3370 thread_set_eager_preempt(mythread); 3371 continue; /* go to next zone */ 3372 } 3373 3374 /* 3375 * Pass 1: 3376 * 3377 * Determine which elements we can attempt to collect 3378 * and count them up in the page table. Foreign elements 3379 * are returned to the zone. 3380 */ 3381 3382 prev = (void *)&scan; 3383 elt = scan; 3384 n = 0; tail = keep = NULL; 3385 3386 zone_free_page_head = ZONE_PAGE_INDEX_INVALID; 3387 zone_free_page_tail = ZONE_PAGE_INDEX_INVALID; 3388 3389 3390 while (elt != NULL) { 3391 if (from_zone_map(elt, elt_size)) { 3392 zone_page_collect((vm_offset_t)elt, elt_size); 3393 3394 prev = elt; 3395 elt = elt->next; 3396 3397 ++zgc_stats.elems_collected; 3398 } 3399 else { 3400 if (keep == NULL) 3401 keep = tail = elt; 3402 else { 3403 append_zone_element(z, tail, elt); 3404 tail = elt; 3405 } 3406 3407 append_zone_element(z, prev, elt->next); 3408 elt = elt->next; 3409 append_zone_element(z, tail, NULL); 3410 } 3411 3412 /* 3413 * Dribble back the elements we are keeping. 3414 * If there are none, give some elements that we haven't looked at yet 3415 * back to the freelist so that others waiting on the zone don't get stuck 3416 * for too long. This might prevent us from recovering some memory, 3417 * but allows us to avoid having to allocate new memory to serve requests 3418 * while zone_gc has all the free memory tied up. 3419 * <rdar://problem/3893406> 3420 */ 3421 3422 if (++n >= 50) { 3423 if (z->waiting == TRUE) { 3424 /* z->waiting checked without lock held, rechecked below after locking */ 3425 lock_zone(z); 3426 3427 if (keep != NULL) { 3428 add_list_to_zone(z, keep, tail); 3429 tail = keep = NULL; 3430 } else { 3431 m =0; 3432 base_elt = elt; 3433 base_prev = prev; 3434 while ((elt != NULL) && (++m < 50)) { 3435 prev = elt; 3436 elt = elt->next; 3437 } 3438 if (m !=0 ) { 3439 /* Extract the elements from the list and 3440 * give them back */ 3441 append_zone_element(z, prev, NULL); 3442 add_list_to_zone(z, base_elt, prev); 3443 append_zone_element(z, base_prev, elt); 3444 prev = base_prev; 3445 } 3446 } 3447 3448 if (z->waiting) { 3449 z->waiting = FALSE; 3450 zone_wakeup(z); 3451 } 3452 3453 unlock_zone(z); 3454 } 3455 n =0; 3456 } 3457 } 3458 3459 /* 3460 * Return any remaining elements. 3461 */ 3462 3463 if (keep != NULL) { 3464 lock_zone(z); 3465 3466 add_list_to_zone(z, keep, tail); 3467 3468 if (z->waiting) { 3469 z->waiting = FALSE; 3470 zone_wakeup(z); 3471 } 3472 3473 unlock_zone(z); 3474 } 3475 3476 /* 3477 * Pass 2: 3478 * 3479 * Determine which pages we can reclaim and 3480 * free those elements. 3481 */ 3482 3483 size_freed = 0; 3484 elt = scan; 3485 n = 0; tail = keep = NULL; 3486 3487 while (elt != NULL) { 3488 if (zone_page_collectable((vm_offset_t)elt, elt_size)) { 3489 struct zone_free_element *next_elt = elt->next; 3490 3491 size_freed += elt_size; 3492 3493 /* 3494 * If this is the last allocation on the page(s), 3495 * we may use their storage to maintain the linked 3496 * list of free-able pages. So store elt->next because 3497 * "elt" may be scribbled over. 
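 *
 * ("Scribbled over" because zone_page_free_element() writes ZP_POISON
 * over the element's next and backup pointers, and, if the element's
 * page becomes entirely free, the first word of the page is reused as
 * the link of the free-page list.)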
3498 */ 3499 zone_page_free_element(&zone_free_page_head, &zone_free_page_tail, (vm_offset_t)elt, elt_size); 3500 3501 elt = next_elt; 3502 3503 ++zgc_stats.elems_freed; 3504 } 3505 else { 3506 zone_page_keep((vm_offset_t)elt, elt_size); 3507 3508 if (keep == NULL) 3509 keep = tail = elt; 3510 else { 3511 append_zone_element(z, tail, elt); 3512 tail = elt; 3513 } 3514 3515 elt = elt->next; 3516 append_zone_element(z, tail, NULL); 3517 3518 ++zgc_stats.elems_kept; 3519 } 3520 3521 /* 3522 * Dribble back the elements we are keeping, 3523 * and update the zone size info. 3524 */ 3525 3526 if (++n >= 50) { 3527 lock_zone(z); 3528 3529 z->cur_size -= size_freed; 3530 z->countfree -= size_freed/elt_size; 3531 size_freed = 0; 3532 3533 if (keep != NULL) { 3534 add_list_to_zone(z, keep, tail); 3535 } 3536 3537 if (z->waiting) { 3538 z->waiting = FALSE; 3539 zone_wakeup(z); 3540 } 3541 3542 unlock_zone(z); 3543 3544 n = 0; tail = keep = NULL; 3545 } 3546 } 3547 3548 /* 3549 * Return any remaining elements, and update 3550 * the zone size info. 3551 */ 3552 3553 lock_zone(z); 3554 3555 if (size_freed > 0 || keep != NULL) { 3556 3557 z->cur_size -= size_freed; 3558 z->countfree -= size_freed/elt_size; 3559 3560 if (keep != NULL) { 3561 add_list_to_zone(z, keep, tail); 3562 } 3563 3564 } 3565 3566 z->doing_gc = FALSE; 3567 if (z->waiting) { 3568 z->waiting = FALSE; 3569 zone_wakeup(z); 3570 } 3571 unlock_zone(z); 3572 3573 if (zone_free_page_head == ZONE_PAGE_INDEX_INVALID) 3574 continue; 3575 3576 /* 3577 * we don't want to allow eager kernel preemption while holding the 3578 * various locks taken in the kmem_free path of execution 3579 */ 3580 thread_clear_eager_preempt(mythread); 3581 3582 3583 /* 3584 * This loop counts the number of pages that should be freed by the 3585 * next loop that tries to coalesce the kmem_frees() 3586 */ 3587 uint32_t pages_to_free_count = 0; 3588 vm_address_t fpa; 3589 zone_page_index_t index; 3590 for (index = zone_free_page_head; index != ZONE_PAGE_INDEX_INVALID;) { 3591 pages_to_free_count++; 3592 fpa = zone_map_min_address + PAGE_SIZE * ((vm_size_t)index); 3593 index = *(zone_page_index_t *)fpa; 3594 } 3595 3596 /* 3597 * Reclaim the pages we are freeing. 
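 *
 * Contiguous page indices that appear consecutively on the list are
 * coalesced, so a run of adjacent free pages is returned with a single
 * multi-page kmem_free() rather than one call per page.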
 */
		while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) {
			zone_page_index_t	zind = zone_free_page_head;
			vm_address_t		free_page_address;
			int			page_count;

			/*
			 * Use the first word of the page about to be freed to find the next free page
			 */
			free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)zind);
			zone_free_page_head = *(zone_page_index_t *)free_page_address;

			page_count = 1;
			total_freed_pages++;

			while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) {
				zone_page_index_t	next_zind = zone_free_page_head;
				vm_address_t		next_free_page_address;

				next_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)next_zind);

				if (next_free_page_address == (free_page_address - PAGE_SIZE)) {
					free_page_address = next_free_page_address;
				} else if (next_free_page_address != (free_page_address + (PAGE_SIZE * page_count)))
					break;

				zone_free_page_head = *(zone_page_index_t *)next_free_page_address;
				page_count++;
				total_freed_pages++;
			}
			kmem_free(zone_map, free_page_address, page_count * PAGE_SIZE);
			ZONE_PAGE_COUNT_DECR(z, page_count);
			zgc_stats.pgs_freed += page_count;
			pages_to_free_count -= page_count;

			if (++kmem_frees == 32) {
				thread_yield_internal(1);
				kmem_frees = 0;
			}
		}

		/* Check that we actually freed the exact number of pages we were supposed to */
		assert(pages_to_free_count == 0);

		if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
			kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages);

		thread_set_eager_preempt(mythread);
	}

	if (old_pgs_freed == zgc_stats.pgs_freed)
		zgc_stats.zgc_bailed++;

	thread_clear_eager_preempt(mythread);

	lck_mtx_unlock(&zone_gc_lock);

}

extern vm_offset_t kmapoff_kaddr;
extern unsigned int kmapoff_pgcnt;

/*
 *	consider_zone_gc:
 *
 *	Called by the pageout daemon when the system needs more free pages.
 */

void
consider_zone_gc(boolean_t force)
{
	boolean_t all_zones = FALSE;

	if (kmapoff_kaddr != 0) {
		/*
		 * One-time reclaim of kernel_map resources we allocated in
		 * early boot.
		 */
		(void) vm_deallocate(kernel_map,
		    kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
		kmapoff_kaddr = 0;
	}

	if (zone_gc_allowed &&
	    (zone_gc_allowed_by_time_throttle ||
	     zone_gc_forced ||
	     force)) {
		if (zone_gc_allowed_by_time_throttle == TRUE) {
			zone_gc_allowed_by_time_throttle = FALSE;
			all_zones = TRUE;
		}
		zone_gc_forced = FALSE;

		zone_gc(all_zones);
	}
}

/*
 * By default, don't attempt zone GC more frequently
 * than once per minute.
/*
 *	By default, don't attempt zone GC more frequently
 *	than once per minute.
 */
void
compute_zone_gc_throttle(void *arg __unused)
{
	zone_gc_allowed_by_time_throttle = TRUE;
}


#if CONFIG_TASK_ZONE_INFO

kern_return_t
task_zone_info(
	task_t			task,
	mach_zone_name_array_t	*namesp,
	mach_msg_type_number_t	*namesCntp,
	task_zone_info_array_t	*infop,
	mach_msg_type_number_t	*infoCntp)
{
	mach_zone_name_t	*names;
	vm_offset_t		names_addr;
	vm_size_t		names_size;
	task_zone_info_t	*info;
	vm_offset_t		info_addr;
	vm_size_t		info_size;
	unsigned int		max_zones, i;
	zone_t			z;
	mach_zone_name_t	*zn;
	task_zone_info_t	*zi;
	kern_return_t		kr;

	vm_size_t		used;
	vm_map_copy_t		copy;


	if (task == TASK_NULL)
		return KERN_INVALID_TASK;

	/*
	 *	We assume that zones aren't freed once allocated.
	 *	We won't pick up any zones that are allocated later.
	 */

	simple_lock(&all_zones_lock);
	max_zones = (unsigned int)(num_zones + num_fake_zones);
	z = first_zone;
	simple_unlock(&all_zones_lock);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &names_addr, names_size);
	if (kr != KERN_SUCCESS)
		return kr;
	names = (mach_zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &info_addr, info_size);
	if (kr != KERN_SUCCESS) {
		kmem_free(ipc_kernel_map,
			  names_addr, names_size);
		return kr;
	}

	info = (task_zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	for (i = 0; i < max_zones - num_fake_zones; i++) {
		struct zone zcopy;

		assert(z != ZONE_NULL);

		lock_zone(z);
		zcopy = *z;
		unlock_zone(z);

		simple_lock(&all_zones_lock);
		z = z->next_zone;
		simple_unlock(&all_zones_lock);

		/* assuming here the name data is static */
		(void) strncpy(zn->mzn_name, zcopy.zone_name,
			       sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';

		zi->tzi_count = (uint64_t)zcopy.count;
		zi->tzi_cur_size = (uint64_t)zcopy.cur_size;
		zi->tzi_max_size = (uint64_t)zcopy.max_size;
		zi->tzi_elem_size = (uint64_t)zcopy.elem_size;
		zi->tzi_alloc_size = (uint64_t)zcopy.alloc_size;
		zi->tzi_sum_size = zcopy.sum_count * zcopy.elem_size;
		zi->tzi_exhaustible = (uint64_t)zcopy.exhaustible;
		zi->tzi_collectable = (uint64_t)zcopy.collectable;
		zi->tzi_caller_acct = (uint64_t)zcopy.caller_acct;
		if (task->tkm_zinfo != NULL) {
			zi->tzi_task_alloc = task->tkm_zinfo[zcopy.index].alloc;
			zi->tzi_task_free = task->tkm_zinfo[zcopy.index].free;
		} else {
			zi->tzi_task_alloc = 0;
			zi->tzi_task_free = 0;
		}
		zn++;
		zi++;
	}
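	/*
	 * Sketch of the query callback each fake_zones[] entry supplies,
	 * used by the loop below.  A fake zone is an allocator (kernel
	 * stacks, for example) that is not backed by a struct zone but
	 * still reports through this interface.  The signature is inferred
	 * from the call site below; the callback name is hypothetical:
	 *
	 *	void example_fake_zone_query(int *count, vm_size_t *cur_size,
	 *	    vm_size_t *max_size, vm_size_t *elem_size,
	 *	    vm_size_t *alloc_size, uint64_t *sum_size,
	 *	    int *collectable, int *exhaustible, int *caller_acct);
	 */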
	/*
	 * loop through the fake zones and fill them using the specialized
	 * functions
	 */
	for (i = 0; i < num_fake_zones; i++) {
		int count, collectable, exhaustible, caller_acct, index;
		vm_size_t cur_size, max_size, elem_size, alloc_size;
		uint64_t sum_size;

		strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
		fake_zones[i].query(&count, &cur_size,
				    &max_size, &elem_size,
				    &alloc_size, &sum_size,
				    &collectable, &exhaustible, &caller_acct);
		zi->tzi_count = (uint64_t)count;
		zi->tzi_cur_size = (uint64_t)cur_size;
		zi->tzi_max_size = (uint64_t)max_size;
		zi->tzi_elem_size = (uint64_t)elem_size;
		zi->tzi_alloc_size = (uint64_t)alloc_size;
		zi->tzi_sum_size = sum_size;
		zi->tzi_collectable = (uint64_t)collectable;
		zi->tzi_exhaustible = (uint64_t)exhaustible;
		zi->tzi_caller_acct = (uint64_t)caller_acct;
		if (task->tkm_zinfo != NULL) {
			index = ZINFO_SLOTS - num_fake_zones + i;
			zi->tzi_task_alloc = task->tkm_zinfo[index].alloc;
			zi->tzi_task_free = task->tkm_zinfo[index].free;
		} else {
			zi->tzi_task_alloc = 0;
			zi->tzi_task_free = 0;
		}
		zn++;
		zi++;
	}

	used = max_zones * sizeof *names;
	if (used != names_size)
		bzero((char *) (names_addr + used), names_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
			   (vm_map_size_t)names_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*namesp = (mach_zone_name_t *) copy;
	*namesCntp = max_zones;

	used = max_zones * sizeof *info;

	if (used != info_size)
		bzero((char *) (info_addr + used), info_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
			   (vm_map_size_t)info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*infop = (task_zone_info_t *) copy;
	*infoCntp = max_zones;

	return KERN_SUCCESS;
}

#else	/* CONFIG_TASK_ZONE_INFO */

kern_return_t
task_zone_info(
	__unused task_t			task,
	__unused mach_zone_name_array_t	*namesp,
	__unused mach_msg_type_number_t	*namesCntp,
	__unused task_zone_info_array_t	*infop,
	__unused mach_msg_type_number_t	*infoCntp)
{
	return KERN_FAILURE;
}

#endif	/* CONFIG_TASK_ZONE_INFO */
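/*
 * Illustrative user-space usage (hypothetical, not compiled here): the MIG
 * routine above returns both arrays out-of-line, so the caller owns the
 * memory and must vm_deallocate() it when done:
 *
 *	mach_zone_name_array_t names;
 *	task_zone_info_array_t info;
 *	mach_msg_type_number_t nameCnt, infoCnt;
 *
 *	kr = task_zone_info(task, &names, &nameCnt, &info, &infoCnt);
 *	if (kr == KERN_SUCCESS) {
 *		... walk names[0..nameCnt-1] and info[0..infoCnt-1] ...
 *		vm_deallocate(mach_task_self(), (vm_address_t)names,
 *		    nameCnt * sizeof *names);
 *		vm_deallocate(mach_task_self(), (vm_address_t)info,
 *		    infoCnt * sizeof *info);
 *	}
 */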
kern_return_t
mach_zone_info(
	host_priv_t		host,
	mach_zone_name_array_t	*namesp,
	mach_msg_type_number_t	*namesCntp,
	mach_zone_info_array_t	*infop,
	mach_msg_type_number_t	*infoCntp)
{
	mach_zone_name_t	*names;
	vm_offset_t		names_addr;
	vm_size_t		names_size;
	mach_zone_info_t	*info;
	vm_offset_t		info_addr;
	vm_size_t		info_size;
	unsigned int		max_zones, i;
	zone_t			z;
	mach_zone_name_t	*zn;
	mach_zone_info_t	*zi;
	kern_return_t		kr;

	vm_size_t		used;
	vm_map_copy_t		copy;


	if (host == HOST_NULL)
		return KERN_INVALID_HOST;
#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!PE_i_can_has_debugger(NULL))
		return KERN_INVALID_HOST;
#endif

	/*
	 *	We assume that zones aren't freed once allocated.
	 *	We won't pick up any zones that are allocated later.
	 */

	simple_lock(&all_zones_lock);
	max_zones = (unsigned int)(num_zones + num_fake_zones);
	z = first_zone;
	simple_unlock(&all_zones_lock);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &names_addr, names_size);
	if (kr != KERN_SUCCESS)
		return kr;
	names = (mach_zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &info_addr, info_size);
	if (kr != KERN_SUCCESS) {
		kmem_free(ipc_kernel_map,
			  names_addr, names_size);
		return kr;
	}

	info = (mach_zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	for (i = 0; i < max_zones - num_fake_zones; i++) {
		struct zone zcopy;

		assert(z != ZONE_NULL);

		lock_zone(z);
		zcopy = *z;
		unlock_zone(z);

		simple_lock(&all_zones_lock);
		z = z->next_zone;
		simple_unlock(&all_zones_lock);

		/* assuming here the name data is static */
		(void) strncpy(zn->mzn_name, zcopy.zone_name,
			       sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';

		zi->mzi_count = (uint64_t)zcopy.count;
		zi->mzi_cur_size = (uint64_t)zcopy.cur_size;
		zi->mzi_max_size = (uint64_t)zcopy.max_size;
		zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
		zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
		zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
		zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
		zi->mzi_collectable = (uint64_t)zcopy.collectable;
		zn++;
		zi++;
	}

	/*
	 * loop through the fake zones and fill them using the specialized
	 * functions
	 */
	for (i = 0; i < num_fake_zones; i++) {
		int count, collectable, exhaustible, caller_acct;
		vm_size_t cur_size, max_size, elem_size, alloc_size;
		uint64_t sum_size;

		strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
		fake_zones[i].query(&count, &cur_size,
				    &max_size, &elem_size,
				    &alloc_size, &sum_size,
				    &collectable, &exhaustible, &caller_acct);
		zi->mzi_count = (uint64_t)count;
		zi->mzi_cur_size = (uint64_t)cur_size;
		zi->mzi_max_size = (uint64_t)max_size;
		zi->mzi_elem_size = (uint64_t)elem_size;
		zi->mzi_alloc_size = (uint64_t)alloc_size;
		zi->mzi_sum_size = sum_size;
		zi->mzi_collectable = (uint64_t)collectable;
		zi->mzi_exhaustible = (uint64_t)exhaustible;

		zn++;
		zi++;
	}

	used = max_zones * sizeof *names;
	if (used != names_size)
		bzero((char *) (names_addr + used), names_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
			   (vm_map_size_t)names_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*namesp = (mach_zone_name_t *) copy;
	*namesCntp = max_zones;

	used = max_zones * sizeof *info;

	if (used != info_size)
		bzero((char *) (info_addr + used), info_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
			   (vm_map_size_t)info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*infop = (mach_zone_info_t *) copy;
	*infoCntp = max_zones;

	return KERN_SUCCESS;
}
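/*
 * Note on the copy-out pattern used above (and in host_zone_info() below):
 * vm_map_copyin() with src_destroy == TRUE strips the pageable buffers out
 * of ipc_kernel_map and wraps them in vm_map_copy_t objects.  MIG then maps
 * those copy objects into the caller's address space as the out-of-line
 * reply arrays, which is why no kmem_free() is needed on this side once
 * the copyin succeeds.
 */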
/*
 * host_zone_info - LEGACY user interface for Mach zone information
 * Should use mach_zone_info() instead!
 */
kern_return_t
host_zone_info(
	host_priv_t		host,
	zone_name_array_t	*namesp,
	mach_msg_type_number_t	*namesCntp,
	zone_info_array_t	*infop,
	mach_msg_type_number_t	*infoCntp)
{
	zone_name_t	*names;
	vm_offset_t	names_addr;
	vm_size_t	names_size;
	zone_info_t	*info;
	vm_offset_t	info_addr;
	vm_size_t	info_size;
	unsigned int	max_zones, i;
	zone_t		z;
	zone_name_t	*zn;
	zone_info_t	*zi;
	kern_return_t	kr;

	vm_size_t	used;
	vm_map_copy_t	copy;


	if (host == HOST_NULL)
		return KERN_INVALID_HOST;
#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!PE_i_can_has_debugger(NULL))
		return KERN_INVALID_HOST;
#endif

#if defined(__LP64__)
	if (!thread_is_64bit(current_thread()))
		return KERN_NOT_SUPPORTED;
#else
	if (thread_is_64bit(current_thread()))
		return KERN_NOT_SUPPORTED;
#endif
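	/*
	 * The word-size guard above exists because the legacy zone_info_t
	 * reply uses natural-width fields (integer_t/vm_size_t), so its
	 * layout depends on the caller's word size; only callers matching
	 * the kernel's word size can be served.  mach_zone_info() above
	 * avoids the problem by using fixed-width uint64_t fields.
	 */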
	/*
	 *	We assume that zones aren't freed once allocated.
	 *	We won't pick up any zones that are allocated later.
	 */

	simple_lock(&all_zones_lock);
	max_zones = (unsigned int)(num_zones + num_fake_zones);
	z = first_zone;
	simple_unlock(&all_zones_lock);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &names_addr, names_size);
	if (kr != KERN_SUCCESS)
		return kr;
	names = (zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
				 &info_addr, info_size);
	if (kr != KERN_SUCCESS) {
		kmem_free(ipc_kernel_map,
			  names_addr, names_size);
		return kr;
	}

	info = (zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	for (i = 0; i < max_zones - num_fake_zones; i++) {
		struct zone zcopy;

		assert(z != ZONE_NULL);

		lock_zone(z);
		zcopy = *z;
		unlock_zone(z);

		simple_lock(&all_zones_lock);
		z = z->next_zone;
		simple_unlock(&all_zones_lock);

		/* assuming here the name data is static */
		(void) strncpy(zn->zn_name, zcopy.zone_name,
			       sizeof zn->zn_name);
		zn->zn_name[sizeof zn->zn_name - 1] = '\0';

		zi->zi_count = zcopy.count;
		zi->zi_cur_size = zcopy.cur_size;
		zi->zi_max_size = zcopy.max_size;
		zi->zi_elem_size = zcopy.elem_size;
		zi->zi_alloc_size = zcopy.alloc_size;
		zi->zi_exhaustible = zcopy.exhaustible;
		zi->zi_collectable = zcopy.collectable;

		zn++;
		zi++;
	}

	/*
	 * loop through the fake zones and fill them using the specialized
	 * functions
	 */
	for (i = 0; i < num_fake_zones; i++) {
		int caller_acct;
		uint64_t sum_space;

		strncpy(zn->zn_name, fake_zones[i].name, sizeof zn->zn_name);
		zn->zn_name[sizeof zn->zn_name - 1] = '\0';
		fake_zones[i].query(&zi->zi_count, &zi->zi_cur_size,
				    &zi->zi_max_size, &zi->zi_elem_size,
				    &zi->zi_alloc_size, &sum_space,
				    &zi->zi_collectable, &zi->zi_exhaustible, &caller_acct);
		zn++;
		zi++;
	}

	used = max_zones * sizeof *names;
	if (used != names_size)
		bzero((char *) (names_addr + used), names_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
			   (vm_map_size_t)names_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*namesp = (zone_name_t *) copy;
	*namesCntp = max_zones;

	used = max_zones * sizeof *info;
	if (used != info_size)
		bzero((char *) (info_addr + used), info_size - used);

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
			   (vm_map_size_t)info_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*infop = (zone_info_t *) copy;
	*infoCntp = max_zones;

	return KERN_SUCCESS;
}

kern_return_t
mach_zone_force_gc(
	host_t host)
{
	if (host == HOST_NULL)
		return KERN_INVALID_HOST;

	consider_zone_gc(TRUE);

	return (KERN_SUCCESS);
}

extern unsigned int stack_total;
extern unsigned long long stack_allocs;

#if defined(__i386__) || defined (__x86_64__)
extern unsigned int inuse_ptepages_count;
extern long long alloc_ptepages_count;
#endif

void zone_display_zprint()
{
	unsigned int	i;
	zone_t		the_zone;

	if (first_zone != NULL) {
		the_zone = first_zone;
		for (i = 0; i < num_zones; i++) {
			if (the_zone->cur_size > (1024 * 1024)) {
				printf("%.20s:\t%lu\n", the_zone->zone_name, (uintptr_t)the_zone->cur_size);
			}

			if (the_zone->next_zone == NULL) {
				break;
			}

			the_zone = the_zone->next_zone;
		}
	}

	printf("Kernel Stacks:\t%lu\n", (uintptr_t)(kernel_stack_size * stack_total));

#if defined(__i386__) || defined (__x86_64__)
	printf("PageTables:\t%lu\n", (uintptr_t)(PAGE_SIZE * inuse_ptepages_count));
#endif

	printf("Kalloc.Large:\t%lu\n", (uintptr_t)kalloc_large_total);
}

zone_t
zone_find_largest(void)
{
	unsigned int	i;
	unsigned int	max_zones;
	zone_t		the_zone;
	zone_t		zone_largest;

	simple_lock(&all_zones_lock);
	the_zone = first_zone;
	max_zones = num_zones;
	simple_unlock(&all_zones_lock);

	zone_largest = the_zone;
	for (i = 0; i < max_zones; i++) {
		if (the_zone->cur_size > zone_largest->cur_size) {
			zone_largest = the_zone;
		}

		if (the_zone->next_zone == NULL) {
			break;
		}

		the_zone = the_zone->next_zone;
	}
	return zone_largest;
}

#if	ZONE_DEBUG

/* should we care about locks here ? */

#define zone_in_use(z)	( z->count || z->free_elements \
			  || !queue_empty(&z->pages.all_free) \
			  || !queue_empty(&z->pages.intermediate) \
			  || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign)))

void
zone_debug_enable(
	zone_t		z)
{
	if (zone_debug_enabled(z) || zone_in_use(z) ||
	    z->alloc_size < (z->elem_size + ZONE_DEBUG_OFFSET))
		return;
	queue_init(&z->active_zones);
	z->elem_size += ZONE_DEBUG_OFFSET;
}

void
zone_debug_disable(
	zone_t		z)
{
	if (!zone_debug_enabled(z) || zone_in_use(z))
		return;
	z->elem_size -= ZONE_DEBUG_OFFSET;
	z->active_zones.next = z->active_zones.prev = NULL;
}

#endif	/* ZONE_DEBUG */
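/*
 * Illustrative note (an assumption based on the ZONE_DEBUG_OFFSET usage
 * above, not normative): with zone debugging enabled, each element is
 * enlarged by ZONE_DEBUG_OFFSET to carry a queue_chain_t prefix that links
 * live allocations on z->active_zones:
 *
 *	+------------------------+------------------------------+
 *	| queue_chain_t (debug)  | caller-visible element       |
 *	+------------------------+------------------------------+
 *	^ element base             ^ pointer handed out by zalloc()
 *
 * which is why the layout may only be toggled while the zone is empty
 * (zone_in_use() false), and only if alloc_size can still accommodate the
 * enlarged element.
 */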