1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 *	File:	kern/zalloc.c
60 *	Author:	Avadis Tevanian, Jr.
61 *
62 *	Zone-based memory allocator.  A zone is a collection of fixed size
63 *	data blocks for which quick allocation/deallocation is possible.
64 */
65#include <zone_debug.h>
66#include <zone_alias_addr.h>
67
68#include <mach/mach_types.h>
69#include <mach/vm_param.h>
70#include <mach/kern_return.h>
71#include <mach/mach_host_server.h>
72#include <mach/task_server.h>
73#include <mach/machine/vm_types.h>
74#include <mach_debug/zone_info.h>
75#include <mach/vm_map.h>
76
77#include <kern/kern_types.h>
78#include <kern/assert.h>
79#include <kern/host.h>
80#include <kern/macro_help.h>
81#include <kern/sched.h>
82#include <kern/locks.h>
83#include <kern/sched_prim.h>
84#include <kern/misc_protos.h>
85#include <kern/thread_call.h>
86#include <kern/zalloc.h>
87#include <kern/kalloc.h>
88#include <kern/btlog.h>
89
90#include <vm/pmap.h>
91#include <vm/vm_map.h>
92#include <vm/vm_kern.h>
93#include <vm/vm_page.h>
94
95#include <pexpert/pexpert.h>
96
97#include <machine/machparam.h>
98#include <machine/machine_routines.h>  /* ml_cpu_get_info */
99
100#include <libkern/OSDebug.h>
101#include <libkern/OSAtomic.h>
102#include <sys/kdebug.h>
103
104/*
105 *  ZONE_ALIAS_ADDR
106 *
107 * With this option enabled, zones with alloc_size <= PAGE_SIZE allocate
108 * a virtual page from the zone_map, but before zcram-ing the allocated memory
109 * into the zone, the page is translated to use the alias address of the page
110 * in the static kernel region. zone_gc reverses that translation when
111 * scanning the freelist to collect free pages so that it can look up the page
112 * in the zone_page_table, and free it to kmem_free.
113 *
114 * The static kernel region is a flat 1:1 mapping of physical memory passed
115 * to xnu by the booter. It is mapped to the range:
116 * [gVirtBase, gVirtBase + gPhysSize]
117 *
118 * Accessing memory via the static kernel region is faster due to the
119 * entire region being mapped via large pages, cutting down
120 * on TLB misses.
121 *
122 * zinit favors using PAGE_SIZE backing allocations for a zone unless it would
123 * waste more than 10% space to use a single page, in order to take advantage
124 * of the speed benefit for as many zones as possible.
125 *
126 * Zones with > PAGE_SIZE allocations can't take advantage of this
127 * because kernel_memory_allocate doesn't give out physically contiguous pages.
128 *
129 * zone_virtual_addr()
130 *  - translates an address from the static kernel region to the zone_map
131 *  - returns the same address if it's not from the static kernel region
132 * It relies on the fact that a physical page mapped to the
133 * zone_map is not mapped anywhere else (except the static kernel region).
134 *
135 * zone_alias_addr()
136 *  - translates a virtual memory address from the zone_map to the
137 *    corresponding address in the static kernel region
138 *
139 */
140
141#if     !ZONE_ALIAS_ADDR
142#define from_zone_map(addr, size) \
143        ((vm_offset_t)(addr)             >= zone_map_min_address && \
144        ((vm_offset_t)(addr) + size - 1) <  zone_map_max_address )
145#else
146#define from_zone_map(addr, size) \
147        ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr))            >= zone_map_min_address && \
148        ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)(uintptr_t)addr)) + size -1) <  zone_map_max_address )
149#endif
150
151/*
152 * Zone Corruption Debugging
153 *
154 * We use three techniques to detect modification of a zone element
155 * after it's been freed.
156 *
157 * (1) Check the freelist next pointer for sanity.
158 * (2) Store a backup of the next pointer at the end of the element,
159 *     and compare it to the primary next pointer when the element is allocated
160 *     to detect corruption of the freelist due to use-after-free bugs.
161 *     The backup pointer is also XORed with a per-boot random cookie.
162 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
163 *     and check for that value when the element is being reused to make sure
164 *     no part of the element has been modified while it was on the freelist.
165 *     This will also help catch read-after-frees, as code will now dereference
166 *     0xdeadbeef instead of a valid but freed pointer.
167 *
168 * (1) and (2) occur for every allocation and free to a zone.
169 * This is done to make it slightly more difficult for an attacker to
170 * manipulate the freelist to behave in a specific way.
171 *
172 * Poisoning (3) occurs periodically for every N frees (counted per-zone)
173 * and on every free for zones smaller than a cacheline.  If -zp
174 * is passed as a boot arg, poisoning occurs for every free.
175 *
176 * Performance slowdown is inversely proportional to the frequency of poisoning,
177 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
178 * and higher. You can expect to find a 100% reproducible bug in an average of
179 * N tries, with a standard deviation of about N, but you will want to set
180 * "-zp" to always poison every free if you are attempting to reproduce
181 * a known bug.
182 *
183 * For a more heavyweight, but finer-grained method of detecting misuse
184 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
185 *
186 * Zone Corruption Logging
187 *
188 * You can also track where corruptions come from by using the boot-arguments
189 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
190 * in this document for more implementation and usage information.
191 *
192 * Zone Leak Detection
193 *
194 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
195 * found later in this file via the showtopztrace and showz* macros in kgmacros,
196 * or use zlog without the -zc argument.
197 *
198 */
199
200/* Returns TRUE if we rolled over the counter at factor */
201static inline boolean_t
202sample_counter(volatile uint32_t * count_p, uint32_t factor)
203{
204	uint32_t old_count, new_count;
205	boolean_t rolled_over;
206
207	do {
208		new_count = old_count = *count_p;
209
210		if (++new_count >= factor) {
211			rolled_over = TRUE;
212			new_count = 0;
213		} else {
214			rolled_over = FALSE;
215		}
216
217	} while (!OSCompareAndSwap(old_count, new_count, count_p));
218
219	return rolled_over;
220}
221
222#if defined(__LP64__)
223#define ZP_POISON       0xdeadbeefdeadbeef
224#else
225#define ZP_POISON       0xdeadbeef
226#endif
227
228#define ZP_DEFAULT_SAMPLING_FACTOR 16
229#define ZP_DEFAULT_SCALE_FACTOR 4
230
231/*
232 *  A zp_factor of 0 indicates zone poisoning is disabled,
233 *  however, we still poison zones smaller than zp_tiny_zone_limit (a cacheline).
234 *  Passing the -no-zp boot-arg disables even this behavior.
235 *  In all cases, we record and check the integrity of a backup pointer.
236 */
237
238/* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
239uint32_t        zp_factor               = 0;
240
241/* set by zp-scale=N boot arg, scales zp_factor by zone size */
242uint32_t        zp_scale                = 0;
243
244/* set in zp_init, zero indicates -no-zp boot-arg */
245vm_size_t       zp_tiny_zone_limit      = 0;
246
247/* initialized to a per-boot random value in zp_init */
248uintptr_t       zp_poisoned_cookie      = 0;
249uintptr_t       zp_nopoison_cookie      = 0;
250
251
252/*
253 * initialize zone poisoning
254 * called from zone_bootstrap before any allocations are made from zalloc
255 */
256static inline void
257zp_init(void)
258{
259	char temp_buf[16];
260
261	/*
262	 * Initialize backup pointer random cookie for poisoned elements
263	 * Try not to call early_random() back to back, it may return
264	 * the same value if mach_absolute_time doesn't have sufficient time
265	 * to tick over between calls.  <rdar://problem/11597395>
266	 * (This is only a problem on embedded devices)
267	 */
268	zp_poisoned_cookie = (uintptr_t) early_random();
269
270	/*
271	 * Always poison zones smaller than a cacheline,
272	 * because it's pretty close to free
273	 */
274	ml_cpu_info_t cpu_info;
275	ml_cpu_get_info(&cpu_info);
276	zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;
277
278	zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;
279	zp_scale  = ZP_DEFAULT_SCALE_FACTOR;
280
281	//TODO: Bigger permutation?
282	/*
283	 * Permute the default factor +/- 1 to make it less predictable
284	 * This adds or subtracts ~4 poisoned objects per 1000 frees.
285	 */
286	if (zp_factor != 0) {
287		uint32_t rand_bits = early_random() & 0x3;
288
289		if (rand_bits == 0x1)
290			zp_factor += 1;
291		else if (rand_bits == 0x2)
292			zp_factor -= 1;
293		/* if 0x0 or 0x3, leave it alone */
294	}
295
296	/* -zp: enable poisoning for every alloc and free */
297	if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
298		zp_factor = 1;
299	}
300
301	/* -no-zp: disable poisoning completely even for tiny zones */
302	if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
303		zp_factor          = 0;
304		zp_tiny_zone_limit = 0;
305		printf("Zone poisoning disabled\n");
306	}
307
308	/* zp-factor=XXXX: override how often to poison freed zone elements */
309	if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
310		printf("Zone poisoning factor override: %u\n", zp_factor);
311	}
312
313	/* zp-scale=XXXX: override how much zone size scales zp-factor by */
314	if (PE_parse_boot_argn("zp-scale", &zp_scale, sizeof(zp_scale))) {
315		printf("Zone poisoning scale factor override: %u\n", zp_scale);
316	}
317
318	/* Initialize backup pointer random cookie for unpoisoned elements */
319	zp_nopoison_cookie = (uintptr_t) early_random();
320
321#if MACH_ASSERT
322	if (zp_poisoned_cookie == zp_nopoison_cookie)
323		panic("early_random() is broken: %p and %p are not random\n",
324		      (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
325#endif
326
327	/*
328	 * Use the last bit in the backup pointer to hint poisoning state
329	 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
330	 * the low bits are zero.
331	 */
332	zp_poisoned_cookie |=   (uintptr_t)0x1ULL;
333	zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
334
335#if defined(__LP64__)
336	/*
337	 * Make backup pointers more obvious in GDB for 64 bit
338	 * by making OxFFFFFF... ^ cookie = 0xFACADE...
339	 * (0xFACADE = 0xFFFFFF ^ 0x053521)
340	 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
341	 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
342	 * by the sanity check, so it's OK for that part of the cookie to be predictable.
343	 *
344	 * TODO: Use #defines, xors, and shifts
345	 */
346
347	zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
348	zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
349
350	zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
351	zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
352#endif
353}
354
355/* zone_map page count for page table structure */
356uint64_t zone_map_table_page_count = 0;
357
358/*
359 * These macros are used to keep track of the number
360 * of pages being used by the zone currently. The
361 * z->page_count is protected by the zone lock.
362 */
363#define ZONE_PAGE_COUNT_INCR(z, count)		\
364{						\
365	OSAddAtomic64(count, &(z->page_count));	\
366}
367
368#define ZONE_PAGE_COUNT_DECR(z, count)			\
369{							\
370	OSAddAtomic64(-count, &(z->page_count));	\
371}
372
373/* for is_sane_zone_element and garbage collection */
374
375vm_offset_t     zone_map_min_address = 0;  /* initialized in zone_init */
376vm_offset_t     zone_map_max_address = 0;
377
378/* Helpful for walking through a zone's free element list. */
379struct zone_free_element {
380	struct zone_free_element *next;
381	/* ... */
382	/* void *backup_ptr; */
383};
384
385struct zone_page_metadata {
386	queue_chain_t				pages;
387	struct zone_free_element	*elements;
388	zone_t						zone;
389	uint16_t					alloc_count;
390	uint16_t					free_count;
391};
392
393/* The backup pointer is stored in the last pointer-sized location in an element. */
394static inline vm_offset_t *
395get_backup_ptr(vm_size_t  elem_size,
396               vm_offset_t *element)
397{
398	return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
399}
400
401static inline struct zone_page_metadata *
402get_zone_page_metadata(struct zone_free_element *element)
403{
404	return (struct zone_page_metadata *)(trunc_page((vm_offset_t)element) + PAGE_SIZE - sizeof(struct zone_page_metadata));
405}
406
407/*
408 * Zone checking helper function.
409 * A pointer that satisfies these conditions is OK to be a freelist next pointer
410 * A pointer that doesn't satisfy these conditions indicates corruption
411 */
412static inline boolean_t
413is_sane_zone_ptr(zone_t		zone,
414                 vm_offset_t	addr,
415		 size_t		obj_size)
416{
417	/*  Must be aligned to pointer boundary */
418	if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0))
419		return FALSE;
420
421	/*  Must be a kernel address */
422	if (__improbable(!pmap_kernel_va(addr)))
423		return FALSE;
424
425	/*  Must be from zone map if the zone only uses memory from the zone_map */
426	/*
427	 *  TODO: Remove the zone->collectable check when every
428	 *  zone using foreign memory is properly tagged with allows_foreign
429	 */
430	if (zone->collectable && !zone->allows_foreign) {
431#if ZONE_ALIAS_ADDR
432		/*
433		 * If this address is in the static kernel region, it might be
434		 * the alias address of a valid zone element.
435		 * If we tried to find the zone_virtual_addr() of an invalid
436		 * address in the static kernel region, it will panic, so don't
437		 * check addresses in this region.
438		 *
439		 * TODO: Use a safe variant of zone_virtual_addr to
440		 *  make this check more accurate
441		 *
442		 * The static kernel region is mapped at:
443		 * [gVirtBase, gVirtBase + gPhysSize]
444		 */
445		if ((addr - gVirtBase) < gPhysSize)
446			return TRUE;
447#endif
448		/*  check if addr is from zone map */
449		if (addr                 >= zone_map_min_address &&
450		   (addr + obj_size - 1) <  zone_map_max_address )
451			return TRUE;
452
453		return FALSE;
454	}
455
456	return TRUE;
457}
458
459static inline boolean_t
460is_sane_zone_page_metadata(zone_t 	zone,
461			   vm_offset_t 	page_meta)
462{
463	/* NULL page metadata structures are invalid */
464	if (page_meta == 0)
465		return FALSE;
466	return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
467}
468
469static inline boolean_t
470is_sane_zone_element(zone_t      zone,
471                     vm_offset_t addr)
472{
473	/*  NULL is OK because it indicates the tail of the list */
474	if (addr == 0)
475		return TRUE;
476	return is_sane_zone_ptr(zone, addr, zone->elem_size);
477}
478
479/* Someone wrote to freed memory. */
480static inline void /* noreturn */
481zone_element_was_modified_panic(zone_t        zone,
482                                vm_offset_t   element,
483                                vm_offset_t   found,
484                                vm_offset_t   expected,
485                                vm_offset_t   offset)
486{
487	panic("a freed zone element has been modified in zone %s: expected %p but found %p, bits changed %p, at offset %d of %d in element %p, cookies %p %p",
488	                 zone->zone_name,
489	      (void *)   expected,
490	      (void *)   found,
491	      (void *)   (expected ^ found),
492	      (uint32_t) offset,
493	      (uint32_t) zone->elem_size,
494	      (void *)   element,
495	      (void *)   zp_nopoison_cookie,
496	      (void *)   zp_poisoned_cookie);
497}
498
499/*
500 * The primary and backup pointers don't match.
501 * Determine which one was likely the corrupted pointer, find out what it
502 * probably should have been, and panic.
503 * I would like to mark this as noreturn, but panic() isn't marked noreturn.
504 */
505static void /* noreturn */
506backup_ptr_mismatch_panic(zone_t        zone,
507                          vm_offset_t   element,
508                          vm_offset_t   primary,
509                          vm_offset_t   backup)
510{
511	vm_offset_t likely_backup;
512
513	boolean_t   sane_backup;
514	boolean_t   sane_primary = is_sane_zone_element(zone, primary);
515	boolean_t   element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;
516
517#if defined(__LP64__)
518	/* We can inspect the tag in the upper bits for additional confirmation */
519	if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000)
520		element_was_poisoned = TRUE;
521	else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000)
522		element_was_poisoned = FALSE;
523#endif
524
525	if (element_was_poisoned) {
526		likely_backup = backup ^ zp_poisoned_cookie;
527		sane_backup = is_sane_zone_element(zone, likely_backup);
528	} else {
529		likely_backup = backup ^ zp_nopoison_cookie;
530		sane_backup = is_sane_zone_element(zone, likely_backup);
531	}
532
533	/* The primary is definitely the corrupted one */
534	if (!sane_primary && sane_backup)
535		zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);
536
537	/* The backup is definitely the corrupted one */
538	if (sane_primary && !sane_backup)
539		zone_element_was_modified_panic(zone, element, backup,
540		                                (primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
541		                                zone->elem_size - sizeof(vm_offset_t));
542
543	/*
544	 * Not sure which is the corrupted one.
545	 * It's less likely that the backup pointer was overwritten with
546	 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
547	 * primary pointer has been overwritten with a sane but incorrect address.
548	 */
549	if (sane_primary && sane_backup)
550		zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);
551
552	/* Neither are sane, so just guess. */
553	zone_element_was_modified_panic(zone, element, primary, likely_backup, 0);
554}
555
556
557/*
558 * Sets the next element of tail to elem.
559 * elem can be NULL.
560 * Preserves the poisoning state of the element.
561 */
562static inline void
563append_zone_element(zone_t                    zone,
564                    struct zone_free_element *tail,
565                    struct zone_free_element *elem)
566{
567	vm_offset_t *backup = get_backup_ptr(zone->elem_size, (vm_offset_t *) tail);
568
569	vm_offset_t old_backup = *backup;
570
571	vm_offset_t old_next = (vm_offset_t) tail->next;
572	vm_offset_t new_next = (vm_offset_t) elem;
573
574	if      (old_next == (old_backup ^ zp_nopoison_cookie))
575		*backup = new_next ^ zp_nopoison_cookie;
576	else if (old_next == (old_backup ^ zp_poisoned_cookie))
577		*backup = new_next ^ zp_poisoned_cookie;
578	else
579		backup_ptr_mismatch_panic(zone,
580		                          (vm_offset_t) tail,
581		                          old_next,
582		                          old_backup);
583
584	tail->next = elem;
585}
586
587
588/*
589 * Insert a linked list of elements (delineated by head and tail) at the head of
590 * the zone free list. Every element in the list being added has already gone
591 * through append_zone_element, so their backup pointers are already
592 * set properly.
593 * Precondition: There should be no elements after tail
594 */
595static inline void
596add_list_to_zone(zone_t                    zone,
597                 struct zone_free_element *head,
598                 struct zone_free_element *tail)
599{
600	assert(tail->next == NULL);
601	assert(!zone->use_page_list);
602
603	append_zone_element(zone, tail, zone->free_elements);
604
605	zone->free_elements = head;
606}
607
608
609/*
610 * Adds the element to the head of the zone's free list
611 * Keeps a backup next-pointer at the end of the element
612 */
613static inline void
614free_to_zone(zone_t      zone,
615             vm_offset_t element,
616             boolean_t   poison)
617{
618	vm_offset_t old_head;
619	struct zone_page_metadata *page_meta;
620
621	vm_offset_t *primary  = (vm_offset_t *) element;
622	vm_offset_t *backup   = get_backup_ptr(zone->elem_size, primary);
623
624	if (zone->use_page_list) {
625		page_meta = get_zone_page_metadata((struct zone_free_element *)element);
626		assert(page_meta->zone == zone);
627		old_head = (vm_offset_t)page_meta->elements;
628	} else {
629		old_head = (vm_offset_t)zone->free_elements;
630	}
631
632#if MACH_ASSERT
633	if (__improbable(!is_sane_zone_element(zone, old_head)))
634		panic("zfree: invalid head pointer %p for freelist of zone %s\n",
635		      (void *) old_head, zone->zone_name);
636#endif
637
638	if (__improbable(!is_sane_zone_element(zone, element)))
639		panic("zfree: freeing invalid pointer %p to zone %s\n",
640		      (void *) element, zone->zone_name);
641
642	/*
643	 * Always write a redundant next pointer
644	 * So that it is more difficult to forge, xor it with a random cookie
645	 * A poisoned element is indicated by using zp_poisoned_cookie
646	 * instead of zp_nopoison_cookie
647	 */
648
649	*backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
650
651	/* Insert this element at the head of the free list */
652	*primary             = old_head;
653	if (zone->use_page_list) {
654		page_meta->elements = (struct zone_free_element *)element;
655		page_meta->free_count++;
656		if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
657			if (page_meta->free_count == 1) {
658				/* first foreign element freed on page, move from all_used */
659				remqueue((queue_entry_t)page_meta);
660				enqueue_tail(&zone->pages.any_free_foreign, (queue_entry_t)page_meta);
661			} else {
662				/* no other list transitions */
663			}
664		} else if (page_meta->free_count == page_meta->alloc_count) {
665			/* whether the page was on the intermediate or all_used, queue, move it to free */
666			remqueue((queue_entry_t)page_meta);
667			enqueue_tail(&zone->pages.all_free, (queue_entry_t)page_meta);
668		} else if (page_meta->free_count == 1) {
669			/* first free element on page, move from all_used */
670			remqueue((queue_entry_t)page_meta);
671			enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta);
672		}
673	} else {
674		zone->free_elements = (struct zone_free_element *)element;
675	}
676	zone->count--;
677	zone->countfree++;
678}
679
680
681/*
682 * Removes an element from the zone's free list, returning 0 if the free list is empty.
683 * Verifies that the next-pointer and backup next-pointer are intact,
684 * and verifies that a poisoned element hasn't been modified.
685 */
686static inline vm_offset_t
687try_alloc_from_zone(zone_t zone,
688                    boolean_t* check_poison)
689{
690	vm_offset_t  element;
691	struct zone_page_metadata *page_meta;
692
693	*check_poison = FALSE;
694
695	/* if zone is empty, bail */
696	if (zone->use_page_list) {
697		if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign))
698			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
699		else if (!queue_empty(&zone->pages.intermediate))
700			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
701		else if (!queue_empty(&zone->pages.all_free))
702			page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
703		else {
704			return 0;
705		}
706
707		/* Check if page_meta passes is_sane_zone_element */
708		if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta)))
709			panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
710				(void *) page_meta, zone->zone_name);
711		assert(page_meta->zone == zone);
712		element = (vm_offset_t)page_meta->elements;
713	} else {
714		if (zone->free_elements == NULL)
715			return 0;
716
717		element = (vm_offset_t)zone->free_elements;
718	}
719
720#if MACH_ASSERT
721	if (__improbable(!is_sane_zone_element(zone, element)))
722		panic("zfree: invalid head pointer %p for freelist of zone %s\n",
723		      (void *) element, zone->zone_name);
724#endif
725
726	vm_offset_t *primary = (vm_offset_t *) element;
727	vm_offset_t *backup  = get_backup_ptr(zone->elem_size, primary);
728
729	vm_offset_t  next_element          = *primary;
730	vm_offset_t  next_element_backup   = *backup;
731
732	/*
733	 * backup_ptr_mismatch_panic will determine what next_element
734	 * should have been, and print it appropriately
735	 */
736	if (__improbable(!is_sane_zone_element(zone, next_element)))
737		backup_ptr_mismatch_panic(zone, element, next_element, next_element_backup);
738
739	/* Check the backup pointer for the regular cookie */
740	if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {
741
742		/* Check for the poisoned cookie instead */
743		if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie)))
744			/* Neither cookie is valid, corruption has occurred */
745			backup_ptr_mismatch_panic(zone, element, next_element, next_element_backup);
746
747		/*
748		 * Element was marked as poisoned, so check its integrity before using it.
749		 */
750		*check_poison = TRUE;
751	}
752
753	if (zone->use_page_list) {
754
755		/* Make sure the page_meta is at the correct offset from the start of page */
756		if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element)))
757			panic("zalloc: metadata located at incorrect location on page of zone %s\n",
758				zone->zone_name);
759
760		/* Make sure next_element belongs to the same page as page_meta */
761		if (next_element) {
762			if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element)))
763				panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
764					(void *)next_element, (void *)element, zone->zone_name);
765		}
766	}
767
768	/* Remove this element from the free list */
769	if (zone->use_page_list) {
770
771		page_meta->elements = (struct zone_free_element *)next_element;
772		page_meta->free_count--;
773
774		if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
775			if (page_meta->free_count == 0) {
776				/* move to all used */
777				remqueue((queue_entry_t)page_meta);
778				enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta);
779			} else {
780				/* no other list transitions */
781			}
782		} else if (page_meta->free_count == 0) {
783			/* remove from intermediate or free, move to all_used */
784			remqueue((queue_entry_t)page_meta);
785			enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_meta);
786		} else if (page_meta->alloc_count == page_meta->free_count + 1) {
787			/* remove from free, move to intermediate */
788			remqueue((queue_entry_t)page_meta);
789			enqueue_tail(&zone->pages.intermediate, (queue_entry_t)page_meta);
790		}
791	} else {
792		zone->free_elements = (struct zone_free_element *)next_element;
793	}
794	zone->countfree--;
795	zone->count++;
796	zone->sum_count++;
797
798	return element;
799}
800
801
802/*
803 * End of zone poisoning
804 */
805
806/*
807 * Fake zones for things that want to report via zprint but are not actually zones.
808 */
809struct fake_zone_info {
810	const char* name;
811	void (*init)(int);
812	void (*query)(int *,
813		     vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *,
814		      uint64_t *, int *, int *, int *);
815};
816
817static const struct fake_zone_info fake_zones[] = {
818	{
819		.name = "kernel_stacks",
820		.init = stack_fake_zone_init,
821		.query = stack_fake_zone_info,
822	},
823	{
824		.name = "page_tables",
825		.init = pt_fake_zone_init,
826		.query = pt_fake_zone_info,
827	},
828	{
829		.name = "kalloc.large",
830		.init = kalloc_fake_zone_init,
831		.query = kalloc_fake_zone_info,
832	},
833};
834static const unsigned int num_fake_zones =
835	sizeof (fake_zones) / sizeof (fake_zones[0]);
836
837/*
838 * Zone info options
839 */
840boolean_t zinfo_per_task = FALSE;		/* enabled by -zinfop in boot-args */
841#define ZINFO_SLOTS 200				/* for now */
842#define ZONES_MAX (ZINFO_SLOTS - num_fake_zones - 1)
843
844/*
845 * Support for garbage collection of unused zone pages
846 *
847 * The kernel virtually allocates the "zone map" submap of the kernel
848 * map. When an individual zone needs more storage, memory is allocated
849 * out of the zone map, and the two-level "zone_page_table" is
850 * on-demand expanded so that it has entries for those pages.
851 * zone_page_init()/zone_page_alloc() initialize "alloc_count"
852 * to the number of zone elements that occupy the zone page (which may
853 * be a minimum of 1, including if a zone element spans multiple
854 * pages).
855 *
856 * Asynchronously, the zone_gc() logic attempts to walk zone free
857 * lists to see if all the elements on a zone page are free. If
858 * "collect_count" (which it increments during the scan) matches
859 * "alloc_count", the zone page is a candidate for collection and the
860 * physical page is returned to the VM system. During this process, the
861 * first word of the zone page is re-used to maintain a linked list of
862 * to-be-collected zone pages.
863 */
864typedef uint32_t zone_page_index_t;
865#define ZONE_PAGE_INDEX_INVALID ((zone_page_index_t)0xFFFFFFFFU)
866
867struct zone_page_table_entry {
868	volatile	uint16_t	alloc_count;
869	volatile	uint16_t	collect_count;
870};
871
872#define	ZONE_PAGE_USED  0
873#define ZONE_PAGE_UNUSED 0xffff
874
875/* Forwards */
876void		zone_page_init(
877				vm_offset_t	addr,
878				vm_size_t	size);
879
880void		zone_page_alloc(
881				vm_offset_t	addr,
882				vm_size_t	size);
883
884void		zone_page_free_element(
885				zone_page_index_t	*free_page_head,
886				zone_page_index_t	*free_page_tail,
887				vm_offset_t	addr,
888				vm_size_t	size);
889
890void		zone_page_collect(
891				vm_offset_t	addr,
892				vm_size_t	size);
893
894boolean_t	zone_page_collectable(
895				vm_offset_t	addr,
896				vm_size_t	size);
897
898void		zone_page_keep(
899				vm_offset_t	addr,
900				vm_size_t	size);
901
902void		zone_display_zprint(void);
903
904zone_t		zone_find_largest(void);
905
906/*
907 * Async allocation of zones
908 * This mechanism allows for bootstrapping an empty zone which is setup with
909 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
910 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
911 * This will prime the zone for the next use.
912 *
913 * Currently the thread_callout function (zalloc_async) will loop through all zones
914 * looking for any zone with async_pending set and do the work for it.
915 *
916 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
917 * then zalloc_noblock to an empty zone may succeed.
918 */
919void		zalloc_async(
920				thread_call_param_t	p0,
921				thread_call_param_t	p1);
922
923static thread_call_data_t call_async_alloc;
924
925vm_map_t	zone_map = VM_MAP_NULL;
926
927zone_t		zone_zone = ZONE_NULL;	/* the zone containing other zones */
928
929zone_t		zinfo_zone = ZONE_NULL; /* zone of per-task zone info */
930
931/*
932 *	The VM system gives us an initial chunk of memory.
933 *	It has to be big enough to allocate the zone_zone
934 *	all the way through the pmap zone.
935 */
936
937vm_offset_t	zdata;
938vm_size_t	zdata_size;
939
940#define zone_wakeup(zone) thread_wakeup((event_t)(zone))
941#define zone_sleep(zone)				\
942	(void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN, (event_t)(zone), THREAD_UNINT);
943
944/*
945 *	The zone_locks_grp allows for collecting lock statistics.
946 *	All locks are associated to this group in zinit.
947 *	Look at tools/lockstat for debugging lock contention.
948 */
949
950lck_grp_t	zone_locks_grp;
951lck_grp_attr_t	zone_locks_grp_attr;
952
953#define lock_zone_init(zone)				\
954MACRO_BEGIN						\
955	lck_attr_setdefault(&(zone)->lock_attr);			\
956	lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext,		\
957	    &zone_locks_grp, &(zone)->lock_attr);			\
958MACRO_END
959
960#define lock_try_zone(zone)	lck_mtx_try_lock_spin(&zone->lock)
961
962/*
963 *	Garbage collection map information
964 */
965#define ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE (32)
966struct zone_page_table_entry * volatile zone_page_table[ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE];
967vm_size_t			zone_page_table_used_size;
968unsigned int			zone_pages;
969unsigned int                   zone_page_table_second_level_size;                      /* power of 2 */
970unsigned int                   zone_page_table_second_level_shift_amount;
971
972#define zone_page_table_first_level_slot(x)  ((x) >> zone_page_table_second_level_shift_amount)
973#define zone_page_table_second_level_slot(x) ((x) & (zone_page_table_second_level_size - 1))
974
975void   zone_page_table_expand(zone_page_index_t pindex);
976struct zone_page_table_entry *zone_page_table_lookup(zone_page_index_t pindex);
977
978/*
979 *	Exclude more than one concurrent garbage collection
980 */
981decl_lck_mtx_data(, zone_gc_lock)
982
983lck_attr_t      zone_gc_lck_attr;
984lck_grp_t       zone_gc_lck_grp;
985lck_grp_attr_t  zone_gc_lck_grp_attr;
986lck_mtx_ext_t   zone_gc_lck_ext;
987
988/*
989 *	Protects first_zone, last_zone, num_zones,
990 *	and the next_zone field of zones.
991 */
992decl_simple_lock_data(,	all_zones_lock)
993zone_t			first_zone;
994zone_t			*last_zone;
995unsigned int		num_zones;
996
997boolean_t zone_gc_allowed = TRUE;
998boolean_t zone_gc_forced = FALSE;
999boolean_t panic_include_zprint = FALSE;
1000boolean_t zone_gc_allowed_by_time_throttle = TRUE;
1001
1002#define ZALLOC_DEBUG_ZONEGC		0x00000001
1003#define ZALLOC_DEBUG_ZCRAM		0x00000002
1004uint32_t zalloc_debug = 0;
1005
1006/*
1007 * Zone leak debugging code
1008 *
1009 * When enabled, this code keeps a log to track allocations to a particular zone that have not
1010 * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
1011 * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
1012 * off by default.
1013 *
1014 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
1015 * is the name of the zone you wish to log.
1016 *
1017 * This code only tracks one zone, so you need to identify which one is leaking first.
1018 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
1019 * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
1020 * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
1021 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
1022 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
1023 * See the help in the kgmacros for usage info.
1024 *
1025 *
1026 * Zone corruption logging
1027 *
1028 * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
1029 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
1030 * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
1031 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
1032 * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
1033 * corrupted to examine its history.  This should lead to the source of the corruption.
1034 */
1035
1036static int log_records;	/* size of the log, expressed in number of records */
1037
1038#define MAX_ZONE_NAME	32	/* max length of a zone name we can take from the boot-args */
1039
1040static char zone_name_to_log[MAX_ZONE_NAME] = "";	/* the zone name we're logging, if any */
1041
1042/* Log allocations and frees to help debug a zone element corruption */
1043boolean_t       corruption_debug_flag    = FALSE;    /* enabled by "-zc" boot-arg */
1044
1045/*
1046 * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to
1047 * the number of records you want in the log.  For example, "zrecs=1000" sets it to 1000 records.  Note
1048 * that the larger the size of the log, the slower the system will run due to linear searching in the log,
1049 * but one doesn't generally care about performance when tracking down a leak.  The log is capped at 8000
1050 * records since going much larger than this tends to make the system unresponsive and unbootable on small
1051 * memory configurations.  The default value is 4000 records.
1052 */
1053
1054#if	defined(__LP64__)
1055#define ZRECORDS_MAX 		128000		/* Max records allowed in the log */
1056#else
1057#define ZRECORDS_MAX 		8000		/* Max records allowed in the log */
1058#endif
1059#define ZRECORDS_DEFAULT	4000		/* default records in log if zrecs is not specificed in boot-args */
1060
1061/*
1062 * Each record in the log contains a pointer to the zone element it refers to,
1063 * and a small array to hold the pc's from the stack trace.  A
1064 * record is added to the log each time a zalloc() is done in the zone_of_interest.  For leak debugging,
1065 * the record is cleared when a zfree() is done.  For corruption debugging, the log tracks both allocs and frees.
1066 * If the log fills, old records are replaced as if it were a circular buffer.
1067 */
1068
1069
1070/*
1071 * Opcodes for the btlog operation field:
1072 */
1073
1074#define ZOP_ALLOC	1
1075#define ZOP_FREE	0
1076
1077/*
1078 * The allocation log and all the related variables are protected by the zone lock for the zone_of_interest
1079 */
1080static btlog_t *zlog_btlog;		/* the log itself, dynamically allocated when logging is enabled  */
1081static zone_t  zone_of_interest = NULL;		/* the zone being watched; corresponds to zone_name_to_log */
1082
1083/*
1084 * Decide if we want to log this zone by doing a string compare between a zone name and the name
1085 * of the zone to log. Return true if the strings are equal, false otherwise.  Because it's not
1086 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
1087 * match a space in the zone name.
1088 */
1089
1090static int
1091log_this_zone(const char *zonename, const char *logname)
1092{
1093	int len;
1094	const char *zc = zonename;
1095	const char *lc = logname;
1096
1097	/*
1098	 * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
1099	 */
1100
1101	for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1102
1103		/*
1104		 * If the current characters don't match, check for a space in
1105		 * in the zone name and a corresponding period in the log name.
1106		 * If that's not there, then the strings don't match.
1107		 */
1108
1109		if (*zc != *lc && !(*zc == ' ' && *lc == '.'))
1110			break;
1111
1112		/*
1113		 * The strings are equal so far.  If we're at the end, then it's a match.
1114		 */
1115
1116		if (*zc == '\0')
1117			return TRUE;
1118	}
1119
1120	return FALSE;
1121}
1122
1123
1124/*
1125 * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in and
1126 * the buffer for the records has been allocated.
1127 */
1128
1129#define DO_LOGGING(z)		(zlog_btlog && (z) == zone_of_interest)
1130
1131extern boolean_t kmem_alloc_ready;
1132
1133#if CONFIG_ZLEAKS
1134#pragma mark -
1135#pragma mark Zone Leak Detection
1136
1137/*
1138 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
1139 * allocations made by the zone allocator.  Every zleak_sample_factor allocations in each zone, we capture a
1140 * backtrace.  Every free, we examine the table and determine if the allocation was being tracked,
1141 * and stop tracking it if it was being tracked.
1142 *
1143 * We track the allocations in the zallocations hash table, which stores the address that was returned from
1144 * the zone allocator.  Each stored entry in the zallocations table points to an entry in the ztraces table, which
1145 * stores the backtrace associated with that allocation.  This provides uniquing for the relatively large
1146 * backtraces - we don't store them more than once.
1147 *
1148 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
1149 * a large amount of virtual space.
1150 */
1151#define ZLEAK_STATE_ENABLED		0x01	/* Zone leak monitoring should be turned on if zone_map fills up. */
1152#define ZLEAK_STATE_ACTIVE 		0x02	/* We are actively collecting traces. */
1153#define ZLEAK_STATE_ACTIVATING 		0x04	/* Some thread is doing setup; others should move along. */
1154#define ZLEAK_STATE_FAILED		0x08	/* Attempt to allocate tables failed.  We will not try again. */
1155uint32_t	zleak_state = 0;		/* State of collection, as above */
1156
1157boolean_t	panic_include_ztrace	= FALSE;  	/* Enable zleak logging on panic */
1158vm_size_t 	zleak_global_tracking_threshold;	/* Size of zone map at which to start collecting data */
1159vm_size_t 	zleak_per_zone_tracking_threshold;	/* Size a zone will have before we will collect data on it */
1160unsigned int 	zleak_sample_factor	= 1000;		/* Allocations per sample attempt */
1161
1162/*
1163 * Counters for allocation statistics.
1164 */
1165
1166/* Times two active records want to occupy the same spot */
1167unsigned int z_alloc_collisions = 0;
1168unsigned int z_trace_collisions = 0;
1169
1170/* Times a new record lands on a spot previously occupied by a freed allocation */
1171unsigned int z_alloc_overwrites = 0;
1172unsigned int z_trace_overwrites = 0;
1173
1174/* Times a new alloc or trace is put into the hash table */
1175unsigned int z_alloc_recorded	= 0;
1176unsigned int z_trace_recorded	= 0;
1177
1178/* Times zleak_log returned false due to not being able to acquire the lock */
1179unsigned int z_total_conflicts	= 0;
1180
1181
1182#pragma mark struct zallocation
1183/*
1184 * Structure for keeping track of an allocation
1185 * An allocation bucket is in use if its element is not NULL
1186 */
1187struct zallocation {
1188	uintptr_t		za_element;		/* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
1189	vm_size_t		za_size;			/* how much memory did this allocation take up? */
1190	uint32_t		za_trace_index;	/* index into ztraces for backtrace associated with allocation */
1191	/* TODO: #if this out */
1192	uint32_t		za_hit_count;		/* for determining effectiveness of hash function */
1193};
1194
1195/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
1196uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
1197uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
1198
1199vm_size_t zleak_max_zonemap_size;
1200
1201/* Hashmaps of allocations and their corresponding traces */
1202static struct zallocation*	zallocations;
1203static struct ztrace*		ztraces;
1204
1205/* not static so that panic can see this, see kern/debug.c */
1206struct ztrace*				top_ztrace;
1207
1208/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
1209static lck_spin_t			zleak_lock;
1210static lck_attr_t			zleak_lock_attr;
1211static lck_grp_t			zleak_lock_grp;
1212static lck_grp_attr_t			zleak_lock_grp_attr;
1213
1214/*
1215 * Initializes the zone leak monitor.  Called from zone_init()
1216 */
1217static void
1218zleak_init(vm_size_t max_zonemap_size)
1219{
1220	char			scratch_buf[16];
1221	boolean_t		zleak_enable_flag = FALSE;
1222
1223	zleak_max_zonemap_size = max_zonemap_size;
1224	zleak_global_tracking_threshold = max_zonemap_size / 2;
1225	zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
1226
1227	/* -zleakoff (flag to disable zone leak monitor) */
1228	if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
1229		zleak_enable_flag = FALSE;
1230		printf("zone leak detection disabled\n");
1231	} else {
1232		zleak_enable_flag = TRUE;
1233		printf("zone leak detection enabled\n");
1234	}
1235
1236	/* zfactor=XXXX (override how often to sample the zone allocator) */
1237	if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
1238		printf("Zone leak factor override: %u\n", zleak_sample_factor);
1239	}
1240
1241	/* zleak-allocs=XXXX (override number of buckets in zallocations) */
1242	if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
1243		printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
1244		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1245		if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets-1))) {
1246			printf("Override isn't a power of two, bad things might happen!\n");
1247		}
1248	}
1249
1250	/* zleak-traces=XXXX (override number of buckets in ztraces) */
1251	if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
1252		printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
1253		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1254		if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets-1))) {
1255			printf("Override isn't a power of two, bad things might happen!\n");
1256		}
1257	}
1258
1259	/* allocate the zleak_lock */
1260	lck_grp_attr_setdefault(&zleak_lock_grp_attr);
1261	lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
1262	lck_attr_setdefault(&zleak_lock_attr);
1263	lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);
1264
1265	if (zleak_enable_flag) {
1266		zleak_state = ZLEAK_STATE_ENABLED;
1267	}
1268}
1269
1270#if CONFIG_ZLEAKS
1271
1272/*
1273 * Support for kern.zleak.active sysctl - a simplified
1274 * version of the zleak_state variable.
1275 */
1276int
1277get_zleak_state(void)
1278{
1279	if (zleak_state & ZLEAK_STATE_FAILED)
1280		return (-1);
1281	if (zleak_state & ZLEAK_STATE_ACTIVE)
1282		return (1);
1283	return (0);
1284}
1285
1286#endif
1287
1288
1289kern_return_t
1290zleak_activate(void)
1291{
1292	kern_return_t retval;
1293	vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
1294	vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
1295	void *allocations_ptr = NULL;
1296	void *traces_ptr = NULL;
1297
1298	/* Only one thread attempts to activate at a time */
1299	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1300		return KERN_SUCCESS;
1301	}
1302
1303	/* Indicate that we're doing the setup */
1304	lck_spin_lock(&zleak_lock);
1305	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1306		lck_spin_unlock(&zleak_lock);
1307		return KERN_SUCCESS;
1308	}
1309
1310	zleak_state |= ZLEAK_STATE_ACTIVATING;
1311	lck_spin_unlock(&zleak_lock);
1312
1313	/* Allocate and zero tables */
1314	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size);
1315	if (retval != KERN_SUCCESS) {
1316		goto fail;
1317	}
1318
1319	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size);
1320	if (retval != KERN_SUCCESS) {
1321		goto fail;
1322	}
1323
1324	bzero(allocations_ptr, z_alloc_size);
1325	bzero(traces_ptr, z_trace_size);
1326
1327	/* Everything's set.  Install tables, mark active. */
1328	zallocations = allocations_ptr;
1329	ztraces = traces_ptr;
1330
1331	/*
1332	 * Initialize the top_ztrace to the first entry in ztraces,
1333	 * so we don't have to check for null in zleak_log
1334	 */
1335	top_ztrace = &ztraces[0];
1336
1337	/*
1338	 * Note that we do need a barrier between installing
1339	 * the tables and setting the active flag, because the zfree()
1340	 * path accesses the table without a lock if we're active.
1341	 */
1342	lck_spin_lock(&zleak_lock);
1343	zleak_state |= ZLEAK_STATE_ACTIVE;
1344	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1345	lck_spin_unlock(&zleak_lock);
1346
1347	return 0;
1348
1349fail:
1350	/*
1351	 * If we fail to allocate memory, don't further tax
1352	 * the system by trying again.
1353	 */
1354	lck_spin_lock(&zleak_lock);
1355	zleak_state |= ZLEAK_STATE_FAILED;
1356	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1357	lck_spin_unlock(&zleak_lock);
1358
1359	if (allocations_ptr != NULL) {
1360		kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
1361	}
1362
1363	if (traces_ptr != NULL) {
1364		kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
1365	}
1366
1367	return retval;
1368}
1369
1370/*
1371 * TODO: What about allocations that never get deallocated,
1372 * especially ones with unique backtraces? Should we wait to record
1373 * until after boot has completed?
1374 * (How many persistent zallocs are there?)
1375 */
1376
1377/*
1378 * This function records the allocation in the allocations table,
1379 * and stores the associated backtrace in the traces table
1380 * (or just increments the refcount if the trace is already recorded)
1381 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
1382 * the associated trace's refcount is decremented.
1383 * If the trace slot is in use, it returns.
1384 * The refcount is incremented by the amount of memory the allocation consumes.
1385 * The return value indicates whether to try again next time.
1386 */
1387static boolean_t
1388zleak_log(uintptr_t* bt,
1389		  uintptr_t addr,
1390		  uint32_t depth,
1391		  vm_size_t allocation_size)
1392{
1393	/* Quit if there's someone else modifying the hash tables */
1394	if (!lck_spin_try_lock(&zleak_lock)) {
1395		z_total_conflicts++;
1396		return FALSE;
1397	}
1398
1399	struct zallocation* allocation	= &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1400
1401	uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
1402	struct ztrace* trace = &ztraces[trace_index];
1403
1404	allocation->za_hit_count++;
1405	trace->zt_hit_count++;
1406
1407	/*
1408	 * If the allocation bucket we want to be in is occupied, and if the occupier
1409	 * has the same trace as us, just bail.
1410	 */
1411	if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
1412		z_alloc_collisions++;
1413
1414		lck_spin_unlock(&zleak_lock);
1415		return TRUE;
1416	}
1417
1418	/* STEP 1: Store the backtrace in the traces array. */
1419	/* A size of zero indicates that the trace bucket is free. */
1420
1421	if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0 ) {
1422		/*
1423		 * Different unique trace with same hash!
1424		 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
1425		 * and get out of the way for later chances
1426		 */
1427		trace->zt_collisions++;
1428		z_trace_collisions++;
1429
1430		lck_spin_unlock(&zleak_lock);
1431		return TRUE;
1432	} else if (trace->zt_size > 0) {
1433		/* Same trace, already added, so increment refcount */
1434		trace->zt_size += allocation_size;
1435	} else {
1436		/* Found an unused trace bucket, record the trace here! */
1437		if (trace->zt_depth != 0) /* if this slot was previously used but not currently in use */
1438			z_trace_overwrites++;
1439
1440		z_trace_recorded++;
1441		trace->zt_size			= allocation_size;
1442		memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)) );
1443
1444		trace->zt_depth		= depth;
1445		trace->zt_collisions	= 0;
1446	}
1447
1448	/* STEP 2: Store the allocation record in the allocations array. */
1449
1450	if (allocation->za_element != (uintptr_t) 0) {
1451		/*
1452		 * Straight up replace any allocation record that was there.  We don't want to do the work
1453		 * to preserve the allocation entries that were there, because we only record a subset of the
1454		 * allocations anyways.
1455		 */
1456
1457		z_alloc_collisions++;
1458
1459		struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
1460		/* Knock off old allocation's size, not the new allocation */
1461		associated_trace->zt_size -= allocation->za_size;
1462	} else if (allocation->za_trace_index != 0) {
1463		/* Slot previously used but not currently in use */
1464		z_alloc_overwrites++;
1465	}
1466
1467	allocation->za_element		= addr;
1468	allocation->za_trace_index	= trace_index;
1469	allocation->za_size		= allocation_size;
1470
1471	z_alloc_recorded++;
1472
1473	if (top_ztrace->zt_size < trace->zt_size)
1474		top_ztrace = trace;
1475
1476	lck_spin_unlock(&zleak_lock);
1477	return TRUE;
1478}
1479
1480/*
1481 * Free the allocation record and release the stacktrace.
1482 * This should be as fast as possible because it will be called for every free.
1483 */
1484static void
1485zleak_free(uintptr_t addr,
1486		   vm_size_t allocation_size)
1487{
1488	if (addr == (uintptr_t) 0)
1489		return;
1490
1491	struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1492
1493	/* Double-checked locking: check to find out if we're interested, lock, check to make
1494	 * sure it hasn't changed, then modify it, and release the lock.
1495	 */
1496
1497	if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
1498		/* if the allocation was the one, grab the lock, check again, then delete it */
1499		lck_spin_lock(&zleak_lock);
1500
1501		if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
1502			struct ztrace *trace;
1503
1504			/* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
1505			if (allocation->za_size != allocation_size) {
1506				panic("Freeing as size %lu memory that was allocated with size %lu\n",
1507						(uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
1508			}
1509
1510			trace = &ztraces[allocation->za_trace_index];
1511
1512			/* size of 0 indicates trace bucket is unused */
1513			if (trace->zt_size > 0) {
1514				trace->zt_size -= allocation_size;
1515			}
1516
1517			/* A NULL element means the allocation bucket is unused */
1518			allocation->za_element = 0;
1519		}
1520		lck_spin_unlock(&zleak_lock);
1521	}
1522}
1523
1524#endif /* CONFIG_ZLEAKS */
1525
1526/*  These functions outside of CONFIG_ZLEAKS because they are also used in
1527 *  mbuf.c for mbuf leak-detection.  This is why they lack the z_ prefix.
1528 */
1529
1530/*
1531 * This function captures a backtrace from the current stack and
1532 * returns the number of frames captured, limited by max_frames.
1533 * It's fast because it does no checking to make sure there isn't bad data.
1534 * Since it's only called from threads that we're going to keep executing,
1535 * if there's bad data we were going to die eventually.
1536 * If this function is inlined, it doesn't record the frame of the function it's inside.
1537 * (because there's no stack frame!)
1538 */
1539
1540uint32_t
1541fastbacktrace(uintptr_t* bt, uint32_t max_frames)
1542{
1543	uintptr_t* frameptr = NULL, *frameptr_next = NULL;
1544	uintptr_t retaddr = 0;
1545	uint32_t frame_index = 0, frames = 0;
1546	uintptr_t kstackb, kstackt;
1547	thread_t cthread = current_thread();
1548
1549	if (__improbable(cthread == NULL))
1550		return 0;
1551
1552	kstackb = cthread->kernel_stack;
1553	kstackt = kstackb + kernel_stack_size;
1554	/* Load stack frame pointer (EBP on x86) into frameptr */
1555	frameptr = __builtin_frame_address(0);
1556	if (((uintptr_t)frameptr > kstackt) || ((uintptr_t)frameptr < kstackb))
1557		frameptr = NULL;
1558
1559	while (frameptr != NULL && frame_index < max_frames ) {
1560		/* Next frame pointer is pointed to by the previous one */
1561		frameptr_next = (uintptr_t*) *frameptr;
1562
		/* Bail if we see a zero in the stack frame; that means we've reached the top of the stack */
		/* That also means the return address is worthless, so don't record it */
1565		if (frameptr_next == NULL)
1566			break;
1567		/* Verify thread stack bounds */
1568		if (((uintptr_t)frameptr_next > kstackt) || ((uintptr_t)frameptr_next < kstackb))
1569			break;
1570		/* Pull return address from one spot above the frame pointer */
1571		retaddr = *(frameptr + 1);
1572
1573		/* Store it in the backtrace array */
1574		bt[frame_index++] = retaddr;
1575
1576		frameptr = frameptr_next;
1577	}
1578
1579	/* Save the number of frames captured for return value */
1580	frames = frame_index;
1581
1582	/* Fill in the rest of the backtrace with zeros */
1583	while (frame_index < max_frames)
1584		bt[frame_index++] = 0;
1585
1586	return frames;
1587}
1588
1589/* "Thomas Wang's 32/64 bit mix functions."  http://www.concentric.net/~Ttwang/tech/inthash.htm */
1590uintptr_t
1591hash_mix(uintptr_t x)
1592{
1593#ifndef __LP64__
1594	x += ~(x << 15);
1595	x ^=  (x >> 10);
1596	x +=  (x << 3 );
1597	x ^=  (x >> 6 );
1598	x += ~(x << 11);
1599	x ^=  (x >> 16);
1600#else
1601	x += ~(x << 32);
1602	x ^=  (x >> 22);
1603	x += ~(x << 13);
1604	x ^=  (x >> 8 );
1605	x +=  (x << 3 );
1606	x ^=  (x >> 15);
1607	x += ~(x << 27);
1608	x ^=  (x >> 31);
1609#endif
1610	return x;
1611}
1612
1613uint32_t
1614hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
1615{
1616
1617	uintptr_t hash = 0;
1618	uintptr_t mask = max_size - 1;
1619
1620	while (depth) {
1621		hash += bt[--depth];
1622	}
1623
1624	hash = hash_mix(hash) & mask;
1625
1626	assert(hash < max_size);
1627
1628	return (uint32_t) hash;
1629}
1630
1631/*
1632 *  TODO: Determine how well distributed this is
1633 *      max_size must be a power of 2. i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask
1634 */
1635uint32_t
1636hashaddr(uintptr_t pt, uint32_t max_size)
1637{
1638	uintptr_t hash = 0;
1639	uintptr_t mask = max_size - 1;
1640
1641	hash = hash_mix(pt) & mask;
1642
1643	assert(hash < max_size);
1644
1645	return (uint32_t) hash;
1646}
1647
1648/* End of all leak-detection code */
1649#pragma mark -
1650
1651/*
1652 *	zinit initializes a new zone.  The zone data structures themselves
1653 *	are stored in a zone, which is initially a static structure that
1654 *	is initialized by zone_init.
1655 */
1656zone_t
1657zinit(
1658	vm_size_t	size,		/* the size of an element */
1659	vm_size_t	max,		/* maximum memory to use */
1660	vm_size_t	alloc,		/* allocation size */
1661	const char	*name)		/* a name for the zone */
1662{
1663	zone_t		z;
1664	boolean_t	use_page_list = FALSE;
1665
1666	if (zone_zone == ZONE_NULL) {
1667
1668		z = (struct zone *)zdata;
1669		/* special handling in zcram() because the first element is being used */
1670	} else
1671		z = (zone_t) zalloc(zone_zone);
1672
1673	if (z == ZONE_NULL)
1674		return(ZONE_NULL);
1675
1676	/* Zone elements must fit both a next pointer and a backup pointer */
1677	vm_size_t  minimum_element_size = sizeof(vm_offset_t) * 2;
1678	if (size < minimum_element_size)
1679		size = minimum_element_size;
1680
1681	/*
1682	 *  Round element size to a multiple of sizeof(pointer)
1683	 *  This also enforces that allocations will be aligned on pointer boundaries
1684	 */
1685	size = ((size-1) + sizeof(vm_offset_t)) -
1686	       ((size-1) % sizeof(vm_offset_t));
1687
1688	if (alloc == 0)
1689		alloc = PAGE_SIZE;
1690
1691	alloc = round_page(alloc);
1692	max   = round_page(max);
1693
1694	/*
1695	 * we look for an allocation size with less than 1% waste
1696	 * up to 5 pages in size...
1697	 * otherwise, we look for an allocation size with least fragmentation
1698	 * in the range of 1 - 5 pages
1699	 * This size will be used unless
1700	 * the user suggestion is larger AND has less fragmentation
1701	 */
1702#if	ZONE_ALIAS_ADDR
1703	/* Favor PAGE_SIZE allocations unless we waste >10% space */
1704	if ((size < PAGE_SIZE) && (PAGE_SIZE % size <= PAGE_SIZE / 10))
1705		alloc = PAGE_SIZE;
1706	else
1707#endif
1708#if	defined(__LP64__)
1709		if (((alloc % size) != 0) || (alloc > PAGE_SIZE * 8))
1710#endif
1711		{
1712		vm_size_t best, waste; unsigned int i;
1713		best  = PAGE_SIZE;
1714		waste = best % size;
1715
1716		for (i = 1; i <= 5; i++) {
1717			vm_size_t tsize, twaste;
1718
1719			tsize = i * PAGE_SIZE;
1720
1721			if ((tsize % size) < (tsize / 100)) {
1722			        alloc = tsize;
1723				goto use_this_allocation;
1724			}
1725			twaste = tsize % size;
1726			if (twaste < waste)
1727				best = tsize, waste = twaste;
1728		}
1729		if (alloc <= best || (alloc % size >= waste))
1730			alloc = best;
1731	}
1732use_this_allocation:
1733	if (max && (max < alloc))
1734		max = alloc;
1735
1736	/*
1737	 * Opt into page list tracking if we can reliably map an allocation
1738	 * to its page_metadata, and if the wastage in the tail of
1739	 * the allocation is not too large
1740	 */
1741	if (alloc == PAGE_SIZE) {
1742		if ((PAGE_SIZE % size) >= sizeof(struct zone_page_metadata)) {
1743			use_page_list = TRUE;
1744		} else if ((PAGE_SIZE - sizeof(struct zone_page_metadata)) % size <= PAGE_SIZE / 100) {
1745			use_page_list = TRUE;
1746		}
1747	}
1748
1749	z->free_elements = NULL;
1750	queue_init(&z->pages.any_free_foreign);
1751	queue_init(&z->pages.all_free);
1752	queue_init(&z->pages.intermediate);
1753	queue_init(&z->pages.all_used);
1754	z->cur_size = 0;
1755	z->page_count = 0;
1756	z->max_size = max;
1757	z->elem_size = size;
1758	z->alloc_size = alloc;
1759	z->zone_name = name;
1760	z->count = 0;
1761	z->countfree = 0;
1762	z->sum_count = 0LL;
1763	z->doing_alloc = FALSE;
1764	z->doing_gc = FALSE;
1765	z->exhaustible = FALSE;
1766	z->collectable = TRUE;
1767	z->allows_foreign = FALSE;
1768	z->expandable  = TRUE;
1769	z->waiting = FALSE;
1770	z->async_pending = FALSE;
1771	z->caller_acct = TRUE;
1772	z->noencrypt = FALSE;
1773	z->no_callout = FALSE;
1774	z->async_prio_refill = FALSE;
1775	z->gzalloc_exempt = FALSE;
1776	z->alignment_required = FALSE;
1777	z->use_page_list = use_page_list;
1778	z->prio_refill_watermark = 0;
1779	z->zone_replenish_thread = NULL;
1780	z->zp_count = 0;
1781#if CONFIG_ZLEAKS
1782	z->zleak_capture = 0;
1783	z->zleak_on = FALSE;
1784#endif /* CONFIG_ZLEAKS */
1785
1786#if	ZONE_DEBUG
1787	z->active_zones.next = z->active_zones.prev = NULL;
1788	zone_debug_enable(z);
1789#endif	/* ZONE_DEBUG */
1790	lock_zone_init(z);
1791
1792	/*
1793	 *	Add the zone to the all-zones list.
1794	 *	If we are tracking zone info per task, and we have
1795	 *	already used all the available stat slots, then keep
1796	 *	using the overflow zone slot.
1797	 */
1798	z->next_zone = ZONE_NULL;
1799	simple_lock(&all_zones_lock);
1800	*last_zone = z;
1801	last_zone = &z->next_zone;
1802	z->index = num_zones;
1803	if (zinfo_per_task) {
1804		if (num_zones > ZONES_MAX)
1805			z->index = ZONES_MAX;
1806	}
1807	num_zones++;
1808	simple_unlock(&all_zones_lock);
1809
1810	/*
1811	 * Check if we should be logging this zone.  If so, remember the zone pointer.
1812	 */
1813	if (log_this_zone(z->zone_name, zone_name_to_log)) {
1814	 	zone_of_interest = z;
1815	}
1816
1817	/*
1818	 * If we want to log a zone, see if we need to allocate buffer space for the log.  Some vm related zones are
1819	 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case.  kmem_alloc_ready is set to
1820	 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work.  If we want to log one
1821	 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
1822	 * later on some other zone.  So note we may be allocating a buffer to log a zone other than the one being initialized
1823	 * right now.
1824	 */
1825	if (zone_of_interest != NULL && zlog_btlog == NULL && kmem_alloc_ready) {
1826		zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, NULL, NULL, NULL);
1827		if (zlog_btlog) {
1828			printf("zone: logging started for zone %s\n", zone_of_interest->zone_name);
1829		} else {
1830			printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
1831			zone_of_interest = NULL;
1832		}
1833	}
1834#if	CONFIG_GZALLOC
1835	gzalloc_zone_init(z);
1836#endif
1837	return(z);
1838}
1839unsigned	zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;
1840
1841static void zone_replenish_thread(zone_t);
1842
/* High-priority, VM-privileged thread used to asynchronously refill a designated
 * zone, such as the reserved VM map entry zone.
 */
static void
zone_replenish_thread(zone_t z)
{
1847	vm_size_t free_size;
1848	current_thread()->options |= TH_OPT_VMPRIV;
1849
1850	for (;;) {
1851		lock_zone(z);
1852		assert(z->prio_refill_watermark != 0);
1853		while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
1854			assert(z->doing_alloc == FALSE);
1855			assert(z->async_prio_refill == TRUE);
1856
1857			unlock_zone(z);
1858			int	zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
1859			vm_offset_t space, alloc_size;
1860			kern_return_t kr;
1861
1862			if (vm_pool_low())
1863				alloc_size = round_page(z->elem_size);
1864			else
1865				alloc_size = z->alloc_size;
1866
1867			if (z->noencrypt)
1868				zflags |= KMA_NOENCRYPT;
1869
1870			kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags);
1871
1872			if (kr == KERN_SUCCESS) {
1873#if	ZONE_ALIAS_ADDR
1874				if (alloc_size == PAGE_SIZE)
1875					space = zone_alias_addr(space);
1876#endif
1877				ZONE_PAGE_COUNT_INCR(z, (alloc_size / PAGE_SIZE));
1878				zcram(z, space, alloc_size);
1879			} else if (kr == KERN_RESOURCE_SHORTAGE) {
1880				VM_PAGE_WAIT();
1881			} else if (kr == KERN_NO_SPACE) {
1882				kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags);
1883				if (kr == KERN_SUCCESS) {
1884#if	ZONE_ALIAS_ADDR
1885					if (alloc_size == PAGE_SIZE)
1886						space = zone_alias_addr(space);
1887#endif
1888					zcram(z, space, alloc_size);
1889				} else {
1890					assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
1891					thread_block(THREAD_CONTINUE_NULL);
1892				}
1893			}
1894
1895			lock_zone(z);
1896			zone_replenish_loops++;
1897		}
1898
1899		unlock_zone(z);
		/* Signal any potentially throttled consumers, terminating
		 * their timer-bounded waits.
1902		 */
1903		thread_wakeup(z);
1904
1905		assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
1906		thread_block(THREAD_CONTINUE_NULL);
1907		zone_replenish_wakeups++;
1908	}
1909}
1910
1911void
zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark)
{
1913	z->prio_refill_watermark = low_water_mark;
1914
1915	z->async_prio_refill = TRUE;
1916	OSMemoryBarrier();
1917	kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);
1918
1919	if (tres != KERN_SUCCESS) {
1920		panic("zone_prio_refill_configure, thread create: 0x%x", tres);
1921	}
1922
1923	thread_deallocate(z->zone_replenish_thread);
1924}
1925
1926/*
1927 *	Cram the given memory into the specified zone.
1928 */
1929void
1930zcram(
1931	zone_t		zone,
1932	vm_offset_t			newmem,
1933	vm_size_t		size)
1934{
1935	vm_size_t	elem_size;
1936	boolean_t   from_zm = FALSE;
1937
1938	/* Basic sanity checks */
1939	assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
1940	assert(!zone->collectable || zone->allows_foreign
1941		|| (from_zone_map(newmem, size)));
1942
1943	elem_size = zone->elem_size;
1944
1945	if (from_zone_map(newmem, size))
1946		from_zm = TRUE;
1947
1948	if (zalloc_debug & ZALLOC_DEBUG_ZCRAM)
1949		kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name,
1950				(unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size);
1951
1952	if (from_zm && !zone->use_page_list)
1953		zone_page_init(newmem, size);
1954
1955	lock_zone(zone);
1956
1957	if (zone->use_page_list) {
1958		struct zone_page_metadata *page_metadata;
1959
1960		assert((newmem & PAGE_MASK) == 0);
1961		assert((size & PAGE_MASK) == 0);
1962		for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
1963
1964			vm_size_t pos_in_page;
1965			page_metadata = (struct zone_page_metadata *)(newmem + PAGE_SIZE - sizeof(struct zone_page_metadata));
1966
1967			page_metadata->pages.next = NULL;
1968			page_metadata->pages.prev = NULL;
1969			page_metadata->elements = NULL;
1970			page_metadata->zone = zone;
1971			page_metadata->alloc_count = 0;
1972			page_metadata->free_count = 0;
1973
1974			enqueue_tail(&zone->pages.all_used, (queue_entry_t)page_metadata);
1975
1976			for (pos_in_page = 0; (newmem + pos_in_page + elem_size) < (vm_offset_t)page_metadata; pos_in_page += elem_size) {
1977				page_metadata->alloc_count++;
1978				zone->count++;	/* compensate for free_to_zone */
1979				if ((newmem + pos_in_page) == (vm_offset_t)zone) {
1980					/*
1981					 * special case for the "zone_zone" zone, which is using the first
1982					 * allocation of its pmap_steal_memory()-ed allocation for
1983					 * the "zone_zone" variable already.
1984					 */
1985				} else {
1986					free_to_zone(zone, newmem + pos_in_page, FALSE);
1987				}
1988				zone->cur_size += elem_size;
1989			}
1990		}
1991	} else {
1992		while (size >= elem_size) {
1993			zone->count++;	/* compensate for free_to_zone */
1994			if (newmem == (vm_offset_t)zone) {
				/* Don't free the element occupied by the zone_zone structure itself */
1996			} else {
1997				free_to_zone(zone, newmem, FALSE);
1998			}
1999			if (from_zm)
2000				zone_page_alloc(newmem, elem_size);
2001			size -= elem_size;
2002			newmem += elem_size;
2003			zone->cur_size += elem_size;
2004		}
2005	}
2006	unlock_zone(zone);
2007}
2008
2009
2010/*
2011 *	Steal memory for the zone package.  Called from
2012 *	vm_page_bootstrap().
2013 */
2014void
2015zone_steal_memory(void)
2016{
2017#if	CONFIG_GZALLOC
2018	gzalloc_configure();
2019#endif
2020	/* Request enough early memory to get to the pmap zone */
2021	zdata_size = 12 * sizeof(struct zone);
2022	zdata_size = round_page(zdata_size);
2023	zdata = (vm_offset_t)pmap_steal_memory(zdata_size);
2024}
2025
2026
2027/*
2028 * Fill a zone with enough memory to contain at least nelem elements.
2029 * Memory is obtained with kmem_alloc_kobject from the kernel_map.
2030 * Return the number of elements actually put into the zone, which may
2031 * be more than the caller asked for since the memory allocation is
2032 * rounded up to a full page.
2033 */
2034int
2035zfill(
2036	zone_t	zone,
2037	int	nelem)
2038{
2039	kern_return_t	kr;
2040	vm_size_t	size;
2041	vm_offset_t	memory;
2042	int		nalloc;
2043
2044	assert(nelem > 0);
2045	if (nelem <= 0)
2046		return 0;
2047	size = nelem * zone->elem_size;
2048	size = round_page(size);
2049	kr = kmem_alloc_kobject(kernel_map, &memory, size);
2050	if (kr != KERN_SUCCESS)
2051		return 0;
2052
2053	zone_change(zone, Z_FOREIGN, TRUE);
2054	ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE));
2055	zcram(zone, memory, size);
2056	nalloc = (int)(size / zone->elem_size);
2057	assert(nalloc >= nelem);
2058
2059	return nalloc;
2060}
2061
2062/*
2063 *	Initialize the "zone of zones" which uses fixed memory allocated
2064 *	earlier in memory initialization.  zone_bootstrap is called
2065 *	before zone_init.
2066 */
2067void
2068zone_bootstrap(void)
2069{
2070	char temp_buf[16];
2071
2072	if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof(temp_buf))) {
2073		zinfo_per_task = TRUE;
2074	}
2075
2076	if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug)))
2077		zalloc_debug = 0;
2078
2079	/* Set up zone element poisoning */
2080	zp_init();
2081
2082	/* should zlog log to debug zone corruption instead of leaks? */
2083	if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
2084		corruption_debug_flag = TRUE;
2085	}
2086
2087	/*
	 * Check for and set up zone leak detection if requested via boot-args.  We recognize two
2089	 * boot-args:
2090	 *
2091	 *	zlog=<zone_to_log>
2092	 *	zrecs=<num_records_in_log>
2093	 *
2094	 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
2095	 * control the size of the log.  If zrecs is not specified, a default value is used.
2096	 */
2097
2098	if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2099		if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {
2100
2101			/*
2102			 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2103			 * This prevents accidentally hogging too much kernel memory and making the system
2104			 * unusable.
2105			 */
2106
2107			log_records = MIN(ZRECORDS_MAX, log_records);
2108
2109		} else {
2110			log_records = ZRECORDS_DEFAULT;
2111		}
2112	}
2113
2114	simple_lock_init(&all_zones_lock, 0);
2115
2116	first_zone = ZONE_NULL;
2117	last_zone = &first_zone;
2118	num_zones = 0;
2119	thread_call_setup(&call_async_alloc, zalloc_async, NULL);
2120
2121	/* assertion: nobody else called zinit before us */
2122	assert(zone_zone == ZONE_NULL);
2123
2124	/* initializing global lock group for zones */
2125	lck_grp_attr_setdefault(&zone_locks_grp_attr);
2126	lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);
2127
2128	zone_zone = zinit(sizeof(struct zone), 128 * sizeof(struct zone),
2129			  sizeof(struct zone), "zones");
2130	zone_change(zone_zone, Z_COLLECT, FALSE);
2131	zone_change(zone_zone, Z_CALLERACCT, FALSE);
2132	zone_change(zone_zone, Z_NOENCRYPT, TRUE);
2133
2134	zcram(zone_zone, zdata, zdata_size);
2135
2136	/* initialize fake zones and zone info if tracking by task */
2137	if (zinfo_per_task) {
2138		vm_size_t zisize = sizeof(zinfo_usage_store_t) * ZINFO_SLOTS;
2139		unsigned int i;
2140
2141		for (i = 0; i < num_fake_zones; i++)
2142			fake_zones[i].init(ZINFO_SLOTS - num_fake_zones + i);
2143		zinfo_zone = zinit(zisize, zisize * CONFIG_TASK_MAX,
2144				   zisize, "per task zinfo");
2145		zone_change(zinfo_zone, Z_CALLERACCT, FALSE);
2146	}
2147}
2148
2149void
2150zinfo_task_init(task_t task)
2151{
2152	if (zinfo_per_task) {
2153		task->tkm_zinfo = zalloc(zinfo_zone);
2154		memset(task->tkm_zinfo, 0, sizeof(zinfo_usage_store_t) * ZINFO_SLOTS);
2155	} else {
2156		task->tkm_zinfo = NULL;
2157	}
2158}
2159
2160void
2161zinfo_task_free(task_t task)
2162{
2163	assert(task != kernel_task);
2164	if (task->tkm_zinfo != NULL) {
2165		zfree(zinfo_zone, task->tkm_zinfo);
2166		task->tkm_zinfo = NULL;
2167	}
2168}
2169
2170/* Global initialization of Zone Allocator.
2171 * Runs after zone_bootstrap.
2172 */
2173void
2174zone_init(
2175	vm_size_t max_zonemap_size)
2176{
2177	kern_return_t	retval;
2178	vm_offset_t	zone_min;
2179	vm_offset_t	zone_max;
2180
2181	retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size,
2182			       FALSE, VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT,
2183			       &zone_map);
2184
2185	if (retval != KERN_SUCCESS)
2186		panic("zone_init: kmem_suballoc failed");
2187	zone_max = zone_min + round_page(max_zonemap_size);
2188#if	CONFIG_GZALLOC
2189	gzalloc_init(max_zonemap_size);
2190#endif
2191	/*
2192	 * Setup garbage collection information:
2193	 */
2194	zone_map_min_address = zone_min;
2195	zone_map_max_address = zone_max;
2196
2197#if defined(__LP64__)
2198	/*
2199	 * ensure that any vm_page_t that gets created from
2200	 * the vm_page zone can be packed properly (see vm_page.h
2201	 * for the packing requirements
2202	 */
2203	if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_min_address)) != (vm_page_t)zone_map_min_address)
2204		panic("VM_PAGE_PACK_PTR failed on zone_map_min_address - %p", (void *)zone_map_min_address);
2205
2206	if (VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address)) != (vm_page_t)zone_map_max_address)
2207		panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address);
2208#endif
2209
2210	zone_pages = (unsigned int)atop_kernel(zone_max - zone_min);
2211	zone_page_table_used_size = sizeof(zone_page_table);
2212
2213	zone_page_table_second_level_size = 1;
2214	zone_page_table_second_level_shift_amount = 0;
2215
2216	/*
2217	 * Find the power of 2 for the second level that allows
2218	 * the first level to fit in ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE
2219	 * slots.
2220	 */
2221	while ((zone_page_table_first_level_slot(zone_pages-1)) >= ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE) {
2222		zone_page_table_second_level_size <<= 1;
2223		zone_page_table_second_level_shift_amount++;
2224	}
2225
2226	lck_grp_attr_setdefault(&zone_gc_lck_grp_attr);
2227	lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr);
2228	lck_attr_setdefault(&zone_gc_lck_attr);
2229	lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr);
2230
2231#if CONFIG_ZLEAKS
2232	/*
2233	 * Initialize the zone leak monitor
2234	 */
2235	zleak_init(max_zonemap_size);
2236#endif /* CONFIG_ZLEAKS */
2237}
2238
2239void
2240zone_page_table_expand(zone_page_index_t pindex)
2241{
2242	unsigned int first_index;
2243	struct zone_page_table_entry * volatile * first_level_ptr;
2244
2245	assert(pindex < zone_pages);
2246
2247	first_index = zone_page_table_first_level_slot(pindex);
2248	first_level_ptr = &zone_page_table[first_index];
2249
2250	if (*first_level_ptr == NULL) {
2251		/*
2252		 * We were able to verify the old first-level slot
2253		 * had NULL, so attempt to populate it.
2254		 */
2255
2256		vm_offset_t second_level_array = 0;
2257		vm_size_t second_level_size = round_page(zone_page_table_second_level_size * sizeof(struct zone_page_table_entry));
2258		zone_page_index_t i;
2259		struct zone_page_table_entry *entry_array;
2260
2261		if (kmem_alloc_kobject(zone_map, &second_level_array,
2262							   second_level_size) != KERN_SUCCESS) {
2263			panic("zone_page_table_expand");
2264		}
2265		zone_map_table_page_count += (second_level_size / PAGE_SIZE);
2266
2267		/*
2268		 * zone_gc() may scan the "zone_page_table" directly,
		 * so make sure all slots have a valid unused state.
2270		 */
2271		entry_array = (struct zone_page_table_entry *)second_level_array;
2272		for (i=0; i < zone_page_table_second_level_size; i++) {
2273			entry_array[i].alloc_count = ZONE_PAGE_UNUSED;
2274			entry_array[i].collect_count = 0;
2275		}
2276
2277		if (OSCompareAndSwapPtr(NULL, entry_array, first_level_ptr)) {
2278			/* Old slot was NULL, replaced with expanded level */
2279			OSAddAtomicLong(second_level_size, &zone_page_table_used_size);
2280		} else {
2281			/* Old slot was not NULL, someone else expanded first */
2282			kmem_free(zone_map, second_level_array, second_level_size);
2283			zone_map_table_page_count -= (second_level_size / PAGE_SIZE);
2284		}
2285	} else {
2286		/* Old slot was not NULL, already been expanded */
2287	}
2288}
2289
2290struct zone_page_table_entry *
2291zone_page_table_lookup(zone_page_index_t pindex)
2292{
2293	unsigned int first_index = zone_page_table_first_level_slot(pindex);
2294	struct zone_page_table_entry *second_level = zone_page_table[first_index];
2295
2296	if (second_level) {
2297		return &second_level[zone_page_table_second_level_slot(pindex)];
2298	}
2299
2300	return NULL;
2301}
2302
2303extern volatile SInt32 kfree_nop_count;
2304
2305#pragma mark -
2306#pragma mark zalloc_canblock
2307
2308/*
2309 *	zalloc returns an element from the specified zone.
2310 */
2311static void *
2312zalloc_internal(
2313	zone_t	zone,
2314	boolean_t canblock,
2315	boolean_t nopagewait)
2316{
2317	vm_offset_t	addr = 0;
2318	kern_return_t	retval;
2319	uintptr_t	zbt[MAX_ZTRACE_DEPTH];	/* used in zone leak logging and zone leak detection */
2320	int 		numsaved = 0;
2321	boolean_t	zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE;
2322#if	CONFIG_GZALLOC || ZONE_DEBUG
2323	boolean_t	did_gzalloc = FALSE;
2324#endif
2325	thread_t thr = current_thread();
2326	boolean_t       check_poison = FALSE;
2327
2328#if CONFIG_ZLEAKS
2329	uint32_t	zleak_tracedepth = 0;  /* log this allocation if nonzero */
2330#endif /* CONFIG_ZLEAKS */
2331
2332	assert(zone != ZONE_NULL);
2333
2334#if	CONFIG_GZALLOC
2335	addr = gzalloc_alloc(zone, canblock);
2336	did_gzalloc = (addr != 0);
2337#endif
2338
2339	/*
2340	 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
2341	 */
2342	if (__improbable(DO_LOGGING(zone)))
2343	        numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);
2344
2345#if CONFIG_ZLEAKS
2346	/*
2347	 * Zone leak detection: capture a backtrace every zleak_sample_factor
2348	 * allocations in this zone.
2349	 */
2350	if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
2351		/* Avoid backtracing twice if zone logging is on */
2352		if (numsaved == 0)
2353			zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH);
2354		else
2355			zleak_tracedepth = numsaved;
2356	}
2357#endif /* CONFIG_ZLEAKS */
2358
2359	lock_zone(zone);
2360
2361	if (zone->async_prio_refill && zone->zone_replenish_thread) {
2362		    do {
2363			    vm_size_t zfreec = (zone->cur_size - (zone->count * zone->elem_size));
2364			    vm_size_t zrefillwm = zone->prio_refill_watermark * zone->elem_size;
2365			    zone_replenish_wakeup = (zfreec < zrefillwm);
2366			    zone_alloc_throttle = (zfreec < (zrefillwm / 2)) && ((thr->options & TH_OPT_VMPRIV) == 0);
2367
2368			    if (zone_replenish_wakeup) {
2369				    zone_replenish_wakeups_initiated++;
2370				    unlock_zone(zone);
2371				    /* Signal the potentially waiting
2372				     * refill thread.
2373				     */
2374				    thread_wakeup(&zone->zone_replenish_thread);
2375
2376				    /* Scheduling latencies etc. may prevent
2377				     * the refill thread from keeping up
2378				     * with demand. Throttle consumers
2379				     * when we fall below half the
2380				     * watermark, unless VM privileged
2381				     */
2382				    if (zone_alloc_throttle) {
2383					    zone_replenish_throttle_count++;
2384					    assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
2385					    thread_block(THREAD_CONTINUE_NULL);
2386				    }
2387				    lock_zone(zone);
2388			    }
2389		    } while (zone_alloc_throttle == TRUE);
2390	}
2391
2392	if (__probable(addr == 0))
2393		addr = try_alloc_from_zone(zone, &check_poison);
2394
2395
2396	while ((addr == 0) && canblock) {
2397		/*
2398 		 *	If nothing was there, try to get more
2399		 */
2400		if (zone->doing_alloc) {
2401			/*
2402			 *	Someone is allocating memory for this zone.
2403			 *	Wait for it to show up, then try again.
2404			 */
2405			zone->waiting = TRUE;
2406			zone_sleep(zone);
2407		} else if (zone->doing_gc) {
2408			/* zone_gc() is running. Since we need an element
2409			 * from the free list that is currently being
2410			 * collected, set the waiting bit and try to
2411			 * interrupt the GC process, and try again
2412			 * when we obtain the lock.
2413			 */
2414			zone->waiting = TRUE;
2415			zone_sleep(zone);
2416		} else {
2417			vm_offset_t space;
2418			vm_size_t alloc_size;
2419			int retry = 0;
2420
2421			if ((zone->cur_size + zone->elem_size) >
2422			    zone->max_size) {
2423				if (zone->exhaustible)
2424					break;
2425				if (zone->expandable) {
2426					/*
2427					 * We're willing to overflow certain
2428					 * zones, but not without complaining.
2429					 *
2430					 * This is best used in conjunction
2431					 * with the collectable flag. What we
2432					 * want is an assurance we can get the
2433					 * memory back, assuming there's no
2434					 * leak.
2435					 */
2436					zone->max_size += (zone->max_size >> 1);
2437				} else {
2438					unlock_zone(zone);
2439
2440					panic_include_zprint = TRUE;
2441#if CONFIG_ZLEAKS
2442					if (zleak_state & ZLEAK_STATE_ACTIVE)
2443						panic_include_ztrace = TRUE;
2444#endif /* CONFIG_ZLEAKS */
2445					panic("zalloc: zone \"%s\" empty.", zone->zone_name);
2446				}
2447			}
2448			zone->doing_alloc = TRUE;
2449			unlock_zone(zone);
2450
2451			for (;;) {
2452				int	zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
2453
2454				if (vm_pool_low() || retry >= 1)
2455					alloc_size =
2456						round_page(zone->elem_size);
2457				else
2458					alloc_size = zone->alloc_size;
2459
2460				if (zone->noencrypt)
2461					zflags |= KMA_NOENCRYPT;
2462
2463				retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags);
2464				if (retval == KERN_SUCCESS) {
2465#if	ZONE_ALIAS_ADDR
2466					if (alloc_size == PAGE_SIZE)
2467						space = zone_alias_addr(space);
2468#endif
2469
2470#if CONFIG_ZLEAKS
2471					if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
2472						if (zone_map->size >= zleak_global_tracking_threshold) {
2473							kern_return_t kr;
2474
2475							kr = zleak_activate();
2476							if (kr != KERN_SUCCESS) {
2477								printf("Failed to activate live zone leak debugging (%d).\n", kr);
2478							}
2479						}
2480					}
2481
2482					if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
2483						if (zone->cur_size > zleak_per_zone_tracking_threshold) {
2484							zone->zleak_on = TRUE;
2485						}
2486					}
2487#endif /* CONFIG_ZLEAKS */
2488					ZONE_PAGE_COUNT_INCR(zone, (alloc_size / PAGE_SIZE));
2489					zcram(zone, space, alloc_size);
2490
2491					break;
2492				} else if (retval != KERN_RESOURCE_SHORTAGE) {
2493					retry++;
2494
2495					if (retry == 2) {
2496						zone_gc(TRUE);
2497						printf("zalloc did gc\n");
2498						zone_display_zprint();
2499					}
2500					if (retry == 3) {
2501						panic_include_zprint = TRUE;
2502#if CONFIG_ZLEAKS
2503						if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
2504							panic_include_ztrace = TRUE;
2505						}
2506#endif /* CONFIG_ZLEAKS */
2507						if (retval == KERN_NO_SPACE) {
2508							zone_t zone_largest = zone_find_largest();
2509							panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)",
2510							zone->zone_name, zone_largest->zone_name,
2511							(unsigned long)zone_largest->cur_size, zone_largest->count);
2512
2513						}
2514						panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count);
2515					}
2516				} else {
2517					break;
2518				}
2519			}
2520			lock_zone(zone);
2521			zone->doing_alloc = FALSE;
2522			if (zone->waiting) {
2523				zone->waiting = FALSE;
2524				zone_wakeup(zone);
2525			}
2526			addr = try_alloc_from_zone(zone, &check_poison);
2527			if (addr == 0 &&
2528			    retval == KERN_RESOURCE_SHORTAGE) {
2529				if (nopagewait == TRUE)
2530					break;	/* out of the main while loop */
2531				unlock_zone(zone);
2532
2533				VM_PAGE_WAIT();
2534				lock_zone(zone);
2535			}
2536		}
2537		if (addr == 0)
2538			addr = try_alloc_from_zone(zone, &check_poison);
2539	}
2540
2541#if CONFIG_ZLEAKS
2542	/* Zone leak detection:
2543	 * If we're sampling this allocation, add it to the zleaks hash table.
2544	 */
2545	if (addr && zleak_tracedepth > 0)  {
2546		/* Sampling can fail if another sample is happening at the same time in a different zone. */
2547		if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
2548			/* If it failed, roll back the counter so we sample the next allocation instead. */
2549			zone->zleak_capture = zleak_sample_factor;
2550		}
2551	}
2552#endif /* CONFIG_ZLEAKS */
2553
2554
2555	if ((addr == 0) && (!canblock || nopagewait) && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
2556		zone->async_pending = TRUE;
2557		unlock_zone(zone);
2558		thread_call_enter(&call_async_alloc);
2559		lock_zone(zone);
2560		addr = try_alloc_from_zone(zone, &check_poison);
2561	}
2562
2563	/*
2564	 * See if we should be logging allocations in this zone.  Logging is rarely done except when a leak is
	 * suspected, so this code rarely executes.  We must do this while still holding the zone lock,
	 * since it protects the various log-related data structures.
2567	 */
2568
2569	if (__improbable(DO_LOGGING(zone) && addr)) {
2570		btlog_add_entry(zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
2571	}
2572
2573	vm_offset_t     inner_size = zone->elem_size;
2574
2575#if	ZONE_DEBUG
2576	if (!did_gzalloc && addr && zone_debug_enabled(zone)) {
2577		enqueue_tail(&zone->active_zones, (queue_entry_t)addr);
2578		addr += ZONE_DEBUG_OFFSET;
2579		inner_size -= ZONE_DEBUG_OFFSET;
2580	}
2581#endif
2582
2583	unlock_zone(zone);
2584
2585	if (__improbable(check_poison && addr)) {
2586		vm_offset_t *element_cursor  = ((vm_offset_t *) addr) + 1;
2587		vm_offset_t *backup  = get_backup_ptr(inner_size, (vm_offset_t *) addr);
2588
2589		for ( ; element_cursor < backup ; element_cursor++)
2590			if (__improbable(*element_cursor != ZP_POISON))
2591				zone_element_was_modified_panic(zone,
2592				                                addr,
2593				                                *element_cursor,
2594				                                ZP_POISON,
2595				                                ((vm_offset_t)element_cursor) - addr);
2596	}
2597
2598	if (addr) {
2599		/*
2600		 * Clear out the old next pointer and backup to avoid leaking the cookie
2601		 * and so that only values on the freelist have a valid cookie
2602		 */
2603
2604		vm_offset_t *primary  = (vm_offset_t *) addr;
2605		vm_offset_t *backup   = get_backup_ptr(inner_size, primary);
2606
2607		*primary = ZP_POISON;
2608		*backup  = ZP_POISON;
2609	}
2610
2611	TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
2612
2613	if (addr) {
2614		task_t task;
2615		zinfo_usage_t zinfo;
2616		vm_size_t sz = zone->elem_size;
2617
2618		if (zone->caller_acct)
2619			ledger_credit(thr->t_ledger, task_ledgers.tkm_private, sz);
2620		else
2621			ledger_credit(thr->t_ledger, task_ledgers.tkm_shared, sz);
2622
2623		if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
2624			OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].alloc);
2625	}
2626	return((void *)addr);
2627}
2628
2629
2630void *
2631zalloc(zone_t zone)
2632{
2633	return (zalloc_internal(zone, TRUE, FALSE));
2634}
2635
2636void *
2637zalloc_noblock(zone_t zone)
2638{
2639	return (zalloc_internal(zone, FALSE, FALSE));
2640}
2641
2642void *
2643zalloc_nopagewait(zone_t zone)
2644{
2645	return (zalloc_internal(zone, TRUE, TRUE));
2646}
2647
2648void *
2649zalloc_canblock(zone_t zone, boolean_t canblock)
2650{
2651	return (zalloc_internal(zone, canblock, FALSE));
2652}
2653
2654
2655void
2656zalloc_async(
2657	__unused thread_call_param_t          p0,
2658	__unused thread_call_param_t p1)
2659{
2660	zone_t current_z = NULL, head_z;
2661	unsigned int max_zones, i;
2662	void *elt = NULL;
2663	boolean_t pending = FALSE;
2664
2665	simple_lock(&all_zones_lock);
2666	head_z = first_zone;
2667	max_zones = num_zones;
2668	simple_unlock(&all_zones_lock);
2669	current_z = head_z;
2670	for (i = 0; i < max_zones; i++) {
2671		lock_zone(current_z);
2672		if (current_z->async_pending == TRUE) {
2673			current_z->async_pending = FALSE;
2674			pending = TRUE;
2675		}
2676		unlock_zone(current_z);
2677
2678		if (pending == TRUE) {
2679			elt = zalloc_canblock(current_z, TRUE);
2680			zfree(current_z, elt);
2681			pending = FALSE;
2682		}
2683		/*
2684		 * This is based on assumption that zones never get
2685		 * freed once allocated and linked.
2686		 * Hence a read outside of lock is OK.
2687		 */
2688		current_z = current_z->next_zone;
2689	}
2690}
2691
2692/*
2693 *	zget returns an element from the specified zone
2694 *	and immediately returns nothing if there is nothing there.
2695 *
2696 *	This form should be used when you can not block (like when
2697 *	processing an interrupt).
2698 *
2699 *	XXX: It seems like only vm_page_grab_fictitious_common uses this, and its
2700 *  friend vm_page_more_fictitious can block, so it doesn't seem like
2701 *  this is used for interrupts any more....
2702 */
2703void *
2704zget(
2705	register zone_t	zone)
2706{
2707	vm_offset_t	addr;
2708	boolean_t       check_poison = FALSE;
2709
2710#if CONFIG_ZLEAKS
2711	uintptr_t	zbt[MAX_ZTRACE_DEPTH];		/* used for zone leak detection */
2712	uint32_t	zleak_tracedepth = 0;  /* log this allocation if nonzero */
2713#endif /* CONFIG_ZLEAKS */
2714
2715	assert( zone != ZONE_NULL );
2716
2717#if CONFIG_ZLEAKS
2718	/*
2719	 * Zone leak detection: capture a backtrace
2720	 */
2721	if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
2722		zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH);
2723	}
2724#endif /* CONFIG_ZLEAKS */
2725
2726	if (!lock_try_zone(zone))
2727		return NULL;
2728
2729	addr = try_alloc_from_zone(zone, &check_poison);
2730
2731	vm_offset_t     inner_size = zone->elem_size;
2732
2733#if	ZONE_DEBUG
2734	if (addr && zone_debug_enabled(zone)) {
2735		enqueue_tail(&zone->active_zones, (queue_entry_t)addr);
2736		addr += ZONE_DEBUG_OFFSET;
2737		inner_size -= ZONE_DEBUG_OFFSET;
2738	}
2739#endif	/* ZONE_DEBUG */
2740
2741#if CONFIG_ZLEAKS
2742	/*
2743	 * Zone leak detection: record the allocation
2744	 */
2745	if (zone->zleak_on && zleak_tracedepth > 0 && addr) {
2746		/* Sampling can fail if another sample is happening at the same time in a different zone. */
2747		if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
2748			/* If it failed, roll back the counter so we sample the next allocation instead. */
2749			zone->zleak_capture = zleak_sample_factor;
2750		}
2751	}
2752#endif /* CONFIG_ZLEAKS */
2753
2754	unlock_zone(zone);
2755
2756	if (__improbable(check_poison && addr)) {
2757		vm_offset_t *element_cursor  = ((vm_offset_t *) addr) + 1;
2758		vm_offset_t *backup  = get_backup_ptr(inner_size, (vm_offset_t *) addr);
2759
2760		for ( ; element_cursor < backup ; element_cursor++)
2761			if (__improbable(*element_cursor != ZP_POISON))
2762				zone_element_was_modified_panic(zone,
2763				                                addr,
2764				                                *element_cursor,
2765				                                ZP_POISON,
2766				                                ((vm_offset_t)element_cursor) - addr);
2767	}
2768
2769	if (addr) {
2770		/*
2771		 * Clear out the old next pointer and backup to avoid leaking the cookie
2772		 * and so that only values on the freelist have a valid cookie
2773		 */
2774		vm_offset_t *primary  = (vm_offset_t *) addr;
2775		vm_offset_t *backup   = get_backup_ptr(inner_size, primary);
2776
2777		*primary = ZP_POISON;
2778		*backup  = ZP_POISON;
2779	}
2780
2781	return((void *) addr);
2782}
2783
/* Keep this FALSE by default.  Large-memory machines run orders of magnitude
   slower in debug mode when true.  Use the debugger to enable if needed. */
2786/* static */ boolean_t zone_check = FALSE;
2787
2788static void zone_check_freelist(zone_t zone, vm_offset_t elem)
2789{
2790	struct zone_free_element *this;
2791	struct zone_page_metadata *thispage;
2792
2793	if (zone->use_page_list) {
2794		if (zone->allows_foreign) {
2795			for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
2796				 !queue_end(&zone->pages.any_free_foreign, (queue_entry_t)thispage);
2797				 thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
2798				for (this = thispage->elements;
2799					 this != NULL;
2800					 this = this->next) {
2801					if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
2802						panic("zone_check_freelist");
2803				}
2804			}
2805		}
2806		for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
2807			 !queue_end(&zone->pages.all_free, (queue_entry_t)thispage);
2808			 thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
2809			for (this = thispage->elements;
2810				 this != NULL;
2811				 this = this->next) {
2812				if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
2813					panic("zone_check_freelist");
2814			}
2815		}
2816		for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
2817			 !queue_end(&zone->pages.intermediate, (queue_entry_t)thispage);
2818			 thispage = (struct zone_page_metadata *)queue_next((queue_chain_t *)thispage)) {
2819			for (this = thispage->elements;
2820				 this != NULL;
2821				 this = this->next) {
2822				if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
2823					panic("zone_check_freelist");
2824			}
2825		}
2826	} else {
2827		for (this = zone->free_elements;
2828			 this != NULL;
2829			 this = this->next) {
2830			if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem)
2831				panic("zone_check_freelist");
2832		}
2833	}
2834}
2835
2836static zone_t zone_last_bogus_zone = ZONE_NULL;
2837static vm_offset_t zone_last_bogus_elem = 0;
2838
2839void
2840zfree(
2841	register zone_t	zone,
2842	void 		*addr)
2843{
2844	vm_offset_t	elem = (vm_offset_t) addr;
2845	uintptr_t	zbt[MAX_ZTRACE_DEPTH];			/* only used if zone logging is enabled via boot-args */
2846	int		numsaved = 0;
2847	boolean_t	gzfreed = FALSE;
2848	boolean_t       poison = FALSE;
2849
2850	assert(zone != ZONE_NULL);
2851
2852#if 1
2853	if (zone->use_page_list) {
2854		struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr);
2855		if (zone != page_meta->zone) {
2856			/*
2857			 * Something bad has happened. Someone tried to zfree a pointer but the metadata says it is from
2858			 * a different zone (or maybe it's from a zone that doesn't use page free lists at all). We can repair
2859			 * some cases of this, if:
2860			 * 1) The specified zone had use_page_list, and the true zone also has use_page_list set. In that case
2861			 *    we can swap the zone_t
2862			 * 2) The specified zone had use_page_list, but the true zone does not. In this case page_meta is garbage,
2863			 *    and dereferencing page_meta->zone might panic.
2864			 * To distinguish the two, we enumerate the zone list to match it up.
2865			 * We do not handle the case where an incorrect zone is passed that does not have use_page_list set,
2866			 * even if the true zone did have this set.
2867			 */
2868			zone_t fixed_zone = NULL;
2869			int fixed_i, max_zones;
2870
2871			simple_lock(&all_zones_lock);
2872			max_zones = num_zones;
2873			fixed_zone = first_zone;
2874			simple_unlock(&all_zones_lock);
2875
2876			for (fixed_i=0; fixed_i < max_zones; fixed_i++, fixed_zone = fixed_zone->next_zone) {
2877				if (fixed_zone == page_meta->zone && fixed_zone->use_page_list) {
2878					/* we can fix this */
2879					printf("Fixing incorrect zfree from zone %s to zone %s\n", zone->zone_name, fixed_zone->zone_name);
2880					zone = fixed_zone;
2881					break;
2882				}
2883			}
2884		}
2885	}
2886#endif
2887
2888	/*
2889	 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
2890	 */
2891
2892	if (__improbable(DO_LOGGING(zone) && corruption_debug_flag))
2893		numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH);
2894
2895#if MACH_ASSERT
2896	/* Basic sanity checks */
2897	if (zone == ZONE_NULL || elem == (vm_offset_t)0)
2898		panic("zfree: NULL");
2899	/* zone_gc assumes zones are never freed */
2900	if (zone == zone_zone)
2901		panic("zfree: freeing to zone_zone breaks zone_gc!");
2902#endif
2903
2904#if	CONFIG_GZALLOC
2905	gzfreed = gzalloc_free(zone, addr);
2906#endif
2907
2908	TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);
2909
2910	if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign &&
2911		!from_zone_map(elem, zone->elem_size))) {
2912#if MACH_ASSERT
2913		panic("zfree: non-allocated memory in collectable zone!");
2914#endif
2915		zone_last_bogus_zone = zone;
2916		zone_last_bogus_elem = elem;
2917		return;
2918	}
2919
2920	if ((zp_factor != 0 || zp_tiny_zone_limit != 0) && !gzfreed) {
2921		/*
2922		 * Poison the memory before it ends up on the freelist to catch
2923		 * use-after-free and use of uninitialized memory
2924		 *
2925		 * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
2926		 * Also poison larger elements periodically
2927		 */
2928
2929		vm_offset_t     inner_size = zone->elem_size;
2930
2931#if	ZONE_DEBUG
2932		if (!gzfreed && zone_debug_enabled(zone)) {
2933			inner_size -= ZONE_DEBUG_OFFSET;
2934		}
2935#endif
2936		uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
2937
2938		if (inner_size <= zp_tiny_zone_limit)
2939			poison = TRUE;
2940		else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE)
2941			poison = TRUE;
2942
2943		if (__improbable(poison)) {
2944
2945			/* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
2946			/* Poison everything but primary and backup */
2947			vm_offset_t *element_cursor  = ((vm_offset_t *) elem) + 1;
2948			vm_offset_t *backup   = get_backup_ptr(inner_size, (vm_offset_t *)elem);
2949
2950			for ( ; element_cursor < backup; element_cursor++)
2951				*element_cursor = ZP_POISON;
2952		}
2953	}
2954
2955	lock_zone(zone);
2956
2957	/*
2958	 * See if we're doing logging on this zone.  There are two styles of logging used depending on
2959	 * whether we're trying to catch a leak or corruption.  See comments above in zalloc for details.
2960	 */
2961
2962	if (__improbable(DO_LOGGING(zone))) {
2963		if (corruption_debug_flag) {
2964			/*
2965			 * We're logging to catch a corruption.  Add a record of this zfree operation
2966			 * to log.
2967			 */
2968			btlog_add_entry(zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved);
2969		} else {
2970			/*
2971			 * We're logging to catch a leak. Remove any record we might have for this
2972			 * element since it's being freed.  Note that we may not find it if the buffer
2973			 * overflowed and that's OK.  Since the log is of a limited size, old records
2974			 * get overwritten if there are more zallocs than zfrees.
2975			 */
2976			btlog_remove_entries_for_element(zlog_btlog, (void *)addr);
2977		}
2978	}
2979
2980#if	ZONE_DEBUG
2981	if (!gzfreed && zone_debug_enabled(zone)) {
2982		queue_t tmp_elem;
2983
2984		elem -= ZONE_DEBUG_OFFSET;
2985		if (zone_check) {
2986			/* check the zone's consistency */
2987
2988			for (tmp_elem = queue_first(&zone->active_zones);
2989			     !queue_end(tmp_elem, &zone->active_zones);
2990			     tmp_elem = queue_next(tmp_elem))
2991				if (elem == (vm_offset_t)tmp_elem)
2992					break;
2993			if (elem != (vm_offset_t)tmp_elem)
2994				panic("zfree()ing element from wrong zone");
2995		}
2996		remqueue((queue_t) elem);
2997	}
2998#endif	/* ZONE_DEBUG */
2999	if (zone_check) {
3000		zone_check_freelist(zone, elem);
3001	}
3002
3003	if (__probable(!gzfreed))
3004		free_to_zone(zone, elem, poison);
3005
3006#if MACH_ASSERT
3007	if (zone->count < 0)
3008		panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
3009		zone->zone_name, addr);
3010#endif
3011
3012
3013#if CONFIG_ZLEAKS
3014	/*
3015	 * Zone leak detection: un-track the allocation
3016	 */
3017	if (zone->zleak_on) {
3018		zleak_free(elem, zone->elem_size);
3019	}
3020#endif /* CONFIG_ZLEAKS */
3021
3022	/*
	 * If elements are at least a page in size and memory is low,
	 * request a zone garbage collection the next
3025	 * time the pageout thread runs.
3026	 */
3027	if (zone->elem_size >= PAGE_SIZE &&
3028	    vm_pool_low()){
3029		zone_gc_forced = TRUE;
3030	}
3031	unlock_zone(zone);
3032
3033	{
3034		thread_t thr = current_thread();
3035		task_t task;
3036		zinfo_usage_t zinfo;
3037		vm_size_t sz = zone->elem_size;
3038
3039		if (zone->caller_acct)
3040			ledger_debit(thr->t_ledger, task_ledgers.tkm_private, sz);
3041		else
3042			ledger_debit(thr->t_ledger, task_ledgers.tkm_shared, sz);
3043
3044		if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
3045			OSAddAtomic64(sz, (int64_t *)&zinfo[zone->index].free);
3046	}
3047}
3048
3049
3050/*	Change a zone's flags.
3051 *	This routine must be called immediately after zinit.
3052 */
3053void
3054zone_change(
3055	zone_t		zone,
3056	unsigned int	item,
3057	boolean_t	value)
3058{
3059	assert( zone != ZONE_NULL );
3060	assert( value == TRUE || value == FALSE );
3061
3062	switch(item){
3063	        case Z_NOENCRYPT:
3064			zone->noencrypt = value;
3065			break;
3066		case Z_EXHAUST:
3067			zone->exhaustible = value;
3068			break;
3069		case Z_COLLECT:
3070			zone->collectable = value;
3071			break;
3072		case Z_EXPAND:
3073			zone->expandable = value;
3074			break;
3075		case Z_FOREIGN:
3076			zone->allows_foreign = value;
3077			break;
3078		case Z_CALLERACCT:
3079			zone->caller_acct = value;
3080			break;
3081		case Z_NOCALLOUT:
3082			zone->no_callout = value;
3083			break;
3084		case Z_GZALLOC_EXEMPT:
3085			zone->gzalloc_exempt = value;
3086#if	CONFIG_GZALLOC
3087			gzalloc_reconfigure(zone);
3088#endif
3089			break;
3090		case Z_ALIGNMENT_REQUIRED:
3091			zone->alignment_required = value;
3092#if	ZONE_DEBUG
3093			zone_debug_disable(zone);
3094#endif
3095#if	CONFIG_GZALLOC
3096			gzalloc_reconfigure(zone);
3097#endif
3098			break;
3099		default:
3100			panic("Zone_change: Wrong Item Type!");
3101			/* break; */
3102	}
3103}
3104
3105/*
3106 * Return the expected number of free elements in the zone.
3107 * This calculation will be incorrect if items are zfree'd that
3108 * were never zalloc'd/zget'd. The correct way to stuff memory
3109 * into a zone is by zcram.
3110 */
3111
3112integer_t
3113zone_free_count(zone_t zone)
3114{
3115	integer_t free_count;
3116
3117	lock_zone(zone);
3118	free_count = zone->countfree;
3119	unlock_zone(zone);
3120
3121	assert(free_count >= 0);
3122
3123	return(free_count);
3124}
3125
3126/*
3127 *  Zone garbage collection subroutines
3128 */
3129
3130boolean_t
3131zone_page_collectable(
3132	vm_offset_t	addr,
3133	vm_size_t	size)
3134{
3135	struct zone_page_table_entry	*zp;
3136	zone_page_index_t i, j;
3137
3138#if	ZONE_ALIAS_ADDR
3139	addr = zone_virtual_addr(addr);
3140#endif
3141#if MACH_ASSERT
3142	if (!from_zone_map(addr, size))
3143		panic("zone_page_collectable");
3144#endif
3145
3146	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
3147	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
3148
3149	for (; i <= j; i++) {
3150		zp = zone_page_table_lookup(i);
3151		if (zp->collect_count == zp->alloc_count)
3152			return (TRUE);
3153	}
3154
3155	return (FALSE);
3156}
3157
3158void
3159zone_page_keep(
3160	vm_offset_t	addr,
3161	vm_size_t	size)
3162{
3163	struct zone_page_table_entry	*zp;
3164	zone_page_index_t i, j;
3165
3166#if	ZONE_ALIAS_ADDR
3167	addr = zone_virtual_addr(addr);
3168#endif
3169#if MACH_ASSERT
3170	if (!from_zone_map(addr, size))
3171		panic("zone_page_keep");
3172#endif
3173
3174	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
3175	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
3176
3177	for (; i <= j; i++) {
3178		zp = zone_page_table_lookup(i);
3179		zp->collect_count = 0;
3180	}
3181}
3182
3183void
3184zone_page_collect(
3185	vm_offset_t	addr,
3186	vm_size_t	size)
3187{
3188	struct zone_page_table_entry	*zp;
3189	zone_page_index_t i, j;
3190
3191#if	ZONE_ALIAS_ADDR
3192	addr = zone_virtual_addr(addr);
3193#endif
3194#if MACH_ASSERT
3195	if (!from_zone_map(addr, size))
3196		panic("zone_page_collect");
3197#endif
3198
3199	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
3200	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
3201
3202	for (; i <= j; i++) {
3203		zp = zone_page_table_lookup(i);
3204		++zp->collect_count;
3205	}
3206}
3207
3208void
3209zone_page_init(
3210	vm_offset_t	addr,
3211	vm_size_t	size)
3212{
3213	struct zone_page_table_entry	*zp;
3214	zone_page_index_t i, j;
3215
3216#if	ZONE_ALIAS_ADDR
3217	addr = zone_virtual_addr(addr);
3218#endif
3219#if MACH_ASSERT
3220	if (!from_zone_map(addr, size))
3221		panic("zone_page_init");
3222#endif
3223
3224	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
3225	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
3226
3227	for (; i <= j; i++) {
3228		/* make sure entry exists before marking unused */
3229		zone_page_table_expand(i);
3230
3231		zp = zone_page_table_lookup(i);
3232		assert(zp);
3233		zp->alloc_count = ZONE_PAGE_UNUSED;
3234		zp->collect_count = 0;
3235	}
3236}
3237
3238void
3239zone_page_alloc(
3240	vm_offset_t	addr,
3241	vm_size_t	size)
3242{
3243	struct zone_page_table_entry	*zp;
3244	zone_page_index_t i, j;
3245
3246#if	ZONE_ALIAS_ADDR
3247	addr = zone_virtual_addr(addr);
3248#endif
3249#if MACH_ASSERT
3250	if (!from_zone_map(addr, size))
3251		panic("zone_page_alloc");
3252#endif
3253
3254	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
3255	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
3256
3257	for (; i <= j; i++) {
3258		zp = zone_page_table_lookup(i);
3259		assert(zp);
3260
3261		/*
3262		 * Set alloc_count to ZONE_PAGE_USED if
3263		 * it was previously set to ZONE_PAGE_UNUSED.
3264		 */
3265		if (zp->alloc_count == ZONE_PAGE_UNUSED)
3266			zp->alloc_count = ZONE_PAGE_USED;
3267
3268		++zp->alloc_count;
3269	}
3270}
3271
3272void
3273zone_page_free_element(
3274	zone_page_index_t	*free_page_head,
3275	zone_page_index_t	*free_page_tail,
3276	vm_offset_t	addr,
3277	vm_size_t	size)
3278{
3279	struct zone_page_table_entry	*zp;
3280	zone_page_index_t i, j;
3281
3282#if	ZONE_ALIAS_ADDR
3283	addr = zone_virtual_addr(addr);
3284#endif
3285#if MACH_ASSERT
3286	if (!from_zone_map(addr, size))
3287		panic("zone_page_free_element");
3288#endif
3289
3290	/* Clear out the old next and backup pointers */
3291	vm_offset_t *primary  = (vm_offset_t *) addr;
3292	vm_offset_t *backup   = get_backup_ptr(size, primary);
3293
3294	*primary = ZP_POISON;
3295	*backup  = ZP_POISON;
3296
3297	i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
3298	j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
3299
3300	for (; i <= j; i++) {
3301		zp = zone_page_table_lookup(i);
3302
3303		if (zp->collect_count > 0)
3304			--zp->collect_count;
3305		if (--zp->alloc_count == 0) {
3306			vm_address_t        free_page_address;
3307			vm_address_t        prev_free_page_address;
3308
3309			zp->alloc_count  = ZONE_PAGE_UNUSED;
3310			zp->collect_count = 0;
3311
3312
3313			/*
			 * This element was the last one on this page; re-use the page's
3315			 * storage for a page freelist
3316			 */
3317			free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)i);
3318			*(zone_page_index_t *)free_page_address = ZONE_PAGE_INDEX_INVALID;
3319
3320			if (*free_page_head == ZONE_PAGE_INDEX_INVALID) {
3321				*free_page_head = i;
3322				*free_page_tail = i;
3323			} else {
3324				prev_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)(*free_page_tail));
3325				*(zone_page_index_t *)prev_free_page_address = i;
3326				*free_page_tail = i;
3327			}
3328		}
3329	}
3330}
3331
3332
3333
3334
3335struct {
3336	uint64_t	zgc_invoked;
3337	uint64_t	zgc_bailed;
3338	uint32_t	pgs_freed;
3339
3340	uint32_t	elems_collected,
3341				elems_freed,
3342				elems_kept;
3343} zgc_stats;
3344
3345/*	Zone garbage collection
3346 *
3347 *	zone_gc will walk through all the free elements in all the
3348 *	zones that are marked collectable looking for reclaimable
3349 *	pages.  zone_gc is called by consider_zone_gc when the system
3350 *	begins to run out of memory.
3351 */
3352void
3353zone_gc(boolean_t all_zones)
3354{
3355	unsigned int	max_zones;
3356	zone_t			z;
3357	unsigned int	i;
3358	uint32_t 	old_pgs_freed;
3359	zone_page_index_t zone_free_page_head;
3360	zone_page_index_t zone_free_page_tail;
3361	thread_t	mythread = current_thread();
3362
3363	lck_mtx_lock(&zone_gc_lock);
3364
3365	zgc_stats.zgc_invoked++;
3366	old_pgs_freed = zgc_stats.pgs_freed;
3367
3368	simple_lock(&all_zones_lock);
3369	max_zones = num_zones;
3370	z = first_zone;
3371	simple_unlock(&all_zones_lock);
3372
3373	if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
3374		kprintf("zone_gc(all_zones=%s) starting...\n", all_zones ? "TRUE" : "FALSE");
3375
3376	/*
3377	 * it's ok to allow eager kernel preemption while
3378	 * while holding a zone lock since it's taken
3379	 * as a spin lock (which prevents preemption)
3380	 */
3381	thread_set_eager_preempt(mythread);
3382
3383#if MACH_ASSERT
3384	for (i = 0; i < zone_pages; i++) {
3385		struct zone_page_table_entry	*zp;
3386
3387		zp = zone_page_table_lookup(i);
3388		assert(!zp || (zp->collect_count == 0));
3389	}
3390#endif /* MACH_ASSERT */
3391
3392	for (i = 0; i < max_zones; i++, z = z->next_zone) {
3393		unsigned int			n, m;
3394		vm_size_t			elt_size, size_freed;
3395		struct zone_free_element	*elt, *base_elt, *base_prev, *prev, *scan, *keep, *tail;
3396		int				kmem_frees = 0, total_freed_pages = 0;
3397		struct zone_page_metadata		*page_meta;
3398		queue_head_t	page_meta_head;
3399
3400		assert(z != ZONE_NULL);
3401
3402		if (!z->collectable)
3403			continue;
3404
3405		if (all_zones == FALSE && z->elem_size < PAGE_SIZE && !z->use_page_list)
3406			continue;
3407
3408		lock_zone(z);
3409
3410		elt_size = z->elem_size;
3411
3412		/*
3413		 * Do a quick feasibility check before we scan the zone:
3414		 * skip unless there is likelihood of getting pages back
3415		 * (i.e we need a whole allocation block's worth of free
3416		 * elements before we can garbage collect) and
3417		 * the zone has more than 10 percent of it's elements free
3418		 * or the element size is a multiple of the PAGE_SIZE
3419		 */
3420		if ((elt_size & PAGE_MASK) &&
3421		    !z->use_page_list &&
3422		     (((z->cur_size - z->count * elt_size) <= (2 * z->alloc_size)) ||
3423		      ((z->cur_size - z->count * elt_size) <= (z->cur_size / 10)))) {
3424			unlock_zone(z);
3425			continue;
3426		}
3427
3428		z->doing_gc = TRUE;
3429
3430		/*
3431		 * Snatch all of the free elements away from the zone.
3432		 */
3433
3434		if (z->use_page_list) {
3435			queue_new_head(&z->pages.all_free, &page_meta_head, struct zone_page_metadata *, pages);
3436			queue_init(&z->pages.all_free);
3437		} else {
3438			scan = (void *)z->free_elements;
3439			z->free_elements = 0;
3440		}
3441
3442		unlock_zone(z);
3443
3444		if (z->use_page_list) {
3445			/*
3446			 * For zones that maintain page lists (which in turn
3447			 * track free elements on those pages), zone_gc()
3448			 * is incredibly easy, and we bypass all the logic
3449			 * for scanning elements and mapping them to
3450			 * collectable pages
3451			 */
3452
3453			size_freed = 0;
3454
3455			queue_iterate(&page_meta_head, page_meta, struct zone_page_metadata *, pages) {
3456				assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */
3457
3458				zgc_stats.elems_freed += page_meta->free_count;
3459				size_freed += elt_size * page_meta->free_count;
3460				zgc_stats.elems_collected += page_meta->free_count;
3461			}
3462
3463			lock_zone(z);
3464
3465			if (size_freed > 0) {
3466				z->cur_size -= size_freed;
3467				z->countfree -= size_freed/elt_size;
3468			}
3469
3470			z->doing_gc = FALSE;
3471			if (z->waiting) {
3472				z->waiting = FALSE;
3473				zone_wakeup(z);
3474			}
3475
3476			unlock_zone(z);
3477
3478			if (queue_empty(&page_meta_head))
3479				continue;
3480
3481			thread_clear_eager_preempt(mythread);
3482
3483			while ((page_meta = (struct zone_page_metadata *)dequeue_head(&page_meta_head)) != NULL) {
3484				vm_address_t		free_page_address;
3485
3486				free_page_address = trunc_page((vm_address_t)page_meta);
3487#if	ZONE_ALIAS_ADDR
3488				free_page_address = zone_virtual_addr(free_page_address);
3489#endif
3490				kmem_free(zone_map, free_page_address, PAGE_SIZE);
3491				ZONE_PAGE_COUNT_DECR(z, 1);
3492				total_freed_pages++;
3493				zgc_stats.pgs_freed += 1;
3494
3495				if (++kmem_frees == 32) {
3496					thread_yield_internal(1);
3497					kmem_frees = 0;
3498				}
3499			}
3500
3501			if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
3502				kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages);
3503
3504			thread_set_eager_preempt(mythread);
3505			continue; /* go to next zone */
3506		}
3507
3508		/*
3509		 * Pass 1:
3510		 *
3511		 * Determine which elements we can attempt to collect
3512		 * and count them up in the page table.  Foreign elements
3513		 * are returned to the zone.
3514		 */
3515
3516		prev = (void *)&scan;
3517		elt = scan;
3518		n = 0; tail = keep = NULL;
3519
3520		zone_free_page_head = ZONE_PAGE_INDEX_INVALID;
3521		zone_free_page_tail = ZONE_PAGE_INDEX_INVALID;
3522
3523
3524		while (elt != NULL) {
3525			if (from_zone_map(elt, elt_size)) {
3526				zone_page_collect((vm_offset_t)elt, elt_size);
3527
3528				prev = elt;
3529				elt = elt->next;
3530
3531				++zgc_stats.elems_collected;
3532			}
3533			else {
3534				if (keep == NULL)
3535					keep = tail = elt;
3536				else {
3537					append_zone_element(z, tail, elt);
3538					tail = elt;
3539				}
3540
3541				append_zone_element(z, prev, elt->next);
3542				elt = elt->next;
3543				append_zone_element(z, tail, NULL);
3544			}
3545
3546			/*
3547			 * Dribble back the elements we are keeping.
3548			 * If there are none, give some elements that we haven't looked at yet
3549			 * back to the freelist so that others waiting on the zone don't get stuck
3550			 * for too long.  This might prevent us from recovering some memory,
3551			 * but allows us to avoid having to allocate new memory to serve requests
3552			 * while zone_gc has all the free memory tied up.
3553			 * <rdar://problem/3893406>
3554			 */
3555
3556			if (++n >= 50) {
3557				if (z->waiting == TRUE) {
3558					/* z->waiting checked without lock held, rechecked below after locking */
3559					lock_zone(z);
3560
3561					if (keep != NULL) {
3562						add_list_to_zone(z, keep, tail);
3563						tail = keep = NULL;
3564					} else {
						m = 0;
3566						base_elt = elt;
3567						base_prev = prev;
3568						while ((elt != NULL) && (++m < 50)) {
3569							prev = elt;
3570							elt = elt->next;
3571						}
						if (m != 0) {
3573							/* Extract the elements from the list and
3574							 * give them back */
3575							append_zone_element(z, prev, NULL);
3576							add_list_to_zone(z, base_elt, prev);
3577							append_zone_element(z, base_prev, elt);
3578							prev = base_prev;
3579						}
3580					}
3581
3582					if (z->waiting) {
3583						z->waiting = FALSE;
3584						zone_wakeup(z);
3585					}
3586
3587					unlock_zone(z);
3588				}
				n = 0;
3590			}
3591		}
3592
3593		/*
3594		 * Return any remaining elements.
3595		 */
3596
3597		if (keep != NULL) {
3598			lock_zone(z);
3599
3600			add_list_to_zone(z, keep, tail);
3601
3602			if (z->waiting) {
3603				z->waiting = FALSE;
3604				zone_wakeup(z);
3605			}
3606
3607			unlock_zone(z);
3608		}
3609
3610		/*
3611		 * Pass 2:
3612		 *
3613		 * Determine which pages we can reclaim and
3614		 * free those elements.
3615		 */
3616
3617		size_freed = 0;
3618		elt = scan;
3619		n = 0; tail = keep = NULL;
3620
3621		while (elt != NULL) {
3622			if (zone_page_collectable((vm_offset_t)elt, elt_size)) {
3623				struct zone_free_element *next_elt = elt->next;
3624
3625				size_freed += elt_size;
3626
3627				/*
3628				 * If this is the last allocation on the page(s),
3629				 * we may use their storage to maintain the linked
3630				 * list of free-able pages. So store elt->next because
3631				 * "elt" may be scribbled over.
3632				 */
3633				zone_page_free_element(&zone_free_page_head, &zone_free_page_tail, (vm_offset_t)elt, elt_size);
3634
3635				elt = next_elt;
3636
3637				++zgc_stats.elems_freed;
3638			}
3639			else {
3640				zone_page_keep((vm_offset_t)elt, elt_size);
3641
3642				if (keep == NULL)
3643					keep = tail = elt;
3644				else {
3645					append_zone_element(z, tail, elt);
3646					tail = elt;
3647				}
3648
3649				elt = elt->next;
3650				append_zone_element(z, tail, NULL);
3651
3652				++zgc_stats.elems_kept;
3653			}
3654
3655			/*
3656			 * Dribble back the elements we are keeping,
3657			 * and update the zone size info.
3658			 */
3659
3660			if (++n >= 50) {
3661				lock_zone(z);
3662
3663				z->cur_size -= size_freed;
3664				z->countfree -= size_freed/elt_size;
3665				size_freed = 0;
3666
3667				if (keep != NULL) {
3668					add_list_to_zone(z, keep, tail);
3669				}
3670
3671				if (z->waiting) {
3672					z->waiting = FALSE;
3673					zone_wakeup(z);
3674				}
3675
3676				unlock_zone(z);
3677
3678				n = 0; tail = keep = NULL;
3679			}
3680		}
3681
3682		/*
3683		 * Return any remaining elements, and update
3684		 * the zone size info.
3685		 */
3686
3687		lock_zone(z);
3688
3689		if (size_freed > 0 || keep != NULL) {
3690
3691			z->cur_size -= size_freed;
3692			z->countfree -= size_freed/elt_size;
3693
3694			if (keep != NULL) {
3695				add_list_to_zone(z, keep, tail);
3696			}
3697
3698		}
3699
3700		z->doing_gc = FALSE;
3701		if (z->waiting) {
3702			z->waiting = FALSE;
3703			zone_wakeup(z);
3704		}
3705		unlock_zone(z);
3706
3707		if (zone_free_page_head == ZONE_PAGE_INDEX_INVALID)
3708			continue;
3709
3710		/*
3711		 * we don't want to allow eager kernel preemption while holding the
3712		 * various locks taken in the kmem_free path of execution
3713		 */
3714		thread_clear_eager_preempt(mythread);
3715
3716
3717		/*
3718		 * This loop counts the number of pages that should be freed by the
3719		 * next loop that tries to coalesce the kmem_frees()
3720		 */
3721		uint32_t pages_to_free_count = 0;
3722		vm_address_t		fpa;
3723		zone_page_index_t index;
3724		for (index = zone_free_page_head; index != ZONE_PAGE_INDEX_INVALID;) {
3725			pages_to_free_count++;
3726			fpa = zone_map_min_address + PAGE_SIZE * ((vm_size_t)index);
3727			index = *(zone_page_index_t *)fpa;
3728		}
3729
3730		/*
3731		 * Reclaim the pages we are freeing.
3732		 */
3733		while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) {
3734			zone_page_index_t	zind = zone_free_page_head;
3735			vm_address_t		free_page_address;
3736			int			page_count;
3737
3738			/*
3739			 * Use the first word of the page about to be freed to find the next free page
3740			 */
3741			free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)zind);
3742			zone_free_page_head = *(zone_page_index_t *)free_page_address;
3743
3744			page_count = 1;
3745			total_freed_pages++;
3746
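			/*
			 * Grow the run of pages handed to a single
			 * kmem_free(): absorb the next free page if it sits
			 * immediately below the base of the current run or
			 * immediately above its end; any other page ends the
			 * run and starts the next batch.
			 */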
3747			while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) {
3748				zone_page_index_t	next_zind = zone_free_page_head;
3749				vm_address_t		next_free_page_address;
3750
3751				next_free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)next_zind);
3752
3753				if (next_free_page_address == (free_page_address - PAGE_SIZE)) {
3754					free_page_address = next_free_page_address;
3755				} else if (next_free_page_address != (free_page_address + (PAGE_SIZE * page_count)))
3756					break;
3757
3758				zone_free_page_head = *(zone_page_index_t *)next_free_page_address;
3759				page_count++;
3760				total_freed_pages++;
3761			}
3762			kmem_free(zone_map, free_page_address, page_count * PAGE_SIZE);
3763			ZONE_PAGE_COUNT_DECR(z, page_count);
3764			zgc_stats.pgs_freed += page_count;
3765			pages_to_free_count -= page_count;
3766
3767			if (++kmem_frees == 32) {
3768				thread_yield_internal(1);
3769				kmem_frees = 0;
3770			}
3771		}
3772
		/* Check that we actually freed the exact number of pages we were supposed to */
3774		assert(pages_to_free_count == 0);
3775
3776		if (zalloc_debug & ZALLOC_DEBUG_ZONEGC)
3777			kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name, (unsigned long)size_freed/elt_size, total_freed_pages);
3778
3779		thread_set_eager_preempt(mythread);
3780	}
3781
3782	if (old_pgs_freed == zgc_stats.pgs_freed)
3783		zgc_stats.zgc_bailed++;
3784
3785	thread_clear_eager_preempt(mythread);
3786
3787	lck_mtx_unlock(&zone_gc_lock);
3788
3789}
3790
3791extern vm_offset_t kmapoff_kaddr;
3792extern unsigned int kmapoff_pgcnt;
3793
3794/*
3795 *	consider_zone_gc:
3796 *
3797 *	Called by the pageout daemon when the system needs more free pages.
3798 */
3799
3800void
3801consider_zone_gc(boolean_t force)
3802{
3803	boolean_t all_zones = FALSE;
3804
3805	if (kmapoff_kaddr != 0) {
3806		/*
3807		 * One-time reclaim of kernel_map resources we allocated in
3808		 * early boot.
3809		 */
3810		(void) vm_deallocate(kernel_map,
3811		    kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
3812		kmapoff_kaddr = 0;
3813	}
3814
3815	if (zone_gc_allowed &&
3816	    (zone_gc_allowed_by_time_throttle ||
3817	     zone_gc_forced ||
3818	     force)) {
3819		if (zone_gc_allowed_by_time_throttle == TRUE) {
3820			zone_gc_allowed_by_time_throttle = FALSE;
3821			all_zones = TRUE;
3822		}
3823		zone_gc_forced = FALSE;
3824
3825		zone_gc(all_zones);
3826	}
3827}
3828
3829/*
3830 *	By default, don't attempt zone GC more frequently
3831 *	than once / 1 minutes.
3832 */
3833void
3834compute_zone_gc_throttle(void *arg __unused)
3835{
3836	zone_gc_allowed_by_time_throttle = TRUE;
3837}
3838
3839
3840#if CONFIG_TASK_ZONE_INFO
3841
3842kern_return_t
3843task_zone_info(
3844	task_t			task,
3845	mach_zone_name_array_t	*namesp,
3846	mach_msg_type_number_t  *namesCntp,
3847	task_zone_info_array_t	*infop,
3848	mach_msg_type_number_t  *infoCntp)
3849{
3850	mach_zone_name_t	*names;
3851	vm_offset_t		names_addr;
3852	vm_size_t		names_size;
3853	task_zone_info_t	*info;
3854	vm_offset_t		info_addr;
3855	vm_size_t		info_size;
3856	unsigned int		max_zones, i;
3857	zone_t			z;
3858	mach_zone_name_t	*zn;
3859	task_zone_info_t    	*zi;
3860	kern_return_t		kr;
3861
3862	vm_size_t		used;
3863	vm_map_copy_t		copy;
3864
3865
3866	if (task == TASK_NULL)
3867		return KERN_INVALID_TASK;
3868
3869	/*
3870	 *	We assume that zones aren't freed once allocated.
3871	 *	We won't pick up any zones that are allocated later.
3872	 */
3873
3874	simple_lock(&all_zones_lock);
3875	max_zones = (unsigned int)(num_zones + num_fake_zones);
3876	z = first_zone;
3877	simple_unlock(&all_zones_lock);
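
	/*
	 * The two result arrays are built in pageable ipc_kernel_map memory,
	 * the round_page() slack is zeroed, and each buffer is handed to the
	 * caller as an out-of-line vm_map_copy_t via vm_map_copyin() with
	 * src_destroy == TRUE, so no explicit kmem_free() is needed on the
	 * success path.
	 */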
3878
3879	names_size = round_page(max_zones * sizeof *names);
3880	kr = kmem_alloc_pageable(ipc_kernel_map,
3881				 &names_addr, names_size);
3882	if (kr != KERN_SUCCESS)
3883		return kr;
3884	names = (mach_zone_name_t *) names_addr;
3885
3886	info_size = round_page(max_zones * sizeof *info);
3887	kr = kmem_alloc_pageable(ipc_kernel_map,
3888				 &info_addr, info_size);
3889	if (kr != KERN_SUCCESS) {
3890		kmem_free(ipc_kernel_map,
3891			  names_addr, names_size);
3892		return kr;
3893	}
3894
3895	info = (task_zone_info_t *) info_addr;
3896
3897	zn = &names[0];
3898	zi = &info[0];
3899
3900	for (i = 0; i < max_zones - num_fake_zones; i++) {
3901		struct zone zcopy;
3902
3903		assert(z != ZONE_NULL);
3904
3905		lock_zone(z);
3906		zcopy = *z;
3907		unlock_zone(z);
3908
3909		simple_lock(&all_zones_lock);
3910		z = z->next_zone;
3911		simple_unlock(&all_zones_lock);
3912
3913		/* assuming here the name data is static */
3914		(void) strncpy(zn->mzn_name, zcopy.zone_name,
3915			       sizeof zn->mzn_name);
3916		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
3917
3918		zi->tzi_count = (uint64_t)zcopy.count;
3919		zi->tzi_cur_size = (uint64_t)zcopy.cur_size;
3920		zi->tzi_max_size = (uint64_t)zcopy.max_size;
3921		zi->tzi_elem_size = (uint64_t)zcopy.elem_size;
3922		zi->tzi_alloc_size = (uint64_t)zcopy.alloc_size;
3923		zi->tzi_sum_size = zcopy.sum_count * zcopy.elem_size;
3924		zi->tzi_exhaustible = (uint64_t)zcopy.exhaustible;
3925		zi->tzi_collectable = (uint64_t)zcopy.collectable;
3926		zi->tzi_caller_acct = (uint64_t)zcopy.caller_acct;
3927		if (task->tkm_zinfo != NULL) {
3928			zi->tzi_task_alloc = task->tkm_zinfo[zcopy.index].alloc;
3929			zi->tzi_task_free = task->tkm_zinfo[zcopy.index].free;
3930		} else {
3931			zi->tzi_task_alloc = 0;
3932			zi->tzi_task_free = 0;
3933		}
3934		zn++;
3935		zi++;
3936	}
3937
3938	/*
3939	 * loop through the fake zones and fill them using the specialized
3940	 * functions
3941	 */
3942	for (i = 0; i < num_fake_zones; i++) {
3943		int count, collectable, exhaustible, caller_acct, index;
3944		vm_size_t cur_size, max_size, elem_size, alloc_size;
3945		uint64_t sum_size;
3946
3947		strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
3948		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
3949		fake_zones[i].query(&count, &cur_size,
3950				    &max_size, &elem_size,
3951				    &alloc_size, &sum_size,
3952				    &collectable, &exhaustible, &caller_acct);
3953		zi->tzi_count = (uint64_t)count;
3954		zi->tzi_cur_size = (uint64_t)cur_size;
3955		zi->tzi_max_size = (uint64_t)max_size;
3956		zi->tzi_elem_size = (uint64_t)elem_size;
3957		zi->tzi_alloc_size = (uint64_t)alloc_size;
3958		zi->tzi_sum_size = sum_size;
3959		zi->tzi_collectable = (uint64_t)collectable;
3960		zi->tzi_exhaustible = (uint64_t)exhaustible;
3961		zi->tzi_caller_acct = (uint64_t)caller_acct;
3962		if (task->tkm_zinfo != NULL) {
3963			index = ZINFO_SLOTS - num_fake_zones + i;
3964			zi->tzi_task_alloc = task->tkm_zinfo[index].alloc;
3965			zi->tzi_task_free = task->tkm_zinfo[index].free;
3966		} else {
3967			zi->tzi_task_alloc = 0;
3968			zi->tzi_task_free = 0;
3969		}
3970		zn++;
3971		zi++;
3972	}
3973
3974	used = max_zones * sizeof *names;
3975	if (used != names_size)
3976		bzero((char *) (names_addr + used), names_size - used);
3977
3978	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
3979			   (vm_map_size_t)names_size, TRUE, &copy);
3980	assert(kr == KERN_SUCCESS);
3981
3982	*namesp = (mach_zone_name_t *) copy;
3983	*namesCntp = max_zones;
3984
3985	used = max_zones * sizeof *info;
3986
3987	if (used != info_size)
3988		bzero((char *) (info_addr + used), info_size - used);
3989
3990	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
3991			   (vm_map_size_t)info_size, TRUE, &copy);
3992	assert(kr == KERN_SUCCESS);
3993
3994	*infop = (task_zone_info_t *) copy;
3995	*infoCntp = max_zones;
3996
3997	return KERN_SUCCESS;
3998}
3999
4000#else	/* CONFIG_TASK_ZONE_INFO */
4001
4002kern_return_t
4003task_zone_info(
4004	__unused task_t		task,
4005	__unused mach_zone_name_array_t *namesp,
4006	__unused mach_msg_type_number_t *namesCntp,
4007	__unused task_zone_info_array_t *infop,
4008	__unused mach_msg_type_number_t *infoCntp)
4009{
4010	return KERN_FAILURE;
4011}
4012
4013#endif	/* CONFIG_TASK_ZONE_INFO */
4014
4015kern_return_t
4016mach_zone_info(
4017	host_priv_t		host,
4018	mach_zone_name_array_t	*namesp,
4019	mach_msg_type_number_t  *namesCntp,
4020	mach_zone_info_array_t	*infop,
4021	mach_msg_type_number_t  *infoCntp)
4022{
4023	mach_zone_name_t	*names;
4024	vm_offset_t		names_addr;
4025	vm_size_t		names_size;
4026	mach_zone_info_t	*info;
4027	vm_offset_t		info_addr;
4028	vm_size_t		info_size;
4029	unsigned int		max_zones, i;
4030	zone_t			z;
4031	mach_zone_name_t	*zn;
4032	mach_zone_info_t    	*zi;
4033	kern_return_t		kr;
4034
4035	vm_size_t		used;
4036	vm_map_copy_t		copy;
4037
4038
4039	if (host == HOST_NULL)
4040		return KERN_INVALID_HOST;
4041#if CONFIG_DEBUGGER_FOR_ZONE_INFO
4042	if (!PE_i_can_has_debugger(NULL))
4043		return KERN_INVALID_HOST;
4044#endif
4045
4046	/*
4047	 *	We assume that zones aren't freed once allocated.
4048	 *	We won't pick up any zones that are allocated later.
4049	 */
4050
4051	simple_lock(&all_zones_lock);
4052	max_zones = (unsigned int)(num_zones + num_fake_zones);
4053	z = first_zone;
4054	simple_unlock(&all_zones_lock);
4055
4056	names_size = round_page(max_zones * sizeof *names);
4057	kr = kmem_alloc_pageable(ipc_kernel_map,
4058				 &names_addr, names_size);
4059	if (kr != KERN_SUCCESS)
4060		return kr;
4061	names = (mach_zone_name_t *) names_addr;
4062
4063	info_size = round_page(max_zones * sizeof *info);
4064	kr = kmem_alloc_pageable(ipc_kernel_map,
4065				 &info_addr, info_size);
4066	if (kr != KERN_SUCCESS) {
4067		kmem_free(ipc_kernel_map,
4068			  names_addr, names_size);
4069		return kr;
4070	}
4071
4072	info = (mach_zone_info_t *) info_addr;
4073
4074	zn = &names[0];
4075	zi = &info[0];
4076
4077	for (i = 0; i < max_zones - num_fake_zones; i++) {
4078		struct zone zcopy;
4079
4080		assert(z != ZONE_NULL);
4081
4082		lock_zone(z);
4083		zcopy = *z;
4084		unlock_zone(z);
4085
4086		simple_lock(&all_zones_lock);
4087		z = z->next_zone;
4088		simple_unlock(&all_zones_lock);
4089
4090		/* assuming here the name data is static */
4091		(void) strncpy(zn->mzn_name, zcopy.zone_name,
4092			       sizeof zn->mzn_name);
4093		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
4094
4095		zi->mzi_count = (uint64_t)zcopy.count;
4096		zi->mzi_cur_size = (uint64_t)zcopy.cur_size;
4097		zi->mzi_max_size = (uint64_t)zcopy.max_size;
4098		zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
4099		zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
4100		zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
4101		zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
4102		zi->mzi_collectable = (uint64_t)zcopy.collectable;
4103		zn++;
4104		zi++;
4105	}
4106
4107	/*
4108	 * loop through the fake zones and fill them using the specialized
4109	 * functions
4110	 */
4111	for (i = 0; i < num_fake_zones; i++) {
4112		int count, collectable, exhaustible, caller_acct;
4113		vm_size_t cur_size, max_size, elem_size, alloc_size;
4114		uint64_t sum_size;
4115
4116		strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
4117		zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
4118		fake_zones[i].query(&count, &cur_size,
4119				    &max_size, &elem_size,
4120				    &alloc_size, &sum_size,
4121				    &collectable, &exhaustible, &caller_acct);
4122		zi->mzi_count = (uint64_t)count;
4123		zi->mzi_cur_size = (uint64_t)cur_size;
4124		zi->mzi_max_size = (uint64_t)max_size;
4125		zi->mzi_elem_size = (uint64_t)elem_size;
4126		zi->mzi_alloc_size = (uint64_t)alloc_size;
4127		zi->mzi_sum_size = sum_size;
4128		zi->mzi_collectable = (uint64_t)collectable;
4129		zi->mzi_exhaustible = (uint64_t)exhaustible;
4130
4131		zn++;
4132		zi++;
4133	}
4134
4135	used = max_zones * sizeof *names;
4136	if (used != names_size)
4137		bzero((char *) (names_addr + used), names_size - used);
4138
4139	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
4140			   (vm_map_size_t)names_size, TRUE, &copy);
4141	assert(kr == KERN_SUCCESS);
4142
4143	*namesp = (mach_zone_name_t *) copy;
4144	*namesCntp = max_zones;
4145
4146	used = max_zones * sizeof *info;
4147
4148	if (used != info_size)
4149		bzero((char *) (info_addr + used), info_size - used);
4150
4151	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
4152			   (vm_map_size_t)info_size, TRUE, &copy);
4153	assert(kr == KERN_SUCCESS);
4154
4155	*infop = (mach_zone_info_t *) copy;
4156	*infoCntp = max_zones;
4157
4158	return KERN_SUCCESS;
4159}
4160
4161/*
4162 * host_zone_info - LEGACY user interface for Mach zone information
4163 * 		    Should use mach_zone_info() instead!
4164 */
4165kern_return_t
4166host_zone_info(
4167	host_priv_t		host,
4168	zone_name_array_t	*namesp,
4169	mach_msg_type_number_t  *namesCntp,
4170	zone_info_array_t	*infop,
4171	mach_msg_type_number_t  *infoCntp)
4172{
4173	zone_name_t	*names;
4174	vm_offset_t	names_addr;
4175	vm_size_t	names_size;
4176	zone_info_t	*info;
4177	vm_offset_t	info_addr;
4178	vm_size_t	info_size;
4179	unsigned int	max_zones, i;
4180	zone_t		z;
4181	zone_name_t    *zn;
4182	zone_info_t    *zi;
4183	kern_return_t	kr;
4184
4185	vm_size_t	used;
4186	vm_map_copy_t	copy;
4187
4188
4189	if (host == HOST_NULL)
4190		return KERN_INVALID_HOST;
4191#if CONFIG_DEBUGGER_FOR_ZONE_INFO
4192	if (!PE_i_can_has_debugger(NULL))
4193		return KERN_INVALID_HOST;
4194#endif
4195
4196#if defined(__LP64__)
4197	if (!thread_is_64bit(current_thread()))
4198		return KERN_NOT_SUPPORTED;
4199#else
4200	if (thread_is_64bit(current_thread()))
4201		return KERN_NOT_SUPPORTED;
4202#endif
4203
4204	/*
4205	 *	We assume that zones aren't freed once allocated.
4206	 *	We won't pick up any zones that are allocated later.
4207	 */
4208
4209	simple_lock(&all_zones_lock);
4210	max_zones = (unsigned int)(num_zones + num_fake_zones);
4211	z = first_zone;
4212	simple_unlock(&all_zones_lock);
4213
4214	names_size = round_page(max_zones * sizeof *names);
4215	kr = kmem_alloc_pageable(ipc_kernel_map,
4216				 &names_addr, names_size);
4217	if (kr != KERN_SUCCESS)
4218		return kr;
4219	names = (zone_name_t *) names_addr;
4220
4221	info_size = round_page(max_zones * sizeof *info);
4222	kr = kmem_alloc_pageable(ipc_kernel_map,
4223				 &info_addr, info_size);
4224	if (kr != KERN_SUCCESS) {
4225		kmem_free(ipc_kernel_map,
4226			  names_addr, names_size);
4227		return kr;
4228	}
4229
4230	info = (zone_info_t *) info_addr;
4231
4232	zn = &names[0];
4233	zi = &info[0];
4234
4235	for (i = 0; i < max_zones - num_fake_zones; i++) {
4236		struct zone zcopy;
4237
4238		assert(z != ZONE_NULL);
4239
4240		lock_zone(z);
4241		zcopy = *z;
4242		unlock_zone(z);
4243
4244		simple_lock(&all_zones_lock);
4245		z = z->next_zone;
4246		simple_unlock(&all_zones_lock);
4247
4248		/* assuming here the name data is static */
4249		(void) strncpy(zn->zn_name, zcopy.zone_name,
4250			       sizeof zn->zn_name);
4251		zn->zn_name[sizeof zn->zn_name - 1] = '\0';
4252
4253		zi->zi_count = zcopy.count;
4254		zi->zi_cur_size = zcopy.cur_size;
4255		zi->zi_max_size = zcopy.max_size;
4256		zi->zi_elem_size = zcopy.elem_size;
4257		zi->zi_alloc_size = zcopy.alloc_size;
4258		zi->zi_exhaustible = zcopy.exhaustible;
4259		zi->zi_collectable = zcopy.collectable;
4260
4261		zn++;
4262		zi++;
4263	}
4264
4265	/*
4266	 * loop through the fake zones and fill them using the specialized
4267	 * functions
4268	 */
4269	for (i = 0; i < num_fake_zones; i++) {
4270		int caller_acct;
4271		uint64_t sum_space;
4272		strncpy(zn->zn_name, fake_zones[i].name, sizeof zn->zn_name);
4273		zn->zn_name[sizeof zn->zn_name - 1] = '\0';
4274		fake_zones[i].query(&zi->zi_count, &zi->zi_cur_size,
4275				    &zi->zi_max_size, &zi->zi_elem_size,
4276				    &zi->zi_alloc_size, &sum_space,
4277				    &zi->zi_collectable, &zi->zi_exhaustible, &caller_acct);
4278		zn++;
4279		zi++;
4280	}
4281
4282	used = max_zones * sizeof *names;
4283	if (used != names_size)
4284		bzero((char *) (names_addr + used), names_size - used);
4285
4286	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
4287			   (vm_map_size_t)names_size, TRUE, &copy);
4288	assert(kr == KERN_SUCCESS);
4289
4290	*namesp = (zone_name_t *) copy;
4291	*namesCntp = max_zones;
4292
4293	used = max_zones * sizeof *info;
4294	if (used != info_size)
4295		bzero((char *) (info_addr + used), info_size - used);
4296
4297	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
4298			   (vm_map_size_t)info_size, TRUE, &copy);
4299	assert(kr == KERN_SUCCESS);
4300
4301	*infop = (zone_info_t *) copy;
4302	*infoCntp = max_zones;
4303
4304	return KERN_SUCCESS;
4305}
4306
4307kern_return_t
4308mach_zone_force_gc(
4309	host_t host)
4310{
4311
4312	if (host == HOST_NULL)
4313		return KERN_INVALID_HOST;
4314
4315	consider_zone_gc(TRUE);
4316
4317	return (KERN_SUCCESS);
4318}
4319
4320extern unsigned int stack_total;
4321extern unsigned long long stack_allocs;
4322
4323#if defined(__i386__) || defined (__x86_64__)
4324extern unsigned int inuse_ptepages_count;
4325extern long long alloc_ptepages_count;
4326#endif
4327
void
zone_display_zprint(void)
{
	unsigned int	i;
	zone_t		the_zone;

	if (first_zone != NULL) {
		the_zone = first_zone;
		for (i = 0; i < num_zones; i++) {
			if (the_zone->cur_size > (1024 * 1024)) {
				printf("%.20s:\t%lu\n", the_zone->zone_name, (uintptr_t)the_zone->cur_size);
			}

			if (the_zone->next_zone == NULL) {
				break;
			}

			the_zone = the_zone->next_zone;
		}
	}

	printf("Kernel Stacks:\t%lu\n", (uintptr_t)(kernel_stack_size * stack_total));

#if defined(__i386__) || defined (__x86_64__)
	printf("PageTables:\t%lu\n", (uintptr_t)(PAGE_SIZE * inuse_ptepages_count));
#endif

	printf("Kalloc.Large:\t%lu\n", (uintptr_t)kalloc_large_total);
}
4356
4357zone_t
4358zone_find_largest(void)
4359{
4360	unsigned int    i;
4361	unsigned int    max_zones;
4362	zone_t 	        the_zone;
4363	zone_t          zone_largest;
4364
4365	simple_lock(&all_zones_lock);
4366	the_zone = first_zone;
4367	max_zones = num_zones;
4368	simple_unlock(&all_zones_lock);
4369
4370	zone_largest = the_zone;
4371	for (i = 0; i < max_zones; i++) {
4372		if (the_zone->cur_size > zone_largest->cur_size) {
4373			zone_largest = the_zone;
4374		}
4375
4376		if (the_zone->next_zone == NULL) {
4377			break;
4378		}
4379
4380		the_zone = the_zone->next_zone;
4381	}
4382	return zone_largest;
4383}
4384
4385#if	ZONE_DEBUG
4386
/* should we care about locks here? */
4388
4389#define zone_in_use(z) 	( z->count || z->free_elements \
4390						  || !queue_empty(&z->pages.all_free) \
4391						  || !queue_empty(&z->pages.intermediate) \
4392						  || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign)))
4393
4394void
4395zone_debug_enable(
4396	zone_t		z)
4397{
4398	if (zone_debug_enabled(z) || zone_in_use(z) ||
4399	    z->alloc_size < (z->elem_size + ZONE_DEBUG_OFFSET))
4400		return;
4401	queue_init(&z->active_zones);
4402	z->elem_size += ZONE_DEBUG_OFFSET;
4403}
4404
4405void
4406zone_debug_disable(
4407	zone_t		z)
4408{
4409	if (!zone_debug_enabled(z) || zone_in_use(z))
4410		return;
4411	z->elem_size -= ZONE_DEBUG_OFFSET;
4412	z->active_zones.next = z->active_zones.prev = NULL;
4413}
4414
4415
4416#endif	/* ZONE_DEBUG */
4417