/*
 * Copyright (c) 2006-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Memory allocator with per-CPU caching, derived from the kmem magazine
 * concept and implementation as described in the following paper:
 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
 * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
 * reserved.  Use is subject to license terms.
 *
 * There are several major differences between this and the original kmem
 * magazine: this derivative implementation allows multiple objects to be
 * allocated from and freed to the object cache in a single call; in
 * addition, it provides greater flexibility by letting the caller supply
 * its own slab allocator (instead of the default zone allocator).  Finally,
 * no object construction/destruction takes place at the moment, although
 * this could be added in the future to improve efficiency.
 */
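
/*
 * Illustrative usage sketch (not part of the build): how a hypothetical
 * client might exercise the batch interface mentioned above.  The cache
 * name, object size and count below are made up; the calls, the MCR_SLEEP
 * flag and the obj_next linkage are the ones defined by this file and
 * <sys/mcache.h>.
 *
 *	mcache_t *mcp;
 *	mcache_obj_t *list;
 *	unsigned int got;
 *
 *	mcp = mcache_create("example", 128, 0, 0, MCR_SLEEP);
 *
 *	// Up to 16 objects come back in one call, chained via obj_next.
 *	got = mcache_alloc_ext(mcp, &list, 16, MCR_SLEEP);
 *
 *	// ... hand the chain to a consumer ...
 *
 *	// Provided the obj_next linkage is intact (or rebuilt by the
 *	// caller), the whole chain is returned in a single call as well.
 *	mcache_free_ext(mcp, list);
 *	mcache_destroy(mcp);
 */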
44
45#include <sys/param.h>
46#include <sys/types.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/queue.h>
50#include <sys/kernel.h>
51#include <sys/systm.h>
52
53#include <kern/debug.h>
54#include <kern/zalloc.h>
55#include <kern/cpu_number.h>
56#include <kern/locks.h>
57#include <kern/thread_call.h>
58
59#include <libkern/libkern.h>
60#include <libkern/OSAtomic.h>
61#include <libkern/OSDebug.h>
62
63#include <mach/vm_param.h>
64#include <machine/limits.h>
65#include <machine/machine_routines.h>
66
67#include <string.h>
68
69#include <sys/mcache.h>
70
#define	MCACHE_SIZE(n) \
	((size_t)(&((mcache_t *)0)->mc_cpu[n]))

/* Allocate extra in case we need to manually align the pointer */
#define	MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

#define	MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
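
/*
 * Worked example of the sizing/alignment arithmetic above (illustrative;
 * the numbers are assumptions, e.g. CPU_CACHE_LINE_SIZE == 64 on LP64):
 *
 *	MCACHE_SIZE(n) is the offset of mc_cpu[n] within mcache_t, i.e. the
 *	size of an mcache_t carrying n per-CPU structures.
 *
 *	MCACHE_ALLOC_SIZE adds one pointer plus one cache line of slack so
 *	that mcache_create_common() can round the start of the mcache_t up
 *	to a cache-line boundary and stash the original zalloc() address
 *	just below it.  For instance, if zalloc() returns buf == 0x1010:
 *
 *		cp   = P2ROUNDUP(0x1010 + 8, 64) = 0x1040
 *		pbuf = (void **)(0x1040 - 8)     = 0x1038, where buf is saved
 *		       so that mcache_destroy() can zfree() the original
 *		       address later.
 */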
80
81/*
82 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
83 * to serialize accesses to the global list of caches in the system.
84 * They also record the thread currently running in the critical
85 * section, so that we can avoid recursive requests to reap the
86 * caches when memory runs low.
87 */
88#define	MCACHE_LIST_LOCK() {				\
89	lck_mtx_lock(mcache_llock);			\
90	mcache_llock_owner = current_thread();		\
91}
92
93#define	MCACHE_LIST_UNLOCK() {				\
94	mcache_llock_owner = NULL;			\
95	lck_mtx_unlock(mcache_llock);			\
96}
97
98#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
99#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
100#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)
101
102static int ncpu;
103static unsigned int cache_line_size;
104static lck_mtx_t *mcache_llock;
105static struct thread *mcache_llock_owner;
106static lck_attr_t *mcache_llock_attr;
107static lck_grp_t *mcache_llock_grp;
108static lck_grp_attr_t *mcache_llock_grp_attr;
109static struct zone *mcache_zone;
110static const uint32_t mcache_reap_interval = 15;
111static const uint32_t mcache_reap_interval_leeway = 2;
112static UInt32 mcache_reaping;
113static int mcache_ready;
114static int mcache_updating;
115
116static int mcache_bkt_contention = 3;
117#if DEBUG
118static unsigned int mcache_flags = MCF_DEBUG;
119#else
120static unsigned int mcache_flags = 0;
121#endif
122
123int mca_trn_max = MCA_TRN_MAX;
124
125#define	DUMP_MCA_BUF_SIZE	512
126static char *mca_dump_buf;
127
/*
 * Bucket types, ordered by increasing bucket size (bt_bktsize).  A cache
 * starts with the first entry whose bt_minbuf is below its chunk size and
 * may later be upgraded to entries with larger buckets, as long as its
 * chunk size stays below the current entry's bt_maxbuf; bt_cache backs
 * the buckets of that size.
 */
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1,	4096,	32768,	NULL },
	{ 3,	2048,	16384,	NULL },
	{ 7,	1024,	12288,	NULL },
	{ 15,	256,	8192,	NULL },
	{ 31,	64,	4096,	NULL },
	{ 47,	0,	2048,	NULL },
	{ 63,	0,	1024,	NULL },
	{ 95,	0,	512,	NULL },
	{ 143,	0,	256,	NULL },
	{ 165,	0,	0,	NULL },
};
140
141static mcache_t *mcache_create_common(const char *, size_t, size_t,
142    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
143    mcache_notifyfn_t, void *, u_int32_t, int, int);
144static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
145    unsigned int, int);
146static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
147static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
148static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
149static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
150    mcache_bkttype_t **);
151static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
152static void mcache_cache_bkt_enable(mcache_t *);
153static void mcache_bkt_purge(mcache_t *);
154static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
155    mcache_bkt_t *, int);
156static void mcache_bkt_ws_update(mcache_t *);
157static void mcache_bkt_ws_reap(mcache_t *);
158static void mcache_dispatch(void (*)(void *), void *);
159static void mcache_cache_reap(mcache_t *);
160static void mcache_cache_update(mcache_t *);
161static void mcache_cache_bkt_resize(void *);
162static void mcache_cache_enable(void *);
163static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
164static void mcache_update_timeout(void *);
165static void mcache_applyall(void (*)(mcache_t *));
166static void mcache_reap_start(void *);
167static void mcache_reap_done(void *);
168static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
169static void mcache_notify(mcache_t *, u_int32_t);
170static void mcache_purge(void *);
171
172static LIST_HEAD(, mcache) mcache_head;
173mcache_t *mcache_audit_cache;
174
175static thread_call_t mcache_reap_tcall;
176static thread_call_t mcache_update_tcall;
177
178/*
179 * Initialize the framework; this is currently called as part of BSD init.
180 */
181__private_extern__ void
182mcache_init(void)
183{
184	mcache_bkttype_t *btp;
185	unsigned int i;
186	char name[32];
187
188	VERIFY(mca_trn_max >= 2);
189
190	ncpu = ml_get_max_cpus();
191	(void) mcache_cache_line_size();	/* prime it */
192
193	mcache_llock_grp_attr = lck_grp_attr_alloc_init();
194	mcache_llock_grp = lck_grp_alloc_init("mcache.list",
195	    mcache_llock_grp_attr);
196	mcache_llock_attr = lck_attr_alloc_init();
197	mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
198
199	mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
200	mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
201	if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL)
202		panic("mcache_init: thread_call_allocate failed");
203
204	mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
205	    PAGE_SIZE, "mcache");
206	if (mcache_zone == NULL)
207		panic("mcache_init: failed to allocate mcache zone\n");
208	zone_change(mcache_zone, Z_CALLERACCT, FALSE);
209
210	LIST_INIT(&mcache_head);
211
212	for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
213		btp = &mcache_bkttype[i];
214		(void) snprintf(name, sizeof (name), "bkt_%d",
215		    btp->bt_bktsize);
216		btp->bt_cache = mcache_create(name,
217		    (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
218	}
219
220	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
221	mcache_flags &= MCF_FLAGS_MASK;
222
223	mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
224	    0, 0, MCR_SLEEP);
225
226	mcache_applyall(mcache_cache_bkt_enable);
227	mcache_ready = 1;
228
229	printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
230	    ncpu, CPU_CACHE_LINE_SIZE);
231}
232
233/*
234 * Return the global mcache flags.
235 */
236__private_extern__ unsigned int
237mcache_getflags(void)
238{
239	return (mcache_flags);
240}
241
242/*
243 * Return the CPU cache line size.
244 */
245__private_extern__ unsigned int
246mcache_cache_line_size(void)
247{
248	if (cache_line_size == 0) {
249		ml_cpu_info_t cpu_info;
250		ml_cpu_get_info(&cpu_info);
251		cache_line_size = cpu_info.cache_line_size;
252	}
253	return (cache_line_size);
254}
255
256/*
257 * Create a cache using the zone allocator as the backend slab allocator.
258 * The caller may specify any alignment for the object; if it specifies 0
259 * the default alignment (MCACHE_ALIGN) will be used.
260 */
261__private_extern__ mcache_t *
262mcache_create(const char *name, size_t bufsize, size_t align,
263    u_int32_t flags, int wait)
264{
265	return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
266	    mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
267	    wait));
268}
269
270/*
271 * Create a cache using a custom backend slab allocator.  Since the caller
272 * is responsible for allocation, no alignment guarantee will be provided
273 * by this framework.
274 */
275__private_extern__ mcache_t *
276mcache_create_ext(const char *name, size_t bufsize,
277    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
278    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
279    u_int32_t flags, int wait)
280{
281	return (mcache_create_common(name, bufsize, 0, allocfn,
282	    freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
283}
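
/*
 * Illustrative sketch (not part of the build): a minimal custom backend
 * for mcache_create_ext().  The helper names (my_backend_alloc/free), the
 * backend argument and objsize are hypothetical; the callback shapes
 * mirror those of mcache_slab_alloc() and mcache_slab_free() below, which
 * are the reference implementations of mcache_allocfn_t/mcache_freefn_t.
 *
 *	static unsigned int
 *	my_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
 *	{
 *		mcache_obj_t **list = *plist;
 *		unsigned int allocated = 0;
 *
 *		while (allocated < num) {
 *			void *buf = my_backend_alloc(arg, wait);
 *			if (buf == NULL)
 *				break;
 *			*list = (mcache_obj_t *)buf;
 *			(*list)->obj_next = NULL;
 *			list = *plist = &(*list)->obj_next;
 *			allocated++;
 *		}
 *		return (allocated);
 *	}
 *
 *	static void
 *	my_free(void *arg, mcache_obj_t *list, boolean_t purged)
 *	{
 *		mcache_obj_t *next;
 *
 *		for (; list != NULL; list = next) {
 *			next = list->obj_next;
 *			my_backend_free(arg, list);
 *		}
 *	}
 *
 *	cp = mcache_create_ext("example.ext", objsize, my_alloc, my_free,
 *	    NULL, NULL, NULL, backend_arg, 0, MCR_SLEEP);
 */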
284
285/*
286 * Common cache creation routine.
287 */
288static mcache_t *
289mcache_create_common(const char *name, size_t bufsize, size_t align,
290    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
291    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
292    u_int32_t flags, int need_zone, int wait)
293{
294	mcache_bkttype_t *btp;
295	mcache_t *cp = NULL;
296	size_t chunksize;
297	void *buf, **pbuf;
298	int c;
299	char lck_name[64];
300
301	/* If auditing is on and print buffer is NULL, allocate it now */
302	if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
303		int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
304		MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
305		    malloc_wait | M_ZERO);
306		if (mca_dump_buf == NULL)
307			return (NULL);
308	}
309
310	if (!(wait & MCR_NOSLEEP))
311		buf = zalloc(mcache_zone);
312	else
313		buf = zalloc_noblock(mcache_zone);
314
315	if (buf == NULL)
316		goto fail;
317
318	bzero(buf, MCACHE_ALLOC_SIZE);
319
	/*
	 * In case we didn't get cache-aligned memory, round the address
	 * up accordingly.  This is needed in order to get the rest of the
	 * structure members aligned properly.  It also means that the
	 * usable span is shifted by the round-up, which is fine since we
	 * allocated extra space for exactly this purpose.
	 */
327	cp = (mcache_t *)
328	    P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_LINE_SIZE);
329	pbuf = (void **)((intptr_t)cp - sizeof (void *));
330	*pbuf = buf;
331
332	/*
333	 * Guaranteed alignment is valid only when we use the internal
334	 * slab allocator (currently set to use the zone allocator).
335	 */
336	if (!need_zone)
337		align = 1;
338	else if (align == 0)
339		align = MCACHE_ALIGN;
340
341	if ((align & (align - 1)) != 0)
342		panic("mcache_create: bad alignment %lu", align);
343
344	cp->mc_align = align;
345	cp->mc_slab_alloc = allocfn;
346	cp->mc_slab_free = freefn;
347	cp->mc_slab_audit = auditfn;
348	cp->mc_slab_log = logfn;
349	cp->mc_slab_notify = notifyfn;
350	cp->mc_private = need_zone ? cp : arg;
351	cp->mc_bufsize = bufsize;
352	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
353
354	(void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);
355
356	(void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
357	cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
358	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
359	    cp->mc_cpu_lock_grp_attr);
360	cp->mc_cpu_lock_attr = lck_attr_alloc_init();
361
362	/*
363	 * Allocation chunk size is the object's size plus any extra size
364	 * needed to satisfy the object's alignment.  It is enforced to be
365	 * at least the size of an LP64 pointer to simplify auditing and to
366	 * handle multiple-element allocation requests, where the elements
367	 * returned are linked together in a list.
368	 */
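	/*
	 * Worked example of the computation below (illustrative, assuming
	 * LP64 and the internal zone-backed slab): bufsize = 20, align = 16.
	 *
	 *	chunksize = MAX(20, 8)                 = 20
	 *	align     = MAX(16, 8)                 = 16
	 *	chunksize = 20 + sizeof (void *) + 16  = 44
	 *	chunksize = P2ROUNDUP(44, 16)          = 48
	 *
	 * so each 20-byte object consumes a 48-byte chunk in the zone,
	 * leaving room for the stashed original pointer and the alignment
	 * offset applied in mcache_slab_alloc().
	 */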
369	chunksize = MAX(bufsize, sizeof (u_int64_t));
370	if (need_zone) {
371		/* Enforce 64-bit minimum alignment for zone-based buffers */
372		align = MAX(align, sizeof (u_int64_t));
373		chunksize += sizeof (void *) + align;
374		chunksize = P2ROUNDUP(chunksize, align);
375		if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
376		    PAGE_SIZE, cp->mc_name)) == NULL)
377			goto fail;
378		zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
379	}
380	cp->mc_chunksize = chunksize;
381
382	/*
383	 * Initialize the bucket layer.
384	 */
385	(void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
386	cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
387	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
388	    cp->mc_bkt_lock_grp_attr);
389	cp->mc_bkt_lock_attr = lck_attr_alloc_init();
390	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
391	    cp->mc_bkt_lock_attr);
392
393	(void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
394	cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
395	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
396	    cp->mc_sync_lock_grp_attr);
397	cp->mc_sync_lock_attr = lck_attr_alloc_init();
398	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
399	    cp->mc_sync_lock_attr);
400
401	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
402		continue;
403
404	cp->cache_bkttype = btp;
405
406	/*
407	 * Initialize the CPU layer.  Each per-CPU structure is aligned
408	 * on the CPU cache line boundary to prevent false sharing.
409	 */
410	for (c = 0; c < ncpu; c++) {
411		mcache_cpu_t *ccp = &cp->mc_cpu[c];
412
413		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
414		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
415		    cp->mc_cpu_lock_attr);
416		ccp->cc_objs = -1;
417		ccp->cc_pobjs = -1;
418	}
419
420	if (mcache_ready)
421		mcache_cache_bkt_enable(cp);
422
423	/* TODO: dynamically create sysctl for stats */
424
425	MCACHE_LIST_LOCK();
426	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
427	MCACHE_LIST_UNLOCK();
428
429	/*
430	 * If cache buckets are enabled and this is the first cache
431	 * created, start the periodic cache update.
432	 */
433	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
434		mcache_updating = 1;
435		mcache_update_timeout(NULL);
436	}
437	if (cp->mc_flags & MCF_DEBUG) {
438		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
439		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
440		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
441	}
442	return (cp);
443
444fail:
445	if (buf != NULL)
446		zfree(mcache_zone, buf);
447	return (NULL);
448}
449
450/*
451 * Allocate one or more objects from a cache.
452 */
453__private_extern__ unsigned int
454mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
455{
456	mcache_cpu_t *ccp;
457	mcache_obj_t **top = &(*list);
458	mcache_bkt_t *bkt;
459	unsigned int need = num;
460	boolean_t nwretry = FALSE;
461
462	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
463	VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));
464
465	ASSERT(list != NULL);
466	*list = NULL;
467
468	if (num == 0)
469		return (0);
470
471retry_alloc:
	/* We may not always be running on the same CPU in case of retries */
473	ccp = MCACHE_CPU(cp);
474
475	MCACHE_LOCK(&ccp->cc_lock);
476	for (;;) {
477		/*
478		 * If we have an object in the current CPU's filled bucket,
479		 * chain the object to any previous objects and return if
480		 * we've satisfied the number of requested objects.
481		 */
482		if (ccp->cc_objs > 0) {
483			mcache_obj_t *tail;
484			int objs;
485
486			/*
487			 * Objects in the bucket are already linked together
488			 * with the most recently freed object at the head of
489			 * the list; grab as many objects as we can.
490			 */
491			objs = MIN((unsigned int)ccp->cc_objs, need);
492			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
493			ccp->cc_objs -= objs;
494			ccp->cc_alloc += objs;
495
496			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
497			list = &tail->obj_next;
498			*list = NULL;
499
500			/* If we got them all, return to caller */
501			if ((need -= objs) == 0) {
502				MCACHE_UNLOCK(&ccp->cc_lock);
503
504				if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
505				    cp->mc_slab_log != NULL)
506					(*cp->mc_slab_log)(num, *top, TRUE);
507
508				if (cp->mc_flags & MCF_DEBUG)
509					goto debug_alloc;
510
511				return (num);
512			}
513		}
514
515		/*
516		 * The CPU's filled bucket is empty.  If the previous filled
517		 * bucket was full, exchange and try again.
518		 */
519		if (ccp->cc_pobjs > 0) {
520			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
521			continue;
522		}
523
524		/*
525		 * If the bucket layer is disabled, allocate from slab.  This
526		 * can happen either because MCF_NOCPUCACHE is set, or because
527		 * the bucket layer is currently being resized.
528		 */
529		if (ccp->cc_bktsize == 0)
530			break;
531
532		/*
533		 * Both of the CPU's buckets are empty; try to get a full
534		 * bucket from the bucket layer.  Upon success, refill this
535		 * CPU and place any empty bucket into the empty list.
536		 */
537		bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
538		if (bkt != NULL) {
539			if (ccp->cc_pfilled != NULL)
540				mcache_bkt_free(cp, &cp->mc_empty,
541				    ccp->cc_pfilled);
542			mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
543			continue;
544		}
545
546		/*
547		 * The bucket layer has no full buckets; allocate the
548		 * object(s) directly from the slab layer.
549		 */
550		break;
551	}
552	MCACHE_UNLOCK(&ccp->cc_lock);
553
554	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
555
	/*
	 * If this is a blocking allocation, or if it is non-blocking and
	 * the cache's list of full buckets is non-empty, then retry the
	 * allocation.
	 */
560	if (need > 0) {
561		if (!(wait & MCR_NONBLOCKING)) {
562			atomic_add_32(&cp->mc_wretry_cnt, 1);
563			goto retry_alloc;
564		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
565		    !mcache_bkt_isempty(cp)) {
566			if (!nwretry)
567				nwretry = TRUE;
568			atomic_add_32(&cp->mc_nwretry_cnt, 1);
569			goto retry_alloc;
570		} else if (nwretry) {
571			atomic_add_32(&cp->mc_nwfail_cnt, 1);
572		}
573	}
574
575	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
576		(*cp->mc_slab_log)((num - need), *top, TRUE);
577
578	if (!(cp->mc_flags & MCF_DEBUG))
579		return (num - need);
580
581debug_alloc:
582	if (cp->mc_flags & MCF_DEBUG) {
583		mcache_obj_t **o = top;
584		unsigned int n;
585
586		n = 0;
		/*
		 * Verify that the chain of objects has the same count as
		 * what we are about to report to the caller.  Any mismatch
		 * here means that the object list is badly broken and
		 * therefore we must panic.
		 */
593		while (*o != NULL) {
594			o = &(*o)->obj_next;
595			++n;
596		}
597		if (n != (num - need)) {
598			panic("mcache_alloc_ext: %s cp %p corrupted list "
599			    "(got %d actual %d)\n", cp->mc_name,
600			    (void *)cp, num - need, n);
601		}
602	}
603
604	/* Invoke the slab layer audit callback if auditing is enabled */
605	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
606		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
607
608	return (num - need);
609}
610
611/*
612 * Allocate a single object from a cache.
613 */
614__private_extern__ void *
615mcache_alloc(mcache_t *cp, int wait)
616{
617	mcache_obj_t *buf;
618
619	(void) mcache_alloc_ext(cp, &buf, 1, wait);
620	return (buf);
621}
622
623__private_extern__ void
624mcache_waiter_inc(mcache_t *cp)
625{
626	atomic_add_32(&cp->mc_waiter_cnt, 1);
627}
628
629__private_extern__ void
630mcache_waiter_dec(mcache_t *cp)
631{
632	atomic_add_32(&cp->mc_waiter_cnt, -1);
633}
634
635__private_extern__ boolean_t
636mcache_bkt_isempty(mcache_t *cp)
637{
638	/*
639	 * This isn't meant to accurately tell whether there are
640	 * any full buckets in the cache; it is simply a way to
641	 * obtain "hints" about the state of the cache.
642	 */
643	return (cp->mc_full.bl_total == 0);
644}
645
646/*
647 * Notify the slab layer about an event.
648 */
649static void
650mcache_notify(mcache_t *cp, u_int32_t event)
651{
652	if (cp->mc_slab_notify != NULL)
653		(*cp->mc_slab_notify)(cp->mc_private, event);
654}
655
656/*
657 * Purge the cache and disable its buckets.
658 */
659static void
660mcache_purge(void *arg)
661{
662	mcache_t *cp = arg;
663
664	mcache_bkt_purge(cp);
665	/*
666	 * We cannot simply call mcache_cache_bkt_enable() from here as
667	 * a bucket resize may be in flight and we would cause the CPU
668	 * layers of the cache to point to different sizes.  Therefore,
669	 * we simply increment the enable count so that during the next
670	 * periodic cache update the buckets can be reenabled.
671	 */
672	lck_mtx_lock_spin(&cp->mc_sync_lock);
673	cp->mc_enable_cnt++;
674	lck_mtx_unlock(&cp->mc_sync_lock);
675}
676
677__private_extern__ boolean_t
678mcache_purge_cache(mcache_t *cp, boolean_t async)
679{
680	/*
681	 * Purging a cache that has no per-CPU caches or is already
682	 * in the process of being purged is rather pointless.
683	 */
684	if (cp->mc_flags & MCF_NOCPUCACHE)
685		return (FALSE);
686
687	lck_mtx_lock_spin(&cp->mc_sync_lock);
688	if (cp->mc_purge_cnt > 0) {
689		lck_mtx_unlock(&cp->mc_sync_lock);
690		return (FALSE);
691	}
692	cp->mc_purge_cnt++;
693	lck_mtx_unlock(&cp->mc_sync_lock);
694
695	if (async)
696		mcache_dispatch(mcache_purge, cp);
697	else
698		mcache_purge(cp);
699
700	return (TRUE);
701}
702
703/*
704 * Free a single object to a cache.
705 */
706__private_extern__ void
707mcache_free(mcache_t *cp, void *buf)
708{
709	((mcache_obj_t *)buf)->obj_next = NULL;
710	mcache_free_ext(cp, (mcache_obj_t *)buf);
711}
712
713/*
714 * Free one or more objects to a cache.
715 */
716__private_extern__ void
717mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
718{
719	mcache_cpu_t *ccp = MCACHE_CPU(cp);
720	mcache_bkttype_t *btp;
721	mcache_obj_t *nlist;
722	mcache_bkt_t *bkt;
723
724	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
725		(*cp->mc_slab_log)(0, list, FALSE);
726
727	/* Invoke the slab layer audit callback if auditing is enabled */
728	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
729		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
730
731	MCACHE_LOCK(&ccp->cc_lock);
732	for (;;) {
733		/*
734		 * If there is space in the current CPU's filled bucket, put
735		 * the object there and return once all objects are freed.
736		 * Note the cast to unsigned integer takes care of the case
737		 * where the bucket layer is disabled (when cc_objs is -1).
738		 */
739		if ((unsigned int)ccp->cc_objs <
740		    (unsigned int)ccp->cc_bktsize) {
741			/*
742			 * Reverse the list while we place the object into the
743			 * bucket; this effectively causes the most recently
744			 * freed object(s) to be reused during allocation.
745			 */
746			nlist = list->obj_next;
747			list->obj_next = (ccp->cc_objs == 0) ? NULL :
748			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
749			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
750			ccp->cc_free++;
751
752			if ((list = nlist) != NULL)
753				continue;
754
755			/* We are done; return to caller */
756			MCACHE_UNLOCK(&ccp->cc_lock);
757
758			/* If there is a waiter below, notify it */
759			if (cp->mc_waiter_cnt > 0)
760				mcache_notify(cp, MCN_RETRYALLOC);
761			return;
762		}
763
764		/*
765		 * The CPU's filled bucket is full.  If the previous filled
766		 * bucket was empty, exchange and try again.
767		 */
768		if (ccp->cc_pobjs == 0) {
769			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
770			continue;
771		}
772
773		/*
774		 * If the bucket layer is disabled, free to slab.  This can
775		 * happen either because MCF_NOCPUCACHE is set, or because
776		 * the bucket layer is currently being resized.
777		 */
778		if (ccp->cc_bktsize == 0)
779			break;
780
781		/*
782		 * Both of the CPU's buckets are full; try to get an empty
783		 * bucket from the bucket layer.  Upon success, empty this
784		 * CPU and place any full bucket into the full list.
785		 */
786		bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
787		if (bkt != NULL) {
788			if (ccp->cc_pfilled != NULL)
789				mcache_bkt_free(cp, &cp->mc_full,
790				    ccp->cc_pfilled);
791			mcache_cpu_refill(ccp, bkt, 0);
792			continue;
793		}
794
795		/*
796		 * We need an empty bucket to put our freed objects into
797		 * but couldn't get an empty bucket from the bucket layer;
798		 * attempt to allocate one.  We do not want to block for
799		 * allocation here, and if the bucket allocation fails
800		 * we will simply fall through to the slab layer.
801		 */
802		MCACHE_UNLOCK(&ccp->cc_lock);
803		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
804		MCACHE_LOCK(&ccp->cc_lock);
805
806		if (bkt != NULL) {
807			/*
808			 * We have an empty bucket, but since we drop the
809			 * CPU lock above, the cache's bucket size may have
810			 * changed.  If so, free the bucket and try again.
811			 */
812			if (ccp->cc_bktsize != btp->bt_bktsize) {
813				MCACHE_UNLOCK(&ccp->cc_lock);
814				mcache_free(btp->bt_cache, bkt);
815				MCACHE_LOCK(&ccp->cc_lock);
816				continue;
817			}
818
819			/*
820			 * We have an empty bucket of the right size;
821			 * add it to the bucket layer and try again.
822			 */
823			mcache_bkt_free(cp, &cp->mc_empty, bkt);
824			continue;
825		}
826
827		/*
828		 * The bucket layer has no empty buckets; free the
829		 * object(s) directly to the slab layer.
830		 */
831		break;
832	}
833	MCACHE_UNLOCK(&ccp->cc_lock);
834
835	/* If there is a waiter below, notify it */
836	if (cp->mc_waiter_cnt > 0)
837		mcache_notify(cp, MCN_RETRYALLOC);
838
839	/* Advise the slab layer to purge the object(s) */
840	(*cp->mc_slab_free)(cp->mc_private, list,
841	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
842}
843
844/*
845 * Cache destruction routine.
846 */
847__private_extern__ void
848mcache_destroy(mcache_t *cp)
849{
850	void **pbuf;
851
852	MCACHE_LIST_LOCK();
853	LIST_REMOVE(cp, mc_list);
854	MCACHE_LIST_UNLOCK();
855
856	mcache_bkt_purge(cp);
857
	/*
	 * This cache is dead; there should be no further transactions.
	 * If it is still invoked, make sure that it induces a fault.
	 */
862	cp->mc_slab_alloc = NULL;
863	cp->mc_slab_free = NULL;
864	cp->mc_slab_audit = NULL;
865
866	lck_attr_free(cp->mc_bkt_lock_attr);
867	lck_grp_free(cp->mc_bkt_lock_grp);
868	lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
869
870	lck_attr_free(cp->mc_cpu_lock_attr);
871	lck_grp_free(cp->mc_cpu_lock_grp);
872	lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
873
874	lck_attr_free(cp->mc_sync_lock_attr);
875	lck_grp_free(cp->mc_sync_lock_grp);
876	lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
877
	/*
	 * TODO: We need to destroy the zone here, but the zone allocator
	 * currently provides no way to do so.  Until it does, the memory
	 * allocated for the zone structure is leaked.  Once zone
	 * destruction becomes possible, uncomment these lines:
	 *
	 *	if (cp->mc_slab_zone != NULL) {
	 *		zdestroy(cp->mc_slab_zone);
	 *		cp->mc_slab_zone = NULL;
	 *	}
	 */
889
890	/* Get the original address since we're about to free it */
891	pbuf = (void **)((intptr_t)cp - sizeof (void *));
892
893	zfree(mcache_zone, *pbuf);
894}
895
896/*
897 * Internal slab allocator used as a backend for simple caches.  The current
898 * implementation uses the zone allocator for simplicity reasons.
899 */
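
/*
 * Layout of a single chunk carved out by mcache_slab_alloc() below
 * (illustrative, assuming LP64):
 *
 *	buf                    address returned by zalloc(), possibly not
 *	                       64-bit aligned
 *	base                   P2ROUNDUP(buf + 8, 8), the 64-bit aligned
 *	                       base of the object; e.g. buf == 0x1004
 *	                       yields base == 0x1010
 *	base - sizeof (void *) slot where the original buf is stashed so
 *	                       that mcache_slab_free() can zfree() it
 *	base + offset          address handed to the caller, where offset
 *	                       is nonzero only when the cache alignment is
 *	                       neither 1 nor 8 bytes
 */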
900static unsigned int
901mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
902{
903	mcache_t *cp = arg;
904	unsigned int need = num;
905	size_t offset = 0;
906	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
907	u_int32_t flags = cp->mc_flags;
908	void *buf, *base, **pbuf;
909	mcache_obj_t **list = *plist;
910
911	*list = NULL;
912
913	/*
914	 * The address of the object returned to the caller is an
915	 * offset from the 64-bit aligned base address only if the
916	 * cache's alignment requirement is neither 1 nor 8 bytes.
917	 */
918	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
919		offset = cp->mc_align;
920
921	for (;;) {
922		if (!(wait & MCR_NOSLEEP))
923			buf = zalloc(cp->mc_slab_zone);
924		else
925			buf = zalloc_noblock(cp->mc_slab_zone);
926
927		if (buf == NULL)
928			break;
929
930		/* Get the 64-bit aligned base address for this object */
931		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
932		    sizeof (u_int64_t));
933
934		/*
935		 * Wind back a pointer size from the aligned base and
936		 * save the original address so we can free it later.
937		 */
938		pbuf = (void **)((intptr_t)base - sizeof (void *));
939		*pbuf = buf;
940
		/*
		 * If auditing is enabled, patternize the contents of
		 * the buffer starting from the 64-bit aligned base to
		 * the end of the buffer; the length is rounded up to
		 * the nearest 64-bit multiple, because we use 64-bit
		 * memory accesses to set and check the pattern.
		 */
948		if (flags & MCF_DEBUG) {
949			VERIFY(((intptr_t)base + rsize) <=
950			    ((intptr_t)buf + cp->mc_chunksize));
951			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
952		}
953
954		/*
955		 * Fix up the object's address to fulfill the cache's
956		 * alignment requirement (if needed) and return this
957		 * to the caller.
958		 */
959		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
960		    ((intptr_t)buf + cp->mc_chunksize));
961		*list = (mcache_obj_t *)((intptr_t)base + offset);
962
963		(*list)->obj_next = NULL;
964		list = *plist = &(*list)->obj_next;
965
966		/* If we got them all, return to mcache */
967		if (--need == 0)
968			break;
969	}
970
971	return (num - need);
972}
973
974/*
975 * Internal slab deallocator used as a backend for simple caches.
976 */
977static void
978mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
979{
980	mcache_t *cp = arg;
981	mcache_obj_t *nlist;
982	size_t offset = 0;
983	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
984	u_int32_t flags = cp->mc_flags;
985	void *base;
986	void **pbuf;
987
988	/*
989	 * The address of the object is an offset from a 64-bit
990	 * aligned base address only if the cache's alignment
991	 * requirement is neither 1 nor 8 bytes.
992	 */
993	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
994		offset = cp->mc_align;
995
996	for (;;) {
997		nlist = list->obj_next;
998		list->obj_next = NULL;
999
1000		/* Get the 64-bit aligned base address of this object */
1001		base = (void *)((intptr_t)list - offset);
1002		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1003
1004		/* Get the original address since we're about to free it */
1005		pbuf = (void **)((intptr_t)base - sizeof (void *));
1006
1007		if (flags & MCF_DEBUG) {
1008			VERIFY(((intptr_t)base + rsize) <=
1009			    ((intptr_t)*pbuf + cp->mc_chunksize));
1010			mcache_audit_free_verify(NULL, base, offset, rsize);
1011		}
1012
1013		/* Free it to zone */
1014		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
1015		    ((intptr_t)*pbuf + cp->mc_chunksize));
1016		zfree(cp->mc_slab_zone, *pbuf);
1017
1018		/* No more objects to free; return to mcache */
1019		if ((list = nlist) == NULL)
1020			break;
1021	}
1022}
1023
1024/*
1025 * Internal slab auditor for simple caches.
1026 */
1027static void
1028mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1029{
1030	mcache_t *cp = arg;
1031	size_t offset = 0;
1032	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
1033	void *base, **pbuf;
1034
1035	/*
1036	 * The address of the object returned to the caller is an
1037	 * offset from the 64-bit aligned base address only if the
1038	 * cache's alignment requirement is neither 1 nor 8 bytes.
1039	 */
1040	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
1041		offset = cp->mc_align;
1042
1043	while (list != NULL) {
1044		mcache_obj_t *next = list->obj_next;
1045
1046		/* Get the 64-bit aligned base address of this object */
1047		base = (void *)((intptr_t)list - offset);
1048		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1049
1050		/* Get the original address */
1051		pbuf = (void **)((intptr_t)base - sizeof (void *));
1052
1053		VERIFY(((intptr_t)base + rsize) <=
1054		    ((intptr_t)*pbuf + cp->mc_chunksize));
1055
1056		if (!alloc)
1057			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1058		else
1059			mcache_audit_free_verify_set(NULL, base, offset, rsize);
1060
1061		list = list->obj_next = next;
1062	}
1063}
1064
1065/*
1066 * Refill the CPU's filled bucket with bkt and save the previous one.
1067 */
1068static void
1069mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1070{
1071	ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1072	    (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1073	ASSERT(ccp->cc_bktsize > 0);
1074
1075	ccp->cc_pfilled = ccp->cc_filled;
1076	ccp->cc_pobjs = ccp->cc_objs;
1077	ccp->cc_filled = bkt;
1078	ccp->cc_objs = objs;
1079}
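
/*
 * Illustrative state trace of the exchange above (bucket size assumed
 * to be 32), as seen from the allocation path in mcache_alloc_ext():
 *
 *	before refill:  cc_filled  = A (cc_objs  = 0, exhausted)
 *	                cc_pfilled = B (cc_pobjs = 32, untouched)
 *	refill call:    mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs)
 *	after refill:   cc_filled  = B (cc_objs  = 32)
 *	                cc_pfilled = A (cc_pobjs = 0)
 *
 * Only when both buckets are empty does the CPU layer fall back to the
 * bucket layer, and then to the slab layer.
 */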
1080
1081/*
1082 * Allocate a bucket from the bucket layer.
1083 */
1084static mcache_bkt_t *
1085mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
1086{
1087	mcache_bkt_t *bkt;
1088
1089	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
		/*
		 * The bucket layer lock is held by another CPU; increase
		 * the contention count so that we can later grow the
		 * bucket size accordingly.
		 */
1095		MCACHE_LOCK(&cp->mc_bkt_lock);
1096		cp->mc_bkt_contention++;
1097	}
1098
1099	if ((bkt = blp->bl_list) != NULL) {
1100		blp->bl_list = bkt->bkt_next;
1101		if (--blp->bl_total < blp->bl_min)
1102			blp->bl_min = blp->bl_total;
1103		blp->bl_alloc++;
1104	}
1105
1106	if (btp != NULL)
1107		*btp = cp->cache_bkttype;
1108
1109	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1110
1111	return (bkt);
1112}
1113
1114/*
1115 * Free a bucket to the bucket layer.
1116 */
1117static void
1118mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1119{
1120	MCACHE_LOCK(&cp->mc_bkt_lock);
1121
1122	bkt->bkt_next = blp->bl_list;
1123	blp->bl_list = bkt;
1124	blp->bl_total++;
1125
1126	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1127}
1128
1129/*
1130 * Enable the bucket layer of a cache.
1131 */
1132static void
1133mcache_cache_bkt_enable(mcache_t *cp)
1134{
1135	mcache_cpu_t *ccp;
1136	int cpu;
1137
1138	if (cp->mc_flags & MCF_NOCPUCACHE)
1139		return;
1140
1141	for (cpu = 0; cpu < ncpu; cpu++) {
1142		ccp = &cp->mc_cpu[cpu];
1143		MCACHE_LOCK(&ccp->cc_lock);
1144		ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1145		MCACHE_UNLOCK(&ccp->cc_lock);
1146	}
1147}
1148
1149/*
1150 * Purge all buckets from a cache and disable its bucket layer.
1151 */
1152static void
1153mcache_bkt_purge(mcache_t *cp)
1154{
1155	mcache_cpu_t *ccp;
1156	mcache_bkt_t *bp, *pbp;
1157	mcache_bkttype_t *btp;
1158	int cpu, objs, pobjs;
1159
1160	for (cpu = 0; cpu < ncpu; cpu++) {
1161		ccp = &cp->mc_cpu[cpu];
1162
1163		MCACHE_LOCK(&ccp->cc_lock);
1164
1165		btp = cp->cache_bkttype;
1166		bp = ccp->cc_filled;
1167		pbp = ccp->cc_pfilled;
1168		objs = ccp->cc_objs;
1169		pobjs = ccp->cc_pobjs;
1170		ccp->cc_filled = NULL;
1171		ccp->cc_pfilled = NULL;
1172		ccp->cc_objs = -1;
1173		ccp->cc_pobjs = -1;
1174		ccp->cc_bktsize = 0;
1175
1176		MCACHE_UNLOCK(&ccp->cc_lock);
1177
1178		if (bp != NULL)
1179			mcache_bkt_destroy(cp, btp, bp, objs);
1180		if (pbp != NULL)
1181			mcache_bkt_destroy(cp, btp, pbp, pobjs);
1182	}
1183
1184	/*
1185	 * Updating the working set back to back essentially sets
1186	 * the working set size to zero, so everything is reapable.
1187	 */
1188	mcache_bkt_ws_update(cp);
1189	mcache_bkt_ws_update(cp);
1190
1191	mcache_bkt_ws_reap(cp);
1192}
1193
1194/*
1195 * Free one or more objects in the bucket to the slab layer,
1196 * and also free the bucket itself.
1197 */
1198static void
1199mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
1200    int nobjs)
1201{
1202	if (nobjs > 0) {
1203		mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1204
1205		if (cp->mc_flags & MCF_DEBUG) {
1206			mcache_obj_t *o = top;
1207			int cnt = 0;
1208
1209			/*
1210			 * Verify that the chain of objects in the bucket is
1211			 * valid.  Any mismatch here means a mistake when the
1212			 * object(s) were freed to the CPU layer, so we panic.
1213			 */
1214			while (o != NULL) {
1215				o = o->obj_next;
1216				++cnt;
1217			}
1218			if (cnt != nobjs) {
1219				panic("mcache_bkt_destroy: %s cp %p corrupted "
1220				    "list in bkt %p (nobjs %d actual %d)\n",
1221				    cp->mc_name, (void *)cp, (void *)bkt,
1222				    nobjs, cnt);
1223			}
1224		}
1225
1226		/* Advise the slab layer to purge the object(s) */
1227		(*cp->mc_slab_free)(cp->mc_private, top,
1228		    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1229	}
1230	mcache_free(btp->bt_cache, bkt);
1231}
1232
1233/*
1234 * Update the bucket layer working set statistics.
1235 */
1236static void
1237mcache_bkt_ws_update(mcache_t *cp)
1238{
1239	MCACHE_LOCK(&cp->mc_bkt_lock);
1240
1241	cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1242	cp->mc_full.bl_min = cp->mc_full.bl_total;
1243	cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1244	cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1245
1246	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1247}
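
/*
 * Worked example (illustrative numbers): suppose the full-bucket list
 * never dropped below 4 entries during the last interval (bl_min == 4)
 * and currently holds 10 (bl_total == 10).  The update above sets
 * bl_reaplimit = 4 and restarts the minimum tracking at bl_min = 10.
 * If the list never drops below 6 during the next interval, the next
 * mcache_bkt_ws_reap() may destroy up to MIN(bl_reaplimit, bl_min) =
 * MIN(4, 6) = 4 buckets, i.e. the buckets that sat unused across both
 * intervals and are therefore outside the working set.
 */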
1248
1249/*
1250 * Reap all buckets that are beyond the working set.
1251 */
1252static void
1253mcache_bkt_ws_reap(mcache_t *cp)
1254{
1255	long reap;
1256	mcache_bkt_t *bkt;
1257	mcache_bkttype_t *btp;
1258
1259	reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1260	while (reap-- &&
1261	    (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
1262		mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
1263
1264	reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1265	while (reap-- &&
1266	    (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
1267		mcache_bkt_destroy(cp, btp, bkt, 0);
1268}
1269
1270static void
1271mcache_reap_timeout(thread_call_param_t dummy __unused,
1272    thread_call_param_t arg)
1273{
1274	volatile UInt32 *flag = arg;
1275
1276	ASSERT(flag == &mcache_reaping);
1277
1278	*flag = 0;
1279}
1280
1281static void
1282mcache_reap_done(void *flag)
1283{
1284	uint64_t deadline, leeway;
1285
1286	clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1287	    &deadline);
1288	clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1289	    NSEC_PER_SEC, &leeway);
1290	thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1291	    deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1292}
1293
1294static void
1295mcache_reap_start(void *arg)
1296{
1297	UInt32 *flag = arg;
1298
1299	ASSERT(flag == &mcache_reaping);
1300
1301	mcache_applyall(mcache_cache_reap);
1302	mcache_dispatch(mcache_reap_done, flag);
1303}
1304
1305__private_extern__ void
1306mcache_reap(void)
1307{
1308	UInt32 *flag = &mcache_reaping;
1309
1310	if (mcache_llock_owner == current_thread() ||
1311	    !OSCompareAndSwap(0, 1, flag))
1312		return;
1313
1314	mcache_dispatch(mcache_reap_start, flag);
1315}
1316
1317static void
1318mcache_cache_reap(mcache_t *cp)
1319{
1320	mcache_bkt_ws_reap(cp);
1321}
1322
/*
 * Performs periodic maintenance on a cache.
 */
1326static void
1327mcache_cache_update(mcache_t *cp)
1328{
1329	int need_bkt_resize = 0;
1330	int need_bkt_reenable = 0;
1331
1332	lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1333
1334	mcache_bkt_ws_update(cp);
1335
	/*
	 * Cache resize and post-purge reenable are mutually exclusive.
	 * If the cache was previously purged, there is no point in
	 * increasing the bucket size, as that purge indicated memory
	 * pressure on the system.
	 */
1342	lck_mtx_lock_spin(&cp->mc_sync_lock);
1343	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
1344		need_bkt_reenable = 1;
1345	lck_mtx_unlock(&cp->mc_sync_lock);
1346
1347	MCACHE_LOCK(&cp->mc_bkt_lock);
1348	/*
1349	 * If the contention count is greater than the threshold, and if
1350	 * we are not already at the maximum bucket size, increase it.
1351	 * Otherwise, if this cache was previously purged by the user
1352	 * then we simply reenable it.
1353	 */
1354	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1355	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1356	    mcache_bkt_contention && !need_bkt_reenable)
1357		need_bkt_resize = 1;
1358
	cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1360	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1361
1362	if (need_bkt_resize)
1363		mcache_dispatch(mcache_cache_bkt_resize, cp);
1364	else if (need_bkt_reenable)
1365		mcache_dispatch(mcache_cache_enable, cp);
1366}
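
/*
 * Worked example of the resize decision above (illustrative numbers):
 * with mcache_bkt_contention == 3 and a 15-second update interval, a
 * cache whose contention counter moved from 40 to 47 between two updates
 * has a delta of 7 > 3, so a bucket resize is dispatched provided the
 * chunk size is still below the current bucket type's bt_maxbuf; had the
 * cache been purged in the meantime, it would only be reenabled instead.
 */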
1367
/*
 * Recompute a cache's bucket size.  This is an expensive operation
 * and should not be done frequently; larger buckets provide a higher
 * transfer rate between the CPU and bucket layers, while smaller
 * buckets reduce memory consumption.
 */
1374static void
1375mcache_cache_bkt_resize(void *arg)
1376{
1377	mcache_t *cp = arg;
1378	mcache_bkttype_t *btp = cp->cache_bkttype;
1379
1380	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1381		mcache_bkt_purge(cp);
1382
		/*
		 * Upgrade to the next bucket type with a larger bucket size;
		 * temporarily push the previous contention snapshot far ahead
		 * of the current count so that the computed delta stays
		 * negative and no further resize requests are triggered.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->cache_bkttype = ++btp;
		cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1391		MCACHE_UNLOCK(&cp->mc_bkt_lock);
1392
1393		mcache_cache_enable(cp);
1394	}
1395}
1396
1397/*
1398 * Reenable a previously disabled cache due to purge.
1399 */
1400static void
1401mcache_cache_enable(void *arg)
1402{
1403	mcache_t *cp = arg;
1404
1405	lck_mtx_lock_spin(&cp->mc_sync_lock);
1406	cp->mc_purge_cnt = 0;
1407	cp->mc_enable_cnt = 0;
1408	lck_mtx_unlock(&cp->mc_sync_lock);
1409
1410	mcache_cache_bkt_enable(cp);
1411}
1412
1413static void
1414mcache_update_timeout(__unused void *arg)
1415{
1416	uint64_t deadline, leeway;
1417
1418	clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1419	    &deadline);
1420	clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1421	    NSEC_PER_SEC, &leeway);
1422	thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1423	    deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1424}
1425
1426static void
1427mcache_update(thread_call_param_t arg __unused,
1428    thread_call_param_t dummy __unused)
1429{
1430	mcache_applyall(mcache_cache_update);
1431	mcache_update_timeout(NULL);
1432}
1433
1434static void
1435mcache_applyall(void (*func)(mcache_t *))
1436{
1437	mcache_t *cp;
1438
1439	MCACHE_LIST_LOCK();
1440	LIST_FOREACH(cp, &mcache_head, mc_list) {
1441		func(cp);
1442	}
1443	MCACHE_LIST_UNLOCK();
1444}
1445
1446static void
1447mcache_dispatch(void (*func)(void *), void *arg)
1448{
1449	ASSERT(func != NULL);
1450	timeout(func, arg, hz/1000);
1451}
1452
1453__private_extern__ void
1454mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1455    struct timeval *base_ts)
1456{
1457	struct timeval now, base = { 0, 0 };
1458	void *stack[MCACHE_STACK_DEPTH + 1];
1459	struct mca_trn *transaction;
1460
1461	transaction = &mca->mca_trns[mca->mca_next_trn];
1462
1463	mca->mca_addr = addr;
1464	mca->mca_cache = cp;
1465
1466	transaction->mca_thread = current_thread();
1467
1468	bzero(stack, sizeof (stack));
1469	transaction->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1470	bcopy(&stack[1], transaction->mca_stack,
1471		sizeof (transaction->mca_stack));
1472
1473	microuptime(&now);
1474	if (base_ts != NULL)
1475		base = *base_ts;
1476	/* tstamp is in ms relative to base_ts */
1477	transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1478	if ((now.tv_sec - base.tv_sec) > 0)
1479		transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1480
1481	mca->mca_next_trn =
1482		(mca->mca_next_trn + 1) % mca_trn_max;
1483}
1484
1485__private_extern__ void
1486mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1487{
1488	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1489	u_int64_t *buf = (u_int64_t *)buf_arg;
1490
1491	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1492	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1493
1494	while (buf < buf_end)
1495		*buf++ = pattern;
1496}
1497
1498__private_extern__ void *
1499mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1500{
1501	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1502	u_int64_t *buf;
1503
1504	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1505	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1506
1507	for (buf = buf_arg; buf < buf_end; buf++) {
1508		if (*buf != pattern)
1509			return (buf);
1510	}
1511	return (NULL);
1512}
1513
1514__private_extern__ void *
1515mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1516    size_t size)
1517{
1518	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1519	u_int64_t *buf;
1520
1521	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1522	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1523
1524	for (buf = buf_arg; buf < buf_end; buf++) {
1525		if (*buf != old) {
1526			mcache_set_pattern(old, buf_arg,
1527			    (uintptr_t)buf - (uintptr_t)buf_arg);
1528			return (buf);
1529		}
1530		*buf = new;
1531	}
1532	return (NULL);
1533}
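
/*
 * Illustrative sketch of how the pattern helpers above are used by the
 * MCF_DEBUG audit path in this file ("base", "offset" and "rsize" as in
 * mcache_slab_alloc()/mcache_slab_free()):
 *
 *	// on free: stamp every 64-bit word of the buffer
 *	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
 *
 *	// on the next allocation: verify each word still holds the free
 *	// pattern (any other value means a write-after-free and triggers
 *	// mcache_audit_panic()), re-stamping verified words with
 *	// MCACHE_UNINITIALIZED_PATTERN
 *	mcache_audit_free_verify_set(NULL, base, offset, rsize);
 */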
1534
1535__private_extern__ void
1536mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1537    size_t size)
1538{
1539	void *addr;
1540	u_int64_t *oaddr64;
1541	mcache_obj_t *next;
1542
1543	addr = (void *)((uintptr_t)base + offset);
1544	next = ((mcache_obj_t *)addr)->obj_next;
1545
1546	/* For the "obj_next" pointer in the buffer */
1547	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1548	*oaddr64 = MCACHE_FREE_PATTERN;
1549
1550	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1551	    (caddr_t)base, size)) != NULL) {
1552		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1553		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1554		/* NOTREACHED */
1555	}
1556	((mcache_obj_t *)addr)->obj_next = next;
1557}
1558
1559__private_extern__ void
1560mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1561    size_t size)
1562{
1563	void *addr;
1564	u_int64_t *oaddr64;
1565	mcache_obj_t *next;
1566
1567	addr = (void *)((uintptr_t)base + offset);
1568	next = ((mcache_obj_t *)addr)->obj_next;
1569
1570	/* For the "obj_next" pointer in the buffer */
1571	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
1572	*oaddr64 = MCACHE_FREE_PATTERN;
1573
1574	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1575	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1576		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1577		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1578		/* NOTREACHED */
1579	}
1580	((mcache_obj_t *)addr)->obj_next = next;
1581}
1582
1583#undef panic
1584
1585#define	DUMP_TRN_FMT() \
1586	    "%s transaction thread %p saved PC stack (%d deep):\n" \
1587	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1588	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1589
1590#define	DUMP_TRN_FIELDS(s, x) \
1591	    s, \
1592	    mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1593	    mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1594	    mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1595	    mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1596	    mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1597	    mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1598	    mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1599	    mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1600	    mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1601
1602#define	MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1603#define	MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1604
1605__private_extern__ char *
1606mcache_dump_mca(mcache_audit_t *mca)
1607{
1608	if (mca_dump_buf == NULL)
1609		return (NULL);
1610
1611	snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1612	    "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1613	    DUMP_TRN_FMT()
1614	    DUMP_TRN_FMT(),
1615
1616	    mca, mca->mca_addr, mca->mca_cache,
1617	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
1618	    mca->mca_next_trn,
1619
1620	    DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1621	    DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1622
1623	return (mca_dump_buf);
1624}
1625
1626__private_extern__ void
1627mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1628    int64_t expected, int64_t got)
1629{
1630	if (mca == NULL) {
1631		panic("mcache_audit: buffer %p modified after free at "
1632		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1633		    offset, got, expected);
1634		/* NOTREACHED */
1635	}
1636
1637	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1638	    "(0x%llx instead of 0x%llx)\n%s\n",
1639	    addr, offset, got, expected, mcache_dump_mca(mca));
1640	/* NOTREACHED */
1641}
1642
1643__private_extern__ int
1644assfail(const char *a, const char *f, int l)
1645{
1646	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1647	return (0);
1648}
1649