1/*
2 * Copyright (c) 2006-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
34 * reserved.  Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides for better flexibility where the user is allowed to define
40 * its own slab allocator (instead of the default zone allocator).  Finally,
41 * no object construction/destruction takes place at the moment, although
42 * this could be added in future to improve efficiency.
43 */
44
45#include <sys/param.h>
46#include <sys/types.h>
47#include <sys/malloc.h>
48#include <sys/mbuf.h>
49#include <sys/queue.h>
50#include <sys/kernel.h>
51#include <sys/systm.h>
52
53#include <kern/debug.h>
54#include <kern/zalloc.h>
55#include <kern/cpu_number.h>
56#include <kern/locks.h>
57
58#include <libkern/libkern.h>
59#include <libkern/OSAtomic.h>
60#include <libkern/OSDebug.h>
61
62#include <mach/vm_param.h>
63#include <machine/limits.h>
64#include <machine/machine_routines.h>
65
66#include <string.h>
67
68#include <sys/mcache.h>
69
/*
 * Byte size of an mcache_t that embeds n per-CPU structures; computed as
 * the offset of mc_cpu[n] using the classic null-pointer offsetof idiom.
 */
#define	MCACHE_SIZE(n) \
	((size_t)(&((mcache_t *)0)->mc_cpu[n]))

/* Allocate extra in case we need to manually align the pointer */
#define	MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_SIZE)

/*
 * Address of the per-CPU structure, within cache c, belonging to the CPU
 * this code is currently executing on.
 */
#define	MCACHE_CPU(c) \
	(mcache_cpu_t *)((char *)(c) + MCACHE_SIZE(cpu_number()))

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define	MCACHE_LIST_LOCK() {				\
	lck_mtx_lock(mcache_llock);			\
	mcache_llock_owner = current_thread();		\
}

#define	MCACHE_LIST_UNLOCK() {				\
	mcache_llock_owner = NULL;			\
	lck_mtx_unlock(mcache_llock);			\
}

/* Shorthands for the per-cache mutexes */
#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)
/*
 * Atomically add n to the 32-bit value at address a.
 * This should be in a header file.  Both arguments are parenthesized in
 * the expansion so that invocations with compound expressions (e.g.
 * atomic_add_32(p + i, x | y)) bind correctly.
 */
#define	atomic_add_32(a, n)	((void) OSAddAtomic((n), (volatile SInt32 *)(a)))
103
/* Number of CPUs; set once at init from ml_get_max_cpus() */
static int ncpu;
/* Lock protecting the global cache list, and the thread holding it */
static lck_mtx_t *mcache_llock;
static struct thread *mcache_llock_owner;
static lck_attr_t *mcache_llock_attr;
static lck_grp_t *mcache_llock_grp;
static lck_grp_attr_t *mcache_llock_grp_attr;
/* Zone backing the variable-sized, cache-aligned mcache_t allocations */
static struct zone *mcache_zone;
/* Delay (in ticks) between periodic reap/update passes */
static unsigned int mcache_reap_interval;
/* Nonzero while a reap is in flight (manipulated atomically) */
static UInt32 mcache_reaping;
/* Set once mcache_init() has completed */
static int mcache_ready;
/* Set once the periodic cache update has been started */
static int mcache_updating;

/* Contention threshold that triggers a bucket-size upgrade */
static int mcache_bkt_contention = 3;
#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif

#define	DUMP_MCA_BUF_SIZE	512
/* Lazily-allocated buffer for formatting audit dumps */
static char *mca_dump_buf;

/*
 * Table of bucket sizes, ordered from smallest to largest bucket.
 * A cache uses the first entry whose minbuf threshold its chunk size
 * exceeds; each entry's bt_cache (filled in at init) supplies buckets
 * of bt_bktsize object slots.  NOTE(review): field order assumed to be
 * { bt_bktsize, bt_minbuf, bt_maxbuf, bt_cache } per usage in this file
 * — confirm against mcache.h.
 */
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1,	4096,	32768,	NULL },
	{ 3,	2048,	16384,	NULL },
	{ 7,	1024,	12288,	NULL },
	{ 15,	256,	8192,	NULL },
	{ 31,	64,	4096,	NULL },
	{ 47,	0,	2048,	NULL },
	{ 63,	0,	1024,	NULL },
	{ 95,	0,	512,	NULL },
	{ 143,	0,	256,	NULL },
	{ 165,	0,	0,	NULL },
};

static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_notifyfn_t,
    void *, u_int32_t, int, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkttype_t **);
static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
    mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(void *);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(void *);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);

/* Global list of all caches in the system; guarded by mcache_llock */
static LIST_HEAD(, mcache) mcache_head;
/* Cache of mcache_audit_t structures, used when MCF_AUDIT is enabled */
mcache_t *mcache_audit_cache;
172
173/*
174 * Initialize the framework; this is currently called as part of BSD init.
175 */
176__private_extern__ void
177mcache_init(void)
178{
179	mcache_bkttype_t *btp;
180	unsigned int i;
181	char name[32];
182
183	ncpu = ml_get_max_cpus();
184
185	mcache_llock_grp_attr = lck_grp_attr_alloc_init();
186	mcache_llock_grp = lck_grp_alloc_init("mcache.list",
187	    mcache_llock_grp_attr);
188	mcache_llock_attr = lck_attr_alloc_init();
189	mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);
190
191	mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
192	    PAGE_SIZE, "mcache");
193	if (mcache_zone == NULL)
194		panic("mcache_init: failed to allocate mcache zone\n");
195
196	LIST_INIT(&mcache_head);
197
198	for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
199		btp = &mcache_bkttype[i];
200		(void) snprintf(name, sizeof (name), "bkt_%d",
201		    btp->bt_bktsize);
202		btp->bt_cache = mcache_create(name,
203		    (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
204	}
205
206	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof (mcache_flags));
207	mcache_flags &= MCF_FLAGS_MASK;
208
209	mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
210	    0, 0, MCR_SLEEP);
211
212	mcache_reap_interval = 15 * hz;
213	mcache_applyall(mcache_cache_bkt_enable);
214	mcache_ready = 1;
215}
216
217/*
218 * Return the global mcache flags.
219 */
220__private_extern__ unsigned int
221mcache_getflags(void)
222{
223	return (mcache_flags);
224}
225
226/*
227 * Create a cache using the zone allocator as the backend slab allocator.
228 * The caller may specify any alignment for the object; if it specifies 0
229 * the default alignment (MCACHE_ALIGN) will be used.
230 */
231__private_extern__ mcache_t *
232mcache_create(const char *name, size_t bufsize, size_t align,
233    u_int32_t flags, int wait)
234{
235	return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
236	    mcache_slab_free, mcache_slab_audit, NULL, NULL, flags, 1, wait));
237}
238
239/*
240 * Create a cache using a custom backend slab allocator.  Since the caller
241 * is responsible for allocation, no alignment guarantee will be provided
242 * by this framework.
243 */
244__private_extern__ mcache_t *
245mcache_create_ext(const char *name, size_t bufsize,
246    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
247    mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int wait)
248{
249	return (mcache_create_common(name, bufsize, 0, allocfn,
250	    freefn, auditfn, notifyfn, arg, flags, 0, wait));
251}
252
253/*
254 * Common cache creation routine.
255 */
256static mcache_t *
257mcache_create_common(const char *name, size_t bufsize, size_t align,
258    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
259    mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int need_zone,
260    int wait)
261{
262	mcache_bkttype_t *btp;
263	mcache_t *cp = NULL;
264	size_t chunksize;
265	void *buf, **pbuf;
266	int c;
267	char lck_name[64];
268
269	/* If auditing is on and print buffer is NULL, allocate it now */
270	if ((flags & MCF_AUDIT) && mca_dump_buf == NULL) {
271		int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
272		MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
273		    malloc_wait | M_ZERO);
274		if (mca_dump_buf == NULL)
275			return (NULL);
276	}
277
278	if (!(wait & MCR_NOSLEEP))
279		buf = zalloc(mcache_zone);
280	else
281		buf = zalloc_noblock(mcache_zone);
282
283	if (buf == NULL)
284		goto fail;
285
286	bzero(buf, MCACHE_ALLOC_SIZE);
287
288	/*
289	 * In case we didn't get a cache-aligned memory, round it up
290	 * accordingly.  This is needed in order to get the rest of
291	 * structure members aligned properly.  It also means that
292	 * the memory span gets shifted due to the round up, but it
293	 * is okay since we've allocated extra space for this.
294	 */
295	cp = (mcache_t *)
296	    P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_SIZE);
297	pbuf = (void **)((intptr_t)cp - sizeof (void *));
298	*pbuf = buf;
299
300	/*
301	 * Guaranteed alignment is valid only when we use the internal
302	 * slab allocator (currently set to use the zone allocator).
303	 */
304	if (!need_zone)
305		align = 1;
306	else if (align == 0)
307		align = MCACHE_ALIGN;
308
309	if ((align & (align - 1)) != 0)
310		panic("mcache_create: bad alignment %lu", align);
311
312	cp->mc_align = align;
313	cp->mc_slab_alloc = allocfn;
314	cp->mc_slab_free = freefn;
315	cp->mc_slab_audit = auditfn;
316	cp->mc_slab_notify = notifyfn;
317	cp->mc_private = need_zone ? cp : arg;
318	cp->mc_bufsize = bufsize;
319	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
320
321	(void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);
322
323	(void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
324	cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
325	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
326	    cp->mc_cpu_lock_grp_attr);
327	cp->mc_cpu_lock_attr = lck_attr_alloc_init();
328
329	/*
330	 * Allocation chunk size is the object's size plus any extra size
331	 * needed to satisfy the object's alignment.  It is enforced to be
332	 * at least the size of an LP64 pointer to simplify auditing and to
333	 * handle multiple-element allocation requests, where the elements
334	 * returned are linked together in a list.
335	 */
336	chunksize = MAX(bufsize, sizeof (u_int64_t));
337	if (need_zone) {
338		/* Enforce 64-bit minimum alignment for zone-based buffers */
339		align = MAX(align, sizeof (u_int64_t));
340		chunksize += sizeof (void *) + align;
341		chunksize = P2ROUNDUP(chunksize, align);
342		if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
343		    PAGE_SIZE, cp->mc_name)) == NULL)
344			goto fail;
345		zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
346	}
347	cp->mc_chunksize = chunksize;
348
349	/*
350	 * Initialize the bucket layer.
351	 */
352	(void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
353	cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
354	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
355	    cp->mc_bkt_lock_grp_attr);
356	cp->mc_bkt_lock_attr = lck_attr_alloc_init();
357	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
358	    cp->mc_bkt_lock_attr);
359
360	(void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
361	cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
362	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
363	    cp->mc_sync_lock_grp_attr);
364	cp->mc_sync_lock_attr = lck_attr_alloc_init();
365	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
366	    cp->mc_sync_lock_attr);
367
368	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
369		continue;
370
371	cp->cache_bkttype = btp;
372
373	/*
374	 * Initialize the CPU layer.  Each per-CPU structure is aligned
375	 * on the CPU cache line boundary to prevent false sharing.
376	 */
377	for (c = 0; c < ncpu; c++) {
378		mcache_cpu_t *ccp = &cp->mc_cpu[c];
379
380		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_SIZE));
381		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
382		    cp->mc_cpu_lock_attr);
383		ccp->cc_objs = -1;
384		ccp->cc_pobjs = -1;
385	}
386
387	if (mcache_ready)
388		mcache_cache_bkt_enable(cp);
389
390	/* TODO: dynamically create sysctl for stats */
391
392	MCACHE_LIST_LOCK();
393	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
394	MCACHE_LIST_UNLOCK();
395
396	/*
397	 * If cache buckets are enabled and this is the first cache
398	 * created, start the periodic cache update.
399	 */
400	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
401		mcache_updating = 1;
402		mcache_update_timeout(NULL);
403	}
404	if (cp->mc_flags & MCF_DEBUG) {
405		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
406		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
407		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
408	}
409	return (cp);
410
411fail:
412	if (buf != NULL)
413		zfree(mcache_zone, buf);
414	return (NULL);
415}
416
417/*
418 * Allocate one or more objects from a cache.
419 */
420__private_extern__ unsigned int
421mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
422{
423	mcache_cpu_t *ccp;
424	mcache_obj_t **top = &(*list);
425	mcache_bkt_t *bkt;
426	unsigned int need = num;
427	boolean_t nwretry = FALSE;
428
429	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
430	VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));
431
432	ASSERT(list != NULL);
433	*list = NULL;
434
435	if (num == 0)
436		return (0);
437
438retry_alloc:
439	/* We may not always be running in the same CPU in case of retries */
440	ccp = MCACHE_CPU(cp);
441
442	MCACHE_LOCK(&ccp->cc_lock);
443	for (;;) {
444		/*
445		 * If we have an object in the current CPU's filled bucket,
446		 * chain the object to any previous objects and return if
447		 * we've satisfied the number of requested objects.
448		 */
449		if (ccp->cc_objs > 0) {
450			mcache_obj_t *tail;
451			int objs;
452
453			/*
454			 * Objects in the bucket are already linked together
455			 * with the most recently freed object at the head of
456			 * the list; grab as many objects as we can.
457			 */
458			objs = MIN((unsigned int)ccp->cc_objs, need);
459			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
460			ccp->cc_objs -= objs;
461			ccp->cc_alloc += objs;
462
463			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
464			list = &tail->obj_next;
465			*list = NULL;
466
467			/* If we got them all, return to caller */
468			if ((need -= objs) == 0) {
469				MCACHE_UNLOCK(&ccp->cc_lock);
470				if (cp->mc_flags & MCF_DEBUG)
471					goto debug_alloc;
472
473				return (num);
474			}
475		}
476
477		/*
478		 * The CPU's filled bucket is empty.  If the previous filled
479		 * bucket was full, exchange and try again.
480		 */
481		if (ccp->cc_pobjs > 0) {
482			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
483			continue;
484		}
485
486		/*
487		 * If the bucket layer is disabled, allocate from slab.  This
488		 * can happen either because MCF_NOCPUCACHE is set, or because
489		 * the bucket layer is currently being resized.
490		 */
491		if (ccp->cc_bktsize == 0)
492			break;
493
494		/*
495		 * Both of the CPU's buckets are empty; try to get a full
496		 * bucket from the bucket layer.  Upon success, refill this
497		 * CPU and place any empty bucket into the empty list.
498		 */
499		bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
500		if (bkt != NULL) {
501			if (ccp->cc_pfilled != NULL)
502				mcache_bkt_free(cp, &cp->mc_empty,
503				    ccp->cc_pfilled);
504			mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
505			continue;
506		}
507
508		/*
509		 * The bucket layer has no full buckets; allocate the
510		 * object(s) directly from the slab layer.
511		 */
512		break;
513	}
514	MCACHE_UNLOCK(&ccp->cc_lock);
515
516	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
517
518	/*
519	 * If this is a blocking allocation, or if it is non-blocking and
520	 * the cache's full bucket is non-empty, then retry the allocation.
521	 */
522	if (need > 0) {
523		if (!(wait & MCR_NONBLOCKING)) {
524			atomic_add_32(&cp->mc_wretry_cnt, 1);
525			goto retry_alloc;
526		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
527		    !mcache_bkt_isempty(cp)) {
528			if (!nwretry)
529				nwretry = TRUE;
530			atomic_add_32(&cp->mc_nwretry_cnt, 1);
531			goto retry_alloc;
532		} else if (nwretry) {
533			atomic_add_32(&cp->mc_nwfail_cnt, 1);
534		}
535	}
536
537	if (!(cp->mc_flags & MCF_DEBUG))
538		return (num - need);
539
540debug_alloc:
541	if (cp->mc_flags & MCF_VERIFY) {
542		mcache_obj_t **o = top;
543		unsigned int n;
544
545		n = 0;
546		/*
547		 * Verify that the chain of objects have the same count as
548		 * what we are about to report to the caller.  Any mismatch
549		 * here means that the object list is insanely broken and
550		 * therefore we must panic.
551		 */
552		while (*o != NULL) {
553			o = &(*o)->obj_next;
554			++n;
555		}
556		if (n != (num - need)) {
557			panic("mcache_alloc_ext: %s cp %p corrupted list "
558			    "(got %d actual %d)\n", cp->mc_name,
559			    (void *)cp, num - need, n);
560		}
561	}
562
563	/* Invoke the slab layer audit callback if auditing is enabled */
564	if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL)
565		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
566
567	return (num - need);
568}
569
570/*
571 * Allocate a single object from a cache.
572 */
573__private_extern__ void *
574mcache_alloc(mcache_t *cp, int wait)
575{
576	mcache_obj_t *buf;
577
578	(void) mcache_alloc_ext(cp, &buf, 1, wait);
579	return (buf);
580}
581
/* Record that a thread is now waiting for objects from this cache */
__private_extern__ void
mcache_waiter_inc(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, 1);
}
587
/* Record that a thread has stopped waiting for objects from this cache */
__private_extern__ void
mcache_waiter_dec(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, -1);
}
593
/* Unsynchronized hint: TRUE if the cache's full-bucket list looks empty */
__private_extern__ boolean_t
mcache_bkt_isempty(mcache_t *cp)
{
	/*
	 * This isn't meant to accurately tell whether there are
	 * any full buckets in the cache; it is simply a way to
	 * obtain "hints" about the state of the cache.
	 */
	return (cp->mc_full.bl_total == 0);
}
604
605/*
606 * Notify the slab layer about an event.
607 */
608static void
609mcache_notify(mcache_t *cp, u_int32_t event)
610{
611	if (cp->mc_slab_notify != NULL)
612		(*cp->mc_slab_notify)(cp->mc_private, event);
613}
614
615/*
616 * Purge the cache and disable its buckets.
617 */
618static void
619mcache_purge(void *arg)
620{
621	mcache_t *cp = arg;
622
623	mcache_bkt_purge(cp);
624	/*
625	 * We cannot simply call mcache_cache_bkt_enable() from here as
626	 * a bucket resize may be in flight and we would cause the CPU
627	 * layers of the cache to point to different sizes.  Therefore,
628	 * we simply increment the enable count so that during the next
629	 * periodic cache update the buckets can be reenabled.
630	 */
631	lck_mtx_lock_spin(&cp->mc_sync_lock);
632	cp->mc_enable_cnt++;
633	lck_mtx_unlock(&cp->mc_sync_lock);
634
635}
636
637__private_extern__ boolean_t
638mcache_purge_cache(mcache_t *cp)
639{
640	/*
641	 * Purging a cache that has no per-CPU caches or is already
642	 * in the process of being purged is rather pointless.
643	 */
644	if (cp->mc_flags & MCF_NOCPUCACHE)
645		return (FALSE);
646
647	lck_mtx_lock_spin(&cp->mc_sync_lock);
648	if (cp->mc_purge_cnt > 0) {
649		lck_mtx_unlock(&cp->mc_sync_lock);
650		return (FALSE);
651	}
652	cp->mc_purge_cnt++;
653	lck_mtx_unlock(&cp->mc_sync_lock);
654
655	mcache_dispatch(mcache_purge, cp);
656
657	return (TRUE);
658}
659
660/*
661 * Free a single object to a cache.
662 */
663__private_extern__ void
664mcache_free(mcache_t *cp, void *buf)
665{
666	((mcache_obj_t *)buf)->obj_next = NULL;
667	mcache_free_ext(cp, (mcache_obj_t *)buf);
668}
669
670/*
671 * Free one or more objects to a cache.
672 */
673__private_extern__ void
674mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
675{
676	mcache_cpu_t *ccp = MCACHE_CPU(cp);
677	mcache_bkttype_t *btp;
678	mcache_obj_t *nlist;
679	mcache_bkt_t *bkt;
680
681	/* Invoke the slab layer audit callback if auditing is enabled */
682	if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL)
683		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
684
685	MCACHE_LOCK(&ccp->cc_lock);
686	for (;;) {
687		/*
688		 * If there is space in the current CPU's filled bucket, put
689		 * the object there and return once all objects are freed.
690		 * Note the cast to unsigned integer takes care of the case
691		 * where the bucket layer is disabled (when cc_objs is -1).
692		 */
693		if ((unsigned int)ccp->cc_objs <
694		    (unsigned int)ccp->cc_bktsize) {
695			/*
696			 * Reverse the list while we place the object into the
697			 * bucket; this effectively causes the most recently
698			 * freed object(s) to be reused during allocation.
699			 */
700			nlist = list->obj_next;
701			list->obj_next = (ccp->cc_objs == 0) ? NULL :
702			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
703			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
704			ccp->cc_free++;
705
706			if ((list = nlist) != NULL)
707				continue;
708
709			/* We are done; return to caller */
710			MCACHE_UNLOCK(&ccp->cc_lock);
711
712			/* If there is a waiter below, notify it */
713			if (cp->mc_waiter_cnt > 0)
714				mcache_notify(cp, MCN_RETRYALLOC);
715			return;
716		}
717
718		/*
719		 * The CPU's filled bucket is full.  If the previous filled
720		 * bucket was empty, exchange and try again.
721		 */
722		if (ccp->cc_pobjs == 0) {
723			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
724			continue;
725		}
726
727		/*
728		 * If the bucket layer is disabled, free to slab.  This can
729		 * happen either because MCF_NOCPUCACHE is set, or because
730		 * the bucket layer is currently being resized.
731		 */
732		if (ccp->cc_bktsize == 0)
733			break;
734
735		/*
736		 * Both of the CPU's buckets are full; try to get an empty
737		 * bucket from the bucket layer.  Upon success, empty this
738		 * CPU and place any full bucket into the full list.
739		 */
740		bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
741		if (bkt != NULL) {
742			if (ccp->cc_pfilled != NULL)
743				mcache_bkt_free(cp, &cp->mc_full,
744				    ccp->cc_pfilled);
745			mcache_cpu_refill(ccp, bkt, 0);
746			continue;
747		}
748
749		/*
750		 * We need an empty bucket to put our freed objects into
751		 * but couldn't get an empty bucket from the bucket layer;
752		 * attempt to allocate one.  We do not want to block for
753		 * allocation here, and if the bucket allocation fails
754		 * we will simply fall through to the slab layer.
755		 */
756		MCACHE_UNLOCK(&ccp->cc_lock);
757		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
758		MCACHE_LOCK(&ccp->cc_lock);
759
760		if (bkt != NULL) {
761			/*
762			 * We have an empty bucket, but since we drop the
763			 * CPU lock above, the cache's bucket size may have
764			 * changed.  If so, free the bucket and try again.
765			 */
766			if (ccp->cc_bktsize != btp->bt_bktsize) {
767				MCACHE_UNLOCK(&ccp->cc_lock);
768				mcache_free(btp->bt_cache, bkt);
769				MCACHE_LOCK(&ccp->cc_lock);
770				continue;
771			}
772
773			/*
774			 * We have an empty bucket of the right size;
775			 * add it to the bucket layer and try again.
776			 */
777			mcache_bkt_free(cp, &cp->mc_empty, bkt);
778			continue;
779		}
780
781		/*
782		 * The bucket layer has no empty buckets; free the
783		 * object(s) directly to the slab layer.
784		 */
785		break;
786	}
787	MCACHE_UNLOCK(&ccp->cc_lock);
788
789	/* If there is a waiter below, notify it */
790	if (cp->mc_waiter_cnt > 0)
791		mcache_notify(cp, MCN_RETRYALLOC);
792
793	/* Advise the slab layer to purge the object(s) */
794	(*cp->mc_slab_free)(cp->mc_private, list,
795	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
796}
797
798/*
799 * Cache destruction routine.
800 */
801__private_extern__ void
802mcache_destroy(mcache_t *cp)
803{
804	void **pbuf;
805
806	MCACHE_LIST_LOCK();
807	LIST_REMOVE(cp, mc_list);
808	MCACHE_LIST_UNLOCK();
809
810	mcache_bkt_purge(cp);
811
812	/*
813	 * This cache is dead; there should be no further transaction.
814	 * If it's still invoked, make sure that it induces a fault.
815	 */
816	cp->mc_slab_alloc = NULL;
817	cp->mc_slab_free = NULL;
818	cp->mc_slab_audit = NULL;
819
820	lck_attr_free(cp->mc_bkt_lock_attr);
821	lck_grp_free(cp->mc_bkt_lock_grp);
822	lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);
823
824	lck_attr_free(cp->mc_cpu_lock_attr);
825	lck_grp_free(cp->mc_cpu_lock_grp);
826	lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);
827
828	lck_attr_free(cp->mc_sync_lock_attr);
829	lck_grp_free(cp->mc_sync_lock_grp);
830	lck_grp_attr_free(cp->mc_sync_lock_grp_attr);
831
832	/*
833	 * TODO: We need to destroy the zone here, but cannot do it
834	 * because there is no such way to achieve that.  Until then
835	 * the memory allocated for the zone structure is leaked.
836	 * Once it is achievable, uncomment these lines:
837	 *
838	 *	if (cp->mc_slab_zone != NULL) {
839	 *		zdestroy(cp->mc_slab_zone);
840	 *		cp->mc_slab_zone = NULL;
841	 *	}
842	 */
843
844	/* Get the original address since we're about to free it */
845	pbuf = (void **)((intptr_t)cp - sizeof (void *));
846
847	zfree(mcache_zone, *pbuf);
848}
849
850/*
851 * Internal slab allocator used as a backend for simple caches.  The current
852 * implementation uses the zone allocator for simplicity reasons.
853 */
854static unsigned int
855mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
856{
857	mcache_t *cp = arg;
858	unsigned int need = num;
859	size_t offset = 0;
860	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
861	u_int32_t flags = cp->mc_flags;
862	void *buf, *base, **pbuf;
863	mcache_obj_t **list = *plist;
864
865	*list = NULL;
866
867	/*
868	 * The address of the object returned to the caller is an
869	 * offset from the 64-bit aligned base address only if the
870	 * cache's alignment requirement is neither 1 nor 8 bytes.
871	 */
872	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
873		offset = cp->mc_align;
874
875	for (;;) {
876		if (!(wait & MCR_NOSLEEP))
877			buf = zalloc(cp->mc_slab_zone);
878		else
879			buf = zalloc_noblock(cp->mc_slab_zone);
880
881		if (buf == NULL)
882			break;
883
884		/* Get the 64-bit aligned base address for this object */
885		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
886		    sizeof (u_int64_t));
887
888		/*
889		 * Wind back a pointer size from the aligned base and
890		 * save the original address so we can free it later.
891		 */
892		pbuf = (void **)((intptr_t)base - sizeof (void *));
893		*pbuf = buf;
894
895		/*
896		 * If auditing is enabled, patternize the contents of
897		 * the buffer starting from the 64-bit aligned base to
898		 * the end of the buffer; the length is rounded up to
899		 * the nearest 64-bit multiply; this is because we use
900		 * 64-bit memory access to set/check the pattern.
901		 */
902		if (flags & MCF_AUDIT) {
903			VERIFY(((intptr_t)base + rsize) <=
904			    ((intptr_t)buf + cp->mc_chunksize));
905			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
906		}
907
908		/*
909		 * Fix up the object's address to fulfill the cache's
910		 * alignment requirement (if needed) and return this
911		 * to the caller.
912		 */
913		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
914		    ((intptr_t)buf + cp->mc_chunksize));
915		*list = (mcache_obj_t *)((intptr_t)base + offset);
916
917		(*list)->obj_next = NULL;
918		list = *plist = &(*list)->obj_next;
919
920		/* If we got them all, return to mcache */
921		if (--need == 0)
922			break;
923	}
924
925	return (num - need);
926}
927
928/*
929 * Internal slab deallocator used as a backend for simple caches.
930 */
931static void
932mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
933{
934	mcache_t *cp = arg;
935	mcache_obj_t *nlist;
936	size_t offset = 0;
937	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
938	u_int32_t flags = cp->mc_flags;
939	void *base;
940	void **pbuf;
941
942	/*
943	 * The address of the object is an offset from a 64-bit
944	 * aligned base address only if the cache's alignment
945	 * requirement is neither 1 nor 8 bytes.
946	 */
947	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
948		offset = cp->mc_align;
949
950	for (;;) {
951		nlist = list->obj_next;
952		list->obj_next = NULL;
953
954		/* Get the 64-bit aligned base address of this object */
955		base = (void *)((intptr_t)list - offset);
956		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
957
958		/* Get the original address since we're about to free it */
959		pbuf = (void **)((intptr_t)base - sizeof (void *));
960
961		if (flags & MCF_AUDIT) {
962			VERIFY(((intptr_t)base + rsize) <=
963			    ((intptr_t)*pbuf + cp->mc_chunksize));
964			mcache_audit_free_verify(NULL, base, offset, rsize);
965		}
966
967		/* Free it to zone */
968		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
969		    ((intptr_t)*pbuf + cp->mc_chunksize));
970		zfree(cp->mc_slab_zone, *pbuf);
971
972		/* No more objects to free; return to mcache */
973		if ((list = nlist) == NULL)
974			break;
975	}
976}
977
978/*
979 * Internal slab auditor for simple caches.
980 */
981static void
982mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
983{
984	mcache_t *cp = arg;
985	size_t offset = 0;
986	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
987	void *base, **pbuf;
988
989	/*
990	 * The address of the object returned to the caller is an
991	 * offset from the 64-bit aligned base address only if the
992	 * cache's alignment requirement is neither 1 nor 8 bytes.
993	 */
994	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
995		offset = cp->mc_align;
996
997	while (list != NULL) {
998		mcache_obj_t *next = list->obj_next;
999
1000		/* Get the 64-bit aligned base address of this object */
1001		base = (void *)((intptr_t)list - offset);
1002		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));
1003
1004		/* Get the original address */
1005		pbuf = (void **)((intptr_t)base - sizeof (void *));
1006
1007		VERIFY(((intptr_t)base + rsize) <=
1008		    ((intptr_t)*pbuf + cp->mc_chunksize));
1009
1010		if (!alloc)
1011			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1012		else
1013			mcache_audit_free_verify_set(NULL, base, offset, rsize);
1014
1015		list = list->obj_next = next;
1016	}
1017}
1018
1019/*
1020 * Refill the CPU's filled bucket with bkt and save the previous one.
1021 */
1022static void
1023mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1024{
1025	ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1026	    (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1027	ASSERT(ccp->cc_bktsize > 0);
1028
1029	ccp->cc_pfilled = ccp->cc_filled;
1030	ccp->cc_pobjs = ccp->cc_objs;
1031	ccp->cc_filled = bkt;
1032	ccp->cc_objs = objs;
1033}
1034
1035/*
1036 * Allocate a bucket from the bucket layer.
1037 */
1038static mcache_bkt_t *
1039mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
1040{
1041	mcache_bkt_t *bkt;
1042
1043	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1044		/*
1045		 * The bucket layer lock is held by another CPU; increase
1046		 * the contention count so that we can later resize the
1047		 * bucket size accordingly.
1048		 */
1049		MCACHE_LOCK(&cp->mc_bkt_lock);
1050		cp->mc_bkt_contention++;
1051	}
1052
1053	if ((bkt = blp->bl_list) != NULL) {
1054		blp->bl_list = bkt->bkt_next;
1055		if (--blp->bl_total < blp->bl_min)
1056			blp->bl_min = blp->bl_total;
1057		blp->bl_alloc++;
1058	}
1059
1060	if (btp != NULL)
1061		*btp = cp->cache_bkttype;
1062
1063	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1064
1065	return (bkt);
1066}
1067
1068/*
1069 * Free a bucket to the bucket layer.
1070 */
1071static void
1072mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1073{
1074	MCACHE_LOCK(&cp->mc_bkt_lock);
1075
1076	bkt->bkt_next = blp->bl_list;
1077	blp->bl_list = bkt;
1078	blp->bl_total++;
1079
1080	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1081}
1082
1083/*
1084 * Enable the bucket layer of a cache.
1085 */
1086static void
1087mcache_cache_bkt_enable(mcache_t *cp)
1088{
1089	mcache_cpu_t *ccp;
1090	int cpu;
1091
1092	if (cp->mc_flags & MCF_NOCPUCACHE)
1093		return;
1094
1095	for (cpu = 0; cpu < ncpu; cpu++) {
1096		ccp = &cp->mc_cpu[cpu];
1097		MCACHE_LOCK(&ccp->cc_lock);
1098		ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1099		MCACHE_UNLOCK(&ccp->cc_lock);
1100	}
1101}
1102
1103/*
1104 * Purge all buckets from a cache and disable its bucket layer.
1105 */
static void
mcache_bkt_purge(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	mcache_bkt_t *bp, *pbp;
	mcache_bkttype_t *btp;
	int cpu, objs, pobjs;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];

		MCACHE_LOCK(&ccp->cc_lock);

		/*
		 * Detach both the filled and previously-filled buckets
		 * under the CPU lock; setting cc_bktsize to 0 disables
		 * this CPU's bucket layer until it is reenabled later.
		 */
		btp = cp->cache_bkttype;
		bp = ccp->cc_filled;
		pbp = ccp->cc_pfilled;
		objs = ccp->cc_objs;
		pobjs = ccp->cc_pobjs;
		ccp->cc_filled = NULL;
		ccp->cc_pfilled = NULL;
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
		ccp->cc_bktsize = 0;

		MCACHE_UNLOCK(&ccp->cc_lock);

		/* Destroy the detached buckets outside the CPU lock */
		if (bp != NULL)
			mcache_bkt_destroy(cp, btp, bp, objs);
		if (pbp != NULL)
			mcache_bkt_destroy(cp, btp, pbp, pobjs);
	}

	/*
	 * Updating the working set back to back essentially sets
	 * the working set size to zero, so everything is reapable.
	 */
	mcache_bkt_ws_update(cp);
	mcache_bkt_ws_update(cp);

	mcache_bkt_ws_reap(cp);
}
1147
1148/*
1149 * Free one or more objects in the bucket to the slab layer,
1150 * and also free the bucket itself.
1151 */
1152static void
1153mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
1154    int nobjs)
1155{
1156	if (nobjs > 0) {
1157		mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1158
1159		if (cp->mc_flags & MCF_VERIFY) {
1160			mcache_obj_t *o = top;
1161			int cnt = 0;
1162
1163			/*
1164			 * Verify that the chain of objects in the bucket is
1165			 * valid.  Any mismatch here means a mistake when the
1166			 * object(s) were freed to the CPU layer, so we panic.
1167			 */
1168			while (o != NULL) {
1169				o = o->obj_next;
1170				++cnt;
1171			}
1172			if (cnt != nobjs) {
1173				panic("mcache_bkt_destroy: %s cp %p corrupted "
1174				    "list in bkt %p (nobjs %d actual %d)\n",
1175				    cp->mc_name, (void *)cp, (void *)bkt,
1176				    nobjs, cnt);
1177			}
1178		}
1179
1180		/* Advise the slab layer to purge the object(s) */
1181		(*cp->mc_slab_free)(cp->mc_private, top,
1182		    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1183	}
1184	mcache_free(btp->bt_cache, bkt);
1185}
1186
1187/*
1188 * Update the bucket layer working set statistics.
1189 */
1190static void
1191mcache_bkt_ws_update(mcache_t *cp)
1192{
1193	MCACHE_LOCK(&cp->mc_bkt_lock);
1194
1195	cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1196	cp->mc_full.bl_min = cp->mc_full.bl_total;
1197	cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1198	cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1199
1200	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1201}
1202
1203/*
1204 * Reap all buckets that are beyond the working set.
1205 */
1206static void
1207mcache_bkt_ws_reap(mcache_t *cp)
1208{
1209	long reap;
1210	mcache_bkt_t *bkt;
1211	mcache_bkttype_t *btp;
1212
1213	reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1214	while (reap-- &&
1215	    (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
1216		mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);
1217
1218	reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1219	while (reap-- &&
1220	    (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
1221		mcache_bkt_destroy(cp, btp, bkt, 0);
1222}
1223
1224static void
1225mcache_reap_timeout(void *arg)
1226{
1227	volatile UInt32 *flag = arg;
1228
1229	ASSERT(flag == &mcache_reaping);
1230
1231	*flag = 0;
1232}
1233
1234static void
1235mcache_reap_done(void *flag)
1236{
1237	timeout(mcache_reap_timeout, flag, mcache_reap_interval);
1238}
1239
1240static void
1241mcache_reap_start(void *arg)
1242{
1243	UInt32 *flag = arg;
1244
1245	ASSERT(flag == &mcache_reaping);
1246
1247	mcache_applyall(mcache_cache_reap);
1248	mcache_dispatch(mcache_reap_done, flag);
1249}
1250
1251__private_extern__ void
1252mcache_reap(void)
1253{
1254	UInt32 *flag = &mcache_reaping;
1255
1256	if (mcache_llock_owner == current_thread() ||
1257	    !OSCompareAndSwap(0, 1, flag))
1258		return;
1259
1260	mcache_dispatch(mcache_reap_start, flag);
1261}
1262
/*
 * Per-cache reap callback: trim buckets beyond the working set.
 */
static void
mcache_cache_reap(mcache_t *cp)
{
	mcache_bkt_ws_reap(cp);
}
1268
1269/*
1270 * Performs period maintenance on a cache.
1271 */
1272static void
1273mcache_cache_update(mcache_t *cp)
1274{
1275	int need_bkt_resize = 0;
1276	int need_bkt_reenable = 0;
1277
1278	lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);
1279
1280	mcache_bkt_ws_update(cp);
1281
1282	/*
1283	 * Cache resize and post-purge reenable are mutually exclusive.
1284	 * If the cache was previously purged, there is no point of
1285	 * increasing the bucket size as there was an indication of
1286	 * memory pressure on the system.
1287	 */
1288	lck_mtx_lock_spin(&cp->mc_sync_lock);
1289	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
1290		need_bkt_reenable = 1;
1291	lck_mtx_unlock(&cp->mc_sync_lock);
1292
1293	MCACHE_LOCK(&cp->mc_bkt_lock);
1294	/*
1295	 * If the contention count is greater than the threshold, and if
1296	 * we are not already at the maximum bucket size, increase it.
1297	 * Otherwise, if this cache was previously purged by the user
1298	 * then we simply reenable it.
1299	 */
1300	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1301	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1302	    mcache_bkt_contention && !need_bkt_reenable)
1303		need_bkt_resize = 1;
1304
1305	cp ->mc_bkt_contention_prev = cp->mc_bkt_contention;
1306	MCACHE_UNLOCK(&cp->mc_bkt_lock);
1307
1308	if (need_bkt_resize)
1309		mcache_dispatch(mcache_cache_bkt_resize, cp);
1310	else if (need_bkt_reenable)
1311		mcache_dispatch(mcache_cache_enable, cp);
1312}
1313
1314/*
1315 * Recompute a cache's bucket size.  This is an expensive operation
1316 * and should not be done frequently; larger buckets provide for a
1317 * higher transfer rate with the bucket while smaller buckets reduce
1318 * the memory consumption.
1319 */
1320static void
1321mcache_cache_bkt_resize(void *arg)
1322{
1323	mcache_t *cp = arg;
1324	mcache_bkttype_t *btp = cp->cache_bkttype;
1325
1326	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1327		mcache_bkt_purge(cp);
1328
1329		/*
1330		 * Upgrade to the next bucket type with larger bucket size;
1331		 * temporarily set the previous contention snapshot to a
1332		 * negative number to prevent unnecessary resize request.
1333		 */
1334		MCACHE_LOCK(&cp->mc_bkt_lock);
1335		cp->cache_bkttype = ++btp;
1336		cp ->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1337		MCACHE_UNLOCK(&cp->mc_bkt_lock);
1338
1339		mcache_cache_enable(cp);
1340	}
1341}
1342
1343/*
1344 * Reenable a previously disabled cache due to purge.
1345 */
1346static void
1347mcache_cache_enable(void *arg)
1348{
1349	mcache_t *cp = arg;
1350
1351	lck_mtx_lock_spin(&cp->mc_sync_lock);
1352	cp->mc_purge_cnt = 0;
1353	cp->mc_enable_cnt = 0;
1354	lck_mtx_unlock(&cp->mc_sync_lock);
1355
1356	mcache_cache_bkt_enable(cp);
1357}
1358
1359static void
1360mcache_update_timeout(__unused void *arg)
1361{
1362	timeout(mcache_update, NULL, mcache_reap_interval);
1363}
1364
1365static void
1366mcache_update(__unused void *arg)
1367{
1368	mcache_applyall(mcache_cache_update);
1369	mcache_dispatch(mcache_update_timeout, NULL);
1370}
1371
1372static void
1373mcache_applyall(void (*func)(mcache_t *))
1374{
1375	mcache_t *cp;
1376
1377	MCACHE_LIST_LOCK();
1378	LIST_FOREACH(cp, &mcache_head, mc_list) {
1379		func(cp);
1380	}
1381	MCACHE_LIST_UNLOCK();
1382}
1383
1384static void
1385mcache_dispatch(void (*func)(void *), void *arg)
1386{
1387	ASSERT(func != NULL);
1388	timeout(func, arg, hz/1000);
1389}
1390
/*
 * Record a transaction on a buffer in its audit structure: the
 * current thread/stack log becomes the "previous" transaction, and
 * a fresh backtrace of the caller is captured in its place.
 */
__private_extern__ void
mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp)
{
	mca->mca_addr = addr;
	mca->mca_cache = cp;
	/* Demote the last transaction's thread and stack to "previous" */
	mca->mca_pthread = mca->mca_thread;
	mca->mca_thread = current_thread();
	bcopy(mca->mca_stack, mca->mca_pstack, sizeof (mca->mca_pstack));
	mca->mca_pdepth = mca->mca_depth;
	/* Capture the current caller's backtrace */
	bzero(mca->mca_stack, sizeof (mca->mca_stack));
	mca->mca_depth = OSBacktrace(mca->mca_stack, MCACHE_STACK_DEPTH);
}
1403
1404__private_extern__ void
1405mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1406{
1407	u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size);
1408	u_int64_t *buf = (u_int64_t *)buf_arg;
1409
1410	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1411	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1412
1413	while (buf < buf_end)
1414		*buf++ = pattern;
1415}
1416
1417__private_extern__ void *
1418mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1419{
1420	u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size);
1421	u_int64_t *buf;
1422
1423	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1424	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1425
1426	for (buf = buf_arg; buf < buf_end; buf++) {
1427		if (*buf != pattern)
1428			return (buf);
1429	}
1430	return (NULL);
1431}
1432
1433__private_extern__ void *
1434mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1435    size_t size)
1436{
1437	u_int64_t *buf_end = (u_int64_t *)((char *)buf_arg + size);
1438	u_int64_t *buf;
1439
1440	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
1441	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));
1442
1443	for (buf = buf_arg; buf < buf_end; buf++) {
1444		if (*buf != old) {
1445			mcache_set_pattern(old, buf_arg,
1446			    (uintptr_t)buf - (uintptr_t)buf_arg);
1447			return (buf);
1448		}
1449		*buf = new;
1450	}
1451	return (NULL);
1452}
1453
/*
 * Verify that a buffer that was freed with the free pattern applied
 * has not been modified while on the freelist.  The word overlapping
 * the freelist linkage ("obj_next") is temporarily overwritten with
 * the pattern so the entire buffer can be checked, then restored.
 * Panics (via mcache_audit_panic) on the first deviating word.
 */
__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	/* Buffer is clean; restore the freelist linkage */
	((mcache_obj_t *)addr)->obj_next = next;
}
1477
/*
 * Like mcache_audit_free_verify(), but additionally reinitializes the
 * buffer with the "uninitialized" pattern as each word is verified
 * (done in one pass by mcache_verify_set_pattern).  Used when handing
 * a previously freed buffer back out to a caller.
 */
__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	/* Buffer is clean; restore the freelist linkage */
	((mcache_obj_t *)addr)->obj_next = next;
}
1501
1502#undef panic(...)
1503
1504__private_extern__ char *
1505mcache_dump_mca(mcache_audit_t *mca)
1506{
1507	if (mca_dump_buf == NULL)
1508		return (NULL);
1509
1510	snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
1511	    "mca %p: addr %p, cache %p (%s)\n"
1512	    "last transaction; thread %p, saved PC stack (%d deep):\n"
1513	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1514	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1515	    "previous transaction; thread %p, saved PC stack (%d deep):\n"
1516	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1517	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n",
1518	    mca, mca->mca_addr, mca->mca_cache,
1519	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
1520	    mca->mca_thread, mca->mca_depth,
1521	    mca->mca_stack[0], mca->mca_stack[1], mca->mca_stack[2],
1522	    mca->mca_stack[3], mca->mca_stack[4], mca->mca_stack[5],
1523	    mca->mca_stack[6], mca->mca_stack[7], mca->mca_stack[8],
1524	    mca->mca_stack[9], mca->mca_stack[10], mca->mca_stack[11],
1525	    mca->mca_stack[12], mca->mca_stack[13], mca->mca_stack[14],
1526	    mca->mca_stack[15],
1527	    mca->mca_pthread, mca->mca_pdepth,
1528	    mca->mca_pstack[0], mca->mca_pstack[1], mca->mca_pstack[2],
1529	    mca->mca_pstack[3], mca->mca_pstack[4], mca->mca_pstack[5],
1530	    mca->mca_pstack[6], mca->mca_pstack[7], mca->mca_pstack[8],
1531	    mca->mca_pstack[9], mca->mca_pstack[10], mca->mca_pstack[11],
1532	    mca->mca_pstack[12], mca->mca_pstack[13], mca->mca_pstack[14],
1533	    mca->mca_pstack[15]);
1534
1535	return (mca_dump_buf);
1536}
1537
/*
 * Panic with details of a buffer that was modified after free; if an
 * audit structure is available, append its transaction log (threads
 * and backtraces) to the panic message.  Never returns.
 */
__private_extern__ void
mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got)
{
	if (mca == NULL) {
		/* No audit record; report the corruption alone */
		panic("mcache_audit: buffer %p modified after free at "
		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
		    offset, got, expected);
		/* NOTREACHED */
	}

	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	    "(0x%llx instead of 0x%llx)\n%s\n",
	    addr, offset, got, expected, mcache_dump_mca(mca));
	/* NOTREACHED */
}
1554
1555__private_extern__ int
1556assfail(const char *a, const char *f, int l)
1557{
1558	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1559	return (0);
1560}
1561