/*
 * Copyright (c) 2006-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Memory allocator with per-CPU caching, derived from the kmem magazine
 * concept and implementation as described in the following paper:
 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
 * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
 * reserved.  Use is subject to license terms.
 *
 * There are several major differences between this and the original kmem
 * magazine: this derivative implementation allows multiple objects to be
 * allocated and freed from/to the object cache in one call; in addition,
 * it provides greater flexibility by allowing the client to supply its
 * own slab allocator (instead of the default zone allocator).  Finally,
 * no object construction/destruction takes place at the moment, although
 * this could be added in the future to improve efficiency.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/systm.h>

#include <kern/debug.h>
#include <kern/zalloc.h>
#include <kern/cpu_number.h>
#include <kern/locks.h>

#include <libkern/libkern.h>
#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>

#include <mach/vm_param.h>
#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <string.h>

#include <sys/mcache.h>

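/*
 * MCACHE_SIZE(n) computes the size of an mcache_t that embeds n per-CPU
 * structures, i.e. the byte offset of mc_cpu[n].  MCACHE_CPU(c) returns
 * a pointer to the per-CPU structure belonging to the CPU executing the
 * caller.
 */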
#define	MCACHE_SIZE(n) \
	((size_t)(&((mcache_t *)0)->mc_cpu[n]))

/* Allocate extra in case we need to manually align the pointer */
#define	MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

#define	MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define	MCACHE_LIST_LOCK() {				\
	lck_mtx_lock(mcache_llock);			\
	mcache_llock_owner = current_thread();		\
}

#define	MCACHE_LIST_UNLOCK() {				\
	mcache_llock_owner = NULL;			\
	lck_mtx_unlock(mcache_llock);			\
}

#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)

static int ncpu;
static unsigned int cache_line_size;
static lck_mtx_t *mcache_llock;
static struct thread *mcache_llock_owner;
static lck_attr_t *mcache_llock_attr;
static lck_grp_t *mcache_llock_grp;
static lck_grp_attr_t *mcache_llock_grp_attr;
static struct zone *mcache_zone;
static unsigned int mcache_reap_interval;
static UInt32 mcache_reaping;
static int mcache_ready;
static int mcache_updating;

static int mcache_bkt_contention = 3;
#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif

#define	DUMP_MCA_BUF_SIZE	512
static char *mca_dump_buf;

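/*
 * Table of bucket types, ordered by increasing bucket size (objects per
 * bucket).  The columns correspond to bt_bktsize (objects per bucket),
 * bt_minbuf and bt_maxbuf (buffer size thresholds used when selecting
 * and resizing the bucket type), and bt_cache (the cache that buckets
 * of this type are allocated from, filled in by mcache_init()).
 */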
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1,	4096,	32768,	NULL },
	{ 3,	2048,	16384,	NULL },
	{ 7,	1024,	12288,	NULL },
	{ 15,	256,	8192,	NULL },
	{ 31,	64,	4096,	NULL },
	{ 47,	0,	2048,	NULL },
	{ 63,	0,	1024,	NULL },
	{ 95,	0,	512,	NULL },
	{ 143,	0,	256,	NULL },
	{ 165,	0,	0,	NULL },
};

static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
    mcache_notifyfn_t, void *, u_int32_t, int, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkttype_t **);
static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
    mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(void *);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(void *);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);

static LIST_HEAD(, mcache) mcache_head;
mcache_t *mcache_audit_cache;

/*
 * Initialize the framework; this is currently called as part of BSD init.
 */
__private_extern__ void
mcache_init(void)
{
	mcache_bkttype_t *btp;
	unsigned int i;
	char name[32];

	ncpu = ml_get_max_cpus();
	(void) mcache_cache_line_size();	/* prime it */

	mcache_llock_grp_attr = lck_grp_attr_alloc_init();
	mcache_llock_grp = lck_grp_alloc_init("mcache.list",
	    mcache_llock_grp_attr);
	mcache_llock_attr = lck_attr_alloc_init();
	mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);

	mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
	    PAGE_SIZE, "mcache");
	if (mcache_zone == NULL)
		panic("mcache_init: failed to allocate mcache zone\n");
	zone_change(mcache_zone, Z_CALLERACCT, FALSE);

	LIST_INIT(&mcache_head);

	for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
		btp = &mcache_bkttype[i];
		(void) snprintf(name, sizeof (name), "bkt_%d",
		    btp->bt_bktsize);
		btp->bt_cache = mcache_create(name,
		    (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
	}

	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof (mcache_flags));
	mcache_flags &= MCF_FLAGS_MASK;

	mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
	    0, 0, MCR_SLEEP);

	mcache_reap_interval = 15 * hz;
	mcache_applyall(mcache_cache_bkt_enable);
	mcache_ready = 1;

	printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
	    ncpu, CPU_CACHE_LINE_SIZE);
}

/*
 * Return the global mcache flags.
 */
__private_extern__ unsigned int
mcache_getflags(void)
{
	return (mcache_flags);
}

/*
 * Return the CPU cache line size.
 */
__private_extern__ unsigned int
mcache_cache_line_size(void)
{
	if (cache_line_size == 0) {
		ml_cpu_info_t cpu_info;
		ml_cpu_get_info(&cpu_info);
		cache_line_size = cpu_info.cache_line_size;
	}
	return (cache_line_size);
}

/*
 * Create a cache using the zone allocator as the backend slab allocator.
 * The caller may specify any alignment for the object; if it specifies 0
 * the default alignment (MCACHE_ALIGN) will be used.
 */
__private_extern__ mcache_t *
mcache_create(const char *name, size_t bufsize, size_t align,
    u_int32_t flags, int wait)
{
	return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
	    mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
	    wait));
}

/*
 * Create a cache using a custom backend slab allocator.  Since the caller
 * is responsible for allocation, no alignment guarantee will be provided
 * by this framework.
 */
__private_extern__ mcache_t *
mcache_create_ext(const char *name, size_t bufsize,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int wait)
{
	return (mcache_create_common(name, bufsize, 0, allocfn,
	    freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
}
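
/*
 * Illustrative usage sketch (not part of this file): a typical client
 * creates a cache once at initialization time and then allocates and
 * frees fixed-size objects through it.  The names below ("my_cache",
 * "struct my_obj", "my_subsys_init") are hypothetical.
 *
 *	static mcache_t *my_cache;
 *
 *	void
 *	my_subsys_init(void)
 *	{
 *		my_cache = mcache_create("my_obj", sizeof (struct my_obj),
 *		    0, 0, MCR_SLEEP);
 *	}
 *
 *	struct my_obj *
 *	my_obj_alloc(int canblock)
 *	{
 *		return (mcache_alloc(my_cache,
 *		    canblock ? MCR_SLEEP : MCR_NOSLEEP));
 *	}
 *
 *	void
 *	my_obj_free(struct my_obj *p)
 *	{
 *		mcache_free(my_cache, p);
 *	}
 */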

/*
 * Common cache creation routine.
 */
static mcache_t *
mcache_create_common(const char *name, size_t bufsize, size_t align,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int need_zone, int wait)
{
	mcache_bkttype_t *btp;
	mcache_t *cp = NULL;
	size_t chunksize;
	void *buf, **pbuf;
	int c;
	char lck_name[64];

	/* If auditing is on and print buffer is NULL, allocate it now */
	if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
		int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
		MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
		    malloc_wait | M_ZERO);
		if (mca_dump_buf == NULL)
			return (NULL);
	}

	if (!(wait & MCR_NOSLEEP))
		buf = zalloc(mcache_zone);
	else
		buf = zalloc_noblock(mcache_zone);

	if (buf == NULL)
		goto fail;

	bzero(buf, MCACHE_ALLOC_SIZE);

	/*
	 * In case we didn't get cache-aligned memory, round it up
	 * accordingly.  This is needed in order to get the rest of the
	 * structure members aligned properly.  It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	cp = (mcache_t *)
	    P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_LINE_SIZE);
	pbuf = (void **)((intptr_t)cp - sizeof (void *));
	*pbuf = buf;

	/*
	 * Guaranteed alignment is valid only when we use the internal
	 * slab allocator (currently set to use the zone allocator).
	 */
	if (!need_zone)
		align = 1;
	else if (align == 0)
		align = MCACHE_ALIGN;

	if ((align & (align - 1)) != 0)
		panic("mcache_create: bad alignment %lu", align);

	cp->mc_align = align;
	cp->mc_slab_alloc = allocfn;
	cp->mc_slab_free = freefn;
	cp->mc_slab_audit = auditfn;
	cp->mc_slab_log = logfn;
	cp->mc_slab_notify = notifyfn;
	cp->mc_private = need_zone ? cp : arg;
	cp->mc_bufsize = bufsize;
	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;

	(void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);

	(void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
	cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_cpu_lock_grp_attr);
	cp->mc_cpu_lock_attr = lck_attr_alloc_init();

	/*
	 * Allocation chunk size is the object's size plus any extra size
	 * needed to satisfy the object's alignment.  It is enforced to be
	 * at least the size of an LP64 pointer to simplify auditing and to
	 * handle multiple-element allocation requests, where the elements
	 * returned are linked together in a list.
	 */
	chunksize = MAX(bufsize, sizeof (u_int64_t));
	if (need_zone) {
		/* Enforce 64-bit minimum alignment for zone-based buffers */
		align = MAX(align, sizeof (u_int64_t));
		chunksize += sizeof (void *) + align;
		chunksize = P2ROUNDUP(chunksize, align);
		if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
		    PAGE_SIZE, cp->mc_name)) == NULL)
			goto fail;
		zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
	}
	cp->mc_chunksize = chunksize;

	/*
	 * Initialize the bucket layer.
	 */
	(void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
	cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_bkt_lock_grp_attr);
	cp->mc_bkt_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
	    cp->mc_bkt_lock_attr);

	(void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
	cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_sync_lock_grp_attr);
	cp->mc_sync_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
	    cp->mc_sync_lock_attr);

	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
		continue;

	cp->cache_bkttype = btp;

	/*
	 * Initialize the CPU layer.  Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	for (c = 0; c < ncpu; c++) {
		mcache_cpu_t *ccp = &cp->mc_cpu[c];

		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
		    cp->mc_cpu_lock_attr);
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
	}

	if (mcache_ready)
		mcache_cache_bkt_enable(cp);

	/* TODO: dynamically create sysctl for stats */

	MCACHE_LIST_LOCK();
	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	MCACHE_LIST_UNLOCK();

	/*
	 * If cache buckets are enabled and this is the first cache
	 * created, start the periodic cache update.
	 */
	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
		mcache_updating = 1;
		mcache_update_timeout(NULL);
	}
	if (cp->mc_flags & MCF_DEBUG) {
		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	}
	return (cp);

fail:
	if (buf != NULL)
		zfree(mcache_zone, buf);
	return (NULL);
}

/*
 * Allocate one or more objects from a cache.
 */
__private_extern__ unsigned int
mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
{
	mcache_cpu_t *ccp;
	mcache_obj_t **top = &(*list);
	mcache_bkt_t *bkt;
	unsigned int need = num;
	boolean_t nwretry = FALSE;

	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));

	ASSERT(list != NULL);
	*list = NULL;

	if (num == 0)
		return (0);

retry_alloc:
	/* We may not always be running on the same CPU in case of retries */
	ccp = MCACHE_CPU(cp);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If we have an object in the current CPU's filled bucket,
		 * chain the object to any previous objects and return if
		 * we've satisfied the number of requested objects.
		 */
		if (ccp->cc_objs > 0) {
			mcache_obj_t *tail;
			int objs;

			/*
			 * Objects in the bucket are already linked together
			 * with the most recently freed object at the head of
			 * the list; grab as many objects as we can.
			 */
			objs = MIN((unsigned int)ccp->cc_objs, need);
			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_objs -= objs;
			ccp->cc_alloc += objs;

			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
			list = &tail->obj_next;
			*list = NULL;

			/* If we got them all, return to caller */
			if ((need -= objs) == 0) {
				MCACHE_UNLOCK(&ccp->cc_lock);

				if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
				    cp->mc_slab_log != NULL)
					(*cp->mc_slab_log)(num, *top, TRUE);

				if (cp->mc_flags & MCF_DEBUG)
					goto debug_alloc;

				return (num);
			}
		}

		/*
		 * The CPU's filled bucket is empty.  If the previous filled
		 * bucket was full, exchange and try again.
		 */
		if (ccp->cc_pobjs > 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, allocate from slab.  This
		 * can happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0)
			break;

		/*
		 * Both of the CPU's buckets are empty; try to get a full
		 * bucket from the bucket layer.  Upon success, refill this
		 * CPU and place any empty bucket into the empty list.
		 */
		bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
		if (bkt != NULL) {
			if (ccp->cc_pfilled != NULL)
				mcache_bkt_free(cp, &cp->mc_empty,
				    ccp->cc_pfilled);
			mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
			continue;
		}

		/*
		 * The bucket layer has no full buckets; allocate the
		 * object(s) directly from the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);

	/*
	 * If this is a blocking allocation, or if it is non-blocking and
	 * the cache's full bucket is non-empty, then retry the allocation.
	 */
	if (need > 0) {
		if (!(wait & MCR_NONBLOCKING)) {
			atomic_add_32(&cp->mc_wretry_cnt, 1);
			goto retry_alloc;
		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
		    !mcache_bkt_isempty(cp)) {
			if (!nwretry)
				nwretry = TRUE;
			atomic_add_32(&cp->mc_nwretry_cnt, 1);
			goto retry_alloc;
		} else if (nwretry) {
			atomic_add_32(&cp->mc_nwfail_cnt, 1);
		}
	}

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
		(*cp->mc_slab_log)((num - need), *top, TRUE);

	if (!(cp->mc_flags & MCF_DEBUG))
		return (num - need);

debug_alloc:
	if (cp->mc_flags & MCF_DEBUG) {
		mcache_obj_t **o = top;
		unsigned int n;

		n = 0;
		/*
		 * Verify that the chain of objects has the same count as
		 * what we are about to report to the caller.  Any mismatch
		 * here means that the object list is insanely broken and
		 * therefore we must panic.
		 */
		while (*o != NULL) {
			o = &(*o)->obj_next;
			++n;
		}
		if (n != (num - need)) {
			panic("mcache_alloc_ext: %s cp %p corrupted list "
			    "(got %d actual %d)\n", cp->mc_name,
			    (void *)cp, num - need, n);
		}
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);

	return (num - need);
}

/*
 * Allocate a single object from a cache.
 */
__private_extern__ void *
mcache_alloc(mcache_t *cp, int wait)
{
	mcache_obj_t *buf;

	(void) mcache_alloc_ext(cp, &buf, 1, wait);
	return (buf);
}
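
/*
 * Illustrative sketch of the batched interface (hypothetical caller and
 * cache name): request up to 32 objects in one call; the objects come
 * back chained through obj_next.  Objects being returned are chained
 * the same way and handed back in a single call.
 *
 *	mcache_obj_t *list = NULL;
 *	unsigned int n;
 *
 *	n = mcache_alloc_ext(my_cache, &list, 32, MCR_NOSLEEP);
 *	if (n != 0) {
 *		... use the n objects chained off "list" ...
 *		mcache_free_ext(my_cache, list);
 *	}
 */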

__private_extern__ void
mcache_waiter_inc(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, 1);
}

__private_extern__ void
mcache_waiter_dec(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, -1);
}

__private_extern__ boolean_t
mcache_bkt_isempty(mcache_t *cp)
{
	/*
	 * This isn't meant to accurately tell whether there are
	 * any full buckets in the cache; it is simply a way to
	 * obtain "hints" about the state of the cache.
	 */
	return (cp->mc_full.bl_total == 0);
}

/*
 * Notify the slab layer about an event.
 */
static void
mcache_notify(mcache_t *cp, u_int32_t event)
{
	if (cp->mc_slab_notify != NULL)
		(*cp->mc_slab_notify)(cp->mc_private, event);
}

/*
 * Purge the cache and disable its buckets.
 */
static void
mcache_purge(void *arg)
{
	mcache_t *cp = arg;

	mcache_bkt_purge(cp);
	/*
	 * We cannot simply call mcache_cache_bkt_enable() from here as
	 * a bucket resize may be in flight and we would cause the CPU
	 * layers of the cache to point to different sizes.  Therefore,
	 * we simply increment the enable count so that during the next
	 * periodic cache update the buckets can be reenabled.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	cp->mc_enable_cnt++;
	lck_mtx_unlock(&cp->mc_sync_lock);
}

__private_extern__ boolean_t
mcache_purge_cache(mcache_t *cp)
{
	/*
	 * Purging a cache that has no per-CPU caches or is already
	 * in the process of being purged is rather pointless.
	 */
	if (cp->mc_flags & MCF_NOCPUCACHE)
		return (FALSE);

	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (cp->mc_purge_cnt > 0) {
		lck_mtx_unlock(&cp->mc_sync_lock);
		return (FALSE);
	}
	cp->mc_purge_cnt++;
	lck_mtx_unlock(&cp->mc_sync_lock);

	mcache_dispatch(mcache_purge, cp);

	return (TRUE);
}

/*
 * Free a single object to a cache.
 */
__private_extern__ void
mcache_free(mcache_t *cp, void *buf)
{
	((mcache_obj_t *)buf)->obj_next = NULL;
	mcache_free_ext(cp, (mcache_obj_t *)buf);
}

/*
 * Free one or more objects to a cache.
 */
__private_extern__ void
mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
{
	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	mcache_bkttype_t *btp;
	mcache_obj_t *nlist;
	mcache_bkt_t *bkt;

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
		(*cp->mc_slab_log)(0, list, FALSE);

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If there is space in the current CPU's filled bucket, put
		 * the object there and return once all objects are freed.
		 * Note the cast to unsigned integer takes care of the case
		 * where the bucket layer is disabled (when cc_objs is -1).
		 */
		if ((unsigned int)ccp->cc_objs <
		    (unsigned int)ccp->cc_bktsize) {
			/*
			 * Reverse the list while we place the object into the
			 * bucket; this effectively causes the most recently
			 * freed object(s) to be reused during allocation.
			 */
			nlist = list->obj_next;
			list->obj_next = (ccp->cc_objs == 0) ? NULL :
			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
			ccp->cc_free++;

			if ((list = nlist) != NULL)
				continue;

			/* We are done; return to caller */
			MCACHE_UNLOCK(&ccp->cc_lock);

			/* If there is a waiter below, notify it */
			if (cp->mc_waiter_cnt > 0)
				mcache_notify(cp, MCN_RETRYALLOC);
			return;
		}

		/*
		 * The CPU's filled bucket is full.  If the previous filled
		 * bucket was empty, exchange and try again.
		 */
		if (ccp->cc_pobjs == 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, free to slab.  This can
		 * happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0)
			break;

		/*
		 * Both of the CPU's buckets are full; try to get an empty
		 * bucket from the bucket layer.  Upon success, empty this
		 * CPU and place any full bucket into the full list.
		 */
		bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
		if (bkt != NULL) {
			if (ccp->cc_pfilled != NULL)
				mcache_bkt_free(cp, &cp->mc_full,
				    ccp->cc_pfilled);
			mcache_cpu_refill(ccp, bkt, 0);
			continue;
		}

		/*
		 * We need an empty bucket to put our freed objects into
		 * but couldn't get an empty bucket from the bucket layer;
		 * attempt to allocate one.  We do not want to block for
		 * allocation here, and if the bucket allocation fails
		 * we will simply fall through to the slab layer.
		 */
		MCACHE_UNLOCK(&ccp->cc_lock);
		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
		MCACHE_LOCK(&ccp->cc_lock);

		if (bkt != NULL) {
			/*
			 * We have an empty bucket, but since we dropped the
			 * CPU lock above, the cache's bucket size may have
			 * changed.  If so, free the bucket and try again.
			 */
			if (ccp->cc_bktsize != btp->bt_bktsize) {
				MCACHE_UNLOCK(&ccp->cc_lock);
				mcache_free(btp->bt_cache, bkt);
				MCACHE_LOCK(&ccp->cc_lock);
				continue;
			}

			/*
			 * We have an empty bucket of the right size;
			 * add it to the bucket layer and try again.
			 */
			mcache_bkt_free(cp, &cp->mc_empty, bkt);
			continue;
		}

		/*
		 * The bucket layer has no empty buckets; free the
		 * object(s) directly to the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	/* If there is a waiter below, notify it */
	if (cp->mc_waiter_cnt > 0)
		mcache_notify(cp, MCN_RETRYALLOC);

	/* Advise the slab layer to purge the object(s) */
	(*cp->mc_slab_free)(cp->mc_private, list,
	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
}

/*
 * Cache destruction routine.
 */
__private_extern__ void
mcache_destroy(mcache_t *cp)
{
	void **pbuf;

	MCACHE_LIST_LOCK();
	LIST_REMOVE(cp, mc_list);
	MCACHE_LIST_UNLOCK();

	mcache_bkt_purge(cp);

	/*
	 * This cache is dead; there should be no further transactions.
	 * If it's still invoked, make sure that it induces a fault.
	 */
	cp->mc_slab_alloc = NULL;
	cp->mc_slab_free = NULL;
	cp->mc_slab_audit = NULL;

	lck_attr_free(cp->mc_bkt_lock_attr);
	lck_grp_free(cp->mc_bkt_lock_grp);
	lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);

	lck_attr_free(cp->mc_cpu_lock_attr);
	lck_grp_free(cp->mc_cpu_lock_grp);
	lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);

	lck_attr_free(cp->mc_sync_lock_attr);
	lck_grp_free(cp->mc_sync_lock_grp);
	lck_grp_attr_free(cp->mc_sync_lock_grp_attr);

	/*
	 * TODO: We need to destroy the zone here, but cannot do it
	 * because the zone allocator currently provides no way to
	 * tear down a zone.  Until then the memory allocated for the
	 * zone structure is leaked.  Once it becomes possible,
	 * uncomment these lines:
	 *
	 *	if (cp->mc_slab_zone != NULL) {
	 *		zdestroy(cp->mc_slab_zone);
	 *		cp->mc_slab_zone = NULL;
	 *	}
	 */

	/* Get the original address since we're about to free it */
	pbuf = (void **)((intptr_t)cp - sizeof (void *));

	zfree(mcache_zone, *pbuf);
}

/*
 * Internal slab allocator used as a backend for simple caches.  The current
 * implementation uses the zone allocator for simplicity.
 */
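/*
 * Layout of each zone-backed chunk handed out by mcache_slab_alloc()
 * (summary of the logic below):
 *
 *	+---------------- chunk (mc_chunksize bytes) ----------------+
 *	| buf ... | pbuf (saved buf) | base (64-bit aligned) ... obj |
 *	+-------------------------------------------------------------+
 *
 * "buf" is the raw zone allocation; "pbuf" is the pointer-sized slot
 * immediately before "base" that remembers buf so mcache_slab_free()
 * can return it to the zone; the object given to the caller starts at
 * "base" plus an extra offset when the cache's alignment is neither
 * 1 nor 8 bytes.
 */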
static unsigned int
mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mcache_t *cp = arg;
	unsigned int need = num;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *buf, *base, **pbuf;
	mcache_obj_t **list = *plist;

	*list = NULL;

	/*
	 * The address of the object returned to the caller is an
	 * offset from the 64-bit aligned base address only if the
	 * cache's alignment requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	for (;;) {
		if (!(wait & MCR_NOSLEEP))
			buf = zalloc(cp->mc_slab_zone);
		else
			buf = zalloc_noblock(cp->mc_slab_zone);

		if (buf == NULL)
			break;

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
		    sizeof (u_int64_t));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof (void *));
		*pbuf = buf;

		/*
		 * If auditing is enabled, patternize the contents of
		 * the buffer starting from the 64-bit aligned base to
		 * the end of the buffer; the length is rounded up to
		 * the nearest 64-bit multiple; this is because we use
		 * 64-bit memory access to set/check the pattern.
		 */
		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)buf + cp->mc_chunksize));
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		}

		/*
		 * Fix up the object's address to fulfill the cache's
		 * alignment requirement (if needed) and return this
		 * to the caller.
		 */
		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
		    ((intptr_t)buf + cp->mc_chunksize));
		*list = (mcache_obj_t *)((intptr_t)base + offset);

		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		/* If we got them all, return to mcache */
		if (--need == 0)
			break;
	}

	return (num - need);
}

/*
 * Internal slab deallocator used as a backend for simple caches.
 */
static void
mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
{
	mcache_t *cp = arg;
	mcache_obj_t *nlist;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *base;
	void **pbuf;

	/*
	 * The address of the object is an offset from a 64-bit
	 * aligned base address only if the cache's alignment
	 * requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;

		/* Get the 64-bit aligned base address of this object */
		base = (void *)((intptr_t)list - offset);
		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));

		/* Get the original address since we're about to free it */
		pbuf = (void **)((intptr_t)base - sizeof (void *));

		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)*pbuf + cp->mc_chunksize));
			mcache_audit_free_verify(NULL, base, offset, rsize);
		}

		/* Free it to zone */
		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));
		zfree(cp->mc_slab_zone, *pbuf);

		/* No more objects to free; return to mcache */
		if ((list = nlist) == NULL)
			break;
	}
}

/*
 * Internal slab auditor for simple caches.
 */
static void
mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mcache_t *cp = arg;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	void *base, **pbuf;

	/*
	 * The address of the object returned to the caller is an
	 * offset from the 64-bit aligned base address only if the
	 * cache's alignment requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	while (list != NULL) {
		mcache_obj_t *next = list->obj_next;

		/* Get the 64-bit aligned base address of this object */
		base = (void *)((intptr_t)list - offset);
		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));

		/* Get the original address */
		pbuf = (void **)((intptr_t)base - sizeof (void *));

		VERIFY(((intptr_t)base + rsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));

		if (!alloc)
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		else
			mcache_audit_free_verify_set(NULL, base, offset, rsize);

		list = list->obj_next = next;
	}
}

/*
 * Refill the CPU's filled bucket with bkt and save the previous one.
 */
static void
mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
{
	ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
	    (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
	ASSERT(ccp->cc_bktsize > 0);

	ccp->cc_pfilled = ccp->cc_filled;
	ccp->cc_pobjs = ccp->cc_objs;
	ccp->cc_filled = bkt;
	ccp->cc_objs = objs;
}

/*
 * Allocate a bucket from the bucket layer.
 */
static mcache_bkt_t *
mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
{
	mcache_bkt_t *bkt;

	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
		/*
		 * The bucket layer lock is held by another CPU; increase
		 * the contention count so that we can later adjust the
		 * bucket size accordingly.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->mc_bkt_contention++;
	}

	if ((bkt = blp->bl_list) != NULL) {
		blp->bl_list = bkt->bkt_next;
		if (--blp->bl_total < blp->bl_min)
			blp->bl_min = blp->bl_total;
		blp->bl_alloc++;
	}

	if (btp != NULL)
		*btp = cp->cache_bkttype;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	return (bkt);
}

/*
 * Free a bucket to the bucket layer.
 */
static void
mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
{
	MCACHE_LOCK(&cp->mc_bkt_lock);

	bkt->bkt_next = blp->bl_list;
	blp->bl_list = bkt;
	blp->bl_total++;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);
}

/*
 * Enable the bucket layer of a cache.
 */
static void
mcache_cache_bkt_enable(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	int cpu;

	if (cp->mc_flags & MCF_NOCPUCACHE)
		return;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];
		MCACHE_LOCK(&ccp->cc_lock);
		ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
		MCACHE_UNLOCK(&ccp->cc_lock);
	}
}

/*
 * Purge all buckets from a cache and disable its bucket layer.
 */
static void
mcache_bkt_purge(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	mcache_bkt_t *bp, *pbp;
	mcache_bkttype_t *btp;
	int cpu, objs, pobjs;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];

		MCACHE_LOCK(&ccp->cc_lock);

		btp = cp->cache_bkttype;
		bp = ccp->cc_filled;
		pbp = ccp->cc_pfilled;
		objs = ccp->cc_objs;
		pobjs = ccp->cc_pobjs;
		ccp->cc_filled = NULL;
		ccp->cc_pfilled = NULL;
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
		ccp->cc_bktsize = 0;

		MCACHE_UNLOCK(&ccp->cc_lock);

		if (bp != NULL)
			mcache_bkt_destroy(cp, btp, bp, objs);
		if (pbp != NULL)
			mcache_bkt_destroy(cp, btp, pbp, pobjs);
	}

	/*
	 * Updating the working set back to back essentially sets
	 * the working set size to zero, so everything is reapable.
	 */
	mcache_bkt_ws_update(cp);
	mcache_bkt_ws_update(cp);

	mcache_bkt_ws_reap(cp);
}

/*
 * Free one or more objects in the bucket to the slab layer,
 * and also free the bucket itself.
 */
static void
mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
    int nobjs)
{
	if (nobjs > 0) {
		mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];

		if (cp->mc_flags & MCF_DEBUG) {
			mcache_obj_t *o = top;
			int cnt = 0;

			/*
			 * Verify that the chain of objects in the bucket is
			 * valid.  Any mismatch here means a mistake when the
			 * object(s) were freed to the CPU layer, so we panic.
			 */
			while (o != NULL) {
				o = o->obj_next;
				++cnt;
			}
			if (cnt != nobjs) {
				panic("mcache_bkt_destroy: %s cp %p corrupted "
				    "list in bkt %p (nobjs %d actual %d)\n",
				    cp->mc_name, (void *)cp, (void *)bkt,
				    nobjs, cnt);
			}
		}

		/* Advise the slab layer to purge the object(s) */
		(*cp->mc_slab_free)(cp->mc_private, top,
		    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
	}
	mcache_free(btp->bt_cache, bkt);
}

/*
 * Update the bucket layer working set statistics.
 */
static void
mcache_bkt_ws_update(mcache_t *cp)
{
	MCACHE_LOCK(&cp->mc_bkt_lock);

	cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
	cp->mc_full.bl_min = cp->mc_full.bl_total;
	cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
	cp->mc_empty.bl_min = cp->mc_empty.bl_total;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);
}

/*
 * Reap all buckets that are beyond the working set.
 */
static void
mcache_bkt_ws_reap(mcache_t *cp)
{
	long reap;
	mcache_bkt_t *bkt;
	mcache_bkttype_t *btp;

	reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
	while (reap-- &&
	    (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
		mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);

	reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
	while (reap-- &&
	    (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
		mcache_bkt_destroy(cp, btp, bkt, 0);
}

static void
mcache_reap_timeout(void *arg)
{
	volatile UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	*flag = 0;
}

static void
mcache_reap_done(void *flag)
{
	timeout(mcache_reap_timeout, flag, mcache_reap_interval);
}

static void
mcache_reap_start(void *arg)
{
	UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	mcache_applyall(mcache_cache_reap);
	mcache_dispatch(mcache_reap_done, flag);
}

__private_extern__ void
mcache_reap(void)
{
	UInt32 *flag = &mcache_reaping;

	if (mcache_llock_owner == current_thread() ||
	    !OSCompareAndSwap(0, 1, flag))
		return;

	mcache_dispatch(mcache_reap_start, flag);
}

static void
mcache_cache_reap(mcache_t *cp)
{
	mcache_bkt_ws_reap(cp);
}

/*
 * Performs periodic maintenance on a cache.
 */
static void
mcache_cache_update(mcache_t *cp)
{
	int need_bkt_resize = 0;
	int need_bkt_reenable = 0;

	lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);

	mcache_bkt_ws_update(cp);

	/*
	 * Cache resize and post-purge reenable are mutually exclusive.
	 * If the cache was previously purged, there is no point in
	 * increasing the bucket size as there was an indication of
	 * memory pressure on the system.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
		need_bkt_reenable = 1;
	lck_mtx_unlock(&cp->mc_sync_lock);

	MCACHE_LOCK(&cp->mc_bkt_lock);
	/*
	 * If the contention count is greater than the threshold, and if
	 * we are not already at the maximum bucket size, increase it.
	 * Otherwise, if this cache was previously purged by the user
	 * then we simply reenable it.
	 */
	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	    mcache_bkt_contention && !need_bkt_reenable)
		need_bkt_resize = 1;

	cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	if (need_bkt_resize)
		mcache_dispatch(mcache_cache_bkt_resize, cp);
	else if (need_bkt_reenable)
		mcache_dispatch(mcache_cache_enable, cp);
}

/*
 * Recompute a cache's bucket size.  This is an expensive operation
 * and should not be done frequently; larger buckets allow more objects
 * to be transferred per trip to the bucket layer, while smaller buckets
 * reduce memory consumption.
 */
static void
mcache_cache_bkt_resize(void *arg)
{
	mcache_t *cp = arg;
	mcache_bkttype_t *btp = cp->cache_bkttype;

	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
		mcache_bkt_purge(cp);

		/*
		 * Upgrade to the next bucket type with larger bucket size;
		 * temporarily set the previous contention snapshot to a
		 * negative number to prevent unnecessary resize requests.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->cache_bkttype = ++btp;
		cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
		MCACHE_UNLOCK(&cp->mc_bkt_lock);

		mcache_cache_enable(cp);
	}
}

/*
 * Reenable a cache previously disabled by a purge.
 */
static void
mcache_cache_enable(void *arg)
{
	mcache_t *cp = arg;

	lck_mtx_lock_spin(&cp->mc_sync_lock);
	cp->mc_purge_cnt = 0;
	cp->mc_enable_cnt = 0;
	lck_mtx_unlock(&cp->mc_sync_lock);

	mcache_cache_bkt_enable(cp);
}

static void
mcache_update_timeout(__unused void *arg)
{
	timeout(mcache_update, NULL, mcache_reap_interval);
}

static void
mcache_update(__unused void *arg)
{
	mcache_applyall(mcache_cache_update);
	mcache_dispatch(mcache_update_timeout, NULL);
}

static void
mcache_applyall(void (*func)(mcache_t *))
{
	mcache_t *cp;

	MCACHE_LIST_LOCK();
	LIST_FOREACH(cp, &mcache_head, mc_list) {
		func(cp);
	}
	MCACHE_LIST_UNLOCK();
}

static void
mcache_dispatch(void (*func)(void *), void *arg)
{
	ASSERT(func != NULL);
	timeout(func, arg, hz/1000);
}

__private_extern__ void
mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
    struct timeval *base_ts)
{
	struct timeval now, base = { 0, 0 };
	void *stack[MCACHE_STACK_DEPTH + 1];

	mca->mca_addr = addr;
	mca->mca_cache = cp;
	mca->mca_pthread = mca->mca_thread;
	mca->mca_thread = current_thread();
	bcopy(mca->mca_stack, mca->mca_pstack, sizeof (mca->mca_pstack));
	mca->mca_pdepth = mca->mca_depth;
	bzero(stack, sizeof (stack));
	mca->mca_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], mca->mca_stack, sizeof (mca->mca_pstack));

	mca->mca_ptstamp = mca->mca_tstamp;
	microuptime(&now);
	if (base_ts != NULL)
		base = *base_ts;
	/* tstamp is in ms relative to base_ts */
	mca->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
	if ((now.tv_sec - base.tv_sec) > 0)
		mca->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
}

__private_extern__ void
mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf = (u_int64_t *)buf_arg;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	while (buf < buf_end)
		*buf++ = pattern;
}

__private_extern__ void *
mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != pattern)
			return (buf);
	}
	return (NULL);
}

__private_extern__ void *
mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
    size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != old) {
			mcache_set_pattern(old, buf_arg,
			    (uintptr_t)buf - (uintptr_t)buf_arg);
			return (buf);
		}
		*buf = new;
	}
	return (NULL);
}

__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}

__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}

#undef panic

__private_extern__ char *
mcache_dump_mca(mcache_audit_t *mca)
{
	if (mca_dump_buf == NULL)
		return (NULL);

	snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
	    "mca %p: addr %p, cache %p (%s)\n"
	    "last transaction; thread %p, saved PC stack (%d deep):\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "previous transaction; thread %p, saved PC stack (%d deep):\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n",
	    mca, mca->mca_addr, mca->mca_cache,
	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
	    mca->mca_thread, mca->mca_depth,
	    mca->mca_stack[0], mca->mca_stack[1], mca->mca_stack[2],
	    mca->mca_stack[3], mca->mca_stack[4], mca->mca_stack[5],
	    mca->mca_stack[6], mca->mca_stack[7], mca->mca_stack[8],
	    mca->mca_stack[9], mca->mca_stack[10], mca->mca_stack[11],
	    mca->mca_stack[12], mca->mca_stack[13], mca->mca_stack[14],
	    mca->mca_stack[15],
	    mca->mca_pthread, mca->mca_pdepth,
	    mca->mca_pstack[0], mca->mca_pstack[1], mca->mca_pstack[2],
	    mca->mca_pstack[3], mca->mca_pstack[4], mca->mca_pstack[5],
	    mca->mca_pstack[6], mca->mca_pstack[7], mca->mca_pstack[8],
	    mca->mca_pstack[9], mca->mca_pstack[10], mca->mca_pstack[11],
	    mca->mca_pstack[12], mca->mca_pstack[13], mca->mca_pstack[14],
	    mca->mca_pstack[15]);

	return (mca_dump_buf);
}

__private_extern__ void
mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got)
{
	if (mca == NULL) {
		panic("mcache_audit: buffer %p modified after free at "
		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
		    offset, got, expected);
		/* NOTREACHED */
	}

	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	    "(0x%llx instead of 0x%llx)\n%s\n",
	    addr, offset, got, expected, mcache_dump_mca(mca));
	/* NOTREACHED */
}

__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	return (0);
}