/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Kernel memory allocator, as described in the following two papers:
 *
 * Jeff Bonwick,
 * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
 * Proceedings of the Summer 1994 Usenix Conference.
 * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
 *
 * Jeff Bonwick and Jonathan Adams,
 * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
 * Arbitrary Resources.
 * Proceedings of the 2001 Usenix Conference.
 * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
 */
42
43#include <sys/kmem_impl.h>
44#include <sys/vmem_impl.h>
45#include <sys/param.h>
46#include <sys/sysmacros.h>
47#include <sys/vm.h>
48#include <sys/proc.h>
49#include <sys/tuneable.h>
50#include <sys/systm.h>
51#include <sys/cmn_err.h>
52#include <sys/debug.h>
53#include <sys/mutex.h>
54#include <sys/bitmap.h>
55#include <sys/atomic.h>
56#include <sys/kobj.h>
57#include <sys/disp.h>
58#include <vm/seg_kmem.h>
59#include <sys/log.h>
60#include <sys/callb.h>
61#include <sys/taskq.h>
62#include <sys/modctl.h>
63#include <sys/reboot.h>
64#include <sys/id32.h>
65#include <sys/zone.h>
66#include <sys/netstack.h>
67
68extern void streams_msg_init(void);
69extern int segkp_fromheap;
70extern void segkp_cache_free(void);
71
72struct kmem_cache_kstat {
73	kstat_named_t	kmc_buf_size;
74	kstat_named_t	kmc_align;
75	kstat_named_t	kmc_chunk_size;
76	kstat_named_t	kmc_slab_size;
77	kstat_named_t	kmc_alloc;
78	kstat_named_t	kmc_alloc_fail;
79	kstat_named_t	kmc_free;
80	kstat_named_t	kmc_depot_alloc;
81	kstat_named_t	kmc_depot_free;
82	kstat_named_t	kmc_depot_contention;
83	kstat_named_t	kmc_slab_alloc;
84	kstat_named_t	kmc_slab_free;
85	kstat_named_t	kmc_buf_constructed;
86	kstat_named_t	kmc_buf_avail;
87	kstat_named_t	kmc_buf_inuse;
88	kstat_named_t	kmc_buf_total;
89	kstat_named_t	kmc_buf_max;
90	kstat_named_t	kmc_slab_create;
91	kstat_named_t	kmc_slab_destroy;
92	kstat_named_t	kmc_vmem_source;
93	kstat_named_t	kmc_hash_size;
94	kstat_named_t	kmc_hash_lookup_depth;
95	kstat_named_t	kmc_hash_rescale;
96	kstat_named_t	kmc_full_magazines;
97	kstat_named_t	kmc_empty_magazines;
98	kstat_named_t	kmc_magazine_size;
99} kmem_cache_kstat = {
100	{ "buf_size",		KSTAT_DATA_UINT64 },
101	{ "align",		KSTAT_DATA_UINT64 },
102	{ "chunk_size",		KSTAT_DATA_UINT64 },
103	{ "slab_size",		KSTAT_DATA_UINT64 },
104	{ "alloc",		KSTAT_DATA_UINT64 },
105	{ "alloc_fail",		KSTAT_DATA_UINT64 },
106	{ "free",		KSTAT_DATA_UINT64 },
107	{ "depot_alloc",	KSTAT_DATA_UINT64 },
108	{ "depot_free",		KSTAT_DATA_UINT64 },
109	{ "depot_contention",	KSTAT_DATA_UINT64 },
110	{ "slab_alloc",		KSTAT_DATA_UINT64 },
111	{ "slab_free",		KSTAT_DATA_UINT64 },
112	{ "buf_constructed",	KSTAT_DATA_UINT64 },
113	{ "buf_avail",		KSTAT_DATA_UINT64 },
114	{ "buf_inuse",		KSTAT_DATA_UINT64 },
115	{ "buf_total",		KSTAT_DATA_UINT64 },
116	{ "buf_max",		KSTAT_DATA_UINT64 },
117	{ "slab_create",	KSTAT_DATA_UINT64 },
118	{ "slab_destroy",	KSTAT_DATA_UINT64 },
119	{ "vmem_source",	KSTAT_DATA_UINT64 },
120	{ "hash_size",		KSTAT_DATA_UINT64 },
121	{ "hash_lookup_depth",	KSTAT_DATA_UINT64 },
122	{ "hash_rescale",	KSTAT_DATA_UINT64 },
123	{ "full_magazines",	KSTAT_DATA_UINT64 },
124	{ "empty_magazines",	KSTAT_DATA_UINT64 },
125	{ "magazine_size",	KSTAT_DATA_UINT64 },
126};
127
128static kmutex_t kmem_cache_kstat_lock;
129
130/*
131 * The default set of caches to back kmem_alloc().
132 * These sizes should be reevaluated periodically.
133 *
134 * We want allocations that are multiples of the coherency granularity
135 * (64 bytes) to be satisfied from a cache which is a multiple of 64
136 * bytes, so that it will be 64-byte aligned.  For all multiples of 64,
 * the next kmem_alloc_sizes[] entry greater than or equal to it must be a
138 * multiple of 64.
139 */
140static const int kmem_alloc_sizes[] = {
141	1 * 8,
142	2 * 8,
143	3 * 8,
144	4 * 8,		5 * 8,		6 * 8,		7 * 8,
145	4 * 16,		5 * 16,		6 * 16,		7 * 16,
146	4 * 32,		5 * 32,		6 * 32,		7 * 32,
147	4 * 64,		5 * 64,		6 * 64,		7 * 64,
148	4 * 128,	5 * 128,	6 * 128,	7 * 128,
149	P2ALIGN(8192 / 7, 64),
150	P2ALIGN(8192 / 6, 64),
151	P2ALIGN(8192 / 5, 64),
152	P2ALIGN(8192 / 4, 64),
153	P2ALIGN(8192 / 3, 64),
154	P2ALIGN(8192 / 2, 64),
155	P2ALIGN(8192 / 1, 64),
156	4096 * 3,
157	8192 * 2,
158	8192 * 3,
159	8192 * 4,
160};
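
/*
 * Illustrative mapping (see kmem_alloc() below): a 100-byte request has
 * index (100 - 1) >> KMEM_ALIGN_SHIFT = 12 and is satisfied by the
 * 112-byte (7 * 16) cache, while a 640-byte request, a multiple of 64,
 * maps to the 640-byte (5 * 128) cache and is therefore 64-byte aligned.
 */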
161
162#define	KMEM_MAXBUF	32768
163
164static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
165
166static kmem_magtype_t kmem_magtype[] = {
167	{ 1,	8,	3200,	65536	},
168	{ 3,	16,	256,	32768	},
169	{ 7,	32,	64,	16384	},
170	{ 15,	64,	0,	8192	},
171	{ 31,	64,	0,	4096	},
172	{ 47,	64,	0,	2048	},
173	{ 63,	64,	0,	1024	},
174	{ 95,	64,	0,	512	},
175	{ 143,	64,	0,	0	},
176};
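
/*
 * The entries above are ordered by increasing magazine size (the first
 * field); kmem_cache_magazine_resize() advances a cache one entry at a
 * time (cp->cache_magtype = ++mtp) toward larger magazines when depot
 * contention is high.  See the kmem_magtype_t definition for the full
 * field layout.
 */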
177
178static uint32_t kmem_reaping;
179static uint32_t kmem_reaping_idspace;
180
181/*
182 * kmem tunables
183 */
184clock_t kmem_reap_interval;	/* cache reaping rate [15 * HZ ticks] */
int kmem_depot_contention = 3;	/* max failed tryenters per reap interval */
186pgcnt_t kmem_reapahead = 0;	/* start reaping N pages before pageout */
187int kmem_panic = 1;		/* whether to panic on error */
188int kmem_logging = 1;		/* kmem_log_enter() override */
189uint32_t kmem_mtbf = 0;		/* mean time between failures [default: off] */
190size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
191size_t kmem_content_log_size;	/* content log size [2% of memory] */
192size_t kmem_failure_log_size;	/* failure log [4 pages per CPU] */
193size_t kmem_slab_log_size;	/* slab create log [4 pages per CPU] */
194size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
195size_t kmem_lite_minsize = 0;	/* minimum buffer size for KMF_LITE */
196size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
197int kmem_lite_pcs = 4;		/* number of PCs to store in KMF_LITE mode */
198size_t kmem_maxverify;		/* maximum bytes to inspect in debug routines */
199size_t kmem_minfirewall;	/* hardware-enforced redzone threshold */
200
201#ifdef DEBUG
202int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
203#else
204int kmem_flags = 0;
205#endif
206int kmem_ready;
207
208static kmem_cache_t	*kmem_slab_cache;
209static kmem_cache_t	*kmem_bufctl_cache;
210static kmem_cache_t	*kmem_bufctl_audit_cache;
211
212static kmutex_t		kmem_cache_lock;	/* inter-cache linkage only */
213kmem_cache_t		kmem_null_cache;
214
215static taskq_t		*kmem_taskq;
216static kmutex_t		kmem_flags_lock;
217static vmem_t		*kmem_metadata_arena;
218static vmem_t		*kmem_msb_arena;	/* arena for metadata caches */
219static vmem_t		*kmem_cache_arena;
220static vmem_t		*kmem_hash_arena;
221static vmem_t		*kmem_log_arena;
222static vmem_t		*kmem_oversize_arena;
223static vmem_t		*kmem_va_arena;
224static vmem_t		*kmem_default_arena;
225static vmem_t		*kmem_firewall_va_arena;
226static vmem_t		*kmem_firewall_arena;
227
228kmem_log_header_t	*kmem_transaction_log;
229kmem_log_header_t	*kmem_content_log;
230kmem_log_header_t	*kmem_failure_log;
231kmem_log_header_t	*kmem_slab_log;
232
233static int		kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
234
235#define	KMEM_BUFTAG_LITE_ENTER(bt, count, caller)			\
236	if ((count) > 0) {						\
237		pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history;	\
238		pc_t *_e;						\
239		/* memmove() the old entries down one notch */		\
240		for (_e = &_s[(count) - 1]; _e > _s; _e--)		\
241			*_e = *(_e - 1);				\
242		*_s = (uintptr_t)(caller);				\
243	}
244
245#define	KMERR_MODIFIED	0	/* buffer modified while on freelist */
246#define	KMERR_REDZONE	1	/* redzone violation (write past end of buf) */
247#define	KMERR_DUPFREE	2	/* freed a buffer twice */
248#define	KMERR_BADADDR	3	/* freed a bad (unallocated) address */
249#define	KMERR_BADBUFTAG	4	/* buftag corrupted */
250#define	KMERR_BADBUFCTL	5	/* bufctl corrupted */
251#define	KMERR_BADCACHE	6	/* freed a buffer to the wrong cache */
252#define	KMERR_BADSIZE	7	/* alloc size != free size */
253#define	KMERR_BADBASE	8	/* buffer base address wrong */
254
255struct {
256	hrtime_t	kmp_timestamp;	/* timestamp of panic */
257	int		kmp_error;	/* type of kmem error */
258	void		*kmp_buffer;	/* buffer that induced panic */
259	void		*kmp_realbuf;	/* real start address for buffer */
260	kmem_cache_t	*kmp_cache;	/* buffer's cache according to client */
261	kmem_cache_t	*kmp_realcache;	/* actual cache containing buffer */
	kmem_slab_t	*kmp_slab;	/* slab according to kmem_findslab() */
263	kmem_bufctl_t	*kmp_bufctl;	/* bufctl */
264} kmem_panic_info;
265
266
267static void
268copy_pattern(uint64_t pattern, void *buf_arg, size_t size)
269{
270	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
271	uint64_t *buf = buf_arg;
272
273	while (buf < bufend)
274		*buf++ = pattern;
275}
276
277static void *
278verify_pattern(uint64_t pattern, void *buf_arg, size_t size)
279{
280	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
281	uint64_t *buf;
282
283	for (buf = buf_arg; buf < bufend; buf++)
284		if (*buf != pattern)
285			return (buf);
286	return (NULL);
287}
288
289static void *
290verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size)
291{
292	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
293	uint64_t *buf;
294
295	for (buf = buf_arg; buf < bufend; buf++) {
296		if (*buf != old) {
297			copy_pattern(old, buf_arg,
298				(char *)buf - (char *)buf_arg);
299			return (buf);
300		}
301		*buf = new;
302	}
303
304	return (NULL);
305}
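
/*
 * These pattern routines implement the KMF_DEADBEEF/KMF_REDZONE debugging
 * support: a freed buffer is filled with KMEM_FREE_PATTERN, and on the next
 * allocation verify_and_copy_pattern() both checks that the pattern is
 * intact and repaints the buffer with KMEM_UNINITIALIZED_PATTERN, so any
 * write to a free buffer is reported as KMERR_MODIFIED.
 */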
306
307static void
308kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
309{
310	kmem_cache_t *cp;
311
312	mutex_enter(&kmem_cache_lock);
313	for (cp = kmem_null_cache.cache_next; cp != &kmem_null_cache;
314	    cp = cp->cache_next)
315		if (tq != NULL)
316			(void) taskq_dispatch(tq, (task_func_t *)func, cp,
317			    tqflag);
318		else
319			func(cp);
320	mutex_exit(&kmem_cache_lock);
321}
322
323static void
324kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
325{
326	kmem_cache_t *cp;
327
328	mutex_enter(&kmem_cache_lock);
329	for (cp = kmem_null_cache.cache_next; cp != &kmem_null_cache;
330	    cp = cp->cache_next) {
331		if (!(cp->cache_cflags & KMC_IDENTIFIER))
332			continue;
333		if (tq != NULL)
334			(void) taskq_dispatch(tq, (task_func_t *)func, cp,
335			    tqflag);
336		else
337			func(cp);
338	}
339	mutex_exit(&kmem_cache_lock);
340}
341
342/*
343 * Debugging support.  Given a buffer address, find its slab.
344 */
345static kmem_slab_t *
346kmem_findslab(kmem_cache_t *cp, void *buf)
347{
348	kmem_slab_t *sp;
349
350	mutex_enter(&cp->cache_lock);
351	for (sp = cp->cache_nullslab.slab_next;
352	    sp != &cp->cache_nullslab; sp = sp->slab_next) {
353		if (KMEM_SLAB_MEMBER(sp, buf)) {
354			mutex_exit(&cp->cache_lock);
355			return (sp);
356		}
357	}
358	mutex_exit(&cp->cache_lock);
359
360	return (NULL);
361}
362
363static void
364kmem_error(int error, kmem_cache_t *cparg, void *bufarg)
365{
366	kmem_buftag_t *btp = NULL;
367	kmem_bufctl_t *bcp = NULL;
368	kmem_cache_t *cp = cparg;
369	kmem_slab_t *sp;
370	uint64_t *off;
371	void *buf = bufarg;
372
373	kmem_logging = 0;	/* stop logging when a bad thing happens */
374
375	kmem_panic_info.kmp_timestamp = gethrtime();
376
377	sp = kmem_findslab(cp, buf);
378	if (sp == NULL) {
379		for (cp = kmem_null_cache.cache_prev; cp != &kmem_null_cache;
380		    cp = cp->cache_prev) {
381			if ((sp = kmem_findslab(cp, buf)) != NULL)
382				break;
383		}
384	}
385
386	if (sp == NULL) {
387		cp = NULL;
388		error = KMERR_BADADDR;
389	} else {
390		if (cp != cparg)
391			error = KMERR_BADCACHE;
392		else
393			buf = (char *)bufarg - ((uintptr_t)bufarg -
394			    (uintptr_t)sp->slab_base) % cp->cache_chunksize;
395		if (buf != bufarg)
396			error = KMERR_BADBASE;
397		if (cp->cache_flags & KMF_BUFTAG)
398			btp = KMEM_BUFTAG(cp, buf);
399		if (cp->cache_flags & KMF_HASH) {
400			mutex_enter(&cp->cache_lock);
401			for (bcp = *KMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next)
402				if (bcp->bc_addr == buf)
403					break;
404			mutex_exit(&cp->cache_lock);
405			if (bcp == NULL && btp != NULL)
406				bcp = btp->bt_bufctl;
407			if (kmem_findslab(cp->cache_bufctl_cache, bcp) ==
408			    NULL || P2PHASE((uintptr_t)bcp, KMEM_ALIGN) ||
409			    bcp->bc_addr != buf) {
410				error = KMERR_BADBUFCTL;
411				bcp = NULL;
412			}
413		}
414	}
415
416	kmem_panic_info.kmp_error = error;
417	kmem_panic_info.kmp_buffer = bufarg;
418	kmem_panic_info.kmp_realbuf = buf;
419	kmem_panic_info.kmp_cache = cparg;
420	kmem_panic_info.kmp_realcache = cp;
421	kmem_panic_info.kmp_slab = sp;
422	kmem_panic_info.kmp_bufctl = bcp;
423
424	printf("kernel memory allocator: ");
425
426	switch (error) {
427
428	case KMERR_MODIFIED:
429		printf("buffer modified after being freed\n");
430		off = verify_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
431		if (off == NULL)	/* shouldn't happen */
432			off = buf;
433		printf("modification occurred at offset 0x%lx "
434		    "(0x%llx replaced by 0x%llx)\n",
435		    (uintptr_t)off - (uintptr_t)buf,
436		    (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off);
437		break;
438
439	case KMERR_REDZONE:
440		printf("redzone violation: write past end of buffer\n");
441		break;
442
443	case KMERR_BADADDR:
444		printf("invalid free: buffer not in cache\n");
445		break;
446
447	case KMERR_DUPFREE:
448		printf("duplicate free: buffer freed twice\n");
449		break;
450
451	case KMERR_BADBUFTAG:
452		printf("boundary tag corrupted\n");
453		printf("bcp ^ bxstat = %lx, should be %lx\n",
454		    (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat,
455		    KMEM_BUFTAG_FREE);
456		break;
457
458	case KMERR_BADBUFCTL:
459		printf("bufctl corrupted\n");
460		break;
461
462	case KMERR_BADCACHE:
463		printf("buffer freed to wrong cache\n");
464		printf("buffer was allocated from %s,\n", cp->cache_name);
465		printf("caller attempting free to %s.\n", cparg->cache_name);
466		break;
467
468	case KMERR_BADSIZE:
469		printf("bad free: free size (%u) != alloc size (%u)\n",
470		    KMEM_SIZE_DECODE(((uint32_t *)btp)[0]),
471		    KMEM_SIZE_DECODE(((uint32_t *)btp)[1]));
472		break;
473
474	case KMERR_BADBASE:
475		printf("bad free: free address (%p) != alloc address (%p)\n",
476		    bufarg, buf);
477		break;
478	}
479
480	printf("buffer=%p  bufctl=%p  cache: %s\n",
481	    bufarg, (void *)bcp, cparg->cache_name);
482
483	if (bcp != NULL && (cp->cache_flags & KMF_AUDIT) &&
484	    error != KMERR_BADBUFCTL) {
485		int d;
486		timestruc_t ts;
487		kmem_bufctl_audit_t *bcap = (kmem_bufctl_audit_t *)bcp;
488
489		hrt2ts(kmem_panic_info.kmp_timestamp - bcap->bc_timestamp, &ts);
490		printf("previous transaction on buffer %p:\n", buf);
491		printf("thread=%p  time=T-%ld.%09ld  slab=%p  cache: %s\n",
492		    (void *)bcap->bc_thread, ts.tv_sec, ts.tv_nsec,
493		    (void *)sp, cp->cache_name);
494		for (d = 0; d < MIN(bcap->bc_depth, KMEM_STACK_DEPTH); d++) {
495			ulong_t off;
496			char *sym = kobj_getsymname(bcap->bc_stack[d], &off);
497			printf("%s+%lx\n", sym ? sym : "?", off);
498		}
499	}
500	if (kmem_panic > 0)
501		panic("kernel heap corruption detected");
502	if (kmem_panic == 0)
503		debug_enter(NULL);
504	kmem_logging = 1;	/* resume logging */
505}
506
507static kmem_log_header_t *
508kmem_log_init(size_t logsize)
509{
510	kmem_log_header_t *lhp;
511	int nchunks = 4 * max_ncpus;
512	size_t lhsize = (size_t)&((kmem_log_header_t *)0)->lh_cpu[max_ncpus];
513	int i;
514
515	/*
516	 * Make sure that lhp->lh_cpu[] is nicely aligned
517	 * to prevent false sharing of cache lines.
518	 */
519	lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN);
520	lhp = vmem_xalloc(kmem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0,
521	    NULL, NULL, VM_SLEEP);
522	bzero(lhp, lhsize);
523
524	mutex_init(&lhp->lh_lock, NULL, MUTEX_DEFAULT, NULL);
525	lhp->lh_nchunks = nchunks;
526	lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks + 1, PAGESIZE);
527	lhp->lh_base = vmem_alloc(kmem_log_arena,
528	    lhp->lh_chunksize * nchunks, VM_SLEEP);
529	lhp->lh_free = vmem_alloc(kmem_log_arena,
530	    nchunks * sizeof (int), VM_SLEEP);
531	bzero(lhp->lh_base, lhp->lh_chunksize * nchunks);
532
533	for (i = 0; i < max_ncpus; i++) {
534		kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i];
535		mutex_init(&clhp->clh_lock, NULL, MUTEX_DEFAULT, NULL);
536		clhp->clh_chunk = i;
537	}
538
539	for (i = max_ncpus; i < nchunks; i++)
540		lhp->lh_free[i] = i;
541
542	lhp->lh_head = max_ncpus;
543	lhp->lh_tail = 0;
544
545	return (lhp);
546}
547
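/*
 * Claim space in log lhp for a record of `size' bytes and copy `data' into
 * it.  Each CPU owns a private chunk of the log; when the chunk fills, it
 * is exchanged under lh_lock for the oldest chunk on the free list, so the
 * log as a whole retains the most recent records.
 */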
548static void *
549kmem_log_enter(kmem_log_header_t *lhp, void *data, size_t size)
550{
551	void *logspace;
552	kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[CPU->cpu_seqid];
553
554	if (lhp == NULL || kmem_logging == 0 || panicstr)
555		return (NULL);
556
557	mutex_enter(&clhp->clh_lock);
558	clhp->clh_hits++;
559	if (size > clhp->clh_avail) {
560		mutex_enter(&lhp->lh_lock);
561		lhp->lh_hits++;
562		lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk;
563		lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks;
564		clhp->clh_chunk = lhp->lh_free[lhp->lh_head];
565		lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks;
566		clhp->clh_current = lhp->lh_base +
567			clhp->clh_chunk * lhp->lh_chunksize;
568		clhp->clh_avail = lhp->lh_chunksize;
569		if (size > lhp->lh_chunksize)
570			size = lhp->lh_chunksize;
571		mutex_exit(&lhp->lh_lock);
572	}
573	logspace = clhp->clh_current;
574	clhp->clh_current += size;
575	clhp->clh_avail -= size;
576	bcopy(data, logspace, size);
577	mutex_exit(&clhp->clh_lock);
578	return (logspace);
579}
580
581#define	KMEM_AUDIT(lp, cp, bcp)						\
582{									\
583	kmem_bufctl_audit_t *_bcp = (kmem_bufctl_audit_t *)(bcp);	\
584	_bcp->bc_timestamp = gethrtime();				\
585	_bcp->bc_thread = curthread;					\
586	_bcp->bc_depth = getpcstack(_bcp->bc_stack, KMEM_STACK_DEPTH);	\
587	_bcp->bc_lastlog = kmem_log_enter((lp), _bcp, sizeof (*_bcp));	\
588}
589
590static void
591kmem_log_event(kmem_log_header_t *lp, kmem_cache_t *cp,
592	kmem_slab_t *sp, void *addr)
593{
594	kmem_bufctl_audit_t bca;
595
596	bzero(&bca, sizeof (kmem_bufctl_audit_t));
597	bca.bc_addr = addr;
598	bca.bc_slab = sp;
599	bca.bc_cache = cp;
600	KMEM_AUDIT(lp, cp, &bca);
601}
602
603/*
604 * Create a new slab for cache cp.
605 */
606static kmem_slab_t *
607kmem_slab_create(kmem_cache_t *cp, int kmflag)
608{
609	size_t slabsize = cp->cache_slabsize;
610	size_t chunksize = cp->cache_chunksize;
611	int cache_flags = cp->cache_flags;
612	size_t color, chunks;
613	char *buf, *slab;
614	kmem_slab_t *sp;
615	kmem_bufctl_t *bcp;
616	vmem_t *vmp = cp->cache_arena;
617
618	color = cp->cache_color + cp->cache_align;
619	if (color > cp->cache_maxcolor)
620		color = cp->cache_mincolor;
621	cp->cache_color = color;
622
623	slab = vmem_alloc(vmp, slabsize, kmflag & KM_VMFLAGS);
624
625	if (slab == NULL)
626		goto vmem_alloc_failure;
627
628	ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0);
629
630	if (!(cp->cache_cflags & KMC_NOTOUCH))
631		copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize);
632
633	if (cache_flags & KMF_HASH) {
634		if ((sp = kmem_cache_alloc(kmem_slab_cache, kmflag)) == NULL)
635			goto slab_alloc_failure;
636		chunks = (slabsize - color) / chunksize;
637	} else {
638		sp = KMEM_SLAB(cp, slab);
639		chunks = (slabsize - sizeof (kmem_slab_t) - color) / chunksize;
640	}
641
642	sp->slab_cache	= cp;
643	sp->slab_head	= NULL;
644	sp->slab_refcnt	= 0;
645	sp->slab_base	= buf = slab + color;
646	sp->slab_chunks	= chunks;
647
648	ASSERT(chunks > 0);
649	while (chunks-- != 0) {
650		if (cache_flags & KMF_HASH) {
651			bcp = kmem_cache_alloc(cp->cache_bufctl_cache, kmflag);
652			if (bcp == NULL)
653				goto bufctl_alloc_failure;
654			if (cache_flags & KMF_AUDIT) {
655				kmem_bufctl_audit_t *bcap =
656				    (kmem_bufctl_audit_t *)bcp;
657				bzero(bcap, sizeof (kmem_bufctl_audit_t));
658				bcap->bc_cache = cp;
659			}
660			bcp->bc_addr = buf;
661			bcp->bc_slab = sp;
662		} else {
663			bcp = KMEM_BUFCTL(cp, buf);
664		}
665		if (cache_flags & KMF_BUFTAG) {
666			kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
667			btp->bt_redzone = KMEM_REDZONE_PATTERN;
668			btp->bt_bufctl = bcp;
669			btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
670			if (cache_flags & KMF_DEADBEEF) {
671				copy_pattern(KMEM_FREE_PATTERN, buf,
672				    cp->cache_verify);
673			}
674		}
675		bcp->bc_next = sp->slab_head;
676		sp->slab_head = bcp;
677		buf += chunksize;
678	}
679
680	kmem_log_event(kmem_slab_log, cp, sp, slab);
681
682	return (sp);
683
684bufctl_alloc_failure:
685
686	while ((bcp = sp->slab_head) != NULL) {
687		sp->slab_head = bcp->bc_next;
688		kmem_cache_free(cp->cache_bufctl_cache, bcp);
689	}
690	kmem_cache_free(kmem_slab_cache, sp);
691
692slab_alloc_failure:
693
694	vmem_free(vmp, slab, slabsize);
695
696vmem_alloc_failure:
697
698	kmem_log_event(kmem_failure_log, cp, NULL, NULL);
699	atomic_add_64(&cp->cache_alloc_fail, 1);
700
701	return (NULL);
702}
703
704/*
705 * Destroy a slab.
706 */
707static void
708kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp)
709{
710	vmem_t *vmp = cp->cache_arena;
711	void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum);
712
713	if (cp->cache_flags & KMF_HASH) {
714		kmem_bufctl_t *bcp;
715		while ((bcp = sp->slab_head) != NULL) {
716			sp->slab_head = bcp->bc_next;
717			kmem_cache_free(cp->cache_bufctl_cache, bcp);
718		}
719		kmem_cache_free(kmem_slab_cache, sp);
720	}
721	vmem_free(vmp, slab, cp->cache_slabsize);
722}
723
724/*
725 * Allocate a raw (unconstructed) buffer from cp's slab layer.
726 */
727static void *
728kmem_slab_alloc(kmem_cache_t *cp, int kmflag)
729{
730	kmem_bufctl_t *bcp, **hash_bucket;
731	kmem_slab_t *sp;
732	void *buf;
733
734	mutex_enter(&cp->cache_lock);
735	cp->cache_slab_alloc++;
736	sp = cp->cache_freelist;
737	ASSERT(sp->slab_cache == cp);
738	if (sp->slab_head == NULL) {
739		/*
740		 * The freelist is empty.  Create a new slab.
741		 */
742		mutex_exit(&cp->cache_lock);
743		if ((sp = kmem_slab_create(cp, kmflag)) == NULL)
744			return (NULL);
745		mutex_enter(&cp->cache_lock);
746		cp->cache_slab_create++;
747		if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax)
748			cp->cache_bufmax = cp->cache_buftotal;
749		sp->slab_next = cp->cache_freelist;
750		sp->slab_prev = cp->cache_freelist->slab_prev;
751		sp->slab_next->slab_prev = sp;
752		sp->slab_prev->slab_next = sp;
753		cp->cache_freelist = sp;
754	}
755
756	sp->slab_refcnt++;
757	ASSERT(sp->slab_refcnt <= sp->slab_chunks);
758
759	/*
760	 * If we're taking the last buffer in the slab,
761	 * remove the slab from the cache's freelist.
762	 */
763	bcp = sp->slab_head;
764	if ((sp->slab_head = bcp->bc_next) == NULL) {
765		cp->cache_freelist = sp->slab_next;
766		ASSERT(sp->slab_refcnt == sp->slab_chunks);
767	}
768
769	if (cp->cache_flags & KMF_HASH) {
770		/*
771		 * Add buffer to allocated-address hash table.
772		 */
773		buf = bcp->bc_addr;
774		hash_bucket = KMEM_HASH(cp, buf);
775		bcp->bc_next = *hash_bucket;
776		*hash_bucket = bcp;
777		if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
778			KMEM_AUDIT(kmem_transaction_log, cp, bcp);
779		}
780	} else {
781		buf = KMEM_BUF(cp, bcp);
782	}
783
784	ASSERT(KMEM_SLAB_MEMBER(sp, buf));
785
786	mutex_exit(&cp->cache_lock);
787
788	return (buf);
789}
790
791/*
792 * Free a raw (unconstructed) buffer to cp's slab layer.
793 */
794static void
795kmem_slab_free(kmem_cache_t *cp, void *buf)
796{
797	kmem_slab_t *sp;
798	kmem_bufctl_t *bcp, **prev_bcpp;
799
800	ASSERT(buf != NULL);
801
802	mutex_enter(&cp->cache_lock);
803	cp->cache_slab_free++;
804
805	if (cp->cache_flags & KMF_HASH) {
806		/*
807		 * Look up buffer in allocated-address hash table.
808		 */
809		prev_bcpp = KMEM_HASH(cp, buf);
810		while ((bcp = *prev_bcpp) != NULL) {
811			if (bcp->bc_addr == buf) {
812				*prev_bcpp = bcp->bc_next;
813				sp = bcp->bc_slab;
814				break;
815			}
816			cp->cache_lookup_depth++;
817			prev_bcpp = &bcp->bc_next;
818		}
819	} else {
820		bcp = KMEM_BUFCTL(cp, buf);
821		sp = KMEM_SLAB(cp, buf);
822	}
823
824	if (bcp == NULL || sp->slab_cache != cp || !KMEM_SLAB_MEMBER(sp, buf)) {
825		mutex_exit(&cp->cache_lock);
826		kmem_error(KMERR_BADADDR, cp, buf);
827		return;
828	}
829
830	if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
831		if (cp->cache_flags & KMF_CONTENTS)
832			((kmem_bufctl_audit_t *)bcp)->bc_contents =
833			    kmem_log_enter(kmem_content_log, buf,
834				cp->cache_contents);
835		KMEM_AUDIT(kmem_transaction_log, cp, bcp);
836	}
837
838	/*
839	 * If this slab isn't currently on the freelist, put it there.
840	 */
841	if (sp->slab_head == NULL) {
842		ASSERT(sp->slab_refcnt == sp->slab_chunks);
843		ASSERT(cp->cache_freelist != sp);
844		sp->slab_next->slab_prev = sp->slab_prev;
845		sp->slab_prev->slab_next = sp->slab_next;
846		sp->slab_next = cp->cache_freelist;
847		sp->slab_prev = cp->cache_freelist->slab_prev;
848		sp->slab_next->slab_prev = sp;
849		sp->slab_prev->slab_next = sp;
850		cp->cache_freelist = sp;
851	}
852
853	bcp->bc_next = sp->slab_head;
854	sp->slab_head = bcp;
855
856	ASSERT(sp->slab_refcnt >= 1);
857	if (--sp->slab_refcnt == 0) {
858		/*
859		 * There are no outstanding allocations from this slab,
860		 * so we can reclaim the memory.
861		 */
862		sp->slab_next->slab_prev = sp->slab_prev;
863		sp->slab_prev->slab_next = sp->slab_next;
864		if (sp == cp->cache_freelist)
865			cp->cache_freelist = sp->slab_next;
866		cp->cache_slab_destroy++;
867		cp->cache_buftotal -= sp->slab_chunks;
868		mutex_exit(&cp->cache_lock);
869		kmem_slab_destroy(cp, sp);
870		return;
871	}
872	mutex_exit(&cp->cache_lock);
873}
874
875static int
876kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
877    caddr_t caller)
878{
879	kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
880	kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
881	uint32_t mtbf;
882
883	if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
884		kmem_error(KMERR_BADBUFTAG, cp, buf);
885		return (-1);
886	}
887
888	btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_ALLOC;
889
890	if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
891		kmem_error(KMERR_BADBUFCTL, cp, buf);
892		return (-1);
893	}
894
895	if (cp->cache_flags & KMF_DEADBEEF) {
896		if (!construct && (cp->cache_flags & KMF_LITE)) {
897			if (*(uint64_t *)buf != KMEM_FREE_PATTERN) {
898				kmem_error(KMERR_MODIFIED, cp, buf);
899				return (-1);
900			}
901			if (cp->cache_constructor != NULL)
902				*(uint64_t *)buf = btp->bt_redzone;
903			else
904				*(uint64_t *)buf = KMEM_UNINITIALIZED_PATTERN;
905		} else {
906			construct = 1;
907			if (verify_and_copy_pattern(KMEM_FREE_PATTERN,
908			    KMEM_UNINITIALIZED_PATTERN, buf,
909			    cp->cache_verify)) {
910				kmem_error(KMERR_MODIFIED, cp, buf);
911				return (-1);
912			}
913		}
914	}
915	btp->bt_redzone = KMEM_REDZONE_PATTERN;
916
917	if ((mtbf = kmem_mtbf | cp->cache_mtbf) != 0 &&
918	    gethrtime() % mtbf == 0 &&
919	    (kmflag & (KM_NOSLEEP | KM_PANIC)) == KM_NOSLEEP) {
920		kmem_log_event(kmem_failure_log, cp, NULL, NULL);
921		if (!construct && cp->cache_destructor != NULL)
922			cp->cache_destructor(buf, cp->cache_private);
923	} else {
924		mtbf = 0;
925	}
926
927	if (mtbf || (construct && cp->cache_constructor != NULL &&
928	    cp->cache_constructor(buf, cp->cache_private, kmflag) != 0)) {
929		atomic_add_64(&cp->cache_alloc_fail, 1);
930		btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
931		if (cp->cache_flags & KMF_DEADBEEF)
932			copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
933		kmem_slab_free(cp, buf);
934		return (-1);
935	}
936
937	if (cp->cache_flags & KMF_AUDIT) {
938		KMEM_AUDIT(kmem_transaction_log, cp, bcp);
939	}
940
941	if ((cp->cache_flags & KMF_LITE) &&
942	    !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
943		KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
944	}
945
946	return (0);
947}
948
949static int
950kmem_cache_free_debug(kmem_cache_t *cp, void *buf, caddr_t caller)
951{
952	kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
953	kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
954	kmem_slab_t *sp;
955
956	if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_ALLOC)) {
957		if (btp->bt_bxstat == ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
958			kmem_error(KMERR_DUPFREE, cp, buf);
959			return (-1);
960		}
961		sp = kmem_findslab(cp, buf);
962		if (sp == NULL || sp->slab_cache != cp)
963			kmem_error(KMERR_BADADDR, cp, buf);
964		else
965			kmem_error(KMERR_REDZONE, cp, buf);
966		return (-1);
967	}
968
969	btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
970
971	if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
972		kmem_error(KMERR_BADBUFCTL, cp, buf);
973		return (-1);
974	}
975
976	if (btp->bt_redzone != KMEM_REDZONE_PATTERN) {
977		kmem_error(KMERR_REDZONE, cp, buf);
978		return (-1);
979	}
980
981	if (cp->cache_flags & KMF_AUDIT) {
982		if (cp->cache_flags & KMF_CONTENTS)
983			bcp->bc_contents = kmem_log_enter(kmem_content_log,
984			    buf, cp->cache_contents);
985		KMEM_AUDIT(kmem_transaction_log, cp, bcp);
986	}
987
988	if ((cp->cache_flags & KMF_LITE) &&
989	    !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
990		KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
991	}
992
993	if (cp->cache_flags & KMF_DEADBEEF) {
994		if (cp->cache_flags & KMF_LITE)
995			btp->bt_redzone = *(uint64_t *)buf;
996		else if (cp->cache_destructor != NULL)
997			cp->cache_destructor(buf, cp->cache_private);
998
999		copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
1000	}
1001
1002	return (0);
1003}
1004
1005/*
1006 * Free each object in magazine mp to cp's slab layer, and free mp itself.
1007 */
1008static void
1009kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds)
1010{
1011	int round;
1012
1013	ASSERT(cp->cache_next == NULL || taskq_member(kmem_taskq, curthread));
1014
1015	for (round = 0; round < nrounds; round++) {
1016		void *buf = mp->mag_round[round];
1017
1018		if (cp->cache_flags & KMF_DEADBEEF) {
1019			if (verify_pattern(KMEM_FREE_PATTERN, buf,
1020			    cp->cache_verify) != NULL) {
1021				kmem_error(KMERR_MODIFIED, cp, buf);
1022				continue;
1023			}
1024			if ((cp->cache_flags & KMF_LITE) &&
1025			    cp->cache_destructor != NULL) {
1026				kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1027				*(uint64_t *)buf = btp->bt_redzone;
1028				cp->cache_destructor(buf, cp->cache_private);
1029				*(uint64_t *)buf = KMEM_FREE_PATTERN;
1030			}
1031		} else if (cp->cache_destructor != NULL) {
1032			cp->cache_destructor(buf, cp->cache_private);
1033		}
1034
1035		kmem_slab_free(cp, buf);
1036	}
1037	ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
1038	kmem_cache_free(cp->cache_magtype->mt_cache, mp);
1039}
1040
1041/*
1042 * Allocate a magazine from the depot.
1043 */
1044static kmem_magazine_t *
1045kmem_depot_alloc(kmem_cache_t *cp, kmem_maglist_t *mlp)
1046{
1047	kmem_magazine_t *mp;
1048
1049	/*
1050	 * If we can't get the depot lock without contention,
1051	 * update our contention count.  We use the depot
1052	 * contention rate to determine whether we need to
1053	 * increase the magazine size for better scalability.
1054	 */
1055	if (!mutex_tryenter(&cp->cache_depot_lock)) {
1056		mutex_enter(&cp->cache_depot_lock);
1057		cp->cache_depot_contention++;
1058	}
1059
1060	if ((mp = mlp->ml_list) != NULL) {
1061		ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
1062		mlp->ml_list = mp->mag_next;
1063		if (--mlp->ml_total < mlp->ml_min)
1064			mlp->ml_min = mlp->ml_total;
1065		mlp->ml_alloc++;
1066	}
1067
1068	mutex_exit(&cp->cache_depot_lock);
1069
1070	return (mp);
1071}
1072
1073/*
1074 * Free a magazine to the depot.
1075 */
1076static void
1077kmem_depot_free(kmem_cache_t *cp, kmem_maglist_t *mlp, kmem_magazine_t *mp)
1078{
1079	mutex_enter(&cp->cache_depot_lock);
1080	ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
1081	mp->mag_next = mlp->ml_list;
1082	mlp->ml_list = mp;
1083	mlp->ml_total++;
1084	mutex_exit(&cp->cache_depot_lock);
1085}
1086
1087/*
1088 * Update the working set statistics for cp's depot.
1089 */
1090static void
1091kmem_depot_ws_update(kmem_cache_t *cp)
1092{
1093	mutex_enter(&cp->cache_depot_lock);
1094	cp->cache_full.ml_reaplimit = cp->cache_full.ml_min;
1095	cp->cache_full.ml_min = cp->cache_full.ml_total;
1096	cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min;
1097	cp->cache_empty.ml_min = cp->cache_empty.ml_total;
1098	mutex_exit(&cp->cache_depot_lock);
1099}
1100
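/*
 * The depot working set is tracked with low-water marks: kmem_depot_alloc()
 * keeps ml_min at the smallest number of magazines seen on a list since the
 * last update, so after an update the previous minimum (ml_reaplimit)
 * bounds how many unused magazines kmem_depot_ws_reap() may destroy.
 */
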
1101/*
1102 * Reap all magazines that have fallen out of the depot's working set.
1103 */
1104static void
1105kmem_depot_ws_reap(kmem_cache_t *cp)
1106{
1107	long reap;
1108	kmem_magazine_t *mp;
1109
1110	ASSERT(cp->cache_next == NULL || taskq_member(kmem_taskq, curthread));
1111
1112	reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
1113	while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL)
1114		kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
1115
1116	reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min);
1117	while (reap-- && (mp = kmem_depot_alloc(cp, &cp->cache_empty)) != NULL)
1118		kmem_magazine_destroy(cp, mp, 0);
1119}
1120
1121static void
1122kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds)
1123{
1124	ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) ||
1125	    (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize));
1126	ASSERT(ccp->cc_magsize > 0);
1127
1128	ccp->cc_ploaded = ccp->cc_loaded;
1129	ccp->cc_prounds = ccp->cc_rounds;
1130	ccp->cc_loaded = mp;
1131	ccp->cc_rounds = rounds;
1132}
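
/*
 * Allocation requests are satisfied from three layers, fastest first: the
 * per-CPU magazine layer (per-CPU lock, constructed objects), the depot
 * (global lists of full and empty magazines), and finally the slab layer,
 * which carves raw buffers out of vmem-supplied slabs and applies the
 * constructor on the way out.
 */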
1133
1134/*
1135 * Allocate a constructed object from cache cp.
1136 */
1137void *
1138kmem_cache_alloc(kmem_cache_t *cp, int kmflag)
1139{
1140	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
1141	kmem_magazine_t *fmp;
1142	void *buf;
1143
1144	mutex_enter(&ccp->cc_lock);
1145	for (;;) {
1146		/*
1147		 * If there's an object available in the current CPU's
1148		 * loaded magazine, just take it and return.
1149		 */
1150		if (ccp->cc_rounds > 0) {
1151			buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds];
1152			ccp->cc_alloc++;
1153			mutex_exit(&ccp->cc_lock);
1154			if ((ccp->cc_flags & KMF_BUFTAG) &&
1155			    kmem_cache_alloc_debug(cp, buf, kmflag, 0,
1156			    caller()) == -1) {
1157				if (kmflag & KM_NOSLEEP)
1158					return (NULL);
1159				mutex_enter(&ccp->cc_lock);
1160				continue;
1161			}
1162			return (buf);
1163		}
1164
1165		/*
1166		 * The loaded magazine is empty.  If the previously loaded
1167		 * magazine was full, exchange them and try again.
1168		 */
1169		if (ccp->cc_prounds > 0) {
1170			kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
1171			continue;
1172		}
1173
1174		/*
1175		 * If the magazine layer is disabled, break out now.
1176		 */
1177		if (ccp->cc_magsize == 0)
1178			break;
1179
1180		/*
1181		 * Try to get a full magazine from the depot.
1182		 */
1183		fmp = kmem_depot_alloc(cp, &cp->cache_full);
1184		if (fmp != NULL) {
1185			if (ccp->cc_ploaded != NULL)
1186				kmem_depot_free(cp, &cp->cache_empty,
1187				    ccp->cc_ploaded);
1188			kmem_cpu_reload(ccp, fmp, ccp->cc_magsize);
1189			continue;
1190		}
1191
1192		/*
1193		 * There are no full magazines in the depot,
1194		 * so fall through to the slab layer.
1195		 */
1196		break;
1197	}
1198	mutex_exit(&ccp->cc_lock);
1199
1200	/*
1201	 * We couldn't allocate a constructed object from the magazine layer,
1202	 * so get a raw buffer from the slab layer and apply its constructor.
1203	 */
1204	buf = kmem_slab_alloc(cp, kmflag);
1205
1206	if (buf == NULL)
1207		return (NULL);
1208
1209	if (cp->cache_flags & KMF_BUFTAG) {
1210		/*
1211		 * Make kmem_cache_alloc_debug() apply the constructor for us.
1212		 */
1213		if (kmem_cache_alloc_debug(cp, buf, kmflag, 1,
1214		    caller()) == -1) {
1215			if (kmflag & KM_NOSLEEP)
1216				return (NULL);
1217			/*
1218			 * kmem_cache_alloc_debug() detected corruption
1219			 * but didn't panic (kmem_panic <= 0).  Try again.
1220			 */
1221			return (kmem_cache_alloc(cp, kmflag));
1222		}
1223		return (buf);
1224	}
1225
1226	if (cp->cache_constructor != NULL &&
1227	    cp->cache_constructor(buf, cp->cache_private, kmflag) != 0) {
1228		atomic_add_64(&cp->cache_alloc_fail, 1);
1229		kmem_slab_free(cp, buf);
1230		return (NULL);
1231	}
1232
1233	return (buf);
1234}
1235
1236/*
1237 * Free a constructed object to cache cp.
1238 */
1239void
1240kmem_cache_free(kmem_cache_t *cp, void *buf)
1241{
1242	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
1243	kmem_magazine_t *emp;
1244	kmem_magtype_t *mtp;
1245
1246	if (ccp->cc_flags & KMF_BUFTAG)
1247		if (kmem_cache_free_debug(cp, buf, caller()) == -1)
1248			return;
1249
1250	mutex_enter(&ccp->cc_lock);
1251	for (;;) {
1252		/*
1253		 * If there's a slot available in the current CPU's
1254		 * loaded magazine, just put the object there and return.
1255		 */
1256		if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
1257			ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf;
1258			ccp->cc_free++;
1259			mutex_exit(&ccp->cc_lock);
1260			return;
1261		}
1262
1263		/*
1264		 * The loaded magazine is full.  If the previously loaded
1265		 * magazine was empty, exchange them and try again.
1266		 */
1267		if (ccp->cc_prounds == 0) {
1268			kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
1269			continue;
1270		}
1271
1272		/*
1273		 * If the magazine layer is disabled, break out now.
1274		 */
1275		if (ccp->cc_magsize == 0)
1276			break;
1277
1278		/*
1279		 * Try to get an empty magazine from the depot.
1280		 */
1281		emp = kmem_depot_alloc(cp, &cp->cache_empty);
1282		if (emp != NULL) {
1283			if (ccp->cc_ploaded != NULL)
1284				kmem_depot_free(cp, &cp->cache_full,
1285				    ccp->cc_ploaded);
1286			kmem_cpu_reload(ccp, emp, 0);
1287			continue;
1288		}
1289
1290		/*
1291		 * There are no empty magazines in the depot,
1292		 * so try to allocate a new one.  We must drop all locks
1293		 * across kmem_cache_alloc() because lower layers may
1294		 * attempt to allocate from this cache.
1295		 */
1296		mtp = cp->cache_magtype;
1297		mutex_exit(&ccp->cc_lock);
1298		emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
1299		mutex_enter(&ccp->cc_lock);
1300
1301		if (emp != NULL) {
1302			/*
1303			 * We successfully allocated an empty magazine.
1304			 * However, we had to drop ccp->cc_lock to do it,
1305			 * so the cache's magazine size may have changed.
1306			 * If so, free the magazine and try again.
1307			 */
1308			if (ccp->cc_magsize != mtp->mt_magsize) {
1309				mutex_exit(&ccp->cc_lock);
1310				kmem_cache_free(mtp->mt_cache, emp);
1311				mutex_enter(&ccp->cc_lock);
1312				continue;
1313			}
1314
1315			/*
1316			 * We got a magazine of the right size.  Add it to
1317			 * the depot and try the whole dance again.
1318			 */
1319			kmem_depot_free(cp, &cp->cache_empty, emp);
1320			continue;
1321		}
1322
1323		/*
1324		 * We couldn't allocate an empty magazine,
1325		 * so fall through to the slab layer.
1326		 */
1327		break;
1328	}
1329	mutex_exit(&ccp->cc_lock);
1330
1331	/*
1332	 * We couldn't free our constructed object to the magazine layer,
1333	 * so apply its destructor and free it to the slab layer.
1334	 * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not,
1335	 * kmem_cache_free_debug() will have already applied the destructor.
1336	 */
1337	if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF &&
1338	    cp->cache_destructor != NULL) {
1339		if (cp->cache_flags & KMF_DEADBEEF) {	/* KMF_LITE implied */
1340			kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1341			*(uint64_t *)buf = btp->bt_redzone;
1342			cp->cache_destructor(buf, cp->cache_private);
1343			*(uint64_t *)buf = KMEM_FREE_PATTERN;
1344		} else {
1345			cp->cache_destructor(buf, cp->cache_private);
1346		}
1347	}
1348
1349	kmem_slab_free(cp, buf);
1350}
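
/*
 * Usage sketch (illustrative only; foo_cache, foo_t and the callbacks are
 * hypothetical names):
 *
 *	kmem_cache_t *foo_cache = kmem_cache_create("foo_cache",
 *	    sizeof (foo_t), 0, foo_constructor, foo_destructor,
 *	    NULL, NULL, NULL, 0);
 *	foo_t *fp = kmem_cache_alloc(foo_cache, KM_SLEEP);
 *	...
 *	kmem_cache_free(foo_cache, fp);
 */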
1351
1352void *
1353kmem_zalloc(size_t size, int kmflag)
1354{
1355	size_t index = (size - 1) >> KMEM_ALIGN_SHIFT;
1356	void *buf;
1357
1358	if (index < KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) {
1359		kmem_cache_t *cp = kmem_alloc_table[index];
1360		buf = kmem_cache_alloc(cp, kmflag);
1361		if (buf != NULL) {
1362			if (cp->cache_flags & KMF_BUFTAG) {
1363				kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1364				((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
1365				((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
1366
1367				if (cp->cache_flags & KMF_LITE) {
1368					KMEM_BUFTAG_LITE_ENTER(btp,
1369					    kmem_lite_count, caller());
1370				}
1371			}
1372			bzero(buf, size);
1373		}
1374	} else {
1375		buf = kmem_alloc(size, kmflag);
1376		if (buf != NULL)
1377			bzero(buf, size);
1378	}
1379	return (buf);
1380}
1381
1382void *
1383kmem_alloc(size_t size, int kmflag)
1384{
1385	size_t index = (size - 1) >> KMEM_ALIGN_SHIFT;
1386	void *buf;
1387
1388	if (index < KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) {
1389		kmem_cache_t *cp = kmem_alloc_table[index];
1390		buf = kmem_cache_alloc(cp, kmflag);
1391		if ((cp->cache_flags & KMF_BUFTAG) && buf != NULL) {
1392			kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1393			((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
1394			((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
1395
1396			if (cp->cache_flags & KMF_LITE) {
1397				KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count,
1398				    caller());
1399			}
1400		}
1401		return (buf);
1402	}
1403	if (size == 0)
1404		return (NULL);
1405	buf = vmem_alloc(kmem_oversize_arena, size, kmflag & KM_VMFLAGS);
1406	if (buf == NULL)
1407		kmem_log_event(kmem_failure_log, NULL, NULL, (void *)size);
1408	return (buf);
1409}
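
/*
 * When KMF_BUFTAG debugging is enabled, kmem_alloc() records the requested
 * size in the buffer's tag (KMEM_SIZE_ENCODE) and plants KMEM_REDZONE_BYTE
 * just past the requested size; kmem_free() verifies both, catching frees
 * with the wrong size as well as writes past the end of the buffer.
 */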
1410
1411void
1412kmem_free(void *buf, size_t size)
1413{
1414	size_t index = (size - 1) >> KMEM_ALIGN_SHIFT;
1415
1416	if (index < KMEM_MAXBUF >> KMEM_ALIGN_SHIFT) {
1417		kmem_cache_t *cp = kmem_alloc_table[index];
1418		if (cp->cache_flags & KMF_BUFTAG) {
1419			kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1420			uint32_t *ip = (uint32_t *)btp;
1421			if (ip[1] != KMEM_SIZE_ENCODE(size)) {
1422				if (*(uint64_t *)buf == KMEM_FREE_PATTERN) {
1423					kmem_error(KMERR_DUPFREE, cp, buf);
1424					return;
1425				}
1426				if (KMEM_SIZE_VALID(ip[1])) {
1427					ip[0] = KMEM_SIZE_ENCODE(size);
1428					kmem_error(KMERR_BADSIZE, cp, buf);
1429				} else {
1430					kmem_error(KMERR_REDZONE, cp, buf);
1431				}
1432				return;
1433			}
1434			if (((uint8_t *)buf)[size] != KMEM_REDZONE_BYTE) {
1435				kmem_error(KMERR_REDZONE, cp, buf);
1436				return;
1437			}
1438			btp->bt_redzone = KMEM_REDZONE_PATTERN;
1439			if (cp->cache_flags & KMF_LITE) {
1440				KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count,
1441				    caller());
1442			}
1443		}
1444		kmem_cache_free(cp, buf);
1445	} else {
1446		if (buf == NULL && size == 0)
1447			return;
1448		vmem_free(kmem_oversize_arena, buf, size);
1449	}
1450}
1451
1452void *
1453kmem_firewall_va_alloc(vmem_t *vmp, size_t size, int vmflag)
1454{
1455	size_t realsize = size + vmp->vm_quantum;
1456	void *addr;
1457
1458	/*
1459	 * Annoying edge case: if 'size' is just shy of ULONG_MAX, adding
1460	 * vm_quantum will cause integer wraparound.  Check for this, and
1461	 * blow off the firewall page in this case.  Note that such a
1462	 * giant allocation (the entire kernel address space) can never
1463	 * be satisfied, so it will either fail immediately (VM_NOSLEEP)
1464	 * or sleep forever (VM_SLEEP).  Thus, there is no need for a
1465	 * corresponding check in kmem_firewall_va_free().
1466	 */
1467	if (realsize < size)
1468		realsize = size;
1469
1470	/*
1471	 * While boot still owns resource management, make sure that this
1472	 * redzone virtual address allocation is properly accounted for in
 * OBP's "virtual-memory" "available" lists because we're
1474	 * effectively claiming them for a red zone.  If we don't do this,
1475	 * the available lists become too fragmented and too large for the
1476	 * current boot/kernel memory list interface.
1477	 */
1478	addr = vmem_alloc(vmp, realsize, vmflag | VM_NEXTFIT);
1479
1480	if (addr != NULL && kvseg.s_base == NULL && realsize != size)
1481		(void) boot_virt_alloc((char *)addr + size, vmp->vm_quantum);
1482
1483	return (addr);
1484}
1485
1486void
1487kmem_firewall_va_free(vmem_t *vmp, void *addr, size_t size)
1488{
1489	ASSERT((kvseg.s_base == NULL ?
1490	    va_to_pfn((char *)addr + size) :
1491	    hat_getpfnum(kas.a_hat, (caddr_t)addr + size)) == PFN_INVALID);
1492
1493	vmem_free(vmp, addr, size + vmp->vm_quantum);
1494}
1495
1496/*
 * Try to allocate at least `size' bytes of memory without sleeping or
 * panicking.  Return the actual allocated size in `asize'.  If all such
 * attempts fail, fall back to a final allocation with sleep or panic allowed.
1500 */
1501void *
1502kmem_alloc_tryhard(size_t size, size_t *asize, int kmflag)
1503{
1504	void *p;
1505
1506	*asize = P2ROUNDUP(size, KMEM_ALIGN);
1507	do {
1508		p = kmem_alloc(*asize, (kmflag | KM_NOSLEEP) & ~KM_PANIC);
1509		if (p != NULL)
1510			return (p);
1511		*asize += KMEM_ALIGN;
1512	} while (*asize <= PAGESIZE);
1513
1514	*asize = P2ROUNDUP(size, KMEM_ALIGN);
1515	return (kmem_alloc(*asize, kmflag));
1516}
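
/*
 * Illustrative use: a caller that can exploit any buffer of at least
 * `size' bytes (a temporary staging buffer, say) would do
 *
 *	size_t asize;
 *	void *buf = kmem_alloc_tryhard(size, &asize, KM_SLEEP);
 *
 * and later free the buffer with the returned size: kmem_free(buf, asize).
 */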
1517
1518/*
1519 * Reclaim all unused memory from a cache.
1520 */
1521static void
1522kmem_cache_reap(kmem_cache_t *cp)
1523{
1524	/*
1525	 * Ask the cache's owner to free some memory if possible.
1526	 * The idea is to handle things like the inode cache, which
1527	 * typically sits on a bunch of memory that it doesn't truly
1528	 * *need*.  Reclaim policy is entirely up to the owner; this
1529	 * callback is just an advisory plea for help.
1530	 */
1531	if (cp->cache_reclaim != NULL)
1532		cp->cache_reclaim(cp->cache_private);
1533
1534	kmem_depot_ws_reap(cp);
1535}
1536
1537static void
1538kmem_reap_timeout(void *flag_arg)
1539{
1540	uint32_t *flag = (uint32_t *)flag_arg;
1541
1542	ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
1543	*flag = 0;
1544}
1545
1546static void
1547kmem_reap_done(void *flag)
1548{
1549	(void) timeout(kmem_reap_timeout, flag, kmem_reap_interval);
1550}
1551
1552static void
1553kmem_reap_start(void *flag)
1554{
1555	ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
1556
1557	if (flag == &kmem_reaping) {
1558		kmem_cache_applyall(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP);
1559		/*
1560		 * if we have segkp under heap, reap segkp cache.
1561		 */
1562		if (segkp_fromheap)
1563			segkp_cache_free();
	} else {
		kmem_cache_applyall_id(kmem_cache_reap, kmem_taskq,
		    TQ_NOSLEEP);
	}
1567
1568	/*
1569	 * We use taskq_dispatch() to schedule a timeout to clear
1570	 * the flag so that kmem_reap() becomes self-throttling:
1571	 * we won't reap again until the current reap completes *and*
1572	 * at least kmem_reap_interval ticks have elapsed.
1573	 */
1574	if (!taskq_dispatch(kmem_taskq, kmem_reap_done, flag, TQ_NOSLEEP))
1575		kmem_reap_done(flag);
1576}
1577
1578static void
1579kmem_reap_common(void *flag_arg)
1580{
1581	uint32_t *flag = (uint32_t *)flag_arg;
1582
1583	if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL ||
1584	    cas32(flag, 0, 1) != 0)
1585		return;
1586
1587	/*
	 * It may not be kosher to do memory allocation when a reap is called
	 * (for example, if vmem_populate() is in the call chain).
1590	 * So we start the reap going with a TQ_NOALLOC dispatch.  If the
1591	 * dispatch fails, we reset the flag, and the next reap will try again.
1592	 */
1593	if (!taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC))
1594		*flag = 0;
1595}
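
/*
 * Reap sequence summary: kmem_reap() and kmem_reap_idspace() atomically
 * set their flag and dispatch kmem_reap_start(), which applies
 * kmem_cache_reap() to the relevant caches; kmem_reap_done() then arms
 * kmem_reap_timeout() to clear the flag after kmem_reap_interval ticks,
 * so a new reap cannot start until the previous one has both finished
 * and aged.
 */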
1596
1597/*
1598 * Reclaim all unused memory from all caches.  Called from the VM system
1599 * when memory gets tight.
1600 */
1601void
1602kmem_reap(void)
1603{
1604	kmem_reap_common(&kmem_reaping);
1605}
1606
1607/*
 * Reclaim all unused memory from identifier arenas, called when a vmem
 * arena not backed by memory is exhausted.  Since reaping memory-backed
 * caches cannot help with identifier exhaustion, we avoid both a large
 * amount of work and unwanted side-effects from reclaim callbacks.
1612 */
1613void
1614kmem_reap_idspace(void)
1615{
1616	kmem_reap_common(&kmem_reaping_idspace);
1617}
1618
1619/*
1620 * Purge all magazines from a cache and set its magazine limit to zero.
1621 * All calls are serialized by the kmem_taskq lock, except for the final
1622 * call from kmem_cache_destroy().
1623 */
1624static void
1625kmem_cache_magazine_purge(kmem_cache_t *cp)
1626{
1627	kmem_cpu_cache_t *ccp;
1628	kmem_magazine_t *mp, *pmp;
1629	int rounds, prounds, cpu_seqid;
1630
1631	ASSERT(cp->cache_next == NULL || taskq_member(kmem_taskq, curthread));
1632	ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
1633
1634	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
1635		ccp = &cp->cache_cpu[cpu_seqid];
1636
1637		mutex_enter(&ccp->cc_lock);
1638		mp = ccp->cc_loaded;
1639		pmp = ccp->cc_ploaded;
1640		rounds = ccp->cc_rounds;
1641		prounds = ccp->cc_prounds;
1642		ccp->cc_loaded = NULL;
1643		ccp->cc_ploaded = NULL;
1644		ccp->cc_rounds = -1;
1645		ccp->cc_prounds = -1;
1646		ccp->cc_magsize = 0;
1647		mutex_exit(&ccp->cc_lock);
1648
1649		if (mp)
1650			kmem_magazine_destroy(cp, mp, rounds);
1651		if (pmp)
1652			kmem_magazine_destroy(cp, pmp, prounds);
1653	}
1654
1655	/*
1656	 * Updating the working set statistics twice in a row has the
1657	 * effect of setting the working set size to zero, so everything
1658	 * is eligible for reaping.
1659	 */
1660	kmem_depot_ws_update(cp);
1661	kmem_depot_ws_update(cp);
1662
1663	kmem_depot_ws_reap(cp);
1664}
1665
1666/*
1667 * Enable per-cpu magazines on a cache.
1668 */
1669static void
1670kmem_cache_magazine_enable(kmem_cache_t *cp)
1671{
1672	int cpu_seqid;
1673
1674	if (cp->cache_flags & KMF_NOMAGAZINE)
1675		return;
1676
1677	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
1678		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
1679		mutex_enter(&ccp->cc_lock);
1680		ccp->cc_magsize = cp->cache_magtype->mt_magsize;
1681		mutex_exit(&ccp->cc_lock);
1682	}
1684}
1685
1686/*
1687 * Reap (almost) everything right now.  See kmem_cache_magazine_purge()
1688 * for explanation of the back-to-back kmem_depot_ws_update() calls.
1689 */
1690void
1691kmem_cache_reap_now(kmem_cache_t *cp)
1692{
1693	kmem_depot_ws_update(cp);
1694	kmem_depot_ws_update(cp);
1695
1696	(void) taskq_dispatch(kmem_taskq,
1697	    (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP);
1698	taskq_wait(kmem_taskq);
1699}
1700
1701/*
1702 * Recompute a cache's magazine size.  The trade-off is that larger magazines
1703 * provide a higher transfer rate with the depot, while smaller magazines
1704 * reduce memory consumption.  Magazine resizing is an expensive operation;
1705 * it should not be done frequently.
1706 *
1707 * Changes to the magazine size are serialized by the kmem_taskq lock.
1708 *
1709 * Note: at present this only grows the magazine size.  It might be useful
1710 * to allow shrinkage too.
1711 */
1712static void
1713kmem_cache_magazine_resize(kmem_cache_t *cp)
1714{
1715	kmem_magtype_t *mtp = cp->cache_magtype;
1716
1717	ASSERT(taskq_member(kmem_taskq, curthread));
1718
1719	if (cp->cache_chunksize < mtp->mt_maxbuf) {
1720		kmem_cache_magazine_purge(cp);
1721		mutex_enter(&cp->cache_depot_lock);
1722		cp->cache_magtype = ++mtp;
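		/*
		 * Biasing the previous contention count by INT_MAX keeps
		 * kmem_cache_update() from seeing enough apparent new
		 * contention to trigger another resize immediately.
		 */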
1723		cp->cache_depot_contention_prev =
1724		    cp->cache_depot_contention + INT_MAX;
1725		mutex_exit(&cp->cache_depot_lock);
1726		kmem_cache_magazine_enable(cp);
1727	}
1728}
1729
1730/*
1731 * Rescale a cache's hash table, so that the table size is roughly the
1732 * cache size.  We want the average lookup time to be extremely small.
1733 */
1734static void
1735kmem_hash_rescale(kmem_cache_t *cp)
1736{
1737	kmem_bufctl_t **old_table, **new_table, *bcp;
1738	size_t old_size, new_size, h;
1739
1740	ASSERT(taskq_member(kmem_taskq, curthread));
1741
1742	new_size = MAX(KMEM_HASH_INITIAL,
1743	    1 << (highbit(3 * cp->cache_buftotal + 4) - 2));
1744	old_size = cp->cache_hash_mask + 1;
1745
1746	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
1747		return;
1748
1749	new_table = vmem_alloc(kmem_hash_arena, new_size * sizeof (void *),
1750	    VM_NOSLEEP);
1751	if (new_table == NULL)
1752		return;
1753	bzero(new_table, new_size * sizeof (void *));
1754
1755	mutex_enter(&cp->cache_lock);
1756
1757	old_size = cp->cache_hash_mask + 1;
1758	old_table = cp->cache_hash_table;
1759
1760	cp->cache_hash_mask = new_size - 1;
1761	cp->cache_hash_table = new_table;
1762	cp->cache_rescale++;
1763
1764	for (h = 0; h < old_size; h++) {
1765		bcp = old_table[h];
1766		while (bcp != NULL) {
1767			void *addr = bcp->bc_addr;
1768			kmem_bufctl_t *next_bcp = bcp->bc_next;
1769			kmem_bufctl_t **hash_bucket = KMEM_HASH(cp, addr);
1770			bcp->bc_next = *hash_bucket;
1771			*hash_bucket = bcp;
1772			bcp = next_bcp;
1773		}
1774	}
1775
1776	mutex_exit(&cp->cache_lock);
1777
1778	vmem_free(kmem_hash_arena, old_table, old_size * sizeof (void *));
1779}
1780
1781/*
1782 * Perform periodic maintenance on a cache: hash rescaling,
1783 * depot working-set update, and magazine resizing.
1784 */
1785static void
1786kmem_cache_update(kmem_cache_t *cp)
1787{
1788	int need_hash_rescale = 0;
1789	int need_magazine_resize = 0;
1790
1791	ASSERT(MUTEX_HELD(&kmem_cache_lock));
1792
1793	/*
1794	 * If the cache has become much larger or smaller than its hash table,
1795	 * fire off a request to rescale the hash table.
1796	 */
1797	mutex_enter(&cp->cache_lock);
1798
1799	if ((cp->cache_flags & KMF_HASH) &&
1800	    (cp->cache_buftotal > (cp->cache_hash_mask << 1) ||
1801	    (cp->cache_buftotal < (cp->cache_hash_mask >> 1) &&
1802	    cp->cache_hash_mask > KMEM_HASH_INITIAL)))
1803		need_hash_rescale = 1;
1804
1805	mutex_exit(&cp->cache_lock);
1806
1807	/*
1808	 * Update the depot working set statistics.
1809	 */
1810	kmem_depot_ws_update(cp);
1811
1812	/*
1813	 * If there's a lot of contention in the depot,
1814	 * increase the magazine size.
1815	 */
1816	mutex_enter(&cp->cache_depot_lock);
1817
1818	if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf &&
1819	    (int)(cp->cache_depot_contention -
1820	    cp->cache_depot_contention_prev) > kmem_depot_contention)
1821		need_magazine_resize = 1;
1822
1823	cp->cache_depot_contention_prev = cp->cache_depot_contention;
1824
1825	mutex_exit(&cp->cache_depot_lock);
1826
1827	if (need_hash_rescale)
1828		(void) taskq_dispatch(kmem_taskq,
1829		    (task_func_t *)kmem_hash_rescale, cp, TQ_NOSLEEP);
1830
1831	if (need_magazine_resize)
1832		(void) taskq_dispatch(kmem_taskq,
1833		    (task_func_t *)kmem_cache_magazine_resize, cp, TQ_NOSLEEP);
1834}
1835
1836static void
1837kmem_update_timeout(void *dummy)
1838{
1839	static void kmem_update(void *);
1840
1841	(void) timeout(kmem_update, dummy, kmem_reap_interval);
1842}
1843
1844static void
1845kmem_update(void *dummy)
1846{
1847	kmem_cache_applyall(kmem_cache_update, NULL, TQ_NOSLEEP);
1848
1849	/*
1850	 * We use taskq_dispatch() to reschedule the timeout so that
1851	 * kmem_update() becomes self-throttling: it won't schedule
1852	 * new tasks until all previous tasks have completed.
1853	 */
1854	if (!taskq_dispatch(kmem_taskq, kmem_update_timeout, dummy, TQ_NOSLEEP))
1855		kmem_update_timeout(NULL);
1856}
1857
1858static int
1859kmem_cache_kstat_update(kstat_t *ksp, int rw)
1860{
1861	struct kmem_cache_kstat *kmcp = &kmem_cache_kstat;
1862	kmem_cache_t *cp = ksp->ks_private;
1863	kmem_slab_t *sp;
1864	uint64_t cpu_buf_avail;
1865	uint64_t buf_avail = 0;
1866	int cpu_seqid;
1867
1868	ASSERT(MUTEX_HELD(&kmem_cache_kstat_lock));
1869
1870	if (rw == KSTAT_WRITE)
1871		return (EACCES);
1872
1873	mutex_enter(&cp->cache_lock);
1874
1875	kmcp->kmc_alloc_fail.value.ui64		= cp->cache_alloc_fail;
1876	kmcp->kmc_alloc.value.ui64		= cp->cache_slab_alloc;
1877	kmcp->kmc_free.value.ui64		= cp->cache_slab_free;
1878	kmcp->kmc_slab_alloc.value.ui64		= cp->cache_slab_alloc;
1879	kmcp->kmc_slab_free.value.ui64		= cp->cache_slab_free;
1880
1881	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
1882		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
1883
1884		mutex_enter(&ccp->cc_lock);
1885
1886		cpu_buf_avail = 0;
1887		if (ccp->cc_rounds > 0)
1888			cpu_buf_avail += ccp->cc_rounds;
1889		if (ccp->cc_prounds > 0)
1890			cpu_buf_avail += ccp->cc_prounds;
1891
1892		kmcp->kmc_alloc.value.ui64	+= ccp->cc_alloc;
1893		kmcp->kmc_free.value.ui64	+= ccp->cc_free;
1894		buf_avail			+= cpu_buf_avail;
1895
1896		mutex_exit(&ccp->cc_lock);
1897	}
1898
1899	mutex_enter(&cp->cache_depot_lock);
1900
1901	kmcp->kmc_depot_alloc.value.ui64	= cp->cache_full.ml_alloc;
1902	kmcp->kmc_depot_free.value.ui64		= cp->cache_empty.ml_alloc;
1903	kmcp->kmc_depot_contention.value.ui64	= cp->cache_depot_contention;
1904	kmcp->kmc_full_magazines.value.ui64	= cp->cache_full.ml_total;
1905	kmcp->kmc_empty_magazines.value.ui64	= cp->cache_empty.ml_total;
1906	kmcp->kmc_magazine_size.value.ui64	=
1907	    (cp->cache_flags & KMF_NOMAGAZINE) ?
1908	    0 : cp->cache_magtype->mt_magsize;
1909
1910	kmcp->kmc_alloc.value.ui64		+= cp->cache_full.ml_alloc;
1911	kmcp->kmc_free.value.ui64		+= cp->cache_empty.ml_alloc;
1912	buf_avail += cp->cache_full.ml_total * cp->cache_magtype->mt_magsize;
1913
1914	mutex_exit(&cp->cache_depot_lock);
1915
1916	kmcp->kmc_buf_size.value.ui64	= cp->cache_bufsize;
1917	kmcp->kmc_align.value.ui64	= cp->cache_align;
1918	kmcp->kmc_chunk_size.value.ui64	= cp->cache_chunksize;
1919	kmcp->kmc_slab_size.value.ui64	= cp->cache_slabsize;
1920	kmcp->kmc_buf_constructed.value.ui64 = buf_avail;
1921	for (sp = cp->cache_freelist; sp != &cp->cache_nullslab;
1922	    sp = sp->slab_next)
1923		buf_avail += sp->slab_chunks - sp->slab_refcnt;
1924	kmcp->kmc_buf_avail.value.ui64	= buf_avail;
1925	kmcp->kmc_buf_inuse.value.ui64	= cp->cache_buftotal - buf_avail;
1926	kmcp->kmc_buf_total.value.ui64	= cp->cache_buftotal;
1927	kmcp->kmc_buf_max.value.ui64	= cp->cache_bufmax;
1928	kmcp->kmc_slab_create.value.ui64	= cp->cache_slab_create;
1929	kmcp->kmc_slab_destroy.value.ui64	= cp->cache_slab_destroy;
1930	kmcp->kmc_hash_size.value.ui64	= (cp->cache_flags & KMF_HASH) ?
1931	    cp->cache_hash_mask + 1 : 0;
1932	kmcp->kmc_hash_lookup_depth.value.ui64	= cp->cache_lookup_depth;
1933	kmcp->kmc_hash_rescale.value.ui64	= cp->cache_rescale;
1934	kmcp->kmc_vmem_source.value.ui64	= cp->cache_arena->vm_id;
1935
1936	mutex_exit(&cp->cache_lock);
1937	return (0);
1938}
1939
1940/*
1941 * Return a named statistic about a particular cache.
1942 * This shouldn't be called very often, so it's currently designed for
1943 * simplicity (leverages existing kstat support) rather than efficiency.
1944 */
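/*
 * For example, a caller could fetch the number of allocated buffers with
 * kmem_cache_stat(cp, "buf_inuse"); this assumes the statistic names match
 * the kmem_cache_kstat field names, as they conventionally do.
 */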
1945uint64_t
1946kmem_cache_stat(kmem_cache_t *cp, char *name)
1947{
1948	int i;
1949	kstat_t *ksp = cp->cache_kstat;
1950	kstat_named_t *knp = (kstat_named_t *)&kmem_cache_kstat;
1951	uint64_t value = 0;
1952
1953	if (ksp != NULL) {
1954		mutex_enter(&kmem_cache_kstat_lock);
1955		(void) kmem_cache_kstat_update(ksp, KSTAT_READ);
1956		for (i = 0; i < ksp->ks_ndata; i++) {
1957			if (strcmp(knp[i].name, name) == 0) {
1958				value = knp[i].value.ui64;
1959				break;
1960			}
1961		}
1962		mutex_exit(&kmem_cache_kstat_lock);
1963	}
1964	return (value);
1965}
1966
1967/*
1968 * Return an estimate of currently available kernel heap memory.
1969 * On 32-bit systems, physical memory may exceed virtual memory,
1970 * so we just truncate the result at 1GB.
1971 */
1972size_t
1973kmem_avail(void)
1974{
1975	spgcnt_t rmem = availrmem - tune.t_minarmem;
1976	spgcnt_t fmem = freemem - minfree;
1977
1978	return ((size_t)ptob(MIN(MAX(MIN(rmem, fmem), 0),
1979	    1 << (30 - PAGESHIFT))));
1980}
1981
1982/*
1983 * Return the maximum amount of memory that is (in theory) allocatable
1984 * from the heap. This may be used as an estimate only since there
1985 * is no guarentee this space will still be available when an allocation
1986 * request is made, nor that the space may be allocated in one big request
1987 * due to kernel heap fragmentation.
1988 */
1989size_t
1990kmem_maxavail(void)
1991{
1992	spgcnt_t pmem = availrmem - tune.t_minarmem;
1993	spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE));
1994
1995	return ((size_t)ptob(MAX(MIN(pmem, vmem), 0)));
1996}
1997
1998/*
1999 * Indicate whether memory-intensive kmem debugging is enabled.
2000 */
2001int
2002kmem_debugging(void)
2003{
2004	return (kmem_flags & (KMF_AUDIT | KMF_REDZONE));
2005}
2006
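/*
 * Create an object cache.  As a purely illustrative sketch (foo_t and its
 * constructor/destructor are hypothetical, not part of this file), a
 * typical client would do something like:
 *
 *	foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
 *	    foo_constructor, foo_destructor, NULL, NULL, NULL, 0);
 *
 * and then allocate objects with kmem_cache_alloc(foo_cache, KM_SLEEP).
 */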
2007kmem_cache_t *
2008kmem_cache_create(
2009	char *name,		/* descriptive name for this cache */
2010	size_t bufsize,		/* size of the objects it manages */
2011	size_t align,		/* required object alignment */
2012	int (*constructor)(void *, void *, int), /* object constructor */
2013	void (*destructor)(void *, void *),	/* object destructor */
2014	void (*reclaim)(void *), /* memory reclaim callback */
2015	void *private,		/* pass-thru arg for constr/destr/reclaim */
2016	vmem_t *vmp,		/* vmem source for slab allocation */
2017	int cflags)		/* cache creation flags */
2018{
2019	int cpu_seqid;
2020	size_t chunksize;
2021	kmem_cache_t *cp, *cnext, *cprev;
2022	kmem_magtype_t *mtp;
2023	size_t csize = KMEM_CACHE_SIZE(max_ncpus);
2024
2025#ifdef	DEBUG
2026	/*
2027	 * Cache names should conform to the rules for valid C identifiers
2028	 */
2029	if (!strident_valid(name)) {
2030		cmn_err(CE_CONT,
2031		    "kmem_cache_create: '%s' is an invalid cache name\n"
2032		    "cache names must conform to the rules for "
2033		    "C identifiers\n", name);
2034	}
2035#endif	/* DEBUG */
2036
2037	if (vmp == NULL)
2038		vmp = kmem_default_arena;
2039
2040	/*
2041	 * If this kmem cache has an identifier vmem arena as its source, mark
2042	 * it as such so that kmem_reap_idspace() can reap it.
2043	 */
2044	ASSERT(!(cflags & KMC_IDENTIFIER));   /* consumer should not set this */
2045	if (vmp->vm_cflags & VMC_IDENTIFIER)
2046		cflags |= KMC_IDENTIFIER;
2047
2048	/*
2049	 * Get a kmem_cache structure.  We arrange that cp->cache_cpu[]
2050	 * is aligned on a KMEM_CPU_CACHE_SIZE boundary to prevent
2051	 * false sharing of per-CPU data.
2052	 */
2053	cp = vmem_xalloc(kmem_cache_arena, csize, KMEM_CPU_CACHE_SIZE,
2054	    P2NPHASE(csize, KMEM_CPU_CACHE_SIZE), 0, NULL, NULL, VM_SLEEP);
2055	bzero(cp, csize);
2056
2057	if (align == 0)
2058		align = KMEM_ALIGN;
2059
2060	/*
2061	 * If we're not at least KMEM_ALIGN aligned, we can't use free
2062	 * memory to hold bufctl information (because we can't safely
2063	 * perform word loads and stores on it).
2064	 */
2065	if (align < KMEM_ALIGN)
2066		cflags |= KMC_NOTOUCH;
2067
2068	if ((align & (align - 1)) != 0 || align > vmp->vm_quantum)
2069		panic("kmem_cache_create: bad alignment %lu", align);
2070
2071	mutex_enter(&kmem_flags_lock);
2072	if (kmem_flags & KMF_RANDOMIZE)
2073		kmem_flags = (((kmem_flags | ~KMF_RANDOM) + 1) & KMF_RANDOM) |
2074		    KMF_RANDOMIZE;
2075	cp->cache_flags = (kmem_flags | cflags) & KMF_DEBUG;
2076	mutex_exit(&kmem_flags_lock);
2077
2078	/*
2079	 * Make sure all the various flags are reasonable.
2080	 */
2081	ASSERT(!(cflags & KMC_NOHASH) || !(cflags & KMC_NOTOUCH));
2082
2083	if (cp->cache_flags & KMF_LITE) {
2084		if (bufsize >= kmem_lite_minsize &&
2085		    align <= kmem_lite_maxalign &&
2086		    P2PHASE(bufsize, kmem_lite_maxalign) != 0) {
2087		    P2PHASE(bufsize, kmem_lite_maxalign) == 0) {
2088			cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
2089		} else {
2090			cp->cache_flags &= ~KMF_DEBUG;
2091		}
2092	}
2093
2094	if (cp->cache_flags & KMF_DEADBEEF)
2095		cp->cache_flags |= KMF_REDZONE;
2096
2097	if ((cflags & KMC_QCACHE) && (cp->cache_flags & KMF_AUDIT))
2098		cp->cache_flags |= KMF_NOMAGAZINE;
2099
2100	if (cflags & KMC_NODEBUG)
2101		cp->cache_flags &= ~KMF_DEBUG;
2102
2103	if (cflags & KMC_NOTOUCH)
2104		cp->cache_flags &= ~KMF_TOUCH;
2105
2106	if (cflags & KMC_NOHASH)
2107		cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
2108
2109	if (cflags & KMC_NOMAGAZINE)
2110		cp->cache_flags |= KMF_NOMAGAZINE;
2111
2112	if ((cp->cache_flags & KMF_AUDIT) && !(cflags & KMC_NOTOUCH))
2113		cp->cache_flags |= KMF_REDZONE;
2114
2115	if (!(cp->cache_flags & KMF_AUDIT))
2116		cp->cache_flags &= ~KMF_CONTENTS;
2117
2118	if ((cp->cache_flags & KMF_BUFTAG) && bufsize >= kmem_minfirewall &&
2119	    !(cp->cache_flags & KMF_LITE) && !(cflags & KMC_NOHASH))
2120		cp->cache_flags |= KMF_FIREWALL;
2121
2122	if (vmp != kmem_default_arena || kmem_firewall_arena == NULL)
2123		cp->cache_flags &= ~KMF_FIREWALL;
2124
2125	if (cp->cache_flags & KMF_FIREWALL) {
2126		cp->cache_flags &= ~KMF_BUFTAG;
2127		cp->cache_flags |= KMF_NOMAGAZINE;
2128		ASSERT(vmp == kmem_default_arena);
2129		vmp = kmem_firewall_arena;
2130	}
2131
2132	/*
2133	 * Set cache properties.
2134	 */
2135	(void) strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN);
2136	strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN);
2137	cp->cache_bufsize = bufsize;
2138	cp->cache_align = align;
2139	cp->cache_constructor = constructor;
2140	cp->cache_destructor = destructor;
2141	cp->cache_reclaim = reclaim;
2142	cp->cache_private = private;
2143	cp->cache_arena = vmp;
2144	cp->cache_cflags = cflags;
2145
2146	/*
2147	 * Determine the chunk size.
2148	 */
2149	chunksize = bufsize;
2150
2151	if (align >= KMEM_ALIGN) {
2152		chunksize = P2ROUNDUP(chunksize, KMEM_ALIGN);
2153		cp->cache_bufctl = chunksize - KMEM_ALIGN;
2154	}
2155
2156	if (cp->cache_flags & KMF_BUFTAG) {
2157		cp->cache_bufctl = chunksize;
2158		cp->cache_buftag = chunksize;
2159		if (cp->cache_flags & KMF_LITE)
2160			chunksize += KMEM_BUFTAG_LITE_SIZE(kmem_lite_count);
2161		else
2162			chunksize += sizeof (kmem_buftag_t);
2163	}
2164
2165	if (cp->cache_flags & KMF_DEADBEEF) {
2166		cp->cache_verify = MIN(cp->cache_buftag, kmem_maxverify);
2167		if (cp->cache_flags & KMF_LITE)
2168			cp->cache_verify = sizeof (uint64_t);
2169	}
2170
2171	cp->cache_contents = MIN(cp->cache_bufctl, kmem_content_maxsave);
2172
2173	cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align);
2174
2175	/*
2176	 * Now that we know the chunk size, determine the optimal slab size.
2177	 */
2178	if (vmp == kmem_firewall_arena) {
2179		cp->cache_slabsize = P2ROUNDUP(chunksize, vmp->vm_quantum);
2180		cp->cache_mincolor = cp->cache_slabsize - chunksize;
2181		cp->cache_maxcolor = cp->cache_mincolor;
2182		cp->cache_flags |= KMF_HASH;
2183		ASSERT(!(cp->cache_flags & KMF_BUFTAG));
2184	} else if ((cflags & KMC_NOHASH) || (!(cflags & KMC_NOTOUCH) &&
2185	    !(cp->cache_flags & KMF_AUDIT) &&
2186	    chunksize < vmp->vm_quantum / KMEM_VOID_FRACTION)) {
2187		cp->cache_slabsize = vmp->vm_quantum;
2188		cp->cache_mincolor = 0;
2189		cp->cache_maxcolor =
2190		    (cp->cache_slabsize - sizeof (kmem_slab_t)) % chunksize;
2191		ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize);
2192		ASSERT(!(cp->cache_flags & KMF_AUDIT));
2193	} else {
2194		size_t chunks, bestfit, waste, slabsize;
2195		size_t minwaste = LONG_MAX;
2196
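		/*
		 * Evaluate each candidate chunks-per-slab count and keep
		 * the slab size whose leftover space, divided among its
		 * chunks, wastes the least memory per chunk.
		 */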
2197		for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) {
2198			slabsize = P2ROUNDUP(chunksize * chunks,
2199			    vmp->vm_quantum);
2200			chunks = slabsize / chunksize;
2201			waste = (slabsize % chunksize) / chunks;
2202			if (waste < minwaste) {
2203				minwaste = waste;
2204				bestfit = slabsize;
2205			}
2206		}
2207		if (cflags & KMC_QCACHE)
2208			bestfit = VMEM_QCACHE_SLABSIZE(vmp->vm_qcache_max);
2209		cp->cache_slabsize = bestfit;
2210		cp->cache_mincolor = 0;
2211		cp->cache_maxcolor = bestfit % chunksize;
2212		cp->cache_flags |= KMF_HASH;
2213	}
2214
2215	if (cp->cache_flags & KMF_HASH) {
2216		ASSERT(!(cflags & KMC_NOHASH));
2217		cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ?
2218		    kmem_bufctl_audit_cache : kmem_bufctl_cache;
2219	}
2220
2221	if (cp->cache_maxcolor >= vmp->vm_quantum)
2222		cp->cache_maxcolor = vmp->vm_quantum - 1;
2223
2224	cp->cache_color = cp->cache_mincolor;
2225
2226	/*
2227	 * Initialize the rest of the slab layer.
2228	 */
2229	mutex_init(&cp->cache_lock, NULL, MUTEX_DEFAULT, NULL);
2230
2231	cp->cache_freelist = &cp->cache_nullslab;
2232	cp->cache_nullslab.slab_cache = cp;
2233	cp->cache_nullslab.slab_refcnt = -1;
2234	cp->cache_nullslab.slab_next = &cp->cache_nullslab;
2235	cp->cache_nullslab.slab_prev = &cp->cache_nullslab;
2236
2237	if (cp->cache_flags & KMF_HASH) {
2238		cp->cache_hash_table = vmem_alloc(kmem_hash_arena,
2239		    KMEM_HASH_INITIAL * sizeof (void *), VM_SLEEP);
2240		bzero(cp->cache_hash_table,
2241		    KMEM_HASH_INITIAL * sizeof (void *));
2242		cp->cache_hash_mask = KMEM_HASH_INITIAL - 1;
2243		cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1;
2244	}
2245
2246	/*
2247	 * Initialize the depot.
2248	 */
2249	mutex_init(&cp->cache_depot_lock, NULL, MUTEX_DEFAULT, NULL);
2250
2251	for (mtp = kmem_magtype; chunksize <= mtp->mt_minbuf; mtp++)
2252		continue;
2253
2254	cp->cache_magtype = mtp;
2255
2256	/*
2257	 * Initialize the CPU layer.
2258	 */
2259	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
2260		kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
2261		mutex_init(&ccp->cc_lock, NULL, MUTEX_DEFAULT, NULL);
2262		ccp->cc_flags = cp->cache_flags;
2263		ccp->cc_rounds = -1;
2264		ccp->cc_prounds = -1;
2265	}
2266
2267	/*
2268	 * Create the cache's kstats.
2269	 */
2270	if ((cp->cache_kstat = kstat_create("unix", 0, cp->cache_name,
2271	    "kmem_cache", KSTAT_TYPE_NAMED,
2272	    sizeof (kmem_cache_kstat) / sizeof (kstat_named_t),
2273	    KSTAT_FLAG_VIRTUAL)) != NULL) {
2274		cp->cache_kstat->ks_data = &kmem_cache_kstat;
2275		cp->cache_kstat->ks_update = kmem_cache_kstat_update;
2276		cp->cache_kstat->ks_private = cp;
2277		cp->cache_kstat->ks_lock = &kmem_cache_kstat_lock;
2278		kstat_install(cp->cache_kstat);
2279	}
2280
2281	/*
2282	 * Add the cache to the global list.  This makes it visible
2283	 * to kmem_update(), so the cache must be ready for business.
2284	 */
2285	mutex_enter(&kmem_cache_lock);
2286	cp->cache_next = cnext = &kmem_null_cache;
2287	cp->cache_prev = cprev = kmem_null_cache.cache_prev;
2288	cnext->cache_prev = cp;
2289	cprev->cache_next = cp;
2290	mutex_exit(&kmem_cache_lock);
2291
2292	if (kmem_ready)
2293		kmem_cache_magazine_enable(cp);
2294
2295	return (cp);
2296}
2297
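/*
 * Destroy a cache.  The caller is responsible for having freed every
 * object allocated from the cache and for preventing any further use of
 * it; a non-empty cache draws the warning below.
 */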
2298void
2299kmem_cache_destroy(kmem_cache_t *cp)
2300{
2301	int cpu_seqid;
2302
2303	/*
2304	 * Remove the cache from the global cache list so that no one else
2305	 * can schedule tasks on its behalf, wait for any pending tasks to
2306	 * complete, purge the cache, and then destroy it.
2307	 */
2308	mutex_enter(&kmem_cache_lock);
2309	cp->cache_prev->cache_next = cp->cache_next;
2310	cp->cache_next->cache_prev = cp->cache_prev;
2311	cp->cache_prev = cp->cache_next = NULL;
2312	mutex_exit(&kmem_cache_lock);
2313
2314	if (kmem_taskq != NULL)
2315		taskq_wait(kmem_taskq);
2316
2317	kmem_cache_magazine_purge(cp);
2318
2319	mutex_enter(&cp->cache_lock);
2320	if (cp->cache_buftotal != 0)
2321		cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty",
2322		    cp->cache_name, (void *)cp);
2323	cp->cache_reclaim = NULL;
2324	/*
2325	 * The cache is now dead.  There should be no further activity.
2326	 * We enforce this by setting land mines in the constructor and
2327	 * destructor routines that induce a kernel text fault if invoked.
2328	 */
2329	cp->cache_constructor = (int (*)(void *, void *, int))1;
2330	cp->cache_destructor = (void (*)(void *, void *))2;
2331	mutex_exit(&cp->cache_lock);
2332
2333	kstat_delete(cp->cache_kstat);
2334
2335	if (cp->cache_hash_table != NULL)
2336		vmem_free(kmem_hash_arena, cp->cache_hash_table,
2337		    (cp->cache_hash_mask + 1) * sizeof (void *));
2338
2339	for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++)
2340		mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
2341
2342	mutex_destroy(&cp->cache_depot_lock);
2343	mutex_destroy(&cp->cache_lock);
2344
2345	vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus));
2346}
2347
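/*
 * CPU hotplug callback.  When a CPU is unconfigured, purge and then
 * re-enable the magazine layer of every cache so that buffers are not
 * left stranded in the departing CPU's magazines.
 */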
2348/*ARGSUSED*/
2349static int
2350kmem_cpu_setup(cpu_setup_t what, int id, void *arg)
2351{
2352	ASSERT(MUTEX_HELD(&cpu_lock));
2353	if (what == CPU_UNCONFIG) {
2354		kmem_cache_applyall(kmem_cache_magazine_purge,
2355		    kmem_taskq, TQ_SLEEP);
2356		kmem_cache_applyall(kmem_cache_magazine_enable,
2357		    kmem_taskq, TQ_SLEEP);
2358	}
2359	return (0);
2360}
2361
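/*
 * Create the allocator's internal support caches (one per magazine type,
 * plus the slab and bufctl caches) and the kmem_alloc_<size> caches that
 * back kmem_alloc().  On the second pass the kmem_va and kmem_default
 * arenas are created as well; during the first pass kmem_default_arena
 * simply aliases kmem_msb_arena.
 */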
2362static void
2363kmem_cache_init(int pass, int use_large_pages)
2364{
2365	int i;
2366	size_t size;
2367	kmem_cache_t *cp;
2368	kmem_magtype_t *mtp;
2369	char name[KMEM_CACHE_NAMELEN + 1];
2370
2371	for (i = 0; i < sizeof (kmem_magtype) / sizeof (*mtp); i++) {
2372		mtp = &kmem_magtype[i];
2373		(void) sprintf(name, "kmem_magazine_%d", mtp->mt_magsize);
2374		mtp->mt_cache = kmem_cache_create(name,
2375		    (mtp->mt_magsize + 1) * sizeof (void *),
2376		    mtp->mt_align, NULL, NULL, NULL, NULL,
2377		    kmem_msb_arena, KMC_NOHASH);
2378	}
2379
2380	kmem_slab_cache = kmem_cache_create("kmem_slab_cache",
2381	    sizeof (kmem_slab_t), 0, NULL, NULL, NULL, NULL,
2382	    kmem_msb_arena, KMC_NOHASH);
2383
2384	kmem_bufctl_cache = kmem_cache_create("kmem_bufctl_cache",
2385	    sizeof (kmem_bufctl_t), 0, NULL, NULL, NULL, NULL,
2386	    kmem_msb_arena, KMC_NOHASH);
2387
2388	kmem_bufctl_audit_cache = kmem_cache_create("kmem_bufctl_audit_cache",
2389	    sizeof (kmem_bufctl_audit_t), 0, NULL, NULL, NULL, NULL,
2390	    kmem_msb_arena, KMC_NOHASH);
2391
2392	if (pass == 2) {
2393		kmem_va_arena = vmem_create("kmem_va",
2394		    NULL, 0, PAGESIZE,
2395		    vmem_alloc, vmem_free, heap_arena,
2396		    8 * PAGESIZE, VM_SLEEP);
2397
2398		if (use_large_pages) {
2399			kmem_default_arena = vmem_xcreate("kmem_default",
2400			    NULL, 0, PAGESIZE,
2401			    segkmem_alloc_lp, segkmem_free_lp, kmem_va_arena,
2402			    0, VM_SLEEP);
2403		} else {
2404			kmem_default_arena = vmem_create("kmem_default",
2405			    NULL, 0, PAGESIZE,
2406			    segkmem_alloc, segkmem_free, kmem_va_arena,
2407			    0, VM_SLEEP);
2408		}
2409	} else {
2410		/*
2411		 * During the first pass, the kmem_alloc_* caches
2412		 * are treated as metadata.
2413		 */
2414		kmem_default_arena = kmem_msb_arena;
2415	}
2416
2417	/*
2418	 * Set up the default caches to back kmem_alloc()
2419	 */
2420	size = KMEM_ALIGN;
2421	for (i = 0; i < sizeof (kmem_alloc_sizes) / sizeof (int); i++) {
2422		size_t align = KMEM_ALIGN;
2423		size_t cache_size = kmem_alloc_sizes[i];
2424		/*
2425		 * If they allocate a multiple of the coherency granularity,
2426		 * they get a coherency-granularity-aligned address.
2427		 */
2428		if (IS_P2ALIGNED(cache_size, 64))
2429			align = 64;
2430		if (IS_P2ALIGNED(cache_size, PAGESIZE))
2431			align = PAGESIZE;
2432		(void) sprintf(name, "kmem_alloc_%lu", cache_size);
2433		cp = kmem_cache_create(name, cache_size, align,
2434		    NULL, NULL, NULL, NULL, NULL, KMC_KMEM_ALLOC);
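		/*
		 * Point every kmem_alloc_table slot between the previous
		 * cache's size and this one at this cache, so kmem_alloc()
		 * can find the right cache with a single table lookup.
		 */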
2435		while (size <= cache_size) {
2436			kmem_alloc_table[(size - 1) >> KMEM_ALIGN_SHIFT] = cp;
2437			size += KMEM_ALIGN;
2438		}
2439	}
2440}
2441
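/*
 * Bring up the kernel memory allocator: create the metadata arenas,
 * bootstrap the standard caches once so /etc/system can be read, rebuild
 * them with the final tunables, and then initialize the debugging logs
 * and dependent subsystems.
 */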
2442void
2443kmem_init(void)
2444{
2445	kmem_cache_t *cp;
2446	int old_kmem_flags = kmem_flags;
2447	int use_large_pages = 0;
2448	size_t maxverify, minfirewall;
2449
2450	kstat_init();
2451
2452	/*
2453	 * Small-memory systems (< 24 MB) can't handle kmem_flags overhead.
2454	 */
2455	if (physmem < btop(24 << 20) && !(old_kmem_flags & KMF_STICKY))
2456		kmem_flags = 0;
2457
2458	/*
2459	 * Don't do firewalled allocations if the heap is less than 1TB
2460	 * (i.e. on a 32-bit kernel).
2461	 * The resulting VM_NEXTFIT allocations would create too much
2462	 * fragmentation in a small heap.
2463	 */
2464#if defined(_LP64)
2465	maxverify = minfirewall = PAGESIZE / 2;
2466#else
2467	maxverify = minfirewall = ULONG_MAX;
2468#endif
2469
2470	/* LINTED */
2471	ASSERT(sizeof (kmem_cpu_cache_t) == KMEM_CPU_CACHE_SIZE);
2472
2473	kmem_null_cache.cache_next = &kmem_null_cache;
2474	kmem_null_cache.cache_prev = &kmem_null_cache;
2475
2476	kmem_metadata_arena = vmem_create("kmem_metadata", NULL, 0, PAGESIZE,
2477	    vmem_alloc, vmem_free, heap_arena, 8 * PAGESIZE,
2478	    VM_SLEEP | VMC_NO_QCACHE);
2479
2480	kmem_msb_arena = vmem_create("kmem_msb", NULL, 0,
2481	    PAGESIZE, segkmem_alloc, segkmem_free, kmem_metadata_arena, 0,
2482	    VM_SLEEP);
2483
2484	kmem_cache_arena = vmem_create("kmem_cache", NULL, 0, KMEM_ALIGN,
2485	    segkmem_alloc, segkmem_free, kmem_metadata_arena, 0, VM_SLEEP);
2486
2487	kmem_hash_arena = vmem_create("kmem_hash", NULL, 0, KMEM_ALIGN,
2488	    segkmem_alloc, segkmem_free, kmem_metadata_arena, 0, VM_SLEEP);
2489
2490	kmem_log_arena = vmem_create("kmem_log", NULL, 0, KMEM_ALIGN,
2491	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
2492
2493	kmem_firewall_va_arena = vmem_create("kmem_firewall_va",
2494	    NULL, 0, PAGESIZE,
2495	    kmem_firewall_va_alloc, kmem_firewall_va_free, heap_arena,
2496	    0, VM_SLEEP);
2497
2498	kmem_firewall_arena = vmem_create("kmem_firewall", NULL, 0, PAGESIZE,
2499	    segkmem_alloc, segkmem_free, kmem_firewall_va_arena, 0, VM_SLEEP);
2500
2501	/* temporary oversize arena for mod_read_system_file */
2502	kmem_oversize_arena = vmem_create("kmem_oversize", NULL, 0, PAGESIZE,
2503	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
2504
2508	kmem_reap_interval = 15 * hz;
2509
2510	/*
2511	 * Read /etc/system.  This is a chicken-and-egg problem because
2512	 * kmem_flags may be set in /etc/system, but mod_read_system_file()
2513	 * needs to use the allocator.  The simplest solution is to create
2514	 * all the standard kmem caches, read /etc/system, destroy all the
2515	 * caches we just created, and then create them all again in light
2516	 * of the (possibly) new kmem_flags and other kmem tunables.
2517	 */
2518	kmem_cache_init(1, 0);
2519
2520	mod_read_system_file(boothowto & RB_ASKNAME);
2521
2522	while ((cp = kmem_null_cache.cache_prev) != &kmem_null_cache)
2523		kmem_cache_destroy(cp);
2524
2525	vmem_destroy(kmem_oversize_arena);
2526
2527	if (old_kmem_flags & KMF_STICKY)
2528		kmem_flags = old_kmem_flags;
2529
2530	if (!(kmem_flags & KMF_AUDIT))
2531		vmem_seg_size = offsetof(vmem_seg_t, vs_thread);
2532
2533	if (kmem_maxverify == 0)
2534		kmem_maxverify = maxverify;
2535
2536	if (kmem_minfirewall == 0)
2537		kmem_minfirewall = minfirewall;
2538
2539	/*
2540	 * give segkmem a chance to figure out if we are using large pages
2541	 * for the kernel heap
2542	 */
2543	use_large_pages = segkmem_lpsetup();
2544
2545	/*
2546	 * To protect against corruption, we keep the actual number of callers
2547	 * KMF_LITE records seperate from the tunable.  We arbitrarily clamp
2548	 * to 16, since the overhead for small buffers quickly gets out of
2549	 * hand.
2550	 *
2551	 * The real limit would depend on the needs of the largest KMC_NOHASH
2552	 * cache.
2553	 */
2554	kmem_lite_count = MIN(MAX(0, kmem_lite_pcs), 16);
2555	kmem_lite_pcs = kmem_lite_count;
2556
2557	/*
2558	 * Normally, we firewall oversized allocations when possible, but
2559	 * if we are using large pages for kernel memory, and we don't have
2560	 * any non-LITE debugging flags set, we want to allocate oversized
2561	 * buffers from large pages, and so skip the firewalling.
2562	 */
2563	if (use_large_pages &&
2564	    ((kmem_flags & KMF_LITE) || !(kmem_flags & KMF_DEBUG))) {
2565		kmem_oversize_arena = vmem_xcreate("kmem_oversize", NULL, 0,
2566		    PAGESIZE, segkmem_alloc_lp, segkmem_free_lp, heap_arena,
2567		    0, VM_SLEEP);
2568	} else {
2569		kmem_oversize_arena = vmem_create("kmem_oversize",
2570		    NULL, 0, PAGESIZE,
2571		    segkmem_alloc, segkmem_free, kmem_minfirewall < ULONG_MAX ?
2572		    kmem_firewall_va_arena : heap_arena, 0, VM_SLEEP);
2573	}
2574
2575	kmem_cache_init(2, use_large_pages);
2576
2577	if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) {
2578		if (kmem_transaction_log_size == 0)
2579			kmem_transaction_log_size = kmem_maxavail() / 50;
2580		kmem_transaction_log = kmem_log_init(kmem_transaction_log_size);
2581	}
2582
2583	if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) {
2584		if (kmem_content_log_size == 0)
2585			kmem_content_log_size = kmem_maxavail() / 50;
2586		kmem_content_log = kmem_log_init(kmem_content_log_size);
2587	}
2588
2589	kmem_failure_log = kmem_log_init(kmem_failure_log_size);
2590
2591	kmem_slab_log = kmem_log_init(kmem_slab_log_size);
2592
2593	/*
2594	 * Initialize STREAMS message caches so allocb() is available.
2595	 * This allows us to initialize the logging framework (cmn_err(9F),
2596	 * strlog(9F), etc) so we can start recording messages.
2597	 */
2598	streams_msg_init();
2599
2600	/*
2601	 * Initialize the ZSD framework in Zones so modules loaded henceforth
2602	 * can register their callbacks.
2603	 */
2604	zone_zsd_init();
2605
2606	log_init();
2607	taskq_init();
2608
2609	/*
2610	 * Warn about invalid or dangerous values of kmem_flags.
2611	 * Always warn about unsupported values.
2612	 */
2613	if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE |
2614	    KMF_CONTENTS | KMF_LITE)) != 0) ||
2615	    ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE))
2616		cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x. "
2617		    "See the Solaris Tunable Parameters Reference Manual.",
2618		    kmem_flags);
2619
2620#ifdef DEBUG
2621	if ((kmem_flags & KMF_DEBUG) == 0)
2622		cmn_err(CE_NOTE, "kmem debugging disabled.");
2623#else
2624	/*
2625	 * For non-debug kernels, the only "normal" flags are 0, KMF_LITE,
2626	 * KMF_REDZONE, and KMF_CONTENTS (the last because it is only enabled
2627	 * if KMF_AUDIT is set). We should warn the user about the performance
2628	 * penalty of KMF_AUDIT or KMF_DEADBEEF if they are set and KMF_LITE
2629	 * isn't set (since that disables AUDIT).
2630	 */
2631	if (!(kmem_flags & KMF_LITE) &&
2632	    (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0)
2633		cmn_err(CE_WARN, "High-overhead kmem debugging features "
2634		    "enabled (kmem_flags = 0x%x).  Performance degradation "
2635		    "and large memory overhead possible. See the Solaris "
2636		    "Tunable Parameters Reference Manual.", kmem_flags);
2637#endif /* not DEBUG */
2638
2639	kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP);
2640
2641	kmem_ready = 1;
2642
2643	/*
2644	 * Initialize the platform-specific aligned/DMA memory allocator.
2645	 */
2646	ka_init();
2647
2648	/*
2649	 * Initialize 32-bit ID cache.
2650	 */
2651	id32_init();
2652
2653	/*
2654	 * Initialize the networking stack so that loaded modules can
2655	 * register their callbacks.
2656	 */
2657	netstack_init();
2658}
2659
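/*
 * Create kmem_taskq, the task queue used for asynchronous cache
 * maintenance such as the hash rescaling and magazine resizing
 * dispatched by kmem_cache_update().
 */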
2660void
2661kmem_thread_init(void)
2662{
2663	kmem_taskq = taskq_create_instance("kmem_taskq", 0, 1, minclsyspri,
2664	    300, INT_MAX, TASKQ_PREPOPULATE);
2665}
2666
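/*
 * MP startup hook: register kmem_cpu_setup() with the CPU framework and
 * start the periodic kmem_update() cycle.
 */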
2667void
2668kmem_mp_init(void)
2669{
2670	mutex_enter(&cpu_lock);
2671	register_cpu_setup_func(kmem_cpu_setup, NULL);
2672	mutex_exit(&cpu_lock);
2673
2674	kmem_update_timeout(NULL);
2675}
2676