umem.c revision 1219:f89f56c2d9ac
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23/*
24 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
25 * Use is subject to license terms.
26 */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30/*
31 * based on usr/src/uts/common/os/kmem.c r1.64 from 2001/12/18
32 *
33 * The slab allocator, as described in the following two papers:
34 *
35 *	Jeff Bonwick,
36 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator.
37 *	Proceedings of the Summer 1994 Usenix Conference.
38 *	Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
39 *
40 *	Jeff Bonwick and Jonathan Adams,
41 *	Magazines and vmem: Extending the Slab Allocator to Many CPUs and
42 *	Arbitrary Resources.
43 *	Proceedings of the 2001 Usenix Conference.
44 *	Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
45 *
46 * 1. Overview
47 * -----------
48 * umem is very close to kmem in implementation.  There are five major
49 * areas of divergence:
50 *
51 *	* Initialization
52 *
53 *	* CPU handling
54 *
55 *	* umem_update()
56 *
57 *	* KM_SLEEP vs. UMEM_NOFAIL
58 *
59 *	* lock ordering
60 *
61 * 2. Initialization
62 * -----------------
63 * kmem is initialized early on in boot, and knows that no one will call
64 * into it before it is ready.  umem does not have these luxuries. Instead,
65 * initialization is divided into two phases:
66 *
67 *	* library initialization, and
68 *
69 *	* first use
70 *
71 * umem's full initialization happens at the time of the first allocation
72 * request (via malloc() and friends, umem_alloc(), or umem_zalloc()),
73 * or the first call to umem_cache_create().
74 *
75 * umem_free() and umem_cache_alloc() do not require special handling,
76 * since the only way to get valid arguments for them is to successfully
77 * call a function from the first group.
78 *
79 * 2.1. Library Initialization: umem_startup()
80 * -------------------------------------------
81 * umem_startup() is libumem.so's .init section.  It calls pthread_atfork()
82 * to install the handlers necessary for umem's Fork1-Safety.  Because of
83 * race condition issues, all other pre-umem_init() initialization is done
84 * statically (i.e. by the dynamic linker).
85 *
86 * For standalone use, umem_startup() returns everything to its initial
87 * state.
88 *
89 * 2.2. First use: umem_init()
90 * ------------------------------
91 * The first time any memory allocation function is used, we have to
92 * create the backing caches and vmem arenas which are needed for it.
93 * umem_init() is the central point for that task.  When it completes,
94 * umem_ready is either UMEM_READY (all set) or UMEM_READY_INIT_FAILED (unable
95 * to initialize, probably due to lack of memory).
96 *
97 * There are four different paths from which umem_init() is called:
98 *
99 *	* from umem_alloc() or umem_zalloc(), with 0 < size <= UMEM_MAXBUF,
100 *
101 *	* from umem_alloc() or umem_zalloc(), with size > UMEM_MAXBUF,
102 *
103 *	* from umem_cache_create(), and
104 *
105 *	* from memalign(), with align > UMEM_ALIGN.
106 *
107 * The last three just check if umem is initialized, and call umem_init()
108 * if it is not.  For performance reasons, the first case is more complicated.
109 *
110 * 2.2.1. umem_alloc()/umem_zalloc(), with 0 < size <= UMEM_MAXBUF
111 * -----------------------------------------------------------------
112 * In this case, umem_cache_alloc(&umem_null_cache, ...) is called.
113 * There is special case code which causes any allocation on
114 * &umem_null_cache to fail by returning (NULL), regardless of the
115 * flags argument.
116 *
117 * So umem_cache_alloc() returns NULL, and umem_alloc()/umem_zalloc() call
118 * umem_alloc_retry().  umem_alloc_retry() sees that the allocation
119 * was against &umem_null_cache, and calls umem_init().
120 *
121 * If initialization is successful, umem_alloc_retry() returns 1, which
122 * causes umem_alloc()/umem_zalloc() to start over and load the (now
123 * valid) cache pointer from umem_alloc_table.
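 *
 * Schematically, using the names that appear later in this file:
 *
 *	umem_alloc(size, umflag)
 *	  -> _umem_cache_alloc(&umem_null_cache, umflag)  (always returns NULL)
 *	  -> umem_alloc_retry(&umem_null_cache, umflag)   (calls umem_init())
 *	  -> start over, now using the real cache from umem_alloc_table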
124 *
125 * 2.2.2. Dealing with race conditions
126 * -----------------------------------
127 * There are a couple race conditions resulting from the initialization
128 * code that we have to guard against:
129 *
130 *	* In umem_cache_create(), there is a special UMC_INTERNAL cflag
131 *	that is passed for caches created during initialization.  It
132 *	is illegal for a user to try to create a UMC_INTERNAL cache.
133 *	This allows initialization to proceed, but any other
134 *	umem_cache_create()s will block by calling umem_init().
135 *
136 *	* Since umem_null_cache has a 1-element cache_cpu, its cache_cpu_mask
137 *	is always zero.  umem_cache_alloc uses cp->cache_cpu_mask to
138 *	mask the cpu number.  This prevents a race between grabbing a
139 *	cache pointer out of umem_alloc_table and growing the cpu array.
140 *
141 *
142 * 3. CPU handling
143 * ---------------
144 * kmem uses the CPU's sequence number to determine which "cpu cache" to
145 * use for an allocation.  Currently, there is no way to get the sequence
146 * number in userspace.
147 *
148 * umem keeps track of cpu information in umem_cpus, an array of umem_max_ncpus
149 * umem_cpu_t structures.  CPUHINT() is a "hint" function, which we then mask
150 * with either umem_cpu_mask or cp->cache_cpu_mask to find the actual "cpu" id.
151 * The mechanics of this are all in the CPU(mask) macro.
152 *
153 * Currently, umem uses _lwp_self() as its hint.
154 *
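 * For example, _umem_cache_alloc() (below) picks its per-cpu cache with:
 *
 *	ccp = UMEM_CPU_CACHE(cp, CPU(cp->cache_cpu_mask));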
155 *
156 * 4. The update thread
157 * --------------------
158 * kmem uses a task queue, kmem_taskq, to do periodic maintenance on
159 * every kmem cache.  vmem has a periodic timeout for hash table resizing.
160 * The kmem_taskq also provides a separate context for kmem_cache_reap() to
161 * run in, avoiding issues with the context of kmem_reap() callers.
162 *
163 * Instead, umem has the concept of "updates", which are asynchronous requests
164 * for work attached to single caches.  All caches with pending work are
165 * on a doubly linked list rooted at the umem_null_cache.  All update state
166 * is protected by the umem_update_lock mutex, and the umem_update_cv is used
167 * for notification between threads.
168 *
169 * 4.1. Cache states with regard to updates
170 * -----------------------------------------
171 * A given cache is in one of three states:
172 *
173 * Inactive		cache_uflags is zero, cache_u{next,prev} are NULL
174 *
175 * Work Requested	cache_uflags is non-zero (but UMU_ACTIVE is not set),
176 *			cache_u{next,prev} link the cache onto the global
177 *			update list
178 *
179 * Active		cache_uflags has UMU_ACTIVE set, cache_u{next,prev}
180 *			are NULL, and either umem_update_thr or
181 *			umem_st_update_thr are actively doing work on the
182 *			cache.
183 *
184 * An update can be added to any cache in any state -- if the cache is
185 * Inactive, it transitions to being Work Requested.  If the cache is
186 * Active, the worker will notice the new update and act on it before
187 * transitioning the cache to the Inactive state.
188 *
189 * If a cache is in the Active state, UMU_NOTIFY can be set, which asks
190 * the worker to broadcast the umem_update_cv when it has finished.
191 *
192 * 4.2. Update interface
193 * ---------------------
194 * umem_add_update() adds an update to a particular cache.
195 * umem_updateall() adds an update to all caches.
196 * umem_remove_updates() returns a cache to the Inactive state.
197 *
198 * umem_process_updates() processes all caches in the Work Requested state.
199 *
200 * 4.3. Reaping
201 * ------------
202 * When umem_reap() is called (at the time of heap growth), it schedules
203 * UMU_REAP updates on every cache.  It then checks to see if the update
204 * thread exists (umem_update_thr != 0).  If it does, it broadcasts
205 * the umem_update_cv to wake the update thread up, and returns.
206 *
207 * If the update thread does not exist (umem_update_thr == 0), and the
208 * program currently has multiple threads, umem_reap() attempts to create
209 * a new update thread.
210 *
211 * If the process is not multithreaded, or the creation fails, umem_reap()
212 * calls umem_st_update() to do an inline update.
213 *
214 * 4.4. The update thread
215 * ----------------------
216 * The update thread spends most of its time in cond_timedwait() on the
217 * umem_update_cv.  It wakes up under two conditions:
218 *
219 *	* The timedwait times out, in which case it needs to run a global
220 *	update, or
221 *
222 *	* someone cond_broadcast(3THR)s the umem_update_cv, in which case
223 *	it needs to check if there are any caches in the Work Requested
224 *	state.
225 *
226 * When it is time for another global update, umem calls umem_cache_update()
227 * on every cache, then calls vmem_update(), which tunes the vmem structures.
228 * umem_cache_update() can request further work using umem_add_update().
229 *
230 * After any work from the global update completes, the update timer is
231 * reset to umem_reap_interval seconds in the future.  This makes the
232 * updates self-throttling.
233 *
234 * Reaps are similarly self-throttling.  After a UMU_REAP update has
235 * been scheduled on all caches, umem_reap() sets a flag and wakes up the
236 * update thread.  The update thread notices the flag, and resets the
237 * reap state.
238 *
239 * 4.5. Inline updates
240 * -------------------
241 * If the update thread is not running, umem_st_update() is used instead.  It
242 * immediately does a global update (as above), then calls
243 * umem_process_updates() to process both the reaps that umem_reap() added and
244 * any work generated by the global update.  Afterwards, it resets the reap
245 * state.
246 *
247 * While umem_st_update() is running, umem_st_update_thr holds the thread
248 * id of the thread performing the update.
249 *
250 * 4.6. Updates and fork1()
251 * ------------------------
252 * umem has fork1() pre- and post-handlers which lock up (and release) every
253 * mutex in every cache.  They also lock up the umem_update_lock.  Since
254 * fork1() only copies over a single lwp, other threads (including the update
255 * thread) could have been actively using a cache in the parent.  This
256 * can lead to inconsistencies in the child process.
257 *
258 * Because we locked all of the mutexes, the only possible inconsistencies are:
259 *
260 *	* a umem_cache_alloc() could leak its buffer.
261 *
262 *	* a caller of umem_depot_alloc() could leak a magazine, and all the
263 *	buffers contained in it.
264 *
265 *	* a cache could be in the Active update state.  In the child, there
266 *	would be no thread actually working on it.
267 *
268 *	* a umem_hash_rescale() could leak the new hash table.
269 *
270 *	* a umem_magazine_resize() could be in progress.
271 *
272 *	* a umem_reap() could be in progress.
273 *
274 * The memory leaks we can't do anything about.  umem_release_child() resets
275 * the update state and moves any caches in the Active state to the Work
276 * Requested state.  This might cause some updates to be re-run, but UMU_REAP
277 * and UMU_HASH_RESCALE are effectively idempotent, and the worst that can
278 * happen from umem_magazine_resize() is resizing the magazine twice in close
279 * succession.
280 *
281 * Much of the cleanup in umem_release_child() is skipped if
282 * umem_st_update_thr == thr_self().  This is so that applications which call
283 * fork1() from a cache callback do not break.  Needless to say, any such
284 * application is tremendously broken.
285 *
286 *
287 * 5. KM_SLEEP vs. UMEM_NOFAIL
288 * ----------------------------
289 * Allocations against kmem and vmem have two basic modes:  SLEEP and
290 * NOSLEEP.  A sleeping allocation will go to sleep (waiting for
291 * more memory) instead of failing (returning NULL).
292 *
293 * SLEEP allocations presume an extremely multithreaded model, with
294 * a lot of allocation and deallocation activity.  umem cannot presume
295 * that its clients have any particular type of behavior.  Instead,
296 * it provides two types of allocations:
297 *
298 *	* UMEM_DEFAULT, equivalent to KM_NOSLEEP (i.e. return NULL on
299 *	failure)
300 *
301 *	* UMEM_NOFAIL, which, on failure, calls an optional callback
302 *	(registered with umem_nofail_callback()).
303 *
304 * The callback is invoked with no locks held, and can do an arbitrary
305 * amount of work.  It then has a choice between:
306 *
307 *	* Returning UMEM_CALLBACK_RETRY, which will cause the allocation
308 *	to be restarted.
309 *
310 *	* Returning UMEM_CALLBACK_EXIT(status), which will cause exit(2)
311 *	to be invoked with status.  If multiple threads attempt to do
312 *	this simultaneously, only one will call exit(2).
313 *
314 *	* Doing some kind of non-local exit (thr_exit(3thr), longjmp(3C),
315 *	etc.)
316 *
317 * The default callback returns UMEM_CALLBACK_EXIT(255).
318 *
319 * To have these callbacks without risk of state corruption (in the case of
320 * a non-local exit), we have to ensure that the callbacks get invoked
321 * close to the original allocation, with no inconsistent state or held
322 * locks.  The following steps are taken:
323 *
324 *	* All invocations of vmem are VM_NOSLEEP.
325 *
326 *	* All constructor callbacks (which can themselves do allocations)
327 *	are passed UMEM_DEFAULT as their required allocation argument.  This
328 *	way, the constructor will fail, allowing the highest-level allocation
329 *	to invoke the nofail callback.
330 *
331 *	If a constructor callback _does_ do a UMEM_NOFAIL allocation, and
332 *	the nofail callback does a non-local exit, we will leak the
333 *	partially-constructed buffer.
334 *
335 *
336 * 6. Lock Ordering
337 * ----------------
338 * umem has a few more locks than kmem does, mostly in the update path.  The
339 * overall lock ordering (earlier locks must be acquired first) is:
340 *
341 *	umem_init_lock
342 *
343 *	vmem_list_lock
344 *	vmem_nosleep_lock.vmpl_mutex
345 *	vmem_t's:
346 *		vm_lock
347 *	sbrk_faillock
348 *
349 *	umem_cache_lock
350 *	umem_update_lock
351 *	umem_flags_lock
352 *	umem_cache_t's:
353 *		cache_cpu[*].cc_lock
354 *		cache_depot_lock
355 *		cache_lock
356 *	umem_log_header_t's:
357 *		lh_cpu[*].clh_lock
358 *		lh_lock
359 */
360
361#include "c_synonyms.h"
362#include <umem_impl.h>
363#include <sys/vmem_impl_user.h>
364#include "umem_base.h"
365#include "vmem_base.h"
366
367#include <sys/processor.h>
368#include <sys/sysmacros.h>
369
370#include <alloca.h>
371#include <errno.h>
372#include <limits.h>
373#include <stdio.h>
374#include <stdlib.h>
375#include <string.h>
376#include <strings.h>
377#include <signal.h>
378#include <unistd.h>
379#include <atomic.h>
380
381#include "misc.h"
382
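/*
 * As described in section 5 of the big theory statement above, every vmem
 * allocation umem makes is VM_NOSLEEP, regardless of the caller's umflag.
 */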
383#define	UMEM_VMFLAGS(umflag)	(VM_NOSLEEP)
384
385size_t pagesize;
386
387/*
388 * The default set of caches to back umem_alloc().
389 * These sizes should be reevaluated periodically.
390 *
391 * We want allocations that are multiples of the coherency granularity
392 * (64 bytes) to be satisfied from a cache which is a multiple of 64
393 * bytes, so that it will be 64-byte aligned.  For all multiples of 64,
394 * the next cache size greater than or equal to it must be a
395 * multiple of 64.
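 *
 * For example, P2ALIGN(8192 / 7, 64) = P2ALIGN(1170, 64) = 1152 (18 * 64),
 * so a request for, say, 1000 bytes lands in a 64-byte-aligned 1152-byte
 * cache.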
396 */
397static const int umem_alloc_sizes[] = {
398#ifdef _LP64
399	1 * 8,
400	1 * 16,
401	2 * 16,
402	3 * 16,
403#else
404	1 * 8,
405	2 * 8,
406	3 * 8,
407	4 * 8,		5 * 8,		6 * 8,		7 * 8,
408#endif
409	4 * 16,		5 * 16,		6 * 16,		7 * 16,
410	4 * 32,		5 * 32,		6 * 32,		7 * 32,
411	4 * 64,		5 * 64,		6 * 64,		7 * 64,
412	4 * 128,	5 * 128,	6 * 128,	7 * 128,
413	P2ALIGN(8192 / 7, 64),
414	P2ALIGN(8192 / 6, 64),
415	P2ALIGN(8192 / 5, 64),
416	P2ALIGN(8192 / 4, 64),
417	P2ALIGN(8192 / 3, 64),
418	P2ALIGN(8192 / 2, 64),
419	P2ALIGN(8192 / 1, 64),
420	4096 * 3,
421	8192 * 2,
422};
423#define	NUM_ALLOC_SIZES (sizeof (umem_alloc_sizes) / sizeof (*umem_alloc_sizes))
424
425#define	UMEM_MAXBUF	16384
426
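/*
 * Magazine types, in increasing order of magazine size.  Each row is a
 * umem_magtype_t; the columns are assumed (mirroring the kernel's
 * kmem_magtype) to be the magazine size in rounds, the magazine alignment,
 * and the mt_minbuf/mt_maxbuf buffer-size bounds used when picking a
 * magazine type for a cache (roughly, smaller buffers get larger magazines).
 */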
427static umem_magtype_t umem_magtype[] = {
428	{ 1,	8,	3200,	65536	},
429	{ 3,	16,	256,	32768	},
430	{ 7,	32,	64,	16384	},
431	{ 15,	64,	0,	8192	},
432	{ 31,	64,	0,	4096	},
433	{ 47,	64,	0,	2048	},
434	{ 63,	64,	0,	1024	},
435	{ 95,	64,	0,	512	},
436	{ 143,	64,	0,	0	},
437};
438
439/*
440 * umem tunables
441 */
442uint32_t umem_max_ncpus;	/* # of CPU caches. */
443
444uint32_t umem_stack_depth = 15; /* # stack frames in a bufctl_audit */
445uint32_t umem_reap_interval = 10; /* max reaping rate (seconds) */
446uint_t umem_depot_contention = 2; /* max failed trylocks per real interval */
447uint_t umem_abort = 1;		/* whether to abort on error */
448uint_t umem_output = 0;		/* whether to write to standard error */
449uint_t umem_logging = 0;	/* umem_log_enter() override */
450uint32_t umem_mtbf = 0;		/* mean time between failures [default: off] */
451size_t umem_transaction_log_size; /* size of transaction log */
452size_t umem_content_log_size;	/* size of content log */
453size_t umem_failure_log_size;	/* failure log [4 pages per CPU] */
454size_t umem_slab_log_size;	/* slab create log [4 pages per CPU] */
455size_t umem_content_maxsave = 256; /* UMF_CONTENTS max bytes to log */
456size_t umem_lite_minsize = 0;	/* minimum buffer size for UMF_LITE */
457size_t umem_lite_maxalign = 1024; /* maximum buffer alignment for UMF_LITE */
458size_t umem_maxverify;		/* maximum bytes to inspect in debug routines */
459size_t umem_minfirewall;	/* hardware-enforced redzone threshold */
460
461uint_t umem_flags = 0;
462
463mutex_t			umem_init_lock;		/* locks initialization */
464cond_t			umem_init_cv;		/* initialization CV */
465thread_t		umem_init_thr;		/* thread initializing */
466int			umem_init_env_ready;	/* environ pre-initted */
467int			umem_ready = UMEM_READY_STARTUP;
468
469static umem_nofail_callback_t *nofail_callback;
470static mutex_t		umem_nofail_exit_lock;
471static thread_t		umem_nofail_exit_thr;
472
473static umem_cache_t	*umem_slab_cache;
474static umem_cache_t	*umem_bufctl_cache;
475static umem_cache_t	*umem_bufctl_audit_cache;
476
477mutex_t			umem_flags_lock;
478
479static vmem_t		*heap_arena;
480static vmem_alloc_t	*heap_alloc;
481static vmem_free_t	*heap_free;
482
483static vmem_t		*umem_internal_arena;
484static vmem_t		*umem_cache_arena;
485static vmem_t		*umem_hash_arena;
486static vmem_t		*umem_log_arena;
487static vmem_t		*umem_oversize_arena;
488static vmem_t		*umem_va_arena;
489static vmem_t		*umem_default_arena;
490static vmem_t		*umem_firewall_va_arena;
491static vmem_t		*umem_firewall_arena;
492
493vmem_t			*umem_memalign_arena;
494
495umem_log_header_t *umem_transaction_log;
496umem_log_header_t *umem_content_log;
497umem_log_header_t *umem_failure_log;
498umem_log_header_t *umem_slab_log;
499
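/*
 * CPU hinting (see section 3 of the big theory statement above): CPUHINT()
 * returns the current thread id, and CPU(mask) masks it with either
 * umem_cpu_mask or a cache's cache_cpu_mask to index into umem_cpus.
 */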
500extern thread_t _thr_self(void);
501#define	CPUHINT()		(_thr_self())
502#define	CPUHINT_MAX()		INT_MAX
503
504#define	CPU(mask)		(umem_cpus + (CPUHINT() & (mask)))
505static umem_cpu_t umem_startup_cpu = {	/* initial, single, cpu */
506	UMEM_CACHE_SIZE(0),
507	0
508};
509
510static uint32_t umem_cpu_mask = 0;			/* global cpu mask */
511static umem_cpu_t *umem_cpus = &umem_startup_cpu;	/* cpu list */
512
513volatile uint32_t umem_reaping;
514
515thread_t		umem_update_thr;
516struct timeval		umem_update_next;	/* timeofday of next update */
517volatile thread_t	umem_st_update_thr;	/* only used when single-thd */
518
519#define	IN_UPDATE()	(thr_self() == umem_update_thr || \
520			    thr_self() == umem_st_update_thr)
521#define	IN_REAP()	IN_UPDATE()
522
523mutex_t			umem_update_lock;	/* cache_u{next,prev,flags} */
524cond_t			umem_update_cv;
525
526volatile hrtime_t umem_reap_next;	/* min hrtime of next reap */
527
528mutex_t			umem_cache_lock;	/* inter-cache linkage only */
529
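/*
 * umem_null_cache is the bootstrap cache described in section 2.2.1 above:
 * every umem_alloc_table entry initially points at it, and allocations
 * against it always fail, forcing callers through umem_alloc_retry() and
 * hence umem_init().  It also serves as the list head for both the global
 * cache list and the update list.
 */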
530#ifdef UMEM_STANDALONE
531umem_cache_t		umem_null_cache;
532static const umem_cache_t umem_null_cache_template = {
533#else
534umem_cache_t		umem_null_cache = {
535#endif
536	0, 0, 0, 0, 0,
537	0, 0,
538	0, 0,
539	0, 0,
540	"invalid_cache",
541	0, 0,
542	NULL, NULL, NULL, NULL,
543	NULL,
544	0, 0, 0, 0,
545	&umem_null_cache, &umem_null_cache,
546	&umem_null_cache, &umem_null_cache,
547	0,
548	DEFAULTMUTEX,				/* start of slab layer */
549	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550	&umem_null_cache.cache_nullslab,
551	{
552		&umem_null_cache,
553		NULL,
554		&umem_null_cache.cache_nullslab,
555		&umem_null_cache.cache_nullslab,
556		NULL,
557		-1,
558		0
559	},
560	NULL,
561	NULL,
562	DEFAULTMUTEX,				/* start of depot layer */
563	NULL, {
564		NULL, 0, 0, 0, 0
565	}, {
566		NULL, 0, 0, 0, 0
567	}, {
568		{
569			DEFAULTMUTEX,		/* start of CPU cache */
570			0, 0, NULL, NULL, -1, -1, 0
571		}
572	}
573};
574
575#define	ALLOC_TABLE_4 \
576	&umem_null_cache, &umem_null_cache, &umem_null_cache, &umem_null_cache
577
578#define	ALLOC_TABLE_64 \
579	ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \
580	ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \
581	ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, \
582	ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4, ALLOC_TABLE_4
583
584#define	ALLOC_TABLE_1024 \
585	ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \
586	ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \
587	ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, \
588	ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64, ALLOC_TABLE_64
589
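/*
 * umem_alloc_table maps (size - 1) >> UMEM_ALIGN_SHIFT to the cache backing
 * that size.  Every entry starts out pointing at umem_null_cache, so the
 * first allocation fails over to umem_alloc_retry(), which runs umem_init()
 * and then retries (see section 2.2.1 above).
 */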
590static umem_cache_t *umem_alloc_table[UMEM_MAXBUF >> UMEM_ALIGN_SHIFT] = {
591	ALLOC_TABLE_1024,
592	ALLOC_TABLE_1024
593};
594
595
596/* Used to constrain audit-log stack traces */
597caddr_t			umem_min_stack;
598caddr_t			umem_max_stack;
599
600
601/*
602 * we use the _ versions, since we don't want to be cancelled.
603 * Actually, this is automatically taken care of by including "mtlib.h".
604 */
605extern int _cond_wait(cond_t *cv, mutex_t *mutex);
606
607#define	UMERR_MODIFIED	0	/* buffer modified while on freelist */
608#define	UMERR_REDZONE	1	/* redzone violation (write past end of buf) */
609#define	UMERR_DUPFREE	2	/* freed a buffer twice */
610#define	UMERR_BADADDR	3	/* freed a bad (unallocated) address */
611#define	UMERR_BADBUFTAG	4	/* buftag corrupted */
612#define	UMERR_BADBUFCTL	5	/* bufctl corrupted */
613#define	UMERR_BADCACHE	6	/* freed a buffer to the wrong cache */
614#define	UMERR_BADSIZE	7	/* alloc size != free size */
615#define	UMERR_BADBASE	8	/* buffer base address wrong */
616
617struct {
618	hrtime_t	ump_timestamp;	/* timestamp of error */
619	int		ump_error;	/* type of umem error (UMERR_*) */
620	void		*ump_buffer;	/* buffer that induced abort */
621	void		*ump_realbuf;	/* real start address for buffer */
622	umem_cache_t	*ump_cache;	/* buffer's cache according to client */
623	umem_cache_t	*ump_realcache;	/* actual cache containing buffer */
624	umem_slab_t	*ump_slab;	/* slab accoring to umem_findslab() */
625	umem_bufctl_t	*ump_bufctl;	/* bufctl */
626} umem_abort_info;
627
628static void
629copy_pattern(uint64_t pattern, void *buf_arg, size_t size)
630{
631	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
632	uint64_t *buf = buf_arg;
633
634	while (buf < bufend)
635		*buf++ = pattern;
636}
637
638static void *
639verify_pattern(uint64_t pattern, void *buf_arg, size_t size)
640{
641	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
642	uint64_t *buf;
643
644	for (buf = buf_arg; buf < bufend; buf++)
645		if (*buf != pattern)
646			return (buf);
647	return (NULL);
648}
649
650static void *
651verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size)
652{
653	uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
654	uint64_t *buf;
655
656	for (buf = buf_arg; buf < bufend; buf++) {
657		if (*buf != old) {
658			copy_pattern(old, buf_arg,
659			    (char *)buf - (char *)buf_arg);
660			return (buf);
661		}
662		*buf = new;
663	}
664
665	return (NULL);
666}
667
668void
669umem_cache_applyall(void (*func)(umem_cache_t *))
670{
671	umem_cache_t *cp;
672
673	(void) mutex_lock(&umem_cache_lock);
674	for (cp = umem_null_cache.cache_next; cp != &umem_null_cache;
675	    cp = cp->cache_next)
676		func(cp);
677	(void) mutex_unlock(&umem_cache_lock);
678}
679
680static void
681umem_add_update_unlocked(umem_cache_t *cp, int flags)
682{
683	umem_cache_t *cnext, *cprev;
684
685	flags &= ~UMU_ACTIVE;
686
687	if (!flags)
688		return;
689
690	if (cp->cache_uflags & UMU_ACTIVE) {
691		cp->cache_uflags |= flags;
692	} else {
693		if (cp->cache_unext != NULL) {
694			ASSERT(cp->cache_uflags != 0);
695			cp->cache_uflags |= flags;
696		} else {
697			ASSERT(cp->cache_uflags == 0);
698			cp->cache_uflags = flags;
699			cp->cache_unext = cnext = &umem_null_cache;
700			cp->cache_uprev = cprev = umem_null_cache.cache_uprev;
701			cnext->cache_uprev = cp;
702			cprev->cache_unext = cp;
703		}
704	}
705}
706
707static void
708umem_add_update(umem_cache_t *cp, int flags)
709{
710	(void) mutex_lock(&umem_update_lock);
711
712	umem_add_update_unlocked(cp, flags);
713
714	if (!IN_UPDATE())
715		(void) cond_broadcast(&umem_update_cv);
716
717	(void) mutex_unlock(&umem_update_lock);
718}
719
720/*
721 * Remove a cache from the update list, waiting for any in-progress work to
722 * complete first.
723 */
724static void
725umem_remove_updates(umem_cache_t *cp)
726{
727	(void) mutex_lock(&umem_update_lock);
728
729	/*
730	 * Get it out of the active state
731	 */
732	while (cp->cache_uflags & UMU_ACTIVE) {
733		ASSERT(cp->cache_unext == NULL);
734
735		cp->cache_uflags |= UMU_NOTIFY;
736
737		/*
738		 * Make sure the update state is sane, before we wait
739		 */
740		ASSERT(umem_update_thr != 0 || umem_st_update_thr != 0);
741		ASSERT(umem_update_thr != thr_self() &&
742		    umem_st_update_thr != thr_self());
743
744		(void) _cond_wait(&umem_update_cv, &umem_update_lock);
745	}
746	/*
747	 * Get it out of the Work Requested state
748	 */
749	if (cp->cache_unext != NULL) {
750		cp->cache_uprev->cache_unext = cp->cache_unext;
751		cp->cache_unext->cache_uprev = cp->cache_uprev;
752		cp->cache_uprev = cp->cache_unext = NULL;
753		cp->cache_uflags = 0;
754	}
755	/*
756	 * Make sure it is in the Inactive state
757	 */
758	ASSERT(cp->cache_unext == NULL && cp->cache_uflags == 0);
759	(void) mutex_unlock(&umem_update_lock);
760}
761
762static void
763umem_updateall(int flags)
764{
765	umem_cache_t *cp;
766
767	/*
768	 * NOTE:  To prevent deadlock, umem_cache_lock is always acquired first.
769	 *
770	 * (umem_add_update is called from things run via umem_cache_applyall)
771	 */
772	(void) mutex_lock(&umem_cache_lock);
773	(void) mutex_lock(&umem_update_lock);
774
775	for (cp = umem_null_cache.cache_next; cp != &umem_null_cache;
776	    cp = cp->cache_next)
777		umem_add_update_unlocked(cp, flags);
778
779	if (!IN_UPDATE())
780		(void) cond_broadcast(&umem_update_cv);
781
782	(void) mutex_unlock(&umem_update_lock);
783	(void) mutex_unlock(&umem_cache_lock);
784}
785
786/*
787 * Debugging support.  Given a buffer address, find its slab.
788 */
789static umem_slab_t *
790umem_findslab(umem_cache_t *cp, void *buf)
791{
792	umem_slab_t *sp;
793
794	(void) mutex_lock(&cp->cache_lock);
795	for (sp = cp->cache_nullslab.slab_next;
796	    sp != &cp->cache_nullslab; sp = sp->slab_next) {
797		if (UMEM_SLAB_MEMBER(sp, buf)) {
798			(void) mutex_unlock(&cp->cache_lock);
799			return (sp);
800		}
801	}
802	(void) mutex_unlock(&cp->cache_lock);
803
804	return (NULL);
805}
806
807static void
808umem_error(int error, umem_cache_t *cparg, void *bufarg)
809{
810	umem_buftag_t *btp = NULL;
811	umem_bufctl_t *bcp = NULL;
812	umem_cache_t *cp = cparg;
813	umem_slab_t *sp;
814	uint64_t *off;
815	void *buf = bufarg;
816
817	int old_logging = umem_logging;
818
819	umem_logging = 0;	/* stop logging when a bad thing happens */
820
821	umem_abort_info.ump_timestamp = gethrtime();
822
823	sp = umem_findslab(cp, buf);
824	if (sp == NULL) {
825		for (cp = umem_null_cache.cache_prev; cp != &umem_null_cache;
826		    cp = cp->cache_prev) {
827			if ((sp = umem_findslab(cp, buf)) != NULL)
828				break;
829		}
830	}
831
832	if (sp == NULL) {
833		cp = NULL;
834		error = UMERR_BADADDR;
835	} else {
836		if (cp != cparg)
837			error = UMERR_BADCACHE;
838		else
839			buf = (char *)bufarg - ((uintptr_t)bufarg -
840			    (uintptr_t)sp->slab_base) % cp->cache_chunksize;
841		if (buf != bufarg)
842			error = UMERR_BADBASE;
843		if (cp->cache_flags & UMF_BUFTAG)
844			btp = UMEM_BUFTAG(cp, buf);
845		if (cp->cache_flags & UMF_HASH) {
846			(void) mutex_lock(&cp->cache_lock);
847			for (bcp = *UMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next)
848				if (bcp->bc_addr == buf)
849					break;
850			(void) mutex_unlock(&cp->cache_lock);
851			if (bcp == NULL && btp != NULL)
852				bcp = btp->bt_bufctl;
853			if (umem_findslab(cp->cache_bufctl_cache, bcp) ==
854			    NULL || P2PHASE((uintptr_t)bcp, UMEM_ALIGN) ||
855			    bcp->bc_addr != buf) {
856				error = UMERR_BADBUFCTL;
857				bcp = NULL;
858			}
859		}
860	}
861
862	umem_abort_info.ump_error = error;
863	umem_abort_info.ump_buffer = bufarg;
864	umem_abort_info.ump_realbuf = buf;
865	umem_abort_info.ump_cache = cparg;
866	umem_abort_info.ump_realcache = cp;
867	umem_abort_info.ump_slab = sp;
868	umem_abort_info.ump_bufctl = bcp;
869
870	umem_printf("umem allocator: ");
871
872	switch (error) {
873
874	case UMERR_MODIFIED:
875		umem_printf("buffer modified after being freed\n");
876		off = verify_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify);
877		if (off == NULL)	/* shouldn't happen */
878			off = buf;
879		umem_printf("modification occurred at offset 0x%lx "
880		    "(0x%llx replaced by 0x%llx)\n",
881		    (uintptr_t)off - (uintptr_t)buf,
882		    (longlong_t)UMEM_FREE_PATTERN, (longlong_t)*off);
883		break;
884
885	case UMERR_REDZONE:
886		umem_printf("redzone violation: write past end of buffer\n");
887		break;
888
889	case UMERR_BADADDR:
890		umem_printf("invalid free: buffer not in cache\n");
891		break;
892
893	case UMERR_DUPFREE:
894		umem_printf("duplicate free: buffer freed twice\n");
895		break;
896
897	case UMERR_BADBUFTAG:
898		umem_printf("boundary tag corrupted\n");
899		umem_printf("bcp ^ bxstat = %lx, should be %lx\n",
900		    (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat,
901		    UMEM_BUFTAG_FREE);
902		break;
903
904	case UMERR_BADBUFCTL:
905		umem_printf("bufctl corrupted\n");
906		break;
907
908	case UMERR_BADCACHE:
909		umem_printf("buffer freed to wrong cache\n");
910		umem_printf("buffer was allocated from %s,\n", cp->cache_name);
911		umem_printf("caller attempting free to %s.\n",
912		    cparg->cache_name);
913		break;
914
915	case UMERR_BADSIZE:
916		umem_printf("bad free: free size (%u) != alloc size (%u)\n",
917		    UMEM_SIZE_DECODE(((uint32_t *)btp)[0]),
918		    UMEM_SIZE_DECODE(((uint32_t *)btp)[1]));
919		break;
920
921	case UMERR_BADBASE:
922		umem_printf("bad free: free address (%p) != alloc address "
923		    "(%p)\n", bufarg, buf);
924		break;
925	}
926
927	umem_printf("buffer=%p  bufctl=%p  cache: %s\n",
928	    bufarg, (void *)bcp, cparg->cache_name);
929
930	if (bcp != NULL && (cp->cache_flags & UMF_AUDIT) &&
931	    error != UMERR_BADBUFCTL) {
932		int d;
933		timespec_t ts;
934		hrtime_t diff;
935		umem_bufctl_audit_t *bcap = (umem_bufctl_audit_t *)bcp;
936
937		diff = umem_abort_info.ump_timestamp - bcap->bc_timestamp;
938		ts.tv_sec = diff / NANOSEC;
939		ts.tv_nsec = diff % NANOSEC;
940
941		umem_printf("previous transaction on buffer %p:\n", buf);
942		umem_printf("thread=%p  time=T-%ld.%09ld  slab=%p  cache: %s\n",
943		    (void *)(intptr_t)bcap->bc_thread, ts.tv_sec, ts.tv_nsec,
944		    (void *)sp, cp->cache_name);
945		for (d = 0; d < MIN(bcap->bc_depth, umem_stack_depth); d++) {
946			(void) print_sym((void *)bcap->bc_stack[d]);
947			umem_printf("\n");
948		}
949	}
950
951	umem_err_recoverable("umem: heap corruption detected");
952
953	umem_logging = old_logging;	/* resume logging */
954}
955
956void
957umem_nofail_callback(umem_nofail_callback_t *cb)
958{
959	nofail_callback = cb;
960}
961
962static int
963umem_alloc_retry(umem_cache_t *cp, int umflag)
964{
965	if (cp == &umem_null_cache) {
966		if (umem_init())
967			return (1);				/* retry */
968		/*
969		 * Initialization failed.  Do normal failure processing.
970		 */
971	}
972	if (umflag & UMEM_NOFAIL) {
973		int def_result = UMEM_CALLBACK_EXIT(255);
974		int result = def_result;
975		umem_nofail_callback_t *callback = nofail_callback;
976
977		if (callback != NULL)
978			result = callback();
979
980		if (result == UMEM_CALLBACK_RETRY)
981			return (1);
982
983		if ((result & ~0xFF) != UMEM_CALLBACK_EXIT(0)) {
984			log_message("nofail callback returned %x\n", result);
985			result = def_result;
986		}
987
988		/*
989		 * only one thread will call exit
990		 */
991		if (umem_nofail_exit_thr == thr_self())
992			umem_panic("recursive UMEM_CALLBACK_EXIT()\n");
993
994		(void) mutex_lock(&umem_nofail_exit_lock);
995		umem_nofail_exit_thr = thr_self();
996		exit(result & 0xFF);
997		/*NOTREACHED*/
998	}
999	return (0);
1000}
1001
1002static umem_log_header_t *
1003umem_log_init(size_t logsize)
1004{
1005	umem_log_header_t *lhp;
1006	int nchunks = 4 * umem_max_ncpus;
1007	size_t lhsize = offsetof(umem_log_header_t, lh_cpu[umem_max_ncpus]);
1008	int i;
1009
1010	if (logsize == 0)
1011		return (NULL);
1012
1013	/*
1014	 * Make sure that lhp->lh_cpu[] is nicely aligned
1015	 * to prevent false sharing of cache lines.
1016	 */
1017	lhsize = P2ROUNDUP(lhsize, UMEM_ALIGN);
1018	lhp = vmem_xalloc(umem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0,
1019	    NULL, NULL, VM_NOSLEEP);
1020	if (lhp == NULL)
1021		goto fail;
1022
1023	bzero(lhp, lhsize);
1024
1025	(void) mutex_init(&lhp->lh_lock, USYNC_THREAD, NULL);
1026	lhp->lh_nchunks = nchunks;
1027	lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks, PAGESIZE);
1028	if (lhp->lh_chunksize == 0)
1029		lhp->lh_chunksize = PAGESIZE;
1030
1031	lhp->lh_base = vmem_alloc(umem_log_arena,
1032	    lhp->lh_chunksize * nchunks, VM_NOSLEEP);
1033	if (lhp->lh_base == NULL)
1034		goto fail;
1035
1036	lhp->lh_free = vmem_alloc(umem_log_arena,
1037	    nchunks * sizeof (int), VM_NOSLEEP);
1038	if (lhp->lh_free == NULL)
1039		goto fail;
1040
1041	bzero(lhp->lh_base, lhp->lh_chunksize * nchunks);
1042
1043	for (i = 0; i < umem_max_ncpus; i++) {
1044		umem_cpu_log_header_t *clhp = &lhp->lh_cpu[i];
1045		(void) mutex_init(&clhp->clh_lock, USYNC_THREAD, NULL);
1046		clhp->clh_chunk = i;
1047	}
1048
1049	for (i = umem_max_ncpus; i < nchunks; i++)
1050		lhp->lh_free[i] = i;
1051
1052	lhp->lh_head = umem_max_ncpus;
1053	lhp->lh_tail = 0;
1054
1055	return (lhp);
1056
1057fail:
1058	if (lhp != NULL) {
1059		if (lhp->lh_base != NULL)
1060			vmem_free(umem_log_arena, lhp->lh_base,
1061			    lhp->lh_chunksize * nchunks);
1062
1063		vmem_xfree(umem_log_arena, lhp, lhsize);
1064	}
1065	return (NULL);
1066}
1067
1068static void *
1069umem_log_enter(umem_log_header_t *lhp, void *data, size_t size)
1070{
1071	void *logspace;
1072	umem_cpu_log_header_t *clhp;
1073
1074	if (lhp == NULL || umem_logging == 0)
1075		return (NULL);
1076	clhp = &lhp->lh_cpu[CPU(umem_cpu_mask)->cpu_number];
1077
1078	(void) mutex_lock(&clhp->clh_lock);
1079	clhp->clh_hits++;
1080	if (size > clhp->clh_avail) {
1081		(void) mutex_lock(&lhp->lh_lock);
1082		lhp->lh_hits++;
1083		lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk;
1084		lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks;
1085		clhp->clh_chunk = lhp->lh_free[lhp->lh_head];
1086		lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks;
1087		clhp->clh_current = lhp->lh_base +
1088		    clhp->clh_chunk * lhp->lh_chunksize;
1089		clhp->clh_avail = lhp->lh_chunksize;
1090		if (size > lhp->lh_chunksize)
1091			size = lhp->lh_chunksize;
1092		(void) mutex_unlock(&lhp->lh_lock);
1093	}
1094	logspace = clhp->clh_current;
1095	clhp->clh_current += size;
1096	clhp->clh_avail -= size;
1097	bcopy(data, logspace, size);
1098	(void) mutex_unlock(&clhp->clh_lock);
1099	return (logspace);
1100}
1101
1102#define	UMEM_AUDIT(lp, cp, bcp)						\
1103{									\
1104	umem_bufctl_audit_t *_bcp = (umem_bufctl_audit_t *)(bcp);	\
1105	_bcp->bc_timestamp = gethrtime();				\
1106	_bcp->bc_thread = thr_self();					\
1107	_bcp->bc_depth = getpcstack(_bcp->bc_stack, umem_stack_depth,	\
1108	    (cp != NULL) && (cp->cache_flags & UMF_CHECKSIGNAL));	\
1109	_bcp->bc_lastlog = umem_log_enter((lp), _bcp,			\
1110	    UMEM_BUFCTL_AUDIT_SIZE);					\
1111}
1112
1113static void
1114umem_log_event(umem_log_header_t *lp, umem_cache_t *cp,
1115	umem_slab_t *sp, void *addr)
1116{
1117	umem_bufctl_audit_t *bcp;
1118	UMEM_LOCAL_BUFCTL_AUDIT(&bcp);
1119
1120	bzero(bcp, UMEM_BUFCTL_AUDIT_SIZE);
1121	bcp->bc_addr = addr;
1122	bcp->bc_slab = sp;
1123	bcp->bc_cache = cp;
1124	UMEM_AUDIT(lp, cp, bcp);
1125}
1126
1127/*
1128 * Create a new slab for cache cp.
1129 */
1130static umem_slab_t *
1131umem_slab_create(umem_cache_t *cp, int umflag)
1132{
1133	size_t slabsize = cp->cache_slabsize;
1134	size_t chunksize = cp->cache_chunksize;
1135	int cache_flags = cp->cache_flags;
1136	size_t color, chunks;
1137	char *buf, *slab;
1138	umem_slab_t *sp;
1139	umem_bufctl_t *bcp;
1140	vmem_t *vmp = cp->cache_arena;
1141
1142	color = cp->cache_color + cp->cache_align;
1143	if (color > cp->cache_maxcolor)
1144		color = cp->cache_mincolor;
1145	cp->cache_color = color;
1146
1147	slab = vmem_alloc(vmp, slabsize, UMEM_VMFLAGS(umflag));
1148
1149	if (slab == NULL)
1150		goto vmem_alloc_failure;
1151
1152	ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0);
1153
1154	if (!(cp->cache_cflags & UMC_NOTOUCH) &&
1155	    (cp->cache_flags & UMF_DEADBEEF))
1156		copy_pattern(UMEM_UNINITIALIZED_PATTERN, slab, slabsize);
1157
1158	if (cache_flags & UMF_HASH) {
1159		if ((sp = _umem_cache_alloc(umem_slab_cache, umflag)) == NULL)
1160			goto slab_alloc_failure;
1161		chunks = (slabsize - color) / chunksize;
1162	} else {
1163		sp = UMEM_SLAB(cp, slab);
1164		chunks = (slabsize - sizeof (umem_slab_t) - color) / chunksize;
1165	}
1166
1167	sp->slab_cache	= cp;
1168	sp->slab_head	= NULL;
1169	sp->slab_refcnt	= 0;
1170	sp->slab_base	= buf = slab + color;
1171	sp->slab_chunks	= chunks;
1172
1173	ASSERT(chunks > 0);
1174	while (chunks-- != 0) {
1175		if (cache_flags & UMF_HASH) {
1176			bcp = _umem_cache_alloc(cp->cache_bufctl_cache, umflag);
1177			if (bcp == NULL)
1178				goto bufctl_alloc_failure;
1179			if (cache_flags & UMF_AUDIT) {
1180				umem_bufctl_audit_t *bcap =
1181				    (umem_bufctl_audit_t *)bcp;
1182				bzero(bcap, UMEM_BUFCTL_AUDIT_SIZE);
1183				bcap->bc_cache = cp;
1184			}
1185			bcp->bc_addr = buf;
1186			bcp->bc_slab = sp;
1187		} else {
1188			bcp = UMEM_BUFCTL(cp, buf);
1189		}
1190		if (cache_flags & UMF_BUFTAG) {
1191			umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1192			btp->bt_redzone = UMEM_REDZONE_PATTERN;
1193			btp->bt_bufctl = bcp;
1194			btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE;
1195			if (cache_flags & UMF_DEADBEEF) {
1196				copy_pattern(UMEM_FREE_PATTERN, buf,
1197				    cp->cache_verify);
1198			}
1199		}
1200		bcp->bc_next = sp->slab_head;
1201		sp->slab_head = bcp;
1202		buf += chunksize;
1203	}
1204
1205	umem_log_event(umem_slab_log, cp, sp, slab);
1206
1207	return (sp);
1208
1209bufctl_alloc_failure:
1210
1211	while ((bcp = sp->slab_head) != NULL) {
1212		sp->slab_head = bcp->bc_next;
1213		_umem_cache_free(cp->cache_bufctl_cache, bcp);
1214	}
1215	_umem_cache_free(umem_slab_cache, sp);
1216
1217slab_alloc_failure:
1218
1219	vmem_free(vmp, slab, slabsize);
1220
1221vmem_alloc_failure:
1222
1223	umem_log_event(umem_failure_log, cp, NULL, NULL);
1224	atomic_add_64(&cp->cache_alloc_fail, 1);
1225
1226	return (NULL);
1227}
1228
1229/*
1230 * Destroy a slab.
1231 */
1232static void
1233umem_slab_destroy(umem_cache_t *cp, umem_slab_t *sp)
1234{
1235	vmem_t *vmp = cp->cache_arena;
1236	void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum);
1237
1238	if (cp->cache_flags & UMF_HASH) {
1239		umem_bufctl_t *bcp;
1240		while ((bcp = sp->slab_head) != NULL) {
1241			sp->slab_head = bcp->bc_next;
1242			_umem_cache_free(cp->cache_bufctl_cache, bcp);
1243		}
1244		_umem_cache_free(umem_slab_cache, sp);
1245	}
1246	vmem_free(vmp, slab, cp->cache_slabsize);
1247}
1248
1249/*
1250 * Allocate a raw (unconstructed) buffer from cp's slab layer.
1251 */
1252static void *
1253umem_slab_alloc(umem_cache_t *cp, int umflag)
1254{
1255	umem_bufctl_t *bcp, **hash_bucket;
1256	umem_slab_t *sp;
1257	void *buf;
1258
1259	(void) mutex_lock(&cp->cache_lock);
1260	cp->cache_slab_alloc++;
1261	sp = cp->cache_freelist;
1262	ASSERT(sp->slab_cache == cp);
1263	if (sp->slab_head == NULL) {
1264		/*
1265		 * The freelist is empty.  Create a new slab.
1266		 */
1267		(void) mutex_unlock(&cp->cache_lock);
1268		if (cp == &umem_null_cache)
1269			return (NULL);
1270		if ((sp = umem_slab_create(cp, umflag)) == NULL)
1271			return (NULL);
1272		(void) mutex_lock(&cp->cache_lock);
1273		cp->cache_slab_create++;
1274		if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax)
1275			cp->cache_bufmax = cp->cache_buftotal;
1276		sp->slab_next = cp->cache_freelist;
1277		sp->slab_prev = cp->cache_freelist->slab_prev;
1278		sp->slab_next->slab_prev = sp;
1279		sp->slab_prev->slab_next = sp;
1280		cp->cache_freelist = sp;
1281	}
1282
1283	sp->slab_refcnt++;
1284	ASSERT(sp->slab_refcnt <= sp->slab_chunks);
1285
1286	/*
1287	 * If we're taking the last buffer in the slab,
1288	 * remove the slab from the cache's freelist.
1289	 */
1290	bcp = sp->slab_head;
1291	if ((sp->slab_head = bcp->bc_next) == NULL) {
1292		cp->cache_freelist = sp->slab_next;
1293		ASSERT(sp->slab_refcnt == sp->slab_chunks);
1294	}
1295
1296	if (cp->cache_flags & UMF_HASH) {
1297		/*
1298		 * Add buffer to allocated-address hash table.
1299		 */
1300		buf = bcp->bc_addr;
1301		hash_bucket = UMEM_HASH(cp, buf);
1302		bcp->bc_next = *hash_bucket;
1303		*hash_bucket = bcp;
1304		if ((cp->cache_flags & (UMF_AUDIT | UMF_BUFTAG)) == UMF_AUDIT) {
1305			UMEM_AUDIT(umem_transaction_log, cp, bcp);
1306		}
1307	} else {
1308		buf = UMEM_BUF(cp, bcp);
1309	}
1310
1311	ASSERT(UMEM_SLAB_MEMBER(sp, buf));
1312
1313	(void) mutex_unlock(&cp->cache_lock);
1314
1315	return (buf);
1316}
1317
1318/*
1319 * Free a raw (unconstructed) buffer to cp's slab layer.
1320 */
1321static void
1322umem_slab_free(umem_cache_t *cp, void *buf)
1323{
1324	umem_slab_t *sp;
1325	umem_bufctl_t *bcp, **prev_bcpp;
1326
1327	ASSERT(buf != NULL);
1328
1329	(void) mutex_lock(&cp->cache_lock);
1330	cp->cache_slab_free++;
1331
1332	if (cp->cache_flags & UMF_HASH) {
1333		/*
1334		 * Look up buffer in allocated-address hash table.
1335		 */
1336		prev_bcpp = UMEM_HASH(cp, buf);
1337		while ((bcp = *prev_bcpp) != NULL) {
1338			if (bcp->bc_addr == buf) {
1339				*prev_bcpp = bcp->bc_next;
1340				sp = bcp->bc_slab;
1341				break;
1342			}
1343			cp->cache_lookup_depth++;
1344			prev_bcpp = &bcp->bc_next;
1345		}
1346	} else {
1347		bcp = UMEM_BUFCTL(cp, buf);
1348		sp = UMEM_SLAB(cp, buf);
1349	}
1350
1351	if (bcp == NULL || sp->slab_cache != cp || !UMEM_SLAB_MEMBER(sp, buf)) {
1352		(void) mutex_unlock(&cp->cache_lock);
1353		umem_error(UMERR_BADADDR, cp, buf);
1354		return;
1355	}
1356
1357	if ((cp->cache_flags & (UMF_AUDIT | UMF_BUFTAG)) == UMF_AUDIT) {
1358		if (cp->cache_flags & UMF_CONTENTS)
1359			((umem_bufctl_audit_t *)bcp)->bc_contents =
1360			    umem_log_enter(umem_content_log, buf,
1361			    cp->cache_contents);
1362		UMEM_AUDIT(umem_transaction_log, cp, bcp);
1363	}
1364
1365	/*
1366	 * If this slab isn't currently on the freelist, put it there.
1367	 */
1368	if (sp->slab_head == NULL) {
1369		ASSERT(sp->slab_refcnt == sp->slab_chunks);
1370		ASSERT(cp->cache_freelist != sp);
1371		sp->slab_next->slab_prev = sp->slab_prev;
1372		sp->slab_prev->slab_next = sp->slab_next;
1373		sp->slab_next = cp->cache_freelist;
1374		sp->slab_prev = cp->cache_freelist->slab_prev;
1375		sp->slab_next->slab_prev = sp;
1376		sp->slab_prev->slab_next = sp;
1377		cp->cache_freelist = sp;
1378	}
1379
1380	bcp->bc_next = sp->slab_head;
1381	sp->slab_head = bcp;
1382
1383	ASSERT(sp->slab_refcnt >= 1);
1384	if (--sp->slab_refcnt == 0) {
1385		/*
1386		 * There are no outstanding allocations from this slab,
1387		 * so we can reclaim the memory.
1388		 */
1389		sp->slab_next->slab_prev = sp->slab_prev;
1390		sp->slab_prev->slab_next = sp->slab_next;
1391		if (sp == cp->cache_freelist)
1392			cp->cache_freelist = sp->slab_next;
1393		cp->cache_slab_destroy++;
1394		cp->cache_buftotal -= sp->slab_chunks;
1395		(void) mutex_unlock(&cp->cache_lock);
1396		umem_slab_destroy(cp, sp);
1397		return;
1398	}
1399	(void) mutex_unlock(&cp->cache_lock);
1400}
1401
1402static int
1403umem_cache_alloc_debug(umem_cache_t *cp, void *buf, int umflag)
1404{
1405	umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1406	umem_bufctl_audit_t *bcp = (umem_bufctl_audit_t *)btp->bt_bufctl;
1407	uint32_t mtbf;
1408	int flags_nfatal;
1409
1410	if (btp->bt_bxstat != ((intptr_t)bcp ^ UMEM_BUFTAG_FREE)) {
1411		umem_error(UMERR_BADBUFTAG, cp, buf);
1412		return (-1);
1413	}
1414
1415	btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_ALLOC;
1416
1417	if ((cp->cache_flags & UMF_HASH) && bcp->bc_addr != buf) {
1418		umem_error(UMERR_BADBUFCTL, cp, buf);
1419		return (-1);
1420	}
1421
1422	btp->bt_redzone = UMEM_REDZONE_PATTERN;
1423
1424	if (cp->cache_flags & UMF_DEADBEEF) {
1425		if (verify_and_copy_pattern(UMEM_FREE_PATTERN,
1426		    UMEM_UNINITIALIZED_PATTERN, buf, cp->cache_verify)) {
1427			umem_error(UMERR_MODIFIED, cp, buf);
1428			return (-1);
1429		}
1430	}
1431
1432	if ((mtbf = umem_mtbf | cp->cache_mtbf) != 0 &&
1433	    gethrtime() % mtbf == 0 &&
1434	    (umflag & (UMEM_FATAL_FLAGS)) == 0) {
1435		umem_log_event(umem_failure_log, cp, NULL, NULL);
1436	} else {
1437		mtbf = 0;
1438	}
1439
1440	/*
1441	 * We do not pass fatal flags on to the constructor.  This prevents
1442	 * leaking buffers in the event of a subordinate constructor failing.
1443	 */
1444	flags_nfatal = UMEM_DEFAULT;
1445	if (mtbf || (cp->cache_constructor != NULL &&
1446	    cp->cache_constructor(buf, cp->cache_private, flags_nfatal) != 0)) {
1447		atomic_add_64(&cp->cache_alloc_fail, 1);
1448		btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE;
1449		copy_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify);
1450		umem_slab_free(cp, buf);
1451		return (-1);
1452	}
1453
1454	if (cp->cache_flags & UMF_AUDIT) {
1455		UMEM_AUDIT(umem_transaction_log, cp, bcp);
1456	}
1457
1458	return (0);
1459}
1460
1461static int
1462umem_cache_free_debug(umem_cache_t *cp, void *buf)
1463{
1464	umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1465	umem_bufctl_audit_t *bcp = (umem_bufctl_audit_t *)btp->bt_bufctl;
1466	umem_slab_t *sp;
1467
1468	if (btp->bt_bxstat != ((intptr_t)bcp ^ UMEM_BUFTAG_ALLOC)) {
1469		if (btp->bt_bxstat == ((intptr_t)bcp ^ UMEM_BUFTAG_FREE)) {
1470			umem_error(UMERR_DUPFREE, cp, buf);
1471			return (-1);
1472		}
1473		sp = umem_findslab(cp, buf);
1474		if (sp == NULL || sp->slab_cache != cp)
1475			umem_error(UMERR_BADADDR, cp, buf);
1476		else
1477			umem_error(UMERR_REDZONE, cp, buf);
1478		return (-1);
1479	}
1480
1481	btp->bt_bxstat = (intptr_t)bcp ^ UMEM_BUFTAG_FREE;
1482
1483	if ((cp->cache_flags & UMF_HASH) && bcp->bc_addr != buf) {
1484		umem_error(UMERR_BADBUFCTL, cp, buf);
1485		return (-1);
1486	}
1487
1488	if (btp->bt_redzone != UMEM_REDZONE_PATTERN) {
1489		umem_error(UMERR_REDZONE, cp, buf);
1490		return (-1);
1491	}
1492
1493	if (cp->cache_flags & UMF_AUDIT) {
1494		if (cp->cache_flags & UMF_CONTENTS)
1495			bcp->bc_contents = umem_log_enter(umem_content_log,
1496			    buf, cp->cache_contents);
1497		UMEM_AUDIT(umem_transaction_log, cp, bcp);
1498	}
1499
1500	if (cp->cache_destructor != NULL)
1501		cp->cache_destructor(buf, cp->cache_private);
1502
1503	if (cp->cache_flags & UMF_DEADBEEF)
1504		copy_pattern(UMEM_FREE_PATTERN, buf, cp->cache_verify);
1505
1506	return (0);
1507}
1508
1509/*
1510 * Free each object in magazine mp to cp's slab layer, and free mp itself.
1511 */
1512static void
1513umem_magazine_destroy(umem_cache_t *cp, umem_magazine_t *mp, int nrounds)
1514{
1515	int round;
1516
1517	ASSERT(cp->cache_next == NULL || IN_UPDATE());
1518
1519	for (round = 0; round < nrounds; round++) {
1520		void *buf = mp->mag_round[round];
1521
1522		if ((cp->cache_flags & UMF_DEADBEEF) &&
1523		    verify_pattern(UMEM_FREE_PATTERN, buf,
1524		    cp->cache_verify) != NULL) {
1525			umem_error(UMERR_MODIFIED, cp, buf);
1526			continue;
1527		}
1528
1529		if (!(cp->cache_flags & UMF_BUFTAG) &&
1530		    cp->cache_destructor != NULL)
1531			cp->cache_destructor(buf, cp->cache_private);
1532
1533		umem_slab_free(cp, buf);
1534	}
1535	ASSERT(UMEM_MAGAZINE_VALID(cp, mp));
1536	_umem_cache_free(cp->cache_magtype->mt_cache, mp);
1537}
1538
1539/*
1540 * Allocate a magazine from the depot.
1541 */
1542static umem_magazine_t *
1543umem_depot_alloc(umem_cache_t *cp, umem_maglist_t *mlp)
1544{
1545	umem_magazine_t *mp;
1546
1547	/*
1548	 * If we can't get the depot lock without contention,
1549	 * update our contention count.  We use the depot
1550	 * contention rate to determine whether we need to
1551	 * increase the magazine size for better scalability.
1552	 */
1553	if (mutex_trylock(&cp->cache_depot_lock) != 0) {
1554		(void) mutex_lock(&cp->cache_depot_lock);
1555		cp->cache_depot_contention++;
1556	}
1557
1558	if ((mp = mlp->ml_list) != NULL) {
1559		ASSERT(UMEM_MAGAZINE_VALID(cp, mp));
1560		mlp->ml_list = mp->mag_next;
1561		if (--mlp->ml_total < mlp->ml_min)
1562			mlp->ml_min = mlp->ml_total;
1563		mlp->ml_alloc++;
1564	}
1565
1566	(void) mutex_unlock(&cp->cache_depot_lock);
1567
1568	return (mp);
1569}
1570
1571/*
1572 * Free a magazine to the depot.
1573 */
1574static void
1575umem_depot_free(umem_cache_t *cp, umem_maglist_t *mlp, umem_magazine_t *mp)
1576{
1577	(void) mutex_lock(&cp->cache_depot_lock);
1578	ASSERT(UMEM_MAGAZINE_VALID(cp, mp));
1579	mp->mag_next = mlp->ml_list;
1580	mlp->ml_list = mp;
1581	mlp->ml_total++;
1582	(void) mutex_unlock(&cp->cache_depot_lock);
1583}
1584
1585/*
1586 * Update the working set statistics for cp's depot.
1587 */
1588static void
1589umem_depot_ws_update(umem_cache_t *cp)
1590{
1591	(void) mutex_lock(&cp->cache_depot_lock);
1592	cp->cache_full.ml_reaplimit = cp->cache_full.ml_min;
1593	cp->cache_full.ml_min = cp->cache_full.ml_total;
1594	cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min;
1595	cp->cache_empty.ml_min = cp->cache_empty.ml_total;
1596	(void) mutex_unlock(&cp->cache_depot_lock);
1597}
1598
1599/*
1600 * Reap all magazines that have fallen out of the depot's working set.
1601 */
1602static void
1603umem_depot_ws_reap(umem_cache_t *cp)
1604{
1605	long reap;
1606	umem_magazine_t *mp;
1607
1608	ASSERT(cp->cache_next == NULL || IN_REAP());
1609
1610	reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
1611	while (reap-- && (mp = umem_depot_alloc(cp, &cp->cache_full)) != NULL)
1612		umem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
1613
1614	reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min);
1615	while (reap-- && (mp = umem_depot_alloc(cp, &cp->cache_empty)) != NULL)
1616		umem_magazine_destroy(cp, mp, 0);
1617}
1618
1619static void
1620umem_cpu_reload(umem_cpu_cache_t *ccp, umem_magazine_t *mp, int rounds)
1621{
1622	ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) ||
1623	    (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize));
1624	ASSERT(ccp->cc_magsize > 0);
1625
1626	ccp->cc_ploaded = ccp->cc_loaded;
1627	ccp->cc_prounds = ccp->cc_rounds;
1628	ccp->cc_loaded = mp;
1629	ccp->cc_rounds = rounds;
1630}
1631
1632/*
1633 * Allocate a constructed object from cache cp.
1634 */
1635#pragma weak umem_cache_alloc = _umem_cache_alloc
1636void *
1637_umem_cache_alloc(umem_cache_t *cp, int umflag)
1638{
1639	umem_cpu_cache_t *ccp;
1640	umem_magazine_t *fmp;
1641	void *buf;
1642	int flags_nfatal;
1643
1644retry:
1645	ccp = UMEM_CPU_CACHE(cp, CPU(cp->cache_cpu_mask));
1646	(void) mutex_lock(&ccp->cc_lock);
1647	for (;;) {
1648		/*
1649		 * If there's an object available in the current CPU's
1650		 * loaded magazine, just take it and return.
1651		 */
1652		if (ccp->cc_rounds > 0) {
1653			buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds];
1654			ccp->cc_alloc++;
1655			(void) mutex_unlock(&ccp->cc_lock);
1656			if ((ccp->cc_flags & UMF_BUFTAG) &&
1657			    umem_cache_alloc_debug(cp, buf, umflag) == -1) {
1658				if (umem_alloc_retry(cp, umflag)) {
1659					goto retry;
1660				}
1661
1662				return (NULL);
1663			}
1664			return (buf);
1665		}
1666
1667		/*
1668		 * The loaded magazine is empty.  If the previously loaded
1669		 * magazine was full, exchange them and try again.
1670		 */
1671		if (ccp->cc_prounds > 0) {
1672			umem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
1673			continue;
1674		}
1675
1676		/*
1677		 * If the magazine layer is disabled, break out now.
1678		 */
1679		if (ccp->cc_magsize == 0)
1680			break;
1681
1682		/*
1683		 * Try to get a full magazine from the depot.
1684		 */
1685		fmp = umem_depot_alloc(cp, &cp->cache_full);
1686		if (fmp != NULL) {
1687			if (ccp->cc_ploaded != NULL)
1688				umem_depot_free(cp, &cp->cache_empty,
1689				    ccp->cc_ploaded);
1690			umem_cpu_reload(ccp, fmp, ccp->cc_magsize);
1691			continue;
1692		}
1693
1694		/*
1695		 * There are no full magazines in the depot,
1696		 * so fall through to the slab layer.
1697		 */
1698		break;
1699	}
1700	(void) mutex_unlock(&ccp->cc_lock);
1701
1702	/*
1703	 * We couldn't allocate a constructed object from the magazine layer,
1704	 * so get a raw buffer from the slab layer and apply its constructor.
1705	 */
1706	buf = umem_slab_alloc(cp, umflag);
1707
1708	if (buf == NULL) {
1709		if (cp == &umem_null_cache)
1710			return (NULL);
1711		if (umem_alloc_retry(cp, umflag)) {
1712			goto retry;
1713		}
1714
1715		return (NULL);
1716	}
1717
1718	if (cp->cache_flags & UMF_BUFTAG) {
1719		/*
1720		 * Let umem_cache_alloc_debug() apply the constructor for us.
1721		 */
1722		if (umem_cache_alloc_debug(cp, buf, umflag) == -1) {
1723			if (umem_alloc_retry(cp, umflag)) {
1724				goto retry;
1725			}
1726			return (NULL);
1727		}
1728		return (buf);
1729	}
1730
1731	/*
1732	 * We do not pass fatal flags on to the constructor.  This prevents
1733	 * leaking buffers in the event of a subordinate constructor failing.
1734	 */
1735	flags_nfatal = UMEM_DEFAULT;
1736	if (cp->cache_constructor != NULL &&
1737	    cp->cache_constructor(buf, cp->cache_private, flags_nfatal) != 0) {
1738		atomic_add_64(&cp->cache_alloc_fail, 1);
1739		umem_slab_free(cp, buf);
1740
1741		if (umem_alloc_retry(cp, umflag)) {
1742			goto retry;
1743		}
1744		return (NULL);
1745	}
1746
1747	return (buf);
1748}
1749
1750/*
1751 * Free a constructed object to cache cp.
1752 */
1753#pragma weak umem_cache_free = _umem_cache_free
1754void
1755_umem_cache_free(umem_cache_t *cp, void *buf)
1756{
1757	umem_cpu_cache_t *ccp = UMEM_CPU_CACHE(cp, CPU(cp->cache_cpu_mask));
1758	umem_magazine_t *emp;
1759	umem_magtype_t *mtp;
1760
1761	if (ccp->cc_flags & UMF_BUFTAG)
1762		if (umem_cache_free_debug(cp, buf) == -1)
1763			return;
1764
1765	(void) mutex_lock(&ccp->cc_lock);
1766	for (;;) {
1767		/*
1768		 * If there's a slot available in the current CPU's
1769		 * loaded magazine, just put the object there and return.
1770		 */
1771		if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
1772			ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf;
1773			ccp->cc_free++;
1774			(void) mutex_unlock(&ccp->cc_lock);
1775			return;
1776		}
1777
1778		/*
1779		 * The loaded magazine is full.  If the previously loaded
1780		 * magazine was empty, exchange them and try again.
1781		 */
1782		if (ccp->cc_prounds == 0) {
1783			umem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
1784			continue;
1785		}
1786
1787		/*
1788		 * If the magazine layer is disabled, break out now.
1789		 */
1790		if (ccp->cc_magsize == 0)
1791			break;
1792
1793		/*
1794		 * Try to get an empty magazine from the depot.
1795		 */
1796		emp = umem_depot_alloc(cp, &cp->cache_empty);
1797		if (emp != NULL) {
1798			if (ccp->cc_ploaded != NULL)
1799				umem_depot_free(cp, &cp->cache_full,
1800				    ccp->cc_ploaded);
1801			umem_cpu_reload(ccp, emp, 0);
1802			continue;
1803		}
1804
1805		/*
1806		 * There are no empty magazines in the depot,
1807		 * so try to allocate a new one.  We must drop all locks
1808		 * across umem_cache_alloc() because lower layers may
1809		 * attempt to allocate from this cache.
1810		 */
1811		mtp = cp->cache_magtype;
1812		(void) mutex_unlock(&ccp->cc_lock);
1813		emp = _umem_cache_alloc(mtp->mt_cache, UMEM_DEFAULT);
1814		(void) mutex_lock(&ccp->cc_lock);
1815
1816		if (emp != NULL) {
1817			/*
1818			 * We successfully allocated an empty magazine.
1819			 * However, we had to drop ccp->cc_lock to do it,
1820			 * so the cache's magazine size may have changed.
1821			 * If so, free the magazine and try again.
1822			 */
1823			if (ccp->cc_magsize != mtp->mt_magsize) {
1824				(void) mutex_unlock(&ccp->cc_lock);
1825				_umem_cache_free(mtp->mt_cache, emp);
1826				(void) mutex_lock(&ccp->cc_lock);
1827				continue;
1828			}
1829
1830			/*
1831			 * We got a magazine of the right size.  Add it to
1832			 * the depot and try the whole dance again.
1833			 */
1834			umem_depot_free(cp, &cp->cache_empty, emp);
1835			continue;
1836		}
1837
1838		/*
1839		 * We couldn't allocate an empty magazine,
1840		 * so fall through to the slab layer.
1841		 */
1842		break;
1843	}
1844	(void) mutex_unlock(&ccp->cc_lock);
1845
1846	/*
1847	 * We couldn't free our constructed object to the magazine layer,
1848	 * so apply its destructor and free it to the slab layer.
1849	 * Note that if UMF_BUFTAG is in effect, umem_cache_free_debug()
1850	 * will have already applied the destructor.
1851	 */
1852	if (!(cp->cache_flags & UMF_BUFTAG) && cp->cache_destructor != NULL)
1853		cp->cache_destructor(buf, cp->cache_private);
1854
1855	umem_slab_free(cp, buf);
1856}
1857
1858#pragma weak umem_zalloc = _umem_zalloc
1859void *
1860_umem_zalloc(size_t size, int umflag)
1861{
1862	size_t index = (size - 1) >> UMEM_ALIGN_SHIFT;
1863	void *buf;
1864
1865retry:
1866	if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) {
1867		umem_cache_t *cp = umem_alloc_table[index];
1868		buf = _umem_cache_alloc(cp, umflag);
1869		if (buf != NULL) {
1870			if (cp->cache_flags & UMF_BUFTAG) {
1871				umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1872				((uint8_t *)buf)[size] = UMEM_REDZONE_BYTE;
1873				((uint32_t *)btp)[1] = UMEM_SIZE_ENCODE(size);
1874			}
1875			bzero(buf, size);
1876		} else if (umem_alloc_retry(cp, umflag))
1877			goto retry;
1878	} else {
1879		buf = _umem_alloc(size, umflag);	/* handles failure */
1880		if (buf != NULL)
1881			bzero(buf, size);
1882	}
1883	return (buf);
1884}
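
/*
 * A usage sketch (illustrative only).  umem_zalloc() returns
 * zero-filled memory; the caller must remember the requested size and
 * pass the same value back to umem_free(), which the buftag code
 * verifies when debugging is enabled:
 *
 *	void *p = umem_zalloc(128, UMEM_DEFAULT);
 *	if (p != NULL) {
 *		...
 *		umem_free(p, 128);
 *	}
 */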
1885
1886#pragma weak umem_alloc = _umem_alloc
1887void *
1888_umem_alloc(size_t size, int umflag)
1889{
1890	size_t index = (size - 1) >> UMEM_ALIGN_SHIFT;
1891	void *buf;
1892umem_alloc_retry:
1893	if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) {
1894		umem_cache_t *cp = umem_alloc_table[index];
1895		buf = _umem_cache_alloc(cp, umflag);
1896		if ((cp->cache_flags & UMF_BUFTAG) && buf != NULL) {
1897			umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1898			((uint8_t *)buf)[size] = UMEM_REDZONE_BYTE;
1899			((uint32_t *)btp)[1] = UMEM_SIZE_ENCODE(size);
1900		}
1901		if (buf == NULL && umem_alloc_retry(cp, umflag))
1902			goto umem_alloc_retry;
1903		return (buf);
1904	}
1905	if (size == 0)
1906		return (NULL);
1907	if (umem_oversize_arena == NULL) {
1908		if (umem_init())
1909			ASSERT(umem_oversize_arena != NULL);
1910		else
1911			return (NULL);
1912	}
1913	buf = vmem_alloc(umem_oversize_arena, size, UMEM_VMFLAGS(umflag));
1914	if (buf == NULL) {
1915		umem_log_event(umem_failure_log, NULL, NULL, (void *)size);
1916		if (umem_alloc_retry(NULL, umflag))
1917			goto umem_alloc_retry;
1918	}
1919	return (buf);
1920}
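
/*
 * A worked example of the table lookup above, assuming UMEM_ALIGN_SHIFT
 * is 3 (i.e. UMEM_ALIGN is 8): a request for 100 bytes gives
 * index = (100 - 1) >> 3 = 12, so the request is served by
 * umem_alloc_table[12], the smallest cache whose buffers can hold
 * 100 bytes.  Requests larger than UMEM_MAXBUF bypass the table and
 * come from the oversize arena instead.
 */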
1921
1922#pragma weak umem_alloc_align = _umem_alloc_align
1923void *
1924_umem_alloc_align(size_t size, size_t align, int umflag)
1925{
1926	void *buf;
1927
1928	if (size == 0)
1929		return (NULL);
1930	if ((align & (align - 1)) != 0)
1931		return (NULL);
1932	if (align < UMEM_ALIGN)
1933		align = UMEM_ALIGN;
1934
1935umem_alloc_align_retry:
1936	if (umem_memalign_arena == NULL) {
1937		if (umem_init())
1938			ASSERT(umem_memalign_arena != NULL);
1939		else
1940			return (NULL);
1941	}
1942	buf = vmem_xalloc(umem_memalign_arena, size, align, 0, 0, NULL, NULL,
1943	    UMEM_VMFLAGS(umflag));
1944	if (buf == NULL) {
1945		umem_log_event(umem_failure_log, NULL, NULL, (void *)size);
1946		if (umem_alloc_retry(NULL, umflag))
1947			goto umem_alloc_align_retry;
1948	}
1949	return (buf);
1950}
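
/*
 * A usage sketch (illustrative only).  Buffers from umem_alloc_align()
 * come from the memalign arena, so they must be released with
 * umem_free_align() rather than umem_free():
 *
 *	void *p = umem_alloc_align(1000, 64, UMEM_DEFAULT);
 *	if (p != NULL) {
 *		...
 *		umem_free_align(p, 1000);
 *	}
 */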
1951
1952#pragma weak umem_free = _umem_free
1953void
1954_umem_free(void *buf, size_t size)
1955{
1956	size_t index = (size - 1) >> UMEM_ALIGN_SHIFT;
1957
1958	if (index < UMEM_MAXBUF >> UMEM_ALIGN_SHIFT) {
1959		umem_cache_t *cp = umem_alloc_table[index];
1960		if (cp->cache_flags & UMF_BUFTAG) {
1961			umem_buftag_t *btp = UMEM_BUFTAG(cp, buf);
1962			uint32_t *ip = (uint32_t *)btp;
1963			if (ip[1] != UMEM_SIZE_ENCODE(size)) {
1964				if (*(uint64_t *)buf == UMEM_FREE_PATTERN) {
1965					umem_error(UMERR_DUPFREE, cp, buf);
1966					return;
1967				}
1968				if (UMEM_SIZE_VALID(ip[1])) {
1969					ip[0] = UMEM_SIZE_ENCODE(size);
1970					umem_error(UMERR_BADSIZE, cp, buf);
1971				} else {
1972					umem_error(UMERR_REDZONE, cp, buf);
1973				}
1974				return;
1975			}
1976			if (((uint8_t *)buf)[size] != UMEM_REDZONE_BYTE) {
1977				umem_error(UMERR_REDZONE, cp, buf);
1978				return;
1979			}
1980			btp->bt_redzone = UMEM_REDZONE_PATTERN;
1981		}
1982		_umem_cache_free(cp, buf);
1983	} else {
1984		if (buf == NULL && size == 0)
1985			return;
1986		vmem_free(umem_oversize_arena, buf, size);
1987	}
1988}
1989
1990#pragma weak umem_free_align = _umem_free_align
1991void
1992_umem_free_align(void *buf, size_t size)
1993{
1994	if (buf == NULL && size == 0)
1995		return;
1996	vmem_xfree(umem_memalign_arena, buf, size);
1997}
1998
1999static void *
2000umem_firewall_va_alloc(vmem_t *vmp, size_t size, int vmflag)
2001{
2002	size_t realsize = size + vmp->vm_quantum;
2003
2004	/*
2005	 * Annoying edge case: if 'size' is just shy of ULONG_MAX, adding
2006	 * vm_quantum will cause integer wraparound.  Check for this, and
2007	 * blow off the firewall page in this case.  Note that such a
2008	 * giant allocation (the entire address space) can never be
2009	 * satisfied, so it will either fail immediately (VM_NOSLEEP)
2010	 * or sleep forever (VM_SLEEP).  Thus, there is no need for a
2011	 * corresponding check in umem_firewall_va_free().
2012	 */
2013	if (realsize < size)
2014		realsize = size;
2015
2016	return (vmem_alloc(vmp, realsize, vmflag | VM_NEXTFIT));
2017}
2018
2019static void
2020umem_firewall_va_free(vmem_t *vmp, void *addr, size_t size)
2021{
2022	vmem_free(vmp, addr, size + vmp->vm_quantum);
2023}
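
/*
 * A worked example of the wraparound check above, using an 8K quantum
 * purely for illustration: a request of ULONG_MAX - 100 bytes would
 * make 'realsize' wrap around to a small value, so realsize < size,
 * the firewall page is dropped, and the doomed allocation is passed
 * through unchanged.
 */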
2024
2025/*
2026 * Reclaim all unused memory from a cache.
2027 */
2028static void
2029umem_cache_reap(umem_cache_t *cp)
2030{
2031	/*
2032	 * Ask the cache's owner to free some memory if possible.
2033	 * The idea is to handle things like the inode cache, which
2034	 * typically sits on a bunch of memory that it doesn't truly
2035	 * *need*.  Reclaim policy is entirely up to the owner; this
2036	 * callback is just an advisory plea for help.
2037	 */
2038	if (cp->cache_reclaim != NULL)
2039		cp->cache_reclaim(cp->cache_private);
2040
2041	umem_depot_ws_reap(cp);
2042}
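
/*
 * A sketch of a reclaim callback (hypothetical; my_pool_t and its
 * helpers are illustrative, not part of libumem).  The callback is
 * handed the cache_private pointer and simply trims whatever private
 * free lists the client keeps on the side, so that the depot reap
 * which follows can return the memory to the slab layer:
 *
 *	static void
 *	my_reclaim(void *private)
 *	{
 *		my_pool_t *pool = private;
 *
 *		while (!my_pool_is_empty(pool))
 *			umem_cache_free(pool->mp_cache,
 *			    my_pool_remove(pool));
 *	}
 */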
2043
2044/*
2045 * Purge all magazines from a cache and set its magazine limit to zero.
2046 * All calls are serialized by being done by the update thread, except for
2047 * the final call from umem_cache_destroy().
2048 */
2049static void
2050umem_cache_magazine_purge(umem_cache_t *cp)
2051{
2052	umem_cpu_cache_t *ccp;
2053	umem_magazine_t *mp, *pmp;
2054	int rounds, prounds, cpu_seqid;
2055
2056	ASSERT(cp->cache_next == NULL || IN_UPDATE());
2057
2058	for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) {
2059		ccp = &cp->cache_cpu[cpu_seqid];
2060
2061		(void) mutex_lock(&ccp->cc_lock);
2062		mp = ccp->cc_loaded;
2063		pmp = ccp->cc_ploaded;
2064		rounds = ccp->cc_rounds;
2065		prounds = ccp->cc_prounds;
2066		ccp->cc_loaded = NULL;
2067		ccp->cc_ploaded = NULL;
2068		ccp->cc_rounds = -1;
2069		ccp->cc_prounds = -1;
2070		ccp->cc_magsize = 0;
2071		(void) mutex_unlock(&ccp->cc_lock);
2072
2073		if (mp)
2074			umem_magazine_destroy(cp, mp, rounds);
2075		if (pmp)
2076			umem_magazine_destroy(cp, pmp, prounds);
2077	}
2078
2079	/*
2080	 * Updating the working set statistics twice in a row has the
2081	 * effect of setting the working set size to zero, so everything
2082	 * is eligible for reaping.
2083	 */
2084	umem_depot_ws_update(cp);
2085	umem_depot_ws_update(cp);
2086
2087	umem_depot_ws_reap(cp);
2088}
2089
2090/*
2091 * Enable per-cpu magazines on a cache.
2092 */
2093static void
2094umem_cache_magazine_enable(umem_cache_t *cp)
2095{
2096	int cpu_seqid;
2097
2098	if (cp->cache_flags & UMF_NOMAGAZINE)
2099		return;
2100
2101	for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) {
2102		umem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
2103		(void) mutex_lock(&ccp->cc_lock);
2104		ccp->cc_magsize = cp->cache_magtype->mt_magsize;
2105		(void) mutex_unlock(&ccp->cc_lock);
2106	}
2108}
2109
2110/*
2111 * Recompute a cache's magazine size.  The trade-off is that larger magazines
2112 * provide a higher transfer rate with the depot, while smaller magazines
2113 * reduce memory consumption.  Magazine resizing is an expensive operation;
2114 * it should not be done frequently.
2115 *
2116 * Changes to the magazine size are serialized by having only one thread
2117 * (the update thread) perform updates.
2118 *
2119 * Note: at present this only grows the magazine size.  It might be useful
2120 * to allow shrinkage too.
2121 */
2122static void
2123umem_cache_magazine_resize(umem_cache_t *cp)
2124{
2125	umem_magtype_t *mtp = cp->cache_magtype;
2126
2127	ASSERT(IN_UPDATE());
2128
2129	if (cp->cache_chunksize < mtp->mt_maxbuf) {
2130		umem_cache_magazine_purge(cp);
2131		(void) mutex_lock(&cp->cache_depot_lock);
2132		cp->cache_magtype = ++mtp;
2133		cp->cache_depot_contention_prev =
2134		    cp->cache_depot_contention + INT_MAX;
2135		(void) mutex_unlock(&cp->cache_depot_lock);
2136		umem_cache_magazine_enable(cp);
2137	}
2138}
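
/*
 * The INT_MAX bias above interacts with umem_cache_update(), which
 * requests a resize when the signed difference between
 * cache_depot_contention and the _prev snapshot exceeds
 * umem_depot_contention.  Biasing the snapshot makes that difference
 * negative on the next update pass, which then resets the snapshot to
 * the current value; the effect is that contention must accumulate for
 * a full update interval under the new magazine size before another
 * resize can be requested.
 */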
2139
2140/*
2141 * Rescale a cache's hash table, so that the table size is roughly the
2142 * cache size.  We want the average lookup time to be extremely small.
2143 */
2144static void
2145umem_hash_rescale(umem_cache_t *cp)
2146{
2147	umem_bufctl_t **old_table, **new_table, *bcp;
2148	size_t old_size, new_size, h;
2149
2150	ASSERT(IN_UPDATE());
2151
2152	new_size = MAX(UMEM_HASH_INITIAL,
2153	    1 << (highbit(3 * cp->cache_buftotal + 4) - 2));
2154	old_size = cp->cache_hash_mask + 1;
2155
2156	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
2157		return;
2158
2159	new_table = vmem_alloc(umem_hash_arena, new_size * sizeof (void *),
2160	    VM_NOSLEEP);
2161	if (new_table == NULL)
2162		return;
2163	bzero(new_table, new_size * sizeof (void *));
2164
2165	(void) mutex_lock(&cp->cache_lock);
2166
2167	old_size = cp->cache_hash_mask + 1;
2168	old_table = cp->cache_hash_table;
2169
2170	cp->cache_hash_mask = new_size - 1;
2171	cp->cache_hash_table = new_table;
2172	cp->cache_rescale++;
2173
2174	for (h = 0; h < old_size; h++) {
2175		bcp = old_table[h];
2176		while (bcp != NULL) {
2177			void *addr = bcp->bc_addr;
2178			umem_bufctl_t *next_bcp = bcp->bc_next;
2179			umem_bufctl_t **hash_bucket = UMEM_HASH(cp, addr);
2180			bcp->bc_next = *hash_bucket;
2181			*hash_bucket = bcp;
2182			bcp = next_bcp;
2183		}
2184	}
2185
2186	(void) mutex_unlock(&cp->cache_lock);
2187
2188	vmem_free(umem_hash_arena, old_table, old_size * sizeof (void *));
2189}
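
/*
 * A worked example of the sizing rule above, assuming highbit() returns
 * the 1-based position of the highest set bit: with 1000 buffers,
 * 3 * 1000 + 4 = 3004 and highbit(3004) = 12, so the new table has
 * 1 << 10 = 1024 buckets -- roughly one bucket per buffer, which keeps
 * the average hash chain length near one.
 */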
2190
2191/*
2192 * Perform periodic maintenance on a cache: hash rescaling,
2193 * depot working-set update, and magazine resizing.
2194 */
2195void
2196umem_cache_update(umem_cache_t *cp)
2197{
2198	int update_flags = 0;
2199
2200	ASSERT(MUTEX_HELD(&umem_cache_lock));
2201
2202	/*
2203	 * If the cache has become much larger or smaller than its hash table,
2204	 * fire off a request to rescale the hash table.
2205	 */
2206	(void) mutex_lock(&cp->cache_lock);
2207
2208	if ((cp->cache_flags & UMF_HASH) &&
2209	    (cp->cache_buftotal > (cp->cache_hash_mask << 1) ||
2210	    (cp->cache_buftotal < (cp->cache_hash_mask >> 1) &&
2211	    cp->cache_hash_mask > UMEM_HASH_INITIAL)))
2212		update_flags |= UMU_HASH_RESCALE;
2213
2214	(void) mutex_unlock(&cp->cache_lock);
2215
2216	/*
2217	 * Update the depot working set statistics.
2218	 */
2219	umem_depot_ws_update(cp);
2220
2221	/*
2222	 * If there's a lot of contention in the depot,
2223	 * increase the magazine size.
2224	 */
2225	(void) mutex_lock(&cp->cache_depot_lock);
2226
2227	if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf &&
2228	    (int)(cp->cache_depot_contention -
2229	    cp->cache_depot_contention_prev) > umem_depot_contention)
2230		update_flags |= UMU_MAGAZINE_RESIZE;
2231
2232	cp->cache_depot_contention_prev = cp->cache_depot_contention;
2233
2234	(void) mutex_unlock(&cp->cache_depot_lock);
2235
2236	if (update_flags)
2237		umem_add_update(cp, update_flags);
2238}
2239
2240/*
2241 * Runs all pending updates.
2242 *
2243 * The update lock must be held on entrance, and will be held on exit.
2244 */
2245void
2246umem_process_updates(void)
2247{
2248	ASSERT(MUTEX_HELD(&umem_update_lock));
2249
2250	while (umem_null_cache.cache_unext != &umem_null_cache) {
2251		int notify = 0;
2252		umem_cache_t *cp = umem_null_cache.cache_unext;
2253
2254		cp->cache_uprev->cache_unext = cp->cache_unext;
2255		cp->cache_unext->cache_uprev = cp->cache_uprev;
2256		cp->cache_uprev = cp->cache_unext = NULL;
2257
2258		ASSERT(!(cp->cache_uflags & UMU_ACTIVE));
2259
2260		while (cp->cache_uflags) {
2261			int uflags = (cp->cache_uflags |= UMU_ACTIVE);
2262			(void) mutex_unlock(&umem_update_lock);
2263
2264			/*
2265			 * The order here is important.  Each step can speed up
2266			 * later steps.
2267			 */
2268
2269			if (uflags & UMU_HASH_RESCALE)
2270				umem_hash_rescale(cp);
2271
2272			if (uflags & UMU_MAGAZINE_RESIZE)
2273				umem_cache_magazine_resize(cp);
2274
2275			if (uflags & UMU_REAP)
2276				umem_cache_reap(cp);
2277
2278			(void) mutex_lock(&umem_update_lock);
2279
2280			/*
2281			 * check if anyone has requested notification
2282			 */
2283			if (cp->cache_uflags & UMU_NOTIFY) {
2284				uflags |= UMU_NOTIFY;
2285				notify = 1;
2286			}
2287			cp->cache_uflags &= ~uflags;
2288		}
2289		if (notify)
2290			(void) cond_broadcast(&umem_update_cv);
2291	}
2292}
2293
2294#ifndef UMEM_STANDALONE
2295static void
2296umem_st_update(void)
2297{
2298	ASSERT(MUTEX_HELD(&umem_update_lock));
2299	ASSERT(umem_update_thr == 0 && umem_st_update_thr == 0);
2300
2301	umem_st_update_thr = thr_self();
2302
2303	(void) mutex_unlock(&umem_update_lock);
2304
2305	vmem_update(NULL);
2306	umem_cache_applyall(umem_cache_update);
2307
2308	(void) mutex_lock(&umem_update_lock);
2309
2310	umem_process_updates();	/* does all of the requested work */
2311
2312	umem_reap_next = gethrtime() +
2313	    (hrtime_t)umem_reap_interval * NANOSEC;
2314
2315	umem_reaping = UMEM_REAP_DONE;
2316
2317	umem_st_update_thr = 0;
2318}
2319#endif
2320
2321/*
2322 * Reclaim all unused memory from all caches.  Called from vmem when memory
2323 * gets tight.  Must be called with no locks held.
2324 *
2325 * This just requests a reap on all caches, and notifies the update thread.
2326 */
2327void
2328umem_reap(void)
2329{
2330#ifndef UMEM_STANDALONE
2331	extern int __nthreads(void);
2332#endif
2333
2334	if (umem_ready != UMEM_READY || umem_reaping != UMEM_REAP_DONE ||
2335	    gethrtime() < umem_reap_next)
2336		return;
2337
2338	(void) mutex_lock(&umem_update_lock);
2339
2340	if (umem_reaping != UMEM_REAP_DONE || gethrtime() < umem_reap_next) {
2341		(void) mutex_unlock(&umem_update_lock);
2342		return;
2343	}
2344	umem_reaping = UMEM_REAP_ADDING;	/* lock out other reaps */
2345
2346	(void) mutex_unlock(&umem_update_lock);
2347
2348	umem_updateall(UMU_REAP);
2349
2350	(void) mutex_lock(&umem_update_lock);
2351
2352	umem_reaping = UMEM_REAP_ACTIVE;
2353
2354	/* Standalone is single-threaded */
2355#ifndef UMEM_STANDALONE
2356	if (umem_update_thr == 0) {
2357		/*
2358		 * The update thread does not exist.  If the process is
2359		 * multi-threaded, create it.  If not, or the creation fails,
2360		 * do the update processing inline.
2361		 */
2362		ASSERT(umem_st_update_thr == 0);
2363
2364		if (__nthreads() <= 1 || umem_create_update_thread() == 0)
2365			umem_st_update();
2366	}
2367
2368	(void) cond_broadcast(&umem_update_cv);	/* wake up the update thread */
2369#endif
2370
2371	(void) mutex_unlock(&umem_update_lock);
2372}
2373
2374umem_cache_t *
2375umem_cache_create(
2376	char *name,		/* descriptive name for this cache */
2377	size_t bufsize,		/* size of the objects it manages */
2378	size_t align,		/* required object alignment */
2379	umem_constructor_t *constructor, /* object constructor */
2380	umem_destructor_t *destructor, /* object destructor */
2381	umem_reclaim_t *reclaim, /* memory reclaim callback */
2382	void *private,		/* pass-thru arg for constr/destr/reclaim */
2383	vmem_t *vmp,		/* vmem source for slab allocation */
2384	int cflags)		/* cache creation flags */
2385{
2386	int cpu_seqid;
2387	size_t chunksize;
2388	umem_cache_t *cp, *cnext, *cprev;
2389	umem_magtype_t *mtp;
2390	size_t csize;
2391	size_t phase;
2392
2393	/*
2394	 * The init thread is allowed to create internal and quantum caches.
2395	 *
2396	 * Other threads must wait until initialization is complete.
2397	 */
2398	if (umem_init_thr == thr_self())
2399		ASSERT((cflags & (UMC_INTERNAL | UMC_QCACHE)) != 0);
2400	else {
2401		ASSERT(!(cflags & UMC_INTERNAL));
2402		if (umem_ready != UMEM_READY && umem_init() == 0) {
2403			errno = EAGAIN;
2404			return (NULL);
2405		}
2406	}
2407
2408	csize = UMEM_CACHE_SIZE(umem_max_ncpus);
2409	phase = P2NPHASE(csize, UMEM_CPU_CACHE_SIZE);
2410
2411	if (vmp == NULL)
2412		vmp = umem_default_arena;
2413
2414	ASSERT(P2PHASE(phase, UMEM_ALIGN) == 0);
2415
2416	/*
2417	 * Check that the arguments are reasonable
2418	 */
2419	if ((align & (align - 1)) != 0 || align > vmp->vm_quantum ||
2420	    ((cflags & UMC_NOHASH) && (cflags & UMC_NOTOUCH)) ||
2421	    name == NULL || bufsize == 0) {
2422		errno = EINVAL;
2423		return (NULL);
2424	}
2425
2426	/*
2427	 * If align == 0, we set it to the minimum required alignment.
2428	 *
2429	 * If align < UMEM_ALIGN, we round it up to UMEM_ALIGN, unless
2430	 * UMC_NOTOUCH was passed.
2431	 */
2432	if (align == 0) {
2433		if (P2ROUNDUP(bufsize, UMEM_ALIGN) >= UMEM_SECOND_ALIGN)
2434			align = UMEM_SECOND_ALIGN;
2435		else
2436			align = UMEM_ALIGN;
2437	} else if (align < UMEM_ALIGN && (cflags & UMC_NOTOUCH) == 0)
2438		align = UMEM_ALIGN;
2439
2441	/*
2442	 * Get a umem_cache structure.  We arrange that cp->cache_cpu[]
2443	 * is aligned on a UMEM_CPU_CACHE_SIZE boundary to prevent
2444	 * false sharing of per-CPU data.
2445	 */
2446	cp = vmem_xalloc(umem_cache_arena, csize, UMEM_CPU_CACHE_SIZE, phase,
2447	    0, NULL, NULL, VM_NOSLEEP);
2448
2449	if (cp == NULL) {
2450		errno = EAGAIN;
2451		return (NULL);
2452	}
2453
2454	bzero(cp, csize);
2455
2456	(void) mutex_lock(&umem_flags_lock);
2457	if (umem_flags & UMF_RANDOMIZE)
2458		umem_flags = (((umem_flags | ~UMF_RANDOM) + 1) & UMF_RANDOM) |
2459		    UMF_RANDOMIZE;
2460	cp->cache_flags = umem_flags | (cflags & UMF_DEBUG);
2461	(void) mutex_unlock(&umem_flags_lock);
2462
2463	/*
2464	 * Make sure all the various flags are reasonable.
2465	 */
2466	if (cp->cache_flags & UMF_LITE) {
2467		if (bufsize >= umem_lite_minsize &&
2468		    align <= umem_lite_maxalign &&
2469		    P2PHASE(bufsize, umem_lite_maxalign) != 0) {
2470			cp->cache_flags |= UMF_BUFTAG;
2471			cp->cache_flags &= ~(UMF_AUDIT | UMF_FIREWALL);
2472		} else {
2473			cp->cache_flags &= ~UMF_DEBUG;
2474		}
2475	}
2476
2477	if ((cflags & UMC_QCACHE) && (cp->cache_flags & UMF_AUDIT))
2478		cp->cache_flags |= UMF_NOMAGAZINE;
2479
2480	if (cflags & UMC_NODEBUG)
2481		cp->cache_flags &= ~UMF_DEBUG;
2482
2483	if (cflags & UMC_NOTOUCH)
2484		cp->cache_flags &= ~UMF_TOUCH;
2485
2486	if (cflags & UMC_NOHASH)
2487		cp->cache_flags &= ~(UMF_AUDIT | UMF_FIREWALL);
2488
2489	if (cflags & UMC_NOMAGAZINE)
2490		cp->cache_flags |= UMF_NOMAGAZINE;
2491
2492	if ((cp->cache_flags & UMF_AUDIT) && !(cflags & UMC_NOTOUCH))
2493		cp->cache_flags |= UMF_REDZONE;
2494
2495	if ((cp->cache_flags & UMF_BUFTAG) && bufsize >= umem_minfirewall &&
2496	    !(cp->cache_flags & UMF_LITE) && !(cflags & UMC_NOHASH))
2497		cp->cache_flags |= UMF_FIREWALL;
2498
2499	if (vmp != umem_default_arena || umem_firewall_arena == NULL)
2500		cp->cache_flags &= ~UMF_FIREWALL;
2501
2502	if (cp->cache_flags & UMF_FIREWALL) {
2503		cp->cache_flags &= ~UMF_BUFTAG;
2504		cp->cache_flags |= UMF_NOMAGAZINE;
2505		ASSERT(vmp == umem_default_arena);
2506		vmp = umem_firewall_arena;
2507	}
2508
2509	/*
2510	 * Set cache properties.
2511	 */
2512	(void) strncpy(cp->cache_name, name, sizeof (cp->cache_name) - 1);
2513	cp->cache_bufsize = bufsize;
2514	cp->cache_align = align;
2515	cp->cache_constructor = constructor;
2516	cp->cache_destructor = destructor;
2517	cp->cache_reclaim = reclaim;
2518	cp->cache_private = private;
2519	cp->cache_arena = vmp;
2520	cp->cache_cflags = cflags;
2521	cp->cache_cpu_mask = umem_cpu_mask;
2522
2523	/*
2524	 * Determine the chunk size.
2525	 */
2526	chunksize = bufsize;
2527
2528	if (align >= UMEM_ALIGN) {
2529		chunksize = P2ROUNDUP(chunksize, UMEM_ALIGN);
2530		cp->cache_bufctl = chunksize - UMEM_ALIGN;
2531	}
2532
2533	if (cp->cache_flags & UMF_BUFTAG) {
2534		cp->cache_bufctl = chunksize;
2535		cp->cache_buftag = chunksize;
2536		chunksize += sizeof (umem_buftag_t);
2537	}
2538
2539	if (cp->cache_flags & UMF_DEADBEEF) {
2540		cp->cache_verify = MIN(cp->cache_buftag, umem_maxverify);
2541		if (cp->cache_flags & UMF_LITE)
2542			cp->cache_verify = MIN(cp->cache_verify, UMEM_ALIGN);
2543	}
2544
2545	cp->cache_contents = MIN(cp->cache_bufctl, umem_content_maxsave);
2546
2547	cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align);
2548
2549	if (chunksize < bufsize) {
2550		errno = ENOMEM;
2551		goto fail;
2552	}
2553
2554	/*
2555	 * Now that we know the chunk size, determine the optimal slab size.
2556	 */
2557	if (vmp == umem_firewall_arena) {
2558		cp->cache_slabsize = P2ROUNDUP(chunksize, vmp->vm_quantum);
2559		cp->cache_mincolor = cp->cache_slabsize - chunksize;
2560		cp->cache_maxcolor = cp->cache_mincolor;
2561		cp->cache_flags |= UMF_HASH;
2562		ASSERT(!(cp->cache_flags & UMF_BUFTAG));
2563	} else if ((cflags & UMC_NOHASH) || (!(cflags & UMC_NOTOUCH) &&
2564	    !(cp->cache_flags & UMF_AUDIT) &&
2565	    chunksize < vmp->vm_quantum / UMEM_VOID_FRACTION)) {
2566		cp->cache_slabsize = vmp->vm_quantum;
2567		cp->cache_mincolor = 0;
2568		cp->cache_maxcolor =
2569		    (cp->cache_slabsize - sizeof (umem_slab_t)) % chunksize;
2570
2571		if (chunksize + sizeof (umem_slab_t) > cp->cache_slabsize) {
2572			errno = EINVAL;
2573			goto fail;
2574		}
2575		ASSERT(!(cp->cache_flags & UMF_AUDIT));
2576	} else {
2577		size_t chunks, bestfit, waste, slabsize;
2578		size_t minwaste = LONG_MAX;
2579
2580		for (chunks = 1; chunks <= UMEM_VOID_FRACTION; chunks++) {
2581			slabsize = P2ROUNDUP(chunksize * chunks,
2582			    vmp->vm_quantum);
2583			/*
2584			 * check for overflow
2585			 */
2586			if ((slabsize / chunks) < chunksize) {
2587				errno = ENOMEM;
2588				goto fail;
2589			}
2590			chunks = slabsize / chunksize;
2591			waste = (slabsize % chunksize) / chunks;
2592			if (waste < minwaste) {
2593				minwaste = waste;
2594				bestfit = slabsize;
2595			}
2596		}
2597		if (cflags & UMC_QCACHE)
2598			bestfit = MAX(1 << highbit(3 * vmp->vm_qcache_max), 64);
2599		cp->cache_slabsize = bestfit;
2600		cp->cache_mincolor = 0;
2601		cp->cache_maxcolor = bestfit % chunksize;
2602		cp->cache_flags |= UMF_HASH;
2603	}
2604
2605	if (cp->cache_flags & UMF_HASH) {
2606		ASSERT(!(cflags & UMC_NOHASH));
2607		cp->cache_bufctl_cache = (cp->cache_flags & UMF_AUDIT) ?
2608		    umem_bufctl_audit_cache : umem_bufctl_cache;
2609	}
2610
2611	if (cp->cache_maxcolor >= vmp->vm_quantum)
2612		cp->cache_maxcolor = vmp->vm_quantum - 1;
2613
2614	cp->cache_color = cp->cache_mincolor;
2615
2616	/*
2617	 * Initialize the rest of the slab layer.
2618	 */
2619	(void) mutex_init(&cp->cache_lock, USYNC_THREAD, NULL);
2620
2621	cp->cache_freelist = &cp->cache_nullslab;
2622	cp->cache_nullslab.slab_cache = cp;
2623	cp->cache_nullslab.slab_refcnt = -1;
2624	cp->cache_nullslab.slab_next = &cp->cache_nullslab;
2625	cp->cache_nullslab.slab_prev = &cp->cache_nullslab;
2626
2627	if (cp->cache_flags & UMF_HASH) {
2628		cp->cache_hash_table = vmem_alloc(umem_hash_arena,
2629		    UMEM_HASH_INITIAL * sizeof (void *), VM_NOSLEEP);
2630		if (cp->cache_hash_table == NULL) {
2631			errno = EAGAIN;
2632			goto fail_lock;
2633		}
2634		bzero(cp->cache_hash_table,
2635		    UMEM_HASH_INITIAL * sizeof (void *));
2636		cp->cache_hash_mask = UMEM_HASH_INITIAL - 1;
2637		cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1;
2638	}
2639
2640	/*
2641	 * Initialize the depot.
2642	 */
2643	(void) mutex_init(&cp->cache_depot_lock, USYNC_THREAD, NULL);
2644
2645	for (mtp = umem_magtype; chunksize <= mtp->mt_minbuf; mtp++)
2646		continue;
2647
2648	cp->cache_magtype = mtp;
2649
2650	/*
2651	 * Initialize the CPU layer.
2652	 */
2653	for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++) {
2654		umem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
2655		(void) mutex_init(&ccp->cc_lock, USYNC_THREAD, NULL);
2656		ccp->cc_flags = cp->cache_flags;
2657		ccp->cc_rounds = -1;
2658		ccp->cc_prounds = -1;
2659	}
2660
2661	/*
2662	 * Add the cache to the global list.  This makes it visible
2663	 * to umem_update(), so the cache must be ready for business.
2664	 */
2665	(void) mutex_lock(&umem_cache_lock);
2666	cp->cache_next = cnext = &umem_null_cache;
2667	cp->cache_prev = cprev = umem_null_cache.cache_prev;
2668	cnext->cache_prev = cp;
2669	cprev->cache_next = cp;
2670	(void) mutex_unlock(&umem_cache_lock);
2671
2672	if (umem_ready == UMEM_READY)
2673		umem_cache_magazine_enable(cp);
2674
2675	return (cp);
2676
2677fail_lock:
2678	(void) mutex_destroy(&cp->cache_lock);
2679fail:
2680	vmem_xfree(umem_cache_arena, cp, csize);
2681	return (NULL);
2682}
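
/*
 * An example of cache creation (a sketch; my_obj_t, my_construct, and
 * my_destroy are hypothetical client code, not part of this file).
 * Constructors return 0 on success and nonzero on failure:
 *
 *	static int
 *	my_construct(void *buf, void *ignored, int flags)
 *	{
 *		my_obj_t *op = buf;
 *
 *		(void) mutex_init(&op->o_lock, USYNC_THREAD, NULL);
 *		op->o_refcnt = 0;
 *		return (0);
 *	}
 *
 *	static void
 *	my_destroy(void *buf, void *ignored)
 *	{
 *		my_obj_t *op = buf;
 *
 *		(void) mutex_destroy(&op->o_lock);
 *	}
 *
 *	my_cache = umem_cache_create("my_obj_cache", sizeof (my_obj_t),
 *	    0, my_construct, my_destroy, NULL, NULL, NULL, 0);
 *
 * Passing 0 for the alignment, NULL for the vmem source, and 0 for
 * cflags accepts the defaults chosen above.
 */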
2683
2684void
2685umem_cache_destroy(umem_cache_t *cp)
2686{
2687	int cpu_seqid;
2688
2689	/*
2690	 * Remove the cache from the global cache list so that no new updates
2691	 * will be scheduled on its behalf, wait for any pending tasks to
2692	 * complete, purge the cache, and then destroy it.
2693	 */
2694	(void) mutex_lock(&umem_cache_lock);
2695	cp->cache_prev->cache_next = cp->cache_next;
2696	cp->cache_next->cache_prev = cp->cache_prev;
2697	cp->cache_prev = cp->cache_next = NULL;
2698	(void) mutex_unlock(&umem_cache_lock);
2699
2700	umem_remove_updates(cp);
2701
2702	umem_cache_magazine_purge(cp);
2703
2704	(void) mutex_lock(&cp->cache_lock);
2705	if (cp->cache_buftotal != 0)
2706		log_message("umem_cache_destroy: '%s' (%p) not empty\n",
2707		    cp->cache_name, (void *)cp);
2708	cp->cache_reclaim = NULL;
2709	/*
2710	 * The cache is now dead.  There should be no further activity.
2711	 * We enforce this by setting land mines in the constructor and
2712	 * destructor routines that induce a segmentation fault if invoked.
2713	 */
2714	cp->cache_constructor = (umem_constructor_t *)1;
2715	cp->cache_destructor = (umem_destructor_t *)2;
2716	(void) mutex_unlock(&cp->cache_lock);
2717
2718	if (cp->cache_hash_table != NULL)
2719		vmem_free(umem_hash_arena, cp->cache_hash_table,
2720		    (cp->cache_hash_mask + 1) * sizeof (void *));
2721
2722	for (cpu_seqid = 0; cpu_seqid < umem_max_ncpus; cpu_seqid++)
2723		(void) mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
2724
2725	(void) mutex_destroy(&cp->cache_depot_lock);
2726	(void) mutex_destroy(&cp->cache_lock);
2727
2728	vmem_free(umem_cache_arena, cp, UMEM_CACHE_SIZE(umem_max_ncpus));
2729}
2730
2731static int
2732umem_cache_init(void)
2733{
2734	int i;
2735	size_t size, max_size;
2736	umem_cache_t *cp;
2737	umem_magtype_t *mtp;
2738	char name[UMEM_CACHE_NAMELEN + 1];
2739	umem_cache_t *umem_alloc_caches[NUM_ALLOC_SIZES];
2740
2741	for (i = 0; i < sizeof (umem_magtype) / sizeof (*mtp); i++) {
2742		mtp = &umem_magtype[i];
2743		(void) snprintf(name, sizeof (name), "umem_magazine_%d",
2744		    mtp->mt_magsize);
2745		mtp->mt_cache = umem_cache_create(name,
2746		    (mtp->mt_magsize + 1) * sizeof (void *),
2747		    mtp->mt_align, NULL, NULL, NULL, NULL,
2748		    umem_internal_arena, UMC_NOHASH | UMC_INTERNAL);
2749		if (mtp->mt_cache == NULL)
2750			return (0);
2751	}
2752
2753	umem_slab_cache = umem_cache_create("umem_slab_cache",
2754	    sizeof (umem_slab_t), 0, NULL, NULL, NULL, NULL,
2755	    umem_internal_arena, UMC_NOHASH | UMC_INTERNAL);
2756
2757	if (umem_slab_cache == NULL)
2758		return (0);
2759
2760	umem_bufctl_cache = umem_cache_create("umem_bufctl_cache",
2761	    sizeof (umem_bufctl_t), 0, NULL, NULL, NULL, NULL,
2762	    umem_internal_arena, UMC_NOHASH | UMC_INTERNAL);
2763
2764	if (umem_bufctl_cache == NULL)
2765		return (0);
2766
2767	/*
2768	 * The size of the umem_bufctl_audit structure depends upon
2769	 * umem_stack_depth.   See umem_impl.h for details on the size
2770	 * restrictions.
2771	 */
2772
2773	size = UMEM_BUFCTL_AUDIT_SIZE_DEPTH(umem_stack_depth);
2774	max_size = UMEM_BUFCTL_AUDIT_MAX_SIZE;
2775
2776	if (size > max_size) {			/* too large -- truncate */
2777		int max_frames = UMEM_MAX_STACK_DEPTH;
2778
2779		ASSERT(UMEM_BUFCTL_AUDIT_SIZE_DEPTH(max_frames) <= max_size);
2780
2781		umem_stack_depth = max_frames;
2782		size = UMEM_BUFCTL_AUDIT_SIZE_DEPTH(umem_stack_depth);
2783	}
2784
2785	umem_bufctl_audit_cache = umem_cache_create("umem_bufctl_audit_cache",
2786	    size, 0, NULL, NULL, NULL, NULL, umem_internal_arena,
2787	    UMC_NOHASH | UMC_INTERNAL);
2788
2789	if (umem_bufctl_audit_cache == NULL)
2790		return (0);
2791
2792	if (vmem_backend & VMEM_BACKEND_MMAP)
2793		umem_va_arena = vmem_create("umem_va",
2794		    NULL, 0, pagesize,
2795		    vmem_alloc, vmem_free, heap_arena,
2796		    8 * pagesize, VM_NOSLEEP);
2797	else
2798		umem_va_arena = heap_arena;
2799
2800	if (umem_va_arena == NULL)
2801		return (0);
2802
2803	umem_default_arena = vmem_create("umem_default",
2804	    NULL, 0, pagesize,
2805	    heap_alloc, heap_free, umem_va_arena,
2806	    0, VM_NOSLEEP);
2807
2808	if (umem_default_arena == NULL)
2809		return (0);
2810
2811	/*
2812	 * make sure the umem_alloc table initializer is correct
2813	 */
2814	i = sizeof (umem_alloc_table) / sizeof (*umem_alloc_table);
2815	ASSERT(umem_alloc_table[i - 1] == &umem_null_cache);
2816
2817	/*
2818	 * Create the default caches to back umem_alloc()
2819	 */
2820	for (i = 0; i < NUM_ALLOC_SIZES; i++) {
2821		size_t cache_size = umem_alloc_sizes[i];
2822		size_t align = 0;
2823		/*
2824		 * If they allocate a multiple of the coherency granularity,
2825		 * they get a coherency-granularity-aligned address.
2826		 */
2827		if (IS_P2ALIGNED(cache_size, 64))
2828			align = 64;
2829		if (IS_P2ALIGNED(cache_size, pagesize))
2830			align = pagesize;
2831		(void) snprintf(name, sizeof (name), "umem_alloc_%lu",
2832		    (long)cache_size);
2833
2834		cp = umem_cache_create(name, cache_size, align,
2835		    NULL, NULL, NULL, NULL, NULL, UMC_INTERNAL);
2836		if (cp == NULL)
2837			return (0);
2838
2839		umem_alloc_caches[i] = cp;
2840	}
2841
2842	/*
2843	 * Initialization cannot fail at this point.  Make the caches
2844	 * visible to umem_alloc() and friends.
2845	 */
2846	size = UMEM_ALIGN;
2847	for (i = 0; i < NUM_ALLOC_SIZES; i++) {
2848		size_t cache_size = umem_alloc_sizes[i];
2849
2850		cp = umem_alloc_caches[i];
2851
2852		while (size <= cache_size) {
2853			umem_alloc_table[(size - 1) >> UMEM_ALIGN_SHIFT] = cp;
2854			size += UMEM_ALIGN;
2855		}
2856	}
2857	return (1);
2858}
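
/*
 * A worked example of the table fill above: if umem_alloc_sizes begins
 * 8, 16, 32, ..., the loop maps requests of 1-8 bytes to the 8-byte
 * cache, 9-16 bytes to the 16-byte cache, 17-32 bytes to the 32-byte
 * cache, and so on -- each UMEM_ALIGN-sized band of request sizes
 * points at the smallest cache that can satisfy it.
 */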
2859
2860/*
2861 * umem_startup() is called early on, and must be called explicitly if we're
2862 * the standalone version.
2863 */
2864#ifdef UMEM_STANDALONE
2865void
2866#else
2867#pragma init(umem_startup)
2868static void
2869#endif
2870umem_startup(caddr_t start, size_t len, size_t pagesize, caddr_t minstack,
2871    caddr_t maxstack)
2872{
2873#ifdef UMEM_STANDALONE
2874	int idx;
2875	/* Standalone doesn't fork */
2876#else
2877	umem_forkhandler_init(); /* register the fork handler */
2878#endif
2879
2880#ifdef __lint
2881	/* make lint happy */
2882	minstack = maxstack;
2883#endif
2884
2885#ifdef UMEM_STANDALONE
2886	umem_ready = UMEM_READY_STARTUP;
2887	umem_init_env_ready = 0;
2888
2889	umem_min_stack = minstack;
2890	umem_max_stack = maxstack;
2891
2892	nofail_callback = NULL;
2893	umem_slab_cache = NULL;
2894	umem_bufctl_cache = NULL;
2895	umem_bufctl_audit_cache = NULL;
2896	heap_arena = NULL;
2897	heap_alloc = NULL;
2898	heap_free = NULL;
2899	umem_internal_arena = NULL;
2900	umem_cache_arena = NULL;
2901	umem_hash_arena = NULL;
2902	umem_log_arena = NULL;
2903	umem_oversize_arena = NULL;
2904	umem_va_arena = NULL;
2905	umem_default_arena = NULL;
2906	umem_firewall_va_arena = NULL;
2907	umem_firewall_arena = NULL;
2908	umem_memalign_arena = NULL;
2909	umem_transaction_log = NULL;
2910	umem_content_log = NULL;
2911	umem_failure_log = NULL;
2912	umem_slab_log = NULL;
2913	umem_cpu_mask = 0;
2914
2915	umem_cpus = &umem_startup_cpu;
2916	umem_startup_cpu.cpu_cache_offset = UMEM_CACHE_SIZE(0);
2917	umem_startup_cpu.cpu_number = 0;
2918
2919	bcopy(&umem_null_cache_template, &umem_null_cache,
2920	    sizeof (umem_cache_t));
2921
2922	for (idx = 0; idx < (UMEM_MAXBUF >> UMEM_ALIGN_SHIFT); idx++)
2923		umem_alloc_table[idx] = &umem_null_cache;
2924#endif
2925
2926	/*
2927	 * Perform initialization specific to the way we've been compiled
2928	 * (library or standalone)
2929	 */
2930	umem_type_init(start, len, pagesize);
2931
2932	vmem_startup();
2933}
2934
2935int
2936umem_init(void)
2937{
2938	size_t maxverify, minfirewall;
2939	size_t size;
2940	int idx;
2941	umem_cpu_t *new_cpus;
2942
2943	vmem_t *memalign_arena, *oversize_arena;
2944
2945	if (thr_self() != umem_init_thr) {
2946		/*
2947		 * The usual case -- non-recursive invocation of umem_init().
2948		 */
2949		(void) mutex_lock(&umem_init_lock);
2950		if (umem_ready != UMEM_READY_STARTUP) {
2951			/*
2952			 * someone else beat us to initializing umem.  Wait
2953			 * for them to complete, then return.
2954			 */
2955			while (umem_ready == UMEM_READY_INITING)
2956				(void) _cond_wait(&umem_init_cv,
2957				    &umem_init_lock);
2958			ASSERT(umem_ready == UMEM_READY ||
2959			    umem_ready == UMEM_READY_INIT_FAILED);
2960			(void) mutex_unlock(&umem_init_lock);
2961			return (umem_ready == UMEM_READY);
2962		}
2963
2964		ASSERT(umem_ready == UMEM_READY_STARTUP);
2965		ASSERT(umem_init_env_ready == 0);
2966
2967		umem_ready = UMEM_READY_INITING;
2968		umem_init_thr = thr_self();
2969
2970		(void) mutex_unlock(&umem_init_lock);
2971		umem_setup_envvars(0);		/* can recurse -- see below */
2972		if (umem_init_env_ready) {
2973			/*
2974			 * initialization was completed already
2975			 */
2976			ASSERT(umem_ready == UMEM_READY ||
2977			    umem_ready == UMEM_READY_INIT_FAILED);
2978			ASSERT(umem_init_thr == 0);
2979			return (umem_ready == UMEM_READY);
2980		}
2981	} else if (!umem_init_env_ready) {
2982		/*
2983		 * The umem_setup_envvars() call (above) makes calls into
2984		 * the dynamic linker and directly into user-supplied code.
2985		 * Since we cannot know what that code will do, we could be
2986		 * recursively invoked (by, say, a malloc() call in the code
2987		 * itself, or in a (C++) _init section it causes to be fired).
2988		 *
2989		 * This code is where we end up if such recursion occurs.  We
2990		 * first clean up any partial results in the envvar code, then
2991		 * proceed to finish initialization processing in the recursive
2992		 * call.  The original call will notice this, and return
2993		 * immediately.
2994		 */
2995		umem_setup_envvars(1);		/* clean up any partial state */
2996	} else {
2997		umem_panic(
2998		    "recursive allocation while initializing umem\n");
2999	}
3000	umem_init_env_ready = 1;
3001
3002	/*
3003	 * From this point until we finish, recursion into umem_init() will
3004	 * cause a umem_panic().
3005	 */
3006	maxverify = minfirewall = ULONG_MAX;
3007
3008	/* LINTED constant condition */
3009	if (sizeof (umem_cpu_cache_t) != UMEM_CPU_CACHE_SIZE) {
3010		umem_panic("sizeof (umem_cpu_cache_t) = %d, should be %d\n",
3011		    sizeof (umem_cpu_cache_t), UMEM_CPU_CACHE_SIZE);
3012	}
3013
3014	umem_max_ncpus = umem_get_max_ncpus();
3015
3016	/*
3017	 * load tunables from environment
3018	 */
3019	umem_process_envvars();
3020
3021	if (issetugid())
3022		umem_mtbf = 0;
3023
3024	/*
3025	 * set up vmem
3026	 */
3027	if (!(umem_flags & UMF_AUDIT))
3028		vmem_no_debug();
3029
3030	heap_arena = vmem_heap_arena(&heap_alloc, &heap_free);
3031
3032	pagesize = heap_arena->vm_quantum;
3033
3034	umem_internal_arena = vmem_create("umem_internal", NULL, 0, pagesize,
3035	    heap_alloc, heap_free, heap_arena, 0, VM_NOSLEEP);
3036
3037	umem_default_arena = umem_internal_arena;
3038
3039	if (umem_internal_arena == NULL)
3040		goto fail;
3041
3042	umem_cache_arena = vmem_create("umem_cache", NULL, 0, UMEM_ALIGN,
3043	    vmem_alloc, vmem_free, umem_internal_arena, 0, VM_NOSLEEP);
3044
3045	umem_hash_arena = vmem_create("umem_hash", NULL, 0, UMEM_ALIGN,
3046	    vmem_alloc, vmem_free, umem_internal_arena, 0, VM_NOSLEEP);
3047
3048	umem_log_arena = vmem_create("umem_log", NULL, 0, UMEM_ALIGN,
3049	    heap_alloc, heap_free, heap_arena, 0, VM_NOSLEEP);
3050
3051	umem_firewall_va_arena = vmem_create("umem_firewall_va",
3052	    NULL, 0, pagesize,
3053	    umem_firewall_va_alloc, umem_firewall_va_free, heap_arena,
3054	    0, VM_NOSLEEP);
3055
3056	if (umem_cache_arena == NULL || umem_hash_arena == NULL ||
3057	    umem_log_arena == NULL || umem_firewall_va_arena == NULL)
3058		goto fail;
3059
3060	umem_firewall_arena = vmem_create("umem_firewall", NULL, 0, pagesize,
3061	    heap_alloc, heap_free, umem_firewall_va_arena, 0,
3062	    VM_NOSLEEP);
3063
3064	if (umem_firewall_arena == NULL)
3065		goto fail;
3066
3067	oversize_arena = vmem_create("umem_oversize", NULL, 0, pagesize,
3068	    heap_alloc, heap_free, minfirewall < ULONG_MAX ?
3069	    umem_firewall_va_arena : heap_arena, 0, VM_NOSLEEP);
3070
3071	memalign_arena = vmem_create("umem_memalign", NULL, 0, UMEM_ALIGN,
3072	    heap_alloc, heap_free, minfirewall < ULONG_MAX ?
3073	    umem_firewall_va_arena : heap_arena, 0, VM_NOSLEEP);
3074
3075	if (oversize_arena == NULL || memalign_arena == NULL)
3076		goto fail;
3077
3078	if (umem_max_ncpus > CPUHINT_MAX())
3079		umem_max_ncpus = CPUHINT_MAX();
3080
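	/*
	 * Round umem_max_ncpus up to a power of two, so that
	 * umem_cpu_mask (umem_max_ncpus - 1, set below) can be used to
	 * mask a CPU hint down to a valid cpu cache index.
	 */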
3081	while ((umem_max_ncpus & (umem_max_ncpus - 1)) != 0)
3082		umem_max_ncpus++;
3083
3084	if (umem_max_ncpus == 0)
3085		umem_max_ncpus = 1;
3086
3087	size = umem_max_ncpus * sizeof (umem_cpu_t);
3088	new_cpus = vmem_alloc(umem_internal_arena, size, VM_NOSLEEP);
3089	if (new_cpus == NULL)
3090		goto fail;
3091
3092	bzero(new_cpus, size);
3093	for (idx = 0; idx < umem_max_ncpus; idx++) {
3094		new_cpus[idx].cpu_number = idx;
3095		new_cpus[idx].cpu_cache_offset = UMEM_CACHE_SIZE(idx);
3096	}
3097	umem_cpus = new_cpus;
3098	umem_cpu_mask = (umem_max_ncpus - 1);
3099
3100	if (umem_maxverify == 0)
3101		umem_maxverify = maxverify;
3102
3103	if (umem_minfirewall == 0)
3104		umem_minfirewall = minfirewall;
3105
3106	/*
3107	 * Set up updating and reaping
3108	 */
3109	umem_reap_next = gethrtime() + NANOSEC;
3110
3111#ifndef UMEM_STANDALONE
3112	(void) gettimeofday(&umem_update_next, NULL);
3113#endif
3114
3115	/*
3116	 * Set up logging -- failure here is okay, since it will just disable
3117	 * the logs
3118	 */
3119	if (umem_logging) {
3120		umem_transaction_log = umem_log_init(umem_transaction_log_size);
3121		umem_content_log = umem_log_init(umem_content_log_size);
3122		umem_failure_log = umem_log_init(umem_failure_log_size);
3123		umem_slab_log = umem_log_init(umem_slab_log_size);
3124	}
3125
3126	/*
3127	 * Set up caches -- if successful, initialization cannot fail, since
3128	 * allocations from other threads can now succeed.
3129	 */
3130	if (umem_cache_init() == 0) {
3131		log_message("unable to create initial caches\n");
3132		goto fail;
3133	}
3134	umem_oversize_arena = oversize_arena;
3135	umem_memalign_arena = memalign_arena;
3136
3137	umem_cache_applyall(umem_cache_magazine_enable);
3138
3139	/*
3140	 * initialization done, ready to go
3141	 */
3142	(void) mutex_lock(&umem_init_lock);
3143	umem_ready = UMEM_READY;
3144	umem_init_thr = 0;
3145	(void) cond_broadcast(&umem_init_cv);
3146	(void) mutex_unlock(&umem_init_lock);
3147	return (1);
3148
3149fail:
3150	log_message("umem initialization failed\n");
3151
3152	(void) mutex_lock(&umem_init_lock);
3153	umem_ready = UMEM_READY_INIT_FAILED;
3154	umem_init_thr = 0;
3155	(void) cond_broadcast(&umem_init_cv);
3156	(void) mutex_unlock(&umem_init_lock);
3157	return (0);
3158}
3159