uma_core.c revision 313127
1/*-
2 * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
3 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
4 * Copyright (c) 2004-2006 Robert N. M. Watson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * uma_core.c  Implementation of the Universal Memory allocator
31 *
32 * This allocator is intended to replace the multitude of similar object caches
33 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
34 * efficient.  A primary design goal is to return unused memory to the rest of
35 * the system.  This will make the system as a whole more flexible due to the
36 * ability to move memory to subsystems which most need it instead of leaving
37 * pools of reserved memory unused.
38 *
39 * The basic ideas stem from similar slab/zone based allocators whose algorithms
40 * are well known.
41 *
42 */
43
44/*
45 * TODO:
46 *	- Improve memory usage for large allocations
47 *	- Investigate cache size adjustments
48 */
49
50#include <sys/cdefs.h>
51__FBSDID("$FreeBSD: stable/11/sys/vm/uma_core.c 313127 2017-02-03 01:19:48Z markj $");
52
53/* I should really use ktr.. */
54/*
55#define UMA_DEBUG 1
56#define UMA_DEBUG_ALLOC 1
57#define UMA_DEBUG_ALLOC_1 1
58*/
59
60#include "opt_ddb.h"
61#include "opt_param.h"
62#include "opt_vm.h"
63
64#include <sys/param.h>
65#include <sys/systm.h>
66#include <sys/bitset.h>
67#include <sys/kernel.h>
68#include <sys/types.h>
69#include <sys/queue.h>
70#include <sys/malloc.h>
71#include <sys/ktr.h>
72#include <sys/lock.h>
73#include <sys/sysctl.h>
74#include <sys/mutex.h>
75#include <sys/proc.h>
76#include <sys/random.h>
77#include <sys/rwlock.h>
78#include <sys/sbuf.h>
79#include <sys/sched.h>
80#include <sys/smp.h>
81#include <sys/taskqueue.h>
82#include <sys/vmmeter.h>
83
84#include <vm/vm.h>
85#include <vm/vm_object.h>
86#include <vm/vm_page.h>
87#include <vm/vm_pageout.h>
88#include <vm/vm_param.h>
89#include <vm/vm_map.h>
90#include <vm/vm_kern.h>
91#include <vm/vm_extern.h>
92#include <vm/uma.h>
93#include <vm/uma_int.h>
94#include <vm/uma_dbg.h>
95
96#include <ddb/ddb.h>
97
98#ifdef DEBUG_MEMGUARD
99#include <vm/memguard.h>
100#endif
101
102/*
103 * This is the zone and keg from which all zones are spawned.  The idea is that
104 * even the zone & keg heads are allocated from the allocator, so we use the
105 * bss section to bootstrap us.
106 */
107static struct uma_keg masterkeg;
108static struct uma_zone masterzone_k;
109static struct uma_zone masterzone_z;
110static uma_zone_t kegs = &masterzone_k;
111static uma_zone_t zones = &masterzone_z;
112
113/* This is the zone from which all of uma_slab_t's are allocated. */
114static uma_zone_t slabzone;
115
116/*
117 * The initial hash tables come out of this zone so they can be allocated
118 * prior to malloc coming up.
119 */
120static uma_zone_t hashzone;
121
122/* The boot-time adjusted value for cache line alignment. */
123int uma_align_cache = 64 - 1;
124
125static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
126
127/*
128 * Are we allowed to allocate buckets?
129 */
130static int bucketdisable = 1;
131
132/* Linked list of all kegs in the system */
133static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
134
135/* Linked list of all cache-only zones in the system */
136static LIST_HEAD(,uma_zone) uma_cachezones =
137    LIST_HEAD_INITIALIZER(uma_cachezones);
138
139/* This RW lock protects the keg list */
140static struct rwlock_padalign uma_rwlock;
141
142/* Linked list of boot time pages */
143static LIST_HEAD(,uma_slab) uma_boot_pages =
144    LIST_HEAD_INITIALIZER(uma_boot_pages);
145
146/* This mutex protects the boot time pages list */
147static struct mtx_padalign uma_boot_pages_mtx;
148
149static struct sx uma_drain_lock;
150
151/* Is the VM done starting up? */
152static int booted = 0;
153#define	UMA_STARTUP	1
154#define	UMA_STARTUP2	2
155
156/*
157 * This is the handle used to schedule events that need to happen
158 * outside of the allocation fast path.
159 */
160static struct callout uma_callout;
161#define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
162
163/*
164 * This structure is passed as the zone ctor arg so that I don't have to create
165 * a special allocation function just for zones.
166 */
167struct uma_zctor_args {
168	const char *name;
169	size_t size;
170	uma_ctor ctor;
171	uma_dtor dtor;
172	uma_init uminit;
173	uma_fini fini;
174	uma_import import;
175	uma_release release;
176	void *arg;
177	uma_keg_t keg;
178	int align;
179	uint32_t flags;
180};
181
182struct uma_kctor_args {
183	uma_zone_t zone;
184	size_t size;
185	uma_init uminit;
186	uma_fini fini;
187	int align;
188	uint32_t flags;
189};
190
191struct uma_bucket_zone {
192	uma_zone_t	ubz_zone;
193	char		*ubz_name;
194	int		ubz_entries;	/* Number of items it can hold. */
195	int		ubz_maxsize;	/* Maximum allocation size per-item. */
196};
197
198/*
199 * Compute the actual number of bucket entries that fit once the bucket is
200 * sized to a power of two, for more efficient space utilization.
201 */
202#define	BUCKET_SIZE(n)						\
203    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
204
205#define	BUCKET_MAX	BUCKET_SIZE(256)
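/*
 * Illustrative note (not part of the original source): BUCKET_SIZE(n)
 * budgets n pointers' worth of space for the whole bucket (header plus item
 * array) and returns how many item slots remain once the header is paid for.
 * For example, assuming an LP64 pointer size of 8 bytes:
 *
 *	BUCKET_SIZE(16) == ((8 * 16) - sizeof(struct uma_bucket)) / 8
 *
 * so a "16 Bucket" holds somewhat fewer than 16 items; the exact count
 * depends on sizeof(struct uma_bucket).
 */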
206
207struct uma_bucket_zone bucket_zones[] = {
208	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
209	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
210	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
211	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
212	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
213	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
214	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
215	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
216	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
217	{ NULL, NULL, 0}
218};
219
220/*
221 * Flags and enumerations to be passed to internal functions.
222 */
223enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
224
225/* Prototypes.. */
226
227static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
228static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
229static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
230static void page_free(void *, vm_size_t, uint8_t);
231static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
232static void cache_drain(uma_zone_t);
233static void bucket_drain(uma_zone_t, uma_bucket_t);
234static void bucket_cache_drain(uma_zone_t zone);
235static int keg_ctor(void *, int, void *, int);
236static void keg_dtor(void *, int, void *);
237static int zone_ctor(void *, int, void *, int);
238static void zone_dtor(void *, int, void *);
239static int zero_init(void *, int, int);
240static void keg_small_init(uma_keg_t keg);
241static void keg_large_init(uma_keg_t keg);
242static void zone_foreach(void (*zfunc)(uma_zone_t));
243static void zone_timeout(uma_zone_t zone);
244static int hash_alloc(struct uma_hash *);
245static int hash_expand(struct uma_hash *, struct uma_hash *);
246static void hash_free(struct uma_hash *hash);
247static void uma_timeout(void *);
248static void uma_startup3(void);
249static void *zone_alloc_item(uma_zone_t, void *, int);
250static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
251static void bucket_enable(void);
252static void bucket_init(void);
253static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
254static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
255static void bucket_zone_drain(void);
256static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
257static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
258static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
259static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
260static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
261static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
262    uma_fini fini, int align, uint32_t flags);
263static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
264static void zone_release(uma_zone_t zone, void **bucket, int cnt);
265static void uma_zero_item(void *item, uma_zone_t zone);
266
267void uma_print_zone(uma_zone_t);
268void uma_print_stats(void);
269static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
270static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
271
272#ifdef INVARIANTS
273static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
274static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
275#endif
276
277SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
278
279SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
280    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
281
282SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
283    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
284
285static int zone_warnings = 1;
286SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
287    "Warn when UMA zones become full");
288
289/*
290 * This routine checks to see whether or not it's safe to enable buckets.
291 */
292static void
293bucket_enable(void)
294{
295	bucketdisable = vm_page_count_min();
296}
297
298/*
299 * Initialize bucket_zones, the array of zones of buckets of various sizes.
300 *
301 * For each zone, calculate the memory required for each bucket, consisting
302 * of the header and an array of pointers.
303 */
304static void
305bucket_init(void)
306{
307	struct uma_bucket_zone *ubz;
308	int size;
309
310	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
311		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
312		size += sizeof(void *) * ubz->ubz_entries;
313		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
314		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
315		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
316	}
317}
318
319/*
320 * Given a desired number of entries for a bucket, return the zone from which
321 * to allocate the bucket.
322 */
323static struct uma_bucket_zone *
324bucket_zone_lookup(int entries)
325{
326	struct uma_bucket_zone *ubz;
327
328	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
329		if (ubz->ubz_entries >= entries)
330			return (ubz);
331	ubz--;
332	return (ubz);
333}
334
335static int
336bucket_select(int size)
337{
338	struct uma_bucket_zone *ubz;
339
340	ubz = &bucket_zones[0];
341	if (size > ubz->ubz_maxsize)
342		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
343
344	for (; ubz->ubz_entries != 0; ubz++)
345		if (ubz->ubz_maxsize < size)
346			break;
347	ubz--;
348	return (ubz->ubz_entries);
349}
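/*
 * Illustrative notes (not part of the original source):
 *
 * bucket_zone_lookup(entries) returns the first bucket_zones[] entry whose
 * ubz_entries is at least 'entries', falling back to the largest
 * ("256 Bucket") zone when the request exceeds every table entry.
 *
 * bucket_select(size) works from the per-item size limits instead: assuming
 * the table above, an item size of 600 bytes falls between the 1024-byte and
 * 512-byte limits, so the "16 Bucket" entry count is chosen; item sizes above
 * the largest limit (4096 bytes) get a proportionally scaled-down count.
 */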
350
351static uma_bucket_t
352bucket_alloc(uma_zone_t zone, void *udata, int flags)
353{
354	struct uma_bucket_zone *ubz;
355	uma_bucket_t bucket;
356
357	/*
358	 * This is to stop us from allocating per cpu buckets while we're
359	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
360	 * boot pages.  This also prevents us from allocating buckets in
361	 * low memory situations.
362	 */
363	if (bucketdisable)
364		return (NULL);
365	/*
366	 * To limit bucket recursion we store the original zone flags
367	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
368	 * NOVM flag to persist even through deep recursions.  We also
369	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
370	 * a bucket for a bucket zone so we do not allow infinite bucket
371	 * recursion.  This cookie will even persist to frees of unused
372	 * buckets via the allocation path or bucket allocations in the
373	 * free path.
374	 */
375	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
376		udata = (void *)(uintptr_t)zone->uz_flags;
377	else {
378		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
379			return (NULL);
380		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
381	}
382	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
383		flags |= M_NOVM;
384	ubz = bucket_zone_lookup(zone->uz_count);
385	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
386		ubz++;
387	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
388	if (bucket) {
389#ifdef INVARIANTS
390		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
391#endif
392		bucket->ub_cnt = 0;
393		bucket->ub_entries = ubz->ubz_entries;
394	}
395
396	return (bucket);
397}
398
399static void
400bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
401{
402	struct uma_bucket_zone *ubz;
403
404	KASSERT(bucket->ub_cnt == 0,
405	    ("bucket_free: Freeing a non free bucket."));
406	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
407		udata = (void *)(uintptr_t)zone->uz_flags;
408	ubz = bucket_zone_lookup(bucket->ub_entries);
409	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
410}
411
412static void
413bucket_zone_drain(void)
414{
415	struct uma_bucket_zone *ubz;
416
417	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
418		zone_drain(ubz->ubz_zone);
419}
420
421static void
422zone_log_warning(uma_zone_t zone)
423{
424	static const struct timeval warninterval = { 300, 0 };
425
426	if (!zone_warnings || zone->uz_warning == NULL)
427		return;
428
429	if (ratecheck(&zone->uz_ratecheck, &warninterval))
430		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
431}
432
433static inline void
434zone_maxaction(uma_zone_t zone)
435{
436
437	if (zone->uz_maxaction.ta_func != NULL)
438		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
439}
440
441static void
442zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
443{
444	uma_klink_t klink;
445
446	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
447		kegfn(klink->kl_keg);
448}
449
450/*
451 * Routine called by the timeout mechanism to fire off time-interval-based
452 * calculations (stats, hash size, etc.).
453 *
454 * Arguments:
455 *	arg   Unused
456 *
457 * Returns:
458 *	Nothing
459 */
460static void
461uma_timeout(void *unused)
462{
463	bucket_enable();
464	zone_foreach(zone_timeout);
465
466	/* Reschedule this event */
467	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
468}
469
470/*
471 * Routine to perform timeout driven calculations.  This expands the
472 * hashes and does per cpu statistics aggregation.
473 *
474 *  Returns nothing.
475 */
476static void
477keg_timeout(uma_keg_t keg)
478{
479
480	KEG_LOCK(keg);
481	/*
482	 * Expand the keg hash table.
483	 *
484	 * This is done if the number of slabs is larger than the hash size.
485	 * What I'm trying to do here is eliminate collisions entirely.  This
486	 * may be a little aggressive.  Should I allow for two collisions max?
487	 */
488	if (keg->uk_flags & UMA_ZONE_HASH &&
489	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
490		struct uma_hash newhash;
491		struct uma_hash oldhash;
492		int ret;
493
494		/*
495		 * This is so involved because allocating and freeing
496		 * while the keg lock is held will lead to deadlock.
497		 * I have to do everything in stages and check for
498		 * races.
499		 */
500		newhash = keg->uk_hash;
501		KEG_UNLOCK(keg);
502		ret = hash_alloc(&newhash);
503		KEG_LOCK(keg);
504		if (ret) {
505			if (hash_expand(&keg->uk_hash, &newhash)) {
506				oldhash = keg->uk_hash;
507				keg->uk_hash = newhash;
508			} else
509				oldhash = newhash;
510
511			KEG_UNLOCK(keg);
512			hash_free(&oldhash);
513			return;
514		}
515	}
516	KEG_UNLOCK(keg);
517}
518
519static void
520zone_timeout(uma_zone_t zone)
521{
522
523	zone_foreach_keg(zone, &keg_timeout);
524}
525
526/*
527 * Allocate and zero fill the next sized hash table from the appropriate
528 * backing store.
529 *
530 * Arguments:
531 *	hash  A new hash structure with the old hash size in uh_hashsize
532 *
533 * Returns:
534 *	1 on success and 0 on failure.
535 */
536static int
537hash_alloc(struct uma_hash *hash)
538{
539	int oldsize;
540	int alloc;
541
542	oldsize = hash->uh_hashsize;
543
544	/* We're just going to go to a power of two greater */
545	/* We're just going to go to the next greater power of two. */
546		hash->uh_hashsize = oldsize * 2;
547		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
548		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
549		    M_UMAHASH, M_NOWAIT);
550	} else {
551		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
552		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
553		    M_WAITOK);
554		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
555	}
556	if (hash->uh_slab_hash) {
557		bzero(hash->uh_slab_hash, alloc);
558		hash->uh_hashmask = hash->uh_hashsize - 1;
559		return (1);
560	}
561
562	return (0);
563}
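/*
 * Growth pattern (descriptive note): the first hash_alloc() for a keg is
 * satisfied from hashzone with UMA_HASH_SIZE_INIT buckets; every later call
 * doubles uh_hashsize, so the table grows UMA_HASH_SIZE_INIT, 2x, 4x, ...,
 * with uh_hashmask kept at uh_hashsize - 1 for cheap index masking.
 */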
564
565/*
566 * Expands the hash table for HASH zones.  This is done from zone_timeout
567 * to reduce collisions.  This must not be done in the regular allocation
568 * path; otherwise, we can recurse on the VM while allocating pages.
569 *
570 * Arguments:
571 *	oldhash  The hash you want to expand
572 *	newhash  The hash structure for the new table
573 *
574 * Returns:
575 *	Nothing
576 *
577 * Discussion:
578 */
579static int
580hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
581{
582	uma_slab_t slab;
583	int hval;
584	int i;
585
586	if (!newhash->uh_slab_hash)
587		return (0);
588
589	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
590		return (0);
591
592	/*
593	 * I need to investigate hash algorithms for resizing without a
594	 * full rehash.
595	 */
596
597	for (i = 0; i < oldhash->uh_hashsize; i++)
598		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
599			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
600			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
601			hval = UMA_HASH(newhash, slab->us_data);
602			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
603			    slab, us_hlink);
604		}
605
606	return (1);
607}
608
609/*
610 * Free the hash bucket to the appropriate backing store.
611 *
612 * Arguments:
613 *	hash  The hash structure whose uh_slab_hash array is being freed
615 *
616 * Returns:
617 *	Nothing
618 */
619static void
620hash_free(struct uma_hash *hash)
621{
622	if (hash->uh_slab_hash == NULL)
623		return;
624	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
625		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
626	else
627		free(hash->uh_slab_hash, M_UMAHASH);
628}
629
630/*
631 * Frees all outstanding items in a bucket
632 *
633 * Arguments:
634 *	zone   The zone to free to, must be unlocked.
635 *	bucket The free/alloc bucket with items, cpu queue must be locked.
636 *
637 * Returns:
638 *	Nothing
639 */
640
641static void
642bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
643{
644	int i;
645
646	if (bucket == NULL)
647		return;
648
649	if (zone->uz_fini)
650		for (i = 0; i < bucket->ub_cnt; i++)
651			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
652	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
653	bucket->ub_cnt = 0;
654}
655
656/*
657 * Drains the per cpu caches for a zone.
658 *
659 * NOTE: This may only be called while the zone is being torn down, and not
660 * during normal operation.  This is necessary in order that we do not have
661 * to migrate CPUs to drain the per-CPU caches.
662 *
663 * Arguments:
664 *	zone     The zone to drain, must be unlocked.
665 *
666 * Returns:
667 *	Nothing
668 */
669static void
670cache_drain(uma_zone_t zone)
671{
672	uma_cache_t cache;
673	int cpu;
674
675	/*
676	 * XXX: It is safe to not lock the per-CPU caches, because we're
677	 * tearing down the zone anyway.  I.e., there will be no further use
678	 * of the caches at this point.
679	 *
680	 * XXX: It would be good to be able to assert that the zone is being
681	 * torn down to prevent improper use of cache_drain().
682	 *
683	 * XXX: We lock the zone before passing into bucket_cache_drain() as
684	 * it is used elsewhere.  Should the tear-down path be made special
685	 * there in some form?
686	 */
687	CPU_FOREACH(cpu) {
688		cache = &zone->uz_cpu[cpu];
689		bucket_drain(zone, cache->uc_allocbucket);
690		bucket_drain(zone, cache->uc_freebucket);
691		if (cache->uc_allocbucket != NULL)
692			bucket_free(zone, cache->uc_allocbucket, NULL);
693		if (cache->uc_freebucket != NULL)
694			bucket_free(zone, cache->uc_freebucket, NULL);
695		cache->uc_allocbucket = cache->uc_freebucket = NULL;
696	}
697	ZONE_LOCK(zone);
698	bucket_cache_drain(zone);
699	ZONE_UNLOCK(zone);
700}
701
702static void
703cache_shrink(uma_zone_t zone)
704{
705
706	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
707		return;
708
709	ZONE_LOCK(zone);
710	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
711	ZONE_UNLOCK(zone);
712}
713
714static void
715cache_drain_safe_cpu(uma_zone_t zone)
716{
717	uma_cache_t cache;
718	uma_bucket_t b1, b2;
719
720	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
721		return;
722
723	b1 = b2 = NULL;
724	ZONE_LOCK(zone);
725	critical_enter();
726	cache = &zone->uz_cpu[curcpu];
727	if (cache->uc_allocbucket) {
728		if (cache->uc_allocbucket->ub_cnt != 0)
729			LIST_INSERT_HEAD(&zone->uz_buckets,
730			    cache->uc_allocbucket, ub_link);
731		else
732			b1 = cache->uc_allocbucket;
733		cache->uc_allocbucket = NULL;
734	}
735	if (cache->uc_freebucket) {
736		if (cache->uc_freebucket->ub_cnt != 0)
737			LIST_INSERT_HEAD(&zone->uz_buckets,
738			    cache->uc_freebucket, ub_link);
739		else
740			b2 = cache->uc_freebucket;
741		cache->uc_freebucket = NULL;
742	}
743	critical_exit();
744	ZONE_UNLOCK(zone);
745	if (b1)
746		bucket_free(zone, b1, NULL);
747	if (b2)
748		bucket_free(zone, b2, NULL);
749}
750
751/*
752 * Safely drain per-CPU caches of a zone (or all zones) into the zone bucket list.
753 * This is an expensive call because it needs to bind to all CPUs
754 * one by one and enter a critical section on each of them in order
755 * to safely access their cache buckets.
756 * The zone lock must not be held when calling this function.
757 */
758static void
759cache_drain_safe(uma_zone_t zone)
760{
761	int cpu;
762
763	/*
764	 * Polite bucket size shrinking was not enough; shrink aggressively.
765	 */
766	if (zone)
767		cache_shrink(zone);
768	else
769		zone_foreach(cache_shrink);
770
771	CPU_FOREACH(cpu) {
772		thread_lock(curthread);
773		sched_bind(curthread, cpu);
774		thread_unlock(curthread);
775
776		if (zone)
777			cache_drain_safe_cpu(zone);
778		else
779			zone_foreach(cache_drain_safe_cpu);
780	}
781	thread_lock(curthread);
782	sched_unbind(curthread);
783	thread_unlock(curthread);
784}
785
786/*
787 * Drain the cached buckets from a zone.  Expects a locked zone on entry.
788 */
789static void
790bucket_cache_drain(uma_zone_t zone)
791{
792	uma_bucket_t bucket;
793
794	/*
795	 * Drain the bucket queues and free the buckets; we keep just two per
796	 * CPU (alloc/free).
797	 */
798	while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
799		LIST_REMOVE(bucket, ub_link);
800		ZONE_UNLOCK(zone);
801		bucket_drain(zone, bucket);
802		bucket_free(zone, bucket, NULL);
803		ZONE_LOCK(zone);
804	}
805
806	/*
807	 * Shrink bucket sizes further.  The price of a single zone lock collision
808	 * is probably lower than the price of a global cache drain.
809	 */
810	if (zone->uz_count > zone->uz_count_min)
811		zone->uz_count--;
812}
813
814static void
815keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
816{
817	uint8_t *mem;
818	int i;
819	uint8_t flags;
820
821	mem = slab->us_data;
822	flags = slab->us_flags;
823	i = start;
824	if (keg->uk_fini != NULL) {
825		for (i--; i > -1; i--)
826			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
827			    keg->uk_size);
828	}
829	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
830		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
831#ifdef UMA_DEBUG
832	printf("%s: Returning %d bytes.\n", keg->uk_name,
833	    PAGE_SIZE * keg->uk_ppera);
834#endif
835	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
836}
837
838/*
839 * Frees pages from a keg back to the system.  This is done on demand from
840 * the pageout daemon.
841 *
842 * Returns nothing.
843 */
844static void
845keg_drain(uma_keg_t keg)
846{
847	struct slabhead freeslabs = { 0 };
848	uma_slab_t slab, tmp;
849
850	/*
851	 * We don't want to take pages from statically allocated kegs at this
852	 * time
853	 */
854	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
855		return;
856
857#ifdef UMA_DEBUG
858	printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
859#endif
860	KEG_LOCK(keg);
861	if (keg->uk_free == 0)
862		goto finished;
863
864	LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) {
865		/* We have nowhere to free these to. */
866		if (slab->us_flags & UMA_SLAB_BOOT)
867			continue;
868
869		LIST_REMOVE(slab, us_link);
870		keg->uk_pages -= keg->uk_ppera;
871		keg->uk_free -= keg->uk_ipers;
872
873		if (keg->uk_flags & UMA_ZONE_HASH)
874			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
875
876		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
877	}
878finished:
879	KEG_UNLOCK(keg);
880
881	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
882		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
883		keg_free_slab(keg, slab, keg->uk_ipers);
884	}
885}
886
887static void
888zone_drain_wait(uma_zone_t zone, int waitok)
889{
890
891	/*
892	 * Set draining to interlock with zone_dtor() so we can release our
893	 * locks as we go.  Only dtor() should do a WAITOK call since it
894	 * is the only call that knows the structure will still be available
895	 * when it wakes up.
896	 */
897	ZONE_LOCK(zone);
898	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
899		if (waitok == M_NOWAIT)
900			goto out;
901		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
902	}
903	zone->uz_flags |= UMA_ZFLAG_DRAINING;
904	bucket_cache_drain(zone);
905	ZONE_UNLOCK(zone);
906	/*
907	 * The DRAINING flag protects us from being freed while
908	 * we're running.  Normally the uma_rwlock would protect us but we
909	 * must be able to release and acquire the right lock for each keg.
910	 */
911	zone_foreach_keg(zone, &keg_drain);
912	ZONE_LOCK(zone);
913	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
914	wakeup(zone);
915out:
916	ZONE_UNLOCK(zone);
917}
918
919void
920zone_drain(uma_zone_t zone)
921{
922
923	zone_drain_wait(zone, M_NOWAIT);
924}
925
926/*
927 * Allocate a new slab for a keg.  This does not insert the slab onto a list.
928 *
929 * Arguments:
930 *	wait  Shall we wait?
931 *
932 * Returns:
933 *	The slab that was allocated or NULL if there is no memory and the
934 *	caller specified M_NOWAIT.
935 */
936static uma_slab_t
937keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
938{
939	uma_alloc allocf;
940	uma_slab_t slab;
941	uint8_t *mem;
942	uint8_t flags;
943	int i;
944
945	mtx_assert(&keg->uk_lock, MA_OWNED);
946	slab = NULL;
947	mem = NULL;
948
949#ifdef UMA_DEBUG
950	printf("alloc_slab:  Allocating a new slab for %s\n", keg->uk_name);
951#endif
952	allocf = keg->uk_allocf;
953	KEG_UNLOCK(keg);
954
955	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
956		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
957		if (slab == NULL)
958			goto out;
959	}
960
961	/*
962	 * This reproduces the old vm_zone behavior of zero filling pages the
963	 * first time they are added to a zone.
964	 *
965	 * Malloced items are zeroed in uma_zalloc.
966	 */
967
968	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
969		wait |= M_ZERO;
970	else
971		wait &= ~M_ZERO;
972
973	if (keg->uk_flags & UMA_ZONE_NODUMP)
974		wait |= M_NODUMP;
975
976	/* zone is passed for legacy reasons. */
977	mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
978	if (mem == NULL) {
979		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
980			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
981		slab = NULL;
982		goto out;
983	}
984
985	/* Point the slab into the allocated memory */
986	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
987		slab = (uma_slab_t )(mem + keg->uk_pgoff);
988
989	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
990		for (i = 0; i < keg->uk_ppera; i++)
991			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
992
993	slab->us_keg = keg;
994	slab->us_data = mem;
995	slab->us_freecount = keg->uk_ipers;
996	slab->us_flags = flags;
997	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
998#ifdef INVARIANTS
999	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1000#endif
1001
1002	if (keg->uk_init != NULL) {
1003		for (i = 0; i < keg->uk_ipers; i++)
1004			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1005			    keg->uk_size, wait) != 0)
1006				break;
1007		if (i != keg->uk_ipers) {
1008			keg_free_slab(keg, slab, i);
1009			slab = NULL;
1010			goto out;
1011		}
1012	}
1013out:
1014	KEG_LOCK(keg);
1015
1016	if (slab != NULL) {
1017		if (keg->uk_flags & UMA_ZONE_HASH)
1018			UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1019
1020		keg->uk_pages += keg->uk_ppera;
1021		keg->uk_free += keg->uk_ipers;
1022	}
1023
1024	return (slab);
1025}
1026
1027/*
1028 * This function is intended to be used early on in place of page_alloc() so
1029 * that we may use the boot time page cache to satisfy allocations before
1030 * the VM is ready.
1031 */
1032static void *
1033startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
1034{
1035	uma_keg_t keg;
1036	uma_slab_t tmps;
1037	int pages, check_pages;
1038
1039	keg = zone_first_keg(zone);
1040	pages = howmany(bytes, PAGE_SIZE);
1041	check_pages = pages - 1;
1042	KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
1043
1044	/*
1045	 * Check our small startup cache to see if it has pages remaining.
1046	 */
1047	mtx_lock(&uma_boot_pages_mtx);
1048
1049	/* First check if we have enough room. */
1050	tmps = LIST_FIRST(&uma_boot_pages);
1051	while (tmps != NULL && check_pages-- > 0)
1052		tmps = LIST_NEXT(tmps, us_link);
1053	if (tmps != NULL) {
1054		/*
1055		 * It's ok to lose tmps references.  The last one will
1056		 * have tmps->us_data pointing to the start address of
1057		 * "pages" contiguous pages of memory.
1058		 */
1059		while (pages-- > 0) {
1060			tmps = LIST_FIRST(&uma_boot_pages);
1061			LIST_REMOVE(tmps, us_link);
1062		}
1063		mtx_unlock(&uma_boot_pages_mtx);
1064		*pflag = tmps->us_flags;
1065		return (tmps->us_data);
1066	}
1067	mtx_unlock(&uma_boot_pages_mtx);
1068	if (booted < UMA_STARTUP2)
1069		panic("UMA: Increase vm.boot_pages");
1070	/*
1071	 * Now that we've booted, reset these users to their real allocator.
1072	 */
1073#ifdef UMA_MD_SMALL_ALLOC
1074	keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
1075#else
1076	keg->uk_allocf = page_alloc;
1077#endif
1078	return keg->uk_allocf(zone, bytes, pflag, wait);
1079}
1080
1081/*
1082 * Allocates a number of pages from the system
1083 *
1084 * Arguments:
1085 *	bytes  The number of bytes requested
1086 *	wait  Shall we wait?
1087 *
1088 * Returns:
1089 *	A pointer to the alloced memory or possibly
1090 *	NULL if M_NOWAIT is set.
1091 */
1092static void *
1093page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
1094{
1095	void *p;	/* Returned page */
1096
1097	*pflag = UMA_SLAB_KMEM;
1098	p = (void *) kmem_malloc(kmem_arena, bytes, wait);
1099
1100	return (p);
1101}
1102
1103/*
1104 * Allocates a number of pages not belonging to any VM object
1105 *
1106 * Arguments:
1107 *	bytes  The number of bytes requested
1108 *	wait   Shall we wait?
1109 *
1110 * Returns:
1111 *	A pointer to the alloced memory or possibly
1112 *	NULL if M_NOWAIT is set.
1113 */
1114static void *
1115noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
1116{
1117	TAILQ_HEAD(, vm_page) alloctail;
1118	u_long npages;
1119	vm_offset_t retkva, zkva;
1120	vm_page_t p, p_next;
1121	uma_keg_t keg;
1122
1123	TAILQ_INIT(&alloctail);
1124	keg = zone_first_keg(zone);
1125
1126	npages = howmany(bytes, PAGE_SIZE);
1127	while (npages > 0) {
1128		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
1129		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1130		if (p != NULL) {
1131			/*
1132			 * Since the page does not belong to an object, its
1133			 * listq is unused.
1134			 */
1135			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1136			npages--;
1137			continue;
1138		}
1139		if (wait & M_WAITOK) {
1140			VM_WAIT;
1141			continue;
1142		}
1143
1144		/*
1145		 * Page allocation failed, free intermediate pages and
1146		 * exit.
1147		 */
1148		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1149			vm_page_unwire(p, PQ_NONE);
1150			vm_page_free(p);
1151		}
1152		return (NULL);
1153	}
1154	*flags = UMA_SLAB_PRIV;
1155	zkva = keg->uk_kva +
1156	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1157	retkva = zkva;
1158	TAILQ_FOREACH(p, &alloctail, listq) {
1159		pmap_qenter(zkva, &p, 1);
1160		zkva += PAGE_SIZE;
1161	}
1162
1163	return ((void *)retkva);
1164}
1165
1166/*
1167 * Frees a number of pages to the system
1168 *
1169 * Arguments:
1170 *	mem   A pointer to the memory to be freed
1171 *	size  The size of the memory being freed
1172 *	flags The original p->us_flags field
1173 *
1174 * Returns:
1175 *	Nothing
1176 */
1177static void
1178page_free(void *mem, vm_size_t size, uint8_t flags)
1179{
1180	struct vmem *vmem;
1181
1182	if (flags & UMA_SLAB_KMEM)
1183		vmem = kmem_arena;
1184	else if (flags & UMA_SLAB_KERNEL)
1185		vmem = kernel_arena;
1186	else
1187		panic("UMA: page_free used with invalid flags %d", flags);
1188
1189	kmem_free(vmem, (vm_offset_t)mem, size);
1190}
1191
1192/*
1193 * Zero fill initializer
1194 *
1195 * Arguments/Returns follow uma_init specifications
1196 */
1197static int
1198zero_init(void *mem, int size, int flags)
1199{
1200	bzero(mem, size);
1201	return (0);
1202}
1203
1204/*
1205 * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1206 *
1207 * Arguments
1208 *	keg  The zone we should initialize
1209 *
1210 * Returns
1211 *	Nothing
1212 */
1213static void
1214keg_small_init(uma_keg_t keg)
1215{
1216	u_int rsize;
1217	u_int memused;
1218	u_int wastedspace;
1219	u_int shsize;
1220
1221	if (keg->uk_flags & UMA_ZONE_PCPU) {
1222		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1223
1224		keg->uk_slabsize = sizeof(struct pcpu);
1225		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
1226		    PAGE_SIZE);
1227	} else {
1228		keg->uk_slabsize = UMA_SLAB_SIZE;
1229		keg->uk_ppera = 1;
1230	}
1231
1232	/*
1233	 * Calculate the size of each allocation (rsize) according to
1234	 * alignment.  If the requested size is smaller than we have
1235	 * allocation bits for, we round it up.
1236	 */
1237	rsize = keg->uk_size;
1238	if (rsize < keg->uk_slabsize / SLAB_SETSIZE)
1239		rsize = keg->uk_slabsize / SLAB_SETSIZE;
1240	if (rsize & keg->uk_align)
1241		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1242	keg->uk_rsize = rsize;
1243
1244	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1245	    keg->uk_rsize < sizeof(struct pcpu),
1246	    ("%s: size %u too large", __func__, keg->uk_rsize));
1247
1248	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1249		shsize = 0;
1250	else
1251		shsize = sizeof(struct uma_slab);
1252
1253	keg->uk_ipers = (keg->uk_slabsize - shsize) / rsize;
1254	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1255	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1256
1257	memused = keg->uk_ipers * rsize + shsize;
1258	wastedspace = keg->uk_slabsize - memused;
1259
1260	/*
1261	 * We can't do OFFPAGE if we're internal or if we've been
1262	 * asked to not go to the VM for buckets.  If we do this we
1263	 * may end up going to the VM for slabs, which we do not
1264	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1265	 * of UMA_ZONE_VM, which clearly forbids it.
1266	 */
1267	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1268	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1269		return;
1270
1271	/*
1272	 * See if using an OFFPAGE slab will limit our waste.  Only do
1273	 * this if it permits more items per-slab.
1274	 *
1275	 * XXX We could try growing slabsize to limit max waste as well.
1276	 * Historically this was not done because the VM could not
1277	 * efficiently handle contiguous allocations.
1278	 */
1279	if ((wastedspace >= keg->uk_slabsize / UMA_MAX_WASTE) &&
1280	    (keg->uk_ipers < (keg->uk_slabsize / keg->uk_rsize))) {
1281		keg->uk_ipers = keg->uk_slabsize / keg->uk_rsize;
1282		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1283		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1284#ifdef UMA_DEBUG
1285		printf("UMA decided we need offpage slab headers for "
1286		    "keg: %s, calculated wastedspace = %d, "
1287		    "maximum wasted space allowed = %d, "
1288		    "calculated ipers = %d, "
1289		    "new wasted space = %d\n", keg->uk_name, wastedspace,
1290		    keg->uk_slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1291		    keg->uk_slabsize - keg->uk_ipers * keg->uk_rsize);
1292#endif
1293		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1294	}
1295
1296	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1297	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1298		keg->uk_flags |= UMA_ZONE_HASH;
1299}
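/*
 * Worked example (illustrative, with assumed numbers): for a keg with
 * uk_size = 256 on a 4096-byte slab, and assuming the in-line slab header
 * rounds to 96 bytes:
 *
 *	rsize = 256
 *	ipers = (4096 - 96) / 256 = 15
 *	waste = 4096 - (15 * 256 + 96) = 160 bytes
 *
 * The keg only switches to OFFPAGE headers when the waste reaches
 * uk_slabsize / UMA_MAX_WASTE *and* dropping the in-line header would fit
 * more items (here 4096 / 256 = 16).  The real header size and waste
 * threshold come from uma_int.h, so exact numbers will differ.
 */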
1300
1301/*
1302 * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1303 * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1304 * more complicated.
1305 *
1306 * Arguments
1307 *	keg  The keg we should initialize
1308 *
1309 * Returns
1310 *	Nothing
1311 */
1312static void
1313keg_large_init(uma_keg_t keg)
1314{
1315	u_int shsize;
1316
1317	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1318	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1319	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1320	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1321	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1322
1323	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1324	keg->uk_slabsize = keg->uk_ppera * PAGE_SIZE;
1325	keg->uk_ipers = 1;
1326	keg->uk_rsize = keg->uk_size;
1327
1328	/* We can't do OFFPAGE if we're internal, bail out here. */
1329	if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
1330		return;
1331
1332	/* Check whether we have enough space to not do OFFPAGE. */
1333	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
1334		shsize = sizeof(struct uma_slab);
1335		if (shsize & UMA_ALIGN_PTR)
1336			shsize = (shsize & ~UMA_ALIGN_PTR) +
1337			    (UMA_ALIGN_PTR + 1);
1338
1339		if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
1340			keg->uk_flags |= UMA_ZONE_OFFPAGE;
1341	}
1342
1343	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1344	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1345		keg->uk_flags |= UMA_ZONE_HASH;
1346}
1347
1348static void
1349keg_cachespread_init(uma_keg_t keg)
1350{
1351	int alignsize;
1352	int trailer;
1353	int pages;
1354	int rsize;
1355
1356	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1357	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1358
1359	alignsize = keg->uk_align + 1;
1360	rsize = keg->uk_size;
1361	/*
1362	 * We want one item to start on every align boundary in a page.  To
1363	 * do this we will span pages.  We will also extend the item by the
1364	 * size of align if it is an even multiple of align.  Otherwise, it
1365	 * would fall on the same boundary every time.
1366	 */
1367	if (rsize & keg->uk_align)
1368		rsize = (rsize & ~keg->uk_align) + alignsize;
1369	if ((rsize & alignsize) == 0)
1370		rsize += alignsize;
1371	trailer = rsize - keg->uk_size;
1372	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1373	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1374	keg->uk_rsize = rsize;
1375	keg->uk_ppera = pages;
1376	keg->uk_slabsize = UMA_SLAB_SIZE;
1377	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1378	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1379	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1380	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1381	    keg->uk_ipers));
1382}
1383
1384/*
1385 * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1386 * the keg onto the global keg list.
1387 *
1388 * Arguments/Returns follow uma_ctor specifications
1389 *	udata  Actually uma_kctor_args
1390 */
1391static int
1392keg_ctor(void *mem, int size, void *udata, int flags)
1393{
1394	struct uma_kctor_args *arg = udata;
1395	uma_keg_t keg = mem;
1396	uma_zone_t zone;
1397
1398	bzero(keg, size);
1399	keg->uk_size = arg->size;
1400	keg->uk_init = arg->uminit;
1401	keg->uk_fini = arg->fini;
1402	keg->uk_align = arg->align;
1403	keg->uk_free = 0;
1404	keg->uk_reserve = 0;
1405	keg->uk_pages = 0;
1406	keg->uk_flags = arg->flags;
1407	keg->uk_allocf = page_alloc;
1408	keg->uk_freef = page_free;
1409	keg->uk_slabzone = NULL;
1410
1411	/*
1412	 * The master zone is passed to us at keg-creation time.
1413	 */
1414	zone = arg->zone;
1415	keg->uk_name = zone->uz_name;
1416
1417	if (arg->flags & UMA_ZONE_VM)
1418		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1419
1420	if (arg->flags & UMA_ZONE_ZINIT)
1421		keg->uk_init = zero_init;
1422
1423	if (arg->flags & UMA_ZONE_MALLOC)
1424		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1425
1426	if (arg->flags & UMA_ZONE_PCPU)
1427#ifdef SMP
1428		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1429#else
1430		keg->uk_flags &= ~UMA_ZONE_PCPU;
1431#endif
1432
1433	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1434		keg_cachespread_init(keg);
1435	} else {
1436		if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1437			keg_large_init(keg);
1438		else
1439			keg_small_init(keg);
1440	}
1441
1442	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1443		keg->uk_slabzone = slabzone;
1444
1445	/*
1446	 * If we haven't booted yet we need allocations to go through the
1447	 * startup cache until the vm is ready.
1448	 */
1449	if (keg->uk_ppera == 1) {
1450#ifdef UMA_MD_SMALL_ALLOC
1451		keg->uk_allocf = uma_small_alloc;
1452		keg->uk_freef = uma_small_free;
1453
1454		if (booted < UMA_STARTUP)
1455			keg->uk_allocf = startup_alloc;
1456#else
1457		if (booted < UMA_STARTUP2)
1458			keg->uk_allocf = startup_alloc;
1459#endif
1460	} else if (booted < UMA_STARTUP2 &&
1461	    (keg->uk_flags & UMA_ZFLAG_INTERNAL))
1462		keg->uk_allocf = startup_alloc;
1463
1464	/*
1465	 * Initialize keg's lock
1466	 */
1467	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1468
1469	/*
1470	 * If we're putting the slab header in the actual page we need to
1471	 * figure out where in each page it goes.  This calculates a right
1472	 * justified offset into the memory on an ALIGN_PTR boundary.
1473	 */
1474	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1475		u_int totsize;
1476
1477		/* Size of the slab struct and free list */
1478		totsize = sizeof(struct uma_slab);
1479
1480		if (totsize & UMA_ALIGN_PTR)
1481			totsize = (totsize & ~UMA_ALIGN_PTR) +
1482			    (UMA_ALIGN_PTR + 1);
1483		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
1484
1485		/*
1486		 * The only way the following is possible is if with our
1487		 * UMA_ALIGN_PTR adjustments we are now bigger than
1488		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1489		 * mathematically possible for all cases, so we make
1490		 * sure here anyway.
1491		 */
1492		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
1493		if (totsize > PAGE_SIZE * keg->uk_ppera) {
1494			printf("zone %s ipers %d rsize %d size %d\n",
1495			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1496			    keg->uk_size);
1497			panic("UMA slab won't fit.");
1498		}
1499	}
1500
1501	if (keg->uk_flags & UMA_ZONE_HASH)
1502		hash_alloc(&keg->uk_hash);
1503
1504#ifdef UMA_DEBUG
1505	printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
1506	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
1507	    keg->uk_ipers, keg->uk_ppera,
1508	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
1509#endif
1510
1511	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1512
1513	rw_wlock(&uma_rwlock);
1514	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1515	rw_wunlock(&uma_rwlock);
1516	return (0);
1517}
1518
1519/*
1520 * Zone header ctor.  This initializes all fields, locks, etc.
1521 *
1522 * Arguments/Returns follow uma_ctor specifications
1523 *	udata  Actually uma_zctor_args
1524 */
1525static int
1526zone_ctor(void *mem, int size, void *udata, int flags)
1527{
1528	struct uma_zctor_args *arg = udata;
1529	uma_zone_t zone = mem;
1530	uma_zone_t z;
1531	uma_keg_t keg;
1532
1533	bzero(zone, size);
1534	zone->uz_name = arg->name;
1535	zone->uz_ctor = arg->ctor;
1536	zone->uz_dtor = arg->dtor;
1537	zone->uz_slab = zone_fetch_slab;
1538	zone->uz_init = NULL;
1539	zone->uz_fini = NULL;
1540	zone->uz_allocs = 0;
1541	zone->uz_frees = 0;
1542	zone->uz_fails = 0;
1543	zone->uz_sleeps = 0;
1544	zone->uz_count = 0;
1545	zone->uz_count_min = 0;
1546	zone->uz_flags = 0;
1547	zone->uz_warning = NULL;
1548	timevalclear(&zone->uz_ratecheck);
1549	keg = arg->keg;
1550
1551	ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1552
1553	/*
1554	 * This is a pure cache zone, no kegs.
1555	 */
1556	if (arg->import) {
1557		if (arg->flags & UMA_ZONE_VM)
1558			arg->flags |= UMA_ZFLAG_CACHEONLY;
1559		zone->uz_flags = arg->flags;
1560		zone->uz_size = arg->size;
1561		zone->uz_import = arg->import;
1562		zone->uz_release = arg->release;
1563		zone->uz_arg = arg->arg;
1564		zone->uz_lockptr = &zone->uz_lock;
1565		rw_wlock(&uma_rwlock);
1566		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1567		rw_wunlock(&uma_rwlock);
1568		goto out;
1569	}
1570
1571	/*
1572	 * Use the regular zone/keg/slab allocator.
1573	 */
1574	zone->uz_import = (uma_import)zone_import;
1575	zone->uz_release = (uma_release)zone_release;
1576	zone->uz_arg = zone;
1577
1578	if (arg->flags & UMA_ZONE_SECONDARY) {
1579		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1580		zone->uz_init = arg->uminit;
1581		zone->uz_fini = arg->fini;
1582		zone->uz_lockptr = &keg->uk_lock;
1583		zone->uz_flags |= UMA_ZONE_SECONDARY;
1584		rw_wlock(&uma_rwlock);
1585		ZONE_LOCK(zone);
1586		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1587			if (LIST_NEXT(z, uz_link) == NULL) {
1588				LIST_INSERT_AFTER(z, zone, uz_link);
1589				break;
1590			}
1591		}
1592		ZONE_UNLOCK(zone);
1593		rw_wunlock(&uma_rwlock);
1594	} else if (keg == NULL) {
1595		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1596		    arg->align, arg->flags)) == NULL)
1597			return (ENOMEM);
1598	} else {
1599		struct uma_kctor_args karg;
1600		int error;
1601
1602		/* We should only be here from uma_startup() */
1603		karg.size = arg->size;
1604		karg.uminit = arg->uminit;
1605		karg.fini = arg->fini;
1606		karg.align = arg->align;
1607		karg.flags = arg->flags;
1608		karg.zone = zone;
1609		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1610		    flags);
1611		if (error)
1612			return (error);
1613	}
1614
1615	/*
1616	 * Link in the first keg.
1617	 */
1618	zone->uz_klink.kl_keg = keg;
1619	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1620	zone->uz_lockptr = &keg->uk_lock;
1621	zone->uz_size = keg->uk_size;
1622	zone->uz_flags |= (keg->uk_flags &
1623	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1624
1625	/*
1626	 * Some internal zones don't have room allocated for the per cpu
1627	 * caches.  If we're internal, bail out here.
1628	 */
1629	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1630		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1631		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1632		return (0);
1633	}
1634
1635out:
1636	if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
1637		zone->uz_count = bucket_select(zone->uz_size);
1638	else
1639		zone->uz_count = BUCKET_MAX;
1640	zone->uz_count_min = zone->uz_count;
1641
1642	return (0);
1643}
1644
1645/*
1646 * Keg header dtor.  This frees all data, destroys locks, frees the hash
1647 * table and removes the keg from the global list.
1648 *
1649 * Arguments/Returns follow uma_dtor specifications
1650 *	udata  unused
1651 */
1652static void
1653keg_dtor(void *arg, int size, void *udata)
1654{
1655	uma_keg_t keg;
1656
1657	keg = (uma_keg_t)arg;
1658	KEG_LOCK(keg);
1659	if (keg->uk_free != 0) {
1660		printf("Freed UMA keg (%s) was not empty (%d items). "
1661		    " Lost %d pages of memory.\n",
1662		    keg->uk_name ? keg->uk_name : "",
1663		    keg->uk_free, keg->uk_pages);
1664	}
1665	KEG_UNLOCK(keg);
1666
1667	hash_free(&keg->uk_hash);
1668
1669	KEG_LOCK_FINI(keg);
1670}
1671
1672/*
1673 * Zone header dtor.
1674 *
1675 * Arguments/Returns follow uma_dtor specifications
1676 *	udata  unused
1677 */
1678static void
1679zone_dtor(void *arg, int size, void *udata)
1680{
1681	uma_klink_t klink;
1682	uma_zone_t zone;
1683	uma_keg_t keg;
1684
1685	zone = (uma_zone_t)arg;
1686	keg = zone_first_keg(zone);
1687
1688	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1689		cache_drain(zone);
1690
1691	rw_wlock(&uma_rwlock);
1692	LIST_REMOVE(zone, uz_link);
1693	rw_wunlock(&uma_rwlock);
1694	/*
1695	 * XXX there are some races here where
1696	 * the zone can be drained but zone lock
1697	 * released and then refilled before we
1698	 * remove it... we don't care for now
1699	 */
1700	zone_drain_wait(zone, M_WAITOK);
1701	/*
1702	 * Unlink all of our kegs.
1703	 */
1704	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1705		klink->kl_keg = NULL;
1706		LIST_REMOVE(klink, kl_link);
1707		if (klink == &zone->uz_klink)
1708			continue;
1709		free(klink, M_TEMP);
1710	}
1711	/*
1712	 * We only destroy kegs from non secondary zones.
1713	 */
1714	if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1715		rw_wlock(&uma_rwlock);
1716		LIST_REMOVE(keg, uk_link);
1717		rw_wunlock(&uma_rwlock);
1718		zone_free_item(kegs, keg, NULL, SKIP_NONE);
1719	}
1720	ZONE_LOCK_FINI(zone);
1721}
1722
1723/*
1724 * Traverses every zone in the system and calls a callback
1725 *
1726 * Arguments:
1727 *	zfunc  A pointer to a function which accepts a zone
1728 *		as an argument.
1729 *
1730 * Returns:
1731 *	Nothing
1732 */
1733static void
1734zone_foreach(void (*zfunc)(uma_zone_t))
1735{
1736	uma_keg_t keg;
1737	uma_zone_t zone;
1738
1739	rw_rlock(&uma_rwlock);
1740	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1741		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1742			zfunc(zone);
1743	}
1744	rw_runlock(&uma_rwlock);
1745}
1746
1747/* Public functions */
1748/* See uma.h */
1749void
1750uma_startup(void *bootmem, int boot_pages)
1751{
1752	struct uma_zctor_args args;
1753	uma_slab_t slab;
1754	int i;
1755
1756#ifdef UMA_DEBUG
1757	printf("Creating uma keg headers zone and keg.\n");
1758#endif
1759	rw_init(&uma_rwlock, "UMA lock");
1760
1761	/* "manually" create the initial zone */
1762	memset(&args, 0, sizeof(args));
1763	args.name = "UMA Kegs";
1764	args.size = sizeof(struct uma_keg);
1765	args.ctor = keg_ctor;
1766	args.dtor = keg_dtor;
1767	args.uminit = zero_init;
1768	args.fini = NULL;
1769	args.keg = &masterkeg;
1770	args.align = 32 - 1;
1771	args.flags = UMA_ZFLAG_INTERNAL;
1772	/* The initial zone has no per-CPU queues so it's smaller */
1773	zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1774
1775#ifdef UMA_DEBUG
1776	printf("Filling boot free list.\n");
1777#endif
1778	for (i = 0; i < boot_pages; i++) {
1779		slab = (uma_slab_t)((uint8_t *)bootmem + (i * UMA_SLAB_SIZE));
1780		slab->us_data = (uint8_t *)slab;
1781		slab->us_flags = UMA_SLAB_BOOT;
1782		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1783	}
1784	mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1785
1786#ifdef UMA_DEBUG
1787	printf("Creating uma zone headers zone and keg.\n");
1788#endif
1789	args.name = "UMA Zones";
1790	args.size = sizeof(struct uma_zone) +
1791	    (sizeof(struct uma_cache) * (mp_maxid + 1));
1792	args.ctor = zone_ctor;
1793	args.dtor = zone_dtor;
1794	args.uminit = zero_init;
1795	args.fini = NULL;
1796	args.keg = NULL;
1797	args.align = 32 - 1;
1798	args.flags = UMA_ZFLAG_INTERNAL;
1799	/* The initial zone has no per-CPU queues so it's smaller */
1800	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1801
1802#ifdef UMA_DEBUG
1803	printf("Creating slab and hash zones.\n");
1804#endif
1805
1806	/* Now make a zone for slab headers */
1807	slabzone = uma_zcreate("UMA Slabs",
1808				sizeof(struct uma_slab),
1809				NULL, NULL, NULL, NULL,
1810				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1811
1812	hashzone = uma_zcreate("UMA Hash",
1813	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1814	    NULL, NULL, NULL, NULL,
1815	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1816
1817	bucket_init();
1818
1819	booted = UMA_STARTUP;
1820
1821#ifdef UMA_DEBUG
1822	printf("UMA startup complete.\n");
1823#endif
1824}
1825
1826/* see uma.h */
1827void
1828uma_startup2(void)
1829{
1830	booted = UMA_STARTUP2;
1831	bucket_enable();
1832	sx_init(&uma_drain_lock, "umadrain");
1833#ifdef UMA_DEBUG
1834	printf("UMA startup2 complete.\n");
1835#endif
1836}
1837
1838/*
1839 * Initialize our callout handle
1840 *
1841 */
1842
1843static void
1844uma_startup3(void)
1845{
1846#ifdef UMA_DEBUG
1847	printf("Starting callout.\n");
1848#endif
1849	callout_init(&uma_callout, 1);
1850	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1851#ifdef UMA_DEBUG
1852	printf("UMA startup3 complete.\n");
1853#endif
1854}
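/*
 * Boot sequence summary (descriptive note): uma_startup() consumes the
 * boot_pages handed in by the VM layer to seed uma_boot_pages and to create
 * the master keg/zone, slab, hash and bucket zones before kmem works
 * (booted == UMA_STARTUP).  uma_startup2() marks the point where real page
 * allocation is available: it sets UMA_STARTUP2, re-evaluates bucketdisable
 * via bucket_enable() and initializes the drain lock.  uma_startup3() runs
 * from SYSINIT to start the periodic uma_timeout() callout.
 */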
1855
1856static uma_keg_t
1857uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1858		int align, uint32_t flags)
1859{
1860	struct uma_kctor_args args;
1861
1862	args.size = size;
1863	args.uminit = uminit;
1864	args.fini = fini;
1865	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1866	args.flags = flags;
1867	args.zone = zone;
1868	return (zone_alloc_item(kegs, &args, M_WAITOK));
1869}
1870
1871/* See uma.h */
1872void
1873uma_set_align(int align)
1874{
1875
1876	if (align != UMA_ALIGN_CACHE)
1877		uma_align_cache = align;
1878}
1879
1880/* See uma.h */
1881uma_zone_t
1882uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1883		uma_init uminit, uma_fini fini, int align, uint32_t flags)
1884
1885{
1886	struct uma_zctor_args args;
1887	uma_zone_t res;
1888	bool locked;
1889
1890	/* This stuff is essential for the zone ctor */
1891	memset(&args, 0, sizeof(args));
1892	args.name = name;
1893	args.size = size;
1894	args.ctor = ctor;
1895	args.dtor = dtor;
1896	args.uminit = uminit;
1897	args.fini = fini;
1898#ifdef  INVARIANTS
1899	/*
1900	 * If a zone is being created with an empty constructor and
1901	 * destructor, pass UMA constructor/destructor which checks for
1902	 * memory use after free.
1903	 */
1904	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
1905	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
1906		args.ctor = trash_ctor;
1907		args.dtor = trash_dtor;
1908		args.uminit = trash_init;
1909		args.fini = trash_fini;
1910	}
1911#endif
1912	args.align = align;
1913	args.flags = flags;
1914	args.keg = NULL;
1915
1916	if (booted < UMA_STARTUP2) {
1917		locked = false;
1918	} else {
1919		sx_slock(&uma_drain_lock);
1920		locked = true;
1921	}
1922	res = zone_alloc_item(zones, &args, M_WAITOK);
1923	if (locked)
1924		sx_sunlock(&uma_drain_lock);
1925	return (res);
1926}
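/*
 * Usage sketch (illustrative only; the "foo" names are hypothetical and the
 * flags shown are just one plausible combination):
 *
 *	static uma_zone_t foo_zone;
 *	struct foo *fp;
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 *	fp = uma_zalloc(foo_zone, M_WAITOK);
 *	...
 *	uma_zfree(foo_zone, fp);
 *	uma_zdestroy(foo_zone);
 */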
1927
1928/* See uma.h */
1929uma_zone_t
1930uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1931		    uma_init zinit, uma_fini zfini, uma_zone_t master)
1932{
1933	struct uma_zctor_args args;
1934	uma_keg_t keg;
1935	uma_zone_t res;
1936	bool locked;
1937
1938	keg = zone_first_keg(master);
1939	memset(&args, 0, sizeof(args));
1940	args.name = name;
1941	args.size = keg->uk_size;
1942	args.ctor = ctor;
1943	args.dtor = dtor;
1944	args.uminit = zinit;
1945	args.fini = zfini;
1946	args.align = keg->uk_align;
1947	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
1948	args.keg = keg;
1949
1950	if (booted < UMA_STARTUP2) {
1951		locked = false;
1952	} else {
1953		sx_slock(&uma_drain_lock);
1954		locked = true;
1955	}
1956	/* XXX Attaches only one keg of potentially many. */
1957	res = zone_alloc_item(zones, &args, M_WAITOK);
1958	if (locked)
1959		sx_sunlock(&uma_drain_lock);
1960	return (res);
1961}
1962
1963/* See uma.h */
1964uma_zone_t
1965uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
1966		    uma_init zinit, uma_fini zfini, uma_import zimport,
1967		    uma_release zrelease, void *arg, int flags)
1968{
1969	struct uma_zctor_args args;
1970
1971	memset(&args, 0, sizeof(args));
1972	args.name = name;
1973	args.size = size;
1974	args.ctor = ctor;
1975	args.dtor = dtor;
1976	args.uminit = zinit;
1977	args.fini = zfini;
1978	args.import = zimport;
1979	args.release = zrelease;
1980	args.arg = arg;
1981	args.align = 0;
1982	args.flags = flags;
1983
1984	return (zone_alloc_item(zones, &args, M_WAITOK));
1985}
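
/*
 * Illustrative sketch (not part of the original file; all names are
 * hypothetical): a cache zone has no keg, so items are imported and
 * released through the supplied callbacks.  Their shapes mirror the
 * uz_import/uz_release calls made by zone_alloc_bucket() and
 * zone_release() below.
 *
 *	static int  foo_import(void *arg, void **store, int count, int flags);
 *	static void foo_release(void *arg, void **store, int count);
 *
 *	cache_zone = uma_zcache_create("foocache", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, foo_import, foo_release, NULL, 0);
 */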
1986
1987static void
1988zone_lock_pair(uma_zone_t a, uma_zone_t b)
1989{
1990	if (a < b) {
1991		ZONE_LOCK(a);
1992		mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
1993	} else {
1994		ZONE_LOCK(b);
1995		mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
1996	}
1997}
1998
1999static void
2000zone_unlock_pair(uma_zone_t a, uma_zone_t b)
2001{
2002
2003	ZONE_UNLOCK(a);
2004	ZONE_UNLOCK(b);
2005}
2006
2007int
2008uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
2009{
2010	uma_klink_t klink;
2011	uma_klink_t kl;
2012	int error;
2013
2014	error = 0;
2015	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
2016
2017	zone_lock_pair(zone, master);
2018	/*
2019	 * zone must use vtoslab() to resolve objects and must already be
2020	 * a secondary.
2021	 */
2022	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
2023	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
2024		error = EINVAL;
2025		goto out;
2026	}
2027	/*
2028	 * The new master must also use vtoslab().
2029	 */
2030	if ((master->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
2031		error = EINVAL;
2032		goto out;
2033	}
2034
2035	/*
2036	 * The underlying object must be the same size.  rsize
2037	 * may be different.
2038	 */
2039	if (master->uz_size != zone->uz_size) {
2040		error = E2BIG;
2041		goto out;
2042	}
2043	/*
2044	 * Put it at the end of the list.
2045	 */
2046	klink->kl_keg = zone_first_keg(master);
2047	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
2048		if (LIST_NEXT(kl, kl_link) == NULL) {
2049			LIST_INSERT_AFTER(kl, klink, kl_link);
2050			break;
2051		}
2052	}
2053	klink = NULL;
2054	zone->uz_flags |= UMA_ZFLAG_MULTI;
2055	zone->uz_slab = zone_fetch_slab_multi;
2056
2057out:
2058	zone_unlock_pair(zone, master);
2059	if (klink != NULL)
2060		free(klink, M_TEMP);
2061
2062	return (error);
2063}
2064
2065
2066/* See uma.h */
2067void
2068uma_zdestroy(uma_zone_t zone)
2069{
2070
2071	sx_slock(&uma_drain_lock);
2072	zone_free_item(zones, zone, NULL, SKIP_NONE);
2073	sx_sunlock(&uma_drain_lock);
2074}
2075
2076/* See uma.h */
2077void *
2078uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2079{
2080	void *item;
2081	uma_cache_t cache;
2082	uma_bucket_t bucket;
2083	int lockfail;
2084	int cpu;
2085
2086	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2087	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
2088
2089	/* This is the fast path allocation */
2090#ifdef UMA_DEBUG_ALLOC_1
2091	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
2092#endif
2093	CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
2094	    zone->uz_name, flags);
2095
2096	if (flags & M_WAITOK) {
2097		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2098		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2099	}
2100	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2101	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2102
2103#ifdef DEBUG_MEMGUARD
2104	if (memguard_cmp_zone(zone)) {
2105		item = memguard_alloc(zone->uz_size, flags);
2106		if (item != NULL) {
2107			if (zone->uz_init != NULL &&
2108			    zone->uz_init(item, zone->uz_size, flags) != 0)
2109				return (NULL);
2110			if (zone->uz_ctor != NULL &&
2111			    zone->uz_ctor(item, zone->uz_size, udata,
2112			    flags) != 0) {
2113				zone->uz_fini(item, zone->uz_size);
2114				return (NULL);
2115			}
2116			return (item);
2117		}
2118		/* This is unfortunate but should not be fatal. */
2119	}
2120#endif
2121	/*
2122	 * If possible, allocate from the per-CPU cache.  There are two
2123	 * requirements for safe access to the per-CPU cache: (1) the thread
2124	 * accessing the cache must not be preempted or yield during access,
2125	 * and (2) the thread must not migrate CPUs without switching which
2126	 * cache it accesses.  We rely on a critical section to prevent
2127	 * preemption and migration.  We release the critical section in
2128	 * order to acquire the zone mutex if we are unable to allocate from
2129	 * the current cache; when we re-acquire the critical section, we
2130	 * must detect and handle migration if it has occurred.
2131	 */
2132	critical_enter();
2133	cpu = curcpu;
2134	cache = &zone->uz_cpu[cpu];
2135
2136zalloc_start:
2137	bucket = cache->uc_allocbucket;
2138	if (bucket != NULL && bucket->ub_cnt > 0) {
2139		bucket->ub_cnt--;
2140		item = bucket->ub_bucket[bucket->ub_cnt];
2141#ifdef INVARIANTS
2142		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2143#endif
2144		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2145		cache->uc_allocs++;
2146		critical_exit();
2147		if (zone->uz_ctor != NULL &&
2148		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2149			atomic_add_long(&zone->uz_fails, 1);
2150			zone_free_item(zone, item, udata, SKIP_DTOR);
2151			return (NULL);
2152		}
2153#ifdef INVARIANTS
2154		uma_dbg_alloc(zone, NULL, item);
2155#endif
2156		if (flags & M_ZERO)
2157			uma_zero_item(item, zone);
2158		return (item);
2159	}
2160
2161	/*
2162	 * We have run out of items in our alloc bucket.
2163	 * See if we can switch with our free bucket.
2164	 */
2165	bucket = cache->uc_freebucket;
2166	if (bucket != NULL && bucket->ub_cnt > 0) {
2167#ifdef UMA_DEBUG_ALLOC
2168		printf("uma_zalloc: Swapping empty with alloc.\n");
2169#endif
2170		cache->uc_freebucket = cache->uc_allocbucket;
2171		cache->uc_allocbucket = bucket;
2172		goto zalloc_start;
2173	}
2174
2175	/*
2176	 * Discard any empty allocation bucket while we hold no locks.
2177	 */
2178	bucket = cache->uc_allocbucket;
2179	cache->uc_allocbucket = NULL;
2180	critical_exit();
2181	if (bucket != NULL)
2182		bucket_free(zone, bucket, udata);
2183
2184	/* Short-circuit for zones without buckets and low memory. */
2185	if (zone->uz_count == 0 || bucketdisable)
2186		goto zalloc_item;
2187
2188	/*
2189	 * The attempt to retrieve the item from the per-CPU cache has failed,
2190	 * so we must go back to the zone.  This requires the zone lock, so we
2191	 * must drop the critical section, then re-acquire it when we go back
2192	 * to the cache.  Since the critical section is released, we may be
2193	 * preempted or migrate.  As such, make sure not to maintain any
2194	 * thread-local state specific to the cache from prior to releasing
2195	 * the critical section.
2196	 */
2197	lockfail = 0;
2198	if (ZONE_TRYLOCK(zone) == 0) {
2199		/* Record contention to size the buckets. */
2200		ZONE_LOCK(zone);
2201		lockfail = 1;
2202	}
2203	critical_enter();
2204	cpu = curcpu;
2205	cache = &zone->uz_cpu[cpu];
2206
2207	/*
2208	 * Since we have locked the zone we may as well send back our stats.
2209	 */
2210	atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2211	atomic_add_long(&zone->uz_frees, cache->uc_frees);
2212	cache->uc_allocs = 0;
2213	cache->uc_frees = 0;
2214
2215	/* See if we lost the race to fill the cache. */
2216	if (cache->uc_allocbucket != NULL) {
2217		ZONE_UNLOCK(zone);
2218		goto zalloc_start;
2219	}
2220
2221	/*
2222	 * Check the zone's cache of buckets.
2223	 */
2224	if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
2225		KASSERT(bucket->ub_cnt != 0,
2226		    ("uma_zalloc_arg: Returning an empty bucket."));
2227
2228		LIST_REMOVE(bucket, ub_link);
2229		cache->uc_allocbucket = bucket;
2230		ZONE_UNLOCK(zone);
2231		goto zalloc_start;
2232	}
2233	/* We are no longer associated with this CPU. */
2234	critical_exit();
2235
2236	/*
2237	 * We bump the uz count when the cache size is insufficient to
2238	 * handle the working set.
2239	 */
2240	if (lockfail && zone->uz_count < BUCKET_MAX)
2241		zone->uz_count++;
2242	ZONE_UNLOCK(zone);
2243
2244	/*
2245	 * Now let's just fill a bucket and put it on the free list.  If that
2246	 * works we'll restart the allocation from the beginning and it
2247	 * will use the just filled bucket.
2248	 */
2249	bucket = zone_alloc_bucket(zone, udata, flags);
2250	if (bucket != NULL) {
2251		ZONE_LOCK(zone);
2252		critical_enter();
2253		cpu = curcpu;
2254		cache = &zone->uz_cpu[cpu];
2255		/*
2256		 * See if we lost the race or were migrated.  If our CPU's
2257		 * alloc bucket slot is still empty, install the freshly
2258		 * filled bucket there; otherwise hand it back to the zone.
2259		 */
2260		if (cache->uc_allocbucket == NULL)
2261			cache->uc_allocbucket = bucket;
2262		else
2263			LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
2264		ZONE_UNLOCK(zone);
2265		goto zalloc_start;
2266	}
2267
2268	/*
2269	 * We may not be able to get a bucket so return an actual item.
2270	 */
2271#ifdef UMA_DEBUG
2272	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
2273#endif
2274
2275zalloc_item:
2276	item = zone_alloc_item(zone, udata, flags);
2277
2278	return (item);
2279}
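
/*
 * Illustrative sketch (not part of the original file; "foo" names are
 * hypothetical): callers normally reach this function through the
 * uma_zalloc() wrapper declared in uma.h.
 *
 *	struct foo *fp;
 *
 *	fp = uma_zalloc(foo_zone, M_NOWAIT | M_ZERO);
 *	if (fp == NULL)
 *		return (ENOMEM);
 */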
2280
2281static uma_slab_t
2282keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
2283{
2284	uma_slab_t slab;
2285	int reserve;
2286
2287	mtx_assert(&keg->uk_lock, MA_OWNED);
2288	slab = NULL;
2289	reserve = 0;
2290	if ((flags & M_USE_RESERVE) == 0)
2291		reserve = keg->uk_reserve;
2292
2293	for (;;) {
2294		/*
2295		 * Find a slab with some space.  Prefer slabs that are partially
2296		 * used over those that are totally full.  This helps to reduce
2297		 * fragmentation.
2298		 */
2299		if (keg->uk_free > reserve) {
2300			if (!LIST_EMPTY(&keg->uk_part_slab)) {
2301				slab = LIST_FIRST(&keg->uk_part_slab);
2302			} else {
2303				slab = LIST_FIRST(&keg->uk_free_slab);
2304				LIST_REMOVE(slab, us_link);
2305				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
2306				    us_link);
2307			}
2308			MPASS(slab->us_keg == keg);
2309			return (slab);
2310		}
2311
2312		/*
2313		 * M_NOVM means don't ask at all!
2314		 */
2315		if (flags & M_NOVM)
2316			break;
2317
2318		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2319			keg->uk_flags |= UMA_ZFLAG_FULL;
2320			/*
2321			 * If this is not a multi-zone, set the FULL bit.
2322			 * Otherwise zone_fetch_slab_multi() takes care of it.
2323			 */
2324			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2325				zone->uz_flags |= UMA_ZFLAG_FULL;
2326				zone_log_warning(zone);
2327				zone_maxaction(zone);
2328			}
2329			if (flags & M_NOWAIT)
2330				break;
2331			zone->uz_sleeps++;
2332			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2333			continue;
2334		}
2335		slab = keg_alloc_slab(keg, zone, flags);
2336		/*
2337		 * If we got a slab here it's safe to mark it partially used
2338		 * and return.  We assume that the caller is going to remove
2339		 * at least one item.
2340		 */
2341		if (slab) {
2342			MPASS(slab->us_keg == keg);
2343			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2344			return (slab);
2345		}
2346		/*
2347		 * We might not have been able to get a slab but another cpu
2348		 * could have while we were unlocked.  Check again before we
2349		 * fail.
2350		 */
2351		flags |= M_NOVM;
2352	}
2353	return (slab);
2354}
2355
2356static uma_slab_t
2357zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
2358{
2359	uma_slab_t slab;
2360
2361	if (keg == NULL) {
2362		keg = zone_first_keg(zone);
2363		KEG_LOCK(keg);
2364	}
2365
2366	for (;;) {
2367		slab = keg_fetch_slab(keg, zone, flags);
2368		if (slab)
2369			return (slab);
2370		if (flags & (M_NOWAIT | M_NOVM))
2371			break;
2372	}
2373	KEG_UNLOCK(keg);
2374	return (NULL);
2375}
2376
2377/*
2378 * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2379 * with the keg locked.  On NULL no lock is held.
2380 *
2381 * The last pointer is used to seed the search.  It is not required.
2382 */
2383static uma_slab_t
2384zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
2385{
2386	uma_klink_t klink;
2387	uma_slab_t slab;
2388	uma_keg_t keg;
2389	int flags;
2390	int empty;
2391	int full;
2392
2393	/*
2394	 * Don't wait on the first pass.  This will skip limit tests
2395	 * as well.  We don't want to sleep in one keg if another keg
2396	 * can satisfy the request without blocking.
2397	 */
2398	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2399	/*
2400	 * Use the last slab allocated as a hint for where to start
2401	 * the search.
2402	 */
2403	if (last != NULL) {
2404		slab = keg_fetch_slab(last, zone, flags);
2405		if (slab)
2406			return (slab);
2407		KEG_UNLOCK(last);
2408	}
2409	/*
2410	 * Loop until we have a slab in case of transient failures
2411	 * while M_WAITOK is specified.  I'm not sure this is 100%
2412	 * required but we've done it for so long now.
2413	 */
2414	for (;;) {
2415		empty = 0;
2416		full = 0;
2417		/*
2418		 * Search the available kegs for slabs.  Be careful to hold the
2419		 * correct lock while calling into the keg layer.
2420		 */
2421		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2422			keg = klink->kl_keg;
2423			KEG_LOCK(keg);
2424			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2425				slab = keg_fetch_slab(keg, zone, flags);
2426				if (slab)
2427					return (slab);
2428			}
2429			if (keg->uk_flags & UMA_ZFLAG_FULL)
2430				full++;
2431			else
2432				empty++;
2433			KEG_UNLOCK(keg);
2434		}
2435		if (rflags & (M_NOWAIT | M_NOVM))
2436			break;
2437		flags = rflags;
2438		/*
2439		 * All kegs are full.  XXX We can't atomically check all kegs
2440		 * and sleep so just sleep for a short period and retry.
2441		 */
2442		if (full && !empty) {
2443			ZONE_LOCK(zone);
2444			zone->uz_flags |= UMA_ZFLAG_FULL;
2445			zone->uz_sleeps++;
2446			zone_log_warning(zone);
2447			zone_maxaction(zone);
2448			msleep(zone, zone->uz_lockptr, PVM,
2449			    "zonelimit", hz/100);
2450			zone->uz_flags &= ~UMA_ZFLAG_FULL;
2451			ZONE_UNLOCK(zone);
2452			continue;
2453		}
2454	}
2455	return (NULL);
2456}
2457
2458static void *
2459slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2460{
2461	void *item;
2462	uint8_t freei;
2463
2464	MPASS(keg == slab->us_keg);
2465	mtx_assert(&keg->uk_lock, MA_OWNED);
2466
2467	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2468	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2469	item = slab->us_data + (keg->uk_rsize * freei);
2470	slab->us_freecount--;
2471	keg->uk_free--;
2472
2473	/* Move this slab to the full list */
2474	if (slab->us_freecount == 0) {
2475		LIST_REMOVE(slab, us_link);
2476		LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2477	}
2478
2479	return (item);
2480}
2481
2482static int
2483zone_import(uma_zone_t zone, void **bucket, int max, int flags)
2484{
2485	uma_slab_t slab;
2486	uma_keg_t keg;
2487	int i;
2488
2489	slab = NULL;
2490	keg = NULL;
2491	/* Try to keep the buckets totally full */
2492	for (i = 0; i < max; ) {
2493		if ((slab = zone->uz_slab(zone, keg, flags)) == NULL)
2494			break;
2495		keg = slab->us_keg;
2496		while (slab->us_freecount && i < max) {
2497			bucket[i++] = slab_alloc_item(keg, slab);
2498			if (keg->uk_free <= keg->uk_reserve)
2499				break;
2500		}
2501		/* Don't sleep waiting for additional slabs. */
2502		flags &= ~M_WAITOK;
2503		flags |= M_NOWAIT;
2504	}
2505	if (slab != NULL)
2506		KEG_UNLOCK(keg);
2507
2508	return (i);
2509}
2510
2511static uma_bucket_t
2512zone_alloc_bucket(uma_zone_t zone, void *udata, int flags)
2513{
2514	uma_bucket_t bucket;
2515	int max;
2516
2517	/* Don't wait for buckets, preserve caller's NOVM setting. */
2518	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2519	if (bucket == NULL)
2520		return (NULL);
2521
2522	max = MIN(bucket->ub_entries, zone->uz_count);
2523	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2524	    max, flags);
2525
2526	/*
2527	 * Initialize the memory if necessary.
2528	 */
2529	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2530		int i;
2531
2532		for (i = 0; i < bucket->ub_cnt; i++)
2533			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2534			    flags) != 0)
2535				break;
2536		/*
2537		 * If we couldn't initialize the whole bucket, put the
2538		 * rest back onto the freelist.
2539		 */
2540		if (i != bucket->ub_cnt) {
2541			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2542			    bucket->ub_cnt - i);
2543#ifdef INVARIANTS
2544			bzero(&bucket->ub_bucket[i],
2545			    sizeof(void *) * (bucket->ub_cnt - i));
2546#endif
2547			bucket->ub_cnt = i;
2548		}
2549	}
2550
2551	if (bucket->ub_cnt == 0) {
2552		bucket_free(zone, bucket, udata);
2553		atomic_add_long(&zone->uz_fails, 1);
2554		return (NULL);
2555	}
2556
2557	return (bucket);
2558}
2559
2560/*
2561 * Allocates a single item from a zone.
2562 *
2563 * Arguments
2564 *	zone   The zone to alloc for.
2565 *	udata  The data to be passed to the constructor.
2566 *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2567 *
2568 * Returns
2569 *	NULL if there is no memory and M_NOWAIT is set
2570 *	An item if successful
2571 */
2572
2573static void *
2574zone_alloc_item(uma_zone_t zone, void *udata, int flags)
2575{
2576	void *item;
2577
2578	item = NULL;
2579
2580#ifdef UMA_DEBUG_ALLOC
2581	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2582#endif
2583	if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1)
2584		goto fail;
2585	atomic_add_long(&zone->uz_allocs, 1);
2586
2587	/*
2588	 * We have to call both the zone's init (not the keg's init)
2589	 * and the zone's ctor.  This is because the item is going from
2590	 * a keg slab directly to the user, and the user is expecting it
2591	 * to be both zone-init'd as well as zone-ctor'd.
2592	 */
2593	if (zone->uz_init != NULL) {
2594		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2595			zone_free_item(zone, item, udata, SKIP_FINI);
2596			goto fail;
2597		}
2598	}
2599	if (zone->uz_ctor != NULL) {
2600		if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2601			zone_free_item(zone, item, udata, SKIP_DTOR);
2602			goto fail;
2603		}
2604	}
2605#ifdef INVARIANTS
2606	uma_dbg_alloc(zone, NULL, item);
2607#endif
2608	if (flags & M_ZERO)
2609		uma_zero_item(item, zone);
2610
2611	return (item);
2612
2613fail:
2614	atomic_add_long(&zone->uz_fails, 1);
2615	return (NULL);
2616}
2617
2618/* See uma.h */
2619void
2620uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2621{
2622	uma_cache_t cache;
2623	uma_bucket_t bucket;
2624	int lockfail;
2625	int cpu;
2626
2627	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2628	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
2629
2630#ifdef UMA_DEBUG_ALLOC_1
2631	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
2632#endif
2633	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2634	    zone->uz_name);
2635
2636	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2637	    ("uma_zfree_arg: called with spinlock or critical section held"));
2638
2639	/* uma_zfree(..., NULL) does nothing, to match free(9). */
2640	if (item == NULL)
2641		return;
2642#ifdef DEBUG_MEMGUARD
2643	if (is_memguard_addr(item)) {
2644		if (zone->uz_dtor != NULL)
2645			zone->uz_dtor(item, zone->uz_size, udata);
2646		if (zone->uz_fini != NULL)
2647			zone->uz_fini(item, zone->uz_size);
2648		memguard_free(item);
2649		return;
2650	}
2651#endif
2652#ifdef INVARIANTS
2653	if (zone->uz_flags & UMA_ZONE_MALLOC)
2654		uma_dbg_free(zone, udata, item);
2655	else
2656		uma_dbg_free(zone, NULL, item);
2657#endif
2658	if (zone->uz_dtor != NULL)
2659		zone->uz_dtor(item, zone->uz_size, udata);
2660
2661	/*
2662	 * The race here is acceptable.  If we miss it we'll just have to wait
2663	 * a little longer for the limits to be reset.
2664	 */
2665	if (zone->uz_flags & UMA_ZFLAG_FULL)
2666		goto zfree_item;
2667
2668	/*
2669	 * If possible, free to the per-CPU cache.  There are two
2670	 * requirements for safe access to the per-CPU cache: (1) the thread
2671	 * accessing the cache must not be preempted or yield during access,
2672	 * and (2) the thread must not migrate CPUs without switching which
2673	 * cache it accesses.  We rely on a critical section to prevent
2674	 * preemption and migration.  We release the critical section in
2675	 * order to acquire the zone mutex if we are unable to free to the
2676	 * current cache; when we re-acquire the critical section, we must
2677	 * detect and handle migration if it has occurred.
2678	 */
2679zfree_restart:
2680	critical_enter();
2681	cpu = curcpu;
2682	cache = &zone->uz_cpu[cpu];
2683
2684zfree_start:
2685	/*
2686	 * Try to free into the allocbucket first to give LIFO ordering
2687	 * for cache-hot data structures.  Spill over into the freebucket
2688	 * if necessary.  Alloc will swap them if one runs dry.
2689	 */
2690	bucket = cache->uc_allocbucket;
2691	if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
2692		bucket = cache->uc_freebucket;
2693	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2694		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2695		    ("uma_zfree: Freeing to non free bucket index."));
2696		bucket->ub_bucket[bucket->ub_cnt] = item;
2697		bucket->ub_cnt++;
2698		cache->uc_frees++;
2699		critical_exit();
2700		return;
2701	}
2702
2703	/*
2704	 * We must go back to the zone, which requires acquiring the zone lock,
2705	 * which in turn means we must release and re-acquire the critical
2706	 * section.  Since the critical section is released, we may be
2707	 * preempted or migrate.  As such, make sure not to maintain any
2708	 * thread-local state specific to the cache from prior to releasing
2709	 * the critical section.
2710	 */
2711	critical_exit();
2712	if (zone->uz_count == 0 || bucketdisable)
2713		goto zfree_item;
2714
2715	lockfail = 0;
2716	if (ZONE_TRYLOCK(zone) == 0) {
2717		/* Record contention to size the buckets. */
2718		ZONE_LOCK(zone);
2719		lockfail = 1;
2720	}
2721	critical_enter();
2722	cpu = curcpu;
2723	cache = &zone->uz_cpu[cpu];
2724
2725	/*
2726	 * Since we have locked the zone we may as well send back our stats.
2727	 */
2728	atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2729	atomic_add_long(&zone->uz_frees, cache->uc_frees);
2730	cache->uc_allocs = 0;
2731	cache->uc_frees = 0;
2732
2733	bucket = cache->uc_freebucket;
2734	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2735		ZONE_UNLOCK(zone);
2736		goto zfree_start;
2737	}
2738	cache->uc_freebucket = NULL;
2739	/* We are no longer associated with this CPU. */
2740	critical_exit();
2741
2742	/* Can we throw this on the zone full list? */
2743	if (bucket != NULL) {
2744#ifdef UMA_DEBUG_ALLOC
2745		printf("uma_zfree: Putting old bucket on the free list.\n");
2746#endif
2747		/* ub_cnt is pointing to the last free item */
2748		KASSERT(bucket->ub_cnt != 0,
2749		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2750		LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
2751	}
2752
2753	/*
2754	 * We bump the uz count when the cache size is insufficient to
2755	 * handle the working set.
2756	 */
2757	if (lockfail && zone->uz_count < BUCKET_MAX)
2758		zone->uz_count++;
2759	ZONE_UNLOCK(zone);
2760
2761#ifdef UMA_DEBUG_ALLOC
2762	printf("uma_zfree: Allocating new free bucket.\n");
2763#endif
2764	bucket = bucket_alloc(zone, udata, M_NOWAIT);
2765	if (bucket) {
2766		critical_enter();
2767		cpu = curcpu;
2768		cache = &zone->uz_cpu[cpu];
2769		if (cache->uc_freebucket == NULL) {
2770			cache->uc_freebucket = bucket;
2771			goto zfree_start;
2772		}
2773		/*
2774		 * We lost the race, start over.  We have to drop our
2775		 * critical section to free the bucket.
2776		 */
2777		critical_exit();
2778		bucket_free(zone, bucket, udata);
2779		goto zfree_restart;
2780	}
2781
2782	/*
2783	 * If nothing else caught this, we'll just do an internal free.
2784	 */
2785zfree_item:
2786	zone_free_item(zone, item, udata, SKIP_DTOR);
2787
2788	return;
2789}
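
/*
 * Illustrative sketch (not part of the original file): items obtained
 * from uma_zalloc() above are returned through the uma_zfree() wrapper
 * declared in uma.h; the "foo" names are hypothetical.
 *
 *	uma_zfree(foo_zone, fp);
 */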
2790
2791static void
2792slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
2793{
2794	uint8_t freei;
2795
2796	mtx_assert(&keg->uk_lock, MA_OWNED);
2797	MPASS(keg == slab->us_keg);
2798
2799	/* Do we need to remove from any lists? */
2800	if (slab->us_freecount+1 == keg->uk_ipers) {
2801		LIST_REMOVE(slab, us_link);
2802		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2803	} else if (slab->us_freecount == 0) {
2804		LIST_REMOVE(slab, us_link);
2805		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2806	}
2807
2808	/* Slab management. */
2809	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
2810	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
2811	slab->us_freecount++;
2812
2813	/* Keg statistics. */
2814	keg->uk_free++;
2815}
2816
2817static void
2818zone_release(uma_zone_t zone, void **bucket, int cnt)
2819{
2820	void *item;
2821	uma_slab_t slab;
2822	uma_keg_t keg;
2823	uint8_t *mem;
2824	int clearfull;
2825	int i;
2826
2827	clearfull = 0;
2828	keg = zone_first_keg(zone);
2829	KEG_LOCK(keg);
2830	for (i = 0; i < cnt; i++) {
2831		item = bucket[i];
2832		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
2833			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
2834			if (zone->uz_flags & UMA_ZONE_HASH) {
2835				slab = hash_sfind(&keg->uk_hash, mem);
2836			} else {
2837				mem += keg->uk_pgoff;
2838				slab = (uma_slab_t)mem;
2839			}
2840		} else {
2841			slab = vtoslab((vm_offset_t)item);
2842			if (slab->us_keg != keg) {
2843				KEG_UNLOCK(keg);
2844				keg = slab->us_keg;
2845				KEG_LOCK(keg);
2846			}
2847		}
2848		slab_free_item(keg, slab, item);
2849		if (keg->uk_flags & UMA_ZFLAG_FULL) {
2850			if (keg->uk_pages < keg->uk_maxpages) {
2851				keg->uk_flags &= ~UMA_ZFLAG_FULL;
2852				clearfull = 1;
2853			}
2854
2855			/*
2856			 * We can handle one more allocation. Since we're
2857			 * clearing ZFLAG_FULL, wake up all procs blocked
2858			 * on pages. This should be uncommon, so keeping this
2859			 * simple for now (rather than adding count of blocked
2860			 * simple for now (rather than adding a count of
2861			 * blocked threads, etc.).
2862			wakeup(keg);
2863		}
2864	}
2865	KEG_UNLOCK(keg);
2866	if (clearfull) {
2867		ZONE_LOCK(zone);
2868		zone->uz_flags &= ~UMA_ZFLAG_FULL;
2869		wakeup(zone);
2870		ZONE_UNLOCK(zone);
2871	}
2872
2873}
2874
2875/*
2876 * Frees a single item to any zone.
2877 *
2878 * Arguments:
2879 *	zone   The zone to free to
2880 *	item   The item we're freeing
2881 *	udata  User supplied data for the dtor
2882 *	skip   Skip dtors and finis
2883 */
2884static void
2885zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
2886{
2887
2888#ifdef INVARIANTS
2889	if (skip == SKIP_NONE) {
2890		if (zone->uz_flags & UMA_ZONE_MALLOC)
2891			uma_dbg_free(zone, udata, item);
2892		else
2893			uma_dbg_free(zone, NULL, item);
2894	}
2895#endif
2896	if (skip < SKIP_DTOR && zone->uz_dtor)
2897		zone->uz_dtor(item, zone->uz_size, udata);
2898
2899	if (skip < SKIP_FINI && zone->uz_fini)
2900		zone->uz_fini(item, zone->uz_size);
2901
2902	atomic_add_long(&zone->uz_frees, 1);
2903	zone->uz_release(zone->uz_arg, &item, 1);
2904}
2905
2906/* See uma.h */
2907int
2908uma_zone_set_max(uma_zone_t zone, int nitems)
2909{
2910	uma_keg_t keg;
2911
2912	keg = zone_first_keg(zone);
2913	if (keg == NULL)
2914		return (0);
2915	KEG_LOCK(keg);
2916	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
2917	if (keg->uk_maxpages * keg->uk_ipers < nitems)
2918		keg->uk_maxpages += keg->uk_ppera;
2919	nitems = keg->uk_maxpages * keg->uk_ipers;
2920	KEG_UNLOCK(keg);
2921
2922	return (nitems);
2923}
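
/*
 * Worked example (illustrative, with assumed keg parameters): for a keg
 * with uk_ipers = 10 items per slab and uk_ppera = 1 page per slab, a
 * request of nitems = 25 first computes uk_maxpages = (25 / 10) * 1 = 2,
 * then bumps it to 3 because 2 * 10 < 25, and returns the effective
 * limit 3 * 10 = 30 items.
 */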
2924
2925/* See uma.h */
2926int
2927uma_zone_get_max(uma_zone_t zone)
2928{
2929	int nitems;
2930	uma_keg_t keg;
2931
2932	keg = zone_first_keg(zone);
2933	if (keg == NULL)
2934		return (0);
2935	KEG_LOCK(keg);
2936	nitems = keg->uk_maxpages * keg->uk_ipers;
2937	KEG_UNLOCK(keg);
2938
2939	return (nitems);
2940}
2941
2942/* See uma.h */
2943void
2944uma_zone_set_warning(uma_zone_t zone, const char *warning)
2945{
2946
2947	ZONE_LOCK(zone);
2948	zone->uz_warning = warning;
2949	ZONE_UNLOCK(zone);
2950}
2951
2952/* See uma.h */
2953void
2954uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
2955{
2956
2957	ZONE_LOCK(zone);
2958	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
2959	ZONE_UNLOCK(zone);
2960}
2961
2962/* See uma.h */
2963int
2964uma_zone_get_cur(uma_zone_t zone)
2965{
2966	int64_t nitems;
2967	u_int i;
2968
2969	ZONE_LOCK(zone);
2970	nitems = zone->uz_allocs - zone->uz_frees;
2971	CPU_FOREACH(i) {
2972		/*
2973		 * See the comment in sysctl_vm_zone_stats() regarding the
2974		 * safety of accessing the per-cpu caches. With the zone lock
2975		 * held, it is safe, but can potentially result in stale data.
2976		 */
2977		nitems += zone->uz_cpu[i].uc_allocs -
2978		    zone->uz_cpu[i].uc_frees;
2979	}
2980	ZONE_UNLOCK(zone);
2981
2982	return (nitems < 0 ? 0 : nitems);
2983}
2984
2985/* See uma.h */
2986void
2987uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2988{
2989	uma_keg_t keg;
2990
2991	keg = zone_first_keg(zone);
2992	KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
2993	KEG_LOCK(keg);
2994	KASSERT(keg->uk_pages == 0,
2995	    ("uma_zone_set_init on non-empty keg"));
2996	keg->uk_init = uminit;
2997	KEG_UNLOCK(keg);
2998}
2999
3000/* See uma.h */
3001void
3002uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3003{
3004	uma_keg_t keg;
3005
3006	keg = zone_first_keg(zone);
3007	KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
3008	KEG_LOCK(keg);
3009	KASSERT(keg->uk_pages == 0,
3010	    ("uma_zone_set_fini on non-empty keg"));
3011	keg->uk_fini = fini;
3012	KEG_UNLOCK(keg);
3013}
3014
3015/* See uma.h */
3016void
3017uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3018{
3019
3020	ZONE_LOCK(zone);
3021	KASSERT(zone_first_keg(zone)->uk_pages == 0,
3022	    ("uma_zone_set_zinit on non-empty keg"));
3023	zone->uz_init = zinit;
3024	ZONE_UNLOCK(zone);
3025}
3026
3027/* See uma.h */
3028void
3029uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3030{
3031
3032	ZONE_LOCK(zone);
3033	KASSERT(zone_first_keg(zone)->uk_pages == 0,
3034	    ("uma_zone_set_zfini on non-empty keg"));
3035	zone->uz_fini = zfini;
3036	ZONE_UNLOCK(zone);
3037}
3038
3039/* See uma.h */
3040/* XXX uk_freef is not actually used with the zone locked */
3041void
3042uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3043{
3044	uma_keg_t keg;
3045
3046	keg = zone_first_keg(zone);
3047	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3048	KEG_LOCK(keg);
3049	keg->uk_freef = freef;
3050	KEG_UNLOCK(keg);
3051}
3052
3053/* See uma.h */
3054/* XXX uk_allocf is not actually used with the zone locked */
3055void
3056uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3057{
3058	uma_keg_t keg;
3059
3060	keg = zone_first_keg(zone);
3061	KEG_LOCK(keg);
3062	keg->uk_allocf = allocf;
3063	KEG_UNLOCK(keg);
3064}
3065
3066/* See uma.h */
3067void
3068uma_zone_reserve(uma_zone_t zone, int items)
3069{
3070	uma_keg_t keg;
3071
3072	keg = zone_first_keg(zone);
3073	if (keg == NULL)
3074		return;
3075	KEG_LOCK(keg);
3076	keg->uk_reserve = items;
3077	KEG_UNLOCK(keg);
3078
3079	return;
3080}
3081
3082/* See uma.h */
3083int
3084uma_zone_reserve_kva(uma_zone_t zone, int count)
3085{
3086	uma_keg_t keg;
3087	vm_offset_t kva;
3088	u_int pages;
3089
3090	keg = zone_first_keg(zone);
3091	if (keg == NULL)
3092		return (0);
3093	pages = count / keg->uk_ipers;
3094
3095	if (pages * keg->uk_ipers < count)
3096		pages++;
3097
3098#ifdef UMA_MD_SMALL_ALLOC
3099	if (keg->uk_ppera > 1) {
3100#else
3101	if (1) {
3102#endif
3103		kva = kva_alloc((vm_size_t)pages * UMA_SLAB_SIZE);
3104		if (kva == 0)
3105			return (0);
3106	} else
3107		kva = 0;
3108	KEG_LOCK(keg);
3109	keg->uk_kva = kva;
3110	keg->uk_offset = 0;
3111	keg->uk_maxpages = pages;
3112#ifdef UMA_MD_SMALL_ALLOC
3113	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3114#else
3115	keg->uk_allocf = noobj_alloc;
3116#endif
3117	keg->uk_flags |= UMA_ZONE_NOFREE;
3118	KEG_UNLOCK(keg);
3119
3120	return (1);
3121}
3122
3123/* See uma.h */
3124void
3125uma_prealloc(uma_zone_t zone, int items)
3126{
3127	int slabs;
3128	uma_slab_t slab;
3129	uma_keg_t keg;
3130
3131	keg = zone_first_keg(zone);
3132	if (keg == NULL)
3133		return;
3134	KEG_LOCK(keg);
3135	slabs = items / keg->uk_ipers;
3136	if (slabs * keg->uk_ipers < items)
3137		slabs++;
3138	while (slabs > 0) {
3139		slab = keg_alloc_slab(keg, zone, M_WAITOK);
3140		if (slab == NULL)
3141			break;
3142		MPASS(slab->us_keg == keg);
3143		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
3144		slabs--;
3145	}
3146	KEG_UNLOCK(keg);
3147}
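
/*
 * Illustrative sketch (not part of the original file; names are
 * hypothetical): uma_zone_reserve() and uma_prealloc() are commonly
 * paired so that later M_USE_RESERVE allocations can be satisfied from
 * slabs set aside ahead of time.
 *
 *	uma_zone_reserve(foo_zone, 32);
 *	uma_prealloc(foo_zone, 32);
 *	...
 *	fp = uma_zalloc(foo_zone, M_NOWAIT | M_USE_RESERVE);
 */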
3148
3149/* See uma.h */
3150static void
3151uma_reclaim_locked(bool kmem_danger)
3152{
3153
3154#ifdef UMA_DEBUG
3155	printf("UMA: vm asked us to release pages!\n");
3156#endif
3157	sx_assert(&uma_drain_lock, SA_XLOCKED);
3158	bucket_enable();
3159	zone_foreach(zone_drain);
3160	if (vm_page_count_min() || kmem_danger) {
3161		cache_drain_safe(NULL);
3162		zone_foreach(zone_drain);
3163	}
3164	/*
3165	 * The slab zone is drained early in the pass above, but draining the
3166	 * other zones may have freed more of its items, so drain it again to
3167	 * free pages that are now empty.  We have to do the same for buckets.
3168	 */
3169	zone_drain(slabzone);
3170	bucket_zone_drain();
3171}
3172
3173void
3174uma_reclaim(void)
3175{
3176
3177	sx_xlock(&uma_drain_lock);
3178	uma_reclaim_locked(false);
3179	sx_xunlock(&uma_drain_lock);
3180}
3181
3182static int uma_reclaim_needed;
3183
3184void
3185uma_reclaim_wakeup(void)
3186{
3187
3188	uma_reclaim_needed = 1;
3189	wakeup(&uma_reclaim_needed);
3190}
3191
3192void
3193uma_reclaim_worker(void *arg __unused)
3194{
3195
3196	sx_xlock(&uma_drain_lock);
3197	for (;;) {
3198		sx_sleep(&uma_reclaim_needed, &uma_drain_lock, PVM,
3199		    "umarcl", 0);
3200		if (uma_reclaim_needed) {
3201			uma_reclaim_needed = 0;
3202			uma_reclaim_locked(true);
3203		}
3204	}
3205}
3206
3207/* See uma.h */
3208int
3209uma_zone_exhausted(uma_zone_t zone)
3210{
3211	int full;
3212
3213	ZONE_LOCK(zone);
3214	full = (zone->uz_flags & UMA_ZFLAG_FULL);
3215	ZONE_UNLOCK(zone);
3216	return (full);
3217}
3218
3219int
3220uma_zone_exhausted_nolock(uma_zone_t zone)
3221{
3222	return (zone->uz_flags & UMA_ZFLAG_FULL);
3223}
3224
3225void *
3226uma_large_malloc(vm_size_t size, int wait)
3227{
3228	void *mem;
3229	uma_slab_t slab;
3230	uint8_t flags;
3231
3232	slab = zone_alloc_item(slabzone, NULL, wait);
3233	if (slab == NULL)
3234		return (NULL);
3235	mem = page_alloc(NULL, size, &flags, wait);
3236	if (mem) {
3237		vsetslab((vm_offset_t)mem, slab);
3238		slab->us_data = mem;
3239		slab->us_flags = flags | UMA_SLAB_MALLOC;
3240		slab->us_size = size;
3241	} else {
3242		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3243	}
3244
3245	return (mem);
3246}
3247
3248void
3249uma_large_free(uma_slab_t slab)
3250{
3251
3252	page_free(slab->us_data, slab->us_size, slab->us_flags);
3253	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3254}
3255
3256static void
3257uma_zero_item(void *item, uma_zone_t zone)
3258{
3259	int i;
3260
3261	if (zone->uz_flags & UMA_ZONE_PCPU) {
3262		CPU_FOREACH(i)
3263			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
3264	} else
3265		bzero(item, zone->uz_size);
3266}
3267
3268void
3269uma_print_stats(void)
3270{
3271	zone_foreach(uma_print_zone);
3272}
3273
3274static void
3275slab_print(uma_slab_t slab)
3276{
3277	printf("slab: keg %p, data %p, freecount %d\n",
3278		slab->us_keg, slab->us_data, slab->us_freecount);
3279}
3280
3281static void
3282cache_print(uma_cache_t cache)
3283{
3284	printf("alloc: %p(%d), free: %p(%d)\n",
3285		cache->uc_allocbucket,
3286		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3287		cache->uc_freebucket,
3288		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3289}
3290
3291static void
3292uma_print_keg(uma_keg_t keg)
3293{
3294	uma_slab_t slab;
3295
3296	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3297	    "out %d free %d limit %d\n",
3298	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3299	    keg->uk_ipers, keg->uk_ppera,
3300	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
3301	    (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3302	printf("Part slabs:\n");
3303	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
3304		slab_print(slab);
3305	printf("Free slabs:\n");
3306	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
3307		slab_print(slab);
3308	printf("Full slabs:\n");
3309	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
3310		slab_print(slab);
3311}
3312
3313void
3314uma_print_zone(uma_zone_t zone)
3315{
3316	uma_cache_t cache;
3317	uma_klink_t kl;
3318	int i;
3319
3320	printf("zone: %s(%p) size %d flags %#x\n",
3321	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3322	LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3323		uma_print_keg(kl->kl_keg);
3324	CPU_FOREACH(i) {
3325		cache = &zone->uz_cpu[i];
3326		printf("CPU %d Cache:\n", i);
3327		cache_print(cache);
3328	}
3329}
3330
3331#ifdef DDB
3332/*
3333 * Generate statistics across both the zone and its per-cpu caches.  Return
3334 * desired statistics if the pointer is non-NULL for that statistic.
3335 *
3336 * Note: does not update the zone statistics, as it can't safely clear the
3337 * per-CPU cache statistic.
3338 *
3339 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3340 * safe from off-CPU; we should modify the caches to track this information
3341 * directly so that we don't have to.
3342 */
3343static void
3344uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
3345    uint64_t *freesp, uint64_t *sleepsp)
3346{
3347	uma_cache_t cache;
3348	uint64_t allocs, frees, sleeps;
3349	int cachefree, cpu;
3350
3351	allocs = frees = sleeps = 0;
3352	cachefree = 0;
3353	CPU_FOREACH(cpu) {
3354		cache = &z->uz_cpu[cpu];
3355		if (cache->uc_allocbucket != NULL)
3356			cachefree += cache->uc_allocbucket->ub_cnt;
3357		if (cache->uc_freebucket != NULL)
3358			cachefree += cache->uc_freebucket->ub_cnt;
3359		allocs += cache->uc_allocs;
3360		frees += cache->uc_frees;
3361	}
3362	allocs += z->uz_allocs;
3363	frees += z->uz_frees;
3364	sleeps += z->uz_sleeps;
3365	if (cachefreep != NULL)
3366		*cachefreep = cachefree;
3367	if (allocsp != NULL)
3368		*allocsp = allocs;
3369	if (freesp != NULL)
3370		*freesp = frees;
3371	if (sleepsp != NULL)
3372		*sleepsp = sleeps;
3373}
3374#endif /* DDB */
3375
3376static int
3377sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3378{
3379	uma_keg_t kz;
3380	uma_zone_t z;
3381	int count;
3382
3383	count = 0;
3384	rw_rlock(&uma_rwlock);
3385	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3386		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3387			count++;
3388	}
3389	rw_runlock(&uma_rwlock);
3390	return (sysctl_handle_int(oidp, &count, 0, req));
3391}
3392
3393static int
3394sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3395{
3396	struct uma_stream_header ush;
3397	struct uma_type_header uth;
3398	struct uma_percpu_stat ups;
3399	uma_bucket_t bucket;
3400	struct sbuf sbuf;
3401	uma_cache_t cache;
3402	uma_klink_t kl;
3403	uma_keg_t kz;
3404	uma_zone_t z;
3405	uma_keg_t k;
3406	int count, error, i;
3407
3408	error = sysctl_wire_old_buffer(req, 0);
3409	if (error != 0)
3410		return (error);
3411	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3412	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
3413
3414	count = 0;
3415	rw_rlock(&uma_rwlock);
3416	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3417		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3418			count++;
3419	}
3420
3421	/*
3422	 * Insert stream header.
3423	 */
3424	bzero(&ush, sizeof(ush));
3425	ush.ush_version = UMA_STREAM_VERSION;
3426	ush.ush_maxcpus = (mp_maxid + 1);
3427	ush.ush_count = count;
3428	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3429
3430	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3431		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3432			bzero(&uth, sizeof(uth));
3433			ZONE_LOCK(z);
3434			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3435			uth.uth_align = kz->uk_align;
3436			uth.uth_size = kz->uk_size;
3437			uth.uth_rsize = kz->uk_rsize;
3438			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3439				k = kl->kl_keg;
3440				uth.uth_maxpages += k->uk_maxpages;
3441				uth.uth_pages += k->uk_pages;
3442				uth.uth_keg_free += k->uk_free;
3443				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3444				    * k->uk_ipers;
3445			}
3446
3447			/*
3448			 * A zone is secondary if it is not the first entry
3449			 * on the keg's zone list.
3450			 */
3451			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3452			    (LIST_FIRST(&kz->uk_zones) != z))
3453				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3454
3455			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3456				uth.uth_zone_free += bucket->ub_cnt;
3457			uth.uth_allocs = z->uz_allocs;
3458			uth.uth_frees = z->uz_frees;
3459			uth.uth_fails = z->uz_fails;
3460			uth.uth_sleeps = z->uz_sleeps;
3461			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3462			/*
3463			 * While it is not normally safe to access the cache
3464			 * bucket pointers while not on the CPU that owns the
3465			 * cache, we only allow the pointers to be exchanged
3466			 * without the zone lock held, not invalidated, so
3467			 * accept the possible race associated with bucket
3468			 * exchange during monitoring.
3469			 */
3470			for (i = 0; i < (mp_maxid + 1); i++) {
3471				bzero(&ups, sizeof(ups));
3472				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3473					goto skip;
3474				if (CPU_ABSENT(i))
3475					goto skip;
3476				cache = &z->uz_cpu[i];
3477				if (cache->uc_allocbucket != NULL)
3478					ups.ups_cache_free +=
3479					    cache->uc_allocbucket->ub_cnt;
3480				if (cache->uc_freebucket != NULL)
3481					ups.ups_cache_free +=
3482					    cache->uc_freebucket->ub_cnt;
3483				ups.ups_allocs = cache->uc_allocs;
3484				ups.ups_frees = cache->uc_frees;
3485skip:
3486				(void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3487			}
3488			ZONE_UNLOCK(z);
3489		}
3490	}
3491	rw_runlock(&uma_rwlock);
3492	error = sbuf_finish(&sbuf);
3493	sbuf_delete(&sbuf);
3494	return (error);
3495}
3496
3497int
3498sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
3499{
3500	uma_zone_t zone = *(uma_zone_t *)arg1;
3501	int error, max;
3502
3503	max = uma_zone_get_max(zone);
3504	error = sysctl_handle_int(oidp, &max, 0, req);
3505	if (error || !req->newptr)
3506		return (error);
3507
3508	uma_zone_set_max(zone, max);
3509
3510	return (0);
3511}
3512
3513int
3514sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
3515{
3516	uma_zone_t zone = *(uma_zone_t *)arg1;
3517	int cur;
3518
3519	cur = uma_zone_get_cur(zone);
3520	return (sysctl_handle_int(oidp, &cur, 0, req));
3521}
3522
3523#ifdef INVARIANTS
3524static uma_slab_t
3525uma_dbg_getslab(uma_zone_t zone, void *item)
3526{
3527	uma_slab_t slab;
3528	uma_keg_t keg;
3529	uint8_t *mem;
3530
3531	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3532	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
3533		slab = vtoslab((vm_offset_t)mem);
3534	} else {
3535		/*
3536		 * It is safe to return the slab here even though the
3537		 * zone is unlocked because the item's allocation state
3538		 * essentially holds a reference.
3539		 */
3540		ZONE_LOCK(zone);
3541		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
3542		if (keg->uk_flags & UMA_ZONE_HASH)
3543			slab = hash_sfind(&keg->uk_hash, mem);
3544		else
3545			slab = (uma_slab_t)(mem + keg->uk_pgoff);
3546		ZONE_UNLOCK(zone);
3547	}
3548
3549	return (slab);
3550}
3551
3552/*
3553 * Set up the slab's freei data such that uma_dbg_free can function.
3554 *
3555 */
3556static void
3557uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
3558{
3559	uma_keg_t keg;
3560	int freei;
3561
3562	if (zone_first_keg(zone) == NULL)
3563		return;
3564	if (slab == NULL) {
3565		slab = uma_dbg_getslab(zone, item);
3566		if (slab == NULL)
3567			panic("uma: item %p did not belong to zone %s\n",
3568			    item, zone->uz_name);
3569	}
3570	keg = slab->us_keg;
3571	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3572
3573	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
3574		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
3575		    item, zone, zone->uz_name, slab, freei);
3576	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
3577
3578	return;
3579}
3580
3581/*
3582 * Verifies freed addresses.  Checks for alignment, valid slab membership
3583 * and duplicate frees.
3584 *
3585 */
3586static void
3587uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
3588{
3589	uma_keg_t keg;
3590	int freei;
3591
3592	if (zone_first_keg(zone) == NULL)
3593		return;
3594	if (slab == NULL) {
3595		slab = uma_dbg_getslab(zone, item);
3596		if (slab == NULL)
3597			panic("uma: Freed item %p did not belong to zone %s\n",
3598			    item, zone->uz_name);
3599	}
3600	keg = slab->us_keg;
3601	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3602
3603	if (freei >= keg->uk_ipers)
3604		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
3605		    item, zone, zone->uz_name, slab, freei);
3606
3607	if (((freei * keg->uk_rsize) + slab->us_data) != item)
3608		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
3609		    item, zone, zone->uz_name, slab, freei);
3610
3611	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
3612		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
3613		    item, zone, zone->uz_name, slab, freei);
3614
3615	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
3616}
3617#endif /* INVARIANTS */
3618
3619#ifdef DDB
3620DB_SHOW_COMMAND(uma, db_show_uma)
3621{
3622	uint64_t allocs, frees, sleeps;
3623	uma_bucket_t bucket;
3624	uma_keg_t kz;
3625	uma_zone_t z;
3626	int cachefree;
3627
3628	db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
3629	    "Free", "Requests", "Sleeps", "Bucket");
3630	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3631		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3632			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
3633				allocs = z->uz_allocs;
3634				frees = z->uz_frees;
3635				sleeps = z->uz_sleeps;
3636				cachefree = 0;
3637			} else
3638				uma_zone_sumstat(z, &cachefree, &allocs,
3639				    &frees, &sleeps);
3640			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
3641			    (LIST_FIRST(&kz->uk_zones) != z)))
3642				cachefree += kz->uk_free;
3643			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3644				cachefree += bucket->ub_cnt;
3645			db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
3646			    z->uz_name, (uintmax_t)kz->uk_size,
3647			    (intmax_t)(allocs - frees), cachefree,
3648			    (uintmax_t)allocs, sleeps, z->uz_count);
3649			if (db_pager_quit)
3650				return;
3651		}
3652	}
3653}
3654
3655DB_SHOW_COMMAND(umacache, db_show_umacache)
3656{
3657	uint64_t allocs, frees;
3658	uma_bucket_t bucket;
3659	uma_zone_t z;
3660	int cachefree;
3661
3662	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
3663	    "Requests", "Bucket");
3664	LIST_FOREACH(z, &uma_cachezones, uz_link) {
3665		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
3666		LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3667			cachefree += bucket->ub_cnt;
3668		db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
3669		    z->uz_name, (uintmax_t)z->uz_size,
3670		    (intmax_t)(allocs - frees), cachefree,
3671		    (uintmax_t)allocs, z->uz_count);
3672		if (db_pager_quit)
3673			return;
3674	}
3675}
3676#endif	/* DDB */
3677