1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
5 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
6 * Copyright (c) 2004-2006 Robert N. M. Watson
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice unmodified, this list of conditions, and the following
14 *    disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31/*
32 * uma_core.c  Implementation of the Universal Memory allocator
33 *
34 * This allocator is intended to replace the multitude of similar object caches
35 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
36 * efficient.  A primary design goal is to return unused memory to the rest of
37 * the system.  This will make the system as a whole more flexible due to the
38 * ability to move memory to subsystems which most need it instead of leaving
39 * pools of reserved memory unused.
40 *
41 * The basic ideas stem from similar slab/zone based allocators whose algorithms
42 * are well known.
43 *
44 */
45
46/*
47 * TODO:
48 *	- Improve memory usage for large allocations
49 *	- Investigate cache size adjustments
50 */
51
52#include <sys/cdefs.h>
53__FBSDID("$FreeBSD$");
54
55#include "opt_ddb.h"
56#include "opt_param.h"
57#include "opt_vm.h"
58
59#include <sys/param.h>
60#include <sys/systm.h>
61#include <sys/bitset.h>
62#include <sys/domainset.h>
63#include <sys/eventhandler.h>
64#include <sys/kernel.h>
65#include <sys/types.h>
66#include <sys/limits.h>
67#include <sys/queue.h>
68#include <sys/malloc.h>
69#include <sys/ktr.h>
70#include <sys/lock.h>
71#include <sys/sysctl.h>
72#include <sys/mutex.h>
73#include <sys/proc.h>
74#include <sys/random.h>
75#include <sys/rwlock.h>
76#include <sys/sbuf.h>
77#include <sys/sched.h>
78#include <sys/smp.h>
79#include <sys/taskqueue.h>
80#include <sys/vmmeter.h>
81
82#include <vm/vm.h>
83#include <vm/vm_domainset.h>
84#include <vm/vm_object.h>
85#include <vm/vm_page.h>
86#include <vm/vm_pageout.h>
87#include <vm/vm_param.h>
88#include <vm/vm_phys.h>
89#include <vm/vm_pagequeue.h>
90#include <vm/vm_map.h>
91#include <vm/vm_kern.h>
92#include <vm/vm_extern.h>
93#include <vm/uma.h>
94#include <vm/uma_int.h>
95#include <vm/uma_dbg.h>
96
97#include <ddb/ddb.h>
98
99#ifdef DEBUG_MEMGUARD
100#include <vm/memguard.h>
101#endif
102
103/*
104 * This is the zone and keg from which all zones are spawned.
105 */
106static uma_zone_t kegs;
107static uma_zone_t zones;
108
109/* This is the zone from which all offpage uma_slab_ts are allocated. */
110static uma_zone_t slabzone;
111
112/*
113 * The initial hash tables come out of this zone so they can be allocated
114 * prior to malloc coming up.
115 */
116static uma_zone_t hashzone;
117
118/* The boot-time adjusted value for cache line alignment. */
119int uma_align_cache = 64 - 1;
120
121static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
122
123/*
124 * Are we allowed to allocate buckets?
125 */
126static int bucketdisable = 1;
127
128/* Linked list of all kegs in the system */
129static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
130
131/* Linked list of all cache-only zones in the system */
132static LIST_HEAD(,uma_zone) uma_cachezones =
133    LIST_HEAD_INITIALIZER(uma_cachezones);
134
135/* This RW lock protects the keg list */
136static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
137
/*
 * Pointer to and counter of the pool of pages that is preallocated at
 * startup to bootstrap UMA.
 */
142static char *bootmem;
143static int boot_pages;
144
145static struct sx uma_drain_lock;
146
147/*
148 * kmem soft limit, initialized by uma_set_limit().  Ensure that early
149 * allocations don't trigger a wakeup of the reclaim thread.
150 */
151static unsigned long uma_kmem_limit = LONG_MAX;
152SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_limit, CTLFLAG_RD, &uma_kmem_limit, 0,
153    "UMA kernel memory soft limit");
154static unsigned long uma_kmem_total;
155SYSCTL_ULONG(_vm, OID_AUTO, uma_kmem_total, CTLFLAG_RD, &uma_kmem_total, 0,
156    "UMA kernel memory usage");
157
158/* Is the VM done starting up? */
159static enum {
160	BOOT_COLD,
161	BOOT_STRAPPED,
162	BOOT_PAGEALLOC,
163	BOOT_BUCKETS,
164	BOOT_RUNNING,
165	BOOT_SHUTDOWN,
166} booted = BOOT_COLD;
167
168/*
169 * This is the handle used to schedule events that need to happen
170 * outside of the allocation fast path.
171 */
172static struct callout uma_callout;
173#define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
174
175/*
176 * This structure is passed as the zone ctor arg so that I don't have to create
177 * a special allocation function just for zones.
178 */
179struct uma_zctor_args {
180	const char *name;
181	size_t size;
182	uma_ctor ctor;
183	uma_dtor dtor;
184	uma_init uminit;
185	uma_fini fini;
186	uma_import import;
187	uma_release release;
188	void *arg;
189	uma_keg_t keg;
190	int align;
191	uint32_t flags;
192};
193
194struct uma_kctor_args {
195	uma_zone_t zone;
196	size_t size;
197	uma_init uminit;
198	uma_fini fini;
199	int align;
200	uint32_t flags;
201};
202
203struct uma_bucket_zone {
204	uma_zone_t	ubz_zone;
205	char		*ubz_name;
206	int		ubz_entries;	/* Number of items it can hold. */
207	int		ubz_maxsize;	/* Maximum allocation size per-item. */
208};
209
/*
 * Compute the actual number of bucket entries, packing them into
 * power-of-two sizes for more efficient space utilization.
 */
214#define	BUCKET_SIZE(n)						\
215    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
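
/*
 * Illustrative example (the bucket header size is implementation-defined):
 * with 8-byte pointers and a header that rounds up to two pointers,
 * BUCKET_SIZE(32) evaluates to 30, so a "32 Bucket" item occupies roughly
 * 32 pointer-sized words including its header.
 */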
216
217#define	BUCKET_MAX	BUCKET_SIZE(256)
218
219struct uma_bucket_zone bucket_zones[] = {
220	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
221	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
222	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
223	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
224	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
225	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
226	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
227	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
228	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
229	{ NULL, NULL, 0}
230};
231
232/*
233 * Flags and enumerations to be passed to internal functions.
234 */
235enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
236
237#define	UMA_ANYDOMAIN	-1	/* Special value for domain search. */
238
/* Prototypes. */
240
241int	uma_startup_count(int);
242void	uma_startup(void *, int);
243void	uma_startup1(void);
244void	uma_startup2(void);
245
246static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
247static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
248static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
249static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
250static void page_free(void *, vm_size_t, uint8_t);
251static void pcpu_page_free(void *, vm_size_t, uint8_t);
252static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int, int);
253static void cache_drain(uma_zone_t);
254static void bucket_drain(uma_zone_t, uma_bucket_t);
255static void bucket_cache_drain(uma_zone_t zone);
256static int keg_ctor(void *, int, void *, int);
257static void keg_dtor(void *, int, void *);
258static int zone_ctor(void *, int, void *, int);
259static void zone_dtor(void *, int, void *);
260static int zero_init(void *, int, int);
261static void keg_small_init(uma_keg_t keg);
262static void keg_large_init(uma_keg_t keg);
263static void zone_foreach(void (*zfunc)(uma_zone_t));
264static void zone_timeout(uma_zone_t zone);
265static int hash_alloc(struct uma_hash *, u_int);
266static int hash_expand(struct uma_hash *, struct uma_hash *);
267static void hash_free(struct uma_hash *hash);
268static void uma_timeout(void *);
269static void uma_startup3(void);
270static void uma_shutdown(void);
271static void *zone_alloc_item(uma_zone_t, void *, int, int);
272static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
273static void bucket_enable(void);
274static void bucket_init(void);
275static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
276static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
277static void bucket_zone_drain(void);
278static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
279static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
280static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
281static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
282static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
283static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
284    uma_fini fini, int align, uint32_t flags);
285static int zone_import(uma_zone_t, void **, int, int, int);
286static void zone_release(uma_zone_t, void **, int);
287static void uma_zero_item(void *, uma_zone_t);
288
289void uma_print_zone(uma_zone_t);
290void uma_print_stats(void);
291static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
292static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
293
294#ifdef INVARIANTS
295static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
296static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
297static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
298static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
299
300static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
301    "Memory allocation debugging");
302
303static u_int dbg_divisor = 1;
304SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
305    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
    "Debug & thrash every Nth item in the memory allocator");
307
308static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
309static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
310SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
311    &uma_dbg_cnt, "memory items debugged");
312SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
313    &uma_skip_cnt, "memory items skipped, not debugged");
314#endif
315
316SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
317
318SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
319    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
320
321SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
322    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
323
324static int zone_warnings = 1;
325SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
    "Warn when UMA zones become full");
327
328/* Adjust bytes under management by UMA. */
329static inline void
330uma_total_dec(unsigned long size)
331{
332
333	atomic_subtract_long(&uma_kmem_total, size);
334}
335
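/*
 * Note that atomic_fetchadd_long() returns the total prior to the addition,
 * so the reclaim thread is woken only once the running total has already
 * exceeded the soft limit.
 */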
336static inline void
337uma_total_inc(unsigned long size)
338{
339
340	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
341		uma_reclaim_wakeup();
342}
343
344/*
345 * This routine checks to see whether or not it's safe to enable buckets.
346 */
347static void
348bucket_enable(void)
349{
350	bucketdisable = vm_page_count_min();
351}
352
353/*
354 * Initialize bucket_zones, the array of zones of buckets of various sizes.
355 *
356 * For each zone, calculate the memory required for each bucket, consisting
357 * of the header and an array of pointers.
358 */
359static void
360bucket_init(void)
361{
362	struct uma_bucket_zone *ubz;
363	int size;
364
365	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
366		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
367		size += sizeof(void *) * ubz->ubz_entries;
368		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
369		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
370		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
371	}
372}
373
374/*
375 * Given a desired number of entries for a bucket, return the zone from which
376 * to allocate the bucket.
377 */
378static struct uma_bucket_zone *
379bucket_zone_lookup(int entries)
380{
381	struct uma_bucket_zone *ubz;
382
383	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
384		if (ubz->ubz_entries >= entries)
385			return (ubz);
386	ubz--;
387	return (ubz);
388}
389
390static int
391bucket_select(int size)
392{
393	struct uma_bucket_zone *ubz;
394
395	ubz = &bucket_zones[0];
396	if (size > ubz->ubz_maxsize)
397		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
398
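	/*
	 * Otherwise walk down to the entry with the smallest ubz_maxsize
	 * that still accommodates the item, i.e. the largest bucket
	 * permitted for this item size.  For example (illustrative), a
	 * ~700-byte item selects the "16 Bucket" entry, whose ubz_maxsize
	 * is 1024.
	 */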
399	for (; ubz->ubz_entries != 0; ubz++)
400		if (ubz->ubz_maxsize < size)
401			break;
402	ubz--;
403	return (ubz->ubz_entries);
404}
405
406static uma_bucket_t
407bucket_alloc(uma_zone_t zone, void *udata, int flags)
408{
409	struct uma_bucket_zone *ubz;
410	uma_bucket_t bucket;
411
	/*
	 * This is to stop us from allocating per-CPU buckets while we're
	 * still running on the vm.boot_pages reserve; otherwise, we would
	 * exhaust the boot pages.  This also prevents us from allocating
	 * buckets in low memory situations.
	 */
418	if (bucketdisable)
419		return (NULL);
420	/*
421	 * To limit bucket recursion we store the original zone flags
422	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
423	 * NOVM flag to persist even through deep recursions.  We also
424	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
425	 * a bucket for a bucket zone so we do not allow infinite bucket
426	 * recursion.  This cookie will even persist to frees of unused
427	 * buckets via the allocation path or bucket allocations in the
428	 * free path.
429	 */
430	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
431		udata = (void *)(uintptr_t)zone->uz_flags;
432	else {
433		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
434			return (NULL);
435		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
436	}
437	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
438		flags |= M_NOVM;
439	ubz = bucket_zone_lookup(zone->uz_count);
440	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
441		ubz++;
442	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
443	if (bucket) {
444#ifdef INVARIANTS
445		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
446#endif
447		bucket->ub_cnt = 0;
448		bucket->ub_entries = ubz->ubz_entries;
449	}
450
451	return (bucket);
452}
453
454static void
455bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
456{
457	struct uma_bucket_zone *ubz;
458
459	KASSERT(bucket->ub_cnt == 0,
460	    ("bucket_free: Freeing a non free bucket."));
461	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
462		udata = (void *)(uintptr_t)zone->uz_flags;
463	ubz = bucket_zone_lookup(bucket->ub_entries);
464	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
465}
466
467static void
468bucket_zone_drain(void)
469{
470	struct uma_bucket_zone *ubz;
471
472	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
473		zone_drain(ubz->ubz_zone);
474}
475
476static uma_bucket_t
477zone_try_fetch_bucket(uma_zone_t zone, uma_zone_domain_t zdom, const bool ws)
478{
479	uma_bucket_t bucket;
480
481	ZONE_LOCK_ASSERT(zone);
482
483	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
484		MPASS(zdom->uzd_nitems >= bucket->ub_cnt);
485		LIST_REMOVE(bucket, ub_link);
486		zdom->uzd_nitems -= bucket->ub_cnt;
487		if (ws && zdom->uzd_imin > zdom->uzd_nitems)
488			zdom->uzd_imin = zdom->uzd_nitems;
489	}
490	return (bucket);
491}
492
493static void
494zone_put_bucket(uma_zone_t zone, uma_zone_domain_t zdom, uma_bucket_t bucket,
495    const bool ws)
496{
497
498	ZONE_LOCK_ASSERT(zone);
499
500	LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
501	zdom->uzd_nitems += bucket->ub_cnt;
502	if (ws && zdom->uzd_imax < zdom->uzd_nitems)
503		zdom->uzd_imax = zdom->uzd_nitems;
504}
505
506static void
507zone_log_warning(uma_zone_t zone)
508{
509	static const struct timeval warninterval = { 300, 0 };
510
511	if (!zone_warnings || zone->uz_warning == NULL)
512		return;
513
514	if (ratecheck(&zone->uz_ratecheck, &warninterval))
515		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
516}
517
518static inline void
519zone_maxaction(uma_zone_t zone)
520{
521
522	if (zone->uz_maxaction.ta_func != NULL)
523		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
524}
525
526static void
527zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
528{
529	uma_klink_t klink;
530
531	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
532		kegfn(klink->kl_keg);
533}
534
535/*
 * Routine called by our callout to fire off time-interval based
 * calculations (stats, hash size, etc.).
538 *
539 * Arguments:
540 *	arg   Unused
541 *
542 * Returns:
543 *	Nothing
544 */
545static void
546uma_timeout(void *unused)
547{
548	bucket_enable();
549	zone_foreach(zone_timeout);
550
551	/* Reschedule this event */
552	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
553}
554
555/*
556 * Update the working set size estimate for the zone's bucket cache.
557 * The constants chosen here are somewhat arbitrary.  With an update period of
558 * 20s (UMA_TIMEOUT), this estimate is dominated by zone activity over the
559 * last 100s.
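 *
 * As a worked illustration of the formula below, each update computes
 * wss = (3 * (imax - imin) + 2 * wss_old) / 5, so a sample's weight decays
 * by a factor of 2/5 every 20s and falls to roughly 1% after about five
 * periods (~100s).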
560 */
561static void
562zone_domain_update_wss(uma_zone_domain_t zdom)
563{
564	long wss;
565
566	MPASS(zdom->uzd_imax >= zdom->uzd_imin);
567	wss = zdom->uzd_imax - zdom->uzd_imin;
568	zdom->uzd_imax = zdom->uzd_imin = zdom->uzd_nitems;
569	zdom->uzd_wss = (3 * wss + 2 * zdom->uzd_wss) / 5;
570}
571
572/*
 * Routine to perform timeout driven calculations.  This expands the
 * keg hash tables when the slab count warrants it.
575 *
576 *  Returns nothing.
577 */
578static void
579keg_timeout(uma_keg_t keg)
580{
581	u_int slabs;
582
583	KEG_LOCK(keg);
584	/*
585	 * Expand the keg hash table.
586	 *
587	 * This is done if the number of slabs is larger than the hash size.
	 * What I'm trying to do here is eliminate collisions entirely.  This
	 * may be a little aggressive.  Should I allow for two collisions max?
590	 */
591	if (keg->uk_flags & UMA_ZONE_HASH &&
592	    (slabs = keg->uk_pages / keg->uk_ppera) >
593	     keg->uk_hash.uh_hashsize) {
594		struct uma_hash newhash;
595		struct uma_hash oldhash;
596		int ret;
597
598		/*
599		 * This is so involved because allocating and freeing
600		 * while the keg lock is held will lead to deadlock.
601		 * I have to do everything in stages and check for
602		 * races.
603		 */
604		KEG_UNLOCK(keg);
605		ret = hash_alloc(&newhash, 1 << fls(slabs));
606		KEG_LOCK(keg);
607		if (ret) {
608			if (hash_expand(&keg->uk_hash, &newhash)) {
609				oldhash = keg->uk_hash;
610				keg->uk_hash = newhash;
611			} else
612				oldhash = newhash;
613
614			KEG_UNLOCK(keg);
615			hash_free(&oldhash);
616			return;
617		}
618	}
619	KEG_UNLOCK(keg);
620}
621
622static void
623zone_timeout(uma_zone_t zone)
624{
625	int i;
626
627	zone_foreach_keg(zone, &keg_timeout);
628
629	ZONE_LOCK(zone);
630	for (i = 0; i < vm_ndomains; i++)
631		zone_domain_update_wss(&zone->uz_domain[i]);
632	ZONE_UNLOCK(zone);
633}
634
635/*
636 * Allocate and zero fill the next sized hash table from the appropriate
637 * backing store.
638 *
639 * Arguments:
 *	hash  A new hash structure to fill in
 *	size  The requested number of hash entries; must be a power of two
641 *
642 * Returns:
643 *	1 on success and 0 on failure.
644 */
645static int
646hash_alloc(struct uma_hash *hash, u_int size)
647{
648	size_t alloc;
649
650	KASSERT(powerof2(size), ("hash size must be power of 2"));
651	if (size > UMA_HASH_SIZE_INIT)  {
652		hash->uh_hashsize = size;
653		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
654		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
655		    M_UMAHASH, M_NOWAIT);
656	} else {
657		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
658		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
659		    UMA_ANYDOMAIN, M_WAITOK);
660		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
661	}
662	if (hash->uh_slab_hash) {
663		bzero(hash->uh_slab_hash, alloc);
664		hash->uh_hashmask = hash->uh_hashsize - 1;
665		return (1);
666	}
667
668	return (0);
669}
670
671/*
672 * Expands the hash table for HASH zones.  This is done from zone_timeout
673 * to reduce collisions.  This must not be done in the regular allocation
674 * path, otherwise, we can recurse on the vm while allocating pages.
675 *
676 * Arguments:
677 *	oldhash  The hash you want to expand
678 *	newhash  The hash structure for the new table
679 *
 * Returns:
 *	1 on success and 0 on failure.
 */
685static int
686hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
687{
688	uma_slab_t slab;
689	u_int hval;
690	u_int idx;
691
692	if (!newhash->uh_slab_hash)
693		return (0);
694
695	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
696		return (0);
697
698	/*
699	 * I need to investigate hash algorithms for resizing without a
700	 * full rehash.
701	 */
702
703	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
704		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
705			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
706			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
707			hval = UMA_HASH(newhash, slab->us_data);
708			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
709			    slab, us_hlink);
710		}
711
712	return (1);
713}
714
715/*
716 * Free the hash bucket to the appropriate backing store.
717 *
 * Arguments:
 *	hash  The hash structure whose table we're freeing; uh_hashsize
 *	      determines the backing store
721 *
722 * Returns:
723 *	Nothing
724 */
725static void
726hash_free(struct uma_hash *hash)
727{
728	if (hash->uh_slab_hash == NULL)
729		return;
730	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
731		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
732	else
733		free(hash->uh_slab_hash, M_UMAHASH);
734}
735
736/*
737 * Frees all outstanding items in a bucket
738 *
739 * Arguments:
740 *	zone   The zone to free to, must be unlocked.
741 *	bucket The free/alloc bucket with items, cpu queue must be locked.
742 *
743 * Returns:
744 *	Nothing
745 */
746
747static void
748bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
749{
750	int i;
751
752	if (bucket == NULL)
753		return;
754
755	if (zone->uz_fini)
756		for (i = 0; i < bucket->ub_cnt; i++)
757			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
758	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
759	bucket->ub_cnt = 0;
760}
761
762/*
763 * Drains the per cpu caches for a zone.
764 *
 * NOTE: This may only be called while the zone is being torn down, and not
 * during normal operation.  This is necessary so that we do not have to
 * migrate CPUs to drain the per-CPU caches.
768 *
769 * Arguments:
770 *	zone     The zone to drain, must be unlocked.
771 *
772 * Returns:
773 *	Nothing
774 */
775static void
776cache_drain(uma_zone_t zone)
777{
778	uma_cache_t cache;
779	int cpu;
780
781	/*
782	 * XXX: It is safe to not lock the per-CPU caches, because we're
783	 * tearing down the zone anyway.  I.e., there will be no further use
784	 * of the caches at this point.
785	 *
	 * XXX: It would be good to be able to assert that the zone is being
	 * torn down to prevent improper use of cache_drain().
788	 *
789	 * XXX: We lock the zone before passing into bucket_cache_drain() as
790	 * it is used elsewhere.  Should the tear-down path be made special
791	 * there in some form?
792	 */
793	CPU_FOREACH(cpu) {
794		cache = &zone->uz_cpu[cpu];
795		bucket_drain(zone, cache->uc_allocbucket);
796		bucket_drain(zone, cache->uc_freebucket);
797		if (cache->uc_allocbucket != NULL)
798			bucket_free(zone, cache->uc_allocbucket, NULL);
799		if (cache->uc_freebucket != NULL)
800			bucket_free(zone, cache->uc_freebucket, NULL);
801		cache->uc_allocbucket = cache->uc_freebucket = NULL;
802	}
803	ZONE_LOCK(zone);
804	bucket_cache_drain(zone);
805	ZONE_UNLOCK(zone);
806}
807
808static void
809cache_shrink(uma_zone_t zone)
810{
811
812	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
813		return;
814
815	ZONE_LOCK(zone);
816	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
817	ZONE_UNLOCK(zone);
818}
819
820static void
821cache_drain_safe_cpu(uma_zone_t zone)
822{
823	uma_cache_t cache;
824	uma_bucket_t b1, b2;
825	int domain;
826
827	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
828		return;
829
830	b1 = b2 = NULL;
831	ZONE_LOCK(zone);
832	critical_enter();
833	if (zone->uz_flags & UMA_ZONE_NUMA)
834		domain = PCPU_GET(domain);
835	else
836		domain = 0;
837	cache = &zone->uz_cpu[curcpu];
838	if (cache->uc_allocbucket) {
839		if (cache->uc_allocbucket->ub_cnt != 0)
840			zone_put_bucket(zone, &zone->uz_domain[domain],
841			    cache->uc_allocbucket, false);
842		else
843			b1 = cache->uc_allocbucket;
844		cache->uc_allocbucket = NULL;
845	}
846	if (cache->uc_freebucket) {
847		if (cache->uc_freebucket->ub_cnt != 0)
848			zone_put_bucket(zone, &zone->uz_domain[domain],
849			    cache->uc_freebucket, false);
850		else
851			b2 = cache->uc_freebucket;
852		cache->uc_freebucket = NULL;
853	}
854	critical_exit();
855	ZONE_UNLOCK(zone);
856	if (b1)
857		bucket_free(zone, b1, NULL);
858	if (b2)
859		bucket_free(zone, b2, NULL);
860}
861
862/*
 * Safely drain the per-CPU caches of a zone (or of all zones) into the
 * zone's per-domain bucket cache.
 * This is an expensive call because it needs to bind to all CPUs
 * one by one and enter a critical section on each of them in order
 * to safely access their cache buckets.
 * The zone lock must not be held when calling this function.
868 */
869static void
870cache_drain_safe(uma_zone_t zone)
871{
872	int cpu;
873
	/*
	 * Polite bucket size shrinking was not enough; shrink aggressively.
	 */
877	if (zone)
878		cache_shrink(zone);
879	else
880		zone_foreach(cache_shrink);
881
882	CPU_FOREACH(cpu) {
883		thread_lock(curthread);
884		sched_bind(curthread, cpu);
885		thread_unlock(curthread);
886
887		if (zone)
888			cache_drain_safe_cpu(zone);
889		else
890			zone_foreach(cache_drain_safe_cpu);
891	}
892	thread_lock(curthread);
893	sched_unbind(curthread);
894	thread_unlock(curthread);
895}
896
897/*
898 * Drain the cached buckets from a zone.  Expects a locked zone on entry.
899 */
900static void
901bucket_cache_drain(uma_zone_t zone)
902{
903	uma_zone_domain_t zdom;
904	uma_bucket_t bucket;
905	int i;
906
907	/*
908	 * Drain the bucket queues and free the buckets.
909	 */
910	for (i = 0; i < vm_ndomains; i++) {
911		zdom = &zone->uz_domain[i];
912		while ((bucket = zone_try_fetch_bucket(zone, zdom, false)) !=
913		    NULL) {
914			ZONE_UNLOCK(zone);
915			bucket_drain(zone, bucket);
916			bucket_free(zone, bucket, NULL);
917			ZONE_LOCK(zone);
918		}
919	}
920
921	/*
	 * Shrink further bucket sizes.  The price of a single zone lock
	 * collision is probably lower than the price of a global cache drain.
924	 */
925	if (zone->uz_count > zone->uz_count_min)
926		zone->uz_count--;
927}
928
929static void
930keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
931{
932	uint8_t *mem;
933	int i;
934	uint8_t flags;
935
936	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
937	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
938
939	mem = slab->us_data;
940	flags = slab->us_flags;
941	i = start;
942	if (keg->uk_fini != NULL) {
943		for (i--; i > -1; i--)
944#ifdef INVARIANTS
945		/*
946		 * trash_fini implies that dtor was trash_dtor. trash_fini
947		 * would check that memory hasn't been modified since free,
948		 * which executed trash_dtor.
949		 * That's why we need to run uma_dbg_kskip() check here,
950		 * albeit we don't make skip check for other init/fini
951		 * invocations.
952		 */
953		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
954		    keg->uk_fini != trash_fini)
955#endif
956			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
957			    keg->uk_size);
958	}
959	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
960		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
961	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
962	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
963}
964
965/*
966 * Frees pages from a keg back to the system.  This is done on demand from
967 * the pageout daemon.
968 *
969 * Returns nothing.
970 */
971static void
972keg_drain(uma_keg_t keg)
973{
974	struct slabhead freeslabs = { 0 };
975	uma_domain_t dom;
976	uma_slab_t slab, tmp;
977	int i;
978
979	/*
980	 * We don't want to take pages from statically allocated kegs at this
981	 * time
982	 */
983	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
984		return;
985
986	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
987	    keg->uk_name, keg, keg->uk_free);
988	KEG_LOCK(keg);
989	if (keg->uk_free == 0)
990		goto finished;
991
992	for (i = 0; i < vm_ndomains; i++) {
993		dom = &keg->uk_domain[i];
994		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
995			/* We have nowhere to free these to. */
996			if (slab->us_flags & UMA_SLAB_BOOT)
997				continue;
998
999			LIST_REMOVE(slab, us_link);
1000			keg->uk_pages -= keg->uk_ppera;
1001			keg->uk_free -= keg->uk_ipers;
1002
1003			if (keg->uk_flags & UMA_ZONE_HASH)
1004				UMA_HASH_REMOVE(&keg->uk_hash, slab,
1005				    slab->us_data);
1006
1007			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
1008		}
1009	}
1010
1011finished:
1012	KEG_UNLOCK(keg);
1013
1014	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
1015		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
1016		keg_free_slab(keg, slab, keg->uk_ipers);
1017	}
1018}
1019
1020static void
1021zone_drain_wait(uma_zone_t zone, int waitok)
1022{
1023
1024	/*
1025	 * Set draining to interlock with zone_dtor() so we can release our
1026	 * locks as we go.  Only dtor() should do a WAITOK call since it
1027	 * is the only call that knows the structure will still be available
1028	 * when it wakes up.
1029	 */
1030	ZONE_LOCK(zone);
1031	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
1032		if (waitok == M_NOWAIT)
1033			goto out;
1034		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
1035	}
1036	zone->uz_flags |= UMA_ZFLAG_DRAINING;
1037	bucket_cache_drain(zone);
1038	ZONE_UNLOCK(zone);
1039	/*
1040	 * The DRAINING flag protects us from being freed while
1041	 * we're running.  Normally the uma_rwlock would protect us but we
1042	 * must be able to release and acquire the right lock for each keg.
1043	 */
1044	zone_foreach_keg(zone, &keg_drain);
1045	ZONE_LOCK(zone);
1046	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
1047	wakeup(zone);
1048out:
1049	ZONE_UNLOCK(zone);
1050}
1051
1052void
1053zone_drain(uma_zone_t zone)
1054{
1055
1056	zone_drain_wait(zone, M_NOWAIT);
1057}
1058
1059/*
1060 * Allocate a new slab for a keg.  This does not insert the slab onto a list.
1061 * If the allocation was successful, the keg lock will be held upon return,
1062 * otherwise the keg will be left unlocked.
1063 *
1064 * Arguments:
1065 *	flags   Wait flags for the item initialization routine
1066 *	aflags  Wait flags for the slab allocation
1067 *
1068 * Returns:
1069 *	The slab that was allocated or NULL if there is no memory and the
1070 *	caller specified M_NOWAIT.
1071 */
1072static uma_slab_t
1073keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int flags,
1074    int aflags)
1075{
1076	uma_alloc allocf;
1077	uma_slab_t slab;
1078	unsigned long size;
1079	uint8_t *mem;
1080	uint8_t sflags;
1081	int i;
1082
1083	KASSERT(domain >= 0 && domain < vm_ndomains,
1084	    ("keg_alloc_slab: domain %d out of range", domain));
1085	mtx_assert(&keg->uk_lock, MA_OWNED);
1086
1087	allocf = keg->uk_allocf;
1088	KEG_UNLOCK(keg);
1089
1090	slab = NULL;
1091	mem = NULL;
1092	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1093		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, aflags);
1094		if (slab == NULL)
1095			goto out;
1096	}
1097
1098	/*
1099	 * This reproduces the old vm_zone behavior of zero filling pages the
1100	 * first time they are added to a zone.
1101	 *
1102	 * Malloced items are zeroed in uma_zalloc.
1103	 */
1104
1105	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1106		aflags |= M_ZERO;
1107	else
1108		aflags &= ~M_ZERO;
1109
1110	if (keg->uk_flags & UMA_ZONE_NODUMP)
1111		aflags |= M_NODUMP;
1112
1113	/* zone is passed for legacy reasons. */
1114	size = keg->uk_ppera * PAGE_SIZE;
1115	mem = allocf(zone, size, domain, &sflags, aflags);
1116	if (mem == NULL) {
1117		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1118			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1119		slab = NULL;
1120		goto out;
1121	}
1122	uma_total_inc(size);
1123
1124	/* Point the slab into the allocated memory */
1125	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1126		slab = (uma_slab_t )(mem + keg->uk_pgoff);
1127
1128	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1129		for (i = 0; i < keg->uk_ppera; i++)
1130			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1131
1132	slab->us_keg = keg;
1133	slab->us_data = mem;
1134	slab->us_freecount = keg->uk_ipers;
1135	slab->us_flags = sflags;
1136	slab->us_domain = domain;
1137	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1138#ifdef INVARIANTS
1139	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1140#endif
1141
1142	if (keg->uk_init != NULL) {
1143		for (i = 0; i < keg->uk_ipers; i++)
1144			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1145			    keg->uk_size, flags) != 0)
1146				break;
1147		if (i != keg->uk_ipers) {
1148			keg_free_slab(keg, slab, i);
1149			slab = NULL;
1150			goto out;
1151		}
1152	}
1153	KEG_LOCK(keg);
1154
1155	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1156	    slab, keg->uk_name, keg);
1157
1158	if (keg->uk_flags & UMA_ZONE_HASH)
1159		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1160
1161	keg->uk_pages += keg->uk_ppera;
1162	keg->uk_free += keg->uk_ipers;
1163
1164out:
1165	return (slab);
1166}
1167
1168/*
1169 * This function is intended to be used early on in place of page_alloc() so
1170 * that we may use the boot time page cache to satisfy allocations before
1171 * the VM is ready.
1172 */
1173static void *
1174startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1175    int wait)
1176{
1177	uma_keg_t keg;
1178	void *mem;
1179	int pages;
1180
1181	keg = zone_first_keg(zone);
1182
1183	/*
	 * If we are in BOOT_BUCKETS or higher, then switch to the real
	 * allocator.  Zones with page sized slabs switch at BOOT_PAGEALLOC.
1186	 */
1187	switch (booted) {
1188		case BOOT_COLD:
1189		case BOOT_STRAPPED:
1190			break;
1191		case BOOT_PAGEALLOC:
1192			if (keg->uk_ppera > 1)
1193				break;
1194		default:
1195#ifdef UMA_MD_SMALL_ALLOC
1196			keg->uk_allocf = (keg->uk_ppera > 1) ?
1197			    page_alloc : uma_small_alloc;
1198#else
1199			keg->uk_allocf = page_alloc;
1200#endif
1201			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1202	}
1203
1204	/*
1205	 * Check our small startup cache to see if it has pages remaining.
1206	 */
1207	pages = howmany(bytes, PAGE_SIZE);
1208	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1209	if (pages > boot_pages)
1210		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1211#ifdef DIAGNOSTIC
1212	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1213	    boot_pages);
1214#endif
1215	mem = bootmem;
1216	boot_pages -= pages;
1217	bootmem += pages * PAGE_SIZE;
1218	*pflag = UMA_SLAB_BOOT;
1219
1220	return (mem);
1221}
1222
1223/*
1224 * Allocates a number of pages from the system
1225 *
1226 * Arguments:
1227 *	bytes  The number of bytes requested
1228 *	wait  Shall we wait?
1229 *
1230 * Returns:
1231 *	A pointer to the alloced memory or possibly
1232 *	NULL if M_NOWAIT is set.
1233 */
1234static void *
1235page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1236    int wait)
1237{
1238	void *p;	/* Returned page */
1239
1240	*pflag = UMA_SLAB_KERNEL;
1241	p = (void *)kmem_malloc_domainset(DOMAINSET_FIXED(domain), bytes, wait);
1242
1243	return (p);
1244}
1245
1246static void *
1247pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1248    int wait)
1249{
1250	struct pglist alloctail;
1251	vm_offset_t addr, zkva;
1252	int cpu, flags;
1253	vm_page_t p, p_next;
1254#ifdef NUMA
1255	struct pcpu *pc;
1256#endif
1257
1258	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1259
1260	TAILQ_INIT(&alloctail);
1261	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1262	    malloc2vm_flags(wait);
1263	*pflag = UMA_SLAB_KERNEL;
1264	for (cpu = 0; cpu <= mp_maxid; cpu++) {
1265		if (CPU_ABSENT(cpu)) {
1266			p = vm_page_alloc(NULL, 0, flags);
1267		} else {
1268#ifndef NUMA
1269			p = vm_page_alloc(NULL, 0, flags);
1270#else
1271			pc = pcpu_find(cpu);
1272			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1273			if (__predict_false(p == NULL))
1274				p = vm_page_alloc(NULL, 0, flags);
1275#endif
1276		}
1277		if (__predict_false(p == NULL))
1278			goto fail;
1279		TAILQ_INSERT_TAIL(&alloctail, p, listq);
1280	}
1281	if ((addr = kva_alloc(bytes)) == 0)
1282		goto fail;
1283	zkva = addr;
1284	TAILQ_FOREACH(p, &alloctail, listq) {
1285		pmap_qenter(zkva, &p, 1);
1286		zkva += PAGE_SIZE;
1287	}
1288	return ((void*)addr);
1289fail:
1290	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1291		vm_page_unwire_noq(p);
1292		vm_page_free(p);
1293	}
1294	return (NULL);
1295}
1296
1297/*
 * Allocates a number of pages not belonging to a VM object and maps them
 * into the keg's reserved KVA
1299 *
1300 * Arguments:
1301 *	bytes  The number of bytes requested
1302 *	wait   Shall we wait?
1303 *
1304 * Returns:
1305 *	A pointer to the alloced memory or possibly
1306 *	NULL if M_NOWAIT is set.
1307 */
1308static void *
1309noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1310    int wait)
1311{
1312	TAILQ_HEAD(, vm_page) alloctail;
1313	u_long npages;
1314	vm_offset_t retkva, zkva;
1315	vm_page_t p, p_next;
1316	uma_keg_t keg;
1317
1318	TAILQ_INIT(&alloctail);
1319	keg = zone_first_keg(zone);
1320
1321	npages = howmany(bytes, PAGE_SIZE);
1322	while (npages > 0) {
1323		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1324		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1325		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1326		    VM_ALLOC_NOWAIT));
1327		if (p != NULL) {
1328			/*
1329			 * Since the page does not belong to an object, its
1330			 * listq is unused.
1331			 */
1332			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1333			npages--;
1334			continue;
1335		}
1336		/*
1337		 * Page allocation failed, free intermediate pages and
1338		 * exit.
1339		 */
1340		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1341			vm_page_unwire_noq(p);
1342			vm_page_free(p);
1343		}
1344		return (NULL);
1345	}
1346	*flags = UMA_SLAB_PRIV;
1347	zkva = keg->uk_kva +
1348	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1349	retkva = zkva;
1350	TAILQ_FOREACH(p, &alloctail, listq) {
1351		pmap_qenter(zkva, &p, 1);
1352		zkva += PAGE_SIZE;
1353	}
1354
1355	return ((void *)retkva);
1356}
1357
1358/*
1359 * Frees a number of pages to the system
1360 *
1361 * Arguments:
1362 *	mem   A pointer to the memory to be freed
1363 *	size  The size of the memory being freed
1364 *	flags The original p->us_flags field
1365 *
1366 * Returns:
1367 *	Nothing
1368 */
1369static void
1370page_free(void *mem, vm_size_t size, uint8_t flags)
1371{
1372
1373	if ((flags & UMA_SLAB_KERNEL) == 0)
1374		panic("UMA: page_free used with invalid flags %x", flags);
1375
1376	kmem_free((vm_offset_t)mem, size);
1377}
1378
1379/*
1380 * Frees pcpu zone allocations
1381 *
1382 * Arguments:
1383 *	mem   A pointer to the memory to be freed
1384 *	size  The size of the memory being freed
1385 *	flags The original p->us_flags field
1386 *
1387 * Returns:
1388 *	Nothing
1389 */
1390static void
1391pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1392{
1393	vm_offset_t sva, curva;
1394	vm_paddr_t paddr;
1395	vm_page_t m;
1396
1397	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1398	sva = (vm_offset_t)mem;
1399	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1400		paddr = pmap_kextract(curva);
1401		m = PHYS_TO_VM_PAGE(paddr);
1402		vm_page_unwire_noq(m);
1403		vm_page_free(m);
1404	}
1405	pmap_qremove(sva, size >> PAGE_SHIFT);
1406	kva_free(sva, size);
1407}
1408
1409
1410/*
1411 * Zero fill initializer
1412 *
1413 * Arguments/Returns follow uma_init specifications
1414 */
1415static int
1416zero_init(void *mem, int size, int flags)
1417{
1418	bzero(mem, size);
1419	return (0);
1420}
1421
1422/*
 * Finish creating a small uma keg.  This calculates ipers and the keg size.
 *
 * Arguments
 *	keg  The keg we should initialize
1427 *
1428 * Returns
1429 *	Nothing
1430 */
1431static void
1432keg_small_init(uma_keg_t keg)
1433{
1434	u_int rsize;
1435	u_int memused;
1436	u_int wastedspace;
1437	u_int shsize;
1438	u_int slabsize;
1439
1440	if (keg->uk_flags & UMA_ZONE_PCPU) {
1441		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1442
1443		slabsize = UMA_PCPU_ALLOC_SIZE;
1444		keg->uk_ppera = ncpus;
1445	} else {
1446		slabsize = UMA_SLAB_SIZE;
1447		keg->uk_ppera = 1;
1448	}
1449
1450	/*
	 * Calculate the size of each allocation (rsize) according to
	 * alignment.  If the requested size is smaller than the minimum
	 * that the per-slab free bitset can track, round it up.
1454	 */
1455	rsize = keg->uk_size;
1456	if (rsize < slabsize / SLAB_SETSIZE)
1457		rsize = slabsize / SLAB_SETSIZE;
1458	if (rsize & keg->uk_align)
1459		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1460	keg->uk_rsize = rsize;
1461
1462	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1463	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1464	    ("%s: size %u too large", __func__, keg->uk_rsize));
1465
1466	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1467		shsize = 0;
1468	else
1469		shsize = sizeof(struct uma_slab);
1470
1471	if (rsize <= slabsize - shsize)
1472		keg->uk_ipers = (slabsize - shsize) / rsize;
1473	else {
1474		/* Handle special case when we have 1 item per slab, so
1475		 * alignment requirement can be relaxed. */
1476		KASSERT(keg->uk_size <= slabsize - shsize,
1477		    ("%s: size %u greater than slab", __func__, keg->uk_size));
1478		keg->uk_ipers = 1;
1479	}
1480	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1481	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1482
1483	memused = keg->uk_ipers * rsize + shsize;
1484	wastedspace = slabsize - memused;
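
	/*
	 * Illustrative example (the in-line slab header size is
	 * implementation-dependent): with a 4KB slab, 64-byte items and a
	 * header of roughly 96 bytes, ipers = (4096 - 96) / 64 = 62,
	 * memused = 62 * 64 + 96 = 4064, and wastedspace = 32 bytes.
	 */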
1485
1486	/*
	 * We can't do OFFPAGE if we're internal or if we've been
	 * asked not to go to the VM for buckets.  If we do this we
	 * may end up going to the VM for slabs, which we do not
	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
	 * of UMA_ZONE_VM, which clearly forbids it.
1492	 */
1493	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1494	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1495		return;
1496
1497	/*
1498	 * See if using an OFFPAGE slab will limit our waste.  Only do
1499	 * this if it permits more items per-slab.
1500	 *
1501	 * XXX We could try growing slabsize to limit max waste as well.
1502	 * Historically this was not done because the VM could not
1503	 * efficiently handle contiguous allocations.
1504	 */
1505	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1506	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1507		keg->uk_ipers = slabsize / keg->uk_rsize;
1508		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1509		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1510		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1511		    "keg: %s(%p), calculated wastedspace = %d, "
1512		    "maximum wasted space allowed = %d, "
1513		    "calculated ipers = %d, "
1514		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1515		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1516		    slabsize - keg->uk_ipers * keg->uk_rsize);
1517		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1518	}
1519
1520	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1521	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1522		keg->uk_flags |= UMA_ZONE_HASH;
1523}
1524
1525/*
 * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1527 * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1528 * more complicated.
1529 *
1530 * Arguments
1531 *	keg  The keg we should initialize
1532 *
1533 * Returns
1534 *	Nothing
1535 */
1536static void
1537keg_large_init(uma_keg_t keg)
1538{
1539	u_int shsize;
1540
1541	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1542	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1543	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1544	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1545	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1546
1547	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1548	keg->uk_ipers = 1;
1549	keg->uk_rsize = keg->uk_size;
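
	/*
	 * Illustrative example (assuming 4KB pages): a 9KB item gets
	 * uk_ppera = 3 and one item per slab; the remaining ~3KB per slab
	 * either holds the in-line slab header or is simply unused.
	 */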
1550
1551	/* Check whether we have enough space to not do OFFPAGE. */
1552	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
1553		shsize = sizeof(struct uma_slab);
1554		if (shsize & UMA_ALIGN_PTR)
1555			shsize = (shsize & ~UMA_ALIGN_PTR) +
1556			    (UMA_ALIGN_PTR + 1);
1557
1558		if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) {
1559			/*
1560			 * We can't do OFFPAGE if we're internal, in which case
1561			 * we need an extra page per allocation to contain the
1562			 * slab header.
1563			 */
1564			if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1565				keg->uk_flags |= UMA_ZONE_OFFPAGE;
1566			else
1567				keg->uk_ppera++;
1568		}
1569	}
1570
1571	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1572	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1573		keg->uk_flags |= UMA_ZONE_HASH;
1574}
1575
1576static void
1577keg_cachespread_init(uma_keg_t keg)
1578{
1579	int alignsize;
1580	int trailer;
1581	int pages;
1582	int rsize;
1583
1584	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1585	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1586
1587	alignsize = keg->uk_align + 1;
1588	rsize = keg->uk_size;
1589	/*
1590	 * We want one item to start on every align boundary in a page.  To
1591	 * do this we will span pages.  We will also extend the item by the
1592	 * size of align if it is an even multiple of align.  Otherwise, it
1593	 * would fall on the same boundary every time.
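	 *
	 * Illustrative example (assuming 4KB pages and 64-byte alignment):
	 * a 256-byte item is padded to rsize = 320, giving pages = 5 and
	 * ipers = 64, so successive items begin at 64 distinct cache-line
	 * offsets within a page.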
1594	 */
1595	if (rsize & keg->uk_align)
1596		rsize = (rsize & ~keg->uk_align) + alignsize;
1597	if ((rsize & alignsize) == 0)
1598		rsize += alignsize;
1599	trailer = rsize - keg->uk_size;
1600	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1601	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1602	keg->uk_rsize = rsize;
1603	keg->uk_ppera = pages;
1604	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1605	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1606	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1607	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1608	    keg->uk_ipers));
1609}
1610
1611/*
1612 * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1613 * the keg onto the global keg list.
1614 *
1615 * Arguments/Returns follow uma_ctor specifications
1616 *	udata  Actually uma_kctor_args
1617 */
1618static int
1619keg_ctor(void *mem, int size, void *udata, int flags)
1620{
1621	struct uma_kctor_args *arg = udata;
1622	uma_keg_t keg = mem;
1623	uma_zone_t zone;
1624
1625	bzero(keg, size);
1626	keg->uk_size = arg->size;
1627	keg->uk_init = arg->uminit;
1628	keg->uk_fini = arg->fini;
1629	keg->uk_align = arg->align;
1630	keg->uk_free = 0;
1631	keg->uk_reserve = 0;
1632	keg->uk_pages = 0;
1633	keg->uk_flags = arg->flags;
1634	keg->uk_slabzone = NULL;
1635
1636	/*
1637	 * We use a global round-robin policy by default.  Zones with
1638	 * UMA_ZONE_NUMA set will use first-touch instead, in which case the
1639	 * iterator is never run.
1640	 */
1641	keg->uk_dr.dr_policy = DOMAINSET_RR();
1642	keg->uk_dr.dr_iter = 0;
1643
1644	/*
1645	 * The master zone is passed to us at keg-creation time.
1646	 */
1647	zone = arg->zone;
1648	keg->uk_name = zone->uz_name;
1649
1650	if (arg->flags & UMA_ZONE_VM)
1651		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1652
1653	if (arg->flags & UMA_ZONE_ZINIT)
1654		keg->uk_init = zero_init;
1655
1656	if (arg->flags & UMA_ZONE_MALLOC)
1657		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1658
1659	if (arg->flags & UMA_ZONE_PCPU)
1660#ifdef SMP
1661		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1662#else
1663		keg->uk_flags &= ~UMA_ZONE_PCPU;
1664#endif
1665
1666	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1667		keg_cachespread_init(keg);
1668	} else {
1669		if (keg->uk_size > UMA_SLAB_SPACE)
1670			keg_large_init(keg);
1671		else
1672			keg_small_init(keg);
1673	}
1674
1675	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1676		keg->uk_slabzone = slabzone;
1677
1678	/*
1679	 * If we haven't booted yet we need allocations to go through the
1680	 * startup cache until the vm is ready.
1681	 */
1682	if (booted < BOOT_PAGEALLOC)
1683		keg->uk_allocf = startup_alloc;
1684#ifdef UMA_MD_SMALL_ALLOC
1685	else if (keg->uk_ppera == 1)
1686		keg->uk_allocf = uma_small_alloc;
1687#endif
1688	else if (keg->uk_flags & UMA_ZONE_PCPU)
1689		keg->uk_allocf = pcpu_page_alloc;
1690	else
1691		keg->uk_allocf = page_alloc;
1692#ifdef UMA_MD_SMALL_ALLOC
1693	if (keg->uk_ppera == 1)
1694		keg->uk_freef = uma_small_free;
1695	else
1696#endif
1697	if (keg->uk_flags & UMA_ZONE_PCPU)
1698		keg->uk_freef = pcpu_page_free;
1699	else
1700		keg->uk_freef = page_free;
1701
1702	/*
1703	 * Initialize keg's lock
1704	 */
1705	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1706
1707	/*
1708	 * If we're putting the slab header in the actual page we need to
1709	 * figure out where in each page it goes.  This calculates a right
1710	 * justified offset into the memory on an ALIGN_PTR boundary.
1711	 */
1712	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1713		u_int totsize;
1714
1715		/* Size of the slab struct and free list */
1716		totsize = sizeof(struct uma_slab);
1717
1718		if (totsize & UMA_ALIGN_PTR)
1719			totsize = (totsize & ~UMA_ALIGN_PTR) +
1720			    (UMA_ALIGN_PTR + 1);
1721		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
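		/*
		 * Illustrative example (assuming 4KB pages, uk_ppera == 1 and
		 * a slab header that rounds up to 96 bytes): uk_pgoff would
		 * be 4096 - 96 = 4000, placing the header at the very end of
		 * the page.
		 */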
1722
1723		/*
		 * The only way the following is possible is if our
		 * UMA_ALIGN_PTR adjustments have made us bigger than
		 * UMA_SLAB_SIZE.  I haven't checked whether this is
		 * mathematically possible for all cases, so we make
		 * sure here anyway.
1729		 */
1730		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
1731		if (totsize > PAGE_SIZE * keg->uk_ppera) {
1732			printf("zone %s ipers %d rsize %d size %d\n",
1733			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1734			    keg->uk_size);
1735			panic("UMA slab won't fit.");
1736		}
1737	}
1738
1739	if (keg->uk_flags & UMA_ZONE_HASH)
1740		hash_alloc(&keg->uk_hash, 0);
1741
1742	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1743	    keg, zone->uz_name, zone,
1744	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1745	    keg->uk_free);
1746
1747	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1748
1749	rw_wlock(&uma_rwlock);
1750	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1751	rw_wunlock(&uma_rwlock);
1752	return (0);
1753}
1754
1755/*
1756 * Zone header ctor.  This initializes all fields, locks, etc.
1757 *
1758 * Arguments/Returns follow uma_ctor specifications
1759 *	udata  Actually uma_zctor_args
1760 */
1761static int
1762zone_ctor(void *mem, int size, void *udata, int flags)
1763{
1764	struct uma_zctor_args *arg = udata;
1765	uma_zone_t zone = mem;
1766	uma_zone_t z;
1767	uma_keg_t keg;
1768
1769	bzero(zone, size);
1770	zone->uz_name = arg->name;
1771	zone->uz_ctor = arg->ctor;
1772	zone->uz_dtor = arg->dtor;
1773	zone->uz_slab = zone_fetch_slab;
1774	zone->uz_init = NULL;
1775	zone->uz_fini = NULL;
1776	zone->uz_allocs = 0;
1777	zone->uz_frees = 0;
1778	zone->uz_fails = 0;
1779	zone->uz_sleeps = 0;
1780	zone->uz_count = 0;
1781	zone->uz_count_min = 0;
1782	zone->uz_flags = 0;
1783	zone->uz_warning = NULL;
1784	/* The domain structures follow the cpu structures. */
1785	zone->uz_domain =
1786	    (struct uma_zone_domain *)&zone->uz_cpu[mp_maxid + 1];
1787	timevalclear(&zone->uz_ratecheck);
1788	keg = arg->keg;
1789
1790	ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1791
1792	/*
1793	 * This is a pure cache zone, no kegs.
1794	 */
1795	if (arg->import) {
1796		if (arg->flags & UMA_ZONE_VM)
1797			arg->flags |= UMA_ZFLAG_CACHEONLY;
1798		zone->uz_flags = arg->flags;
1799		zone->uz_size = arg->size;
1800		zone->uz_import = arg->import;
1801		zone->uz_release = arg->release;
1802		zone->uz_arg = arg->arg;
1803		zone->uz_lockptr = &zone->uz_lock;
1804		rw_wlock(&uma_rwlock);
1805		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1806		rw_wunlock(&uma_rwlock);
1807		goto out;
1808	}
1809
1810	/*
1811	 * Use the regular zone/keg/slab allocator.
1812	 */
1813	zone->uz_import = (uma_import)zone_import;
1814	zone->uz_release = (uma_release)zone_release;
1815	zone->uz_arg = zone;
1816
1817	if (arg->flags & UMA_ZONE_SECONDARY) {
1818		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1819		zone->uz_init = arg->uminit;
1820		zone->uz_fini = arg->fini;
1821		zone->uz_lockptr = &keg->uk_lock;
1822		zone->uz_flags |= UMA_ZONE_SECONDARY;
1823		rw_wlock(&uma_rwlock);
1824		ZONE_LOCK(zone);
1825		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1826			if (LIST_NEXT(z, uz_link) == NULL) {
1827				LIST_INSERT_AFTER(z, zone, uz_link);
1828				break;
1829			}
1830		}
1831		ZONE_UNLOCK(zone);
1832		rw_wunlock(&uma_rwlock);
1833	} else if (keg == NULL) {
1834		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1835		    arg->align, arg->flags)) == NULL)
1836			return (ENOMEM);
1837	} else {
1838		struct uma_kctor_args karg;
1839		int error;
1840
1841		/* We should only be here from uma_startup() */
1842		karg.size = arg->size;
1843		karg.uminit = arg->uminit;
1844		karg.fini = arg->fini;
1845		karg.align = arg->align;
1846		karg.flags = arg->flags;
1847		karg.zone = zone;
1848		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1849		    flags);
1850		if (error)
1851			return (error);
1852	}
1853
1854	/*
1855	 * Link in the first keg.
1856	 */
1857	zone->uz_klink.kl_keg = keg;
1858	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1859	zone->uz_lockptr = &keg->uk_lock;
1860	zone->uz_size = keg->uk_size;
1861	zone->uz_flags |= (keg->uk_flags &
1862	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1863
1864	/*
1865	 * Some internal zones don't have room allocated for the per cpu
1866	 * caches.  If we're internal, bail out here.
1867	 */
1868	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1869		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1870		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1871		return (0);
1872	}
1873
1874out:
1875	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
1876	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
1877	    ("Invalid zone flag combination"));
1878	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
1879		zone->uz_count = BUCKET_MAX;
1880	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
1881		zone->uz_count = 0;
1882	else
1883		zone->uz_count = bucket_select(zone->uz_size);
1884	zone->uz_count_min = zone->uz_count;
1885
1886	return (0);
1887}
1888
1889/*
1890 * Keg header dtor.  This frees all data, destroys locks, frees the hash
1891 * table and removes the keg from the global list.
1892 *
1893 * Arguments/Returns follow uma_dtor specifications
1894 *	udata  unused
1895 */
1896static void
1897keg_dtor(void *arg, int size, void *udata)
1898{
1899	uma_keg_t keg;
1900
1901	keg = (uma_keg_t)arg;
1902	KEG_LOCK(keg);
1903	if (keg->uk_free != 0) {
1904		printf("Freed UMA keg (%s) was not empty (%d items). "
		    "Lost %d pages of memory.\n",
1906		    keg->uk_name ? keg->uk_name : "",
1907		    keg->uk_free, keg->uk_pages);
1908	}
1909	KEG_UNLOCK(keg);
1910
1911	hash_free(&keg->uk_hash);
1912
1913	KEG_LOCK_FINI(keg);
1914}
1915
1916/*
1917 * Zone header dtor.
1918 *
1919 * Arguments/Returns follow uma_dtor specifications
1920 *	udata  unused
1921 */
1922static void
1923zone_dtor(void *arg, int size, void *udata)
1924{
1925	uma_klink_t klink;
1926	uma_zone_t zone;
1927	uma_keg_t keg;
1928
1929	zone = (uma_zone_t)arg;
1930	keg = zone_first_keg(zone);
1931
1932	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1933		cache_drain(zone);
1934
1935	rw_wlock(&uma_rwlock);
1936	LIST_REMOVE(zone, uz_link);
1937	rw_wunlock(&uma_rwlock);
1938	/*
1939	 * XXX there are some races here where the zone can be drained but the
1940	 * zone lock is released and the zone refilled before we remove it...
1941	 * we don't care about that for now.
1943	 */
1944	zone_drain_wait(zone, M_WAITOK);
1945	/*
1946	 * Unlink all of our kegs.
1947	 */
1948	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1949		klink->kl_keg = NULL;
1950		LIST_REMOVE(klink, kl_link);
1951		if (klink == &zone->uz_klink)
1952			continue;
1953		free(klink, M_TEMP);
1954	}
1955	/*
1956	 * We only destroy kegs from non-secondary zones.
1957	 */
1958	if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1959		rw_wlock(&uma_rwlock);
1960		LIST_REMOVE(keg, uk_link);
1961		rw_wunlock(&uma_rwlock);
1962		zone_free_item(kegs, keg, NULL, SKIP_NONE);
1963	}
1964	ZONE_LOCK_FINI(zone);
1965}
1966
1967/*
1968 * Traverses every zone in the system and calls a callback
1969 *
1970 * Arguments:
1971 *	zfunc  A pointer to a function which accepts a zone
1972 *		as an argument.
1973 *
1974 * Returns:
1975 *	Nothing
1976 */
1977static void
1978zone_foreach(void (*zfunc)(uma_zone_t))
1979{
1980	uma_keg_t keg;
1981	uma_zone_t zone;
1982
1983	rw_rlock(&uma_rwlock);
1984	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1985		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1986			zfunc(zone);
1987	}
1988	rw_runlock(&uma_rwlock);
1989}
1990
1991/*
1992 * Count how many pages we need to bootstrap.  VM supplies its need
1993 * for the early zones in the argument; we add our own zones, which
1994 * consist of: the UMA Slabs zone, the UMA Hash zone and 9 bucket
1995 * zones.  The zone of zones and zone of kegs are accounted for separately.
1996 */
1997#define	UMA_BOOT_ZONES	11
1998/* Zone of zones and zone of kegs have arbitrary alignment. */
1999#define	UMA_BOOT_ALIGN	32
2000static int zsize, ksize;
2001int
2002uma_startup_count(int vm_zones)
2003{
2004	int zones, pages;
2005
2006	ksize = sizeof(struct uma_keg) +
2007	    (sizeof(struct uma_domain) * vm_ndomains);
2008	zsize = sizeof(struct uma_zone) +
2009	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
2010	    (sizeof(struct uma_zone_domain) * vm_ndomains);
2011
2012	/*
2013	 * Memory for the zone of kegs and its keg,
2014	 * and for zone of zones.
2015	 */
2016	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
2017	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
2018
2019#ifdef	UMA_MD_SMALL_ALLOC
2020	zones = UMA_BOOT_ZONES;
2021#else
2022	zones = UMA_BOOT_ZONES + vm_zones;
2023	vm_zones = 0;
2024#endif
2025
2026	/* Memory for the rest of the startup zones, both UMA and VM, ... */
2027	if (zsize > UMA_SLAB_SPACE)
2028		pages += (zones + vm_zones) *
2029		    howmany(roundup2(zsize, UMA_BOOT_ALIGN), UMA_SLAB_SIZE);
2030	else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
2031		pages += zones;
2032	else
2033		pages += howmany(zones,
2034		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
2035
2036	/* ... and their kegs. Note that zone of zones allocates a keg! */
2037	pages += howmany(zones + 1,
2038	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
2039
2040	/*
2041	 * Most of the startup zones are not going to be offpage, which is
2042	 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all of the
2043	 * calculations.  Some large bucket zones will be offpage, and thus
2044	 * will allocate hashes.  We take a conservative approach and assume
2045	 * that all zones may allocate a hash.  This may give us some
2046	 * positive inaccuracy, usually no more than an extra page.
2047	 */
2048	pages += howmany(zones, UMA_SLAB_SPACE /
2049	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
2050
2051	return (pages);
2052}
2053
2054void
2055uma_startup(void *mem, int npages)
2056{
2057	struct uma_zctor_args args;
2058	uma_keg_t masterkeg;
2059	uintptr_t m;
2060
2061#ifdef DIAGNOSTIC
2062	printf("Entering %s with %d boot pages configured\n", __func__, npages);
2063#endif
2064
2065	rw_init(&uma_rwlock, "UMA lock");
2066
2067	/* Use bootpages memory for the zone of zones and zone of kegs. */
2068	m = (uintptr_t)mem;
2069	zones = (uma_zone_t)m;
2070	m += roundup(zsize, CACHE_LINE_SIZE);
2071	kegs = (uma_zone_t)m;
2072	m += roundup(zsize, CACHE_LINE_SIZE);
2073	masterkeg = (uma_keg_t)m;
2074	m += roundup(ksize, CACHE_LINE_SIZE);
2075	m = roundup(m, PAGE_SIZE);
2076	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2077	mem = (void *)m;
2078
2079	/* "manually" create the initial zone */
2080	memset(&args, 0, sizeof(args));
2081	args.name = "UMA Kegs";
2082	args.size = ksize;
2083	args.ctor = keg_ctor;
2084	args.dtor = keg_dtor;
2085	args.uminit = zero_init;
2086	args.fini = NULL;
2087	args.keg = masterkeg;
2088	args.align = UMA_BOOT_ALIGN - 1;
2089	args.flags = UMA_ZFLAG_INTERNAL;
2090	zone_ctor(kegs, zsize, &args, M_WAITOK);
2091
2092	bootmem = mem;
2093	boot_pages = npages;
2094
2095	args.name = "UMA Zones";
2096	args.size = zsize;
2097	args.ctor = zone_ctor;
2098	args.dtor = zone_dtor;
2099	args.uminit = zero_init;
2100	args.fini = NULL;
2101	args.keg = NULL;
2102	args.align = UMA_BOOT_ALIGN - 1;
2103	args.flags = UMA_ZFLAG_INTERNAL;
2104	zone_ctor(zones, zsize, &args, M_WAITOK);
2105
2106	/* Now make a zone for slab headers */
2107	slabzone = uma_zcreate("UMA Slabs",
2108				sizeof(struct uma_slab),
2109				NULL, NULL, NULL, NULL,
2110				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2111
2112	hashzone = uma_zcreate("UMA Hash",
2113	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2114	    NULL, NULL, NULL, NULL,
2115	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2116
2117	bucket_init();
2118
2119	booted = BOOT_STRAPPED;
2120}
2121
2122void
2123uma_startup1(void)
2124{
2125
2126#ifdef DIAGNOSTIC
2127	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2128#endif
2129	booted = BOOT_PAGEALLOC;
2130}
2131
2132void
2133uma_startup2(void)
2134{
2135
2136#ifdef DIAGNOSTIC
2137	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2138#endif
2139	booted = BOOT_BUCKETS;
2140	sx_init(&uma_drain_lock, "umadrain");
2141	bucket_enable();
2142}
2143
2144static void
2145uma_startup3(void)
2146{
2147
2148#ifdef INVARIANTS
2149	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2150	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2151	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2152#endif
2153	callout_init(&uma_callout, 1);
2154	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2155	booted = BOOT_RUNNING;
2156
2157	EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
2158	    EVENTHANDLER_PRI_FIRST);
2159}
2160
2161static void
2162uma_shutdown(void)
2163{
2164
2165	booted = BOOT_SHUTDOWN;
2166}
2167
2168static uma_keg_t
2169uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2170		int align, uint32_t flags)
2171{
2172	struct uma_kctor_args args;
2173
2174	args.size = size;
2175	args.uminit = uminit;
2176	args.fini = fini;
2177	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2178	args.flags = flags;
2179	args.zone = zone;
2180	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2181}
2182
2183/* Public functions */
2184/* See uma.h */
2185void
2186uma_set_align(int align)
2187{
2188
2189	if (align != UMA_ALIGN_CACHE)
2190		uma_align_cache = align;
2191}
2192
2193/* See uma.h */
2194uma_zone_t
2195uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2196		uma_init uminit, uma_fini fini, int align, uint32_t flags)
2197
2198{
2199	struct uma_zctor_args args;
2200	uma_zone_t res;
2201	bool locked;
2202
2203	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2204	    align, name));
2205
2206	/* This stuff is essential for the zone ctor */
2207	memset(&args, 0, sizeof(args));
2208	args.name = name;
2209	args.size = size;
2210	args.ctor = ctor;
2211	args.dtor = dtor;
2212	args.uminit = uminit;
2213	args.fini = fini;
2214#ifdef  INVARIANTS
2215	/*
2216	 * If a zone is being created with an empty constructor and
2217	 * destructor, pass UMA constructor/destructor which checks for
2218	 * memory use after free.
2219	 */
2220	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2221	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2222		args.ctor = trash_ctor;
2223		args.dtor = trash_dtor;
2224		args.uminit = trash_init;
2225		args.fini = trash_fini;
2226	}
2227#endif
2228	args.align = align;
2229	args.flags = flags;
2230	args.keg = NULL;
2231
2232	if (booted < BOOT_BUCKETS) {
2233		locked = false;
2234	} else {
2235		sx_slock(&uma_drain_lock);
2236		locked = true;
2237	}
2238	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2239	if (locked)
2240		sx_sunlock(&uma_drain_lock);
2241	return (res);
2242}
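
/*
 * Example (an illustrative sketch, not compiled here; "struct foo" and
 * foo_zone are hypothetical): a typical consumer creates a zone once and
 * then allocates and frees items through the per-CPU caches:
 *
 *	static uma_zone_t foo_zone;
 *	struct foo *fp;
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 *	fp = uma_zalloc(foo_zone, M_WAITOK | M_ZERO);
 *	...
 *	uma_zfree(foo_zone, fp);
 *	uma_zdestroy(foo_zone);
 */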
2243
2244/* See uma.h */
2245uma_zone_t
2246uma_zsecond_create(const char *name, uma_ctor ctor, uma_dtor dtor,
2247    uma_init zinit, uma_fini zfini, uma_zone_t master)
2248{
2249	struct uma_zctor_args args;
2250	uma_keg_t keg;
2251	uma_zone_t res;
2252	bool locked;
2253
2254	keg = zone_first_keg(master);
2255	memset(&args, 0, sizeof(args));
2256	args.name = name;
2257	args.size = keg->uk_size;
2258	args.ctor = ctor;
2259	args.dtor = dtor;
2260	args.uminit = zinit;
2261	args.fini = zfini;
2262	args.align = keg->uk_align;
2263	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2264	args.keg = keg;
2265
2266	if (booted < BOOT_BUCKETS) {
2267		locked = false;
2268	} else {
2269		sx_slock(&uma_drain_lock);
2270		locked = true;
2271	}
2272	/* XXX Attaches only one keg of potentially many. */
2273	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2274	if (locked)
2275		sx_sunlock(&uma_drain_lock);
2276	return (res);
2277}
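
/*
 * Example (sketch; the foo_* names are hypothetical): a secondary zone
 * shares the master zone's keg, and therefore its slabs, but layers its
 * own ctor/dtor on top, much as the mbuf packet zone does:
 *
 *	foo_cache_zone = uma_zsecond_create("foo cache", foo_ctor, foo_dtor,
 *	    NULL, NULL, foo_zone);
 */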
2278
2279/* See uma.h */
2280uma_zone_t
2281uma_zcache_create(const char *name, int size, uma_ctor ctor, uma_dtor dtor,
2282    uma_init zinit, uma_fini zfini, uma_import zimport, uma_release zrelease,
2283    void *arg, int flags)
2284{
2285	struct uma_zctor_args args;
2286
2287	memset(&args, 0, sizeof(args));
2288	args.name = name;
2289	args.size = size;
2290	args.ctor = ctor;
2291	args.dtor = dtor;
2292	args.uminit = zinit;
2293	args.fini = zfini;
2294	args.import = zimport;
2295	args.release = zrelease;
2296	args.arg = arg;
2297	args.align = 0;
2298	args.flags = flags;
2299
2300	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2301}
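
/*
 * Example (sketch; the foo_* callbacks are hypothetical): a cache zone has
 * no keg, so the caller supplies import/release methods that move items
 * between the bucket caches and some external backing store:
 *
 *	static int
 *	foo_import(void *arg, void **store, int cnt, int domain, int flags)
 *	{
 *		(fill store[] with up to cnt items; return the number filled)
 *	}
 *
 *	static void
 *	foo_release(void *arg, void **store, int cnt)
 *	{
 *		(give the cnt items in store[] back to the backing store)
 *	}
 *
 *	zone = uma_zcache_create("foo cache", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, foo_import, foo_release, NULL, 0);
 */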
2302
2303static void
2304zone_lock_pair(uma_zone_t a, uma_zone_t b)
2305{
2306	if (a < b) {
2307		ZONE_LOCK(a);
2308		mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
2309	} else {
2310		ZONE_LOCK(b);
2311		mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
2312	}
2313}
2314
2315static void
2316zone_unlock_pair(uma_zone_t a, uma_zone_t b)
2317{
2318
2319	ZONE_UNLOCK(a);
2320	ZONE_UNLOCK(b);
2321}
2322
2323int
2324uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
2325{
2326	uma_klink_t klink;
2327	uma_klink_t kl;
2328	int error;
2329
2330	error = 0;
2331	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
2332
2333	zone_lock_pair(zone, master);
2334	/*
2335	 * The zone must use vtoslab() to resolve objects and must already be
2336	 * a secondary zone.
2337	 */
2338	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
2339	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
2340		error = EINVAL;
2341		goto out;
2342	}
2343	/*
2344	 * The new master must also use vtoslab().
2345	 */
2346	if ((master->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
2347		error = EINVAL;
2348		goto out;
2349	}
2350
2351	/*
2352	 * The underlying object must be the same size.  rsize
2353	 * may be different.
2354	 */
2355	if (master->uz_size != zone->uz_size) {
2356		error = E2BIG;
2357		goto out;
2358	}
2359	/*
2360	 * Put it at the end of the list.
2361	 */
2362	klink->kl_keg = zone_first_keg(master);
2363	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
2364		if (LIST_NEXT(kl, kl_link) == NULL) {
2365			LIST_INSERT_AFTER(kl, klink, kl_link);
2366			break;
2367		}
2368	}
2369	klink = NULL;
2370	zone->uz_flags |= UMA_ZFLAG_MULTI;
2371	zone->uz_slab = zone_fetch_slab_multi;
2372
2373out:
2374	zone_unlock_pair(zone, master);
2375	if (klink != NULL)
2376		free(klink, M_TEMP);
2377
2378	return (error);
2379}
2380
2381
2382/* See uma.h */
2383void
2384uma_zdestroy(uma_zone_t zone)
2385{
2386
2387	/*
2388	 * Large slabs are expensive to reclaim, so don't bother doing
2389	 * unnecessary work if we're shutting down.
2390	 */
2391	if (booted == BOOT_SHUTDOWN &&
2392	    zone->uz_fini == NULL &&
2393	    zone->uz_release == (uma_release)zone_release)
2394		return;
2395	sx_slock(&uma_drain_lock);
2396	zone_free_item(zones, zone, NULL, SKIP_NONE);
2397	sx_sunlock(&uma_drain_lock);
2398}
2399
2400void
2401uma_zwait(uma_zone_t zone)
2402{
2403	void *item;
2404
2405	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2406	uma_zfree(zone, item);
2407}
2408
2409void *
2410uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2411{
2412	void *item;
2413#ifdef SMP
2414	int i;
2415
2416	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2417#endif
2418	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
2419	if (item != NULL && (flags & M_ZERO)) {
2420#ifdef SMP
2421		for (i = 0; i <= mp_maxid; i++)
2422			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2423#else
2424		bzero(item, zone->uz_size);
2425#endif
2426	}
2427	return (item);
2428}
2429
2430/*
2431 * A stub while both regular and pcpu cases are identical.
2432 */
2433void
2434uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2435{
2436
2437#ifdef SMP
2438	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2439#endif
2440	uma_zfree_arg(zone, item, udata);
2441}
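
/*
 * Example (sketch): a zone created with UMA_ZONE_PCPU returns one item
 * with a slot per CPU; M_ZERO is handled above by zeroing every slot, and
 * a particular CPU's copy is reached with zpcpu_get_cpu():
 *
 *	p = uma_zalloc_pcpu_arg(pcpu_zone, NULL, M_WAITOK | M_ZERO);
 *	...
 *	uma_zfree_pcpu_arg(pcpu_zone, p, NULL);
 */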
2442
2443/* See uma.h */
2444void *
2445uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2446{
2447	uma_zone_domain_t zdom;
2448	uma_bucket_t bucket;
2449	uma_cache_t cache;
2450	void *item;
2451	int cpu, domain, lockfail;
2452#ifdef INVARIANTS
2453	bool skipdbg;
2454#endif
2455
2456	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2457	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2458
2459	/* This is the fast path allocation */
2460	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2461	    curthread, zone->uz_name, zone, flags);
2462
2463	if (flags & M_WAITOK) {
2464		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2465		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2466	}
2467	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2468	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2469	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2470	if (zone->uz_flags & UMA_ZONE_PCPU)
2471		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2472		    "with M_ZERO passed"));
2473
2474#ifdef DEBUG_MEMGUARD
2475	if (memguard_cmp_zone(zone)) {
2476		item = memguard_alloc(zone->uz_size, flags);
2477		if (item != NULL) {
2478			if (zone->uz_init != NULL &&
2479			    zone->uz_init(item, zone->uz_size, flags) != 0)
2480				return (NULL);
2481			if (zone->uz_ctor != NULL &&
2482			    zone->uz_ctor(item, zone->uz_size, udata,
2483			    flags) != 0) {
2484			    	zone->uz_fini(item, zone->uz_size);
2485				return (NULL);
2486			}
2487			return (item);
2488		}
2489		/* This is unfortunate but should not be fatal. */
2490	}
2491#endif
2492	/*
2493	 * If possible, allocate from the per-CPU cache.  There are two
2494	 * requirements for safe access to the per-CPU cache: (1) the thread
2495	 * accessing the cache must not be preempted or yield during access,
2496	 * and (2) the thread must not migrate CPUs without switching which
2497	 * cache it accesses.  We rely on a critical section to prevent
2498	 * preemption and migration.  We release the critical section in
2499	 * order to acquire the zone mutex if we are unable to allocate from
2500	 * the current cache; when we re-acquire the critical section, we
2501	 * must detect and handle migration if it has occurred.
2502	 */
2503zalloc_restart:
2504	critical_enter();
2505	cpu = curcpu;
2506	cache = &zone->uz_cpu[cpu];
2507
2508zalloc_start:
2509	bucket = cache->uc_allocbucket;
2510	if (bucket != NULL && bucket->ub_cnt > 0) {
2511		bucket->ub_cnt--;
2512		item = bucket->ub_bucket[bucket->ub_cnt];
2513#ifdef INVARIANTS
2514		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2515#endif
2516		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2517		cache->uc_allocs++;
2518		critical_exit();
2519#ifdef INVARIANTS
2520		skipdbg = uma_dbg_zskip(zone, item);
2521#endif
2522		if (zone->uz_ctor != NULL &&
2523#ifdef INVARIANTS
2524		    (!skipdbg || zone->uz_ctor != trash_ctor ||
2525		    zone->uz_dtor != trash_dtor) &&
2526#endif
2527		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2528			atomic_add_long(&zone->uz_fails, 1);
2529			zone_free_item(zone, item, udata, SKIP_DTOR);
2530			return (NULL);
2531		}
2532#ifdef INVARIANTS
2533		if (!skipdbg)
2534			uma_dbg_alloc(zone, NULL, item);
2535#endif
2536		if (flags & M_ZERO)
2537			uma_zero_item(item, zone);
2538		return (item);
2539	}
2540
2541	/*
2542	 * We have run out of items in our alloc bucket.
2543	 * See if we can switch with our free bucket.
2544	 */
2545	bucket = cache->uc_freebucket;
2546	if (bucket != NULL && bucket->ub_cnt > 0) {
2547		CTR2(KTR_UMA,
2548		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
2549		    zone->uz_name, zone);
2550		cache->uc_freebucket = cache->uc_allocbucket;
2551		cache->uc_allocbucket = bucket;
2552		goto zalloc_start;
2553	}
2554
2555	/*
2556	 * Discard any empty allocation bucket while we hold no locks.
2557	 */
2558	bucket = cache->uc_allocbucket;
2559	cache->uc_allocbucket = NULL;
2560	critical_exit();
2561	if (bucket != NULL)
2562		bucket_free(zone, bucket, udata);
2563
2564	if (zone->uz_flags & UMA_ZONE_NUMA) {
2565		domain = PCPU_GET(domain);
2566		if (VM_DOMAIN_EMPTY(domain))
2567			domain = UMA_ANYDOMAIN;
2568	} else
2569		domain = UMA_ANYDOMAIN;
2570
2571	/* Short-circuit for zones without buckets and low memory. */
2572	if (zone->uz_count == 0 || bucketdisable)
2573		goto zalloc_item;
2574
2575	/*
2576	 * Our attempt to fetch the item from the per-CPU cache has failed, so
2577	 * we must go back to the zone.  This requires the zone lock, so we
2578	 * must drop the critical section, then re-acquire it when we go back
2579	 * to the cache.  Since the critical section is released, we may be
2580	 * preempted or migrate.  As such, make sure not to maintain any
2581	 * thread-local state specific to the cache from prior to releasing
2582	 * the critical section.
2583	 */
2584	lockfail = 0;
2585	if (ZONE_TRYLOCK(zone) == 0) {
2586		/* Record contention to size the buckets. */
2587		ZONE_LOCK(zone);
2588		lockfail = 1;
2589	}
2590	critical_enter();
2591	cpu = curcpu;
2592	cache = &zone->uz_cpu[cpu];
2593
2594	/* See if we lost the race to fill the cache. */
2595	if (cache->uc_allocbucket != NULL) {
2596		ZONE_UNLOCK(zone);
2597		goto zalloc_start;
2598	}
2599
2600	/*
2601	 * Check the zone's cache of buckets.
2602	 */
2603	if (domain == UMA_ANYDOMAIN)
2604		zdom = &zone->uz_domain[0];
2605	else
2606		zdom = &zone->uz_domain[domain];
2607	if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
2608		KASSERT(bucket->ub_cnt != 0,
2609		    ("uma_zalloc_arg: Returning an empty bucket."));
2610		cache->uc_allocbucket = bucket;
2611		ZONE_UNLOCK(zone);
2612		goto zalloc_start;
2613	}
2614	/* We are no longer associated with this CPU. */
2615	critical_exit();
2616
2617	/*
2618	 * We bump the uz count when the cache size is insufficient to
2619	 * handle the working set.
2620	 */
2621	if (lockfail && zone->uz_count < BUCKET_MAX)
2622		zone->uz_count++;
2623	ZONE_UNLOCK(zone);
2624
2625	/*
2626	 * Now let's just fill a bucket and put it on the free list.  If that
2627	 * works we'll restart the allocation from the beginning and it
2628	 * will use the just-filled bucket.
2629	 */
2630	bucket = zone_alloc_bucket(zone, udata, domain, flags);
2631	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2632	    zone->uz_name, zone, bucket);
2633	if (bucket != NULL) {
2634		ZONE_LOCK(zone);
2635		critical_enter();
2636		cpu = curcpu;
2637		cache = &zone->uz_cpu[cpu];
2638
2639		/*
2640		 * See if we lost the race or were migrated.  Cache the
2641		 * initialized bucket to make this less likely or claim
2642		 * the memory directly.
2643		 */
2644		if (cache->uc_allocbucket == NULL &&
2645		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
2646		    domain == PCPU_GET(domain))) {
2647			cache->uc_allocbucket = bucket;
2648			zdom->uzd_imax += bucket->ub_cnt;
2649		} else if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
2650			critical_exit();
2651			ZONE_UNLOCK(zone);
2652			bucket_drain(zone, bucket);
2653			bucket_free(zone, bucket, udata);
2654			goto zalloc_restart;
2655		} else
2656			zone_put_bucket(zone, zdom, bucket, false);
2657		ZONE_UNLOCK(zone);
2658		goto zalloc_start;
2659	}
2660
2661	/*
2662	 * We may not be able to get a bucket so return an actual item.
2663	 */
2664zalloc_item:
2665	item = zone_alloc_item(zone, udata, domain, flags);
2666
2667	return (item);
2668}
2669
2670void *
2671uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2672{
2673
2674	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2675	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2676
2677	/* This path bypasses the per-CPU caches entirely. */
2678	CTR5(KTR_UMA,
2679	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2680	    curthread, zone->uz_name, zone, domain, flags);
2681
2682	if (flags & M_WAITOK) {
2683		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2684		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2685	}
2686	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2687	    ("uma_zalloc_domain: called with spinlock or critical section held"));
2688
2689	return (zone_alloc_item(zone, udata, domain, flags));
2690}
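
/*
 * Example (sketch): a caller that already knows which NUMA domain will use
 * the object can bypass the per-CPU caches and request memory from that
 * domain directly, pairing it with uma_zfree_domain() on release:
 *
 *	item = uma_zalloc_domain(zone, NULL, domain, M_WAITOK);
 *	...
 *	uma_zfree_domain(zone, item, NULL);
 */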
2691
2692/*
2693 * Find a slab with some space.  Prefer slabs that are partially used over
2694 * those that are completely free.  This helps to reduce fragmentation.
2695 *
2696 * If 'rr' is true, search all domains starting from 'domain'.  Otherwise check
2697 * only 'domain'.
2698 */
2699static uma_slab_t
2700keg_first_slab(uma_keg_t keg, int domain, bool rr)
2701{
2702	uma_domain_t dom;
2703	uma_slab_t slab;
2704	int start;
2705
2706	KASSERT(domain >= 0 && domain < vm_ndomains,
2707	    ("keg_first_slab: domain %d out of range", domain));
2708
2709	slab = NULL;
2710	start = domain;
2711	do {
2712		dom = &keg->uk_domain[domain];
2713		if (!LIST_EMPTY(&dom->ud_part_slab))
2714			return (LIST_FIRST(&dom->ud_part_slab));
2715		if (!LIST_EMPTY(&dom->ud_free_slab)) {
2716			slab = LIST_FIRST(&dom->ud_free_slab);
2717			LIST_REMOVE(slab, us_link);
2718			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2719			return (slab);
2720		}
2721		if (rr)
2722			domain = (domain + 1) % vm_ndomains;
2723	} while (domain != start);
2724
2725	return (NULL);
2726}
2727
2728static uma_slab_t
2729keg_fetch_free_slab(uma_keg_t keg, int domain, bool rr, int flags)
2730{
2731	uint32_t reserve;
2732
2733	mtx_assert(&keg->uk_lock, MA_OWNED);
2734
2735	reserve = (flags & M_USE_RESERVE) != 0 ? 0 : keg->uk_reserve;
2736	if (keg->uk_free <= reserve)
2737		return (NULL);
2738	return (keg_first_slab(keg, domain, rr));
2739}
2740
2741static uma_slab_t
2742keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, const int flags)
2743{
2744	struct vm_domainset_iter di;
2745	uma_domain_t dom;
2746	uma_slab_t slab;
2747	int aflags, domain;
2748	bool rr;
2749
2750restart:
2751	mtx_assert(&keg->uk_lock, MA_OWNED);
2752
2753	/*
2754	 * Use the keg's policy if upper layers haven't already specified a
2755	 * domain (as happens with first-touch zones).
2756	 *
2757	 * To avoid races we run the iterator with the keg lock held, but that
2758	 * means that we cannot allow the vm_domainset layer to sleep.  Thus,
2759	 * clear M_WAITOK and handle low memory conditions locally.
2760	 */
2761	rr = rdomain == UMA_ANYDOMAIN;
2762	if (rr) {
2763		aflags = (flags & ~M_WAITOK) | M_NOWAIT;
2764		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
2765		    &aflags);
2766	} else {
2767		aflags = flags;
2768		domain = rdomain;
2769	}
2770
2771	for (;;) {
2772		slab = keg_fetch_free_slab(keg, domain, rr, flags);
2773		if (slab != NULL) {
2774			MPASS(slab->us_keg == keg);
2775			return (slab);
2776		}
2777
2778		/*
2779		 * M_NOVM means don't ask at all!
2780		 */
2781		if (flags & M_NOVM)
2782			break;
2783
2784		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2785			keg->uk_flags |= UMA_ZFLAG_FULL;
2786			/*
2787			 * If this is not a multi-zone, set the FULL bit.
2788			 * Otherwise slab_multi() takes care of it.
2789			 */
2790			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2791				zone->uz_flags |= UMA_ZFLAG_FULL;
2792				zone_log_warning(zone);
2793				zone_maxaction(zone);
2794			}
2795			if (flags & M_NOWAIT)
2796				return (NULL);
2797			zone->uz_sleeps++;
2798			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2799			continue;
2800		}
2801		slab = keg_alloc_slab(keg, zone, domain, flags, aflags);
2802		/*
2803		 * If we got a slab here it's safe to mark it partially used
2804		 * and return.  We assume that the caller is going to remove
2805		 * at least one item.
2806		 */
2807		if (slab) {
2808			MPASS(slab->us_keg == keg);
2809			dom = &keg->uk_domain[slab->us_domain];
2810			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2811			return (slab);
2812		}
2813		KEG_LOCK(keg);
2814		if (rr && vm_domainset_iter_policy(&di, &domain) != 0) {
2815			if ((flags & M_WAITOK) != 0) {
2816				KEG_UNLOCK(keg);
2817				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
2818				KEG_LOCK(keg);
2819				goto restart;
2820			}
2821			break;
2822		}
2823	}
2824
2825	/*
2826	 * We might not have been able to get a slab but another cpu
2827	 * could have while we were unlocked.  Check again before we
2828	 * fail.
2829	 */
2830	if ((slab = keg_fetch_free_slab(keg, domain, rr, flags)) != NULL) {
2831		MPASS(slab->us_keg == keg);
2832		return (slab);
2833	}
2834	return (NULL);
2835}
2836
2837static uma_slab_t
2838zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2839{
2840	uma_slab_t slab;
2841
2842	if (keg == NULL) {
2843		keg = zone_first_keg(zone);
2844		KEG_LOCK(keg);
2845	}
2846
2847	for (;;) {
2848		slab = keg_fetch_slab(keg, zone, domain, flags);
2849		if (slab)
2850			return (slab);
2851		if (flags & (M_NOWAIT | M_NOVM))
2852			break;
2853	}
2854	KEG_UNLOCK(keg);
2855	return (NULL);
2856}
2857
2858/*
2859 * zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2860 * with the keg locked.  If NULL is returned, no lock is held.
2861 *
2862 * The 'last' keg pointer is used to seed the search.  It is not required.
2863 */
2864static uma_slab_t
2865zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags)
2866{
2867	uma_klink_t klink;
2868	uma_slab_t slab;
2869	uma_keg_t keg;
2870	int flags;
2871	int empty;
2872	int full;
2873
2874	/*
2875	 * Don't wait on the first pass.  This will skip limit tests
2876	 * as well.  We don't want to block if we can find a provider
2877	 * without blocking.
2878	 */
2879	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2880	/*
2881	 * Use the last slab allocated as a hint for where to start
2882	 * the search.
2883	 */
2884	if (last != NULL) {
2885		slab = keg_fetch_slab(last, zone, domain, flags);
2886		if (slab)
2887			return (slab);
2888		KEG_UNLOCK(last);
2889	}
2890	/*
2891	 * Loop until we have a slab, in case of transient failures
2892	 * while M_WAITOK is specified.  I'm not sure this is 100%
2893	 * required, but we've done it this way for so long now.
2894	 */
2895	for (;;) {
2896		empty = 0;
2897		full = 0;
2898		/*
2899		 * Search the available kegs for slabs.  Be careful to hold the
2900		 * correct lock while calling into the keg layer.
2901		 */
2902		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2903			keg = klink->kl_keg;
2904			KEG_LOCK(keg);
2905			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2906				slab = keg_fetch_slab(keg, zone, domain, flags);
2907				if (slab)
2908					return (slab);
2909			}
2910			if (keg->uk_flags & UMA_ZFLAG_FULL)
2911				full++;
2912			else
2913				empty++;
2914			KEG_UNLOCK(keg);
2915		}
2916		if (rflags & (M_NOWAIT | M_NOVM))
2917			break;
2918		flags = rflags;
2919		/*
2920		 * All kegs are full.  XXX We can't atomically check all kegs
2921		 * and sleep so just sleep for a short period and retry.
2922		 */
2923		if (full && !empty) {
2924			ZONE_LOCK(zone);
2925			zone->uz_flags |= UMA_ZFLAG_FULL;
2926			zone->uz_sleeps++;
2927			zone_log_warning(zone);
2928			zone_maxaction(zone);
2929			msleep(zone, zone->uz_lockptr, PVM,
2930			    "zonelimit", hz/100);
2931			zone->uz_flags &= ~UMA_ZFLAG_FULL;
2932			ZONE_UNLOCK(zone);
2933			continue;
2934		}
2935	}
2936	return (NULL);
2937}
2938
2939static void *
2940slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2941{
2942	uma_domain_t dom;
2943	void *item;
2944	uint8_t freei;
2945
2946	MPASS(keg == slab->us_keg);
2947	mtx_assert(&keg->uk_lock, MA_OWNED);
2948
2949	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2950	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2951	item = slab->us_data + (keg->uk_rsize * freei);
2952	slab->us_freecount--;
2953	keg->uk_free--;
2954
2955	/* Move this slab to the full list */
2956	if (slab->us_freecount == 0) {
2957		LIST_REMOVE(slab, us_link);
2958		dom = &keg->uk_domain[slab->us_domain];
2959		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
2960	}
2961
2962	return (item);
2963}
2964
2965static int
2966zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
2967{
2968	uma_slab_t slab;
2969	uma_keg_t keg;
2970#ifdef NUMA
2971	int stripe;
2972#endif
2973	int i;
2974
2975	slab = NULL;
2976	keg = NULL;
2977	/* Try to keep the buckets totally full */
2978	for (i = 0; i < max; ) {
2979		if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL)
2980			break;
2981		keg = slab->us_keg;
2982#ifdef NUMA
2983		stripe = howmany(max, vm_ndomains);
2984#endif
2985		while (slab->us_freecount && i < max) {
2986			bucket[i++] = slab_alloc_item(keg, slab);
2987			if (keg->uk_free <= keg->uk_reserve)
2988				break;
2989#ifdef NUMA
2990			/*
2991			 * If the zone is striped we pick a new slab for every
2992			 * N allocations.  Eliminating this conditional will
2993			 * instead pick a new domain for each bucket rather
2994			 * than stripe within each bucket.  The current option
2995			 * produces more fragmentation and requires more cpu
2996			 * time but yields better distribution.
2997			 */
2998			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
2999			    vm_ndomains > 1 && --stripe == 0)
3000				break;
3001#endif
3002		}
3003		/* Don't block if we allocated any successfully. */
3004		flags &= ~M_WAITOK;
3005		flags |= M_NOWAIT;
3006	}
3007	if (slab != NULL)
3008		KEG_UNLOCK(keg);
3009
3010	return (i);
3011}
3012
3013static uma_bucket_t
3014zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
3015{
3016	uma_bucket_t bucket;
3017	int max;
3018
3019	CTR1(KTR_UMA, "zone_alloc_bucket: domain %d", domain);
3020
3021	/* Don't wait for buckets, preserve caller's NOVM setting. */
3022	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
3023	if (bucket == NULL)
3024		return (NULL);
3025
3026	max = MIN(bucket->ub_entries, zone->uz_count);
3027	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
3028	    max, domain, flags);
3029
3030	/*
3031	 * Initialize the memory if necessary.
3032	 */
3033	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
3034		int i;
3035
3036		for (i = 0; i < bucket->ub_cnt; i++)
3037			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
3038			    flags) != 0)
3039				break;
3040		/*
3041		 * If we couldn't initialize the whole bucket, put the
3042		 * rest back onto the freelist.
3043		 */
3044		if (i != bucket->ub_cnt) {
3045			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
3046			    bucket->ub_cnt - i);
3047#ifdef INVARIANTS
3048			bzero(&bucket->ub_bucket[i],
3049			    sizeof(void *) * (bucket->ub_cnt - i));
3050#endif
3051			bucket->ub_cnt = i;
3052		}
3053	}
3054
3055	if (bucket->ub_cnt == 0) {
3056		bucket_free(zone, bucket, udata);
3057		atomic_add_long(&zone->uz_fails, 1);
3058		return (NULL);
3059	}
3060
3061	return (bucket);
3062}
3063
3064/*
3065 * Allocates a single item from a zone.
3066 *
3067 * Arguments
3068 *	zone   The zone to alloc for.
3069 *	udata  The data to be passed to the constructor.
3070 *	domain The domain to allocate from or UMA_ANYDOMAIN.
3071 *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
3072 *
3073 * Returns
3074 *	NULL if there is no memory and M_NOWAIT is set
3075 *	An item if successful
3076 */
3077
3078static void *
3079zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
3080{
3081	void *item;
3082#ifdef INVARIANTS
3083	bool skipdbg;
3084#endif
3085
3086	item = NULL;
3087
3088	if (domain != UMA_ANYDOMAIN) {
3089		/* avoid allocs targeting empty domains */
3090		if (VM_DOMAIN_EMPTY(domain))
3091			domain = UMA_ANYDOMAIN;
3092	}
3093	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
3094		goto fail;
3095	atomic_add_long(&zone->uz_allocs, 1);
3096
3097#ifdef INVARIANTS
3098	skipdbg = uma_dbg_zskip(zone, item);
3099#endif
3100	/*
3101	 * We have to call both the zone's init (not the keg's init)
3102	 * and the zone's ctor.  This is because the item is going from
3103	 * a keg slab directly to the user, and the user is expecting it
3104	 * to be both zone-init'd as well as zone-ctor'd.
3105	 */
3106	if (zone->uz_init != NULL) {
3107		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3108			zone_free_item(zone, item, udata, SKIP_FINI);
3109			goto fail;
3110		}
3111	}
3112	if (zone->uz_ctor != NULL &&
3113#ifdef INVARIANTS
3114	    (!skipdbg || zone->uz_ctor != trash_ctor ||
3115	    zone->uz_dtor != trash_dtor) &&
3116#endif
3117	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
3118		zone_free_item(zone, item, udata, SKIP_DTOR);
3119		goto fail;
3120	}
3121#ifdef INVARIANTS
3122	if (!skipdbg)
3123		uma_dbg_alloc(zone, NULL, item);
3124#endif
3125	if (flags & M_ZERO)
3126		uma_zero_item(item, zone);
3127
3128	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3129	    zone->uz_name, zone);
3130
3131	return (item);
3132
3133fail:
3134	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3135	    zone->uz_name, zone);
3136	atomic_add_long(&zone->uz_fails, 1);
3137	return (NULL);
3138}
3139
3140/* See uma.h */
3141void
3142uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3143{
3144	uma_cache_t cache;
3145	uma_bucket_t bucket;
3146	uma_zone_domain_t zdom;
3147	int cpu, domain, lockfail;
3148#ifdef INVARIANTS
3149	bool skipdbg;
3150#endif
3151
3152	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3153	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3154
3155	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3156	    zone->uz_name);
3157
3158	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3159	    ("uma_zfree_arg: called with spinlock or critical section held"));
3160
3161        /* uma_zfree(..., NULL) does nothing, to match free(9). */
3162        if (item == NULL)
3163                return;
3164#ifdef DEBUG_MEMGUARD
3165	if (is_memguard_addr(item)) {
3166		if (zone->uz_dtor != NULL)
3167			zone->uz_dtor(item, zone->uz_size, udata);
3168		if (zone->uz_fini != NULL)
3169			zone->uz_fini(item, zone->uz_size);
3170		memguard_free(item);
3171		return;
3172	}
3173#endif
3174#ifdef INVARIANTS
3175	skipdbg = uma_dbg_zskip(zone, item);
3176	if (skipdbg == false) {
3177		if (zone->uz_flags & UMA_ZONE_MALLOC)
3178			uma_dbg_free(zone, udata, item);
3179		else
3180			uma_dbg_free(zone, NULL, item);
3181	}
3182	if (zone->uz_dtor != NULL && (!skipdbg ||
3183	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
3184#else
3185	if (zone->uz_dtor != NULL)
3186#endif
3187		zone->uz_dtor(item, zone->uz_size, udata);
3188
3189	/*
3190	 * The race here is acceptable.  If we miss it we'll just have to wait
3191	 * a little longer for the limits to be reset.
3192	 */
3193	if (zone->uz_flags & UMA_ZFLAG_FULL)
3194		goto zfree_item;
3195
3196	/*
3197	 * If possible, free to the per-CPU cache.  There are two
3198	 * requirements for safe access to the per-CPU cache: (1) the thread
3199	 * accessing the cache must not be preempted or yield during access,
3200	 * and (2) the thread must not migrate CPUs without switching which
3201	 * cache it accesses.  We rely on a critical section to prevent
3202	 * preemption and migration.  We release the critical section in
3203	 * order to acquire the zone mutex if we are unable to free to the
3204	 * current cache; when we re-acquire the critical section, we must
3205	 * detect and handle migration if it has occurred.
3206	 */
3207zfree_restart:
3208	critical_enter();
3209	cpu = curcpu;
3210	cache = &zone->uz_cpu[cpu];
3211
3212zfree_start:
3213	/*
3214	 * Try to free into the allocbucket first to give LIFO ordering
3215	 * for cache-hot data structures.  Spill over into the freebucket
3216	 * if necessary.  Alloc will swap them if one runs dry.
3217	 */
3218	bucket = cache->uc_allocbucket;
3219	if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3220		bucket = cache->uc_freebucket;
3221	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3222		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
3223		    ("uma_zfree: Freeing to non free bucket index."));
3224		bucket->ub_bucket[bucket->ub_cnt] = item;
3225		bucket->ub_cnt++;
3226		cache->uc_frees++;
3227		critical_exit();
3228		return;
3229	}
3230
3231	/*
3232	 * We must go back to the zone, which requires acquiring the zone lock,
3233	 * which in turn means we must release and re-acquire the critical
3234	 * section.  Since the critical section is released, we may be
3235	 * preempted or migrate.  As such, make sure not to maintain any
3236	 * thread-local state specific to the cache from prior to releasing
3237	 * the critical section.
3238	 */
3239	critical_exit();
3240	if (zone->uz_count == 0 || bucketdisable)
3241		goto zfree_item;
3242
3243	lockfail = 0;
3244	if (ZONE_TRYLOCK(zone) == 0) {
3245		/* Record contention to size the buckets. */
3246		ZONE_LOCK(zone);
3247		lockfail = 1;
3248	}
3249	critical_enter();
3250	cpu = curcpu;
3251	cache = &zone->uz_cpu[cpu];
3252
3253	bucket = cache->uc_freebucket;
3254	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3255		ZONE_UNLOCK(zone);
3256		goto zfree_start;
3257	}
3258	cache->uc_freebucket = NULL;
3259	/* We are no longer associated with this CPU. */
3260	critical_exit();
3261
3262	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3263		domain = PCPU_GET(domain);
3264		if (VM_DOMAIN_EMPTY(domain))
3265			domain = UMA_ANYDOMAIN;
3266	} else
3267		domain = 0;
3268	zdom = &zone->uz_domain[0];
3269
3270	/* Can we throw this on the zone full list? */
3271	if (bucket != NULL) {
3272		CTR3(KTR_UMA,
3273		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3274		    zone->uz_name, zone, bucket);
3275		/* ub_cnt is pointing to the last free item */
3276		KASSERT(bucket->ub_cnt != 0,
3277		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
3278		if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
3279			ZONE_UNLOCK(zone);
3280			bucket_drain(zone, bucket);
3281			bucket_free(zone, bucket, udata);
3282			goto zfree_restart;
3283		} else
3284			zone_put_bucket(zone, zdom, bucket, true);
3285	}
3286
3287	/*
3288	 * We bump the uz count when the cache size is insufficient to
3289	 * handle the working set.
3290	 */
3291	if (lockfail && zone->uz_count < BUCKET_MAX)
3292		zone->uz_count++;
3293	ZONE_UNLOCK(zone);
3294
3295	bucket = bucket_alloc(zone, udata, M_NOWAIT);
3296	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3297	    zone->uz_name, zone, bucket);
3298	if (bucket) {
3299		critical_enter();
3300		cpu = curcpu;
3301		cache = &zone->uz_cpu[cpu];
3302		if (cache->uc_freebucket == NULL &&
3303		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
3304		    domain == PCPU_GET(domain))) {
3305			cache->uc_freebucket = bucket;
3306			goto zfree_start;
3307		}
3308		/*
3309		 * We lost the race, start over.  We have to drop our
3310		 * critical section to free the bucket.
3311		 */
3312		critical_exit();
3313		bucket_free(zone, bucket, udata);
3314		goto zfree_restart;
3315	}
3316
3317	/*
3318	 * If nothing else caught this, we'll just do an internal free.
3319	 */
3320zfree_item:
3321	zone_free_item(zone, item, udata, SKIP_DTOR);
3322
3323	return;
3324}
3325
3326void
3327uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3328{
3329
3330	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3331	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3332
3333	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3334	    zone->uz_name);
3335
3336	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3337	    ("uma_zfree_domain: called with spinlock or critical section held"));
3338
3339        /* uma_zfree(..., NULL) does nothing, to match free(9). */
3340        if (item == NULL)
3341                return;
3342	zone_free_item(zone, item, udata, SKIP_NONE);
3343}
3344
3345static void
3346slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
3347{
3348	uma_domain_t dom;
3349	uint8_t freei;
3350
3351	mtx_assert(&keg->uk_lock, MA_OWNED);
3352	MPASS(keg == slab->us_keg);
3353
3354	dom = &keg->uk_domain[slab->us_domain];
3355
3356	/* Do we need to remove from any lists? */
3357	if (slab->us_freecount+1 == keg->uk_ipers) {
3358		LIST_REMOVE(slab, us_link);
3359		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3360	} else if (slab->us_freecount == 0) {
3361		LIST_REMOVE(slab, us_link);
3362		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3363	}
3364
3365	/* Slab management. */
3366	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3367	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3368	slab->us_freecount++;
3369
3370	/* Keg statistics. */
3371	keg->uk_free++;
3372}
3373
3374static void
3375zone_release(uma_zone_t zone, void **bucket, int cnt)
3376{
3377	void *item;
3378	uma_slab_t slab;
3379	uma_keg_t keg;
3380	uint8_t *mem;
3381	int clearfull;
3382	int i;
3383
3384	clearfull = 0;
3385	keg = zone_first_keg(zone);
3386	KEG_LOCK(keg);
3387	for (i = 0; i < cnt; i++) {
3388		item = bucket[i];
3389		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3390			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3391			if (zone->uz_flags & UMA_ZONE_HASH) {
3392				slab = hash_sfind(&keg->uk_hash, mem);
3393			} else {
3394				mem += keg->uk_pgoff;
3395				slab = (uma_slab_t)mem;
3396			}
3397		} else {
3398			slab = vtoslab((vm_offset_t)item);
3399			if (slab->us_keg != keg) {
3400				KEG_UNLOCK(keg);
3401				keg = slab->us_keg;
3402				KEG_LOCK(keg);
3403			}
3404		}
3405		slab_free_item(keg, slab, item);
3406		if (keg->uk_flags & UMA_ZFLAG_FULL) {
3407			if (keg->uk_pages < keg->uk_maxpages) {
3408				keg->uk_flags &= ~UMA_ZFLAG_FULL;
3409				clearfull = 1;
3410			}
3411
3412			/*
3413			 * We can handle one more allocation. Since we're
3414			 * clearing ZFLAG_FULL, wake up all procs blocked
3415			 * on pages. This should be uncommon, so keeping this
3416			 * simple for now (rather than adding a count of blocked
3417			 * threads, etc.).
3418			 */
3419			wakeup(keg);
3420		}
3421	}
3422	KEG_UNLOCK(keg);
3423	if (clearfull) {
3424		ZONE_LOCK(zone);
3425		zone->uz_flags &= ~UMA_ZFLAG_FULL;
3426		wakeup(zone);
3427		ZONE_UNLOCK(zone);
3428	}
3430}
3431
3432/*
3433 * Frees a single item to any zone.
3434 *
3435 * Arguments:
3436 *	zone   The zone to free to
3437 *	item   The item we're freeing
3438 *	udata  User supplied data for the dtor
3439 *	skip   Skip dtors and finis
3440 */
3441static void
3442zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3443{
3444#ifdef INVARIANTS
3445	bool skipdbg;
3446
3447	skipdbg = uma_dbg_zskip(zone, item);
3448	if (skip == SKIP_NONE && !skipdbg) {
3449		if (zone->uz_flags & UMA_ZONE_MALLOC)
3450			uma_dbg_free(zone, udata, item);
3451		else
3452			uma_dbg_free(zone, NULL, item);
3453	}
3454
3455	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
3456	    (!skipdbg || zone->uz_dtor != trash_dtor ||
3457	    zone->uz_ctor != trash_ctor))
3458#else
3459	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
3460#endif
3461		zone->uz_dtor(item, zone->uz_size, udata);
3462
3463	if (skip < SKIP_FINI && zone->uz_fini)
3464		zone->uz_fini(item, zone->uz_size);
3465
3466	atomic_add_long(&zone->uz_frees, 1);
3467	zone->uz_release(zone->uz_arg, &item, 1);
3468}
3469
3470/* See uma.h */
3471int
3472uma_zone_set_max(uma_zone_t zone, int nitems)
3473{
3474	uma_keg_t keg;
3475
3476	keg = zone_first_keg(zone);
3477	if (keg == NULL)
3478		return (0);
3479	KEG_LOCK(keg);
3480	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
3481	if (keg->uk_maxpages * keg->uk_ipers < nitems)
3482		keg->uk_maxpages += keg->uk_ppera;
3483	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3484	KEG_UNLOCK(keg);
3485
3486	return (nitems);
3487}
3488
3489/* See uma.h */
3490int
3491uma_zone_get_max(uma_zone_t zone)
3492{
3493	int nitems;
3494	uma_keg_t keg;
3495
3496	keg = zone_first_keg(zone);
3497	if (keg == NULL)
3498		return (0);
3499	KEG_LOCK(keg);
3500	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3501	KEG_UNLOCK(keg);
3502
3503	return (nitems);
3504}
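
/*
 * Example (sketch): a subsystem that wants to bound its memory use can cap
 * a zone and read the limit back; the value returned by both functions
 * reflects the keg's rounding of the request up to whole slabs:
 *
 *	nitems = uma_zone_set_max(zone, desired);
 *	limit = uma_zone_get_max(zone);
 */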
3505
3506/* See uma.h */
3507void
3508uma_zone_set_warning(uma_zone_t zone, const char *warning)
3509{
3510
3511	ZONE_LOCK(zone);
3512	zone->uz_warning = warning;
3513	ZONE_UNLOCK(zone);
3514}
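
/*
 * Example (sketch): the warning is printed, rate-limited, when the zone
 * hits its limit, e.g.:
 *
 *	uma_zone_set_warning(zone, "kern.ipc.nmbclusters limit reached");
 */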
3515
3516/* See uma.h */
3517void
3518uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3519{
3520
3521	ZONE_LOCK(zone);
3522	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3523	ZONE_UNLOCK(zone);
3524}
3525
3526/* See uma.h */
3527int
3528uma_zone_get_cur(uma_zone_t zone)
3529{
3530	int64_t nitems;
3531	u_int i;
3532
3533	ZONE_LOCK(zone);
3534	nitems = zone->uz_allocs - zone->uz_frees;
3535	CPU_FOREACH(i) {
3536		/*
3537		 * See the comment in sysctl_vm_zone_stats() regarding the
3538		 * safety of accessing the per-cpu caches. With the zone lock
3539		 * held, it is safe, but can potentially result in stale data.
3540		 */
3541		nitems += zone->uz_cpu[i].uc_allocs -
3542		    zone->uz_cpu[i].uc_frees;
3543	}
3544	ZONE_UNLOCK(zone);
3545
3546	return (nitems < 0 ? 0 : nitems);
3547}
3548
3549/* See uma.h */
3550void
3551uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3552{
3553	uma_keg_t keg;
3554
3555	keg = zone_first_keg(zone);
3556	KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
3557	KEG_LOCK(keg);
3558	KASSERT(keg->uk_pages == 0,
3559	    ("uma_zone_set_init on non-empty keg"));
3560	keg->uk_init = uminit;
3561	KEG_UNLOCK(keg);
3562}
3563
3564/* See uma.h */
3565void
3566uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3567{
3568	uma_keg_t keg;
3569
3570	keg = zone_first_keg(zone);
3571	KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
3572	KEG_LOCK(keg);
3573	KASSERT(keg->uk_pages == 0,
3574	    ("uma_zone_set_fini on non-empty keg"));
3575	keg->uk_fini = fini;
3576	KEG_UNLOCK(keg);
3577}
3578
3579/* See uma.h */
3580void
3581uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3582{
3583
3584	ZONE_LOCK(zone);
3585	KASSERT(zone_first_keg(zone)->uk_pages == 0,
3586	    ("uma_zone_set_zinit on non-empty keg"));
3587	zone->uz_init = zinit;
3588	ZONE_UNLOCK(zone);
3589}
3590
3591/* See uma.h */
3592void
3593uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3594{
3595
3596	ZONE_LOCK(zone);
3597	KASSERT(zone_first_keg(zone)->uk_pages == 0,
3598	    ("uma_zone_set_zfini on non-empty keg"));
3599	zone->uz_fini = zfini;
3600	ZONE_UNLOCK(zone);
3601}
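
/*
 * Example (sketch; struct foo and its mutex are hypothetical): the init/fini
 * hooks installed by the four functions above must be set before the zone
 * allocates anything (the keg must still be empty) and are suited to setup
 * that persists across individual uma_zalloc()/uma_zfree() cycles:
 *
 *	static int
 *	foo_init(void *mem, int size, int flags)
 *	{
 *		struct foo *fp = mem;
 *
 *		mtx_init(&fp->foo_mtx, "foo", NULL, MTX_DEF);
 *		return (0);
 *	}
 *
 *	static void
 *	foo_fini(void *mem, int size)
 *	{
 *		struct foo *fp = mem;
 *
 *		mtx_destroy(&fp->foo_mtx);
 *	}
 *
 *	uma_zone_set_init(zone, foo_init);
 *	uma_zone_set_fini(zone, foo_fini);
 */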
3602
3603/* See uma.h */
3604/* XXX uk_freef is not actually used with the zone locked */
3605void
3606uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3607{
3608	uma_keg_t keg;
3609
3610	keg = zone_first_keg(zone);
3611	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3612	KEG_LOCK(keg);
3613	keg->uk_freef = freef;
3614	KEG_UNLOCK(keg);
3615}
3616
3617/* See uma.h */
3618/* XXX uk_allocf is not actually used with the zone locked */
3619void
3620uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3621{
3622	uma_keg_t keg;
3623
3624	keg = zone_first_keg(zone);
3625	KEG_LOCK(keg);
3626	keg->uk_allocf = allocf;
3627	KEG_UNLOCK(keg);
3628}
3629
3630/* See uma.h */
3631void
3632uma_zone_reserve(uma_zone_t zone, int items)
3633{
3634	uma_keg_t keg;
3635
3636	keg = zone_first_keg(zone);
3637	if (keg == NULL)
3638		return;
3639	KEG_LOCK(keg);
3640	keg->uk_reserve = items;
3641	KEG_UNLOCK(keg);
3644}
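
/*
 * Example (sketch): keep a small pool of items aside for callers that must
 * not fail, and dip into it by passing M_USE_RESERVE (honored by
 * keg_fetch_free_slab() above):
 *
 *	uma_zone_reserve(zone, 32);
 *	uma_prealloc(zone, 32);
 *	...
 *	item = uma_zalloc(zone, M_NOWAIT | M_USE_RESERVE);
 */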
3645
3646/* See uma.h */
3647int
3648uma_zone_reserve_kva(uma_zone_t zone, int count)
3649{
3650	uma_keg_t keg;
3651	vm_offset_t kva;
3652	u_int pages;
3653
3654	keg = zone_first_keg(zone);
3655	if (keg == NULL)
3656		return (0);
3657	pages = count / keg->uk_ipers;
3658
3659	if (pages * keg->uk_ipers < count)
3660		pages++;
3661	pages *= keg->uk_ppera;
3662
3663#ifdef UMA_MD_SMALL_ALLOC
3664	if (keg->uk_ppera > 1) {
3665#else
3666	if (1) {
3667#endif
3668		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3669		if (kva == 0)
3670			return (0);
3671	} else
3672		kva = 0;
3673	KEG_LOCK(keg);
3674	keg->uk_kva = kva;
3675	keg->uk_offset = 0;
3676	keg->uk_maxpages = pages;
3677#ifdef UMA_MD_SMALL_ALLOC
3678	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3679#else
3680	keg->uk_allocf = noobj_alloc;
3681#endif
3682	keg->uk_flags |= UMA_ZONE_NOFREE;
3683	KEG_UNLOCK(keg);
3684
3685	return (1);
3686}
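
/*
 * Example (sketch): reserve kernel virtual address space for up to 'count'
 * items up front; note that this also marks the keg UMA_ZONE_NOFREE, so
 * the pages are never given back.  The function returns 0 on failure:
 *
 *	if (uma_zone_reserve_kva(zone, count) == 0)
 *		return (ENOMEM);
 */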
3687
3688/* See uma.h */
3689void
3690uma_prealloc(uma_zone_t zone, int items)
3691{
3692	struct vm_domainset_iter di;
3693	uma_domain_t dom;
3694	uma_slab_t slab;
3695	uma_keg_t keg;
3696	int aflags, domain, slabs;
3697
3698	keg = zone_first_keg(zone);
3699	if (keg == NULL)
3700		return;
3701	KEG_LOCK(keg);
3702	slabs = items / keg->uk_ipers;
3703	if (slabs * keg->uk_ipers < items)
3704		slabs++;
3705	while (slabs-- > 0) {
3706		aflags = M_NOWAIT;
3707		vm_domainset_iter_policy_ref_init(&di, &keg->uk_dr, &domain,
3708		    &aflags);
3709		for (;;) {
3710			slab = keg_alloc_slab(keg, zone, domain, M_WAITOK,
3711			    aflags);
3712			if (slab != NULL) {
3713				MPASS(slab->us_keg == keg);
3714				dom = &keg->uk_domain[slab->us_domain];
3715				LIST_INSERT_HEAD(&dom->ud_free_slab, slab,
3716				    us_link);
3717				break;
3718			}
3719			KEG_LOCK(keg);
3720			if (vm_domainset_iter_policy(&di, &domain) != 0) {
3721				KEG_UNLOCK(keg);
3722				vm_wait_doms(&keg->uk_dr.dr_policy->ds_mask, 0);
3723				KEG_LOCK(keg);
3724			}
3725		}
3726	}
3727	KEG_UNLOCK(keg);
3728}
3729
3730/* See uma.h */
3731static void
3732uma_reclaim_locked(bool kmem_danger)
3733{
3734
3735	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3736	sx_assert(&uma_drain_lock, SA_XLOCKED);
3737	bucket_enable();
3738	zone_foreach(zone_drain);
3739	if (vm_page_count_min() || kmem_danger) {
3740		cache_drain_safe(NULL);
3741		zone_foreach(zone_drain);
3742	}
3743
3744	/*
3745	 * Some slabs may have been freed, but the slab zone was drained early
3746	 * in the pass above; drain it again so that pages freed by the later
3747	 * zone drains can be released.  We have to do the same for buckets.
3748	 */
3749	zone_drain(slabzone);
3750	bucket_zone_drain();
3751}
3752
3753void
3754uma_reclaim(void)
3755{
3756
3757	sx_xlock(&uma_drain_lock);
3758	uma_reclaim_locked(false);
3759	sx_xunlock(&uma_drain_lock);
3760}
3761
3762static volatile int uma_reclaim_needed;
3763
3764void
3765uma_reclaim_wakeup(void)
3766{
3767
3768	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3769		wakeup(uma_reclaim);
3770}
3771
3772void
3773uma_reclaim_worker(void *arg __unused)
3774{
3775
3776	for (;;) {
3777		sx_xlock(&uma_drain_lock);
3778		while (atomic_load_int(&uma_reclaim_needed) == 0)
3779			sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
3780			    hz);
3781		sx_xunlock(&uma_drain_lock);
3782		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3783		sx_xlock(&uma_drain_lock);
3784		uma_reclaim_locked(true);
3785		atomic_store_int(&uma_reclaim_needed, 0);
3786		sx_xunlock(&uma_drain_lock);
3787		/* Don't fire more than once per second. */
3788		pause("umarclslp", hz);
3789	}
3790}
3791
3792/* See uma.h */
3793int
3794uma_zone_exhausted(uma_zone_t zone)
3795{
3796	int full;
3797
3798	ZONE_LOCK(zone);
3799	full = (zone->uz_flags & UMA_ZFLAG_FULL);
3800	ZONE_UNLOCK(zone);
3801	return (full);
3802}
3803
3804int
3805uma_zone_exhausted_nolock(uma_zone_t zone)
3806{
3807	return (zone->uz_flags & UMA_ZFLAG_FULL);
3808}
3809
3810void *
3811uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3812{
3813	struct domainset *policy;
3814	vm_offset_t addr;
3815	uma_slab_t slab;
3816
3817	if (domain != UMA_ANYDOMAIN) {
3818		/* avoid allocs targeting empty domains */
3819		if (VM_DOMAIN_EMPTY(domain))
3820			domain = UMA_ANYDOMAIN;
3821	}
3822	slab = zone_alloc_item(slabzone, NULL, domain, wait);
3823	if (slab == NULL)
3824		return (NULL);
3825	policy = (domain == UMA_ANYDOMAIN) ? DOMAINSET_RR() :
3826	    DOMAINSET_FIXED(domain);
3827	addr = kmem_malloc_domainset(policy, size, wait);
3828	if (addr != 0) {
3829		vsetslab(addr, slab);
3830		slab->us_data = (void *)addr;
3831		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3832		slab->us_size = size;
3833		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3834		    pmap_kextract(addr)));
3835		uma_total_inc(size);
3836	} else {
3837		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3838	}
3839
3840	return ((void *)addr);
3841}
3842
3843void *
3844uma_large_malloc(vm_size_t size, int wait)
3845{
3846
3847	return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
3848}
3849
3850void
3851uma_large_free(uma_slab_t slab)
3852{
3853
3854	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
3855	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
3856	kmem_free((vm_offset_t)slab->us_data, slab->us_size);
3857	uma_total_dec(slab->us_size);
3858	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3859}
3860
3861static void
3862uma_zero_item(void *item, uma_zone_t zone)
3863{
3864
3865	bzero(item, zone->uz_size);
3866}
3867
3868unsigned long
3869uma_limit(void)
3870{
3871
3872	return (uma_kmem_limit);
3873}
3874
3875void
3876uma_set_limit(unsigned long limit)
3877{
3878
3879	uma_kmem_limit = limit;
3880}
3881
3882unsigned long
3883uma_size(void)
3884{
3885
3886	return (atomic_load_long(&uma_kmem_total));
3887}
3888
3889long
3890uma_avail(void)
3891{
3892
3893	return (uma_kmem_limit - uma_size());
3894}
3895
3896void
3897uma_print_stats(void)
3898{
3899	zone_foreach(uma_print_zone);
3900}
3901
3902static void
3903slab_print(uma_slab_t slab)
3904{
3905	printf("slab: keg %p, data %p, freecount %d\n",
3906		slab->us_keg, slab->us_data, slab->us_freecount);
3907}
3908
3909static void
3910cache_print(uma_cache_t cache)
3911{
3912	printf("alloc: %p(%d), free: %p(%d)\n",
3913		cache->uc_allocbucket,
3914		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3915		cache->uc_freebucket,
3916		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3917}
3918
3919static void
3920uma_print_keg(uma_keg_t keg)
3921{
3922	uma_domain_t dom;
3923	uma_slab_t slab;
3924	int i;
3925
3926	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3927	    "out %d free %d limit %d\n",
3928	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3929	    keg->uk_ipers, keg->uk_ppera,
3930	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
3931	    keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3932	for (i = 0; i < vm_ndomains; i++) {
3933		dom = &keg->uk_domain[i];
3934		printf("Part slabs:\n");
3935		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
3936			slab_print(slab);
3937		printf("Free slabs:\n");
3938		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
3939			slab_print(slab);
3940		printf("Full slabs:\n");
3941		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
3942			slab_print(slab);
3943	}
3944}
3945
3946void
3947uma_print_zone(uma_zone_t zone)
3948{
3949	uma_cache_t cache;
3950	uma_klink_t kl;
3951	int i;
3952
3953	printf("zone: %s(%p) size %d flags %#x\n",
3954	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3955	LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3956		uma_print_keg(kl->kl_keg);
3957	CPU_FOREACH(i) {
3958		cache = &zone->uz_cpu[i];
3959		printf("CPU %d Cache:\n", i);
3960		cache_print(cache);
3961	}
3962}
3963
3964#ifdef DDB
3965/*
3966 * Generate statistics across both the zone and its per-CPU caches.  Return
3967 * the desired statistic if the corresponding pointer is non-NULL.
3968 *
3969 * Note: does not update the zone statistics, as it can't safely clear the
3970 * per-CPU cache statistic.
3971 *
3972 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3973 * safe from off-CPU; we should modify the caches to track this information
3974 * directly so that we don't have to.
3975 */
3976static void
3977uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
3978    uint64_t *freesp, uint64_t *sleepsp)
3979{
3980	uma_cache_t cache;
3981	uint64_t allocs, frees, sleeps;
3982	int cachefree, cpu;
3983
3984	allocs = frees = sleeps = 0;
3985	cachefree = 0;
3986	CPU_FOREACH(cpu) {
3987		cache = &z->uz_cpu[cpu];
3988		if (cache->uc_allocbucket != NULL)
3989			cachefree += cache->uc_allocbucket->ub_cnt;
3990		if (cache->uc_freebucket != NULL)
3991			cachefree += cache->uc_freebucket->ub_cnt;
3992		allocs += cache->uc_allocs;
3993		frees += cache->uc_frees;
3994	}
3995	allocs += z->uz_allocs;
3996	frees += z->uz_frees;
3997	sleeps += z->uz_sleeps;
3998	if (cachefreep != NULL)
3999		*cachefreep = cachefree;
4000	if (allocsp != NULL)
4001		*allocsp = allocs;
4002	if (freesp != NULL)
4003		*freesp = frees;
4004	if (sleepsp != NULL)
4005		*sleepsp = sleeps;
4006}
4007#endif /* DDB */
4008
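/*
 * Sysctl handler reporting the total number of zones attached to all kegs.
 */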
4009static int
4010sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
4011{
4012	uma_keg_t kz;
4013	uma_zone_t z;
4014	int count;
4015
4016	count = 0;
4017	rw_rlock(&uma_rwlock);
4018	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4019		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4020			count++;
4021	}
4022	rw_runlock(&uma_rwlock);
4023	return (sysctl_handle_int(oidp, &count, 0, req));
4024}
4025
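/*
 * Sysctl handler exporting zone statistics as a binary stream: a stream
 * header, then, for each zone, a type header followed by mp_maxid + 1
 * per-CPU records.
 */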
4026static int
4027sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
4028{
4029	struct uma_stream_header ush;
4030	struct uma_type_header uth;
4031	struct uma_percpu_stat *ups;
4032	uma_zone_domain_t zdom;
4033	struct sbuf sbuf;
4034	uma_cache_t cache;
4035	uma_klink_t kl;
4036	uma_keg_t kz;
4037	uma_zone_t z;
4038	uma_keg_t k;
4039	int count, error, i;
4040
4041	error = sysctl_wire_old_buffer(req, 0);
4042	if (error != 0)
4043		return (error);
4044	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
4045	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
4046	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
4047
4048	count = 0;
4049	rw_rlock(&uma_rwlock);
4050	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4051		LIST_FOREACH(z, &kz->uk_zones, uz_link)
4052			count++;
4053	}
4054
4055	/*
4056	 * Insert stream header.
4057	 */
4058	bzero(&ush, sizeof(ush));
4059	ush.ush_version = UMA_STREAM_VERSION;
4060	ush.ush_maxcpus = (mp_maxid + 1);
4061	ush.ush_count = count;
4062	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
4063
4064	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4065		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4066			bzero(&uth, sizeof(uth));
4067			ZONE_LOCK(z);
4068			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
4069			uth.uth_align = kz->uk_align;
4070			uth.uth_size = kz->uk_size;
4071			uth.uth_rsize = kz->uk_rsize;
4072			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
4073				k = kl->kl_keg;
4074				uth.uth_maxpages += k->uk_maxpages;
4075				uth.uth_pages += k->uk_pages;
4076				uth.uth_keg_free += k->uk_free;
4077				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
4078				    * k->uk_ipers;
4079			}
4080
4081			/*
4082			 * A zone is secondary if it is not the first entry
4083			 * on the keg's zone list.
4084			 */
4085			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
4086			    (LIST_FIRST(&kz->uk_zones) != z))
4087				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
4088
4089			for (i = 0; i < vm_ndomains; i++) {
4090				zdom = &z->uz_domain[i];
4091				uth.uth_zone_free += zdom->uzd_nitems;
4092			}
4093			uth.uth_allocs = z->uz_allocs;
4094			uth.uth_frees = z->uz_frees;
4095			uth.uth_fails = z->uz_fails;
4096			uth.uth_sleeps = z->uz_sleeps;
4097			/*
4098			 * While it is not normally safe to access the cache
4099			 * bucket pointers while not on the CPU that owns the
4100			 * cache, we only allow the pointers to be exchanged
4101			 * without the zone lock held, not invalidated, so
4102			 * accept the possible race associated with bucket
4103			 * exchange during monitoring.
4104			 */
4105			for (i = 0; i < mp_maxid + 1; i++) {
4106				bzero(&ups[i], sizeof(*ups));
4107				if (kz->uk_flags & UMA_ZFLAG_INTERNAL ||
4108				    CPU_ABSENT(i))
4109					continue;
4110				cache = &z->uz_cpu[i];
4111				if (cache->uc_allocbucket != NULL)
4112					ups[i].ups_cache_free +=
4113					    cache->uc_allocbucket->ub_cnt;
4114				if (cache->uc_freebucket != NULL)
4115					ups[i].ups_cache_free +=
4116					    cache->uc_freebucket->ub_cnt;
4117				ups[i].ups_allocs = cache->uc_allocs;
4118				ups[i].ups_frees = cache->uc_frees;
4119			}
4120			ZONE_UNLOCK(z);
4121			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4122			for (i = 0; i < mp_maxid + 1; i++)
4123				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4124		}
4125	}
4126	rw_runlock(&uma_rwlock);
4127	error = sbuf_finish(&sbuf);
4128	sbuf_delete(&sbuf);
4129	free(ups, M_TEMP);
4130	return (error);
4131}
4132
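/*
 * Sysctl handler exposing a zone's item limit; reads return the current
 * maximum and writes update it via uma_zone_set_max().
 *
 * A minimal usage sketch, assuming a hypothetical "example_zone" variable
 * of type uma_zone_t that outlives the OID:
 *
 *	SYSCTL_PROC(_vm, OID_AUTO, example_zone_max,
 *	    CTLTYPE_INT | CTLFLAG_RW, &example_zone, 0,
 *	    sysctl_handle_uma_zone_max, "I", "Maximum example_zone items");
 */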
4133int
4134sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4135{
4136	uma_zone_t zone = *(uma_zone_t *)arg1;
4137	int error, max;
4138
4139	max = uma_zone_get_max(zone);
4140	error = sysctl_handle_int(oidp, &max, 0, req);
4141	if (error || !req->newptr)
4142		return (error);
4143
4144	uma_zone_set_max(zone, max);
4145
4146	return (0);
4147}
4148
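/*
 * Read-only sysctl handler reporting the number of items currently
 * allocated from a zone.
 */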
4149int
4150sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4151{
4152	uma_zone_t zone = *(uma_zone_t *)arg1;
4153	int cur;
4154
4155	cur = uma_zone_get_cur(zone);
4156	return (sysctl_handle_int(oidp, &cur, 0, req));
4157}
4158
4159#ifdef INVARIANTS
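/*
 * Find the slab backing an arbitrary item, either through the vtoslab()
 * fast path or by consulting the keg's hash table or page offset.  Used
 * only by the INVARIANTS consistency checks below.
 */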
4160static uma_slab_t
4161uma_dbg_getslab(uma_zone_t zone, void *item)
4162{
4163	uma_slab_t slab;
4164	uma_keg_t keg;
4165	uint8_t *mem;
4166
4167	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4168	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4169		slab = vtoslab((vm_offset_t)mem);
4170	} else {
4171		/*
4172		 * It is safe to return the slab here even though the
4173		 * zone is unlocked because the item's allocation state
4174		 * essentially holds a reference.
4175		 */
4176		ZONE_LOCK(zone);
4177		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
4178		if (keg->uk_flags & UMA_ZONE_HASH)
4179			slab = hash_sfind(&keg->uk_hash, mem);
4180		else
4181			slab = (uma_slab_t)(mem + keg->uk_pgoff);
4182		ZONE_UNLOCK(zone);
4183	}
4184
4185	return (slab);
4186}
4187
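/*
 * Decide whether the expensive INVARIANTS checks should be skipped for a
 * given item.  uma_dbg_kskip() audits roughly one item in every
 * dbg_divisor (0 disables checking entirely, 1 checks every item);
 * uma_dbg_zskip() additionally skips zones that have no keg.
 */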
4188static bool
4189uma_dbg_zskip(uma_zone_t zone, void *mem)
4190{
4191	uma_keg_t keg;
4192
4193	if ((keg = zone_first_keg(zone)) == NULL)
4194		return (true);
4195
4196	return (uma_dbg_kskip(keg, mem));
4197}
4198
4199static bool
4200uma_dbg_kskip(uma_keg_t keg, void *mem)
4201{
4202	uintptr_t idx;
4203
4204	if (dbg_divisor == 0)
4205		return (true);
4206
4207	if (dbg_divisor == 1)
4208		return (false);
4209
4210	idx = (uintptr_t)mem >> PAGE_SHIFT;
4211	if (keg->uk_ipers > 1) {
4212		idx *= keg->uk_ipers;
4213		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4214	}
4215
4216	if ((idx / dbg_divisor) * dbg_divisor != idx) {
4217		counter_u64_add(uma_skip_cnt, 1);
4218		return (true);
4219	}
4220	counter_u64_add(uma_dbg_cnt, 1);
4221
4222	return (false);
4223}
4224
4225/*
4226 * Set up the slab's freei data such that uma_dbg_free can function.
4227 */
4229static void
4230uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4231{
4232	uma_keg_t keg;
4233	int freei;
4234
4235	if (slab == NULL) {
4236		slab = uma_dbg_getslab(zone, item);
4237		if (slab == NULL)
4238			panic("uma: item %p did not belong to zone %s\n",
4239			    item, zone->uz_name);
4240	}
4241	keg = slab->us_keg;
4242	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4243
4244	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4245		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4246		    item, zone, zone->uz_name, slab, freei);
4247	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4248
4249	return;
4250}
4251
4252/*
4253 * Verifies freed addresses.  Checks for alignment, valid slab membership
4254 * and duplicate frees.
4255 */
4257static void
4258uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4259{
4260	uma_keg_t keg;
4261	int freei;
4262
4263	if (slab == NULL) {
4264		slab = uma_dbg_getslab(zone, item);
4265		if (slab == NULL)
4266			panic("uma: Freed item %p did not belong to zone %s\n",
4267			    item, zone->uz_name);
4268	}
4269	keg = slab->us_keg;
4270	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4271
4272	if (freei >= keg->uk_ipers)
4273		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4274		    item, zone, zone->uz_name, slab, freei);
4275
4276	if (((freei * keg->uk_rsize) + slab->us_data) != item)
4277		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4278		    item, zone, zone->uz_name, slab, freei);
4279
4280	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4281		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4282		    item, zone, zone->uz_name, slab, freei);
4283
4284	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4285}
4286#endif /* INVARIANTS */
4287
4288#ifdef DDB
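/*
 * "show uma" DDB command: print a one-line summary of every zone, folding
 * in per-CPU cache contents via uma_zone_sumstat() where that is safe.
 */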
4289DB_SHOW_COMMAND(uma, db_show_uma)
4290{
4291	uma_keg_t kz;
4292	uma_zone_t z;
4293	uint64_t allocs, frees, sleeps;
4294	long cachefree;
4295	int i;
4296
4297	db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
4298	    "Free", "Requests", "Sleeps", "Bucket");
4299	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4300		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4301			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4302				allocs = z->uz_allocs;
4303				frees = z->uz_frees;
4304				sleeps = z->uz_sleeps;
4305				cachefree = 0;
4306			} else
4307				uma_zone_sumstat(z, &cachefree, &allocs,
4308				    &frees, &sleeps);
4309			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4310			    (LIST_FIRST(&kz->uk_zones) != z)))
4311				cachefree += kz->uk_free;
4312			for (i = 0; i < vm_ndomains; i++)
4313				cachefree += z->uz_domain[i].uzd_nitems;
4314
4315			db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u\n",
4316			    z->uz_name, (uintmax_t)kz->uk_size,
4317			    (intmax_t)(allocs - frees), cachefree,
4318			    (uintmax_t)allocs, sleeps, z->uz_count);
4319			if (db_pager_quit)
4320				return;
4321		}
4322	}
4323}
4324
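/*
 * "show umacache" DDB command: like "show uma", but limited to the
 * cache-only zones on the uma_cachezones list.
 */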
4325DB_SHOW_COMMAND(umacache, db_show_umacache)
4326{
4327	uma_zone_t z;
4328	uint64_t allocs, frees;
4329	long cachefree;
4330	int i;
4331
4332	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4333	    "Requests", "Bucket");
4334	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4335		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
4336		for (i = 0; i < vm_ndomains; i++)
4337			cachefree += z->uz_domain[i].uzd_nitems;
4338		db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",
4339		    z->uz_name, (uintmax_t)z->uz_size,
4340		    (intmax_t)(allocs - frees), cachefree,
4341		    (uintmax_t)allocs, z->uz_count);
4342		if (db_pager_quit)
4343			return;
4344	}
4345}
4346#endif	/* DDB */
4347