/*-
 * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
 * Copyright (c) 2013 EMC Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * From:
 *	$NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $
 *	$NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $
 */

/*
 * reference:
 * -	Magazines and Vmem: Extending the Slab Allocator
 *	to Many CPUs and Arbitrary Resources
 *	http://www.usenix.org/event/usenix01/bonwick.html
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/subr_vmem.c 280805 2015-03-29 10:02:29Z mav $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/callout.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/vmem.h>

#include "opt_vm.h"

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_pageout.h>

#define	VMEM_OPTORDER		5
#define	VMEM_OPTVALUE	(1 << VMEM_OPTORDER)
#define	VMEM_MAXORDER						\
    (VMEM_OPTVALUE - 1 + sizeof(vmem_size_t) * NBBY - VMEM_OPTORDER)
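/*
 * For illustration: with VMEM_OPTORDER == 5 (as above) and a 64-bit
 * vmem_size_t, this works out to 31 + 64 - 5 == 90 freelists; with a
 * 32-bit vmem_size_t it would be 58.
 */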

#define	VMEM_HASHSIZE_MIN	16
#define	VMEM_HASHSIZE_MAX	131072

#define	VMEM_QCACHE_IDX_MAX	16

#define	VMEM_FITMASK	(M_BESTFIT | M_FIRSTFIT)

#define	VMEM_FLAGS						\
    (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT)

#define	BT_FLAGS	(M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM)

#define	QC_NAME_MAX	16

/*
 * Data structures private to vmem.
 */
MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures");

typedef struct vmem_btag bt_t;

TAILQ_HEAD(vmem_seglist, vmem_btag);
LIST_HEAD(vmem_freelist, vmem_btag);
LIST_HEAD(vmem_hashlist, vmem_btag);

struct qcache {
	uma_zone_t	qc_cache;
	vmem_t 		*qc_vmem;
	vmem_size_t	qc_size;
	char		qc_name[QC_NAME_MAX];
};
typedef struct qcache qcache_t;
#define	QC_POOL_TO_QCACHE(pool)	((qcache_t *)(pool->pr_qcache))

#define	VMEM_NAME_MAX	16

/* vmem arena */
struct vmem {
	struct mtx_padalign	vm_lock;
	struct cv		vm_cv;
	char			vm_name[VMEM_NAME_MAX+1];
	LIST_ENTRY(vmem)	vm_alllist;
	struct vmem_hashlist	vm_hash0[VMEM_HASHSIZE_MIN];
	struct vmem_freelist	vm_freelist[VMEM_MAXORDER];
	struct vmem_seglist	vm_seglist;
	struct vmem_hashlist	*vm_hashlist;
	vmem_size_t		vm_hashsize;

	/* Constant after init */
	vmem_size_t		vm_qcache_max;
	vmem_size_t		vm_quantum_mask;
	vmem_size_t		vm_import_quantum;
	int			vm_quantum_shift;

	/* Written on alloc/free */
	LIST_HEAD(, vmem_btag)	vm_freetags;
	int			vm_nfreetags;
	int			vm_nbusytag;
	vmem_size_t		vm_inuse;
	vmem_size_t		vm_size;

	/* Used on import. */
	vmem_import_t		*vm_importfn;
	vmem_release_t		*vm_releasefn;
	void			*vm_arg;

	/* Space exhaustion callback. */
	vmem_reclaim_t		*vm_reclaimfn;

	/* quantum cache */
	qcache_t		vm_qcache[VMEM_QCACHE_IDX_MAX];
};

/* boundary tag */
struct vmem_btag {
	TAILQ_ENTRY(vmem_btag) bt_seglist;
	union {
		LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */
		LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */
	} bt_u;
#define	bt_hashlist	bt_u.u_hashlist
#define	bt_freelist	bt_u.u_freelist
	vmem_addr_t	bt_start;
	vmem_size_t	bt_size;
	int		bt_type;
};

#define	BT_TYPE_SPAN		1	/* Allocated from importfn */
#define	BT_TYPE_SPAN_STATIC	2	/* vmem_add() or create. */
#define	BT_TYPE_FREE		3	/* Available space. */
#define	BT_TYPE_BUSY		4	/* Used space. */
#define	BT_ISSPAN_P(bt)	((bt)->bt_type <= BT_TYPE_SPAN_STATIC)

#define	BT_END(bt)	((bt)->bt_start + (bt)->bt_size - 1)

#if defined(DIAGNOSTIC)
static int enable_vmem_check = 1;
SYSCTL_INT(_debug, OID_AUTO, vmem_check, CTLFLAG_RWTUN,
    &enable_vmem_check, 0, "Enable vmem check");
static void vmem_check(vmem_t *);
#endif

static struct callout	vmem_periodic_ch;
static int		vmem_periodic_interval;
static struct task	vmem_periodic_wk;

static struct mtx_padalign vmem_list_lock;
static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);

/* ---- misc */
#define	VMEM_CONDVAR_INIT(vm, wchan)	cv_init(&vm->vm_cv, wchan)
#define	VMEM_CONDVAR_DESTROY(vm)	cv_destroy(&vm->vm_cv)
#define	VMEM_CONDVAR_WAIT(vm)		cv_wait(&vm->vm_cv, &vm->vm_lock)
#define	VMEM_CONDVAR_BROADCAST(vm)	cv_broadcast(&vm->vm_cv)


#define	VMEM_LOCK(vm)		mtx_lock(&vm->vm_lock)
#define	VMEM_TRYLOCK(vm)	mtx_trylock(&vm->vm_lock)
#define	VMEM_UNLOCK(vm)		mtx_unlock(&vm->vm_lock)
#define	VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF)
#define	VMEM_LOCK_DESTROY(vm)	mtx_destroy(&vm->vm_lock)
#define	VMEM_ASSERT_LOCKED(vm)	mtx_assert(&vm->vm_lock, MA_OWNED);

#define	VMEM_ALIGNUP(addr, align)	(-(-(addr) & -(align)))

#define	VMEM_CROSS_P(addr1, addr2, boundary) \
	((((addr1) ^ (addr2)) & -(boundary)) != 0)
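/*
 * Worked examples of the two macros above, with made-up values:
 * VMEM_ALIGNUP(0x1234, 0x1000) yields 0x2000, and
 * VMEM_CROSS_P(0x0fff, 0x1000, 0x1000) is true because the range
 * [0x0fff, 0x1000] straddles a 0x1000 boundary.
 */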

#define	ORDER2SIZE(order)	((order) < VMEM_OPTVALUE ? ((order) + 1) : \
    (vmem_size_t)1 << ((order) - (VMEM_OPTVALUE - VMEM_OPTORDER - 1)))
#define	SIZE2ORDER(size)	((size) <= VMEM_OPTVALUE ? ((size) - 1) : \
    (flsl(size) + (VMEM_OPTVALUE - VMEM_OPTORDER - 2)))
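/*
 * Worked example, assuming VMEM_OPTORDER == 5 as defined above: sizes of
 * 1..32 quanta map one-to-one onto orders 0..31 (SIZE2ORDER(8) == 7),
 * while larger sizes fall into power-of-two buckets, e.g.
 * SIZE2ORDER(40) == flsl(40) + 25 == 31 and ORDER2SIZE(32) == 64.
 */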

/*
 * Maximum number of boundary tags that may be required to satisfy an
 * allocation.  Two may be required to import.  Another two may be
 * required to clip edges.
 */
#define	BT_MAXALLOC	4

/*
 * Max free limits the number of locally cached boundary tags.  We
 * just want to avoid hitting the zone allocator for every call.
 */
#define BT_MAXFREE	(BT_MAXALLOC * 8)

/* Allocator for boundary tags. */
static uma_zone_t vmem_bt_zone;

/* boot time arena storage. */
static struct vmem kernel_arena_storage;
static struct vmem kmem_arena_storage;
static struct vmem buffer_arena_storage;
static struct vmem transient_arena_storage;
vmem_t *kernel_arena = &kernel_arena_storage;
vmem_t *kmem_arena = &kmem_arena_storage;
vmem_t *buffer_arena = &buffer_arena_storage;
vmem_t *transient_arena = &transient_arena_storage;

#ifdef DEBUG_MEMGUARD
static struct vmem memguard_arena_storage;
vmem_t *memguard_arena = &memguard_arena_storage;
#endif

/*
 * Fill the vmem's boundary tag cache.  We guarantee that boundary tag
 * allocation will not fail once bt_fill() passes.  To do so we cache
 * at least the maximum possible tag allocations in the arena.
 */
static int
bt_fill(vmem_t *vm, int flags)
{
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);

	/*
	 * Only allow the kmem arena to dip into reserve tags.  It is the
	 * vmem where new tags come from.
	 */
	flags &= BT_FLAGS;
	if (vm != kmem_arena)
		flags &= ~M_USE_RESERVE;

	/*
	 * Loop until we meet the reserve.  To minimize the lock shuffle
	 * and prevent simultaneous fills we first try a NOWAIT regardless
	 * of the caller's flags.  Specify M_NOVM so we don't recurse while
	 * holding a vmem lock.
	 */
	while (vm->vm_nfreetags < BT_MAXALLOC) {
		bt = uma_zalloc(vmem_bt_zone,
		    (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM);
		if (bt == NULL) {
			VMEM_UNLOCK(vm);
			bt = uma_zalloc(vmem_bt_zone, flags);
			VMEM_LOCK(vm);
			if (bt == NULL && (flags & M_NOWAIT) != 0)
				break;
		}
		LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
		vm->vm_nfreetags++;
	}

	if (vm->vm_nfreetags < BT_MAXALLOC)
		return ENOMEM;

	return 0;
}

/*
 * Pop a tag off of the freetag stack.
 */
static bt_t *
bt_alloc(vmem_t *vm)
{
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);
	bt = LIST_FIRST(&vm->vm_freetags);
	MPASS(bt != NULL);
	LIST_REMOVE(bt, bt_freelist);
	vm->vm_nfreetags--;

	return bt;
}

/*
 * Trim the per-vmem free list.  Returns with the lock released to
 * avoid allocator recursions.
 */
static void
bt_freetrim(vmem_t *vm, int freelimit)
{
	LIST_HEAD(, vmem_btag) freetags;
	bt_t *bt;

	LIST_INIT(&freetags);
	VMEM_ASSERT_LOCKED(vm);
	while (vm->vm_nfreetags > freelimit) {
		bt = LIST_FIRST(&vm->vm_freetags);
		LIST_REMOVE(bt, bt_freelist);
		vm->vm_nfreetags--;
		LIST_INSERT_HEAD(&freetags, bt, bt_freelist);
	}
	VMEM_UNLOCK(vm);
	while ((bt = LIST_FIRST(&freetags)) != NULL) {
		LIST_REMOVE(bt, bt_freelist);
		uma_zfree(vmem_bt_zone, bt);
	}
}

static inline void
bt_free(vmem_t *vm, bt_t *bt)
{

	VMEM_ASSERT_LOCKED(vm);
	MPASS(LIST_FIRST(&vm->vm_freetags) != bt);
	LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
	vm->vm_nfreetags++;
}

/*
 * freelist[0] ... [1, 1]
 * freelist[1] ... [2, 2]
 *  :
 * freelist[29] ... [30, 30]
 * freelist[30] ... [31, 31]
 * freelist[31] ... [32, 63]
 * freelist[32] ... [64, 127]
 *  :
 * freelist[n] ... [(1 << (n - 26)), (1 << (n - 25)) - 1]
 *  :
 */

static struct vmem_freelist *
bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
{
	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
	const int idx = SIZE2ORDER(qsize);

	MPASS(size != 0 && qsize != 0);
	MPASS((size & vm->vm_quantum_mask) == 0);
	MPASS(idx >= 0);
	MPASS(idx < VMEM_MAXORDER);

	return &vm->vm_freelist[idx];
}

/*
 * bt_freehead_toalloc: return the freelist for the given size and allocation
 * strategy.
 *
 * For M_FIRSTFIT, return the list in which any block is guaranteed to be
 * large enough for the requested size.  Otherwise, return the list which
 * may have blocks large enough for the requested size.
 */
static struct vmem_freelist *
bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat)
{
	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
	int idx = SIZE2ORDER(qsize);

	MPASS(size != 0 && qsize != 0);
	MPASS((size & vm->vm_quantum_mask) == 0);

	if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) {
		idx++;
		/* check too large request? */
	}
	MPASS(idx >= 0);
	MPASS(idx < VMEM_MAXORDER);

	return &vm->vm_freelist[idx];
}

/* ---- boundary tag hash */

static struct vmem_hashlist *
bt_hashhead(vmem_t *vm, vmem_addr_t addr)
{
	struct vmem_hashlist *list;
	unsigned int hash;

	hash = hash32_buf(&addr, sizeof(addr), 0);
	list = &vm->vm_hashlist[hash % vm->vm_hashsize];

	return list;
}

static bt_t *
bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
{
	struct vmem_hashlist *list;
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);
	list = bt_hashhead(vm, addr);
	LIST_FOREACH(bt, list, bt_hashlist) {
		if (bt->bt_start == addr) {
			break;
		}
	}

	return bt;
}

static void
bt_rembusy(vmem_t *vm, bt_t *bt)
{

	VMEM_ASSERT_LOCKED(vm);
	MPASS(vm->vm_nbusytag > 0);
	vm->vm_inuse -= bt->bt_size;
	vm->vm_nbusytag--;
	LIST_REMOVE(bt, bt_hashlist);
}

static void
bt_insbusy(vmem_t *vm, bt_t *bt)
{
	struct vmem_hashlist *list;

	VMEM_ASSERT_LOCKED(vm);
	MPASS(bt->bt_type == BT_TYPE_BUSY);

	list = bt_hashhead(vm, bt->bt_start);
	LIST_INSERT_HEAD(list, bt, bt_hashlist);
	vm->vm_nbusytag++;
	vm->vm_inuse += bt->bt_size;
}

/* ---- boundary tag list */

static void
bt_remseg(vmem_t *vm, bt_t *bt)
{

	TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
	bt_free(vm, bt);
}

static void
bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
{

	TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
}

static void
bt_insseg_tail(vmem_t *vm, bt_t *bt)
{

	TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
}

static void
bt_remfree(vmem_t *vm, bt_t *bt)
{

	MPASS(bt->bt_type == BT_TYPE_FREE);

	LIST_REMOVE(bt, bt_freelist);
}

static void
bt_insfree(vmem_t *vm, bt_t *bt)
{
	struct vmem_freelist *list;

	list = bt_freehead_tofree(vm, bt->bt_size);
	LIST_INSERT_HEAD(list, bt, bt_freelist);
}

/* ---- vmem internal functions */

/*
 * Import from the arena into the quantum cache in UMA.
 */
static int
qc_import(void *arg, void **store, int cnt, int flags)
{
	qcache_t *qc;
	vmem_addr_t addr;
	int i;

	qc = arg;
	if ((flags & VMEM_FITMASK) == 0)
		flags |= M_BESTFIT;
	for (i = 0; i < cnt; i++) {
		if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0,
		    VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0)
			break;
		store[i] = (void *)addr;
		/* Only guarantee one allocation. */
		flags &= ~M_WAITOK;
		flags |= M_NOWAIT;
	}
	return i;
}

/*
 * Release memory from the UMA cache to the arena.
 */
static void
qc_release(void *arg, void **store, int cnt)
{
	qcache_t *qc;
	int i;

	qc = arg;
	for (i = 0; i < cnt; i++)
		vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size);
}

static void
qc_init(vmem_t *vm, vmem_size_t qcache_max)
{
	qcache_t *qc;
	vmem_size_t size;
	int qcache_idx_max;
	int i;

	MPASS((qcache_max & vm->vm_quantum_mask) == 0);
	qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift,
	    VMEM_QCACHE_IDX_MAX);
	vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++) {
		qc = &vm->vm_qcache[i];
		size = (i + 1) << vm->vm_quantum_shift;
		snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
		    vm->vm_name, size);
		qc->qc_vmem = vm;
		qc->qc_size = size;
		qc->qc_cache = uma_zcache_create(qc->qc_name, size,
		    NULL, NULL, NULL, NULL, qc_import, qc_release, qc,
		    UMA_ZONE_VM);
		MPASS(qc->qc_cache);
	}
}
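/*
 * Example with illustrative numbers: an arena created with a quantum of
 * 4KB and a qcache_max of 64KB gets 16 UMA cache zones, named
 * "<arena>-4096" through "<arena>-65536", and allocations of up to 64KB
 * are served from the matching zone rather than from the arena's
 * freelists.
 */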

static void
qc_destroy(vmem_t *vm)
{
	int qcache_idx_max;
	int i;

	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++)
		uma_zdestroy(vm->vm_qcache[i].qc_cache);
}

static void
qc_drain(vmem_t *vm)
{
	int qcache_idx_max;
	int i;

	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++)
		zone_drain(vm->vm_qcache[i].qc_cache);
}

#ifndef UMA_MD_SMALL_ALLOC

static struct mtx_padalign vmem_bt_lock;

/*
 * vmem_bt_alloc:  Allocate a new page of boundary tags.
 *
 * On architectures with uma_small_alloc there is no recursion; no address
 * space need be allocated to allocate boundary tags.  For the others, we
 * must handle recursion.  Boundary tags are necessary to allocate new
 * boundary tags.
 *
 * UMA guarantees that enough tags are held in reserve to allocate a new
 * page of kva.  We dip into this reserve by specifying M_USE_RESERVE only
 * when allocating the page to hold new boundary tags.  In this way the
 * reserve is automatically filled by the allocation that uses the reserve.
 *
 * We still have to guarantee that the new tags are allocated atomically since
 * many threads may try concurrently.  The bt_lock provides this guarantee.
 * We convert WAITOK allocations to NOWAIT and then handle the blocking here
 * on failure.  It's ok to return NULL for a WAITOK allocation as UMA will
 * loop again after checking to see if we lost the race to allocate.
 *
 * There is a small race between vmem_bt_alloc() returning the page and the
 * zone lock being acquired to add the page to the zone.  For WAITOK
 * allocations we just pause briefly.  NOWAIT may experience a transient
 * failure.  To alleviate this we permit a small number of simultaneous
 * fills to proceed concurrently so NOWAIT is less likely to fail unless
 * we are really out of KVA.
 */
static void *
vmem_bt_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
{
	vmem_addr_t addr;

	*pflag = UMA_SLAB_KMEM;

	/*
	 * Single thread boundary tag allocation so that the address space
	 * and memory are added in one atomic operation.
	 */
	mtx_lock(&vmem_bt_lock);
	if (vmem_xalloc(kmem_arena, bytes, 0, 0, 0, VMEM_ADDR_MIN,
	    VMEM_ADDR_MAX, M_NOWAIT | M_NOVM | M_USE_RESERVE | M_BESTFIT,
	    &addr) == 0) {
		if (kmem_back(kmem_object, addr, bytes,
		    M_NOWAIT | M_USE_RESERVE) == 0) {
			mtx_unlock(&vmem_bt_lock);
			return ((void *)addr);
		}
		vmem_xfree(kmem_arena, addr, bytes);
		mtx_unlock(&vmem_bt_lock);
		/*
		 * Out of memory, not address space.  This may not even be
		 * possible due to M_USE_RESERVE page allocation.
		 */
		if (wait & M_WAITOK)
			VM_WAIT;
		return (NULL);
	}
	mtx_unlock(&vmem_bt_lock);
	/*
	 * We're either out of address space or lost a fill race.
	 */
	if (wait & M_WAITOK)
		pause("btalloc", 1);

	return (NULL);
}
#endif

void
vmem_startup(void)
{

	mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF);
	vmem_bt_zone = uma_zcreate("vmem btag",
	    sizeof(struct vmem_btag), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_VM);
#ifndef UMA_MD_SMALL_ALLOC
	mtx_init(&vmem_bt_lock, "btag lock", NULL, MTX_DEF);
	uma_prealloc(vmem_bt_zone, BT_MAXALLOC);
	/*
	 * Reserve enough tags to allocate new tags.  We allow multiple
	 * CPUs to attempt to allocate new tags concurrently to limit
	 * false restarts in UMA.
	 */
	uma_zone_reserve(vmem_bt_zone, BT_MAXALLOC * (mp_ncpus + 1) / 2);
	uma_zone_set_allocf(vmem_bt_zone, vmem_bt_alloc);
#endif
}

/* ---- rehash */

static int
vmem_rehash(vmem_t *vm, vmem_size_t newhashsize)
{
	bt_t *bt;
	int i;
	struct vmem_hashlist *newhashlist;
	struct vmem_hashlist *oldhashlist;
	vmem_size_t oldhashsize;

	MPASS(newhashsize > 0);

	newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize,
	    M_VMEM, M_NOWAIT);
	if (newhashlist == NULL)
		return ENOMEM;
	for (i = 0; i < newhashsize; i++) {
		LIST_INIT(&newhashlist[i]);
	}

	VMEM_LOCK(vm);
	oldhashlist = vm->vm_hashlist;
	oldhashsize = vm->vm_hashsize;
	vm->vm_hashlist = newhashlist;
	vm->vm_hashsize = newhashsize;
	if (oldhashlist == NULL) {
		VMEM_UNLOCK(vm);
		return 0;
	}
	for (i = 0; i < oldhashsize; i++) {
		while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
			bt_rembusy(vm, bt);
			bt_insbusy(vm, bt);
		}
	}
	VMEM_UNLOCK(vm);

	if (oldhashlist != vm->vm_hash0) {
		free(oldhashlist, M_VMEM);
	}

	return 0;
}

static void
vmem_periodic_kick(void *dummy)
{

	taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk);
}

static void
vmem_periodic(void *unused, int pending)
{
	vmem_t *vm;
	vmem_size_t desired;
	vmem_size_t current;

	mtx_lock(&vmem_list_lock);
	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
#ifdef DIAGNOSTIC
		/* Convenient time to verify vmem state. */
		if (enable_vmem_check == 1) {
			VMEM_LOCK(vm);
			vmem_check(vm);
			VMEM_UNLOCK(vm);
		}
#endif
		desired = 1 << flsl(vm->vm_nbusytag);
		desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN),
		    VMEM_HASHSIZE_MAX);
		current = vm->vm_hashsize;

		/* Grow in powers of two.  Shrink less aggressively. */
		if (desired >= current * 2 || desired * 4 <= current)
			vmem_rehash(vm, desired);
	}
	mtx_unlock(&vmem_list_lock);

	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
	    vmem_periodic_kick, NULL);
}

static void
vmem_start_callout(void *unused)
{

	TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL);
	vmem_periodic_interval = hz * 10;
	callout_init(&vmem_periodic_ch, CALLOUT_MPSAFE);
	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
	    vmem_periodic_kick, NULL);
}
SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL);

static void
vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int type)
{
	bt_t *btspan;
	bt_t *btfree;

	MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC);
	MPASS((size & vm->vm_quantum_mask) == 0);

	btspan = bt_alloc(vm);
	btspan->bt_type = type;
	btspan->bt_start = addr;
	btspan->bt_size = size;
	bt_insseg_tail(vm, btspan);

	btfree = bt_alloc(vm);
	btfree->bt_type = BT_TYPE_FREE;
	btfree->bt_start = addr;
	btfree->bt_size = size;
	bt_insseg(vm, btfree, btspan);
	bt_insfree(vm, btfree);

	vm->vm_size += size;
}

static void
vmem_destroy1(vmem_t *vm)
{
	bt_t *bt;

	/*
	 * Drain per-cpu quantum caches.
	 */
	qc_destroy(vm);

	/*
	 * The vmem should now only contain empty segments.
	 */
	VMEM_LOCK(vm);
	MPASS(vm->vm_nbusytag == 0);

	while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL)
		bt_remseg(vm, bt);

	if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0)
		free(vm->vm_hashlist, M_VMEM);

	bt_freetrim(vm, 0);

	VMEM_CONDVAR_DESTROY(vm);
	VMEM_LOCK_DESTROY(vm);
	free(vm, M_VMEM);
}

static int
vmem_import(vmem_t *vm, vmem_size_t size, vmem_size_t align, int flags)
{
	vmem_addr_t addr;
	int error;

	if (vm->vm_importfn == NULL)
		return EINVAL;

	/*
	 * To make sure we get a span that meets the alignment we double it
	 * and add the size to the tail.  This slightly overestimates.
	 */
	if (align != vm->vm_quantum_mask + 1)
		size = (align * 2) + size;
	size = roundup(size, vm->vm_import_quantum);
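	/*
	 * Example with made-up numbers: in a 4KB-quantum arena importing in
	 * 1MB chunks, a 128KB request aligned to 64KB becomes
	 * 64KB * 2 + 128KB = 256KB here, and is then rounded up to a 1MB
	 * import.
	 */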

	/*
	 * Hide MAXALLOC tags so we're guaranteed to be able to add this
	 * span and the tag we want to allocate from it.
	 */
	MPASS(vm->vm_nfreetags >= BT_MAXALLOC);
	vm->vm_nfreetags -= BT_MAXALLOC;
	VMEM_UNLOCK(vm);
	error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
	VMEM_LOCK(vm);
	vm->vm_nfreetags += BT_MAXALLOC;
	if (error)
		return ENOMEM;

	vmem_add1(vm, addr, size, BT_TYPE_SPAN);

	return 0;
}

/*
 * vmem_fit: check if a bt can satisfy the given restrictions.
 *
 * It is the caller's responsibility to ensure the region is big enough
 * before calling us.
 */
static int
vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
    vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
    vmem_addr_t maxaddr, vmem_addr_t *addrp)
{
	vmem_addr_t start;
	vmem_addr_t end;

	MPASS(size > 0);
	MPASS(bt->bt_size >= size); /* caller's responsibility */

	/*
	 * XXX assumption: vmem_addr_t and vmem_size_t are
	 * unsigned integers of the same size.
	 */

	start = bt->bt_start;
	if (start < minaddr) {
		start = minaddr;
	}
	end = BT_END(bt);
	if (end > maxaddr)
		end = maxaddr;
	if (start > end)
		return (ENOMEM);

	start = VMEM_ALIGNUP(start - phase, align) + phase;
	if (start < bt->bt_start)
		start += align;
	if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
		MPASS(align < nocross);
		start = VMEM_ALIGNUP(start - phase, nocross) + phase;
	}
	if (start <= end && end - start >= size - 1) {
		MPASS((start & (align - 1)) == phase);
		MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross));
		MPASS(minaddr <= start);
		MPASS(maxaddr == 0 || start + size - 1 <= maxaddr);
		MPASS(bt->bt_start <= start);
		MPASS(BT_END(bt) - start >= size - 1);
		*addrp = start;

		return (0);
	}
	return (ENOMEM);
}
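/*
 * Worked example for vmem_fit(), with illustrative values: for a free tag
 * covering [0x10000, 0x20000) and a request of size 0x1000, align 0x4000,
 * phase 0x100, nocross 0 and no address limits, the candidate computed
 * above is VMEM_ALIGNUP(0x10000 - 0x100, 0x4000) + 0x100 == 0x10100,
 * which lies within the tag and satisfies (start & (align - 1)) == phase.
 */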

/*
 * vmem_clip:  Trim the boundary tag edges to the requested start and size.
 */
static void
vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size)
{
	bt_t *btnew;
	bt_t *btprev;

	VMEM_ASSERT_LOCKED(vm);
	MPASS(bt->bt_type == BT_TYPE_FREE);
	MPASS(bt->bt_size >= size);
	bt_remfree(vm, bt);
	if (bt->bt_start != start) {
		btprev = bt_alloc(vm);
		btprev->bt_type = BT_TYPE_FREE;
		btprev->bt_start = bt->bt_start;
		btprev->bt_size = start - bt->bt_start;
		bt->bt_start = start;
		bt->bt_size -= btprev->bt_size;
		bt_insfree(vm, btprev);
		bt_insseg(vm, btprev,
		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
	}
	MPASS(bt->bt_start == start);
	if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
		/* split */
		btnew = bt_alloc(vm);
		btnew->bt_type = BT_TYPE_BUSY;
		btnew->bt_start = bt->bt_start;
		btnew->bt_size = size;
		bt->bt_start = bt->bt_start + size;
		bt->bt_size -= size;
		bt_insfree(vm, bt);
		bt_insseg(vm, btnew,
		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
		bt_insbusy(vm, btnew);
		bt = btnew;
	} else {
		bt->bt_type = BT_TYPE_BUSY;
		bt_insbusy(vm, bt);
	}
	MPASS(bt->bt_size >= size);
	bt->bt_type = BT_TYPE_BUSY;
}

/* ---- vmem API */

void
vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
     vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum)
{

	VMEM_LOCK(vm);
	vm->vm_importfn = importfn;
	vm->vm_releasefn = releasefn;
	vm->vm_arg = arg;
	vm->vm_import_quantum = import_quantum;
	VMEM_UNLOCK(vm);
}

void
vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
{

	VMEM_LOCK(vm);
	vm->vm_reclaimfn = reclaimfn;
	VMEM_UNLOCK(vm);
}

/*
 * vmem_init: Initializes vmem arena.
 */
vmem_t *
vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
    vmem_size_t quantum, vmem_size_t qcache_max, int flags)
{
	int i;

	MPASS(quantum > 0);
	MPASS((quantum & (quantum - 1)) == 0);

	bzero(vm, sizeof(*vm));

	VMEM_CONDVAR_INIT(vm, name);
	VMEM_LOCK_INIT(vm, name);
	vm->vm_nfreetags = 0;
	LIST_INIT(&vm->vm_freetags);
	strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
	vm->vm_quantum_mask = quantum - 1;
	vm->vm_quantum_shift = flsl(quantum) - 1;
	vm->vm_nbusytag = 0;
	vm->vm_size = 0;
	vm->vm_inuse = 0;
	qc_init(vm, qcache_max);

	TAILQ_INIT(&vm->vm_seglist);
	for (i = 0; i < VMEM_MAXORDER; i++) {
		LIST_INIT(&vm->vm_freelist[i]);
	}
	memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
	vm->vm_hashsize = VMEM_HASHSIZE_MIN;
	vm->vm_hashlist = vm->vm_hash0;

	if (size != 0) {
		if (vmem_add(vm, base, size, flags) != 0) {
			vmem_destroy1(vm);
			return NULL;
		}
	}

	mtx_lock(&vmem_list_lock);
	LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
	mtx_unlock(&vmem_list_lock);

	return vm;
}

/*
 * vmem_create: create an arena.
 */
vmem_t *
vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
    vmem_size_t quantum, vmem_size_t qcache_max, int flags)
{

	vmem_t *vm;

	vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT));
	if (vm == NULL)
		return (NULL);
	if (vmem_init(vm, name, base, size, quantum, qcache_max,
	    flags) == NULL) {
		free(vm, M_VMEM);
		return (NULL);
	}
	return (vm);
}

void
vmem_destroy(vmem_t *vm)
{

	mtx_lock(&vmem_list_lock);
	LIST_REMOVE(vm, vm_alllist);
	mtx_unlock(&vmem_list_lock);

	vmem_destroy1(vm);
}

vmem_size_t
vmem_roundup_size(vmem_t *vm, vmem_size_t size)
{

	return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
}
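/*
 * E.g. with a 4KB quantum (mask 0xfff), vmem_roundup_size() maps a 1 byte
 * request to 4KB and an 8193 byte request to 12KB; exact multiples of the
 * quantum are returned unchanged.
 */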

/*
 * vmem_alloc: allocate resource from the arena.
 */
int
vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp)
{
	const int strat __unused = flags & VMEM_FITMASK;
	qcache_t *qc;

	flags &= VMEM_FLAGS;
	MPASS(size > 0);
	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
	if ((flags & M_NOWAIT) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc");

	if (size <= vm->vm_qcache_max) {
		qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
		*addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags);
		if (*addrp == 0)
			return (ENOMEM);
		return (0);
	}

	return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
	    flags, addrp);
}
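/*
 * Typical usage (a sketch, not taken from this file): a caller needing a
 * naturally aligned 64KB region below 4GB would bypass the quantum cache
 * and call
 *
 *	vmem_xalloc(vm, 65536, 65536, 0, 0, VMEM_ADDR_MIN, 0xffffffff,
 *	    M_BESTFIT | M_WAITOK, &addr);
 *
 * while plain vmem_alloc(vm, size, M_BESTFIT | M_WAITOK, &addr) above is
 * the unconstrained path that may be satisfied directly from the cache.
 */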

int
vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
    const vmem_size_t phase, const vmem_size_t nocross,
    const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags,
    vmem_addr_t *addrp)
{
	const vmem_size_t size = vmem_roundup_size(vm, size0);
	struct vmem_freelist *list;
	struct vmem_freelist *first;
	struct vmem_freelist *end;
	vmem_size_t avail;
	bt_t *bt;
	int error;
	int strat;

	flags &= VMEM_FLAGS;
	strat = flags & VMEM_FITMASK;
	MPASS(size0 > 0);
	MPASS(size > 0);
	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
	MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK));
	if ((flags & M_NOWAIT) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc");
	MPASS((align & vm->vm_quantum_mask) == 0);
	MPASS((align & (align - 1)) == 0);
	MPASS((phase & vm->vm_quantum_mask) == 0);
	MPASS((nocross & vm->vm_quantum_mask) == 0);
	MPASS((nocross & (nocross - 1)) == 0);
	MPASS((align == 0 && phase == 0) || phase < align);
	MPASS(nocross == 0 || nocross >= size);
	MPASS(minaddr <= maxaddr);
	MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross));

	if (align == 0)
		align = vm->vm_quantum_mask + 1;

	*addrp = 0;
	end = &vm->vm_freelist[VMEM_MAXORDER];
	/*
	 * choose a free block from which we allocate.
	 */
	first = bt_freehead_toalloc(vm, size, strat);
	VMEM_LOCK(vm);
	for (;;) {
		/*
		 * Make sure we have enough tags to complete the
		 * operation.
		 */
		if (vm->vm_nfreetags < BT_MAXALLOC &&
		    bt_fill(vm, flags) != 0) {
			error = ENOMEM;
			break;
		}
		/*
		 * Scan freelists looking for a tag that satisfies the
		 * allocation.  If we're doing BESTFIT we may encounter
		 * sizes below the request.  If we're doing FIRSTFIT we
		 * inspect only the first element from each list.
		 */
		for (list = first; list < end; list++) {
			LIST_FOREACH(bt, list, bt_freelist) {
				if (bt->bt_size >= size) {
					error = vmem_fit(bt, size, align, phase,
					    nocross, minaddr, maxaddr, addrp);
					if (error == 0) {
						vmem_clip(vm, bt, *addrp, size);
						goto out;
					}
				}
				/* FIRST skips to the next list. */
				if (strat == M_FIRSTFIT)
					break;
			}
		}
		/*
		 * Retry if the fast algorithm failed.
		 */
		if (strat == M_FIRSTFIT) {
			strat = M_BESTFIT;
			first = bt_freehead_toalloc(vm, size, strat);
			continue;
		}
		/*
		 * XXX it is possible to fail to meet restrictions with the
		 * imported region.  It is up to the user to specify the
		 * import quantum such that it can satisfy any allocation.
		 */
		if (vmem_import(vm, size, align, flags) == 0)
			continue;

		/*
		 * Try to free some space from the quantum cache or reclaim
		 * functions if available.
		 */
		if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) {
			avail = vm->vm_size - vm->vm_inuse;
			VMEM_UNLOCK(vm);
			if (vm->vm_qcache_max != 0)
				qc_drain(vm);
			if (vm->vm_reclaimfn != NULL)
				vm->vm_reclaimfn(vm, flags);
			VMEM_LOCK(vm);
			/* If we were successful retry even NOWAIT. */
			if (vm->vm_size - vm->vm_inuse > avail)
				continue;
		}
		if ((flags & M_NOWAIT) != 0) {
			error = ENOMEM;
			break;
		}
		VMEM_CONDVAR_WAIT(vm);
	}
out:
	VMEM_UNLOCK(vm);
	if (error != 0 && (flags & M_NOWAIT) == 0)
		panic("failed to allocate waiting allocation\n");

	return (error);
}

/*
 * vmem_free: free the resource to the arena.
 */
void
vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{
	qcache_t *qc;
	MPASS(size > 0);

	if (size <= vm->vm_qcache_max) {
		qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
		uma_zfree(qc->qc_cache, (void *)addr);
	} else
		vmem_xfree(vm, addr, size);
}

void
vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{
	bt_t *bt;
	bt_t *t;

	MPASS(size > 0);

	VMEM_LOCK(vm);
	bt = bt_lookupbusy(vm, addr);
	MPASS(bt != NULL);
	MPASS(bt->bt_start == addr);
	MPASS(bt->bt_size == vmem_roundup_size(vm, size) ||
	    bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
	MPASS(bt->bt_type == BT_TYPE_BUSY);
	bt_rembusy(vm, bt);
	bt->bt_type = BT_TYPE_FREE;

	/* coalesce */
	t = TAILQ_NEXT(bt, bt_seglist);
	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
		MPASS(BT_END(bt) < t->bt_start);	/* YYY */
		bt->bt_size += t->bt_size;
		bt_remfree(vm, t);
		bt_remseg(vm, t);
	}
	t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
		MPASS(BT_END(t) < bt->bt_start);	/* YYY */
		bt->bt_size += t->bt_size;
		bt->bt_start = t->bt_start;
		bt_remfree(vm, t);
		bt_remseg(vm, t);
	}

	t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
	MPASS(t != NULL);
	MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
	if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
	    t->bt_size == bt->bt_size) {
		vmem_addr_t spanaddr;
		vmem_size_t spansize;

		MPASS(t->bt_start == bt->bt_start);
		spanaddr = bt->bt_start;
		spansize = bt->bt_size;
		bt_remseg(vm, bt);
		bt_remseg(vm, t);
		vm->vm_size -= spansize;
		VMEM_CONDVAR_BROADCAST(vm);
		bt_freetrim(vm, BT_MAXFREE);
		(*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
	} else {
		bt_insfree(vm, bt);
		VMEM_CONDVAR_BROADCAST(vm);
		bt_freetrim(vm, BT_MAXFREE);
	}
}

/*
 * vmem_add: add a span of resource to the arena.  The span is marked
 * static and is never handed back to the release function.
 */
int
vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags)
{
	int error;

	error = 0;
	flags &= VMEM_FLAGS;
	VMEM_LOCK(vm);
	if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0)
		vmem_add1(vm, addr, size, BT_TYPE_SPAN_STATIC);
	else
		error = ENOMEM;
	VMEM_UNLOCK(vm);

	return (error);
}

/*
 * vmem_size: report the arena's total, in-use, or free size.
 */
vmem_size_t
vmem_size(vmem_t *vm, int typemask)
{

	switch (typemask) {
	case VMEM_ALLOC:
		return vm->vm_inuse;
	case VMEM_FREE:
		return vm->vm_size - vm->vm_inuse;
	case VMEM_FREE|VMEM_ALLOC:
		return vm->vm_size;
	default:
		panic("vmem_size");
	}
}

/* ---- debug */

#if defined(DDB) || defined(DIAGNOSTIC)

static void bt_dump(const bt_t *, int (*)(const char *, ...)
    __printflike(1, 2));

static const char *
bt_type_string(int type)
{

	switch (type) {
	case BT_TYPE_BUSY:
		return "busy";
	case BT_TYPE_FREE:
		return "free";
	case BT_TYPE_SPAN:
		return "span";
	case BT_TYPE_SPAN_STATIC:
		return "static span";
	default:
		break;
	}
	return "BOGUS";
}

static void
bt_dump(const bt_t *bt, int (*pr)(const char *, ...))
{

	(*pr)("\t%p: %jx %jx, %d(%s)\n",
	    bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size,
	    bt->bt_type, bt_type_string(bt->bt_type));
}

static void
vmem_dump(const vmem_t *vm, int (*pr)(const char *, ...) __printflike(1, 2))
{
	const bt_t *bt;
	int i;

	(*pr)("vmem %p '%s'\n", vm, vm->vm_name);
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		bt_dump(bt, pr);
	}

	for (i = 0; i < VMEM_MAXORDER; i++) {
		const struct vmem_freelist *fl = &vm->vm_freelist[i];

		if (LIST_EMPTY(fl)) {
			continue;
		}

		(*pr)("freelist[%d]\n", i);
		LIST_FOREACH(bt, fl, bt_freelist) {
			bt_dump(bt, pr);
		}
	}
}

#endif /* defined(DDB) || defined(DIAGNOSTIC) */

#if defined(DDB)
#include <ddb/ddb.h>

static bt_t *
vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr)
{
	bt_t *bt;

	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		if (BT_ISSPAN_P(bt)) {
			continue;
		}
		if (bt->bt_start <= addr && addr <= BT_END(bt)) {
			return bt;
		}
	}

	return NULL;
}

void
vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...))
{
	vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
		bt_t *bt;

		bt = vmem_whatis_lookup(vm, addr);
		if (bt == NULL) {
			continue;
		}
		(*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
		    (void *)addr, (void *)bt->bt_start,
		    (vmem_size_t)(addr - bt->bt_start), vm->vm_name,
		    (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
	}
}

void
vmem_printall(const char *modif, int (*pr)(const char *, ...))
{
	const vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
		vmem_dump(vm, pr);
	}
}

void
vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...))
{
	const vmem_t *vm = (const void *)addr;

	vmem_dump(vm, pr);
}

DB_SHOW_COMMAND(vmemdump, vmemdump)
{

	if (!have_addr) {
		db_printf("usage: show vmemdump <addr>\n");
		return;
	}

	vmem_dump((const vmem_t *)addr, db_printf);
}

DB_SHOW_ALL_COMMAND(vmemdump, vmemdumpall)
{
	const vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist)
		vmem_dump(vm, db_printf);
}

DB_SHOW_COMMAND(vmem, vmem_summ)
{
	const vmem_t *vm = (const void *)addr;
	const bt_t *bt;
	size_t ft[VMEM_MAXORDER], ut[VMEM_MAXORDER];
	size_t fs[VMEM_MAXORDER], us[VMEM_MAXORDER];
	int ord;

	if (!have_addr) {
		db_printf("usage: show vmem <addr>\n");
		return;
	}

	db_printf("vmem %p '%s'\n", vm, vm->vm_name);
	db_printf("\tquantum:\t%zu\n", vm->vm_quantum_mask + 1);
	db_printf("\tsize:\t%zu\n", vm->vm_size);
	db_printf("\tinuse:\t%zu\n", vm->vm_inuse);
	db_printf("\tfree:\t%zu\n", vm->vm_size - vm->vm_inuse);
	db_printf("\tbusy tags:\t%d\n", vm->vm_nbusytag);
	db_printf("\tfree tags:\t%d\n", vm->vm_nfreetags);

	memset(&ft, 0, sizeof(ft));
	memset(&ut, 0, sizeof(ut));
	memset(&fs, 0, sizeof(fs));
	memset(&us, 0, sizeof(us));
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		ord = SIZE2ORDER(bt->bt_size >> vm->vm_quantum_shift);
		if (bt->bt_type == BT_TYPE_BUSY) {
			ut[ord]++;
			us[ord] += bt->bt_size;
		} else if (bt->bt_type == BT_TYPE_FREE) {
			ft[ord]++;
			fs[ord] += bt->bt_size;
		}
	}
	db_printf("\t\t\tinuse\tsize\t\tfree\tsize\n");
	for (ord = 0; ord < VMEM_MAXORDER; ord++) {
		if (ut[ord] == 0 && ft[ord] == 0)
			continue;
		db_printf("\t%-15zu %zu\t%-15zu %zu\t%-16zu\n",
		    ORDER2SIZE(ord) << vm->vm_quantum_shift,
		    ut[ord], us[ord], ft[ord], fs[ord]);
	}
}

DB_SHOW_ALL_COMMAND(vmem, vmem_summall)
{
	const vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist)
		vmem_summ((db_expr_t)vm, TRUE, count, modif);
}
#endif /* defined(DDB) */

#define vmem_printf printf

#if defined(DIAGNOSTIC)

static bool
vmem_check_sanity(vmem_t *vm)
{
	const bt_t *bt, *bt2;

	MPASS(vm != NULL);

	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		if (bt->bt_start > BT_END(bt)) {
			printf("corrupted tag\n");
			bt_dump(bt, vmem_printf);
			return false;
		}
	}
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
			if (bt == bt2) {
				continue;
			}
			if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
				continue;
			}
			if (bt->bt_start <= BT_END(bt2) &&
			    bt2->bt_start <= BT_END(bt)) {
				printf("overwrapped tags\n");
				bt_dump(bt, vmem_printf);
				bt_dump(bt2, vmem_printf);
				return false;
			}
		}
	}

	return true;
}

static void
vmem_check(vmem_t *vm)
{

	if (!vmem_check_sanity(vm)) {
		panic("insanity vmem %p", vm);
	}
}

#endif /* defined(DIAGNOSTIC) */