/*-
 * Copyright (c)2006,2007,2008,2009 YAMAMOTO Takashi,
 * Copyright (c) 2013 EMC Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * From:
 *	$NetBSD: vmem_impl.h,v 1.2 2013/01/29 21:26:24 para Exp $
 *	$NetBSD: subr_vmem.c,v 1.83 2013/03/06 11:20:10 yamt Exp $
 */

/*
 * reference:
 * -	Magazines and Vmem: Extending the Slab Allocator
 *	to Many CPUs and Arbitrary Resources
 *	http://www.usenix.org/event/usenix01/bonwick.html
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/subr_vmem.c 252330 2013-06-28 03:51:20Z jeff $");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/callout.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/condvar.h>
#include <sys/taskqueue.h>
#include <sys/vmem.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/vm_pageout.h>

#define	VMEM_MAXORDER		(sizeof(vmem_size_t) * NBBY)

#define	VMEM_HASHSIZE_MIN	16
#define	VMEM_HASHSIZE_MAX	131072

#define	VMEM_QCACHE_IDX_MAX	16

#define	VMEM_FITMASK	(M_BESTFIT | M_FIRSTFIT)

#define	VMEM_FLAGS						\
    (M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM | M_BESTFIT | M_FIRSTFIT)

#define	BT_FLAGS	(M_NOWAIT | M_WAITOK | M_USE_RESERVE | M_NOVM)

#define	QC_NAME_MAX	16

/*
 * Data structures private to vmem.
 */
MALLOC_DEFINE(M_VMEM, "vmem", "vmem internal structures");

typedef struct vmem_btag bt_t;

TAILQ_HEAD(vmem_seglist, vmem_btag);
LIST_HEAD(vmem_freelist, vmem_btag);
LIST_HEAD(vmem_hashlist, vmem_btag);

struct qcache {
	uma_zone_t	qc_cache;
	vmem_t		*qc_vmem;
	vmem_size_t	qc_size;
	char		qc_name[QC_NAME_MAX];
};
typedef struct qcache qcache_t;
#define	QC_POOL_TO_QCACHE(pool)	((qcache_t *)((pool)->pr_qcache))

#define	VMEM_NAME_MAX	16

/* vmem arena */
struct vmem {
	struct mtx_padalign	vm_lock;
	struct cv		vm_cv;
	char			vm_name[VMEM_NAME_MAX+1];
	LIST_ENTRY(vmem)	vm_alllist;
	struct vmem_hashlist	vm_hash0[VMEM_HASHSIZE_MIN];
	struct vmem_freelist	vm_freelist[VMEM_MAXORDER];
	struct vmem_seglist	vm_seglist;
	struct vmem_hashlist	*vm_hashlist;
	vmem_size_t		vm_hashsize;

	/* Constant after init */
	vmem_size_t		vm_qcache_max;
	vmem_size_t		vm_quantum_mask;
	vmem_size_t		vm_import_quantum;
	int			vm_quantum_shift;

	/* Written on alloc/free */
	LIST_HEAD(, vmem_btag)	vm_freetags;
	int			vm_nfreetags;
	int			vm_nbusytag;
	vmem_size_t		vm_inuse;
	vmem_size_t		vm_size;

	/* Used on import. */
	vmem_import_t		*vm_importfn;
	vmem_release_t		*vm_releasefn;
	void			*vm_arg;

	/* Space exhaustion callback. */
	vmem_reclaim_t		*vm_reclaimfn;

	/* quantum cache */
	qcache_t		vm_qcache[VMEM_QCACHE_IDX_MAX];
};

/* boundary tag */
struct vmem_btag {
	TAILQ_ENTRY(vmem_btag) bt_seglist;
	union {
		LIST_ENTRY(vmem_btag) u_freelist; /* BT_TYPE_FREE */
		LIST_ENTRY(vmem_btag) u_hashlist; /* BT_TYPE_BUSY */
	} bt_u;
#define	bt_hashlist	bt_u.u_hashlist
#define	bt_freelist	bt_u.u_freelist
	vmem_addr_t	bt_start;
	vmem_size_t	bt_size;
	int		bt_type;
};

#define	BT_TYPE_SPAN		1	/* Allocated from importfn */
#define	BT_TYPE_SPAN_STATIC	2	/* vmem_add() or create. */
#define	BT_TYPE_FREE		3	/* Available space. */
#define	BT_TYPE_BUSY		4	/* Used space. */
#define	BT_ISSPAN_P(bt)	((bt)->bt_type <= BT_TYPE_SPAN_STATIC)

#define	BT_END(bt)	((bt)->bt_start + (bt)->bt_size - 1)

#if defined(DIAGNOSTIC)
static void vmem_check(vmem_t *);
#endif

static struct callout	vmem_periodic_ch;
static int		vmem_periodic_interval;
static struct task	vmem_periodic_wk;

static struct mtx_padalign vmem_list_lock;
static LIST_HEAD(, vmem) vmem_list = LIST_HEAD_INITIALIZER(vmem_list);

/* ---- misc */
#define	VMEM_CONDVAR_INIT(vm, wchan)	cv_init(&vm->vm_cv, wchan)
#define	VMEM_CONDVAR_DESTROY(vm)	cv_destroy(&vm->vm_cv)
#define	VMEM_CONDVAR_WAIT(vm)		cv_wait(&vm->vm_cv, &vm->vm_lock)
#define	VMEM_CONDVAR_BROADCAST(vm)	cv_broadcast(&vm->vm_cv)

#define	VMEM_LOCK(vm)		mtx_lock(&vm->vm_lock)
#define	VMEM_TRYLOCK(vm)	mtx_trylock(&vm->vm_lock)
#define	VMEM_UNLOCK(vm)		mtx_unlock(&vm->vm_lock)
#define	VMEM_LOCK_INIT(vm, name) mtx_init(&vm->vm_lock, (name), NULL, MTX_DEF)
#define	VMEM_LOCK_DESTROY(vm)	mtx_destroy(&vm->vm_lock)
#define	VMEM_ASSERT_LOCKED(vm)	mtx_assert(&vm->vm_lock, MA_OWNED)

#define	VMEM_ALIGNUP(addr, align)	(-(-(addr) & -(align)))

#define	VMEM_CROSS_P(addr1, addr2, boundary) \
	((((addr1) ^ (addr2)) & -(boundary)) != 0)
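
/*
 * Example: VMEM_ALIGNUP() rounds up to a power-of-two boundary using
 * two's-complement arithmetic; e.g. VMEM_ALIGNUP(0x123, 0x100) == 0x200.
 * VMEM_CROSS_P(a1, a2, b) is true iff a1 and a2 lie in different b-aligned
 * blocks, i.e. the range [a1, a2] crosses a 'b' boundary.
 */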

#define	ORDER2SIZE(order)	((vmem_size_t)1 << (order))
#define	SIZE2ORDER(size)	((int)flsl(size) - 1)

/*
 * Maximum number of boundary tags that may be required to satisfy an
 * allocation.  Two may be required to import.  Another two may be
 * required to clip edges.
 */
#define	BT_MAXALLOC	4

/*
 * Max free limits the number of locally cached boundary tags.  We
 * just want to avoid hitting the zone allocator for every call.
 */
#define BT_MAXFREE	(BT_MAXALLOC * 8)

/* Allocator for boundary tags. */
static uma_zone_t vmem_bt_zone;

/* boot time arena storage. */
static struct vmem buffer_arena_storage;
static struct vmem transient_arena_storage;
vmem_t *buffer_arena = &buffer_arena_storage;
vmem_t *transient_arena = &transient_arena_storage;
/*
 * Fill the vmem's boundary tag cache.  We guarantee that boundary tag
 * allocation will not fail once bt_fill() succeeds.  To do so we cache
 * at least BT_MAXALLOC tags, the most that a single allocation can
 * consume.
 */
static int
bt_fill(vmem_t *vm, int flags)
{
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);

	/*
	 * Loop until we meet the reserve.  To minimize the lock shuffle
	 * and prevent simultaneous fills we first try a NOWAIT regardless
	 * of the caller's flags.  Specify M_NOVM so we don't recurse while
	 * holding a vmem lock.
	 */
	while (vm->vm_nfreetags < BT_MAXALLOC) {
		bt = uma_zalloc(vmem_bt_zone,
		    (flags & M_USE_RESERVE) | M_NOWAIT | M_NOVM);
		if (bt == NULL) {
			VMEM_UNLOCK(vm);
			bt = uma_zalloc(vmem_bt_zone, flags);
			VMEM_LOCK(vm);
			if (bt == NULL && (flags & M_NOWAIT) != 0)
				break;
		}
		LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
		vm->vm_nfreetags++;
	}

	if (vm->vm_nfreetags < BT_MAXALLOC)
		return ENOMEM;

	return 0;
}

/*
 * Pop a tag off of the freetag stack.
 */
static bt_t *
bt_alloc(vmem_t *vm)
{
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);
	bt = LIST_FIRST(&vm->vm_freetags);
	MPASS(bt != NULL);
	LIST_REMOVE(bt, bt_freelist);
	vm->vm_nfreetags--;

	return bt;
}

/*
 * Trim the per-vmem free list.  Returns with the lock released to
 * avoid allocator recursions.
 */
static void
bt_freetrim(vmem_t *vm, int freelimit)
{
	LIST_HEAD(, vmem_btag) freetags;
	bt_t *bt;

	LIST_INIT(&freetags);
	VMEM_ASSERT_LOCKED(vm);
	while (vm->vm_nfreetags > freelimit) {
		bt = LIST_FIRST(&vm->vm_freetags);
		LIST_REMOVE(bt, bt_freelist);
		vm->vm_nfreetags--;
		LIST_INSERT_HEAD(&freetags, bt, bt_freelist);
	}
	VMEM_UNLOCK(vm);
	while ((bt = LIST_FIRST(&freetags)) != NULL) {
		LIST_REMOVE(bt, bt_freelist);
		uma_zfree(vmem_bt_zone, bt);
	}
}

static inline void
bt_free(vmem_t *vm, bt_t *bt)
{

	VMEM_ASSERT_LOCKED(vm);
	MPASS(LIST_FIRST(&vm->vm_freetags) != bt);
	LIST_INSERT_HEAD(&vm->vm_freetags, bt, bt_freelist);
	vm->vm_nfreetags++;
}

/*
 * freelist[0] ... [1, 1]
 * freelist[1] ... [2, 3]
 * freelist[2] ... [4, 7]
 * freelist[3] ... [8, 15]
 *  :
 * freelist[n] ... [(1 << n), (1 << (n + 1)) - 1]
 *  :
 */
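
/*
 * Worked example: with a PAGE_SIZE (4KB) quantum, a freed 24KB segment
 * has qsize 6; SIZE2ORDER(6) == 2, so it is placed on freelist[2],
 * which holds segments of 4 to 7 quanta (16KB to 28KB).
 */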

static struct vmem_freelist *
bt_freehead_tofree(vmem_t *vm, vmem_size_t size)
{
	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
	const int idx = SIZE2ORDER(qsize);

	MPASS(size != 0 && qsize != 0);
	MPASS((size & vm->vm_quantum_mask) == 0);
	MPASS(idx >= 0);
	MPASS(idx < VMEM_MAXORDER);

	return &vm->vm_freelist[idx];
}

/*
 * bt_freehead_toalloc: return the freelist for the given size and allocation
 * strategy.
 *
 * For M_FIRSTFIT, return the list on which any block is guaranteed to be
 * large enough for the requested size.  Otherwise, return the list that
 * may contain blocks large enough for the requested size.
 */
static struct vmem_freelist *
bt_freehead_toalloc(vmem_t *vm, vmem_size_t size, int strat)
{
	const vmem_size_t qsize = size >> vm->vm_quantum_shift;
	int idx = SIZE2ORDER(qsize);

	MPASS(size != 0 && qsize != 0);
	MPASS((size & vm->vm_quantum_mask) == 0);

	if (strat == M_FIRSTFIT && ORDER2SIZE(idx) != qsize) {
		idx++;
		/* check too large request? */
	}
	MPASS(idx >= 0);
	MPASS(idx < VMEM_MAXORDER);

	return &vm->vm_freelist[idx];
}
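
/*
 * For example, a 6-quantum M_FIRSTFIT request starts its search at
 * freelist[3] (8 to 15 quanta), where any non-empty entry is guaranteed
 * to be large enough; M_BESTFIT starts at freelist[2] and takes the
 * first segment it finds that actually fits.
 */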

/* ---- boundary tag hash */

static struct vmem_hashlist *
bt_hashhead(vmem_t *vm, vmem_addr_t addr)
{
	struct vmem_hashlist *list;
	unsigned int hash;

	hash = hash32_buf(&addr, sizeof(addr), 0);
	list = &vm->vm_hashlist[hash % vm->vm_hashsize];

	return list;
}

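/*
 * bt_lookupbusy: find the busy tag whose start address is exactly 'addr'.
 */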
static bt_t *
bt_lookupbusy(vmem_t *vm, vmem_addr_t addr)
{
	struct vmem_hashlist *list;
	bt_t *bt;

	VMEM_ASSERT_LOCKED(vm);
	list = bt_hashhead(vm, addr);
	LIST_FOREACH(bt, list, bt_hashlist) {
		if (bt->bt_start == addr) {
			break;
		}
	}

	return bt;
}

static void
bt_rembusy(vmem_t *vm, bt_t *bt)
{

	VMEM_ASSERT_LOCKED(vm);
	MPASS(vm->vm_nbusytag > 0);
	vm->vm_inuse -= bt->bt_size;
	vm->vm_nbusytag--;
	LIST_REMOVE(bt, bt_hashlist);
}

static void
bt_insbusy(vmem_t *vm, bt_t *bt)
{
	struct vmem_hashlist *list;

	VMEM_ASSERT_LOCKED(vm);
	MPASS(bt->bt_type == BT_TYPE_BUSY);

	list = bt_hashhead(vm, bt->bt_start);
	LIST_INSERT_HEAD(list, bt, bt_hashlist);
	vm->vm_nbusytag++;
	vm->vm_inuse += bt->bt_size;
}

/* ---- boundary tag list */

static void
bt_remseg(vmem_t *vm, bt_t *bt)
{

	TAILQ_REMOVE(&vm->vm_seglist, bt, bt_seglist);
	bt_free(vm, bt);
}

static void
bt_insseg(vmem_t *vm, bt_t *bt, bt_t *prev)
{

	TAILQ_INSERT_AFTER(&vm->vm_seglist, prev, bt, bt_seglist);
}

static void
bt_insseg_tail(vmem_t *vm, bt_t *bt)
{

	TAILQ_INSERT_TAIL(&vm->vm_seglist, bt, bt_seglist);
}

static void
bt_remfree(vmem_t *vm, bt_t *bt)
{

	MPASS(bt->bt_type == BT_TYPE_FREE);

	LIST_REMOVE(bt, bt_freelist);
}

static void
bt_insfree(vmem_t *vm, bt_t *bt)
{
	struct vmem_freelist *list;

	list = bt_freehead_tofree(vm, bt->bt_size);
	LIST_INSERT_HEAD(list, bt, bt_freelist);
}

/* ---- vmem internal functions */

/*
 * Import from the arena into the quantum cache in UMA.
 */
static int
qc_import(void *arg, void **store, int cnt, int flags)
{
	qcache_t *qc;
	vmem_addr_t addr;
	int i;

	qc = arg;
	flags |= M_BESTFIT;
	for (i = 0; i < cnt; i++) {
		if (vmem_xalloc(qc->qc_vmem, qc->qc_size, 0, 0, 0,
		    VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags, &addr) != 0)
			break;
		store[i] = (void *)addr;
		/* Only guarantee one allocation. */
		flags &= ~M_WAITOK;
		flags |= M_NOWAIT;
	}
	return i;
}

/*
 * Release memory from the UMA cache to the arena.
 */
static void
qc_release(void *arg, void **store, int cnt)
{
	qcache_t *qc;
	int i;

	qc = arg;
	for (i = 0; i < cnt; i++)
		vmem_xfree(qc->qc_vmem, (vmem_addr_t)store[i], qc->qc_size);
}

static void
qc_init(vmem_t *vm, vmem_size_t qcache_max)
{
	qcache_t *qc;
	vmem_size_t size;
	int qcache_idx_max;
	int i;

	MPASS((qcache_max & vm->vm_quantum_mask) == 0);
	qcache_idx_max = MIN(qcache_max >> vm->vm_quantum_shift,
	    VMEM_QCACHE_IDX_MAX);
	vm->vm_qcache_max = qcache_idx_max << vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++) {
		qc = &vm->vm_qcache[i];
		size = (i + 1) << vm->vm_quantum_shift;
		snprintf(qc->qc_name, sizeof(qc->qc_name), "%s-%zu",
		    vm->vm_name, size);
		qc->qc_vmem = vm;
		qc->qc_size = size;
		qc->qc_cache = uma_zcache_create(qc->qc_name, size,
		    NULL, NULL, NULL, NULL, qc_import, qc_release, qc,
		    UMA_ZONE_VM);
		MPASS(qc->qc_cache);
	}
}
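
/*
 * For example, an arena created with a 4KB quantum and a 64KB qcache_max
 * gets sixteen UMA caches, "name-4096" through "name-65536", one per
 * multiple of the quantum; allocations that fit are satisfied from these
 * caches without taking the arena lock.
 */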

static void
qc_destroy(vmem_t *vm)
{
	int qcache_idx_max;
	int i;

	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++)
		uma_zdestroy(vm->vm_qcache[i].qc_cache);
}

static void
qc_drain(vmem_t *vm)
{
	int qcache_idx_max;
	int i;

	qcache_idx_max = vm->vm_qcache_max >> vm->vm_quantum_shift;
	for (i = 0; i < qcache_idx_max; i++)
		zone_drain(vm->vm_qcache[i].qc_cache);
}

void
vmem_startup(void)
{

	mtx_init(&vmem_list_lock, "vmem list lock", NULL, MTX_DEF);
	vmem_bt_zone = uma_zcreate("vmem btag",
	    sizeof(struct vmem_btag), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_VM);
}

/* ---- rehash */

static int
vmem_rehash(vmem_t *vm, vmem_size_t newhashsize)
{
	bt_t *bt;
	int i;
	struct vmem_hashlist *newhashlist;
	struct vmem_hashlist *oldhashlist;
	vmem_size_t oldhashsize;

	MPASS(newhashsize > 0);

	newhashlist = malloc(sizeof(struct vmem_hashlist) * newhashsize,
	    M_VMEM, M_NOWAIT);
	if (newhashlist == NULL)
		return ENOMEM;
	for (i = 0; i < newhashsize; i++) {
		LIST_INIT(&newhashlist[i]);
	}

	VMEM_LOCK(vm);
	oldhashlist = vm->vm_hashlist;
	oldhashsize = vm->vm_hashsize;
	vm->vm_hashlist = newhashlist;
	vm->vm_hashsize = newhashsize;
	if (oldhashlist == NULL) {
		VMEM_UNLOCK(vm);
		return 0;
	}
	for (i = 0; i < oldhashsize; i++) {
		while ((bt = LIST_FIRST(&oldhashlist[i])) != NULL) {
			bt_rembusy(vm, bt);
			bt_insbusy(vm, bt);
		}
	}
	VMEM_UNLOCK(vm);

	if (oldhashlist != vm->vm_hash0) {
		free(oldhashlist, M_VMEM);
	}

	return 0;
}

static void
vmem_periodic_kick(void *dummy)
{

	taskqueue_enqueue(taskqueue_thread, &vmem_periodic_wk);
}

static void
vmem_periodic(void *unused, int pending)
{
	vmem_t *vm;
	vmem_size_t desired;
	vmem_size_t current;

	mtx_lock(&vmem_list_lock);
	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
#ifdef DIAGNOSTIC
		/* Convenient time to verify vmem state. */
		VMEM_LOCK(vm);
		vmem_check(vm);
		VMEM_UNLOCK(vm);
#endif
		desired = 1 << flsl(vm->vm_nbusytag);
		desired = MIN(MAX(desired, VMEM_HASHSIZE_MIN),
		    VMEM_HASHSIZE_MAX);
		current = vm->vm_hashsize;

		/* Grow in powers of two.  Shrink less aggressively. */
		if (desired >= current * 2 || desired * 4 <= current)
			vmem_rehash(vm, desired);
	}
	mtx_unlock(&vmem_list_lock);

	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
	    vmem_periodic_kick, NULL);
}

static void
vmem_start_callout(void *unused)
{

	TASK_INIT(&vmem_periodic_wk, 0, vmem_periodic, NULL);
	vmem_periodic_interval = hz * 10;
	callout_init(&vmem_periodic_ch, CALLOUT_MPSAFE);
	callout_reset(&vmem_periodic_ch, vmem_periodic_interval,
	    vmem_periodic_kick, NULL);
}
SYSINIT(vfs, SI_SUB_CONFIGURE, SI_ORDER_ANY, vmem_start_callout, NULL);

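/*
 * vmem_add1: add a span to the arena.  The span appears on the segment
 * list as a BT_TYPE_SPAN{,_STATIC} tag immediately followed by a
 * BT_TYPE_FREE tag covering the same range.
 */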
static void
vmem_add1(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags, int type)
{
	bt_t *btspan;
	bt_t *btfree;

	MPASS(type == BT_TYPE_SPAN || type == BT_TYPE_SPAN_STATIC);

	btspan = bt_alloc(vm);
	btspan->bt_type = type;
	btspan->bt_start = addr;
	btspan->bt_size = size;

	btfree = bt_alloc(vm);
	btfree->bt_type = BT_TYPE_FREE;
	btfree->bt_start = addr;
	btfree->bt_size = size;

	bt_insseg_tail(vm, btspan);
	bt_insseg(vm, btfree, btspan);
	bt_insfree(vm, btfree);
	vm->vm_size += size;
}

static void
vmem_destroy1(vmem_t *vm)
{
	bt_t *bt;

	/*
	 * Drain per-cpu quantum caches.
	 */
	qc_destroy(vm);

	/*
	 * The vmem should now only contain empty segments.
	 */
	VMEM_LOCK(vm);
	MPASS(vm->vm_nbusytag == 0);

	while ((bt = TAILQ_FIRST(&vm->vm_seglist)) != NULL)
		bt_remseg(vm, bt);

	if (vm->vm_hashlist != NULL && vm->vm_hashlist != vm->vm_hash0)
		free(vm->vm_hashlist, M_VMEM);

	bt_freetrim(vm, 0);

	VMEM_CONDVAR_DESTROY(vm);
	VMEM_LOCK_DESTROY(vm);
	free(vm, M_VMEM);
}

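/*
 * vmem_import: grow the arena by importing a new span of at least 'size'
 * bytes (rounded up to vm_import_quantum) from vm_importfn.
 */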
static int
vmem_import(vmem_t *vm, vmem_size_t size, int flags)
{
	vmem_addr_t addr;
	int error;

	if (vm->vm_importfn == NULL)
		return EINVAL;

	size = roundup(size, vm->vm_import_quantum);

	/*
	 * Hide MAXALLOC tags so we're guaranteed to be able to add this
	 * span and the tag we want to allocate from it.
	 */
	MPASS(vm->vm_nfreetags >= BT_MAXALLOC);
	vm->vm_nfreetags -= BT_MAXALLOC;
	VMEM_UNLOCK(vm);
	error = (vm->vm_importfn)(vm->vm_arg, size, flags, &addr);
	VMEM_LOCK(vm);
	vm->vm_nfreetags += BT_MAXALLOC;
	if (error)
		return ENOMEM;

	vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN);

	return 0;
}

/*
 * vmem_fit: check if a bt can satisfy the given restrictions.
 *
 * It is the caller's responsibility to ensure the region is big enough
 * before calling us.
 */
static int
vmem_fit(const bt_t *bt, vmem_size_t size, vmem_size_t align,
    vmem_size_t phase, vmem_size_t nocross, vmem_addr_t minaddr,
    vmem_addr_t maxaddr, vmem_addr_t *addrp)
{
	vmem_addr_t start;
	vmem_addr_t end;

	MPASS(size > 0);
	MPASS(bt->bt_size >= size); /* caller's responsibility */

	/*
	 * XXX assumption: vmem_addr_t and vmem_size_t are
	 * unsigned integer of the same size.
	 */

	start = bt->bt_start;
	if (start < minaddr) {
		start = minaddr;
	}
	end = BT_END(bt);
	if (end > maxaddr)
		end = maxaddr;
	if (start > end)
		return (ENOMEM);

	start = VMEM_ALIGNUP(start - phase, align) + phase;
	if (start < bt->bt_start)
		start += align;
	if (VMEM_CROSS_P(start, start + size - 1, nocross)) {
		MPASS(align < nocross);
		start = VMEM_ALIGNUP(start - phase, nocross) + phase;
	}
	if (start <= end && end - start >= size - 1) {
		MPASS((start & (align - 1)) == phase);
		MPASS(!VMEM_CROSS_P(start, start + size - 1, nocross));
		MPASS(minaddr <= start);
		MPASS(maxaddr == 0 || start + size - 1 <= maxaddr);
		MPASS(bt->bt_start <= start);
		MPASS(BT_END(bt) - start >= size - 1);
		*addrp = start;

		return (0);
	}
	return (ENOMEM);
}

/*
 * vmem_clip:  Trim the boundary tag edges to the requested start and size.
 */
static void
vmem_clip(vmem_t *vm, bt_t *bt, vmem_addr_t start, vmem_size_t size)
{
	bt_t *btnew;
	bt_t *btprev;

	VMEM_ASSERT_LOCKED(vm);
	MPASS(bt->bt_type == BT_TYPE_FREE);
	MPASS(bt->bt_size >= size);
	bt_remfree(vm, bt);
	if (bt->bt_start != start) {
		btprev = bt_alloc(vm);
		btprev->bt_type = BT_TYPE_FREE;
		btprev->bt_start = bt->bt_start;
		btprev->bt_size = start - bt->bt_start;
		bt->bt_start = start;
		bt->bt_size -= btprev->bt_size;
		bt_insfree(vm, btprev);
		bt_insseg(vm, btprev,
		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
	}
	MPASS(bt->bt_start == start);
	if (bt->bt_size != size && bt->bt_size - size > vm->vm_quantum_mask) {
		/* split */
		btnew = bt_alloc(vm);
		btnew->bt_type = BT_TYPE_BUSY;
		btnew->bt_start = bt->bt_start;
		btnew->bt_size = size;
		bt->bt_start = bt->bt_start + size;
		bt->bt_size -= size;
		bt_insfree(vm, bt);
		bt_insseg(vm, btnew,
		    TAILQ_PREV(bt, vmem_seglist, bt_seglist));
		bt_insbusy(vm, btnew);
		bt = btnew;
	} else {
		bt->bt_type = BT_TYPE_BUSY;
		bt_insbusy(vm, bt);
	}
	MPASS(bt->bt_size >= size);
	bt->bt_type = BT_TYPE_BUSY;
}
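
/*
 * Schematically, clipping a free segment for an allocation at 'start':
 *
 *	before:	[............... free ...............]
 *	after:	[ free ][........ busy ........][free]
 *		 btprev           btnew            bt
 *
 * The leading and trailing free tags are only created when a usable
 * leftover remains on the respective side.
 */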

/* ---- vmem API */

void
vmem_set_import(vmem_t *vm, vmem_import_t *importfn,
     vmem_release_t *releasefn, void *arg, vmem_size_t import_quantum)
{

	VMEM_LOCK(vm);
	vm->vm_importfn = importfn;
	vm->vm_releasefn = releasefn;
	vm->vm_arg = arg;
	vm->vm_import_quantum = import_quantum;
	VMEM_UNLOCK(vm);
}
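
/*
 * Usage sketch (illustrative only; 'child', 'parent', sub_import and
 * sub_release are hypothetical): stacking one arena on another,
 * importing in 64KB chunks:
 *
 *	static int
 *	sub_import(void *arg, vmem_size_t size, int flags, vmem_addr_t *addrp)
 *	{
 *
 *		return (vmem_alloc(arg, size,
 *		    (flags & ~VMEM_FITMASK) | M_BESTFIT, addrp));
 *	}
 *
 *	static void
 *	sub_release(void *arg, vmem_addr_t addr, vmem_size_t size)
 *	{
 *
 *		vmem_free(arg, addr, size);
 *	}
 *
 *	vmem_set_import(child, sub_import, sub_release, parent, 64 * 1024);
 */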

void
vmem_set_reclaim(vmem_t *vm, vmem_reclaim_t *reclaimfn)
{

	VMEM_LOCK(vm);
	vm->vm_reclaimfn = reclaimfn;
	VMEM_UNLOCK(vm);
}

/*
 * vmem_init: initialize an arena in caller-provided storage.
 */
vmem_t *
vmem_init(vmem_t *vm, const char *name, vmem_addr_t base, vmem_size_t size,
    vmem_size_t quantum, vmem_size_t qcache_max, int flags)
{
	int i;

	MPASS(quantum > 0);

	bzero(vm, sizeof(*vm));

	VMEM_CONDVAR_INIT(vm, name);
	VMEM_LOCK_INIT(vm, name);
	vm->vm_nfreetags = 0;
	LIST_INIT(&vm->vm_freetags);
	strlcpy(vm->vm_name, name, sizeof(vm->vm_name));
	vm->vm_quantum_mask = quantum - 1;
	vm->vm_quantum_shift = SIZE2ORDER(quantum);
	MPASS(ORDER2SIZE(vm->vm_quantum_shift) == quantum);
	vm->vm_nbusytag = 0;
	vm->vm_size = 0;
	vm->vm_inuse = 0;
	qc_init(vm, qcache_max);

	TAILQ_INIT(&vm->vm_seglist);
	for (i = 0; i < VMEM_MAXORDER; i++) {
		LIST_INIT(&vm->vm_freelist[i]);
	}
	memset(&vm->vm_hash0, 0, sizeof(vm->vm_hash0));
	vm->vm_hashsize = VMEM_HASHSIZE_MIN;
	vm->vm_hashlist = vm->vm_hash0;

	if (size != 0) {
		if (vmem_add(vm, base, size, flags) != 0) {
			vmem_destroy1(vm);
			return NULL;
		}
	}

	mtx_lock(&vmem_list_lock);
	LIST_INSERT_HEAD(&vmem_list, vm, vm_alllist);
	mtx_unlock(&vmem_list_lock);

	return vm;
}

/*
 * vmem_create: create an arena.
 */
vmem_t *
vmem_create(const char *name, vmem_addr_t base, vmem_size_t size,
    vmem_size_t quantum, vmem_size_t qcache_max, int flags)
{
	vmem_t *vm;

	vm = malloc(sizeof(*vm), M_VMEM, flags & (M_WAITOK|M_NOWAIT));
	if (vm == NULL)
		return (NULL);
	if (vmem_init(vm, name, base, size, quantum, qcache_max,
	    flags) == NULL) {
		free(vm, M_VMEM);
		return (NULL);
	}
	return (vm);
}

void
vmem_destroy(vmem_t *vm)
{

	mtx_lock(&vmem_list_lock);
	LIST_REMOVE(vm, vm_alllist);
	mtx_unlock(&vmem_list_lock);

	vmem_destroy1(vm);
}

vmem_size_t
vmem_roundup_size(vmem_t *vm, vmem_size_t size)
{

	return (size + vm->vm_quantum_mask) & ~vm->vm_quantum_mask;
}

/*
 * vmem_alloc: allocate resource from the arena.
 */
int
vmem_alloc(vmem_t *vm, vmem_size_t size, int flags, vmem_addr_t *addrp)
{
	const int strat __unused = flags & VMEM_FITMASK;
	qcache_t *qc;

	flags &= VMEM_FLAGS;
	MPASS(size > 0);
	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
	if ((flags & M_NOWAIT) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_alloc");

	if (size <= vm->vm_qcache_max) {
		qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
		*addrp = (vmem_addr_t)uma_zalloc(qc->qc_cache, flags);
		if (*addrp == 0)
			return (ENOMEM);
		return (0);
	}

	return vmem_xalloc(vm, size, 0, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
	    flags, addrp);
}
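
/*
 * Usage sketch: a typical allocate/free pair against an arena with a
 * PAGE_SIZE quantum; a small request like this is served from the
 * quantum cache when qcache_max permits:
 *
 *	vmem_addr_t addr;
 *
 *	if (vmem_alloc(vm, 2 * PAGE_SIZE, M_BESTFIT | M_WAITOK, &addr) == 0) {
 *		...
 *		vmem_free(vm, addr, 2 * PAGE_SIZE);
 *	}
 */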

int
vmem_xalloc(vmem_t *vm, const vmem_size_t size0, vmem_size_t align,
    const vmem_size_t phase, const vmem_size_t nocross,
    const vmem_addr_t minaddr, const vmem_addr_t maxaddr, int flags,
    vmem_addr_t *addrp)
{
	const vmem_size_t size = vmem_roundup_size(vm, size0);
	struct vmem_freelist *list;
	struct vmem_freelist *first;
	struct vmem_freelist *end;
	vmem_size_t avail;
	bt_t *bt;
	int error;
	int strat;

	flags &= VMEM_FLAGS;
	strat = flags & VMEM_FITMASK;
	MPASS(size0 > 0);
	MPASS(size > 0);
	MPASS(strat == M_BESTFIT || strat == M_FIRSTFIT);
	MPASS((flags & (M_NOWAIT|M_WAITOK)) != (M_NOWAIT|M_WAITOK));
	if ((flags & M_NOWAIT) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "vmem_xalloc");
	MPASS((align & vm->vm_quantum_mask) == 0);
	MPASS((align & (align - 1)) == 0);
	MPASS((phase & vm->vm_quantum_mask) == 0);
	MPASS((nocross & vm->vm_quantum_mask) == 0);
	MPASS((nocross & (nocross - 1)) == 0);
	MPASS((align == 0 && phase == 0) || phase < align);
	MPASS(nocross == 0 || nocross >= size);
	MPASS(minaddr <= maxaddr);
	MPASS(!VMEM_CROSS_P(phase, phase + size - 1, nocross));

	if (align == 0)
		align = vm->vm_quantum_mask + 1;

	*addrp = 0;
	end = &vm->vm_freelist[VMEM_MAXORDER];
	/*
	 * choose a free block from which we allocate.
	 */
	first = bt_freehead_toalloc(vm, size, strat);
	VMEM_LOCK(vm);
	for (;;) {
		/*
		 * Make sure we have enough tags to complete the
		 * operation.
		 */
		if (vm->vm_nfreetags < BT_MAXALLOC &&
		    bt_fill(vm, flags) != 0) {
			error = ENOMEM;
			break;
		}
		/*
		 * Scan freelists looking for a tag that satisfies the
		 * allocation.  If we're doing BESTFIT we may encounter
		 * sizes below the request.  If we're doing FIRSTFIT we
		 * inspect only the first element from each list.
		 */
		for (list = first; list < end; list++) {
			LIST_FOREACH(bt, list, bt_freelist) {
				if (bt->bt_size >= size) {
					error = vmem_fit(bt, size, align, phase,
					    nocross, minaddr, maxaddr, addrp);
					if (error == 0) {
						vmem_clip(vm, bt, *addrp, size);
						goto out;
					}
				}
				/* FIRST skips to the next list. */
				if (strat == M_FIRSTFIT)
					break;
			}
		}
		/*
		 * Retry if the fast algorithm failed.
		 */
		if (strat == M_FIRSTFIT) {
			strat = M_BESTFIT;
			first = bt_freehead_toalloc(vm, size, strat);
			continue;
		}
		/*
		 * XXX it is possible to fail to meet restrictions with the
		 * imported region.  It is up to the user to specify the
		 * import quantum such that it can satisfy any allocation.
		 */
		if (vmem_import(vm, size, flags) == 0)
			continue;

		/*
		 * Try to free some space from the quantum cache or reclaim
		 * functions if available.
		 */
		if (vm->vm_qcache_max != 0 || vm->vm_reclaimfn != NULL) {
			avail = vm->vm_size - vm->vm_inuse;
			VMEM_UNLOCK(vm);
			if (vm->vm_qcache_max != 0)
				qc_drain(vm);
			if (vm->vm_reclaimfn != NULL)
				vm->vm_reclaimfn(vm, flags);
			VMEM_LOCK(vm);
			/* If we were successful, retry even NOWAIT. */
			if (vm->vm_size - vm->vm_inuse > avail)
				continue;
		}
		if ((flags & M_NOWAIT) != 0) {
			error = ENOMEM;
			break;
		}
		VMEM_CONDVAR_WAIT(vm);
	}
out:
	VMEM_UNLOCK(vm);
	if (error != 0 && (flags & M_NOWAIT) == 0)
		panic("failed to allocate waiting allocation");

	return (error);
}
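
/*
 * Usage sketch: a constrained allocation, e.g. 64KB, aligned to 64KB,
 * not crossing a 1MB boundary, anywhere in the address space:
 *
 *	error = vmem_xalloc(vm, 64 * 1024, 64 * 1024, 0, 1024 * 1024,
 *	    VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_BESTFIT | M_NOWAIT, &addr);
 */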

/*
 * vmem_free: free the resource to the arena.
 */
void
vmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{
	qcache_t *qc;

	MPASS(size > 0);

	if (size <= vm->vm_qcache_max) {
		qc = &vm->vm_qcache[(size - 1) >> vm->vm_quantum_shift];
		uma_zfree(qc->qc_cache, (void *)addr);
	} else
		vmem_xfree(vm, addr, size);
}

void
vmem_xfree(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{
	bt_t *bt;
	bt_t *t;

	MPASS(size > 0);

	VMEM_LOCK(vm);
	bt = bt_lookupbusy(vm, addr);
	MPASS(bt != NULL);
	MPASS(bt->bt_start == addr);
	MPASS(bt->bt_size == vmem_roundup_size(vm, size) ||
	    bt->bt_size - vmem_roundup_size(vm, size) <= vm->vm_quantum_mask);
	MPASS(bt->bt_type == BT_TYPE_BUSY);
	bt_rembusy(vm, bt);
	bt->bt_type = BT_TYPE_FREE;

	/* coalesce */
	t = TAILQ_NEXT(bt, bt_seglist);
	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
		MPASS(BT_END(bt) < t->bt_start);	/* YYY */
		bt->bt_size += t->bt_size;
		bt_remfree(vm, t);
		bt_remseg(vm, t);
	}
	t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
	if (t != NULL && t->bt_type == BT_TYPE_FREE) {
		MPASS(BT_END(t) < bt->bt_start);	/* YYY */
		bt->bt_size += t->bt_size;
		bt->bt_start = t->bt_start;
		bt_remfree(vm, t);
		bt_remseg(vm, t);
	}

	t = TAILQ_PREV(bt, vmem_seglist, bt_seglist);
	MPASS(t != NULL);
	MPASS(BT_ISSPAN_P(t) || t->bt_type == BT_TYPE_BUSY);
	if (vm->vm_releasefn != NULL && t->bt_type == BT_TYPE_SPAN &&
	    t->bt_size == bt->bt_size) {
		vmem_addr_t spanaddr;
		vmem_size_t spansize;

		MPASS(t->bt_start == bt->bt_start);
		spanaddr = bt->bt_start;
		spansize = bt->bt_size;
		bt_remseg(vm, bt);
		bt_remseg(vm, t);
		vm->vm_size -= spansize;
		VMEM_CONDVAR_BROADCAST(vm);
		bt_freetrim(vm, BT_MAXFREE);
		(*vm->vm_releasefn)(vm->vm_arg, spanaddr, spansize);
	} else {
		bt_insfree(vm, bt);
		VMEM_CONDVAR_BROADCAST(vm);
		bt_freetrim(vm, BT_MAXFREE);
	}
}

/*
 * vmem_add: add a span of resource to the arena as a static span.
 */
int
vmem_add(vmem_t *vm, vmem_addr_t addr, vmem_size_t size, int flags)
{
	int error;

	error = 0;
	flags &= VMEM_FLAGS;
	VMEM_LOCK(vm);
	if (vm->vm_nfreetags >= BT_MAXALLOC || bt_fill(vm, flags) == 0)
		vmem_add1(vm, addr, size, flags, BT_TYPE_SPAN_STATIC);
	else
		error = ENOMEM;
	VMEM_UNLOCK(vm);

	return (error);
}

/*
 * vmem_size: report the arena's total, free, or allocated size, as
 * selected by typemask.
 */
vmem_size_t
vmem_size(vmem_t *vm, int typemask)
{

	switch (typemask) {
	case VMEM_ALLOC:
		return vm->vm_inuse;
	case VMEM_FREE:
		return vm->vm_size - vm->vm_inuse;
	case VMEM_FREE|VMEM_ALLOC:
		return vm->vm_size;
	default:
		panic("vmem_size");
	}
}

/* ---- debug */

#if defined(DDB) || defined(DIAGNOSTIC)

static void bt_dump(const bt_t *, int (*)(const char *, ...)
    __printflike(1, 2));

static const char *
bt_type_string(int type)
{

	switch (type) {
	case BT_TYPE_BUSY:
		return "busy";
	case BT_TYPE_FREE:
		return "free";
	case BT_TYPE_SPAN:
		return "span";
	case BT_TYPE_SPAN_STATIC:
		return "static span";
	default:
		break;
	}
	return "BOGUS";
}

static void
bt_dump(const bt_t *bt, int (*pr)(const char *, ...))
{

	(*pr)("\t%p: %jx %jx, %d(%s)\n",
	    bt, (intmax_t)bt->bt_start, (intmax_t)bt->bt_size,
	    bt->bt_type, bt_type_string(bt->bt_type));
}

static void
vmem_dump(const vmem_t *vm, int (*pr)(const char *, ...) __printflike(1, 2))
{
	const bt_t *bt;
	int i;

	(*pr)("vmem %p '%s'\n", vm, vm->vm_name);
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		bt_dump(bt, pr);
	}

	for (i = 0; i < VMEM_MAXORDER; i++) {
		const struct vmem_freelist *fl = &vm->vm_freelist[i];

		if (LIST_EMPTY(fl)) {
			continue;
		}

		(*pr)("freelist[%d]\n", i);
		LIST_FOREACH(bt, fl, bt_freelist) {
			bt_dump(bt, pr);
		}
	}
}

#endif /* defined(DDB) || defined(DIAGNOSTIC) */

#if defined(DDB)
static bt_t *
vmem_whatis_lookup(vmem_t *vm, vmem_addr_t addr)
{
	bt_t *bt;

	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		if (BT_ISSPAN_P(bt)) {
			continue;
		}
		if (bt->bt_start <= addr && addr <= BT_END(bt)) {
			return bt;
		}
	}

	return NULL;
}

void
vmem_whatis(vmem_addr_t addr, int (*pr)(const char *, ...))
{
	vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
		bt_t *bt;

		bt = vmem_whatis_lookup(vm, addr);
		if (bt == NULL) {
			continue;
		}
		(*pr)("%p is %p+%zu in VMEM '%s' (%s)\n",
		    (void *)addr, (void *)bt->bt_start,
		    (vmem_size_t)(addr - bt->bt_start), vm->vm_name,
		    (bt->bt_type == BT_TYPE_BUSY) ? "allocated" : "free");
	}
}

void
vmem_printall(const char *modif, int (*pr)(const char *, ...))
{
	const vmem_t *vm;

	LIST_FOREACH(vm, &vmem_list, vm_alllist) {
		vmem_dump(vm, pr);
	}
}

void
vmem_print(vmem_addr_t addr, const char *modif, int (*pr)(const char *, ...))
{
	const vmem_t *vm = (const void *)addr;

	vmem_dump(vm, pr);
}
#endif /* defined(DDB) */

#define vmem_printf printf

#if defined(DIAGNOSTIC)

static bool
vmem_check_sanity(vmem_t *vm)
{
	const bt_t *bt, *bt2;

	MPASS(vm != NULL);

	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		if (bt->bt_start > BT_END(bt)) {
			printf("corrupted tag\n");
			bt_dump(bt, vmem_printf);
			return false;
		}
	}
	TAILQ_FOREACH(bt, &vm->vm_seglist, bt_seglist) {
		TAILQ_FOREACH(bt2, &vm->vm_seglist, bt_seglist) {
			if (bt == bt2) {
				continue;
			}
			if (BT_ISSPAN_P(bt) != BT_ISSPAN_P(bt2)) {
				continue;
			}
			if (bt->bt_start <= BT_END(bt2) &&
			    bt2->bt_start <= BT_END(bt)) {
				printf("overlapping tags\n");
				bt_dump(bt, vmem_printf);
				bt_dump(bt2, vmem_printf);
				return false;
			}
		}
	}

	return true;
}

static void
vmem_check(vmem_t *vm)
{

	if (!vmem_check_sanity(vm)) {
		panic("insanity vmem %p", vm);
	}
}

#endif /* defined(DIAGNOSTIC) */
1372252330Sjeff#endif /* defined(DIAGNOSTIC) */
1373