/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: env_alloc.c,v 12.22 2008/01/27 18:02:31 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"

/*
 * Implement shared memory region allocation.  The initial list is a single
 * memory "chunk" which is carved up as memory is requested.  Chunks are
 * coalesced when free'd.  We maintain two types of linked-lists: a list of
 * all chunks sorted by address, and a set of lists with free chunks sorted
 * by size.
 *
 * The ALLOC_LAYOUT structure is the governing structure for the allocator.
 *
 * The ALLOC_ELEMENT structure is the structure that describes any single
 * chunk of memory, and is immediately followed by the user's memory.
 *
 * The internal memory chunks are always aligned to a uintmax_t boundary so
 * we don't drop core accessing the fields of the ALLOC_ELEMENT structure.
 *
 * The memory chunks returned to the user are aligned to a uintmax_t boundary.
 * This is enforced by terminating the ALLOC_ELEMENT structure with a uintmax_t
 * field as that immediately precedes the user's memory.  Any caller needing
 * more than uintmax_t alignment is responsible for doing alignment themselves.
 */
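
/*
 * For example, a freshly initialized region looks like this, with each
 * ALLOC_ELEMENT header immediately followed by the memory it describes:
 *
 *	+--------------+---------------+------------------------ ... --+
 *	| ALLOC_LAYOUT | ALLOC_ELEMENT |       available memory        |
 *	+--------------+---------------+------------------------ ... --+
 *	region base     elp             (u_int8_t *)elp + sizeof(*elp)
 *
 * As chunks are carved up and coalesced, the region remains a sequence
 * of such header/memory pairs, linked in address order through addrq.
 */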

typedef SH_TAILQ_HEAD(__sizeq) SIZEQ_HEAD;

typedef struct __alloc_layout {
	SH_TAILQ_HEAD(__addrq) addrq;		/* Sorted by address */

	/*
	 * A perfect Berkeley DB application does little allocation because
	 * most things are allocated on startup and never free'd.  This is
	 * true even for the cache, because we don't free and re-allocate
	 * the memory associated with a cache buffer when swapping a page
	 * in memory for a page on disk -- unless the page is changing size.
	 * The latter problem is why we have multiple size queues.  If the
	 * application's working set fits in cache, it's not a problem.  If
	 * the application's working set doesn't fit in cache, but all of
	 * the databases have the same size pages, it's still not a problem.
	 * If the application's working set doesn't fit in cache, and its
	 * databases have different page sizes, we can end up walking a lot
	 * of 512B chunk allocations looking for an available 64KB chunk.
	 *
	 * So, we keep a set of queues, where we expect to find a chunk of
	 * roughly the right size at the front of the list.  The first queue
	 * is chunks <= 1024, the second is <= 2048, and so on.  With 11
	 * queues, we have separate queues for chunks up to 1MB.
	 */
#define	DB_SIZE_Q_COUNT	11
	SIZEQ_HEAD	sizeq[DB_SIZE_Q_COUNT];	/* Sorted by size */
#ifdef HAVE_STATISTICS
	u_int32_t	pow2_size[DB_SIZE_Q_COUNT];
#endif

#ifdef HAVE_STATISTICS
	u_int32_t success;			/* Successful allocations */
	u_int32_t failure;			/* Failed allocations */
	u_int32_t freed;			/* Free calls */
	u_int32_t longest;			/* Longest chain walked */
#endif
	uintmax_t  unused;			/* Guarantee alignment */
} ALLOC_LAYOUT;

typedef struct __alloc_element {
	SH_TAILQ_ENTRY addrq;			/* List by address */
	SH_TAILQ_ENTRY sizeq;			/* List by size */

	/*
	 * The "len" field is the total length of the chunk, not the size
	 * available to the caller.
	 */
	size_t len;				/* Chunk length */

	/*
	 * The "ulen" field is the length returned to the caller.
	 *
	 * Set to 0 if the chunk is not currently in use.
	 */
	uintmax_t ulen;				/* User's length */
} ALLOC_ELEMENT;
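
/*
 * Because the header immediately precedes the user's memory, the user
 * pointer and the chunk header are interconvertible:
 *
 *	p   = (u_int8_t *)elp + sizeof(ALLOC_ELEMENT);
 *	elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT));
 *
 * __env_alloc_free relies on the second form to recover the header from
 * the pointer the caller hands back.
 */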

/*
 * If the chunk can be split into two pieces, with the fragment holding at
 * least 64 bytes of memory, we divide the chunk into two parts.
 */
#define	SHALLOC_FRAGMENT	(sizeof(ALLOC_ELEMENT) + 64)
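
/*
 * For example, on an LP64 build where sizeof(ALLOC_ELEMENT) is 48 bytes
 * (an assumption; the size is platform-dependent), SHALLOC_FRAGMENT is
 * 112 bytes: carving a 512-byte piece from a 600-byte chunk would leave
 * an 88-byte remainder, too small to be worth tracking as a separate
 * chunk, so the caller simply gets the whole 600 bytes.
 */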

/* Macro to find the appropriate queue for a specific size chunk. */
#undef	SET_QUEUE_FOR_SIZE
#define	SET_QUEUE_FOR_SIZE(head, q, i, len) do {			\
	for (i = 0; i < DB_SIZE_Q_COUNT; ++i) {				\
		q = &(head)->sizeq[i];					\
		if ((len) <= (size_t)1024 << i)				\
			break;						\
	}								\
} while (0)
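
/*
 * For example, a 3000-byte request skips the 1KB and 2KB queues and
 * stops at the 4KB queue (i == 2).  A request larger than 1MB runs the
 * loop to completion, leaving "q" pointing at the largest queue and "i"
 * equal to DB_SIZE_Q_COUNT, which callers must be careful to handle.
 */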

static void __env_size_insert __P((ALLOC_LAYOUT *, ALLOC_ELEMENT *));

/*
 * __env_alloc_init --
 *	Initialize the area as one large chunk.
 *
 * PUBLIC: void __env_alloc_init __P((REGINFO *, size_t));
 */
void
__env_alloc_init(infop, size)
	REGINFO *infop;
	size_t size;
{
	ALLOC_ELEMENT *elp;
	ALLOC_LAYOUT *head;
	ENV *env;
	u_int i;

	env = infop->env;

	/* No initialization needed for heap memory regions. */
	if (F_ISSET(env, ENV_PRIVATE))
		return;

	/*
	 * The first chunk of memory is the ALLOC_LAYOUT structure.
	 */
	head = infop->addr;
	memset(head, 0, sizeof(*head));
	SH_TAILQ_INIT(&head->addrq);
	for (i = 0; i < DB_SIZE_Q_COUNT; ++i)
		SH_TAILQ_INIT(&head->sizeq[i]);
	COMPQUIET(head->unused, 0);

	/*
	 * The rest of the memory is the first available chunk.
	 */
	elp = (ALLOC_ELEMENT *)((u_int8_t *)head + sizeof(ALLOC_LAYOUT));
	elp->len = size - sizeof(ALLOC_LAYOUT);
	elp->ulen = 0;

	SH_TAILQ_INSERT_HEAD(&head->addrq, elp, addrq, __alloc_element);
	SH_TAILQ_INSERT_HEAD(
	    &head->sizeq[DB_SIZE_Q_COUNT - 1], elp, sizeq, __alloc_element);
}
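
/*
 * For example, initializing a 1MB region produces a single free chunk
 * of 1MB minus sizeof(ALLOC_LAYOUT) bytes, hanging off the last (and
 * largest) size queue; every later allocation is carved out of it.
 */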

/*
 * The length, the ALLOC_ELEMENT structure and an optional guard byte,
 * rounded up to standard alignment.
 */
#ifdef DIAGNOSTIC
#define	DB_ALLOC_SIZE(len)						\
	(size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT) + 1, sizeof(uintmax_t))
#else
#define	DB_ALLOC_SIZE(len)						\
	(size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT), sizeof(uintmax_t))
#endif
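
/*
 * For example, with a 48-byte ALLOC_ELEMENT and an 8-byte uintmax_t (an
 * LP64 assumption), a 100-byte request rounds up to DB_ALIGN(148, 8) ==
 * 152 bytes without DIAGNOSTIC, and DB_ALIGN(149, 8) == 152 bytes with
 * the guard byte.  The rounding also guarantees the next chunk's header
 * begins on a uintmax_t boundary.
 */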

/*
 * __env_alloc_overhead --
 *	Return the overhead needed for an allocation.
 *
 * PUBLIC: size_t __env_alloc_overhead __P((void));
 */
size_t
__env_alloc_overhead()
{
	return (sizeof(ALLOC_ELEMENT));
}

/*
 * __env_alloc_size --
 *	Return the space needed for an allocation, including alignment.
 *
 * PUBLIC: size_t __env_alloc_size __P((size_t));
 */
size_t
__env_alloc_size(len)
	size_t len;
{
	return (DB_ALLOC_SIZE(len));
}

/*
 * __env_alloc --
 *	Allocate space from the shared region.
 *
 * PUBLIC: int __env_alloc __P((REGINFO *, size_t, void *));
 */
int
__env_alloc(infop, len, retp)
	REGINFO *infop;
	size_t len;
	void *retp;
{
	SIZEQ_HEAD *q;
	ALLOC_ELEMENT *elp, *frag, *elp_tmp;
	ALLOC_LAYOUT *head;
	ENV *env;
	size_t total_len;
	u_int8_t *p;
	u_int i;
	int ret;
#ifdef HAVE_STATISTICS
	u_int32_t st_search;
#endif
	env = infop->env;
	*(void **)retp = NULL;

	/*
	 * In a heap-backed environment, we call malloc for additional space.
	 * (Malloc must return memory correctly aligned for our use.)
	 *
	 * The memory is laid out as follows:
	 *
	 * { size_t total-length } { user-memory } { guard-byte }
	 */
	if (F_ISSET(env, ENV_PRIVATE)) {
		/* Check if we're over the limit; max_alloc of 0 means no limit. */
		if (infop->max_alloc != 0 &&
		    infop->allocated >= infop->max_alloc)
			return (ENOMEM);

		/* We need an additional size_t to hold the length. */
		len += sizeof(size_t);

#ifdef DIAGNOSTIC
		/* Plus one byte for the guard byte. */
		++len;
#endif
		/* Allocate the space. */
		if ((ret = __os_malloc(env, len, &p)) != 0)
			return (ret);
		infop->allocated += len;

		*(size_t *)p = len;
#ifdef DIAGNOSTIC
		p[len - 1] = GUARD_BYTE;
#endif
		*(void **)retp = p + sizeof(size_t);
		return (0);
	}

	head = infop->addr;
	total_len = DB_ALLOC_SIZE(len);

	/* Find the first size queue that could satisfy the request. */
	SET_QUEUE_FOR_SIZE(head, q, i, total_len);

#ifdef HAVE_STATISTICS
	/*
	 * Note the size of the request; requests bigger than the largest
	 * queue (where SET_QUEUE_FOR_SIZE leaves i == DB_SIZE_Q_COUNT)
	 * are counted in the last slot rather than indexing off the end
	 * of the array.
	 */
	++head->pow2_size[i >= DB_SIZE_Q_COUNT ? DB_SIZE_Q_COUNT - 1 : i];
#endif

	/*
	 * Search this queue, and, if necessary, queues larger than this queue,
	 * looking for a chunk we can use.
	 */
	STAT((st_search = 0));
	for (elp = NULL;; ++q) {
		SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element) {
			STAT((++st_search));

			/*
			 * Chunks are sorted from largest to smallest -- if
			 * this chunk is less than what we need, no chunk
			 * further down the list will be large enough.
			 */
			if (elp_tmp->len < total_len)
				break;

			/*
			 * This chunk will do... maybe there's a better one,
			 * but this one will do.
			 */
			elp = elp_tmp;

			/*
			 * We might have many chunks of the same size.  Stop
			 * looking if we won't fragment memory by picking the
			 * current one.
			 */
			if (elp_tmp->len - total_len <= SHALLOC_FRAGMENT)
				break;
		}
		if (elp != NULL || ++i >= DB_SIZE_Q_COUNT)
			break;
	}

#ifdef HAVE_STATISTICS
	if (head->longest < st_search)
		head->longest = st_search;
#endif

	/*
	 * If we don't find an element of the right size, we're done.
	 */
	if (elp == NULL) {
		STAT((++head->failure));
		return (ENOMEM);
	}
	STAT((++head->success));

	/* Pull the chunk off of the size queue. */
	SH_TAILQ_REMOVE(q, elp, sizeq, __alloc_element);

	if (elp->len - total_len > SHALLOC_FRAGMENT) {
		frag = (ALLOC_ELEMENT *)((u_int8_t *)elp + total_len);
		frag->len = elp->len - total_len;
		frag->ulen = 0;

		elp->len = total_len;

		/* The fragment follows the chunk on the address queue. */
		SH_TAILQ_INSERT_AFTER(
		    &head->addrq, elp, frag, addrq, __alloc_element);

		/* Insert the frag into the correct size queue. */
		__env_size_insert(head, frag);
	}

	p = (u_int8_t *)elp + sizeof(ALLOC_ELEMENT);
	elp->ulen = len;
#ifdef DIAGNOSTIC
	p[len] = GUARD_BYTE;
#endif
	*(void **)retp = p;

	return (0);
}
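
/*
 * A minimal caller sketch (hypothetical; error handling abbreviated).
 * If no chunk is large enough, __env_alloc returns ENOMEM and leaves
 * *retp NULL; on success the returned memory is uintmax_t-aligned:
 *
 *	void *p;
 *	int ret;
 *
 *	if ((ret = __env_alloc(infop, 512, &p)) != 0)
 *		return (ret);
 *	memset(p, 0, 512);
 *	__env_alloc_free(infop, p);
 */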

/*
 * __env_alloc_free --
 *	Free space into the shared region.
 *
 * PUBLIC: void __env_alloc_free __P((REGINFO *, void *));
 */
void
__env_alloc_free(infop, ptr)
	REGINFO *infop;
	void *ptr;
{
	ALLOC_ELEMENT *elp, *elp_tmp;
	ALLOC_LAYOUT *head;
	ENV *env;
	SIZEQ_HEAD *q;
	size_t len;
	u_int8_t *p;
	u_int i;

	env = infop->env;

	/* In a private region, we call free. */
	if (F_ISSET(env, ENV_PRIVATE)) {
		/* Find the start of the memory chunk and its length. */
		p = (u_int8_t *)((size_t *)ptr - 1);
		len = *(size_t *)p;

		infop->allocated -= len;

#ifdef DIAGNOSTIC
		/* Check the guard byte. */
		DB_ASSERT(env, p[len - 1] == GUARD_BYTE);

		/* Trash the memory chunk. */
		memset(p, CLEAR_BYTE, len);
#endif
		__os_free(env, p);
		return;
	}

	head = infop->addr;
	STAT((++head->freed));

	p = ptr;
	elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT));

#ifdef DIAGNOSTIC
	/* Check the guard byte. */
	DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE);

	/* Trash the memory chunk. */
	memset(p, CLEAR_BYTE, elp->len - sizeof(ALLOC_ELEMENT));
#endif

	/* Mark the memory as no longer in use. */
	elp->ulen = 0;

	/*
	 * Try and merge this chunk with chunks on either side of it.  Two
	 * chunks can be merged if they're contiguous and not in use.
	 */
	if ((elp_tmp =
	    SH_TAILQ_PREV(&head->addrq, elp, addrq, __alloc_element)) != NULL &&
	    elp_tmp->ulen == 0 &&
	    (u_int8_t *)elp_tmp + elp_tmp->len == (u_int8_t *)elp) {
		/*
		 * If we're merging the entry into a previous entry, remove the
		 * current entry from the addr queue and the previous entry from
		 * its size queue, and merge.
		 */
		SH_TAILQ_REMOVE(&head->addrq, elp, addrq, __alloc_element);
		SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
		SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);

		elp_tmp->len += elp->len;
		elp = elp_tmp;
	}
	if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL &&
	    elp_tmp->ulen == 0 &&
	    (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) {
		/*
		 * If we're merging the current entry into a subsequent entry,
		 * remove the subsequent entry from the addr and size queues
		 * and merge.
		 */
		SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element);
		SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len);
		SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element);

		elp->len += elp_tmp->len;
	}

	/* Insert in the correct place in the size queues. */
	__env_size_insert(head, elp);
}
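
/*
 * Coalescing example: given three address-adjacent chunks A, B and C
 * where A and C are already free, freeing B first merges it backward
 * into A (B leaves the address queue, A leaves its size queue), then
 * merges C forward into the combined chunk, leaving one free chunk of
 * len(A) + len(B) + len(C) to be re-inserted by size below.
 */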

/*
 * __env_size_insert --
 *	Insert into the correct place in the size queues.
 */
static void
__env_size_insert(head, elp)
	ALLOC_LAYOUT *head;
	ALLOC_ELEMENT *elp;
{
	SIZEQ_HEAD *q;
	ALLOC_ELEMENT *elp_tmp;
	u_int i;

	/* Find the appropriate queue for the chunk. */
	SET_QUEUE_FOR_SIZE(head, q, i, elp->len);

	/*
	 * Find the correct slot in the size queue: queues are kept sorted
	 * from largest to smallest (the allocation search depends on it),
	 * so insert before the first chunk no larger than this one.
	 */
	SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element)
		if (elp->len >= elp_tmp->len)
			break;
	if (elp_tmp == NULL)
		SH_TAILQ_INSERT_TAIL(q, elp, sizeq);
	else
		SH_TAILQ_INSERT_BEFORE(q, elp_tmp, elp, sizeq, __alloc_element);
}
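
/*
 * For example, inserting a 3KB chunk into a queue holding {8KB, 4KB,
 * 2KB} stops at the 2KB element, the first no larger than 3KB, and
 * inserts before it, yielding {8KB, 4KB, 3KB, 2KB} and preserving the
 * largest-to-smallest order the search in __env_alloc depends on.
 */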

#ifdef HAVE_STATISTICS
/*
 * __env_alloc_print --
 *	Display the lists of memory chunks.
 *
 * PUBLIC: void __env_alloc_print __P((REGINFO *, u_int32_t));
 */
void
__env_alloc_print(infop, flags)
	REGINFO *infop;
	u_int32_t flags;
{
#ifdef __ALLOC_DISPLAY_ALLOCATION_LISTS
	ALLOC_ELEMENT *elp;
#endif
	ALLOC_LAYOUT *head;
	ENV *env;
	u_int i;

	env = infop->env;
	head = infop->addr;

	if (F_ISSET(env, ENV_PRIVATE))
		return;

	__db_msg(env,
    "Region allocations: %lu allocations, %lu failures, %lu frees, %lu longest",
	    (u_long)head->success, (u_long)head->failure, (u_long)head->freed,
	    (u_long)head->longest);

	if (!LF_ISSET(DB_STAT_ALL))
		return;

	__db_msg(env, "%s", "Allocations by power-of-two sizes:");
	for (i = 0; i < DB_SIZE_Q_COUNT; ++i)
		__db_msg(env, "%3dKB\t%lu",
		    (1024 << i) / 1024, (u_long)head->pow2_size[i]);

#ifdef __ALLOC_DISPLAY_ALLOCATION_LISTS
	/*
	 * We don't normally display the list of address/chunk pairs; a few
	 * thousand lines of output is too voluminous for even DB_STAT_ALL.
	 */
	__db_msg(env,
	    "Allocation list by address: {chunk length, user length}");
	SH_TAILQ_FOREACH(elp, &head->addrq, addrq, __alloc_element)
		__db_msg(env, "\t%#lx {%lu, %lu}",
		    P_TO_ULONG(elp), (u_long)elp->len, (u_long)elp->ulen);

	__db_msg(env, "Allocation free list by size: KB {chunk length}");
	for (i = 0; i < DB_SIZE_Q_COUNT; ++i) {
		__db_msg(env, "%3dKB", (1024 << i) / 1024);
		SH_TAILQ_FOREACH(elp, &head->sizeq[i], sizeq, __alloc_element)
			__db_msg(env,
			    "\t%#lx {%lu}", P_TO_ULONG(elp), (u_long)elp->len);
	}
#endif
}
#endif