1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: env_alloc.c,v 12.22 2008/01/27 18:02:31 bostic Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12 13/* 14 * Implement shared memory region allocation. The initial list is a single 15 * memory "chunk" which is carved up as memory is requested. Chunks are 16 * coalesced when free'd. We maintain two types of linked-lists: a list of 17 * all chunks sorted by address, and a set of lists with free chunks sorted 18 * by size. 19 * 20 * The ALLOC_LAYOUT structure is the governing structure for the allocator. 21 * 22 * The ALLOC_ELEMENT structure is the structure that describes any single 23 * chunk of memory, and is immediately followed by the user's memory. 24 * 25 * The internal memory chunks are always aligned to a uintmax_t boundary so 26 * we don't drop core accessing the fields of the ALLOC_ELEMENT structure. 27 * 28 * The memory chunks returned to the user are aligned to a uintmax_t boundary. 29 * This is enforced by terminating the ALLOC_ELEMENT structure with a uintmax_t 30 * field as that immediately precedes the user's memory. Any caller needing 31 * more than uintmax_t alignment is responsible for doing alignment themselves. 32 */ 33 34typedef SH_TAILQ_HEAD(__sizeq) SIZEQ_HEAD; 35 36typedef struct __alloc_layout { 37 SH_TAILQ_HEAD(__addrq) addrq; /* Sorted by address */ 38 39 /* 40 * A perfect Berkeley DB application does little allocation because 41 * most things are allocated on startup and never free'd. This is 42 * true even for the cache, because we don't free and re-allocate 43 * the memory associated with a cache buffer when swapping a page 44 * in memory for a page on disk -- unless the page is changing size. 45 * The latter problem is why we have multiple size queues. If the 46 * application's working set fits in cache, it's not a problem. If 47 * the application's working set doesn't fit in cache, but all of 48 * the databases have the same size pages, it's still not a problem. 49 * If the application's working set doesn't fit in cache, and its 50 * databases have different page sizes, we can end up walking a lot 51 * of 512B chunk allocations looking for an available 64KB chunk. 52 * 53 * So, we keep a set of queues, where we expect to find a chunk of 54 * roughly the right size at the front of the list. The first queue 55 * is chunks <= 1024, the second is <= 2048, and so on. With 11 56 * queues, we have separate queues for chunks up to 1MB. 57 */ 58#define DB_SIZE_Q_COUNT 11 59 SIZEQ_HEAD sizeq[DB_SIZE_Q_COUNT]; /* Sorted by size */ 60#ifdef HAVE_STATISTICS 61 u_int32_t pow2_size[DB_SIZE_Q_COUNT]; 62#endif 63 64#ifdef HAVE_STATISTICS 65 u_int32_t success; /* Successful allocations */ 66 u_int32_t failure; /* Failed allocations */ 67 u_int32_t freed; /* Free calls */ 68 u_int32_t longest; /* Longest chain walked */ 69#endif 70 uintmax_t unused; /* Guarantee alignment */ 71} ALLOC_LAYOUT; 72 73typedef struct __alloc_element { 74 SH_TAILQ_ENTRY addrq; /* List by address */ 75 SH_TAILQ_ENTRY sizeq; /* List by size */ 76 77 /* 78 * The "len" field is the total length of the chunk, not the size 79 * available to the caller. 80 */ 81 size_t len; /* Chunk length */ 82 83 /* 84 * The "ulen" field is the length returned to the caller. 85 * 86 * Set to 0 if the chunk is not currently in use. 87 */ 88 uintmax_t ulen; /* User's length */ 89} ALLOC_ELEMENT; 90 91/* 92 * If the chunk can be split into two pieces, with the fragment holding at 93 * least 64 bytes of memory, we divide the chunk into two parts. 94 */ 95#define SHALLOC_FRAGMENT (sizeof(ALLOC_ELEMENT) + 64) 96 97/* Macro to find the appropriate queue for a specific size chunk. */ 98#undef SET_QUEUE_FOR_SIZE 99#define SET_QUEUE_FOR_SIZE(head, q, i, len) do { \ 100 for (i = 0; i < DB_SIZE_Q_COUNT; ++i) { \ 101 q = &(head)->sizeq[i]; \ 102 if ((len) <= (size_t)1024 << i) \ 103 break; \ 104 } \ 105} while (0) 106 107static void __env_size_insert __P((ALLOC_LAYOUT *, ALLOC_ELEMENT *)); 108 109/* 110 * __env_alloc_init -- 111 * Initialize the area as one large chunk. 112 * 113 * PUBLIC: void __env_alloc_init __P((REGINFO *, size_t)); 114 */ 115void 116__env_alloc_init(infop, size) 117 REGINFO *infop; 118 size_t size; 119{ 120 ALLOC_ELEMENT *elp; 121 ALLOC_LAYOUT *head; 122 ENV *env; 123 u_int i; 124 125 env = infop->env; 126 127 /* No initialization needed for heap memory regions. */ 128 if (F_ISSET(env, ENV_PRIVATE)) 129 return; 130 131 /* 132 * The first chunk of memory is the ALLOC_LAYOUT structure. 133 */ 134 head = infop->addr; 135 memset(head, 0, sizeof(*head)); 136 SH_TAILQ_INIT(&head->addrq); 137 for (i = 0; i < DB_SIZE_Q_COUNT; ++i) 138 SH_TAILQ_INIT(&head->sizeq[i]); 139 COMPQUIET(head->unused, 0); 140 141 /* 142 * The rest of the memory is the first available chunk. 143 */ 144 elp = (ALLOC_ELEMENT *)((u_int8_t *)head + sizeof(ALLOC_LAYOUT)); 145 elp->len = size - sizeof(ALLOC_LAYOUT); 146 elp->ulen = 0; 147 148 SH_TAILQ_INSERT_HEAD(&head->addrq, elp, addrq, __alloc_element); 149 SH_TAILQ_INSERT_HEAD( 150 &head->sizeq[DB_SIZE_Q_COUNT - 1], elp, sizeq, __alloc_element); 151} 152 153/* 154 * The length, the ALLOC_ELEMENT structure and an optional guard byte, 155 * rounded up to standard alignment. 156 */ 157#ifdef DIAGNOSTIC 158#define DB_ALLOC_SIZE(len) \ 159 (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT) + 1, sizeof(uintmax_t)) 160#else 161#define DB_ALLOC_SIZE(len) \ 162 (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT), sizeof(uintmax_t)) 163#endif 164 165/* 166 * __env_alloc_overhead -- 167 * Return the overhead needed for an allocation. 168 * 169 * PUBLIC: size_t __env_alloc_overhead __P((void)); 170 */ 171size_t 172__env_alloc_overhead() 173{ 174 return (sizeof(ALLOC_ELEMENT)); 175} 176 177/* 178 * __env_alloc_size -- 179 * Return the space needed for an allocation, including alignment. 180 * 181 * PUBLIC: size_t __env_alloc_size __P((size_t)); 182 */ 183size_t 184__env_alloc_size(len) 185 size_t len; 186{ 187 return (DB_ALLOC_SIZE(len)); 188} 189 190/* 191 * __env_alloc -- 192 * Allocate space from the shared region. 193 * 194 * PUBLIC: int __env_alloc __P((REGINFO *, size_t, void *)); 195 */ 196int 197__env_alloc(infop, len, retp) 198 REGINFO *infop; 199 size_t len; 200 void *retp; 201{ 202 SIZEQ_HEAD *q; 203 ALLOC_ELEMENT *elp, *frag, *elp_tmp; 204 ALLOC_LAYOUT *head; 205 ENV *env; 206 size_t total_len; 207 u_int8_t *p; 208 u_int i; 209 int ret; 210#ifdef HAVE_STATISTICS 211 u_int32_t st_search; 212#endif 213 env = infop->env; 214 *(void **)retp = NULL; 215 216 /* 217 * In a heap-backed environment, we call malloc for additional space. 218 * (Malloc must return memory correctly aligned for our use.) 219 * 220 * In a heap-backed environment, memory is laid out as follows: 221 * 222 * { size_t total-length } { user-memory } { guard-byte } 223 */ 224 if (F_ISSET(env, ENV_PRIVATE)) { 225 /* Check if we're over the limit. */ 226 if (infop->allocated >= infop->max_alloc) 227 return (ENOMEM); 228 229 /* We need an additional size_t to hold the length. */ 230 len += sizeof(size_t); 231 232#ifdef DIAGNOSTIC 233 /* Plus one byte for the guard byte. */ 234 ++len; 235#endif 236 /* Allocate the space. */ 237 if ((ret = __os_malloc(env, len, &p)) != 0) 238 return (ret); 239 infop->allocated += len; 240 241 *(size_t *)p = len; 242#ifdef DIAGNOSTIC 243 p[len - 1] = GUARD_BYTE; 244#endif 245 *(void **)retp = p + sizeof(size_t); 246 return (0); 247 } 248 249 head = infop->addr; 250 total_len = DB_ALLOC_SIZE(len); 251 252 /* Find the first size queue that could satisfy the request. */ 253 SET_QUEUE_FOR_SIZE(head, q, i, total_len); 254 255#ifdef HAVE_STATISTICS 256 ++head->pow2_size[i]; /* Note the size of the request. */ 257#endif 258 259 /* 260 * Search this queue, and, if necessary, queues larger than this queue, 261 * looking for a chunk we can use. 262 */ 263 STAT((st_search = 0)); 264 for (elp = NULL;; ++q) { 265 SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element) { 266 STAT((++st_search)); 267 268 /* 269 * Chunks are sorted from largest to smallest -- if 270 * this chunk is less than what we need, no chunk 271 * further down the list will be large enough. 272 */ 273 if (elp_tmp->len < total_len) 274 break; 275 276 /* 277 * This chunk will do... maybe there's a better one, 278 * but this one will do. 279 */ 280 elp = elp_tmp; 281 282 /* 283 * We might have many chunks of the same size. Stop 284 * looking if we won't fragment memory by picking the 285 * current one. 286 */ 287 if (elp_tmp->len - total_len <= SHALLOC_FRAGMENT) 288 break; 289 } 290 if (elp != NULL || ++i >= DB_SIZE_Q_COUNT) 291 break; 292 } 293 294#ifdef HAVE_STATISTICS 295 if (head->longest < st_search) 296 head->longest = st_search; 297#endif 298 299 /* 300 * If we don't find an element of the right size, we're done. 301 */ 302 if (elp == NULL) { 303 STAT((++head->failure)); 304 return (ENOMEM); 305 } 306 STAT((++head->success)); 307 308 /* Pull the chunk off of the size queue. */ 309 SH_TAILQ_REMOVE(q, elp, sizeq, __alloc_element); 310 311 if (elp->len - total_len > SHALLOC_FRAGMENT) { 312 frag = (ALLOC_ELEMENT *)((u_int8_t *)elp + total_len); 313 frag->len = elp->len - total_len; 314 frag->ulen = 0; 315 316 elp->len = total_len; 317 318 /* The fragment follows the chunk on the address queue. */ 319 SH_TAILQ_INSERT_AFTER( 320 &head->addrq, elp, frag, addrq, __alloc_element); 321 322 /* Insert the frag into the correct size queue. */ 323 __env_size_insert(head, frag); 324 } 325 326 p = (u_int8_t *)elp + sizeof(ALLOC_ELEMENT); 327 elp->ulen = len; 328#ifdef DIAGNOSTIC 329 p[len] = GUARD_BYTE; 330#endif 331 *(void **)retp = p; 332 333 return (0); 334} 335 336/* 337 * __env_alloc_free -- 338 * Free space into the shared region. 339 * 340 * PUBLIC: void __env_alloc_free __P((REGINFO *, void *)); 341 */ 342void 343__env_alloc_free(infop, ptr) 344 REGINFO *infop; 345 void *ptr; 346{ 347 ALLOC_ELEMENT *elp, *elp_tmp; 348 ALLOC_LAYOUT *head; 349 ENV *env; 350 SIZEQ_HEAD *q; 351 size_t len; 352 u_int8_t i, *p; 353 354 env = infop->env; 355 356 /* In a private region, we call free. */ 357 if (F_ISSET(env, ENV_PRIVATE)) { 358 /* Find the start of the memory chunk and its length. */ 359 p = (u_int8_t *)((size_t *)ptr - 1); 360 len = *(size_t *)p; 361 362 infop->allocated -= len; 363 364#ifdef DIAGNOSTIC 365 /* Check the guard byte. */ 366 DB_ASSERT(env, p[len - 1] == GUARD_BYTE); 367 368 /* Trash the memory chunk. */ 369 memset(p, CLEAR_BYTE, len); 370#endif 371 __os_free(env, p); 372 return; 373 } 374 375 head = infop->addr; 376 STAT((++head->freed)); 377 378 p = ptr; 379 elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT)); 380 381#ifdef DIAGNOSTIC 382 /* Check the guard byte. */ 383 DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE); 384 385 /* Trash the memory chunk. */ 386 memset(p, CLEAR_BYTE, elp->len - sizeof(ALLOC_ELEMENT)); 387#endif 388 389 /* Mark the memory as no longer in use. */ 390 elp->ulen = 0; 391 392 /* 393 * Try and merge this chunk with chunks on either side of it. Two 394 * chunks can be merged if they're contiguous and not in use. 395 */ 396 if ((elp_tmp = 397 SH_TAILQ_PREV(&head->addrq, elp, addrq, __alloc_element)) != NULL && 398 elp_tmp->ulen == 0 && 399 (u_int8_t *)elp_tmp + elp_tmp->len == (u_int8_t *)elp) { 400 /* 401 * If we're merging the entry into a previous entry, remove the 402 * current entry from the addr queue and the previous entry from 403 * its size queue, and merge. 404 */ 405 SH_TAILQ_REMOVE(&head->addrq, elp, addrq, __alloc_element); 406 SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len); 407 SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element); 408 409 elp_tmp->len += elp->len; 410 elp = elp_tmp; 411 } 412 if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL && 413 elp_tmp->ulen == 0 && 414 (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) { 415 /* 416 * If we're merging the current entry into a subsequent entry, 417 * remove the subsequent entry from the addr and size queues 418 * and merge. 419 */ 420 SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element); 421 SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len); 422 SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element); 423 424 elp->len += elp_tmp->len; 425 } 426 427 /* Insert in the correct place in the size queues. */ 428 __env_size_insert(head, elp); 429} 430 431/* 432 * __env_size_insert -- 433 * Insert into the correct place in the size queues. 434 */ 435static void 436__env_size_insert(head, elp) 437 ALLOC_LAYOUT *head; 438 ALLOC_ELEMENT *elp; 439{ 440 SIZEQ_HEAD *q; 441 ALLOC_ELEMENT *elp_tmp; 442 u_int i; 443 444 /* Find the appropriate queue for the chunk. */ 445 SET_QUEUE_FOR_SIZE(head, q, i, elp->len); 446 447 /* Find the correct slot in the size queue. */ 448 SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element) 449 if (elp->len <= elp_tmp->len) 450 break; 451 if (elp_tmp == NULL) 452 SH_TAILQ_INSERT_TAIL(q, elp, sizeq); 453 else 454 SH_TAILQ_INSERT_BEFORE(q, elp_tmp, elp, sizeq, __alloc_element); 455} 456 457#ifdef HAVE_STATISTICS 458/* 459 * __env_alloc_print -- 460 * Display the lists of memory chunks. 461 * 462 * PUBLIC: void __env_alloc_print __P((REGINFO *, u_int32_t)); 463 */ 464void 465__env_alloc_print(infop, flags) 466 REGINFO *infop; 467 u_int32_t flags; 468{ 469#ifdef __ALLOC_DISPLAY_ALLOCATION_LISTS 470 ALLOC_ELEMENT *elp; 471#endif 472 ALLOC_LAYOUT *head; 473 ENV *env; 474 u_int i; 475 476 env = infop->env; 477 head = infop->addr; 478 479 if (F_ISSET(env, ENV_PRIVATE)) 480 return; 481 482 __db_msg(env, 483 "Region allocations: %lu allocations, %lu failures, %lu frees, %lu longest", 484 (u_long)head->success, (u_long)head->failure, (u_long)head->freed, 485 (u_long)head->longest); 486 487 if (!LF_ISSET(DB_STAT_ALL)) 488 return; 489 490 __db_msg(env, "%s", "Allocations by power-of-two sizes:"); 491 for (i = 0; i < DB_SIZE_Q_COUNT; ++i) 492 __db_msg(env, "%3dKB\t%lu", 493 (1024 << i) / 1024, (u_long)head->pow2_size[i]); 494 495#ifdef __ALLOC_DISPLAY_ALLOCATION_LISTS 496 /* 497 * We don't normally display the list of address/chunk pairs, a few 498 * thousand lines of output is too voluminous for even DB_STAT_ALL. 499 */ 500 __db_msg(env, 501 "Allocation list by address: {chunk length, user length}"); 502 SH_TAILQ_FOREACH(elp, &head->addrq, addrq, __alloc_element) 503 __db_msg(env, "\t%#lx {%lu, %lu}", 504 P_TO_ULONG(elp), (u_long)elp->len, (u_long)elp->ulen); 505 506 __db_msg(env, "Allocation free list by size: KB {chunk length}"); 507 for (i = 0; i < DB_SIZE_Q_COUNT; ++i) { 508 __db_msg(env, "%3dKB", (1024 << i) / 1024); 509 SH_TAILQ_FOREACH(elp, &head->sizeq[i], sizeq, __alloc_element) 510 __db_msg(env, 511 "\t%#lx {%lu}", P_TO_ULONG(elp), (u_long)elp->len); 512 } 513#endif 514} 515#endif 516