/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_alloc.c,v 12.43 2008/04/21 14:39:57 carol Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static void __memp_bad_buffer __P((DB_MPOOL_HASH *));

/*
 * __memp_alloc --
 *	Allocate some space from a cache region.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
	DB_MPOOL *dbmp;
	REGINFO *infop;
	MPOOLFILE *mfp;
	size_t len;
	roff_t *offsetp;
	void *retp;
{
	BH *bhp, *mvcc_bhp, *t1bhp, *t2bhp, *t3bhp;
	BH_FROZEN_PAGE *frozen_bhp;
	DB_LSN vlsn;
	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *bh_mfp;
	size_t freed_space;
	u_int32_t buckets, buffers, high_priority, priority, priority_saved;
	u_int32_t put_counter, total_buckets;
	int aggressive, alloc_freeze, giveup, got_oldest, ret;
	u_int8_t *endp;
	void *p;

	env = dbmp->env;
	c_mp = infop->primary;
	dbht = R_ADDR(infop, c_mp->htab);
	hp_end = &dbht[c_mp->htab_buckets];
	hp_saved = NULL;
	priority_saved = 0;

	buckets = buffers = put_counter = total_buckets = 0;
	aggressive = alloc_freeze = giveup = got_oldest = 0;

	STAT(c_mp->stat.st_alloc++);

	/*
	 * If we're allocating a buffer, and the one we're discarding is the
	 * same size, we don't want to waste the time to re-integrate it into
	 * the shared memory free list.  If the MPOOLFILE argument isn't
	 * NULL, we'll compare the underlying page sizes of the two buffers
	 * before freeing and re-allocating buffers.
	 */
	if (mfp != NULL) {
		len = SSZA(BH, buf) + mfp->stat.st_pagesize;
		/* Add space for alignment padding for MVCC diagnostics. */
		MVCC_BHSIZE(mfp, len);
	}

	MPOOL_REGION_LOCK(env, infop);

	/*
	 * Anything newer than 1/10th of the buffer pool is ignored during
	 * allocation (unless allocation starts failing).
	 */
	high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;

	/*
	 * First we try to allocate from free memory.  If that fails, scan the
	 * buffer pool to find buffers with low priorities.  We consider small
	 * sets of hash buckets each time to limit the amount of work needing
	 * to be done.  This approximates LRU, but not very well.  We either
	 * find a buffer of the same size to use, or we will free 3 times what
	 * we need in the hope that it will coalesce into a contiguous chunk
	 * of the right size.  In the latter case we branch back here and try
	 * again.
	 */
alloc:	if ((ret = __env_alloc(infop, len, &p)) == 0) {
		if (mfp != NULL)
			c_mp->stat.st_pages++;
		MPOOL_REGION_UNLOCK(env, infop);
		/*
		 * For MVCC diagnostics, align the pointer so that the buffer
		 * starts on a page boundary.
		 */
		MVCC_BHALIGN(mfp, p);

found:		if (offsetp != NULL)
			*offsetp = R_OFFSET(infop, p);
		*(void **)retp = p;

		/*
		 * Update the search statistics.
		 *
		 * We're not holding the region locked here, so these
		 * statistics can't be trusted.
		 */
#ifdef HAVE_STATISTICS
		total_buckets += buckets;
		if (total_buckets != 0) {
			if (total_buckets > c_mp->stat.st_alloc_max_buckets)
				c_mp->stat.st_alloc_max_buckets =
				    total_buckets;
			c_mp->stat.st_alloc_buckets += total_buckets;
		}
		if (buffers != 0) {
			if (buffers > c_mp->stat.st_alloc_max_pages)
				c_mp->stat.st_alloc_max_pages = buffers;
			c_mp->stat.st_alloc_pages += buffers;
		}
#endif
		return (0);
	} else if (giveup || c_mp->stat.st_pages == 0) {
		MPOOL_REGION_UNLOCK(env, infop);

		__db_errx(env,
		    "unable to allocate space from the buffer cache");
		return (ret);
	}
	ret = 0;

	/*
	 * We re-attempt the allocation every time we've freed 3 times what
	 * we need.  Reset our free-space counter.
	 */
	freed_space = 0;
	total_buckets += buckets;
	buckets = 0;

	/*
	 * Walk the hash buckets and find the next two with potentially useful
	 * buffers.  Free the buffer with the lowest priority from the
	 * buckets' chains.
	 */
	for (;;) {
		/* All pages have been freed, make one last try. */
		if (c_mp->stat.st_pages == 0)
			goto alloc;

		/* Check for wrap-around. */
		hp = &dbht[c_mp->last_checked++];
		if (hp >= hp_end) {
			c_mp->last_checked = 0;
			hp = &dbht[c_mp->last_checked++];
		}

		/*
		 * The failure mode is when there are too many buffers we
		 * can't write or there's not enough memory in the system to
		 * support the number of pinned buffers.
		 *
		 * Get aggressive if we've reviewed the entire cache without
		 * freeing the needed space.  (The code resets "aggressive"
		 * when we free any space.)  Aggressive means:
		 *
		 * a: set a flag to attempt to flush high priority buffers as
		 *    well as other buffers.
		 * b: sync the mpool to force out queue extent pages.  While
		 *    we might not have enough space for what we want and
		 *    flushing is expensive, why not?
		 * c: look at a buffer in every hash bucket rather than
		 *    choosing the more preferable of two.
		 * d: start to think about giving up.
		 *
		 * If we get here twice, sleep for a second; hopefully someone
		 * else will run and free up some memory.
		 *
		 * Always try to allocate memory too, in case some other
		 * thread returns its memory to the region.
		 *
		 * We have no way of knowing that an allocation can never
		 * succeed, so we fail if no pages are returned to the cache
		 * after we've been trying for a relatively long time.
		 *
		 * !!!
		 * This test ignores pathological cases like no buffers in the
		 * system -- we check for that early on, so it isn't possible.
		 */
		if (buckets++ == c_mp->htab_buckets) {
			if (freed_space > 0)
				goto alloc;
			MPOOL_REGION_UNLOCK(env, infop);

			switch (++aggressive) {
			case 1:
				break;
			case 2:
				put_counter = c_mp->put_counter;
				/* FALLTHROUGH */
			case 3:
			case 4:
			case 5:
			case 6:
				(void)__memp_sync_int(
				    env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);

				__os_yield(env, 1, 0);
				break;
			default:
				aggressive = 1;
				if (put_counter == c_mp->put_counter)
					giveup = 1;
				break;
			}

			MPOOL_REGION_LOCK(env, infop);
			goto alloc;
		}

		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking the hash
		 * bucket as we only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
			continue;

		/* Unlock the region and lock the hash bucket. */
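		/*
		 * (Note: in this loop the region lock is always dropped
		 * before the hash bucket mutex is taken, and re-acquired
		 * only after the bucket mutex is released, so the two locks
		 * are never held at the same time.)
		 */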
		MPOOL_REGION_UNLOCK(env, infop);
		MUTEX_LOCK(env, hp->mtx_hash);

		/*
		 * Find a buffer we can use.
		 *
		 * We don't want to free a buffer out of the middle of an MVCC
		 * chain (that requires I/O).  So, walk the buffers, looking
		 * for, in order of preference:
		 *	an obsolete buffer at the end of an MVCC chain,
		 *	the lowest-LRU singleton buffer, and
		 *	the lowest-LRU buffer of all.
		 * We use an obsolete buffer at the end of a chain as soon as
		 * we find one.  We use the lowest-LRU singleton buffer if we
		 * find one and it's better than the result of another hash
		 * bucket we've reviewed.  We use the lowest-LRU buffer we
		 * find if it's lower than another hash bucket we've reviewed
		 * and we're being aggressive.
		 *
		 * Ignore referenced buffers; we can't get rid of them.
		 */
retry_search:	bhp = mvcc_bhp = NULL;
		SH_TAILQ_FOREACH(t1bhp, &hp->hash_bucket, hq, __bh) {
			/*
			 * It's a single buffer (not an MVCC chain).
			 *
			 * If the buffer is not in use, its LRU is one we'll
			 * consider at this point in our search, and it's a
			 * better LRU than we've found so far, remember it.
			 */
			if (SH_CHAIN_SINGLETON(t1bhp, vc)) {
				if (t1bhp->ref == 0 &&
				    (aggressive ||
				    t1bhp->priority < high_priority) &&
				    (bhp == NULL ||
				    bhp->priority > t1bhp->priority))
					bhp = t1bhp;
				continue;
			}

			/*
			 * It's an MVCC chain.
			 */
			t2bhp = t1bhp;
			do {
				t3bhp = t2bhp;

				/*
				 * If the buffer is not in use, its LRU is one
				 * we'll consider at this point, and it's a
				 * better LRU than we've found so far,
				 * remember it.  The "LRU is OK" check is
				 * simpler here because we'll only consider an
				 * MVCC buffer if we're being aggressive.
				 */
				if (t2bhp->ref == 0 &&
				    aggressive &&
				    (mvcc_bhp == NULL ||
				    mvcc_bhp->priority > t2bhp->priority))
					mvcc_bhp = t2bhp;
			} while
			    ((t2bhp = SH_CHAIN_PREV(t2bhp, vc, __bh)) != NULL);

			/*
			 * t3bhp is the last buffer on the MVCC chain, and
			 * an obsolete buffer at the end of the MVCC chain
			 * gets used without further search.
			 *
			 * If the buffer isn't obsolete with respect to the
			 * cached old-reader LSN, recalculate the oldest
			 * reader LSN and check again.
			 */
retry_obsolete:		if (BH_OBSOLETE(t3bhp, hp->old_reader, vlsn)) {
				bhp = t3bhp;
				goto this_buffer;
			}
			if (!got_oldest) {
				if ((ret = __txn_oldest_reader(
				    env, &hp->old_reader)) != 0)
					return (ret);
				got_oldest = 1;
				goto retry_obsolete;
			}
		}

		/*
		 * bhp is either NULL or the lowest-LRU singleton buffer.
		 * mvcc_bhp is either NULL or the lowest-LRU MVCC buffer.
		 * In both cases, we'll use the chosen buffer only if we
		 * have compared its LRU against the chosen LRU of another
		 * hash bucket.
		 */
		if (bhp == NULL) {
			if (mvcc_bhp == NULL)
				goto next_hb;
			bhp = mvcc_bhp;
		}

		/* Adjust the priority if the bucket has not been reset. */
		priority = bhp->priority;
		if (c_mp->lru_reset != 0 && c_mp->lru_reset <= hp - dbht)
			priority -= MPOOL_BASE_DECREMENT;

		/*
		 * Compare two hash buckets and select the one with the
		 * lowest priority.  Performance testing shows that looking
		 * at two improves the LRU-ness and looking at more only
		 * does a little better.
		 */
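		/*
		 * For example: if the first candidate bucket yields a buffer
		 * with priority 10, we remember it and move on.  If the next
		 * bucket yields priority 5, we take that buffer immediately;
		 * if it yields priority 20 instead, we go back and re-search
		 * the first bucket.
		 */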
		if (hp_saved == NULL) {
			hp_saved = hp;
			priority_saved = priority;
			goto next_hb;
		}

		/*
		 * If the buffer we just found is a better choice than our
		 * previous choice, use it.
		 *
		 * If the previous choice was better, pretend we're moving
		 * from this hash bucket to the previous one and re-do the
		 * search.
		 *
		 * We don't worry about simply swapping between two buckets
		 * because that could only happen if a buffer was removed
		 * from the chain, or its priority updated.  If a buffer
		 * is removed from the chain, some other thread has managed
		 * to discard a buffer, so we're moving forward.  Updating
		 * a buffer's priority will make it a high-priority buffer,
		 * so we'll ignore it when we search again, and so we will
		 * eventually zero in on a buffer to use, or we'll decide
		 * there are no buffers we can use.
		 *
		 * If there's only a single hash bucket with buffers, we'll
		 * search the bucket once, choose a buffer, walk the entire
		 * list of buckets and search it again.  On a busy system,
		 * it's possible to imagine a case where we'd loop for a
		 * long while.  For that reason, and because the test is
		 * easy, we special-case and test for it.
		 */
		if (priority > priority_saved && hp != hp_saved) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			hp_tmp = hp_saved;
			hp_saved = hp;
			hp = hp_tmp;
			priority_saved = priority;
			MUTEX_LOCK(env, hp->mtx_hash);
			goto retry_search;
		}

this_buffer:	buffers++;

		/*
		 * Discard any previously remembered hash bucket; we've got
		 * a winner.
		 */
		hp_saved = NULL;

		/* Find the associated MPOOLFILE. */
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/* If the page is dirty, pin it and write it. */
		ret = 0;
		if (F_ISSET(bhp, BH_DIRTY)) {
			++bhp->ref;
			ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
			--bhp->ref;
#ifdef HAVE_STATISTICS
			if (ret == 0)
				++c_mp->stat.st_rw_evict;
#endif
		}
#ifdef HAVE_STATISTICS
		else
			++c_mp->stat.st_ro_evict;
#endif

		/*
		 * Freeze this buffer, if necessary.  That is, if the buffer
		 * itself or the next version created could be read by the
		 * oldest reader in the system.
		 */
		if (ret == 0 && bh_mfp->multiversion) {
			/*
			 * Before deciding whether to freeze, double-check
			 * that we have an up-to-date old-reader LSN.
			 */
			if (!got_oldest && !SH_CHAIN_HASPREV(bhp, vc) &&
			    !BH_OBSOLETE(bhp, hp->old_reader, vlsn)) {
				(void)__txn_oldest_reader(env,
				    &hp->old_reader);
				got_oldest = 1;
			}
			if (SH_CHAIN_HASPREV(bhp, vc) ||
			    !BH_OBSOLETE(bhp, hp->old_reader, vlsn)) {
				/*
				 * We only freeze on an aggressive search,
				 * and only if the buffer isn't already
				 * frozen or referenced.
				 */
				if (!aggressive ||
				    F_ISSET(bhp, BH_FROZEN) || bhp->ref != 0)
					goto next_hb;
				ret = __memp_bh_freeze(dbmp,
				    infop, hp, bhp, &alloc_freeze);
			}
		}

		/*
		 * If a write fails for any reason, we can't proceed.
		 *
		 * We released the hash bucket lock while doing I/O, so
		 * another thread may have acquired this buffer and
		 * incremented the ref count after we wrote it, in which
		 * case we can't have it.
		 *
		 * If there's a write error and we're having problems finding
		 * something to allocate, avoid selecting this buffer again
		 * by making it the bucket's least-desirable buffer.
		 */
		if (ret != 0 || bhp->ref != 0) {
			if (ret != 0 && aggressive)
				__memp_bad_buffer(hp);
			goto next_hb;
		}

		/*
		 * If the buffer is frozen, thaw it and look for another one
		 * we can use.
		 */
		if (F_ISSET(bhp, BH_FROZEN)) {
			++bhp->ref;
			if ((ret = __memp_bh_thaw(dbmp, infop, hp,
			    bhp, NULL)) != 0) {
				MUTEX_UNLOCK(env, hp->mtx_hash);
				return (ret);
			}
			alloc_freeze = 0;
			goto retry_search;
		}

		/*
		 * If we need some empty buffer headers for freezing, turn the
		 * buffer we've found into frozen headers and put them on the
		 * free list.  Only reset alloc_freeze if we've actually
		 * allocated some frozen buffer headers.
		 */
		if (alloc_freeze) {
			if ((ret =
			    __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
				return (ret);
			MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE | PROT_EXEC);

			MPOOL_REGION_LOCK(env, infop);
			SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
			    (BH_FROZEN_ALLOC *)bhp, links);
			frozen_bhp = (BH_FROZEN_PAGE *)
			    ((BH_FROZEN_ALLOC *)bhp + 1);
			endp = (u_int8_t *)bhp->buf + bh_mfp->stat.st_pagesize;
			while ((u_int8_t *)(frozen_bhp + 1) < endp) {
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    (BH *)frozen_bhp, hq);
				frozen_bhp++;
			}
			alloc_freeze = 0;
			continue;
		}

		/*
		 * Check to see if the buffer is the size we're looking for.
		 * If so, we can simply reuse it.  Otherwise, free the buffer
		 * and its space and keep looking.
		 */
		if (mfp != NULL &&
		    mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
			if ((ret =
			    __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
				return (ret);
			p = bhp;
			goto found;
		}

		freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize;
		if ((ret =
		    __memp_bhfree(dbmp, infop, hp, bhp, BH_FREE_FREEMEM)) != 0)
			return (ret);

		/* Reset "aggressive" if we free any space. */
		if (aggressive > 1)
			aggressive = 1;

		/*
		 * Unlock this hash bucket and re-acquire the region lock.  If
		 * we're reaching here as a result of calling __memp_bhfree,
		 * the hash bucket lock has already been discarded.
		 */
		if (0) {
next_hb:		MUTEX_UNLOCK(env, hp->mtx_hash);
		}
		MPOOL_REGION_LOCK(env, infop);

		/*
		 * Retry the allocation as soon as we've freed up sufficient
		 * space.  We're likely to have to coalesce memory to satisfy
		 * the request, so don't try until it's likely (possible?)
		 * that we'll succeed.
		 */
		if (freed_space >= 3 * len)
			goto alloc;
	}
	/* NOTREACHED */
}

/*
 * __memp_free --
 *	Free some space from a cache region.
 *
 * PUBLIC: void __memp_free __P((REGINFO *, MPOOLFILE *, void *));
 */
void
__memp_free(infop, mfp, buf)
	REGINFO *infop;
	MPOOLFILE *mfp;
	void *buf;
{
	MVCC_BHUNALIGN(mfp, buf);
	COMPQUIET(mfp, NULL);
	__env_alloc_free(infop, buf);
}
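/*
 * Illustration only -- this helper is NOT part of the original file.  It is
 * a sketch of the priority adjustment used in __memp_alloc's search loop:
 * while an LRU counter reset is in progress, buckets at or past lru_reset
 * still carry pre-reset priorities, so MPOOL_BASE_DECREMENT is subtracted
 * before comparing them with buffers from already-reset buckets.  (The
 * MEMP_ALLOC_EXAMPLES guard is hypothetical; it keeps the sketch out of a
 * real build.)
 */
#ifdef MEMP_ALLOC_EXAMPLES
static u_int32_t
__memp_effective_priority(c_mp, dbht, hp, bhp)
	MPOOL *c_mp;
	DB_MPOOL_HASH *dbht, *hp;
	BH *bhp;
{
	u_int32_t priority;

	priority = bhp->priority;
	if (c_mp->lru_reset != 0 && c_mp->lru_reset <= hp - dbht)
		priority -= MPOOL_BASE_DECREMENT;
	return (priority);
}
#endif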
566 */ 567 bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); 568 last_bhp = SH_TAILQ_LASTP(&hp->hash_bucket, hq, __bh); 569 if (bhp == last_bhp) 570 return; 571 572 /* There are multiple buffers in the bucket, remove the first one. */ 573 SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); 574 575 /* 576 * Find the highest priority buffer in the bucket. Buffers are 577 * sorted by priority, so it's the last one in the bucket. 578 */ 579 priority = BH_PRIORITY(last_bhp); 580 581 /* 582 * Append our buffer to the bucket and set its priority to be just as 583 * bad. 584 */ 585 SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq); 586 for (; bhp != NULL ; bhp = SH_CHAIN_PREV(bhp, vc, __bh)) 587 bhp->priority = priority; 588} 589