/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

/*
 * __memp_fget_pp --
 *	DB_MPOOLFILE->get pre/post processing.
 *
 * PUBLIC: int __memp_fget_pp
 * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *));
 */
int
__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_TXN *txnp;
	u_int32_t flags;
	void *addrp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int rep_blocked, ret;

	env = dbmfp->env;

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");

	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for the DB_MPOOL_CREATE and DB_MPOOL_NEW flags for
	 * read-only files here; create non-existent pages in read-only
	 * files later, if the flags are set.  The reason is that the hash
	 * access method wants to get empty pages that don't really exist
	 * in read-only files.  The only alternative is for hash to write
	 * the last "bucket" all the time, which we don't want to do because
	 * one of our big goals in life is to keep database files small.
	 * It's sleazy as hell, but we catch any attempt to actually write
	 * the file in memp_fput().
	 */
#define	OKFLAGS		(DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \
	    DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_DIRTY:
		case DB_MPOOL_CREATE:
		case DB_MPOOL_EDIT:
		case DB_MPOOL_LAST:
		case DB_MPOOL_NEW:
			break;
		default:
			return (__db_ferr(env, "memp_fget", 1));
		}
	}

	ENV_ENTER(env, ip);

	rep_blocked = 0;
	if (txnp == NULL && IS_ENV_REPLICATED(env)) {
		if ((ret = __op_rep_enter(env)) != 0)
			goto err;
		rep_blocked = 1;
	}
	ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp);
	/*
	 * We only decrement the count in op_rep_exit if the operation fails.
	 * Otherwise the count will be decremented when the page is no longer
	 * pinned in memp_fput.
	 */
	if (ret != 0 && rep_blocked)
		(void)__op_rep_exit(env);

	/* Similarly, if an app has a page pinned it is ACTIVE. */
err:	if (ret != 0)
		ENV_LEAVE(env, ip);

	return (ret);
}

/*
 * __memp_fget --
 *	Get a page from the file.
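 *
 * A minimal caller sketch (illustrative only; the DB_MPOOLFILE handle
 * "mpf" is assumed to be open and error handling is elided).
 * Applications reach this code through the DB_MPOOLFILE->get method,
 * and every successful get must be paired with a DB_MPOOLFILE->put,
 * which drops the buffer pin taken here:
 *
 *	db_pgno_t pgno = 0;
 *	void *addr;
 *
 *	if ((ret = mpf->get(mpf, &pgno, NULL, 0, &addr)) == 0)
 *		ret = mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0);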
 *
 * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *,
 * PUBLIC:     db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
 */
int
__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	void *addrp;
{
	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
	BH *alloc_bhp, *bhp, *oldest_bhp;
	ENV *env;
	DB_LSN *read_lsnp, vlsn;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGENV *renv;
	REGINFO *infop, *t_infop, *reginfo;
	TXN_DETAIL *td;
	roff_t list_off, mf_offset;
	u_int32_t bucket, pinmax, st_hsearch;
	int b_incr, b_lock, h_locked, dirty, extending;
	int makecopy, mvcc, need_free, ret;

	*(void **)addrp = NULL;
	COMPQUIET(c_mp, NULL);
	COMPQUIET(infop, NULL);

	env = dbmfp->env;
	dbmp = env->mp_handle;

	mfp = dbmfp->mfp;
	mvcc = mfp->multiversion && (txn != NULL);
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	alloc_bhp = bhp = oldest_bhp = NULL;
	read_lsnp = NULL;
	td = NULL;
	hp = NULL;
	b_incr = b_lock = h_locked = extending = makecopy = ret = 0;

	if (LF_ISSET(DB_MPOOL_DIRTY)) {
		if (F_ISSET(dbmfp, MP_READONLY)) {
			__db_errx(env,
			    "%s: dirty flag set for readonly file page",
			    __memp_fn(dbmfp));
			return (EINVAL);
		}
		if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get",
		    flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
			return (ret);
	}

	dirty = LF_ISSET(DB_MPOOL_DIRTY | DB_MPOOL_EDIT | DB_MPOOL_FREE);
	LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);

	/*
	 * If the transaction is being used to update a multiversion database
	 * for the first time, set the read LSN.  In addition, if this is an
	 * update, allocate a mutex.  If no transaction has been supplied,
	 * that will be caught later, when we know whether one is required.
	 */
	if (mvcc && txn != NULL && txn->td != NULL) {
		/* We're only interested in the ultimate parent transaction. */
		while (txn->parent != NULL)
			txn = txn->parent;
		td = (TXN_DETAIL *)txn->td;
		if (F_ISSET(txn, TXN_SNAPSHOT)) {
			read_lsnp = &td->read_lsn;
			if (IS_MAX_LSN(*read_lsnp) &&
			    (ret = __log_current_lsn(env, read_lsnp,
			    NULL, NULL)) != 0)
				return (ret);
		}
		if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
		    td->mvcc_mtx == MUTEX_INVALID && (ret =
		    __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
			return (ret);
	}

	switch (flags) {
	case DB_MPOOL_LAST:
		/* Get the last page number in the file. */
		MUTEX_LOCK(env, mfp->mutex);
		*pgnoaddr = mfp->last_pgno;
		MUTEX_UNLOCK(env, mfp->mutex);
		break;
	case DB_MPOOL_NEW:
		/*
		 * If always creating a page, skip the first search
		 * of the hash bucket.
		 */
		goto newpg;
	case DB_MPOOL_CREATE:
	default:
		break;
	}

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.  We can't use R_ADDR here: this is an offset
	 * into an mmap'd file, not a shared region, and doesn't change for
	 * private environments.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e., everything
	 * goes through the cache.  All pages previously returned will be
	 * safe, as long as the correct locking protocol was observed.
	 *
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL &&
	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
		*(void **)addrp = (u_int8_t *)dbmfp->addr +
		    (*pgnoaddr * mfp->stat.st_pagesize);
		STAT(++mfp->stat.st_map);
		return (0);
	}

	/*
	 * Determine the cache and hash bucket where this page lives and get
	 * local pointers to them.  Reset these on each pass through this
	 * code; the page number can change.
	 */
	MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, bucket, ret);
	if (ret != 0)
		return (ret);
	c_mp = infop->primary;

	if (0) {
		/* If we search again, get an exclusive lock. */
retry:		MUTEX_LOCK(env, hp->mtx_hash);
	}

	/* Search the hash chain for the page. */
	st_hsearch = 0;
	h_locked = 1;
	SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/* Snapshot reads -- get the version visible at read_lsn. */
		if (read_lsnp != NULL) {
			while (bhp != NULL &&
			    !BH_OWNED_BY(env, bhp, txn) &&
			    !BH_VISIBLE(env, bhp, read_lsnp, vlsn))
				bhp = SH_CHAIN_PREV(bhp, vc, __bh);

			/*
			 * We can get a null bhp if we are looking for a
			 * page that was created after the transaction was
			 * started, so it's not visible (i.e., a page added
			 * to the BTREE in a subsequent txn).
			 */
			if (bhp == NULL) {
				ret = DB_PAGE_NOTFOUND;
				goto err;
			}
		}

		makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn);

		/*
		 * Increment the reference count.  This signals that the
		 * buffer may not be discarded.  We must drop the hash
		 * mutex before we lock the buffer mutex.
		 */
		if (BH_REFCOUNT(bhp) == UINT16_MAX) {
			__db_errx(env,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = __env_panic(env, EINVAL);
			goto err;
		}
		atomic_inc(env, &bhp->ref);
		b_incr = 1;

		/*
		 * Lock the buffer.  If the page is being read in or modified
		 * it will be exclusively locked and we will block.
		 */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;
		if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) {
xlatch:			if (LF_ISSET(DB_MPOOL_TRY)) {
				if ((ret =
				    MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0)
					goto err;
			} else
				MUTEX_LOCK(env, bhp->mtx_buf);
			F_SET(bhp, BH_EXCLUSIVE);
		} else if (LF_ISSET(DB_MPOOL_TRY)) {
			if ((ret = MUTEX_TRY_READLOCK(env, bhp->mtx_buf)) != 0)
				goto err;
		} else
			MUTEX_READLOCK(env, bhp->mtx_buf);

#ifdef HAVE_SHARED_LATCHES
		/*
		 * If the buffer is still in transit once we have a shared
		 * latch, upgrade to an exclusive latch.
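		 *
		 * A buffer marked BH_FREED or BH_TRASH is being re-created
		 * or read in and cannot safely be examined under a shared
		 * latch, so those states are only handled while holding the
		 * exclusive latch taken at "xlatch" above.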
		 */
		if (F_ISSET(bhp, BH_FREED | BH_TRASH) &&
		    !F_ISSET(bhp, BH_EXCLUSIVE)) {
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			goto xlatch;
		}
#else
		F_SET(bhp, BH_EXCLUSIVE);
#endif
		b_lock = 1;

		/*
		 * If the buffer was frozen before we waited for any I/O to
		 * complete and is still frozen, we will need to thaw it.
		 * Otherwise, it was thawed while we waited, and we need to
		 * search again.
		 */
		if (F_ISSET(bhp, BH_THAWED)) {
thawed:			need_free = (atomic_dec(env, &bhp->ref) == 0);
			b_incr = 0;
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			if (need_free) {
				MPOOL_REGION_LOCK(env, infop);
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    bhp, hq);
				MPOOL_REGION_UNLOCK(env, infop);
			}
			bhp = NULL;
			goto retry;
		}

		/*
		 * If the buffer we wanted was frozen or thawed while we
		 * waited, we need to start again.  That is indicated by
		 * a new buffer header in the version chain owned by the
		 * same transaction as the one we pinned.
		 *
		 * Also, if we're doing an unversioned read on a multiversion
		 * file, another thread may have dirtied this buffer while we
		 * swapped from the hash bucket lock to the buffer lock.
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc) &&
		    (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off ||
		    (!dirty && read_lsnp == NULL))) {
			DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
			atomic_dec(env, &bhp->ref);
			b_incr = 0;
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			bhp = NULL;
			goto retry;
		} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
			ret = DB_LOCK_DEADLOCK;
			goto err;
		} else if (F_ISSET(bhp, BH_FREED) &&
		    flags != DB_MPOOL_CREATE &&
		    flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) {
			ret = DB_PAGE_NOTFOUND;
			goto err;
		}

		STAT(++mfp->stat.st_cache_hit);
		break;
	}

#ifdef HAVE_STATISTICS
	/*
	 * Update the hash bucket search statistics -- do this now because
	 * our next search may be for a different bucket.
	 */
	++c_mp->stat.st_hash_searches;
	if (st_hsearch > c_mp->stat.st_hash_longest)
		c_mp->stat.st_hash_longest = st_hsearch;
	c_mp->stat.st_hash_examined += st_hsearch;
#endif

	/*
	 * There are 4 possible paths to this location:
	 *
	 * FIRST_MISS:
	 *	Didn't find the page in the hash bucket on our first pass:
	 *	bhp == NULL, alloc_bhp == NULL
	 *
	 * FIRST_FOUND:
	 *	Found the page in the hash bucket on our first pass:
	 *	bhp != NULL, alloc_bhp == NULL
	 *
	 * SECOND_FOUND:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and found the page in the hash bucket on
	 *	our second pass:
	 *	bhp != NULL, alloc_bhp != NULL
	 *
	 * SECOND_MISS:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and didn't find the page in the hash bucket
	 *	on our second pass:
	 *	bhp == NULL, alloc_bhp != NULL
	 */
	state = bhp == NULL ?
	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);

	switch (state) {
	case FIRST_FOUND:
		/*
		 * If we are to free the buffer, then this had better be the
		 * only reference.  If so, just free the buffer.  If not,
		 * complain and get out.
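		 *
		 * (If the ref count is exactly one and the buffer is the
		 * only version in its chain, it is physically freed below;
		 * otherwise it is left in place as a BH_FREED placeholder
		 * so that waiting threads see an empty page rather than
		 * stale data.)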
		 */
		if (flags == DB_MPOOL_FREE) {
freebuf:		MUTEX_LOCK(env, hp->mtx_hash);
			h_locked = 1;
			if (F_ISSET(bhp, BH_DIRTY)) {
				F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
				DB_ASSERT(env,
				    atomic_read(&hp->hash_page_dirty) > 0);
				atomic_dec(env, &hp->hash_page_dirty);
			}

			/*
			 * If the buffer we found is already freed, we're
			 * done.  If the ref count is not 1, then someone
			 * may be peeking at the buffer.  We cannot free it
			 * until they determine that it is not what they
			 * want.  Clear the buffer so that waiting threads
			 * get an empty page.
			 */
			if (F_ISSET(bhp, BH_FREED))
				goto done;
			else if (F_ISSET(bhp, BH_FROZEN))
				makecopy = 1;

			if (makecopy)
				break;
			else if (BH_REFCOUNT(bhp) != 1 ||
			    !SH_CHAIN_SINGLETON(bhp, vc)) {
				/*
				 * Create an empty page in the chain for
				 * subsequent gets.  Otherwise, a thread that
				 * re-creates this page while it is still in
				 * cache will see stale data.
				 */
				F_SET(bhp, BH_FREED);
				F_CLR(bhp, BH_TRASH);
			} else {
				ret = __memp_bhfree(dbmp, infop, mfp,
				    hp, bhp, BH_FREE_FREEMEM);
				bhp = NULL;
				b_incr = b_lock = h_locked = 0;
			}
			goto done;
		} else if (F_ISSET(bhp, BH_FREED)) {
revive:			DB_ASSERT(env,
			    flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW);
			makecopy = makecopy ||
			    (mvcc && !BH_OWNED_BY(env, bhp, txn)) ||
			    F_ISSET(bhp, BH_FROZEN);
			if (flags == DB_MPOOL_CREATE) {
				MUTEX_LOCK(env, mfp->mutex);
				if (*pgnoaddr > mfp->last_pgno)
					mfp->last_pgno = *pgnoaddr;
				MUTEX_UNLOCK(env, mfp->mutex);
			}
		}
		if (mvcc) {
			/*
			 * With multiversion databases, we might need to
			 * allocate a new buffer into which we can copy the
			 * one that we found.  In that case, check the last
			 * buffer in the chain to see whether we can reuse
			 * an obsolete buffer.
			 *
			 * To provide snapshot isolation, we need to make
			 * sure that we've seen a buffer older than the
			 * oldest snapshot read LSN.
			 */
reuse:			if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
			    !h_locked) {
				MUTEX_LOCK(env, hp->mtx_hash);
				h_locked = 1;
			}
			if ((makecopy || F_ISSET(bhp, BH_FROZEN)) &&
			    SH_CHAIN_HASPREV(bhp, vc)) {
				oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh);
				while (SH_CHAIN_HASPREV(oldest_bhp, vc))
					oldest_bhp = SH_CHAIN_PREVP(
					    oldest_bhp, vc, __bh);

				if (BH_REFCOUNT(oldest_bhp) == 0 &&
				    !BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    (ret = __txn_oldest_reader(env,
				    &hp->old_reader)) != 0)
					goto err;

				if (BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    BH_REFCOUNT(oldest_bhp) == 0) {
					DB_ASSERT(env,
					    !F_ISSET(oldest_bhp, BH_DIRTY));
					atomic_inc(env, &oldest_bhp->ref);
					if (F_ISSET(oldest_bhp, BH_FROZEN)) {
						/*
						 * This call will release the
						 * hash bucket mutex.
						 */
						ret = __memp_bh_thaw(dbmp,
						    infop, hp, oldest_bhp,
						    NULL);
						h_locked = 0;
						if (ret != 0)
							goto err;
						goto reuse;
					}
					if ((ret = __memp_bhfree(dbmp,
					    infop, mfp, hp, oldest_bhp,
					    BH_FREE_REUSE)) != 0)
						goto err;
					alloc_bhp = oldest_bhp;
					h_locked = 0;
				}

				DB_ASSERT(env, alloc_bhp == NULL ||
				    !F_ISSET(alloc_bhp, BH_FROZEN));
			}
		}

		/* We found the buffer or we're ready to copy -- we're done. */
		if (!(makecopy || F_ISSET(bhp, BH_FROZEN)) || alloc_bhp != NULL)
			break;

		/* FALLTHROUGH */
	case FIRST_MISS:
		/*
		 * We didn't find the buffer in our first check.  Figure out
		 * if the page exists, and allocate structures so we can add
		 * the page to the buffer pool.
		 */
		if (h_locked)
			MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;

		/*
		 * The buffer is not in the pool, so we don't need to free it.
		 */
		if (LF_ISSET(DB_MPOOL_FREE) &&
		    (bhp == NULL || F_ISSET(bhp, BH_FREED) || !makecopy))
			goto done;

		if (bhp != NULL)
			goto alloc;

newpg:		/*
		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
		 * it's an error to try to get a page past the end of the
		 * file.
		 */
		DB_ASSERT(env, !h_locked);
		MUTEX_LOCK(env, mfp->mutex);
		switch (flags) {
		case DB_MPOOL_NEW:
			extending = 1;
			if (mfp->maxpgno != 0 &&
			    mfp->last_pgno >= mfp->maxpgno) {
				__db_errx(env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else
				*pgnoaddr = mfp->last_pgno + 1;
			break;
		case DB_MPOOL_CREATE:
			if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
				__db_errx(env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else if (!extending)
				extending = *pgnoaddr > mfp->last_pgno;
			break;
		default:
			ret = *pgnoaddr > mfp->last_pgno ?
			    DB_PAGE_NOTFOUND : 0;
			break;
		}
		MUTEX_UNLOCK(env, mfp->mutex);
		if (ret != 0)
			goto err;

		/*
		 * !!!
		 * In the DB_MPOOL_NEW code path, hp, infop and c_mp have
		 * not yet been initialized.
		 */
		if (hp == NULL) {
			MP_GET_BUCKET(env,
			    mfp, *pgnoaddr, &infop, hp, bucket, ret);
			if (ret != 0)
				goto err;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			c_mp = infop->primary;
		}

alloc:		/* Allocate a new buffer header and data space. */
		if (alloc_bhp == NULL && (ret =
		    __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
			goto err;

		/* Initialize enough so we can call __memp_bhfree. */
		alloc_bhp->flags = 0;
		atomic_init(&alloc_bhp->ref, 1);
#ifdef DIAGNOSTIC
		if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
			__db_errx(env,
		    "DB_MPOOLFILE->get: buffer data is NOT size_t aligned");
			ret = __env_panic(env, EINVAL);
			goto err;
		}
#endif

		/*
		 * If we're doing copy-on-write, we will already have the
		 * buffer header.  In that case, we don't need to search
		 * again.
		 */
		if (bhp != NULL)
			break;

		/*
		 * If we are extending the file, we'll need the mfp lock
		 * again.
		 */
		if (extending)
			MUTEX_LOCK(env, mfp->mutex);

		/*
		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control.  (That guarantee is
		 * interesting for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE,
		 * because the caller did not specify the page number, and
		 * so, may reasonably not have any way to lock the page
		 * outside of mpool.)  Regardless, if we allocate the page,
		 * and some other thread of control requests the page by
		 * number, we will not detect that and the thread of control
		 * that allocated using DB_MPOOL_NEW may not have a chance
		 * to initialize the page.  (Note: we *could* detect this
		 * case if we set a flag in the buffer header which
		 * guaranteed that no gets of the page would succeed until
		 * the reference count went to 0, that is, until the creating
		 * thread put the page.)  What we do guarantee is that if two
		 * threads of control are both doing DB_MPOOL_NEW calls, they
		 * won't collide, that is, they won't both get the same page.
		 *
		 * There's a possibility that another thread allocated the
		 * page we were planning to allocate while we were off doing
		 * buffer allocation.  We can detect that by making sure the
		 * page number we were going to use is still available.  If
		 * it's not, then we check to see if the next available page
		 * number hashes to the same mpool region as the old one --
		 * if it does, we can continue; otherwise, we have to start
		 * over.
		 */
		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
			*pgnoaddr = mfp->last_pgno + 1;
			MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
			if (ret != 0)
				goto err;
			if (t_infop != infop) {
				/*
				 * flags == DB_MPOOL_NEW, so extending is set
				 * and we're holding the mfp locked.
				 */
				MUTEX_UNLOCK(env, mfp->mutex);
				goto newpg;
			}
		}

		/*
		 * We released the mfp lock, so another thread might have
		 * extended the file.  Update the last_pgno and initialize
		 * the file, as necessary, if we extended the file.
		 */
		if (extending) {
			if (*pgnoaddr > mfp->last_pgno)
				mfp->last_pgno = *pgnoaddr;
			MUTEX_UNLOCK(env, mfp->mutex);
			if (ret != 0)
				goto err;
		}
		goto retry;
	case SECOND_FOUND:
		/*
		 * We allocated buffer space for the requested page, but then
		 * found the page in the buffer cache on our second check.
		 * That's OK -- we can use the page we found in the pool,
		 * unless DB_MPOOL_NEW is set.  If we're about to
		 * copy-on-write, this is exactly the situation we want.
		 *
		 * For multiversion files, we may have left some pages in
		 * cache beyond the end of a file after truncating.  In that
		 * case, we would get to here with extending set.  If so, we
		 * need to insert the new page in the version chain similar
		 * to when we copy on write.
		 */
		if (F_ISSET(bhp, BH_FREED) &&
		    (flags == DB_MPOOL_NEW || flags == DB_MPOOL_CREATE))
			goto revive;
		else if (flags == DB_MPOOL_FREE)
			goto freebuf;
		else if (makecopy || F_ISSET(bhp, BH_FROZEN))
			break;

		/*
		 * We can't use the page we found in the pool if DB_MPOOL_NEW
		 * was set.  (For details, see the above comment beginning
		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced
		 * by any other thread of control".)  If DB_MPOOL_NEW is set,
		 * we release our pin on this particular buffer, and try to
		 * get another one.
		 */
		if (flags == DB_MPOOL_NEW) {
			DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0);
			atomic_dec(env, &bhp->ref);
			b_incr = 0;
			if (F_ISSET(bhp, BH_EXCLUSIVE))
				F_CLR(bhp, BH_EXCLUSIVE);
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			bhp = NULL;
			goto newpg;
		}

		break;
	case SECOND_MISS:
		/*
		 * We allocated buffer space for the requested page, and
		 * found the page still missing on our second pass through
		 * the buffer cache.  Instantiate the page.
		 */
		DB_ASSERT(env, alloc_bhp != NULL);
		bhp = alloc_bhp;
		alloc_bhp = NULL;

		/*
		 * Initialize all the BH and hash bucket fields so we can
		 * call __memp_bhfree if an error occurs.
		 *
		 * Insert the buffer at the head of the bucket list.
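		 *
		 * (As far as error handling goes, the fields initialized
		 * here -- pgno, mf_offset, bucket, region and the version
		 * chain -- appear to be what __memp_bhfree needs to unwind
		 * a partially-instantiated buffer.)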
		 */
		bhp->priority = UINT32_MAX;
		bhp->pgno = *pgnoaddr;
		bhp->mf_offset = mf_offset;
		bhp->bucket = bucket;
		bhp->region = (int)(infop - dbmp->reginfo);
		bhp->td_off = INVALID_ROFF;
		SH_CHAIN_INIT(bhp, vc);
		bhp->flags = 0;

		/*
		 * Reference the buffer and lock it exclusively.  We either
		 * need to read the buffer or create it from scratch, and
		 * don't want anyone looking at it until we do.
		 */
		MUTEX_LOCK(env, bhp->mtx_buf);
		b_lock = 1;
		F_SET(bhp, BH_EXCLUSIVE);
		b_incr = 1;

		/* We created a new page, it starts dirty. */
		if (extending) {
			atomic_inc(env, &hp->hash_page_dirty);
			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		}

		MUTEX_REQUIRED(env, hp->mtx_hash);
		SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, bhp, hq, __bh);
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;

		/*
		 * If we created the page, zero it out.  If we didn't create
		 * the page, read from the backing file.
		 *
		 * !!!
		 * DB_MPOOL_NEW doesn't call the pgin function.
		 *
		 * If DB_MPOOL_CREATE is used, then the application's pgin
		 * function has to be able to handle pages of 0's -- if it
		 * uses DB_MPOOL_NEW, it can detect all of its page creates,
		 * and not bother.
		 *
		 * If we're running in diagnostic mode, smash any bytes on
		 * the page that are unknown quantities for the caller.
		 *
		 * Otherwise, read the page into memory, optionally creating
		 * it if DB_MPOOL_CREATE is set.
		 */
		if (extending) {
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE);
			memset(bhp->buf, 0,
			    (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
			    mfp->stat.st_pagesize : mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
			if (mfp->clear_len != DB_CLEARLEN_NOTSET)
				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif

			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0 &&
			    (ret = __memp_pg(dbmfp,
			    bhp->pgno, bhp->buf, 1)) != 0)
				goto err;

			STAT(++mfp->stat.st_page_create);
		} else {
			F_SET(bhp, BH_TRASH);
			STAT(++mfp->stat.st_cache_miss);
		}

		makecopy = mvcc && dirty && !extending;

		/* Increment the buffer count referenced by the MPOOLFILE. */
		MUTEX_LOCK(env, mfp->mutex);
		++mfp->block_cnt;
		MUTEX_UNLOCK(env, mfp->mutex);
	}

	DB_ASSERT(env, bhp != NULL && BH_REFCOUNT(bhp) != 0 && b_lock);
	DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN) || !F_ISSET(bhp, BH_FREED) ||
	    makecopy);

	/* We've got a buffer header we're re-instantiating. */
	if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
		if (alloc_bhp == NULL)
			goto reuse;

		/*
		 * To thaw the buffer, we must hold the hash bucket mutex,
		 * and the call to __memp_bh_thaw will release it.
		 */
		if (h_locked == 0)
			MUTEX_LOCK(env, hp->mtx_hash);
		h_locked = 1;

		/*
		 * If the empty buffer has been filled in the meantime, don't
		 * overwrite it.
		 */
		if (F_ISSET(bhp, BH_THAWED)) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			h_locked = 0;
			goto thawed;
		}

		ret = __memp_bh_thaw(dbmp, infop, hp, bhp, alloc_bhp);
		bhp = NULL;
		b_lock = h_locked = 0;
		if (ret != 0)
			goto err;
		bhp = alloc_bhp;
		alloc_bhp = NULL;
		MUTEX_REQUIRED(env, bhp->mtx_buf);
		b_incr = b_lock = 1;
	}

	/*
	 * BH_TRASH --
	 *	The buffer we found may need to be filled from the disk.
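	 *	(BH_TRASH marks a header whose data area does not yet hold
	 *	valid page contents: either the page has never been read
	 *	in, or an earlier read failed and left the buffer behind.)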
	 *
	 * It's possible for the read function to fail, which means we fail
	 * as well.  Discard the buffer on failure unless another thread
	 * is waiting on our I/O to complete.  It's OK to leave the buffer
	 * around, as the waiting thread will see the BH_TRASH flag set,
	 * and will also attempt to discard it.  If there's a waiter,
	 * we need to decrement our reference count.
	 */
	if (F_ISSET(bhp, BH_TRASH) &&
	    flags != DB_MPOOL_FREE && !F_ISSET(bhp, BH_FREED)) {
		if ((ret = __memp_pgread(dbmfp,
		    bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
			goto err;
		DB_ASSERT(env, read_lsnp != NULL || !SH_CHAIN_HASNEXT(bhp, vc));
	}

	/* Copy-on-write. */
	if (makecopy) {
		/*
		 * If we read a page from disk that we want to modify, we
		 * need to make a copy, so we allocate another buffer to
		 * hold the new copy.
		 */
		if (alloc_bhp == NULL)
			goto reuse;

		DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp);
		DB_ASSERT(env, txn != NULL ||
		    (F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED)));
		DB_ASSERT(env, (extending || flags == DB_MPOOL_FREE ||
		    F_ISSET(bhp, BH_FREED)) ||
		    !F_ISSET(bhp, BH_FROZEN | BH_TRASH));
		MUTEX_REQUIRED(env, bhp->mtx_buf);

		if (BH_REFCOUNT(bhp) == 1)
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ);

		atomic_init(&alloc_bhp->ref, 1);
		MUTEX_LOCK(env, alloc_bhp->mtx_buf);
		alloc_bhp->priority = bhp->priority;
		alloc_bhp->pgno = bhp->pgno;
		alloc_bhp->bucket = bhp->bucket;
		alloc_bhp->region = bhp->region;
		alloc_bhp->mf_offset = bhp->mf_offset;
		alloc_bhp->td_off = INVALID_ROFF;
		if (txn == NULL) {
			DB_ASSERT(env,
			    F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED));
			if (bhp->td_off != INVALID_ROFF && (ret =
			    __memp_bh_settxn(dbmp, mfp, alloc_bhp,
			    BH_OWNER(env, bhp))) != 0)
				goto err;
		} else if ((ret =
		    __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
			goto err;
		MVCC_MPROTECT(alloc_bhp->buf, mfp->stat.st_pagesize,
		    PROT_READ | PROT_WRITE);
		if (extending ||
		    F_ISSET(bhp, BH_FREED) || flags == DB_MPOOL_FREE) {
			memset(alloc_bhp->buf, 0,
			    (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
			    mfp->stat.st_pagesize : mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
			if (mfp->clear_len != DB_CLEARLEN_NOTSET)
				memset(alloc_bhp->buf + mfp->clear_len,
				    CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif
		} else
			memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);
		MVCC_MPROTECT(alloc_bhp->buf, mfp->stat.st_pagesize, 0);

		if (h_locked == 0)
			MUTEX_LOCK(env, hp->mtx_hash);
		MUTEX_REQUIRED(env, hp->mtx_hash);
		h_locked = 1;

		alloc_bhp->flags = BH_EXCLUSIVE |
		    ((flags == DB_MPOOL_FREE) ? BH_FREED :
		    F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE));
		DB_ASSERT(env, flags != DB_MPOOL_FREE ||
		    !F_ISSET(bhp, BH_DIRTY));
		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
		SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
		    bhp, alloc_bhp, hq, __bh);
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;
		DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0);
		if (atomic_dec(env, &bhp->ref) == 0) {
			bhp->priority = c_mp->lru_count;
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
		}
		F_CLR(bhp, BH_EXCLUSIVE);
		MUTEX_UNLOCK(env, bhp->mtx_buf);

		bhp = alloc_bhp;
		DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
		b_incr = 1;
		MUTEX_REQUIRED(env, bhp->mtx_buf);
		b_lock = 1;

		if (alloc_bhp != oldest_bhp) {
			MUTEX_LOCK(env, mfp->mutex);
			++mfp->block_cnt;
			MUTEX_UNLOCK(env, mfp->mutex);
		}

		alloc_bhp = NULL;
	} else if (mvcc && extending &&
	    (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
		goto err;

	if (flags == DB_MPOOL_FREE) {
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		/* If we have created an empty buffer, it is not returned. */
		if (!F_ISSET(bhp, BH_FREED))
			goto freebuf;
		goto done;
	}

	/*
	 * Free the allocated memory; we no longer need it.
	 */
	if (alloc_bhp != NULL) {
		if ((ret = __memp_bhfree(dbmp, infop, NULL,
		    NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED)) != 0)
			goto err;
		alloc_bhp = NULL;
	}

	if (dirty || extending ||
	    (F_ISSET(bhp, BH_FREED) &&
	    (flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW))) {
		MUTEX_REQUIRED(env, bhp->mtx_buf);
		if (F_ISSET(bhp, BH_FREED)) {
			memset(bhp->buf, 0,
			    (mfp->clear_len == DB_CLEARLEN_NOTSET) ?
			    mfp->stat.st_pagesize : mfp->clear_len);
			F_CLR(bhp, BH_FREED);
		}
		if (!F_ISSET(bhp, BH_DIRTY)) {
#ifdef DIAGNOSTIC
			MUTEX_LOCK(env, hp->mtx_hash);
#endif
			DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
			atomic_inc(env, &hp->hash_page_dirty);
			F_SET(bhp, BH_DIRTY);
#ifdef DIAGNOSTIC
			MUTEX_UNLOCK(env, hp->mtx_hash);
#endif
		}
	} else if (F_ISSET(bhp, BH_EXCLUSIVE)) {
		F_CLR(bhp, BH_EXCLUSIVE);
#ifdef HAVE_SHARED_LATCHES
		MUTEX_UNLOCK(env, bhp->mtx_buf);
		MUTEX_READLOCK(env, bhp->mtx_buf);
		/*
		 * If another thread has dirtied the page while we
		 * switched locks, we have to go through it all again.
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc) && read_lsnp == NULL) {
			atomic_dec(env, &bhp->ref);
			b_incr = 0;
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			b_lock = 0;
			bhp = NULL;
			goto retry;
		}
#endif
	}

	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ |
	    (dirty || extending || F_ISSET(bhp, BH_DIRTY) ?
	    PROT_WRITE : 0));

#ifdef DIAGNOSTIC
	MUTEX_LOCK(env, hp->mtx_hash);
	{
		BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);

		DB_ASSERT(env, !mfp->multiversion || read_lsnp != NULL ||
		    next_bhp == NULL);
		DB_ASSERT(env, !mvcc || read_lsnp == NULL ||
		    bhp->td_off == INVALID_ROFF ||
		    BH_OWNED_BY(env, bhp, txn) ||
		    (BH_VISIBLE(env, bhp, read_lsnp, vlsn) &&
		    (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
		    (next_bhp->td_off != INVALID_ROFF &&
		    (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED ||
		    IS_ZERO_LSN(BH_OWNER(env, next_bhp)->last_lsn) ||
		    !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn))))));
	}
	MUTEX_UNLOCK(env, hp->mtx_hash);
#endif

	/*
	 * Record this pin for this thread.  Holding the page pinned
	 * without recording the pin is OK, since we do not recover from
	 * a death from within the library itself.
	 */
	if (ip != NULL) {
		reginfo = env->reginfo;
		if (ip->dbth_pincount == ip->dbth_pinmax) {
			pinmax = ip->dbth_pinmax;
			renv = reginfo->primary;
			MUTEX_LOCK(env, renv->mtx_regenv);
			if ((ret = __env_alloc(reginfo,
			    2 * pinmax * sizeof(PIN_LIST), &list)) != 0) {
				MUTEX_UNLOCK(env, renv->mtx_regenv);
				goto err;
			}

			memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist),
			    pinmax * sizeof(PIN_LIST));
			memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST));
			list_off = R_OFFSET(reginfo, list);
			list = R_ADDR(reginfo, ip->dbth_pinlist);
			ip->dbth_pinmax = 2 * pinmax;
			ip->dbth_pinlist = list_off;
			if (list != ip->dbth_pinarray)
				__env_alloc_free(reginfo, list);
			MUTEX_UNLOCK(env, renv->mtx_regenv);
		}
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == INVALID_ROFF)
				break;

		ip->dbth_pincount++;
		lp->b_ref = R_OFFSET(infop, bhp);
		lp->region = (int)(infop - dbmp->reginfo);
	}

#ifdef DIAGNOSTIC
	/* Update the file's pinned reference count. */
	MPOOL_SYSTEM_LOCK(env);
	++dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);

	/*
	 * We want to switch threads as often as possible, and at awkward
	 * times.  Yield every time we get a new page to ensure contention.
	 */
	if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
		__os_yield(env, 0, 0);
#endif

	DB_ASSERT(env, alloc_bhp == NULL);
	DB_ASSERT(env, !(dirty || extending) ||
	    atomic_read(&hp->hash_page_dirty) > 0);
	DB_ASSERT(env, BH_REFCOUNT(bhp) > 0 &&
	    !F_ISSET(bhp, BH_FREED | BH_FROZEN | BH_TRASH));

	*(void **)addrp = bhp->buf;
	return (0);

done:
err:	/*
	 * We should only get to here with ret == 0 if we are freeing a
	 * buffer.  In that case, check that it has in fact been freed.
	 */
	DB_ASSERT(env, ret != 0 || flags != DB_MPOOL_FREE || bhp == NULL ||
	    (F_ISSET(bhp, BH_FREED) && !SH_CHAIN_HASNEXT(bhp, vc)));

	if (bhp != NULL) {
		if (b_incr)
			atomic_dec(env, &bhp->ref);
		if (b_lock) {
			F_CLR(bhp, BH_EXCLUSIVE);
			MUTEX_UNLOCK(env, bhp->mtx_buf);
		}
	}

	if (h_locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	/* If alloc_bhp is set, free the memory. */
	if (alloc_bhp != NULL)
		(void)__memp_bhfree(dbmp, infop, NULL,
		    NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED);

	return (ret);
}
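
/*
 * Usage sketch for the allocation flags (illustrative only; the handles
 * are assumed open, "pagesize" is the page size configured for the file,
 * and error handling is elided).  DB_MPOOL_NEW allocates the next free
 * page number and returns it through the page-number argument, while
 * DB_MPOOL_CREATE creates the caller-named page if it does not exist:
 *
 *	db_pgno_t pgno;
 *	void *addr;
 *
 *	if ((ret = mpf->get(mpf, &pgno, NULL, DB_MPOOL_NEW, &addr)) == 0) {
 *		memset(addr, 0, pagesize);
 *		ret = mpf->put(mpf, addr, DB_PRIORITY_UNCHANGED, 0);
 *	}
 *
 * Note that pages created with DB_MPOOL_NEW start out dirty and are not
 * passed through the registered pgin function (see the comments in the
 * SECOND_MISS case above).
 */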