1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: mp_fget.c,v 12.53 2008/04/28 02:59:57 alexg Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/log.h" 13#include "dbinc/mp.h" 14#include "dbinc/txn.h" 15 16/* 17 * __memp_fget_pp -- 18 * DB_MPOOLFILE->get pre/post processing. 19 * 20 * PUBLIC: int __memp_fget_pp 21 * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *)); 22 */ 23int 24__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp) 25 DB_MPOOLFILE *dbmfp; 26 db_pgno_t *pgnoaddr; 27 DB_TXN *txnp; 28 u_int32_t flags; 29 void *addrp; 30{ 31 DB_THREAD_INFO *ip; 32 ENV *env; 33 int rep_blocked, ret; 34 35 env = dbmfp->env; 36 37 MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get"); 38 39 /* 40 * Validate arguments. 41 * 42 * !!! 43 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly 44 * files here, and create non-existent pages in readonly files if the 45 * flags are set, later. The reason is that the hash access method 46 * wants to get empty pages that don't really exist in readonly files. 47 * The only alternative is for hash to write the last "bucket" all the 48 * time, which we don't want to do because one of our big goals in life 49 * is to keep database files small. It's sleazy as hell, but we catch 50 * any attempt to actually write the file in memp_fput(). 
51 */ 52#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \ 53 DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW) 54 if (flags != 0) { 55 if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0) 56 return (ret); 57 58 switch (flags) { 59 case DB_MPOOL_DIRTY: 60 case DB_MPOOL_CREATE: 61 case DB_MPOOL_EDIT: 62 case DB_MPOOL_LAST: 63 case DB_MPOOL_NEW: 64 break; 65 default: 66 return (__db_ferr(env, "memp_fget", 1)); 67 } 68 } 69 70 ENV_ENTER(env, ip); 71 72 rep_blocked = 0; 73 if (txnp == NULL && IS_ENV_REPLICATED(env)) { 74 if ((ret = __op_rep_enter(env)) != 0) 75 goto err; 76 rep_blocked = 1; 77 } 78 ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp); 79 /* 80 * We only decrement the count in op_rep_exit if the operation fails. 81 * Otherwise the count will be decremented when the page is no longer 82 * pinned in memp_fput. 83 */ 84 if (ret != 0 && rep_blocked) 85 (void)__op_rep_exit(env); 86 87 /* Similarly if an app has a page pinned it is ACTIVE. */ 88err: if (ret != 0) 89 ENV_LEAVE(env, ip); 90 91 return (ret); 92} 93 94/* 95 * __memp_fget -- 96 * Get a page from the file. 
 *
 * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *,
 * PUBLIC:     db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *));
 */
int
__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	void *addrp;
{
	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
	BH *alloc_bhp, *bhp, *frozen_bhp, *oldest_bhp;
	ENV *env;
	DB_LSN *read_lsnp, vlsn;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp;
	MPOOLFILE *mfp;
	PIN_LIST *list, *lp;
	REGINFO *infop, *t_infop, *reginfo;
	TXN_DETAIL *td;
	roff_t list_off, mf_offset;
	u_int32_t pinmax, st_hsearch;
	int b_incr, b_locked, dirty, edit, extending, first;
	int makecopy, mvcc, need_free, ret;

	*(void **)addrp = NULL;
	COMPQUIET(c_mp, NULL);
	COMPQUIET(infop, NULL);
	COMPQUIET(oldest_bhp, NULL);

	env = dbmfp->env;
	dbmp = env->mp_handle;

	mfp = dbmfp->mfp;
	mvcc = mfp->multiversion;
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	alloc_bhp = bhp = frozen_bhp = NULL;
	read_lsnp = NULL;
	td = NULL;
	hp = NULL;
	/*
	 * b_incr is set while we hold a reference (ref count) on bhp;
	 * b_locked is set while we hold hp->mtx_hash.  The err label uses
	 * both to know exactly what must be released on the way out.
	 */
	b_incr = b_locked = extending = makecopy = ret = 0;

	if (LF_ISSET(DB_MPOOL_DIRTY)) {
		if (F_ISSET(dbmfp, MP_READONLY)) {
			__db_errx(env,
			    "%s: dirty flag set for readonly file page",
			    __memp_fn(dbmfp));
			return (EINVAL);
		}
		if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get",
		    flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0)
			return (ret);
	}

	/*
	 * Latch the dirty/edit requests, then clear them from flags so the
	 * later comparisons of flags against single values (DB_MPOOL_NEW,
	 * DB_MPOOL_CREATE, ...) see only the lookup mode.
	 */
	dirty = LF_ISSET(DB_MPOOL_DIRTY);
	edit = LF_ISSET(DB_MPOOL_EDIT);
	LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT);

	/*
	 * If the transaction is being used to update a multiversion database
	 * for the first time, set the read LSN.  In addition, if this is an
	 * update, allocate a mutex.  If no transaction has been supplied, that
	 * will be caught later, when we know whether one is required.
	 */
	if (mvcc && txn != NULL && txn->td != NULL) {
		/* We're only interested in the ultimate parent transaction. */
		while (txn->parent != NULL)
			txn = txn->parent;
		td = (TXN_DETAIL *)txn->td;
		if (F_ISSET(txn, TXN_SNAPSHOT)) {
			read_lsnp = &td->read_lsn;
			if (IS_MAX_LSN(*read_lsnp) &&
			    (ret = __log_current_lsn(env, read_lsnp,
			    NULL, NULL)) != 0)
				return (ret);
		}
		if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) &&
		    td->mvcc_mtx == MUTEX_INVALID && (ret =
		    __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0)
			return (ret);
	}

	switch (flags) {
	case DB_MPOOL_LAST:
		/* Get the last page number in the file. */
		MUTEX_LOCK(env, mfp->mutex);
		*pgnoaddr = mfp->last_pgno;
		MUTEX_UNLOCK(env, mfp->mutex);
		break;
	case DB_MPOOL_NEW:
		/*
		 * If always creating a page, skip the first search
		 * of the hash bucket.
		 */
		state = FIRST_MISS;
		goto alloc;
	case DB_MPOOL_CREATE:
	default:
		break;
	}

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.  We can't use R_ADDR here: this is an offset
	 * into an mmap'd file, not a shared region, and doesn't change for
	 * private environments.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL &&
	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
		*(void **)addrp = (u_int8_t *)dbmfp->addr +
		    (*pgnoaddr * mfp->stat.st_pagesize);
		STAT(++mfp->stat.st_map);
		return (0);
	}

retry:	/*
	 * Determine the cache and hash bucket where this page lives and get
	 * local pointers to them.  Reset on each pass through this code, the
	 * page number can change.
	 */
	MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, ret);
	if (ret != 0)
		return (ret);
	c_mp = infop->primary;

	/* Search the hash chain for the page. */
	st_hsearch = 0;
	b_locked = 1;
	SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/* Snapshot reads -- get the version visible at read_lsn. */
		if (mvcc && !edit && read_lsnp != NULL) {
			while (bhp != NULL &&
			    !BH_OWNED_BY(env, bhp, txn) &&
			    !BH_VISIBLE(env, bhp, read_lsnp, vlsn))
				bhp = SH_CHAIN_PREV(bhp, vc, __bh);

			DB_ASSERT(env, bhp != NULL);
		}

		/*
		 * A dirtying access to a version we don't own must copy the
		 * buffer rather than update it in place (copy-on-write).
		 */
		makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn);

		if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) {
			DB_ASSERT(env, frozen_bhp == NULL);
			frozen_bhp = bhp;
		}

		/*
		 * Increment the reference count.  We may discard the hash
		 * bucket lock as we evaluate and/or read the buffer, so we
		 * need to ensure it doesn't move and its contents remain
		 * unchanged.
		 */
		if (bhp->ref == UINT16_MAX) {
			__db_errx(env,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = __env_panic(env, EINVAL);
			goto err;
		}
		++bhp->ref;
		b_incr = 1;

		/*
		 * BH_LOCKED --
		 * I/O is in progress or sync is waiting on the buffer to write
		 * it.  Because we've incremented the buffer reference count,
		 * we know the buffer can't move.  Unlock the bucket lock, wait
		 * for the buffer to become available, re-acquire the bucket.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
		    !F_ISSET(env->dbenv, DB_ENV_NOLOCKING); first = 0) {
			/*
			 * If someone is trying to sync this buffer and the
			 * buffer is hot, they may never get in.  Give up and
			 * try again.
			 */
			if (!first && bhp->ref_sync != 0) {
				--bhp->ref;
				MUTEX_UNLOCK(env, hp->mtx_hash);
				bhp = frozen_bhp = NULL;
				b_incr = b_locked = 0;
				__os_yield(env, 0, 1);
				goto retry;
			}

			/*
			 * If we're the first thread waiting on I/O, set the
			 * flag so the thread doing I/O knows to wake us up,
			 * and lock the mutex.
			 */
			if (!F_ISSET(hp, IO_WAITER)) {
				F_SET(hp, IO_WAITER);
				MUTEX_LOCK(env, hp->mtx_io);
			}
			STAT(++hp->hash_io_wait);

			/* Release the hash bucket lock. */
			MUTEX_UNLOCK(env, hp->mtx_hash);

			/* Wait for I/O to finish. */
			MUTEX_LOCK(env, hp->mtx_io);
			MUTEX_UNLOCK(env, hp->mtx_io);

			/* Re-acquire the hash bucket lock. */
			MUTEX_LOCK(env, hp->mtx_hash);
		}

		/*
		 * If the buffer was frozen before we waited for any I/O to
		 * complete and is still frozen, we will need to thaw it.
		 * Otherwise, it was thawed while we waited, and we need to
		 * search again.
		 */
		if (frozen_bhp != NULL && F_ISSET(frozen_bhp, BH_THAWED)) {
thawed:			need_free = (--frozen_bhp->ref == 0);
			b_incr = 0;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			MPOOL_REGION_LOCK(env, infop);
			if (alloc_bhp != NULL) {
				__memp_free(infop, mfp, alloc_bhp);
				alloc_bhp = NULL;
			}
			if (need_free)
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    frozen_bhp, hq);
			MPOOL_REGION_UNLOCK(env, infop);
			bhp = frozen_bhp = NULL;
			goto retry;
		}

		/*
		 * If the buffer we wanted was frozen or thawed while we
		 * waited, we need to start again.
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc) &&
		    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off) {
			--bhp->ref;
			b_incr = 0;
			MUTEX_UNLOCK(env, hp->mtx_hash);
			bhp = frozen_bhp = NULL;
			goto retry;
		} else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) {
			/* Can't dirty an old version -- report deadlock. */
			ret = DB_LOCK_DEADLOCK;
			goto err;
		}

#ifdef HAVE_STATISTICS
		++mfp->stat.st_cache_hit;
#endif
		break;
	}

#ifdef HAVE_STATISTICS
	/*
	 * Update the hash bucket search statistics -- do now because our next
	 * search may be for a different bucket.
	 */
	++c_mp->stat.st_hash_searches;
	if (st_hsearch > c_mp->stat.st_hash_longest)
		c_mp->stat.st_hash_longest = st_hsearch;
	c_mp->stat.st_hash_examined += st_hsearch;
#endif

	/*
	 * There are 4 possible paths to this location:
	 *
	 * FIRST_MISS:
	 *	Didn't find the page in the hash bucket on our first pass:
	 *	bhp == NULL, alloc_bhp == NULL
	 *
	 * FIRST_FOUND:
	 *	Found the page in the hash bucket on our first pass:
	 *	bhp != NULL, alloc_bhp == NULL
	 *
	 * SECOND_FOUND:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and found the page in the hash bucket on
	 *	our second pass:
	 *	bhp != NULL, alloc_bhp != NULL
	 *
	 * SECOND_MISS:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and didn't find the page in the hash bucket
	 *	on our second pass:
	 *	bhp == NULL, alloc_bhp != NULL
	 */
	state = bhp == NULL ?
	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);

	switch (state) {
	case FIRST_FOUND:
		/*
		 * If we are to free the buffer, then this had better be the
		 * only reference.  If so, just free the buffer.  If not,
		 * complain and get out.
		 */
		if (flags == DB_MPOOL_FREE) {
			if (--bhp->ref == 0) {
				if (F_ISSET(bhp, BH_DIRTY)) {
					--hp->hash_page_dirty;
					F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
				}
				/*
				 * In a multiversion database, this page could
				 * be requested again so we have to leave it in
				 * cache for now.  It should *not* ever be
				 * requested again for modification without an
				 * intervening DB_MPOOL_CREATE or DB_MPOOL_NEW.
				 *
				 * Mark it with BH_FREED so we don't reuse the
				 * data when the page is resurrected.
				 */
				if (mvcc && (F_ISSET(bhp, BH_FROZEN) ||
				    !SH_CHAIN_SINGLETON(bhp, vc) ||
				    bhp->td_off == INVALID_ROFF ||
				    !IS_MAX_LSN(*VISIBLE_LSN(env, bhp)))) {
					F_SET(bhp, BH_FREED);
					MUTEX_UNLOCK(env, hp->mtx_hash);
					return (0);
				}
				return (__memp_bhfree(
				    dbmp, infop, hp, bhp, BH_FREE_FREEMEM));
			}
			__db_errx(env,
			    "File %s: freeing pinned buffer for page %lu",
			    __memp_fns(dbmp, mfp), (u_long)*pgnoaddr);
			ret = __env_panic(env, EINVAL);
			goto err;
		}

		if (mvcc) {
			/* Resurrecting a freed page extends the file again. */
			if (flags == DB_MPOOL_CREATE &&
			    F_ISSET(bhp, BH_FREED)) {
				extending = makecopy = 1;
				MUTEX_LOCK(env, mfp->mutex);
				if (*pgnoaddr > mfp->last_pgno)
					mfp->last_pgno = *pgnoaddr;
				MUTEX_UNLOCK(env, mfp->mutex);
			}

			/*
			 * With multiversion databases, we might need to
			 * allocate a new buffer into which we can copy the one
			 * that we found.  In that case, check the last buffer
			 * in the chain to see whether we can reuse an obsolete
			 * buffer.
			 *
			 * To provide snapshot isolation, we need to make sure
			 * that we've seen a buffer older than the oldest
			 * snapshot read LSN.
			 */
reuse:			if ((makecopy || frozen_bhp != NULL) && (oldest_bhp =
			    SH_CHAIN_PREV(bhp, vc, __bh)) != NULL) {
				while (SH_CHAIN_HASPREV(oldest_bhp, vc))
					oldest_bhp = SH_CHAIN_PREVP(oldest_bhp,
					    vc, __bh);

				if (oldest_bhp->ref == 0 && !BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    (ret = __txn_oldest_reader(env,
				    &hp->old_reader)) != 0)
					goto err;

				if (BH_OBSOLETE(
				    oldest_bhp, hp->old_reader, vlsn) &&
				    oldest_bhp->ref == 0) {
					if (F_ISSET(oldest_bhp, BH_FROZEN)) {
						++oldest_bhp->ref;
						if ((ret = __memp_bh_thaw(dbmp,
						    infop, hp, oldest_bhp,
						    NULL)) != 0)
							goto err;
						goto reuse;
					} else if ((ret = __memp_bhfree(dbmp,
					    infop, hp, oldest_bhp,
					    BH_FREE_REUSE)) != 0)
						goto err;
					alloc_bhp = oldest_bhp;
				}

				DB_ASSERT(env, alloc_bhp == NULL ||
				    !F_ISSET(alloc_bhp, BH_FROZEN));
			}
		}

		/* We found the buffer or we're ready to copy -- we're done. */
		if ((!makecopy && frozen_bhp == NULL) || alloc_bhp != NULL)
			break;

		/* FALLTHROUGH */
	case FIRST_MISS:
		/*
		 * We didn't find the buffer in our first check.  Figure out
		 * if the page exists, and allocate structures so we can add
		 * the page to the buffer pool.
		 */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		b_locked = 0;

		/*
		 * The buffer is not in the pool, so we don't need to free it.
		 */
		if (flags == DB_MPOOL_FREE)
			return (0);

alloc:		/*
		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
		 * If neither DB_MPOOL_CREATE or DB_MPOOL_NEW is set, then
		 * it's an error to try and get a page past the end of file.
		 */
		DB_ASSERT(env, !b_locked);
		MUTEX_LOCK(env, mfp->mutex);
		switch (flags) {
		case DB_MPOOL_NEW:
			extending = 1;
			if (mfp->maxpgno != 0 &&
			    mfp->last_pgno >= mfp->maxpgno) {
				__db_errx(
				    env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else
				*pgnoaddr = mfp->last_pgno + 1;
			break;
		case DB_MPOOL_CREATE:
			if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
				__db_errx(
				    env, "%s: file limited to %lu pages",
				    __memp_fn(dbmfp), (u_long)mfp->maxpgno);
				ret = ENOSPC;
			} else if (!extending)
				extending = *pgnoaddr > mfp->last_pgno;
			break;
		default:
			ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
			break;
		}
		MUTEX_UNLOCK(env, mfp->mutex);
		if (ret != 0)
			goto err;

		/*
		 * !!!
		 * In the DB_MPOOL_NEW code path, infop and c_mp have
		 * not yet been initialized.
		 */
		MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret);
		if (ret != 0)
			goto err;
		c_mp = infop->primary;

		/* Allocate a new buffer header and data space. */
		if ((ret =
		    __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
			goto err;
#ifdef DIAGNOSTIC
		if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
			__db_errx(env,
		    "DB_MPOOLFILE->get: buffer data is NOT size_t aligned");
			ret = __env_panic(env, EINVAL);
			goto err;
		}
#endif
		/*
		 * If we are extending the file, we'll need the mfp lock
		 * again.
		 */
		if (extending)
			MUTEX_LOCK(env, mfp->mutex);

		/*
		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control.  (That guarantee is interesting
		 * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
		 * did not specify the page number, and so, may reasonably not
		 * have any way to lock the page outside of mpool.) Regardless,
		 * if we allocate the page, and some other thread of control
		 * requests the page by number, we will not detect that and the
		 * thread of control that allocated using DB_MPOOL_NEW may not
		 * have a chance to initialize the page.  (Note: we *could*
		 * detect this case if we set a flag in the buffer header which
		 * guaranteed that no gets of the page would succeed until the
		 * reference count went to 0, that is, until the creating page
		 * put the page.)  What we do guarantee is that if two threads
		 * of control are both doing DB_MPOOL_NEW calls, they won't
		 * collide, that is, they won't both get the same page.
		 *
		 * There's a possibility that another thread allocated the page
		 * we were planning to allocate while we were off doing buffer
		 * allocation.  We can do that by making sure the page number
		 * we were going to use is still available.  If it's not, then
		 * we check to see if the next available page number hashes to
		 * the same mpool region as the old one -- if it does, we can
		 * continue, otherwise, we have to start over.
		 */
		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
			*pgnoaddr = mfp->last_pgno + 1;
			MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
			if (ret != 0)
				goto err;
			if (t_infop != infop) {
				/*
				 * flags == DB_MPOOL_NEW, so extending is set
				 * and we're holding the mfp locked.
				 */
				MUTEX_UNLOCK(env, mfp->mutex);

				MPOOL_REGION_LOCK(env, infop);
				__memp_free(infop, mfp, alloc_bhp);
				c_mp->stat.st_pages--;
				MPOOL_REGION_UNLOCK(env, infop);

				alloc_bhp = NULL;
				goto alloc;
			}
		}

		/*
		 * We released the mfp lock, so another thread might have
		 * extended the file.  Update the last_pgno and initialize
		 * the file, as necessary, if we extended the file.
		 */
		if (extending) {
			if (*pgnoaddr > mfp->last_pgno)
				mfp->last_pgno = *pgnoaddr;

			MUTEX_UNLOCK(env, mfp->mutex);
			if (ret != 0)
				goto err;
		}

		/*
		 * If we're doing copy-on-write, we will already have the
		 * buffer header.  In that case, we don't need to search again.
		 */
		if (bhp != NULL) {
			MUTEX_LOCK(env, hp->mtx_hash);
			b_locked = 1;
			break;
		}
		DB_ASSERT(env, frozen_bhp == NULL);
		goto retry;
	case SECOND_FOUND:
		/*
		 * We allocated buffer space for the requested page, but then
		 * found the page in the buffer cache on our second check.
		 * That's OK -- we can use the page we found in the pool,
		 * unless DB_MPOOL_NEW is set.  If we're about to copy-on-write,
		 * this is exactly the situation we want.
		 *
		 * For multiversion files, we may have left some pages in cache
		 * beyond the end of a file after truncating.  In that case, we
		 * would get to here with extending set.  If so, we need to
		 * insert the new page in the version chain similar to when
		 * we copy on write.
		 */
		if (extending && F_ISSET(bhp, BH_FREED))
			makecopy = 1;
		if (makecopy || frozen_bhp != NULL)
			break;

		/*
		 * Free the allocated memory, we no longer need it.  Since we
		 * can't acquire the region lock while holding the hash bucket
		 * lock, we have to release the hash bucket and re-acquire it.
		 * That's OK, because we have the buffer pinned down.
		 */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		MPOOL_REGION_LOCK(env, infop);
		__memp_free(infop, mfp, alloc_bhp);
		c_mp->stat.st_pages--;
		MPOOL_REGION_UNLOCK(env, infop);
		alloc_bhp = NULL;

		/*
		 * We can't use the page we found in the pool if DB_MPOOL_NEW
		 * was set.  (For details, see the above comment beginning
		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control".)  If DB_MPOOL_NEW is set, we
		 * release our pin on this particular buffer, and try to get
		 * another one.
		 */
		if (flags == DB_MPOOL_NEW) {
			--bhp->ref;
			b_incr = b_locked = 0;
			bhp = NULL;
			goto alloc;
		}

		/* We can use the page -- get the bucket lock. */
		MUTEX_LOCK(env, hp->mtx_hash);
		break;
	case SECOND_MISS:
		/*
		 * We allocated buffer space for the requested page, and found
		 * the page still missing on our second pass through the buffer
		 * cache.  Instantiate the page.
		 */
		bhp = alloc_bhp;
		alloc_bhp = NULL;

		/*
		 * Initialize all the BH and hash bucket fields so we can call
		 * __memp_bhfree if an error occurs.
		 *
		 * Append the buffer to the tail of the bucket list and update
		 * the hash bucket's priority.
		 */
		/*lint --e{668} (flexelint: bhp cannot be NULL). */
#ifdef DIAG_MVCC
		memset(bhp, 0, SSZ(BH, align_off));
#else
		memset(bhp, 0, sizeof(BH));
#endif
		bhp->ref = 1;
		b_incr = 1;
		bhp->priority = UINT32_MAX;
		bhp->pgno = *pgnoaddr;
		bhp->mf_offset = mf_offset;
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
		SH_CHAIN_INIT(bhp, vc);

		/* We created a new page, it starts dirty. */
		if (extending) {
			++hp->hash_page_dirty;
			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		}

		/*
		 * If we created the page, zero it out.  If we didn't create
		 * the page, read from the backing file.
		 *
		 * !!!
		 * DB_MPOOL_NEW doesn't call the pgin function.
		 *
		 * If DB_MPOOL_CREATE is used, then the application's pgin
		 * function has to be able to handle pages of 0's -- if it
		 * uses DB_MPOOL_NEW, it can detect all of its page creates,
		 * and not bother.
		 *
		 * If we're running in diagnostic mode, smash any bytes on the
		 * page that are unknown quantities for the caller.
		 *
		 * Otherwise, read the page into memory, optionally creating it
		 * if DB_MPOOL_CREATE is set.
		 */
		if (extending) {
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ | PROT_WRITE);
			if (mfp->clear_len == DB_CLEARLEN_NOTSET)
				memset(bhp->buf, 0, mfp->stat.st_pagesize);
			else {
				memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif
			}

			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
				F_SET(bhp, BH_CALLPGIN);

			STAT(++mfp->stat.st_page_create);
		} else {
			F_SET(bhp, BH_TRASH);
			STAT(++mfp->stat.st_cache_miss);
		}

		/* Increment buffer count referenced by MPOOLFILE. */
		MUTEX_LOCK(env, mfp->mutex);
		++mfp->block_cnt;
		MUTEX_UNLOCK(env, mfp->mutex);
	}

	DB_ASSERT(env, bhp != NULL);
	DB_ASSERT(env, bhp->ref != 0);

	/* We've got a buffer header we're re-instantiating. */
	if (frozen_bhp != NULL) {
		DB_ASSERT(env, alloc_bhp != NULL);

		/*
		 * If the empty buffer has been filled in the meantime, don't
		 * overwrite it.
		 */
		if (F_ISSET(frozen_bhp, BH_THAWED))
			goto thawed;
		else {
			if ((ret = __memp_bh_thaw(dbmp, infop, hp,
			    frozen_bhp, alloc_bhp)) != 0)
				goto err;
			bhp = alloc_bhp;
		}

		frozen_bhp = alloc_bhp = NULL;

		/*
		 * If we're updating a buffer that was frozen, we have to go
		 * through all of that again to allocate another buffer to hold
		 * the new copy.
		 */
		if (makecopy) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			b_locked = 0;
			goto alloc;
		}
	}

	/*
	 * BH_TRASH --
	 * The buffer we found may need to be filled from the disk.
	 *
	 * It's possible for the read function to fail, which means we fail as
	 * well.  Note, the __memp_pgread() function discards and reacquires
	 * the hash lock, so the buffer must be pinned down so that it cannot
	 * move and its contents are unchanged.  Discard the buffer on failure
	 * unless another thread is waiting on our I/O to complete.  It's OK to
	 * leave the buffer around, as the waiting thread will see the BH_TRASH
	 * flag set, and will also attempt to discard it.  If there's a waiter,
	 * we need to decrement our reference count.
	 */
	if (F_ISSET(bhp, BH_TRASH) &&
	    (ret = __memp_pgread(dbmfp,
	    hp, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
		goto err;

	/*
	 * BH_CALLPGIN --
	 * The buffer was processed for being written to disk, and now has
	 * to be re-converted for use.
	 */
	if (F_ISSET(bhp, BH_CALLPGIN)) {
		MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
		    PROT_READ | PROT_WRITE);
		if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
			goto err;
		F_CLR(bhp, BH_CALLPGIN);
	}

	/* Copy-on-write. */
	if (makecopy && state != SECOND_MISS) {
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		DB_ASSERT(env, bhp != NULL);
		DB_ASSERT(env, alloc_bhp != NULL);
		DB_ASSERT(env, alloc_bhp != bhp);

		if (bhp->ref == 1)
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize,
			    PROT_READ);

		/* Transfer our pin and the dirty state to the new copy. */
		alloc_bhp->ref = 1;
		alloc_bhp->ref_sync = 0;
		alloc_bhp->flags = F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		alloc_bhp->priority = bhp->priority;
		alloc_bhp->pgno = bhp->pgno;
		alloc_bhp->mf_offset = bhp->mf_offset;
		alloc_bhp->td_off = INVALID_ROFF;
		if (txn != NULL &&
		    (ret = __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0)
			goto err;
		if (extending) {
			memset(alloc_bhp->buf, 0, mfp->stat.st_pagesize);
			F_SET(alloc_bhp, BH_DIRTY_CREATE);
		} else
			memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);

		/* Link the copy after bhp in the version chain and bucket. */
		SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh);
		SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket,
		    bhp, alloc_bhp, hq, __bh);
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
		if (--bhp->ref == 0) {
			bhp->priority = c_mp->lru_count;
			MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
		}
		bhp = alloc_bhp;

		/*
		 * A reused obsolete buffer (oldest_bhp) is already counted
		 * in block_cnt; only count a genuinely new buffer.
		 */
		if (alloc_bhp != oldest_bhp) {
			MUTEX_LOCK(env, mfp->mutex);
			++mfp->block_cnt;
			MUTEX_UNLOCK(env, mfp->mutex);
		}

		alloc_bhp = NULL;
	} else if (mvcc && extending && txn != NULL &&
	    (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0)
		goto err;

	if ((dirty || edit || extending) && !F_ISSET(bhp, BH_DIRTY)) {
		DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc));
		++hp->hash_page_dirty;
		F_SET(bhp, BH_DIRTY);
	}

	/*
	 * If we're the only reference, update buffer priority.  We may be
	 * about to release the hash bucket lock, and everything should be
	 * correct, first.  (We've already done this work if we created the
	 * buffer, so there is no need to do it again.)
	 */
	if (state != SECOND_MISS && bhp->ref == 1) {
		bhp->priority = UINT32_MAX;
		if (SH_CHAIN_SINGLETON(bhp, vc)) {
			if (bhp != SH_TAILQ_LAST(&hp->hash_bucket, hq, __bh)) {
				SH_TAILQ_REMOVE(&hp->hash_bucket,
				    bhp, hq, __bh);
				SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
			}
		}
	}

	MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ |
	    (dirty || edit || extending || F_ISSET(bhp, BH_DIRTY) ?
	    PROT_WRITE : 0));

#ifdef DIAGNOSTIC
	{
	BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh);

	DB_ASSERT(env, !mfp->multiversion ||
	    !F_ISSET(bhp, BH_DIRTY) || next_bhp == NULL);

	DB_ASSERT(env, !mvcc || edit || read_lsnp == NULL ||
	    bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) ||
	    (BH_VISIBLE(env, bhp, read_lsnp, vlsn) &&
	    (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) ||
	    (next_bhp->td_off != INVALID_ROFF &&
	    (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED ||
	    !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn))))));
	}
#endif

	MUTEX_UNLOCK(env, hp->mtx_hash);

	/*
	 * Record this pin for this thread.  Holding the page pinned
	 * without recording the pin is ok since we do not recover from
	 * a death from within the library itself.
	 */
	if (ip != NULL) {
		reginfo = env->reginfo;
		/* Grow the pin list (doubling) if it is full. */
		if (ip->dbth_pincount == ip->dbth_pinmax) {
			pinmax = ip->dbth_pinmax;
			if ((ret = __env_alloc(reginfo,
			    2 * pinmax * sizeof(PIN_LIST), &list)) != 0)
				goto err;

			memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist),
			    pinmax * sizeof(PIN_LIST));
			memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST));
			list_off = R_OFFSET(reginfo, list);
			list = R_ADDR(reginfo, ip->dbth_pinlist);
			ip->dbth_pinmax = 2 * pinmax;
			ip->dbth_pinlist = list_off;
			/* Don't free the initial, statically-sized array. */
			if (list != ip->dbth_pinarray)
				__env_alloc_free(reginfo, list);
		}
		list = R_ADDR(reginfo, ip->dbth_pinlist);
		/* Find the first empty slot. */
		for (lp = list; lp < &list[ip->dbth_pinmax]; lp++)
			if (lp->b_ref == INVALID_ROFF)
				break;

		ip->dbth_pincount++;
		lp->b_ref = R_OFFSET(infop, bhp);
		lp->region = (int)(infop - dbmp->reginfo);
	}

#ifdef DIAGNOSTIC
	/* Update the file's pinned reference count. */
	MPOOL_SYSTEM_LOCK(env);
	++dbmfp->pinref;
	MPOOL_SYSTEM_UNLOCK(env);

	/*
	 * We want to switch threads as often as possible, and at awkward
	 * times.  Yield every time we get a new page to ensure contention.
	 */
	if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU))
		__os_yield(env, 0, 0);
#endif

	DB_ASSERT(env, alloc_bhp == NULL);

	*(void **)addrp = bhp->buf;
	return (0);

err:	/*
	 * Discard our reference.  If we're the only reference, discard the
	 * buffer entirely.  If we held a reference to a buffer, we are
	 * also still holding the hash bucket mutex.
	 */
	if (b_incr || frozen_bhp != NULL) {
		if (!b_locked) {
			MUTEX_LOCK(env, hp->mtx_hash);
			b_locked = 1;
		}
		if (frozen_bhp != NULL)
			--frozen_bhp->ref;
		if (b_incr && bhp != frozen_bhp)
			--bhp->ref;
	}
	if (b_locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	/* If alloc_bhp is set, free the memory. */
	if (alloc_bhp != NULL) {
		MPOOL_REGION_LOCK(env, infop);
		__memp_free(infop, mfp, alloc_bhp);
		c_mp->stat.st_pages--;
		MPOOL_REGION_UNLOCK(env, infop);
	}

	return (ret);
}