1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996-2009 Oracle. All rights reserved. 5 * 6 * $Id$ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" /* Required for diagnostic code. */ 13#include "dbinc/mp.h" 14#include "dbinc/log.h" 15#include "dbinc/txn.h" 16 17static int __memp_pgwrite 18 __P((ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *)); 19 20/* 21 * __memp_bhwrite -- 22 * Write the page associated with a given buffer header. 23 * 24 * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *, 25 * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int)); 26 */ 27int 28__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) 29 DB_MPOOL *dbmp; 30 DB_MPOOL_HASH *hp; 31 MPOOLFILE *mfp; 32 BH *bhp; 33 int open_extents; 34{ 35 DB_MPOOLFILE *dbmfp; 36 DB_MPREG *mpreg; 37 ENV *env; 38 int ret; 39 40 env = dbmp->env; 41 42 /* 43 * If the file has been removed or is a closed temporary file, we're 44 * done -- the page-write function knows how to handle the fact that 45 * we don't have (or need!) any real file descriptor information. 46 */ 47 if (mfp->deadfile) 48 return (__memp_pgwrite(env, NULL, hp, bhp)); 49 50 /* 51 * Walk the process' DB_MPOOLFILE list and find a file descriptor for 52 * the file. We also check that the descriptor is open for writing. 53 */ 54 MUTEX_LOCK(env, dbmp->mutex); 55 TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) 56 if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) { 57 ++dbmfp->ref; 58 break; 59 } 60 MUTEX_UNLOCK(env, dbmp->mutex); 61 62 if (dbmfp != NULL) { 63 /* 64 * Temporary files may not have been created. We only handle 65 * temporary files in this path, because only the process that 66 * created a temporary file will ever flush buffers to it. 67 */ 68 if (dbmfp->fhp == NULL) { 69 /* We may not be allowed to create backing files. */ 70 if (mfp->no_backing_file) { 71 --dbmfp->ref; 72 return (EPERM); 73 } 74 75 MUTEX_LOCK(env, dbmp->mutex); 76 if (dbmfp->fhp == NULL) { 77 ret = __db_tmp_open(env, 78 F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? 79 DB_OSO_DIRECT : 0, &dbmfp->fhp); 80 } else 81 ret = 0; 82 MUTEX_UNLOCK(env, dbmp->mutex); 83 if (ret != 0) { 84 __db_errx(env, 85 "unable to create temporary backing file"); 86 --dbmfp->ref; 87 return (ret); 88 } 89 } 90 91 goto pgwrite; 92 } 93 94 /* 95 * There's no file handle for this file in our process. 96 * 97 * !!! 98 * It's the caller's choice if we're going to open extent files. 99 */ 100 if (!open_extents && F_ISSET(mfp, MP_EXTENT)) 101 return (EPERM); 102 103 /* 104 * !!! 105 * Don't try to attach to temporary files. There are two problems in 106 * trying to do that. First, if we have different privileges than the 107 * process that "owns" the temporary file, we might create the backing 108 * disk file such that the owning process couldn't read/write its own 109 * buffers, e.g., memp_trickle running as root creating a file owned 110 * as root, mode 600. Second, if the temporary file has already been 111 * created, we don't have any way of finding out what its real name is, 112 * and, even if we did, it was already unlinked (so that it won't be 113 * left if the process dies horribly). This decision causes a problem, 114 * however: if the temporary file consumes the entire buffer cache, 115 * and the owner doesn't flush the buffers to disk, we could end up 116 * with resource starvation, and the memp_trickle thread couldn't do 117 * anything about it. That's a pretty unlikely scenario, though. 118 * 119 * Note we should never get here when the temporary file in question 120 * has already been closed in another process, in which case it should 121 * be marked dead. 122 */ 123 if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file) 124 return (EPERM); 125 126 /* 127 * It's not a page from a file we've opened. If the file requires 128 * application-specific input/output processing, see if this process 129 * has ever registered information as to how to write this type of 130 * file. If not, there's nothing we can do. 131 */ 132 if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) { 133 MUTEX_LOCK(env, dbmp->mutex); 134 LIST_FOREACH(mpreg, &dbmp->dbregq, q) 135 if (mpreg->ftype == mfp->ftype) 136 break; 137 MUTEX_UNLOCK(env, dbmp->mutex); 138 if (mpreg == NULL) 139 return (EPERM); 140 } 141 142 /* 143 * Try and open the file, specifying the known underlying shared area. 144 * 145 * !!! 146 * There's no negative cache, so we may repeatedly try and open files 147 * that we have previously tried (and failed) to open. 148 */ 149 if ((ret = __memp_fcreate(env, &dbmfp)) != 0) 150 return (ret); 151 if ((ret = __memp_fopen(dbmfp, mfp, 152 NULL, NULL, DB_DURABLE_UNKNOWN, 0, mfp->stat.st_pagesize)) != 0) { 153 (void)__memp_fclose(dbmfp, 0); 154 155 /* 156 * Ignore any error if the file is marked dead, assume the file 157 * was removed from under us. 158 */ 159 if (!mfp->deadfile) 160 return (ret); 161 162 dbmfp = NULL; 163 } 164 165pgwrite: 166 MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 167 PROT_READ | PROT_WRITE | PROT_EXEC); 168 ret = __memp_pgwrite(env, dbmfp, hp, bhp); 169 if (dbmfp == NULL) 170 return (ret); 171 172 /* 173 * Discard our reference, and, if we're the last reference, make sure 174 * the file eventually gets closed. 175 */ 176 MUTEX_LOCK(env, dbmp->mutex); 177 if (dbmfp->ref == 1) 178 F_SET(dbmfp, MP_FLUSH); 179 else 180 --dbmfp->ref; 181 MUTEX_UNLOCK(env, dbmp->mutex); 182 183 return (ret); 184} 185 186/* 187 * __memp_pgread -- 188 * Read a page from a file. 189 * 190 * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); 191 */ 192int 193__memp_pgread(dbmfp, bhp, can_create) 194 DB_MPOOLFILE *dbmfp; 195 BH *bhp; 196 int can_create; 197{ 198 ENV *env; 199 MPOOLFILE *mfp; 200 size_t len, nr; 201 u_int32_t pagesize; 202 int ret; 203 204 env = dbmfp->env; 205 mfp = dbmfp->mfp; 206 pagesize = mfp->stat.st_pagesize; 207 208 /* We should never be called with a dirty or unlocked buffer. */ 209 DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY_CREATE | BH_FROZEN)); 210 DB_ASSERT(env, can_create || !F_ISSET(bhp, BH_DIRTY)); 211 DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE)); 212 213 /* Mark the buffer as in transistion. */ 214 F_SET(bhp, BH_TRASH); 215 216 /* 217 * Temporary files may not yet have been created. We don't create 218 * them now, we create them when the pages have to be flushed. 219 */ 220 nr = 0; 221 if (dbmfp->fhp != NULL) 222 if ((ret = __os_io(env, DB_IO_READ, dbmfp->fhp, 223 bhp->pgno, pagesize, 0, pagesize, bhp->buf, &nr)) != 0) 224 goto err; 225 226 /* 227 * The page may not exist; if it doesn't, nr may well be 0, but we 228 * expect the underlying OS calls not to return an error code in 229 * this case. 230 */ 231 if (nr < pagesize) { 232 /* 233 * Don't output error messages for short reads. In particular, 234 * DB recovery processing may request pages never written to 235 * disk or for which only some part have been written to disk, 236 * in which case we won't find the page. The caller must know 237 * how to handle the error. 238 */ 239 if (!can_create) { 240 ret = DB_PAGE_NOTFOUND; 241 goto err; 242 } 243 244 /* Clear any bytes that need to be cleared. */ 245 len = mfp->clear_len == DB_CLEARLEN_NOTSET ? 246 pagesize : mfp->clear_len; 247 memset(bhp->buf, 0, len); 248 249#if defined(DIAGNOSTIC) || defined(UMRW) 250 /* 251 * If we're running in diagnostic mode, corrupt any bytes on 252 * the page that are unknown quantities for the caller. 253 */ 254 if (len < pagesize) 255 memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); 256#endif 257#ifdef HAVE_STATISTICS 258 ++mfp->stat.st_page_create; 259 } else 260 ++mfp->stat.st_page_in; 261#else 262 } 263#endif 264 265 /* Call any pgin function. */ 266 ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1); 267 268 /* 269 * If no errors occurred, the data is now valid, clear the BH_TRASH 270 * flag. 271 */ 272 if (ret == 0) 273 F_CLR(bhp, BH_TRASH); 274err: return (ret); 275} 276 277/* 278 * __memp_pgwrite -- 279 * Write a page to a file. 280 */ 281static int 282__memp_pgwrite(env, dbmfp, hp, bhp) 283 ENV *env; 284 DB_MPOOLFILE *dbmfp; 285 DB_MPOOL_HASH *hp; 286 BH *bhp; 287{ 288 DB_LSN lsn; 289 MPOOLFILE *mfp; 290 size_t nw; 291 int ret; 292 void * buf; 293 294 /* 295 * Since writing does not require exclusive access, another thread 296 * could have already written this buffer. 297 */ 298 if (!F_ISSET(bhp, BH_DIRTY)) 299 return (0); 300 301 mfp = dbmfp == NULL ? NULL : dbmfp->mfp; 302 ret = 0; 303 buf = NULL; 304 305 /* We should never be called with a frozen or trashed buffer. */ 306 DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN | BH_TRASH)); 307 308 /* 309 * It's possible that the underlying file doesn't exist, either 310 * because of an outright removal or because it was a temporary 311 * file that's been closed. 312 * 313 * !!! 314 * Once we pass this point, we know that dbmfp and mfp aren't NULL, 315 * and that we have a valid file reference. 316 */ 317 if (mfp == NULL || mfp->deadfile) 318 goto file_dead; 319 320 /* 321 * If the page is in a file for which we have LSN information, we have 322 * to ensure the appropriate log records are on disk. 323 */ 324 if (LOGGING_ON(env) && mfp->lsn_off != DB_LSN_OFF_NOTSET && 325 !IS_CLIENT_PGRECOVER(env)) { 326 memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); 327 if (!IS_NOT_LOGGED_LSN(lsn) && 328 (ret = __log_flush(env, &lsn)) != 0) 329 goto err; 330 } 331 332#ifdef DIAGNOSTIC 333 /* 334 * Verify write-ahead logging semantics. 335 * 336 * !!! 337 * Two special cases. There is a single field on the meta-data page, 338 * the last-page-number-in-the-file field, for which we do not log 339 * changes. If the page was originally created in a database that 340 * didn't have logging turned on, we can see a page marked dirty but 341 * for which no corresponding log record has been written. However, 342 * the only way that a page can be created for which there isn't a 343 * previous log record and valid LSN is when the page was created 344 * without logging turned on, and so we check for that special-case 345 * LSN value. 346 * 347 * Second, when a client is reading database pages from a master 348 * during an internal backup, we may get pages modified after 349 * the current end-of-log. 350 */ 351 if (LOGGING_ON(env) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) && 352 !IS_CLIENT_PGRECOVER(env)) { 353 /* 354 * There is a potential race here. If we are in the midst of 355 * switching log files, it's possible we could test against the 356 * old file and the new offset in the log region's LSN. If we 357 * fail the first test, acquire the log mutex and check again. 358 */ 359 DB_LOG *dblp; 360 LOG *lp; 361 362 dblp = env->lg_handle; 363 lp = dblp->reginfo.primary; 364 if (!lp->db_log_inmemory && 365 LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) <= 0) { 366 MUTEX_LOCK(env, lp->mtx_flush); 367 DB_ASSERT(env, 368 LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) > 0); 369 MUTEX_UNLOCK(env, lp->mtx_flush); 370 } 371 } 372#endif 373 374 /* 375 * Call any pgout function. If we have the page exclusive then 376 * we are going to reuse it otherwise make a copy of the page so 377 * that others can continue looking at the page while we write it. 378 */ 379 buf = bhp->buf; 380 if (mfp->ftype != 0) { 381 if (F_ISSET(bhp, BH_EXCLUSIVE)) 382 F_SET(bhp, BH_TRASH); 383 else { 384 if ((ret = 385 __os_malloc(env, mfp->stat.st_pagesize, &buf)) != 0) 386 goto err; 387 memcpy(buf, bhp->buf, mfp->stat.st_pagesize); 388 } 389 if ((ret = __memp_pg(dbmfp, bhp->pgno, buf, 0)) != 0) 390 goto err; 391 } 392 393 /* Write the page. */ 394 if ((ret = __os_io( 395 env, DB_IO_WRITE, dbmfp->fhp, bhp->pgno, mfp->stat.st_pagesize, 396 0, mfp->stat.st_pagesize, buf, &nw)) != 0) { 397 __db_errx(env, "%s: write failed for page %lu", 398 __memp_fn(dbmfp), (u_long)bhp->pgno); 399 goto err; 400 } 401 STAT(++mfp->stat.st_page_out); 402 if (bhp->pgno > mfp->last_flushed_pgno) { 403 MUTEX_LOCK(env, mfp->mutex); 404 if (bhp->pgno > mfp->last_flushed_pgno) 405 mfp->last_flushed_pgno = bhp->pgno; 406 MUTEX_UNLOCK(env, mfp->mutex); 407 } 408 409err: 410file_dead: 411 if (buf != NULL && buf != bhp->buf) 412 __os_free(env, buf); 413 /* 414 * !!! 415 * Once we pass this point, dbmfp and mfp may be NULL, we may not have 416 * a valid file reference. 417 */ 418 419 /* 420 * Update the hash bucket statistics, reset the flags. If we were 421 * successful, the page is no longer dirty. Someone else may have 422 * also written the page so we need to latch the hash bucket here 423 * to get the accounting correct. Since we have the buffer 424 * shared it cannot be marked dirty again till we release it. 425 * This is the only place we update the flags field only holding 426 * a shared latch. 427 */ 428 if (F_ISSET(bhp, BH_DIRTY | BH_TRASH)) { 429 MUTEX_LOCK(env, hp->mtx_hash); 430 DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc)); 431 if (ret == 0 && F_ISSET(bhp, BH_DIRTY)) { 432 F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); 433 DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0); 434 atomic_dec(env, &hp->hash_page_dirty); 435 } 436 437 /* put the page back if necessary. */ 438 if ((ret != 0 || BH_REFCOUNT(bhp) > 1) && 439 F_ISSET(bhp, BH_TRASH)) { 440 ret = __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1); 441 F_CLR(bhp, BH_TRASH); 442 } 443 MUTEX_UNLOCK(env, hp->mtx_hash); 444 } 445 446 return (ret); 447} 448 449/* 450 * __memp_pg -- 451 * Call the pgin/pgout routine. 452 * 453 * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int)); 454 */ 455int 456__memp_pg(dbmfp, pgno, buf, is_pgin) 457 DB_MPOOLFILE *dbmfp; 458 db_pgno_t pgno; 459 void *buf; 460 int is_pgin; 461{ 462 DBT dbt, *dbtp; 463 DB_MPOOL *dbmp; 464 DB_MPREG *mpreg; 465 ENV *env; 466 MPOOLFILE *mfp; 467 int ftype, ret; 468 469 env = dbmfp->env; 470 dbmp = env->mp_handle; 471 mfp = dbmfp->mfp; 472 473 if ((ftype = mfp->ftype) == DB_FTYPE_SET) 474 mpreg = dbmp->pg_inout; 475 else { 476 MUTEX_LOCK(env, dbmp->mutex); 477 LIST_FOREACH(mpreg, &dbmp->dbregq, q) 478 if (ftype == mpreg->ftype) 479 break; 480 MUTEX_UNLOCK(env, dbmp->mutex); 481 } 482 if (mpreg == NULL) 483 return (0); 484 485 if (mfp->pgcookie_len == 0) 486 dbtp = NULL; 487 else { 488 DB_SET_DBT(dbt, R_ADDR( 489 dbmp->reginfo, mfp->pgcookie_off), mfp->pgcookie_len); 490 dbtp = &dbt; 491 } 492 493 if (is_pgin) { 494 if (mpreg->pgin != NULL && (ret = 495 mpreg->pgin(env->dbenv, pgno, buf, dbtp)) != 0) 496 goto err; 497 } else 498 if (mpreg->pgout != NULL && (ret = 499 mpreg->pgout(env->dbenv, pgno, buf, dbtp)) != 0) 500 goto err; 501 502 return (0); 503 504err: __db_errx(env, "%s: %s failed for page %lu", 505 __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)pgno); 506 return (ret); 507} 508 509/* 510 * __memp_bhfree -- 511 * Free a bucket header and its referenced data. 512 * 513 * PUBLIC: int __memp_bhfree __P((DB_MPOOL *, 514 * PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t)); 515 */ 516int 517__memp_bhfree(dbmp, infop, mfp, hp, bhp, flags) 518 DB_MPOOL *dbmp; 519 REGINFO *infop; 520 MPOOLFILE *mfp; 521 DB_MPOOL_HASH *hp; 522 BH *bhp; 523 u_int32_t flags; 524{ 525 ENV *env; 526#ifdef DIAGNOSTIC 527 DB_LSN vlsn; 528#endif 529 BH *prev_bhp; 530 MPOOL *c_mp; 531 int ret, t_ret; 532#ifdef DIAG_MVCC 533 size_t pagesize; 534#endif 535 536 ret = 0; 537 538 /* 539 * Assumes the hash bucket is locked and the MPOOL is not. 540 */ 541 env = dbmp->env; 542#ifdef DIAG_MVCC 543 if (mfp != NULL) 544 pagesize = mfp->stat.st_pagesize; 545#endif 546 547 DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) || 548 (hp != NULL && MUTEX_IS_OWNED(env, hp->mtx_hash))); 549 DB_ASSERT(env, BH_REFCOUNT(bhp) == 1 && 550 !F_ISSET(bhp, BH_DIRTY | BH_FROZEN)); 551 DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) || 552 SH_CHAIN_SINGLETON(bhp, vc) || (SH_CHAIN_HASNEXT(bhp, vc) && 553 (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off || 554 bhp->td_off == INVALID_ROFF || 555 IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) || 556 BH_OBSOLETE(bhp, hp->old_reader, vlsn)))); 557 558 /* 559 * Delete the buffer header from the hash bucket queue or the 560 * version chain. 561 */ 562 if (hp == NULL) 563 goto no_hp; 564 prev_bhp = SH_CHAIN_PREV(bhp, vc, __bh); 565 if (!SH_CHAIN_HASNEXT(bhp, vc)) { 566 if (prev_bhp != NULL) 567 SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, 568 bhp, prev_bhp, hq, __bh); 569 SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); 570 } 571 SH_CHAIN_REMOVE(bhp, vc, __bh); 572 573 /* 574 * Remove the reference to this buffer from the transaction that 575 * created it, if any. When the BH_FREE_UNLOCKED flag is set, we're 576 * discarding the environment, so the transaction region is already 577 * gone. 578 */ 579 if (bhp->td_off != INVALID_ROFF && !LF_ISSET(BH_FREE_UNLOCKED)) { 580 ret = __txn_remove_buffer( 581 env, BH_OWNER(env, bhp), hp->mtx_hash); 582 bhp->td_off = INVALID_ROFF; 583 } 584 585 /* 586 * We're going to use the memory for something else -- it had better be 587 * accessible. 588 */ 589no_hp: MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC); 590 591 /* 592 * Discard the hash bucket's mutex, it's no longer needed, and 593 * we don't want to be holding it when acquiring other locks. 594 */ 595 if (!LF_ISSET(BH_FREE_UNLOCKED)) 596 MUTEX_UNLOCK(env, hp->mtx_hash); 597 598 /* 599 * If we're only removing this header from the chain for reuse, we're 600 * done. 601 */ 602 if (LF_ISSET(BH_FREE_REUSE)) 603 return (ret); 604 605 /* 606 * If we're not reusing the buffer immediately, free the buffer for 607 * real. 608 */ 609 if (!LF_ISSET(BH_FREE_UNLOCKED)) 610 MUTEX_UNLOCK(env, bhp->mtx_buf); 611 if (LF_ISSET(BH_FREE_FREEMEM)) { 612 if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0) 613 return (ret); 614 MPOOL_REGION_LOCK(env, infop); 615 616 MVCC_BHUNALIGN(bhp); 617 __memp_free(infop, bhp); 618 c_mp = infop->primary; 619 c_mp->stat.st_pages--; 620 621 MPOOL_REGION_UNLOCK(env, infop); 622 } 623 624 if (mfp == NULL) 625 return (ret); 626 627 /* 628 * Decrement the reference count of the underlying MPOOLFILE. 629 * If this is its last reference, remove it. 630 */ 631 MUTEX_LOCK(env, mfp->mutex); 632 if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) { 633 if ((t_ret = __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) 634 ret = t_ret; 635 } else 636 MUTEX_UNLOCK(env, mfp->mutex); 637 638 return (ret); 639} 640