1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: mp_bh.c,v 12.43 2008/01/08 20:58:42 bostic Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" /* Required for diagnostic code. */ 13#include "dbinc/mp.h" 14#include "dbinc/log.h" 15#include "dbinc/txn.h" 16 17static int __memp_pgwrite 18 __P((ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *)); 19 20/* 21 * __memp_bhwrite -- 22 * Write the page associated with a given buffer header. 23 * 24 * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *, 25 * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int)); 26 */ 27int 28__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) 29 DB_MPOOL *dbmp; 30 DB_MPOOL_HASH *hp; 31 MPOOLFILE *mfp; 32 BH *bhp; 33 int open_extents; 34{ 35 DB_MPOOLFILE *dbmfp; 36 DB_MPREG *mpreg; 37 ENV *env; 38 int ret; 39 40 env = dbmp->env; 41 42 /* 43 * If the file has been removed or is a closed temporary file, we're 44 * done -- the page-write function knows how to handle the fact that 45 * we don't have (or need!) any real file descriptor information. 46 */ 47 if (mfp->deadfile) 48 return (__memp_pgwrite(env, NULL, hp, bhp)); 49 50 /* 51 * Walk the process' DB_MPOOLFILE list and find a file descriptor for 52 * the file. We also check that the descriptor is open for writing. 53 */ 54 MUTEX_LOCK(env, dbmp->mutex); 55 TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) 56 if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) { 57 ++dbmfp->ref; 58 break; 59 } 60 MUTEX_UNLOCK(env, dbmp->mutex); 61 62 if (dbmfp != NULL) { 63 /* 64 * Temporary files may not have been created. We only handle 65 * temporary files in this path, because only the process that 66 * created a temporary file will ever flush buffers to it. 67 */ 68 if (dbmfp->fhp == NULL) { 69 /* We may not be allowed to create backing files. */ 70 if (mfp->no_backing_file) { 71 --dbmfp->ref; 72 return (EPERM); 73 } 74 75 MUTEX_LOCK(env, dbmp->mutex); 76 if (dbmfp->fhp == NULL) 77 ret = __db_appname(env, DB_APP_TMP, NULL, 78 F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? 79 DB_OSO_DIRECT : 0, &dbmfp->fhp, NULL); 80 else 81 ret = 0; 82 MUTEX_UNLOCK(env, dbmp->mutex); 83 if (ret != 0) { 84 __db_errx(env, 85 "unable to create temporary backing file"); 86 --dbmfp->ref; 87 return (ret); 88 } 89 } 90 91 goto pgwrite; 92 } 93 94 /* 95 * There's no file handle for this file in our process. 96 * 97 * !!! 98 * It's the caller's choice if we're going to open extent files. 99 */ 100 if (!open_extents && F_ISSET(mfp, MP_EXTENT)) 101 return (EPERM); 102 103 /* 104 * !!! 105 * Don't try to attach to temporary files. There are two problems in 106 * trying to do that. First, if we have different privileges than the 107 * process that "owns" the temporary file, we might create the backing 108 * disk file such that the owning process couldn't read/write its own 109 * buffers, e.g., memp_trickle running as root creating a file owned 110 * as root, mode 600. Second, if the temporary file has already been 111 * created, we don't have any way of finding out what its real name is, 112 * and, even if we did, it was already unlinked (so that it won't be 113 * left if the process dies horribly). This decision causes a problem, 114 * however: if the temporary file consumes the entire buffer cache, 115 * and the owner doesn't flush the buffers to disk, we could end up 116 * with resource starvation, and the memp_trickle thread couldn't do 117 * anything about it. That's a pretty unlikely scenario, though. 118 * 119 * Note we should never get here when the temporary file in question 120 * has already been closed in another process, in which case it should 121 * be marked dead. 122 */ 123 if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file) 124 return (EPERM); 125 126 /* 127 * It's not a page from a file we've opened. If the file requires 128 * application-specific input/output processing, see if this process 129 * has ever registered information as to how to write this type of 130 * file. If not, there's nothing we can do. 131 */ 132 if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) { 133 MUTEX_LOCK(env, dbmp->mutex); 134 LIST_FOREACH(mpreg, &dbmp->dbregq, q) 135 if (mpreg->ftype == mfp->ftype) 136 break; 137 MUTEX_UNLOCK(env, dbmp->mutex); 138 if (mpreg == NULL) 139 return (EPERM); 140 } 141 142 /* 143 * Try and open the file, specifying the known underlying shared area. 144 * 145 * !!! 146 * There's no negative cache, so we may repeatedly try and open files 147 * that we have previously tried (and failed) to open. 148 */ 149 if ((ret = __memp_fcreate(env, &dbmfp)) != 0) 150 return (ret); 151 if ((ret = __memp_fopen(dbmfp, 152 mfp, NULL, DB_DURABLE_UNKNOWN, 0, mfp->stat.st_pagesize)) != 0) { 153 (void)__memp_fclose(dbmfp, 0); 154 155 /* 156 * Ignore any error if the file is marked dead, assume the file 157 * was removed from under us. 158 */ 159 if (!mfp->deadfile) 160 return (ret); 161 162 dbmfp = NULL; 163 } 164 165pgwrite: 166 MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 167 PROT_READ | PROT_WRITE | PROT_EXEC); 168 ret = __memp_pgwrite(env, dbmfp, hp, bhp); 169 if (dbmfp == NULL) 170 return (ret); 171 172 /* 173 * Discard our reference, and, if we're the last reference, make sure 174 * the file eventually gets closed. 175 */ 176 MUTEX_LOCK(env, dbmp->mutex); 177 if (dbmfp->ref == 1) 178 F_SET(dbmfp, MP_FLUSH); 179 else 180 --dbmfp->ref; 181 MUTEX_UNLOCK(env, dbmp->mutex); 182 183 return (ret); 184} 185 186/* 187 * __memp_pgread -- 188 * Read a page from a file. 189 * 190 * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *, int)); 191 */ 192int 193__memp_pgread(dbmfp, hp, bhp, can_create) 194 DB_MPOOLFILE *dbmfp; 195 DB_MPOOL_HASH *hp; 196 BH *bhp; 197 int can_create; 198{ 199 ENV *env; 200 MPOOLFILE *mfp; 201 size_t len, nr; 202 u_int32_t pagesize; 203 int ret; 204 205 env = dbmfp->env; 206 mfp = dbmfp->mfp; 207 pagesize = mfp->stat.st_pagesize; 208 209 /* We should never be called with a dirty or a locked buffer. */ 210 DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY_CREATE | BH_LOCKED)); 211 DB_ASSERT(env, can_create || !F_ISSET(bhp, BH_DIRTY)); 212 213 /* Lock the buffer and unlock the hash bucket. */ 214 F_SET(bhp, BH_LOCKED | BH_TRASH); 215 MUTEX_UNLOCK(env, hp->mtx_hash); 216 217 /* 218 * Temporary files may not yet have been created. We don't create 219 * them now, we create them when the pages have to be flushed. 220 */ 221 nr = 0; 222 if (dbmfp->fhp != NULL) 223 if ((ret = __os_io(env, DB_IO_READ, dbmfp->fhp, 224 bhp->pgno, pagesize, 0, pagesize, bhp->buf, &nr)) != 0) 225 goto err; 226 227 /* 228 * The page may not exist; if it doesn't, nr may well be 0, but we 229 * expect the underlying OS calls not to return an error code in 230 * this case. 231 */ 232 if (nr < pagesize) { 233 /* 234 * Don't output error messages for short reads. In particular, 235 * DB recovery processing may request pages never written to 236 * disk or for which only some part have been written to disk, 237 * in which case we won't find the page. The caller must know 238 * how to handle the error. 239 */ 240 if (!can_create) { 241 ret = DB_PAGE_NOTFOUND; 242 goto err; 243 } 244 245 /* Clear any bytes that need to be cleared. */ 246 len = mfp->clear_len == DB_CLEARLEN_NOTSET ? 247 pagesize : mfp->clear_len; 248 memset(bhp->buf, 0, len); 249 250#if defined(DIAGNOSTIC) || defined(UMRW) 251 /* 252 * If we're running in diagnostic mode, corrupt any bytes on 253 * the page that are unknown quantities for the caller. 254 */ 255 if (len < pagesize) 256 memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); 257#endif 258#ifdef HAVE_STATISTICS 259 ++mfp->stat.st_page_create; 260 } else 261 ++mfp->stat.st_page_in; 262#else 263 } 264#endif 265 266 /* Call any pgin function. */ 267 ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); 268 269 /* Re-acquire the hash bucket lock. */ 270err: MUTEX_LOCK(env, hp->mtx_hash); 271 272 /* 273 * If no errors occurred, the data is now valid, clear the BH_TRASH 274 * flag; regardless, clear the lock bit and let other threads proceed. 275 */ 276 F_CLR(bhp, BH_LOCKED); 277 if (ret == 0) 278 F_CLR(bhp, BH_TRASH); 279 280 /* 281 * If a thread of control is waiting on this buffer, wake it up. 282 */ 283 if (F_ISSET(hp, IO_WAITER)) { 284 F_CLR(hp, IO_WAITER); 285 MUTEX_UNLOCK(env, hp->mtx_io); 286 } 287 288 return (ret); 289} 290 291/* 292 * __memp_pgwrite -- 293 * Write a page to a file. 294 */ 295static int 296__memp_pgwrite(env, dbmfp, hp, bhp) 297 ENV *env; 298 DB_MPOOLFILE *dbmfp; 299 DB_MPOOL_HASH *hp; 300 BH *bhp; 301{ 302 DB_LSN lsn; 303 MPOOLFILE *mfp; 304 size_t nw; 305 int callpgin, ret; 306 307 mfp = dbmfp == NULL ? NULL : dbmfp->mfp; 308 callpgin = ret = 0; 309 310 /* We should never be called with a clean or trash buffer. */ 311 DB_ASSERT(env, F_ISSET(bhp, BH_DIRTY)); 312 DB_ASSERT(env, !F_ISSET(bhp, BH_TRASH)); 313 314 /* 315 * The sync code has already locked the buffer, but the allocation 316 * code has not. Lock the buffer and release the hash bucket mutex. 317 */ 318 F_SET(bhp, BH_LOCKED); 319 MUTEX_UNLOCK(env, hp->mtx_hash); 320 321 /* 322 * It's possible that the underlying file doesn't exist, either 323 * because of an outright removal or because it was a temporary 324 * file that's been closed. 325 * 326 * !!! 327 * Once we pass this point, we know that dbmfp and mfp aren't NULL, 328 * and that we have a valid file reference. 329 */ 330 if (mfp == NULL || mfp->deadfile) 331 goto file_dead; 332 333 /* 334 * If the page is in a file for which we have LSN information, we have 335 * to ensure the appropriate log records are on disk. 336 */ 337 if (LOGGING_ON(env) && mfp->lsn_off != DB_LSN_OFF_NOTSET && 338 !IS_CLIENT_PGRECOVER(env)) { 339 memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); 340 if (!IS_NOT_LOGGED_LSN(lsn) && 341 (ret = __log_flush(env, &lsn)) != 0) 342 goto err; 343 } 344 345#ifdef DIAGNOSTIC 346 /* 347 * Verify write-ahead logging semantics. 348 * 349 * !!! 350 * Two special cases. There is a single field on the meta-data page, 351 * the last-page-number-in-the-file field, for which we do not log 352 * changes. If the page was originally created in a database that 353 * didn't have logging turned on, we can see a page marked dirty but 354 * for which no corresponding log record has been written. However, 355 * the only way that a page can be created for which there isn't a 356 * previous log record and valid LSN is when the page was created 357 * without logging turned on, and so we check for that special-case 358 * LSN value. 359 * 360 * Second, when a client is reading database pages from a master 361 * during an internal backup, we may get pages modified after 362 * the current end-of-log. 363 */ 364 if (LOGGING_ON(env) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) && 365 !IS_CLIENT_PGRECOVER(env)) { 366 /* 367 * There is a potential race here. If we are in the midst of 368 * switching log files, it's possible we could test against the 369 * old file and the new offset in the log region's LSN. If we 370 * fail the first test, acquire the log mutex and check again. 371 */ 372 DB_LOG *dblp; 373 LOG *lp; 374 375 dblp = env->lg_handle; 376 lp = dblp->reginfo.primary; 377 if (!lp->db_log_inmemory && 378 LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) <= 0) { 379 MUTEX_LOCK(env, lp->mtx_flush); 380 DB_ASSERT(env, 381 LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) > 0); 382 MUTEX_UNLOCK(env, lp->mtx_flush); 383 } 384 } 385#endif 386 387 /* 388 * Call any pgout function. We set the callpgin flag so that we flag 389 * that the contents of the buffer will need to be passed through pgin 390 * before they are reused. 391 */ 392 if (mfp->ftype != 0 && !F_ISSET(bhp, BH_CALLPGIN)) { 393 callpgin = 1; 394 if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) 395 goto err; 396 } 397 398 /* Write the page. */ 399 if ((ret = __os_io( 400 env, DB_IO_WRITE, dbmfp->fhp, bhp->pgno, mfp->stat.st_pagesize, 401 0, mfp->stat.st_pagesize, bhp->buf, &nw)) != 0) { 402 __db_errx(env, "%s: write failed for page %lu", 403 __memp_fn(dbmfp), (u_long)bhp->pgno); 404 goto err; 405 } 406 STAT(++mfp->stat.st_page_out); 407 if (bhp->pgno > mfp->last_flushed_pgno) { 408 MUTEX_LOCK(env, mfp->mutex); 409 if (bhp->pgno > mfp->last_flushed_pgno) 410 mfp->last_flushed_pgno = bhp->pgno; 411 MUTEX_UNLOCK(env, mfp->mutex); 412 } 413 414err: 415file_dead: 416 /* 417 * !!! 418 * Once we pass this point, dbmfp and mfp may be NULL, we may not have 419 * a valid file reference. 420 * 421 * Re-acquire the hash lock. 422 */ 423 MUTEX_LOCK(env, hp->mtx_hash); 424 425 /* 426 * If we rewrote the page, it will need processing by the pgin 427 * routine before reuse. 428 */ 429 if (callpgin) 430 F_SET(bhp, BH_CALLPGIN); 431 432 /* 433 * Update the hash bucket statistics, reset the flags. If we were 434 * successful, the page is no longer dirty. 435 */ 436 if (ret == 0) { 437 DB_ASSERT(env, hp->hash_page_dirty != 0); 438 --hp->hash_page_dirty; 439 F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); 440 } 441 442 /* Regardless, clear any sync wait-for count and remove our lock. */ 443 bhp->ref_sync = 0; 444 F_CLR(bhp, BH_LOCKED); 445 446 /* 447 * If a thread of control is waiting on this buffer, wake it up. 448 */ 449 if (F_ISSET(hp, IO_WAITER)) { 450 F_CLR(hp, IO_WAITER); 451 MUTEX_UNLOCK(env, hp->mtx_io); 452 } 453 454 return (ret); 455} 456 457/* 458 * __memp_pg -- 459 * Call the pgin/pgout routine. 460 * 461 * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); 462 */ 463int 464__memp_pg(dbmfp, bhp, is_pgin) 465 DB_MPOOLFILE *dbmfp; 466 BH *bhp; 467 int is_pgin; 468{ 469 DBT dbt, *dbtp; 470 DB_MPOOL *dbmp; 471 DB_MPREG *mpreg; 472 ENV *env; 473 MPOOLFILE *mfp; 474 int ftype, ret; 475 476 env = dbmfp->env; 477 dbmp = env->mp_handle; 478 mfp = dbmfp->mfp; 479 480 if ((ftype = mfp->ftype) == DB_FTYPE_SET) 481 mpreg = dbmp->pg_inout; 482 else { 483 MUTEX_LOCK(env, dbmp->mutex); 484 LIST_FOREACH(mpreg, &dbmp->dbregq, q) 485 if (ftype == mpreg->ftype) 486 break; 487 MUTEX_UNLOCK(env, dbmp->mutex); 488 } 489 if (mpreg == NULL) 490 return (0); 491 492 if (mfp->pgcookie_len == 0) 493 dbtp = NULL; 494 else { 495 DB_SET_DBT(dbt, R_ADDR( 496 dbmp->reginfo, mfp->pgcookie_off), mfp->pgcookie_len); 497 dbtp = &dbt; 498 } 499 500 if (is_pgin) { 501 if (mpreg->pgin != NULL && (ret = 502 mpreg->pgin(env->dbenv, bhp->pgno, bhp->buf, dbtp)) != 0) 503 goto err; 504 } else 505 if (mpreg->pgout != NULL && (ret = 506 mpreg->pgout(env->dbenv, bhp->pgno, bhp->buf, dbtp)) != 0) 507 goto err; 508 509 return (0); 510 511err: __db_errx(env, "%s: %s failed for page %lu", 512 __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); 513 return (ret); 514} 515 516/* 517 * __memp_bhfree -- 518 * Free a bucket header and its referenced data. 519 * 520 * PUBLIC: int __memp_bhfree 521 * PUBLIC: __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, u_int32_t)); 522 */ 523int 524__memp_bhfree(dbmp, infop, hp, bhp, flags) 525 DB_MPOOL *dbmp; 526 REGINFO *infop; 527 DB_MPOOL_HASH *hp; 528 BH *bhp; 529 u_int32_t flags; 530{ 531 ENV *env; 532#ifdef DIAGNOSTIC 533 DB_LSN vlsn; 534#endif 535 BH *prev_bhp; 536 MPOOL *c_mp; 537 MPOOLFILE *mfp; 538 int ret, t_ret; 539#ifdef DIAG_MVCC 540 size_t pagesize; 541#endif 542 543 ret = 0; 544 545 /* 546 * Assumes the hash bucket is locked and the MPOOL is not. 547 */ 548 env = dbmp->env; 549 mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); 550#ifdef DIAG_MVCC 551 pagesize = mfp->stat.st_pagesize; 552#endif 553 554 DB_ASSERT(env, bhp->ref == 0 && !F_ISSET(bhp, BH_FROZEN)); 555 DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) || 556 SH_CHAIN_SINGLETON(bhp, vc) || 557 (SH_CHAIN_HASNEXT(bhp, vc) && 558 SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off) || 559 (SH_CHAIN_HASPREV(bhp, vc) ? 560 IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) : 561 BH_OBSOLETE(bhp, hp->old_reader, vlsn))); 562 563 /* 564 * Delete the buffer header from the hash bucket queue or the 565 * version chain. 566 */ 567 prev_bhp = SH_CHAIN_PREV(bhp, vc, __bh); 568 if (SH_CHAIN_NEXT(bhp, vc, __bh) == NULL) { 569 if (prev_bhp != NULL) 570 SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, 571 bhp, prev_bhp, hq, __bh); 572 SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); 573 } 574 SH_CHAIN_REMOVE(bhp, vc, __bh); 575 576 /* 577 * Remove the reference to this buffer from the transaction that 578 * created it, if any. When the BH_FREE_UNLOCKED flag is set, we're 579 * discarding the environment, so the transaction region is already 580 * gone. 581 */ 582 if (bhp->td_off != INVALID_ROFF && !LF_ISSET(BH_FREE_UNLOCKED)) { 583 ret = __txn_remove_buffer( 584 env, BH_OWNER(env, bhp), hp->mtx_hash); 585 bhp->td_off = INVALID_ROFF; 586 } 587 588 /* 589 * We're going to use the memory for something else -- it had better be 590 * accessible. 591 */ 592 MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC); 593 594 /* 595 * If we're only removing this header from the chain for reuse, we're 596 * done. 597 */ 598 if (LF_ISSET(BH_FREE_REUSE)) 599 return (0); 600 601 /* 602 * Discard the hash bucket's mutex, it's no longer needed, and 603 * we don't want to be holding it when acquiring other locks. 604 */ 605 if (!LF_ISSET(BH_FREE_UNLOCKED)) 606 MUTEX_UNLOCK(env, hp->mtx_hash); 607 608 /* 609 * If we're not reusing the buffer immediately, free the buffer for 610 * real. 611 */ 612 if (LF_ISSET(BH_FREE_FREEMEM)) { 613 MPOOL_REGION_LOCK(env, infop); 614 615 __memp_free(infop, mfp, bhp); 616 c_mp = infop->primary; 617 c_mp->stat.st_pages--; 618 619 MPOOL_REGION_UNLOCK(env, infop); 620 } 621 622 /* 623 * Decrement the reference count of the underlying MPOOLFILE. 624 * If this is its last reference, remove it. 625 */ 626 MUTEX_LOCK(env, mfp->mutex); 627 if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) { 628 if ((t_ret = __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) 629 ret = t_ret; 630 } else 631 MUTEX_UNLOCK(env, mfp->mutex); 632 633 return (ret); 634} 635