/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"

/*
 * BH_TRACK --
 *	Snapshot of a dirty buffer taken during the scan pass of
 *	__memp_sync_int; used to revisit (and write) the buffer later
 *	without holding hash-bucket mutexes across the whole sync.
 */
typedef struct {
	DB_MPOOL_HASH *track_hp;	/* Hash bucket. */

	roff_t	  track_off;		/* Page file offset. */
	db_pgno_t track_pgno;		/* Page number. */
} BH_TRACK;

static int __bhcmp __P((const void *, const void *));
static int __memp_close_flush_files __P((ENV *, int));
static int __memp_sync_files __P((ENV *));
static int __memp_sync_file __P((ENV *,
	MPOOLFILE *, void *, u_int32_t *, u_int32_t));

/*
 * __memp_walk_files --
 *	Walk the file table, invoking a callback on each MPOOLFILE while
 *	holding that file's hash-bucket mutex.  With DB_STAT_MEMP_NOERROR
 *	the walk continues past callback failures (first error is still
 *	returned); otherwise the walk stops at the first error.
 *
 * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *,
 * PUBLIC:	int (*) __P((ENV *, MPOOLFILE *, void *,
 * PUBLIC:	u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
 */
int
__memp_walk_files(env, mp, func, arg, countp, flags)
	ENV *env;
	MPOOL *mp;
	int (*func)__P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t));
	void *arg;
	u_int32_t *countp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOLFILE *mfp;
	int i, ret, t_ret;

	dbmp = env->mp_handle;
	ret = 0;

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
		MUTEX_LOCK(env, hp->mtx_hash);
		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
			/* Remember the first error, keep walking if asked. */
			if ((t_ret = func(env,
			    mfp, arg, countp, flags)) != 0 && ret == 0)
				ret = t_ret;
			if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
				break;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
		if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
			break;
	}
	return (ret);
}

/*
 * __memp_sync_pp --
 *	ENV->memp_sync pre/post processing.
 *
 * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
 */
int
__memp_sync_pp(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbenv->env;

	ENV_REQUIRES_CONFIG(env,
	    env->mp_handle, "memp_sync", DB_INIT_MPOOL);

	/*
	 * If no LSN is provided, flush the entire cache (reasonable usage
	 * even if there's no log subsystem configured).
	 */
	if (lsnp != NULL)
		ENV_REQUIRES_CONFIG(env,
		    env->lg_handle, "memp_sync", DB_INIT_LOG);

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_sync(env, DB_SYNC_CACHE, lsnp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_sync --
 *	ENV->memp_sync.
 *
 * PUBLIC: int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
 */
int
__memp_sync(env, flags, lsnp)
	ENV *env;
	u_int32_t flags;
	DB_LSN *lsnp;
{
	DB_MPOOL *dbmp;
	MPOOL *mp;
	int interrupted, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;

	/* If we've flushed to the requested LSN, return that information. */
	if (lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) <= 0) {
			*lsnp = mp->lsn;

			MPOOL_SYSTEM_UNLOCK(env);
			return (0);
		}
		MPOOL_SYSTEM_UNLOCK(env);
	}

	if ((ret =
	    __memp_sync_int(env, NULL, 0, flags, NULL, &interrupted)) != 0)
		return (ret);

	/*
	 * Only advance the cache's flushed-LSN if the sync ran to
	 * completion; an interrupted sync makes no progress guarantee.
	 */
	if (!interrupted && lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
			mp->lsn = *lsnp;
		MPOOL_SYSTEM_UNLOCK(env);
	}

	return (0);
}

/*
 * __memp_fsync_pp --
 *	DB_MPOOLFILE->sync pre/post processing.
 *
 * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *));
 */
int
__memp_fsync_pp(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbmfp->env;

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync");

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_fsync(dbmfp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_fsync --
 *	DB_MPOOLFILE->sync.
 *
 * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
 */
int
__memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	MPOOLFILE *mfp;

	mfp = dbmfp->mfp;

	/*
	 * If this handle doesn't have a file descriptor that's open for
	 * writing, or if the file is a temporary, or if the file hasn't
	 * been written since it was flushed, there's no reason to proceed
	 * further.
	 */
	if (F_ISSET(dbmfp, MP_READONLY))
		return (0);

	if (F_ISSET(dbmfp->mfp, MP_TEMP) || dbmfp->mfp->no_backing_file)
		return (0);

	if (mfp->file_written == 0)
		return (0);

	return (__memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
}

/*
 * __mp_xxx_fh --
 *	Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
 */
int
__mp_xxx_fh(dbmfp, fhp)
	DB_MPOOLFILE *dbmfp;
	DB_FH **fhp;
{
	int ret;

	/*
	 * This is a truly spectacular layering violation, intended ONLY to
	 * support compatibility for the DB 1.85 DB->fd call.
	 *
	 * Sync the database file to disk, creating the file as necessary.
	 *
	 * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
	 * The MP_READONLY test isn't interesting because we will either
	 * already have a file descriptor (we opened the database file for
	 * reading) or we aren't readonly (we created the database which
	 * requires write privileges).
 * The MP_TEMP test isn't interesting
	 * because we want to write to the backing file regardless so that
	 * we get a file descriptor to return.
	 */
	if ((*fhp = dbmfp->fhp) != NULL)
		return (0);

	/* Syncing the file creates it (and its handle) as a side effect. */
	if ((ret = __memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
		*fhp = dbmfp->fhp;
	return (ret);
}

/*
 * __memp_sync_int --
 *	Mpool sync internal function.
 *
 * PUBLIC: int __memp_sync_int __P((ENV *,
 * PUBLIC:     DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
 */
int
__memp_sync_int(env, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
	ENV *env;
	DB_MPOOLFILE *dbmfp;
	u_int32_t trickle_max, flags, *wrote_totalp;
	int *interruptedp;
{
	BH *bhp;
	BH_TRACK *bharray;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	db_mutex_t mutex;
	roff_t last_mf_offset;
	u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote_total;
	int dirty, filecnt, maxopenfd, required_write, ret, t_ret;
	int wrote_cnt;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	last_mf_offset = INVALID_ROFF;
	filecnt = wrote_total = 0;

	if (wrote_totalp != NULL)
		*wrote_totalp = 0;
	if (interruptedp != NULL)
		*interruptedp = 0;

	/*
	 * If we're flushing the cache, it's a checkpoint or we're flushing a
	 * specific file, we really have to write the blocks and we have to
	 * confirm they made it to disk.  Otherwise, we can skip a block if
	 * it's hard to get.
	 */
	required_write = LF_ISSET(DB_SYNC_CACHE |
	    DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);

	/* Get shared configuration information. */
	MPOOL_SYSTEM_LOCK(env);
	maxopenfd = mp->mp_maxopenfd;
	MPOOL_SYSTEM_UNLOCK(env);

	/* Assume one dirty page per bucket.
 */
	ar_max = mp->nreg * mp->htab_buckets;
	if ((ret =
	    __os_malloc(env, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
		return (ret);

	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written and all dirty buffers to be potentially written, depending
	 * on our flags.
	 */
	for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
		c_mp = dbmp->reginfo[n_cache].primary;

		hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
		for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
			/*
			 * We can check for empty buckets before locking as
			 * we only care if the pointer is zero or non-zero.
			 * We can ignore empty or clean buckets because we
			 * only need write buffers that were dirty before
			 * we started.
			 */
#ifdef DIAGNOSTIC
			if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
#else
			if (atomic_read(&hp->hash_page_dirty) == 0)
#endif
				continue;

			dirty = 0;
			MUTEX_LOCK(env, hp->mtx_hash);
			SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
				/* Always ignore clean pages. */
				if (!F_ISSET(bhp, BH_DIRTY))
					continue;

				dirty++;
				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

				/*
				 * Ignore in-memory files, unless the file is
				 * specifically being flushed.
				 */
				if (mfp->no_backing_file)
					continue;
				if (!LF_ISSET(DB_SYNC_FILE) &&
				    F_ISSET(mfp, MP_TEMP))
					continue;

				/*
				 * Ignore files that aren't involved in DB's
				 * transactional operations during checkpoints.
				 */
				if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
				    mfp->lsn_off == DB_LSN_OFF_NOTSET)
					continue;

				/*
				 * Ignore files that aren't Queue extent files
				 * if we're flushing a Queue file with extents.
				 */
				if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
				    !F_ISSET(mfp, MP_EXTENT))
					continue;

				/*
				 * If we're flushing a specific file, see if
				 * this page is from that file.
				 */
				if (dbmfp != NULL && mfp != dbmfp->mfp)
					continue;

				/* Track the buffer, we want it. */
				bharray[ar_cnt].track_hp = hp;
				bharray[ar_cnt].track_pgno = bhp->pgno;
				bharray[ar_cnt].track_off = bhp->mf_offset;
				ar_cnt++;

				/*
				 * If we run out of space, double and continue.
				 * Don't stop at trickle_max, we want to sort
				 * as large a sample set as possible in order
				 * to minimize disk seeks.
				 */
				if (ar_cnt >= ar_max) {
					if ((ret = __os_realloc(env,
					    (ar_max * 2) * sizeof(BH_TRACK),
					    &bharray)) != 0)
						break;
					ar_max *= 2;
				}
			}

			if (ret != 0)
				goto err;
			/*
			 * We are only checking this in diagnostic mode
			 * since it requires extra latching to keep the count
			 * in sync with the number of bits counted.
			 */
			DB_ASSERT(env,
			    dirty == (int)atomic_read(&hp->hash_page_dirty));
			MUTEX_UNLOCK(env, hp->mtx_hash);

			/* Check if the call has been interrupted. */
			if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
			    mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
				STAT(++mp->stat.st_sync_interrupted);
				if (interruptedp != NULL)
					*interruptedp = 1;
				goto err;
			}
		}
	}

	/* If there are no buffers to write, we're done. */
	if (ar_cnt == 0)
		goto done;

	/*
	 * Write the buffers in file/page order, trying to reduce seeks by the
	 * filesystem and, when pages are smaller than filesystem block sizes,
	 * reduce the actual number of writes.
	 */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);

	/*
	 * If we're trickling buffers, only write enough to reach the correct
	 * percentage.
	 */
	if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
		ar_cnt = trickle_max;

	/*
	 * Flush the log.  We have to ensure the log records reflecting the
	 * changes on the database pages we're writing have already made it
	 * to disk.  We still have to check the log each time we write a page
	 * (because pages we are about to write may be modified after we have
	 * flushed the log), but in general this will at least avoid any I/O
	 * on the log's part.
	 */
	if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
		goto err;

	/*
	 * Walk the array, writing buffers.  When we write a buffer, we NULL
	 * out its hash bucket pointer so we don't process a slot more than
	 * once.
	 */
	for (i = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
		/* Wrap around, yielding so skipped buffers can be released. */
		if (i >= ar_cnt) {
			i = 0;
			__os_yield(env, 1, 0);
		}
		if ((hp = bharray[i].track_hp) == NULL)
			continue;

		/* Lock the hash bucket and find the buffer. */
		mutex = hp->mtx_hash;
		MUTEX_READLOCK(env, mutex);
		SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
			if (bhp->pgno == bharray[i].track_pgno &&
			    bhp->mf_offset == bharray[i].track_off)
				break;

		/*
		 * If we can't find the buffer we're done, somebody else had
		 * to have written it.
		 *
		 * If the buffer isn't dirty, we're done, there's no work
		 * needed.
		 */
		if (bhp == NULL || !F_ISSET(bhp, BH_DIRTY)) {
			MUTEX_UNLOCK(env, mutex);
			--remaining;
			bharray[i].track_hp = NULL;
			continue;
		}

		/*
		 * If the buffer is locked by another thread, ignore it, we'll
		 * come back to it.
		 */
		if (F_ISSET(bhp, BH_EXCLUSIVE)) {
			MUTEX_UNLOCK(env, mutex);
			if (!required_write) {
				--remaining;
				bharray[i].track_hp = NULL;
			}
			continue;
		}

		/* Pin the buffer into memory. */
		atomic_inc(env, &bhp->ref);
		MUTEX_UNLOCK(env, mutex);
		MUTEX_READLOCK(env, bhp->mtx_buf);
		DB_ASSERT(env, !F_ISSET(bhp, BH_EXCLUSIVE));

		/*
		 * When swapping the hash bucket mutex for the buffer mutex,
		 * we may have raced with an MVCC update.  In that case, we
		 * no longer have the most recent version, and need to retry
		 * (the buffer header we have pinned will no longer be marked
		 * dirty, so we can't just write it).
		 */
		if (SH_CHAIN_HASNEXT(bhp, vc)) {
			atomic_dec(env, &bhp->ref);
			MUTEX_UNLOCK(env, bhp->mtx_buf);
			continue;
		}

		/* We will dispose of this buffer. */
		--remaining;
		bharray[i].track_hp = NULL;

		/*
		 * If we've switched files, check to see if we're configured
		 * to close file descriptors.
		 */
		if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) {
			if (++filecnt >= maxopenfd) {
				filecnt = 0;
				if ((t_ret = __memp_close_flush_files(
				    env, 1)) != 0 && ret == 0)
					ret = t_ret;
			}
			last_mf_offset = bhp->mf_offset;
		}

		/*
		 * If the buffer is dirty, we write it.  We only try to
		 * write the buffer once.
		 */
		if (F_ISSET(bhp, BH_DIRTY)) {
			mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
			if ((t_ret =
			    __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
				++wrote_cnt;
				++wrote_total;
			} else {
				if (ret == 0)
					ret = t_ret;
				__db_errx
				    (env, "%s: unable to flush page: %lu",
				    __memp_fns(dbmp, mfp), (u_long)bhp->pgno);

			}
		}

		/* Discard our buffer reference. */
		DB_ASSERT(env, atomic_read(&bhp->ref) > 0);
		atomic_dec(env, &bhp->ref);
		MUTEX_UNLOCK(env, bhp->mtx_buf);

		/* Check if the call has been interrupted. */
		if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
		    FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
			STAT(++mp->stat.st_sync_interrupted);
			if (interruptedp != NULL)
				*interruptedp = 1;
			goto err;
		}

		/*
		 * Sleep after some number of writes to avoid disk saturation.
		 * Don't cache the max writes value, an application shutting
		 * down might reset the value in order to do a fast flush or
		 * checkpoint.
562 */ 563 if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) && 564 !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) && 565 mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) { 566 wrote_cnt = 0; 567 __os_yield(env, 0, (u_long)mp->mp_maxwrite_sleep); 568 } 569 } 570 571done: /* 572 * If a write is required, we have to force the pages to disk. We 573 * don't do this as we go along because we want to give the OS as 574 * much time as possible to lazily flush, and because we have to flush 575 * files that might not even have had dirty buffers in the cache, so 576 * we have to walk the files list. 577 */ 578 if (ret == 0 && required_write) { 579 if (dbmfp == NULL) 580 ret = __memp_sync_files(env); 581 else 582 ret = __os_fsync(env, dbmfp->fhp); 583 } 584 585 /* If we've opened files to flush pages, close them. */ 586 if ((t_ret = __memp_close_flush_files(env, 0)) != 0 && ret == 0) 587 ret = t_ret; 588 589err: __os_free(env, bharray); 590 if (wrote_totalp != NULL) 591 *wrote_totalp = wrote_total; 592 593 return (ret); 594} 595 596static int 597__memp_sync_file(env, mfp, argp, countp, flags) 598 ENV *env; 599 MPOOLFILE *mfp; 600 void *argp; 601 u_int32_t *countp; 602 u_int32_t flags; 603{ 604 DB_MPOOL *dbmp; 605 DB_MPOOLFILE *dbmfp; 606 int ret, t_ret; 607 608 COMPQUIET(countp, NULL); 609 COMPQUIET(flags, 0); 610 611 if (!mfp->file_written || mfp->no_backing_file || 612 mfp->deadfile || F_ISSET(mfp, MP_TEMP)) 613 return (0); 614 /* 615 * Pin the MPOOLFILE structure into memory, and release the 616 * region mutex allowing us to walk the linked list. We'll 617 * re-acquire that mutex to move to the next entry in the list. 618 * 619 * This works because we only need to flush current entries, 620 * we don't care about new entries being added, and the linked 621 * list is never re-ordered, a single pass is sufficient. 
 * It requires MPOOLFILE structures removed before we get to them
	 * be flushed to disk, but that's nothing new, they could have
	 * been removed while checkpoint was running, too.
	 *
	 * Once we have the MPOOLFILE lock, re-check the MPOOLFILE is
	 * not being discarded.  (A thread removing the MPOOLFILE
	 * will: hold the MPOOLFILE mutex, set deadfile, drop the
	 * MPOOLFILE mutex and then acquire the region MUTEX to walk
	 * the linked list and remove the MPOOLFILE structure.  Make
	 * sure the MPOOLFILE wasn't marked dead while we waited for
	 * the mutex.
	 */
	MUTEX_LOCK(env, mfp->mutex);
	if (!mfp->file_written || mfp->deadfile) {
		MUTEX_UNLOCK(env, mfp->mutex);
		return (0);
	}
	++mfp->mpf_cnt;
	MUTEX_UNLOCK(env, mfp->mutex);

	/*
	 * Look for an already open, writeable handle (fsync doesn't
	 * work on read-only Windows handles).
	 */
	dbmp = env->mp_handle;
	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) {
		if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
			continue;
		/*
		 * We don't want to hold the mutex while calling sync.
		 * Increment the DB_MPOOLFILE handle ref count to pin
		 * it into memory.
		 */
		++dbmfp->ref;
		break;
	}
	MUTEX_UNLOCK(env, dbmp->mutex);

	/* If we don't find a handle we can use, open one. */
	if (dbmfp == NULL) {
		if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
			__db_err(env, ret,
			    "%s: unable to flush", (char *)
			    R_ADDR(dbmp->reginfo, mfp->path_off));
		}
	} else
		ret = __os_fsync(env, dbmfp->fhp);

	/*
	 * Re-acquire the MPOOLFILE mutex, we need it to modify the
	 * reference count.
	 */
	MUTEX_LOCK(env, mfp->mutex);

	/*
	 * If we wrote the file and there are no other references (or there
	 * is a single reference, and it's the one we opened to write
	 * buffers during checkpoint), clear the file_written flag.  We
	 * do this so that applications opening thousands of files don't
	 * loop here opening and flushing those files during checkpoint.
	 *
	 * The danger here is if a buffer were to be written as part of
	 * a checkpoint, and then not be flushed to disk.  This cannot
	 * happen because we only clear file_written when there are no
	 * other users of the MPOOLFILE in the system, and, as we hold
	 * the region lock, no possibility of another thread of control
	 * racing with us to open a MPOOLFILE.
	 */
	if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
	    dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
		mfp->file_written = 0;

		/*
		 * We may be the last reference for a MPOOLFILE, as we
		 * weren't holding the MPOOLFILE mutex when flushing
		 * its buffers to disk.  If we can discard it, set
		 * a flag to schedule a clean-out pass.   (Not likely,
		 * I mean, what are the chances that there aren't any
		 * buffers in the pool?  Regardless, it might happen.)
		 */
		if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
			*(int *)argp = 1;
	}

	/*
	 * If we found the file we must close it in case we are the last
	 * reference to the dbmfp.  NOTE: since we have incremented
	 * mfp->mpf_cnt this cannot be the last reference to the mfp.
	 * This is important since we are called with the hash bucket
	 * locked.  The mfp will get freed via the cleanup pass.
	 */
	if (dbmfp != NULL &&
	    (t_ret = __memp_fclose(dbmfp, DB_MPOOL_NOLOCK)) != 0 && ret == 0)
		ret = t_ret;

	--mfp->mpf_cnt;

	/* Unlock the MPOOLFILE. */
	MUTEX_UNLOCK(env, mfp->mutex);
	return (ret);
}

/*
 * __memp_sync_files --
 *	Sync all the files in the environment, open or not.
 */
static int
__memp_sync_files(env)
	ENV *env;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *mp;
	MPOOLFILE *mfp, *next_mfp;
	int i, need_discard_pass, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	need_discard_pass = ret = 0;

	/* Flush each file; __memp_sync_file may request a clean-out pass. */
	ret = __memp_walk_files(env,
	    mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_MEMP_NOERROR);

	/*
	 * We may need to do a last pass through the MPOOLFILE list -- if we
	 * were the last reference to an MPOOLFILE, we need to clean it out.
	 */
	if (!need_discard_pass)
		return (ret);

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
retry:		MUTEX_LOCK(env, hp->mtx_hash);
		for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
		    __mpoolfile); mfp != NULL; mfp = next_mfp) {
			next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
			/*
			 * Do a fast check -- we can check for zero/non-zero
			 * without a mutex on the MPOOLFILE.  If likely to
			 * succeed, lock the MPOOLFILE down and look for real.
			 */
			if (mfp->deadfile ||
			    mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
				continue;

			MUTEX_LOCK(env, mfp->mutex);
			if (!mfp->deadfile &&
			    mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
				/*
				 * Discard drops the bucket lock; restart the
				 * bucket scan from the top.
				 */
				MUTEX_UNLOCK(env, hp->mtx_hash);
				(void)__memp_mf_discard(dbmp, mfp);
				goto retry;
			} else
				MUTEX_UNLOCK(env, mfp->mutex);
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
	return (ret);
}

/*
 * __memp_mf_sync --
 *	Flush an MPOOLFILE, when no currently open handle is available.
 *
 * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
 */
int
__memp_mf_sync(dbmp, mfp, locked)
	DB_MPOOL *dbmp;
	MPOOLFILE *mfp;
	int locked;	/* Non-zero if caller already holds the hash lock. */
{
	DB_FH *fhp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *mp;
	int ret, t_ret;
	char *rpath;

	COMPQUIET(hp, NULL);
	env = dbmp->env;

	/*
	 * We need to be holding the hash lock: we're using the path name
	 * and __memp_nameop might try and rename the file.
	 */
	if (!locked) {
		mp = dbmp->reginfo[0].primary;
		hp = R_ADDR(dbmp->reginfo, mp->ftab);
		hp += FNBUCKET(
		    R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
		MUTEX_LOCK(env, hp->mtx_hash);
	}

	/* Open the file just long enough to fsync and close it again. */
	if ((ret = __db_appname(env, DB_APP_DATA,
	    R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) {
		if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
			ret = __os_fsync(env, fhp);
			if ((t_ret =
			    __os_closehandle(env, fhp)) != 0 && ret == 0)
				ret = t_ret;
		}
		__os_free(env, rpath);
	}

	if (!locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	return (ret);
}

/*
 * __memp_close_flush_files --
 *	Close files opened only to flush buffers.
 */
static int
__memp_close_flush_files(env, dosync)
	ENV *env;
	int dosync;
{
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	int ret;

	dbmp = env->mp_handle;

	/*
	 * The routine exists because we must close files opened by sync to
	 * flush buffers.  There are two cases: first, extent files have to
	 * be closed so they may be removed when empty.  Second, regular
	 * files have to be closed so we don't run out of descriptors (for
	 * example, an application partitioning its data into databases
	 * based on timestamps, so there's a continually increasing set of
	 * files).
	 *
	 * We mark files opened in the __memp_bhwrite() function with the
	 * MP_FLUSH flag.
 * Here we walk through our file descriptor list,
	 * and, if a file was opened by __memp_bhwrite(), we close it.
	 */
retry:	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
		if (F_ISSET(dbmfp, MP_FLUSH)) {
			F_CLR(dbmfp, MP_FLUSH);
			MUTEX_UNLOCK(env, dbmp->mutex);
			if (dosync) {
				/*
				 * If we have the only open handle on the file,
				 * clear the dirty flag so we don't re-open and
				 * sync it again when discarding the MPOOLFILE
				 * structure.  Clear the flag before the sync
				 * so can't race with a thread writing the file.
				 */
				mfp = dbmfp->mfp;
				if (mfp->mpf_cnt == 1) {
					MUTEX_LOCK(env, mfp->mutex);
					if (mfp->mpf_cnt == 1)
						mfp->file_written = 0;
					MUTEX_UNLOCK(env, mfp->mutex);
				}
				if ((ret = __os_fsync(env, dbmfp->fhp)) != 0)
					return (ret);
			}
			if ((ret = __memp_fclose(dbmfp, 0)) != 0)
				return (ret);
			/* Closing dropped the list lock; rescan the list. */
			goto retry;
		}
	MUTEX_UNLOCK(env, dbmp->mutex);

	return (0);
}

/*
 * __bhcmp --
 *	qsort comparison function: order tracked buffers by file (shared
 *	memory pool offset), then by page number within the file.
 */
static int
__bhcmp(p1, p2)
	const void *p1, *p2;
{
	BH_TRACK *bhp1, *bhp2;

	bhp1 = (BH_TRACK *)p1;
	bhp2 = (BH_TRACK *)p2;

	/* Sort by file (shared memory pool offset). */
	if (bhp1->track_off < bhp2->track_off)
		return (-1);
	if (bhp1->track_off > bhp2->track_off)
		return (1);

	/*
	 * !!!
	 * Defend against badly written quicksort code calling the comparison
	 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
	 */
	if (bhp1->track_pgno < bhp2->track_pgno)
		return (-1);
	if (bhp1->track_pgno > bhp2->track_pgno)
		return (1);
	return (0);
}