/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
 *
 * $Id: mp_sync.c,v 12.59 2008/01/17 13:59:12 bostic Exp $
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/db_page.h"
#include "dbinc/hash.h"

/*
 * BH_TRACK --
 *	Enough information to re-locate a dirty buffer after the hash
 *	bucket mutex has been dropped: the bucket it lived in, plus the
 *	file offset and page number that identify the buffer itself.
 */
typedef struct {
	DB_MPOOL_HASH *track_hp;	/* Hash bucket. */

	roff_t	  track_off;		/* Page file offset. */
	db_pgno_t track_pgno;		/* Page number. */
} BH_TRACK;

static int __bhcmp __P((const void *, const void *));
static int __memp_close_flush_files __P((ENV *, int));
static int __memp_sync_files __P((ENV *));
static int __memp_sync_file __P((ENV *,
    MPOOLFILE *, void *, u_int32_t *, u_int32_t));

/*
 * __memp_walk_files --
 *	Apply a callback to every MPOOLFILE in the pool's file table,
 *	holding each file hash bucket's mutex across the calls for the
 *	entries in that bucket.
 *
 * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *,
 * PUBLIC:	 int (*) __P((ENV *, MPOOLFILE *, void *,
 * PUBLIC:	 u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t));
 */
int
__memp_walk_files(env, mp, func, arg, countp, flags)
	ENV *env;
	MPOOL *mp;
	int (*func)__P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t));
	void *arg;
	u_int32_t *countp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOLFILE *mfp;
	int i, ret, t_ret;

	dbmp = env->mp_handle;
	ret = 0;

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
		MUTEX_LOCK(env, hp->mtx_hash);
		SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
			/* Remember the first error, it's what we return. */
			if ((t_ret = func(env,
			    mfp, arg, countp, flags)) != 0 && ret == 0)
				ret = t_ret;
			/*
			 * Stop on error unless the caller asked us to keep
			 * walking regardless (DB_STAT_MEMP_NOERROR).
			 */
			if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
				break;
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
		if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR))
			break;
	}
	return (ret);
}

/*
 * __memp_sync_pp --
 *	ENV->memp_sync pre/post processing.
 *
 * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
 */
int
__memp_sync_pp(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbenv->env;

	ENV_REQUIRES_CONFIG(env,
	    env->mp_handle, "memp_sync", DB_INIT_MPOOL);

	/*
	 * If no LSN is provided, flush the entire cache (reasonable usage
	 * even if there's no log subsystem configured).  A specific LSN,
	 * however, only makes sense when logging is configured.
	 */
	if (lsnp != NULL)
		ENV_REQUIRES_CONFIG(env,
		    env->lg_handle, "memp_sync", DB_INIT_LOG);

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_sync(env, DB_SYNC_CACHE, lsnp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_sync --
 *	ENV->memp_sync.
 *
 * PUBLIC: int __memp_sync __P((ENV *, u_int32_t, DB_LSN *));
 */
int
__memp_sync(env, flags, lsnp)
	ENV *env;
	u_int32_t flags;
	DB_LSN *lsnp;
{
	DB_MPOOL *dbmp;
	MPOOL *mp;
	int interrupted, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;

	/*
	 * If we've already flushed to the requested LSN, return that
	 * information and avoid the cache walk entirely.
	 */
	if (lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) <= 0) {
			*lsnp = mp->lsn;

			MPOOL_SYSTEM_UNLOCK(env);
			return (0);
		}
		MPOOL_SYSTEM_UNLOCK(env);
	}

	if ((ret =
	    __memp_sync_int(env, NULL, 0, flags, NULL, &interrupted)) != 0)
		return (ret);

	/*
	 * Record the LSN we flushed through, but only if the sync ran to
	 * completion -- an interrupted sync makes no such guarantee.
	 */
	if (!interrupted && lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(env);
		if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
			mp->lsn = *lsnp;
		MPOOL_SYSTEM_UNLOCK(env);
	}

	return (0);
}

/*
 * __memp_fsync_pp --
 *	DB_MPOOLFILE->sync pre/post processing.
 *
 * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *));
 */
int
__memp_fsync_pp(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbmfp->env;

	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync");

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__memp_fsync(dbmfp)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __memp_fsync --
 *	DB_MPOOLFILE->sync.
 *
 * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
 */
int
__memp_fsync(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	MPOOLFILE *mfp;

	mfp = dbmfp->mfp;

	/*
	 * If this handle doesn't have a file descriptor that's open for
	 * writing, or if the file is a temporary, or if the file hasn't
	 * been written since it was flushed, there's no reason to proceed
	 * further.
	 */
	if (F_ISSET(dbmfp, MP_READONLY))
		return (0);

	if (F_ISSET(dbmfp->mfp, MP_TEMP) || dbmfp->mfp->no_backing_file)
		return (0);

	if (mfp->file_written == 0)
		return (0);

	return (__memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
}

/*
 * __mp_xxx_fh --
 *	Return a file descriptor for DB 1.85 compatibility locking.
 *
 * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
 */
int
__mp_xxx_fh(dbmfp, fhp)
	DB_MPOOLFILE *dbmfp;
	DB_FH **fhp;
{
	int ret;

	/*
	 * This is a truly spectacular layering violation, intended ONLY to
	 * support compatibility for the DB 1.85 DB->fd call.
	 *
	 * Sync the database file to disk, creating the file as necessary.
	 *
	 * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
	 * The MP_READONLY test isn't interesting because we will either
	 * already have a file descriptor (we opened the database file for
	 * reading) or we aren't readonly (we created the database which
	 * requires write privileges.
The MP_TEMP test isn't interesting
	 * because we want to write to the backing file regardless so that
	 * we get a file descriptor to return.
	 */
	if ((*fhp = dbmfp->fhp) != NULL)
		return (0);

	if ((ret = __memp_sync_int(
	    dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
		*fhp = dbmfp->fhp;
	return (ret);
}

/*
 * __memp_sync_int --
 *	Mpool sync internal function.
 *
 *	dbmfp: if non-NULL, only buffers belonging to this file are written.
 *	trickle_max: with DB_SYNC_TRICKLE, upper bound on buffers to write.
 *	wrote_totalp: if non-NULL, set to the number of buffers written.
 *	interruptedp: if non-NULL, set to 1 if the sync was interrupted.
 *
 * PUBLIC: int __memp_sync_int __P((ENV *,
 * PUBLIC:	 DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
 */
int
__memp_sync_int(env, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
	ENV *env;
	DB_MPOOLFILE *dbmfp;
	u_int32_t trickle_max, flags, *wrote_totalp;
	int *interruptedp;
{
	BH *bhp;
	BH_TRACK *bharray;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	db_mutex_t mutex;
	roff_t last_mf_offset;
	u_int32_t ar_cnt, ar_max, dirty, i, n_cache, remaining, wrote_total;
	int filecnt, maxopenfd, pass, required_write, ret, t_ret;
	int wait_cnt, wrote_cnt;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	last_mf_offset = INVALID_ROFF;
	filecnt = pass = wrote_total = 0;

	if (wrote_totalp != NULL)
		*wrote_totalp = 0;
	if (interruptedp != NULL)
		*interruptedp = 0;

	/*
	 * If we're flushing the cache, it's a checkpoint or we're flushing a
	 * specific file, we really have to write the blocks and we have to
	 * confirm they made it to disk.  Otherwise, we can skip a block if
	 * it's hard to get.
	 */
	required_write = LF_ISSET(DB_SYNC_CACHE |
	    DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);

	/* Get shared configuration information. */
	MPOOL_SYSTEM_LOCK(env);
	maxopenfd = mp->mp_maxopenfd;
	MPOOL_SYSTEM_UNLOCK(env);

	/* Assume one dirty page per bucket. */
	ar_max = mp->nreg * mp->htab_buckets;
	if ((ret =
	    __os_malloc(env, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
		return (ret);

	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written and all dirty buffers to be potentially written, depending
	 * on our flags.
	 */
	for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
		c_mp = dbmp->reginfo[n_cache].primary;

		hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
		for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
			/*
			 * We can check for empty buckets before locking as
			 * we only care if the pointer is zero or non-zero.
			 * We can ignore empty or clean buckets because we
			 * only need write buffers that were dirty before
			 * we started.
			 */
#ifdef DIAGNOSTIC
			if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
#else
			if (hp->hash_page_dirty == 0)
#endif
				continue;

			/* Count the dirty pages to cross-check the bucket. */
			dirty = 0;
			MUTEX_LOCK(env, hp->mtx_hash);
			SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
				/* Always ignore clean pages. */
				if (!F_ISSET(bhp, BH_DIRTY))
					continue;

				dirty++;
				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

				/*
				 * Ignore in-memory files, unless the file is
				 * specifically being flushed.
				 */
				if (mfp->no_backing_file)
					continue;
				if (!LF_ISSET(DB_SYNC_FILE) &&
				    F_ISSET(mfp, MP_TEMP))
					continue;

				/*
				 * Ignore files that aren't involved in DB's
				 * transactional operations during checkpoints.
				 */
				if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
				    mfp->lsn_off == DB_LSN_OFF_NOTSET)
					continue;

				/*
				 * Ignore files that aren't Queue extent files
				 * if we're flushing a Queue file with extents.
				 */
				if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
				    !F_ISSET(mfp, MP_EXTENT))
					continue;

				/*
				 * If we're flushing a specific file, see if
				 * this page is from that file.
				 */
				if (dbmfp != NULL && mfp != dbmfp->mfp)
					continue;

				/* Track the buffer, we want it. */
				bharray[ar_cnt].track_hp = hp;
				bharray[ar_cnt].track_pgno = bhp->pgno;
				bharray[ar_cnt].track_off = bhp->mf_offset;
				ar_cnt++;

				/*
				 * If we run out of space, double and continue.
				 * Don't stop at trickle_max, we want to sort
				 * as large a sample set as possible in order
				 * to minimize disk seeks.
				 */
				if (ar_cnt >= ar_max) {
					if ((ret = __os_realloc(env,
					    (ar_max * 2) * sizeof(BH_TRACK),
					    &bharray)) != 0)
						break;
					ar_max *= 2;
				}
			}
			/*
			 * The bucket's cached dirty count should match what
			 * we just counted; repair it if it has drifted.
			 */
			DB_ASSERT(env, dirty == hp->hash_page_dirty);
			if (dirty != hp->hash_page_dirty) {
				__db_errx(env,
				    "memp_sync: correcting dirty count %lu %lu",
				    (u_long)hp->hash_page_dirty, (u_long)dirty);
				hp->hash_page_dirty = dirty;
			}
			MUTEX_UNLOCK(env, hp->mtx_hash);

			if (ret != 0)
				goto err;

			/* Check if the call has been interrupted. */
			if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
			    mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
				if (interruptedp != NULL)
					*interruptedp = 1;
				goto err;
			}
		}
	}

	/* If there no buffers to write, we're done. */
	if (ar_cnt == 0)
		goto done;

	/*
	 * Write the buffers in file/page order, trying to reduce seeks by the
	 * filesystem and, when pages are smaller than filesystem block sizes,
	 * reduce the actual number of writes.
	 */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);

	/*
	 * If we're trickling buffers, only write enough to reach the correct
	 * percentage.
	 */
	if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
		ar_cnt = trickle_max;

	/*
	 * Flush the log.  We have to ensure the log records reflecting the
	 * changes on the database pages we're writing have already made it
	 * to disk.  We still have to check the log each time we write a page
	 * (because pages we are about to write may be modified after we have
	 * flushed the log), but in general this will at least avoid any I/O
	 * on the log's part.
	 */
	if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0)
		goto err;

	/*
	 * Walk the array, writing buffers.  When we write a buffer, we NULL
	 * out its hash bucket pointer so we don't process a slot more than
	 * once.
	 */
	for (i = pass = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
		/* Wrap around for another pass over the skipped buffers. */
		if (i >= ar_cnt) {
			i = 0;
			++pass;
			__os_yield(env, 1, 0);
		}
		if ((hp = bharray[i].track_hp) == NULL)
			continue;

		/* Lock the hash bucket and find the buffer. */
		mutex = hp->mtx_hash;
		MUTEX_LOCK(env, mutex);
		SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
			if (bhp->pgno == bharray[i].track_pgno &&
			    bhp->mf_offset == bharray[i].track_off)
				break;

		/*
		 * If we can't find the buffer we're done, somebody else had
		 * to have written it.
		 *
		 * If the buffer isn't dirty, we're done, there's no work
		 * needed.
		 */
		if (bhp == NULL || !F_ISSET(bhp, BH_DIRTY)) {
			MUTEX_UNLOCK(env, mutex);
			--remaining;
			bharray[i].track_hp = NULL;
			continue;
		}

		/*
		 * If the buffer is locked by another thread, ignore it, we'll
		 * come back to it.
		 *
		 * If the buffer is pinned and it's only the first or second
		 * time we have looked at it, ignore it, we'll come back to
		 * it.
		 *
		 * In either case, skip the buffer if we're not required to
		 * write it.
		 */
		if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
			MUTEX_UNLOCK(env, mutex);
			if (!required_write) {
				--remaining;
				bharray[i].track_hp = NULL;
			}
			continue;
		}

		/* Pin the buffer into memory and lock it. */
		++bhp->ref;
		F_SET(bhp, BH_LOCKED);

		/*
		 * If the buffer is referenced by another thread, set the sync
		 * wait-for count (used to count down outstanding references to
		 * this buffer as they are returned to the cache), then unlock
		 * the hash bucket and wait for the count to go to 0.  No other
		 * thread can acquire the buffer because we have it locked.
		 *
		 * If a thread attempts to re-pin a page, the wait-for count
		 * will never go to 0 (that thread spins on our buffer lock,
		 * while we spin on the thread's ref count).  Give up if we
		 * don't get the buffer in 3 seconds, we'll try again later.
		 *
		 * If, when the wait-for count goes to 0, the buffer is found
		 * to be dirty, write it.
		 */
		bhp->ref_sync = bhp->ref - 1;
		if (bhp->ref_sync != 0) {
			MUTEX_UNLOCK(env, mutex);
			for (wait_cnt = 1;
			    bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
				__os_yield(env, 1, 0);
			MUTEX_LOCK(env, mutex);
		}

		/*
		 * If we've switched files, check to see if we're configured
		 * to close file descriptors.
		 */
		if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) {
			if (++filecnt >= maxopenfd) {
				filecnt = 0;
				if ((t_ret = __memp_close_flush_files(
				    env, 1)) != 0 && ret == 0)
					ret = t_ret;
			}
			last_mf_offset = bhp->mf_offset;
		}

		/*
		 * If the ref_sync count has gone to 0, we're going to be done
		 * with this buffer no matter what happens.
		 */
		if (bhp->ref_sync == 0) {
			--remaining;
			bharray[i].track_hp = NULL;
		}

		/*
		 * If the ref_sync count has gone to 0 and the buffer is still
		 * dirty, we write it.  We only try to write the buffer once.
		 */
		if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
			mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
			if ((t_ret =
			    __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
				++wrote_cnt;
				++wrote_total;
			} else {
				if (ret == 0)
					ret = t_ret;
				__db_errx
				    (env, "%s: unable to flush page: %lu",
				    __memp_fns(dbmp, mfp), (u_long)bhp->pgno);

			}
		}

		/*
		 * If ref_sync count never went to 0, the buffer was written
		 * by another thread, or the write failed, we still have the
		 * buffer locked.
		 */
		if (F_ISSET(bhp, BH_LOCKED))
			F_CLR(bhp, BH_LOCKED);

		/*
		 * Reset the ref_sync count regardless of our success, we're
		 * done with this buffer for now.
		 */
		bhp->ref_sync = 0;

		/* Discard our buffer reference. */
		--bhp->ref;

		/*
		 * If a thread of control is waiting in this hash bucket, wake
		 * it up.
		 */
		if (F_ISSET(hp, IO_WAITER)) {
			F_CLR(hp, IO_WAITER);
			MUTEX_UNLOCK(env, hp->mtx_io);
		}

		/* Release the hash bucket mutex. */
		MUTEX_UNLOCK(env, mutex);

		/* Check if the call has been interrupted. */
		if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
		    FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
			if (interruptedp != NULL)
				*interruptedp = 1;
			goto err;
		}

		/*
		 * Sleep after some number of writes to avoid disk saturation.
		 * Don't cache the max writes value, an application shutting
		 * down might reset the value in order to do a fast flush or
		 * checkpoint.
		 */
		if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) &&
		    !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) &&
		    mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) {
			wrote_cnt = 0;
			__os_yield(env, 0, (u_long)mp->mp_maxwrite_sleep);
		}
	}

done:	/*
	 * If a write is required, we have to force the pages to disk.  We
	 * don't do this as we go along because we want to give the OS as
	 * much time as possible to lazily flush, and because we have to flush
	 * files that might not even have had dirty buffers in the cache, so
	 * we have to walk the files list.
	 */
	if (ret == 0 && required_write) {
		if (dbmfp == NULL)
			ret = __memp_sync_files(env);
		else
			ret = __os_fsync(env, dbmfp->fhp);
	}

	/* If we've opened files to flush pages, close them. */
	if ((t_ret = __memp_close_flush_files(env, 0)) != 0 && ret == 0)
		ret = t_ret;

err:	__os_free(env, bharray);
	if (wrote_totalp != NULL)
		*wrote_totalp = wrote_total;

	return (ret);
}

/*
 * __memp_sync_file --
 *	Flush a single underlying file; used as the __memp_walk_files
 *	callback from __memp_sync_files.  *argp is set to 1 if a
 *	discard pass over the MPOOLFILE list is needed afterward.
 */
static int
__memp_sync_file(env, mfp, argp, countp, flags)
	ENV *env;
	MPOOLFILE *mfp;
	void *argp;
	u_int32_t *countp;
	u_int32_t flags;
{
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	int ret, t_ret;

	COMPQUIET(countp, NULL);
	COMPQUIET(flags, 0);

	/* Nothing to do for unwritten, in-memory, dead or temp files. */
	if (!mfp->file_written || mfp->no_backing_file ||
	    mfp->deadfile || F_ISSET(mfp, MP_TEMP))
		return (0);
	/*
	 * Pin the MPOOLFILE structure into memory, and release the
	 * region mutex allowing us to walk the linked list.  We'll
	 * re-acquire that mutex to move to the next entry in the list.
	 *
	 * This works because we only need to flush current entries,
	 * we don't care about new entries being added, and the linked
	 * list is never re-ordered, a single pass is sufficient.  It
	 * requires MPOOLFILE structures removed before we get to them
	 * be flushed to disk, but that's nothing new, they could have
	 * been removed while checkpoint was running, too.
	 *
	 * Once we have the MPOOLFILE lock, re-check the MPOOLFILE is
	 * not being discarded.  (A thread removing the MPOOLFILE
	 * will: hold the MPOOLFILE mutex, set deadfile, drop the
	 * MPOOLFILE mutex and then acquire the region MUTEX to walk
	 * the linked list and remove the MPOOLFILE structure.
Make
	 * sure the MPOOLFILE wasn't marked dead while we waited for
	 * the mutex.
	 */
	MUTEX_LOCK(env, mfp->mutex);
	if (!mfp->file_written || mfp->deadfile) {
		MUTEX_UNLOCK(env, mfp->mutex);
		return (0);
	}
	++mfp->mpf_cnt;
	MUTEX_UNLOCK(env, mfp->mutex);

	/*
	 * Look for an already open, writeable handle (fsync doesn't
	 * work on read-only Windows handles).
	 */
	dbmp = env->mp_handle;
	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) {
		if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
			continue;
		/*
		 * We don't want to hold the mutex while calling sync.
		 * Increment the DB_MPOOLFILE handle ref count to pin
		 * it into memory.
		 */
		++dbmfp->ref;
		break;
	}
	MUTEX_UNLOCK(env, dbmp->mutex);

	/* If we don't find a handle we can use, open one. */
	if (dbmfp == NULL) {
		if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
			__db_err(env, ret,
			    "%s: unable to flush", (char *)
			    R_ADDR(dbmp->reginfo, mfp->path_off));
		}
	} else
		ret = __os_fsync(env, dbmfp->fhp);

	/*
	 * Re-acquire the MPOOLFILE mutex, we need it to modify the
	 * reference count.
	 */
	MUTEX_LOCK(env, mfp->mutex);

	/*
	 * If we wrote the file and there are no other references (or there
	 * is a single reference, and it's the one we opened to write
	 * buffers during checkpoint), clear the file_written flag.  We
	 * do this so that applications opening thousands of files don't
	 * loop here opening and flushing those files during checkpoint.
	 *
	 * The danger here is if a buffer were to be written as part of
	 * a checkpoint, and then not be flushed to disk.  This cannot
	 * happen because we only clear file_written when there are no
	 * other users of the MPOOLFILE in the system, and, as we hold
	 * the region lock, no possibility of another thread of control
	 * racing with us to open a MPOOLFILE.
	 */
	if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
	    dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
		mfp->file_written = 0;

		/*
		 * We may be the last reference for a MPOOLFILE, as we
		 * weren't holding the MPOOLFILE mutex when flushing
		 * it's buffers to disk.  If we can discard it, set
		 * a flag to schedule a clean-out pass.   (Not likely,
		 * I mean, what are the chances that there aren't any
		 * buffers in the pool?  Regardless, it might happen.)
		 */
		if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
			*(int *)argp = 1;
	}

	/*
	 * If we found the file we must close it in case we are the last
	 * reference to the dbmfp.  NOTE: since we have incremented
	 * mfp->mpf_cnt this cannot be the last reference to the mfp.
	 * This is important since we are called with the hash bucket
	 * locked.  The mfp will get freed via the cleanup pass.
	 */
	if (dbmfp != NULL &&
	    (t_ret = __memp_fclose(dbmfp, DB_MPOOL_NOLOCK)) != 0 && ret == 0)
		ret = t_ret;

	--mfp->mpf_cnt;

	/* Unlock the MPOOLFILE. */
	MUTEX_UNLOCK(env, mfp->mutex);
	return (ret);
}

/*
 * __memp_sync_files --
 *	Sync all the files in the environment, open or not.
 */
static int
__memp_sync_files(env)
	ENV *env;
{
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *mp;
	MPOOLFILE *mfp, *next_mfp;
	int i, need_discard_pass, ret;

	dbmp = env->mp_handle;
	mp = dbmp->reginfo[0].primary;
	need_discard_pass = ret = 0;

	ret = __memp_walk_files(env,
	    mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_MEMP_NOERROR);

	/*
	 * We may need to do a last pass through the MPOOLFILE list -- if we
	 * were the last reference to an MPOOLFILE, we need to clean it out.
 */
	if (!need_discard_pass)
		return (ret);

	hp = R_ADDR(dbmp->reginfo, mp->ftab);
	for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
retry:		MUTEX_LOCK(env, hp->mtx_hash);
		for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
		    __mpoolfile); mfp != NULL; mfp = next_mfp) {
			next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
			/*
			 * Do a fast check -- we can check for zero/non-zero
			 * without a mutex on the MPOOLFILE.  If likely to
			 * succeed, lock the MPOOLFILE down and look for real.
			 */
			if (mfp->deadfile ||
			    mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
				continue;

			MUTEX_LOCK(env, mfp->mutex);
			if (!mfp->deadfile &&
			    mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
				/*
				 * Discarding invalidates our position in the
				 * bucket, so drop the bucket lock and rescan
				 * it from the top.
				 */
				MUTEX_UNLOCK(env, hp->mtx_hash);
				(void)__memp_mf_discard(dbmp, mfp);
				goto retry;
			} else
				MUTEX_UNLOCK(env, mfp->mutex);
		}
		MUTEX_UNLOCK(env, hp->mtx_hash);
	}
	return (ret);
}

/*
 * __memp_mf_sync --
 *	Flush an MPOOLFILE, when no currently open handle is available.
 *
 * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
 */
int
__memp_mf_sync(dbmp, mfp, locked)
	DB_MPOOL *dbmp;
	MPOOLFILE *mfp;
	int locked;
{
	DB_FH *fhp;
	DB_MPOOL_HASH *hp;
	ENV *env;
	MPOOL *mp;
	int ret, t_ret;
	char *rpath;

	COMPQUIET(hp, NULL);
	env = dbmp->env;

	/*
	 * We need to be holding the hash lock: we're using the path name
	 * and __memp_nameop might try and rename the file.
 */
	if (!locked) {
		mp = dbmp->reginfo[0].primary;
		hp = R_ADDR(dbmp->reginfo, mp->ftab);
		hp += FNBUCKET(
		    R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
		MUTEX_LOCK(env, hp->mtx_hash);
	}

	/* Open the backing file by path, fsync it, and close it again. */
	if ((ret = __db_appname(env, DB_APP_DATA,
	    R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
		if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) {
			ret = __os_fsync(env, fhp);
			if ((t_ret =
			    __os_closehandle(env, fhp)) != 0 && ret == 0)
				ret = t_ret;
		}
		__os_free(env, rpath);
	}

	if (!locked)
		MUTEX_UNLOCK(env, hp->mtx_hash);

	return (ret);
}

/*
 * __memp_close_flush_files --
 *	Close files opened only to flush buffers.
 */
static int
__memp_close_flush_files(env, dosync)
	ENV *env;
	int dosync;
{
	DB_MPOOL *dbmp;
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	int ret;

	dbmp = env->mp_handle;

	/*
	 * The routine exists because we must close files opened by sync to
	 * flush buffers.  There are two cases: first, extent files have to
	 * be closed so they may be removed when empty.  Second, regular
	 * files have to be closed so we don't run out of descriptors (for
	 * example, an application partitioning its data into databases
	 * based on timestamps, so there's a continually increasing set of
	 * files).
	 *
	 * We mark files opened in the __memp_bhwrite() function with the
	 * MP_FLUSH flag.  Here we walk through our file descriptor list,
	 * and, if a file was opened by __memp_bhwrite(), we close it.
	 */
retry:	MUTEX_LOCK(env, dbmp->mutex);
	TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
		if (F_ISSET(dbmfp, MP_FLUSH)) {
			F_CLR(dbmfp, MP_FLUSH);
			MUTEX_UNLOCK(env, dbmp->mutex);
			if (dosync) {
				/*
				 * If we have the only open handle on the file,
				 * clear the dirty flag so we don't re-open and
				 * sync it again when discarding the MPOOLFILE
				 * structure.  Clear the flag before the sync
				 * so can't race with a thread writing the file.
				 */
				mfp = dbmfp->mfp;
				if (mfp->mpf_cnt == 1) {
					MUTEX_LOCK(env, mfp->mutex);
					if (mfp->mpf_cnt == 1)
						mfp->file_written = 0;
					MUTEX_UNLOCK(env, mfp->mutex);
				}
				if ((ret = __os_fsync(env, dbmfp->fhp)) != 0)
					return (ret);
			}
			if ((ret = __memp_fclose(dbmfp, 0)) != 0)
				return (ret);
			/* The list may have changed, rescan from the top. */
			goto retry;
		}
	MUTEX_UNLOCK(env, dbmp->mutex);

	return (0);
}

/*
 * __bhcmp --
 *	qsort comparison: order BH_TRACK entries by file, then by page
 *	number within the file, to make the write pass as sequential as
 *	possible.
 */
static int
__bhcmp(p1, p2)
	const void *p1, *p2;
{
	BH_TRACK *bhp1, *bhp2;

	bhp1 = (BH_TRACK *)p1;
	bhp2 = (BH_TRACK *)p2;

	/* Sort by file (shared memory pool offset). */
	if (bhp1->track_off < bhp2->track_off)
		return (-1);
	if (bhp1->track_off > bhp2->track_off)
		return (1);

	/*
	 * !!!
	 * Defend against badly written quicksort code calling the comparison
	 * function with two identical pointers (e.g., WATCOM C++ (Power++)).
	 */
	if (bhp1->track_pgno < bhp2->track_pgno)
		return (-1);
	if (bhp1->track_pgno > bhp2->track_pgno)
		return (1);
	return (0);
}