1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996-2009 Oracle. All rights reserved. 5 * 6 * $Id$ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/log.h" 13#include "dbinc/mp.h" 14#include "dbinc/db_page.h" 15#include "dbinc/hash.h" 16 17static int __memp_mpf_alloc __P((DB_MPOOL *, 18 DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **)); 19static int __memp_mpf_find __P((ENV *, 20 DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **)); 21 22/* 23 * __memp_fopen_pp -- 24 * DB_MPOOLFILE->open pre/post processing. 25 * 26 * PUBLIC: int __memp_fopen_pp 27 * PUBLIC: __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t)); 28 */ 29int 30__memp_fopen_pp(dbmfp, path, flags, mode, pagesize) 31 DB_MPOOLFILE *dbmfp; 32 const char *path; 33 u_int32_t flags; 34 int mode; 35 size_t pagesize; 36{ 37 DB_THREAD_INFO *ip; 38 ENV *env; 39 int ret; 40 41 env = dbmfp->env; 42 43 /* Validate arguments. */ 44 if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags, 45 DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION | 46 DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) 47 return (ret); 48 49 /* 50 * Require a non-zero, power-of-two pagesize, smaller than the 51 * clear length. 52 */ 53 if (pagesize == 0 || !POWER_OF_TWO(pagesize)) { 54 __db_errx(env, 55 "DB_MPOOLFILE->open: page sizes must be a power-of-2"); 56 return (EINVAL); 57 } 58 if (dbmfp->clear_len > pagesize) { 59 __db_errx(env, 60 "DB_MPOOLFILE->open: clear length larger than page size"); 61 return (EINVAL); 62 } 63 64 /* Read-only checks, and local flag. */ 65 if (LF_ISSET(DB_RDONLY) && path == NULL) { 66 __db_errx(env, 67 "DB_MPOOLFILE->open: temporary files can't be readonly"); 68 return (EINVAL); 69 } 70 71 if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) { 72 __db_errx(env, 73 "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions"); 74 return (EINVAL); 75 } 76 77 ENV_ENTER(env, ip); 78 REPLICATION_WRAP(env, 79 (__memp_fopen(dbmfp, NULL, 80 path, NULL, flags, mode, pagesize)), 0, ret); 81 ENV_LEAVE(env, ip); 82 return (ret); 83} 84 85/* 86 * __memp_fopen -- 87 * DB_MPOOLFILE->open. 88 * 89 * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *, 90 * PUBLIC: const char *, const char **, u_int32_t, int, size_t)); 91 */ 92int 93__memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize) 94 DB_MPOOLFILE *dbmfp; 95 MPOOLFILE *mfp; 96 const char *path; 97 const char **dirp; 98 u_int32_t flags; 99 int mode; 100 size_t pgsize; 101{ 102 DB_ENV *dbenv; 103 DB_MPOOL *dbmp; 104 DB_MPOOLFILE *tmp_dbmfp; 105 DB_MPOOL_HASH *hp; 106 ENV *env; 107 MPOOL *mp; 108 MPOOLFILE *alloc_mfp; 109 size_t maxmap; 110 db_pgno_t last_pgno; 111 u_int32_t bucket, mbytes, bytes, oflags, pagesize; 112 int refinc, ret; 113 char *rpath; 114 115 /* If this handle is already open, return. */ 116 if (F_ISSET(dbmfp, MP_OPEN_CALLED)) 117 return (0); 118 119 env = dbmfp->env; 120 dbmp = env->mp_handle; 121 dbenv = env->dbenv; 122 mp = dbmp->reginfo[0].primary; 123 alloc_mfp = NULL; 124 mbytes = bytes = 0; 125 refinc = ret = 0; 126 rpath = NULL; 127 128 /* 129 * We're keeping the page size as a size_t in the public API, but 130 * it's a u_int32_t everywhere internally. 131 */ 132 pagesize = (u_int32_t)pgsize; 133 134 /* 135 * We're called internally with a specified mfp, in which case the 136 * path is NULL, but we'll get the path from the underlying region 137 * information. Otherwise, if the path is NULL, it's a temporary 138 * file -- we know we can't join any existing files, and we'll delay 139 * the open until we actually need to write the file. All temporary 140 * files will go into the first hash bucket. 141 */ 142 DB_ASSERT(env, mfp == NULL || path == NULL); 143 144 bucket = 0; 145 hp = R_ADDR(dbmp->reginfo, mp->ftab); 146 if (mfp == NULL) { 147 if (path == NULL) 148 goto alloc; 149 150 /* 151 * Hash to the proper file table entry and walk it. 152 * 153 * The fileID is a filesystem unique number (e.g., a 154 * UNIX dev/inode pair) plus a timestamp. If files are 155 * removed and created in less than a second, the fileID 156 * can be repeated. The problem with repetition happens 157 * when the file that previously had the fileID value still 158 * has pages in the pool, since we don't want to use them 159 * to satisfy requests for the new file. Because the 160 * DB_TRUNCATE flag reuses the dev/inode pair, repeated 161 * opens with that flag set guarantees matching fileIDs 162 * when the machine can open a file and then re-open 163 * with truncate within a second. For this reason, we 164 * pass that flag down, and, if we find a matching entry, 165 * we ensure that it's never found again, and we create 166 * a new entry for the current request. 167 */ 168 169 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 170 bucket = FNBUCKET(path, strlen(path)); 171 else 172 bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN); 173 hp += bucket; 174 175 /* 176 * If we are passed a FILEID find the MPOOLFILE and inc 177 * its ref count. That way it cannot go away while we 178 * open it. 179 */ 180 if (F_ISSET(dbmfp, MP_FILEID_SET)) { 181 MUTEX_LOCK(env, hp->mtx_hash); 182 ret = 183 __memp_mpf_find(env, dbmfp, hp, path, flags,&mfp); 184 MUTEX_UNLOCK(env, hp->mtx_hash); 185 if (ret != 0) 186 goto err; 187 if (mfp != NULL) 188 refinc = 1; 189 } 190 } else { 191 /* 192 * Deadfile can only be set if mpf_cnt goes to zero (or if we 193 * failed creating the file DB_AM_DISCARD). Increment the ref 194 * count so the file cannot become dead and be unlinked. 195 */ 196 MUTEX_LOCK(env, mfp->mutex); 197 if (!mfp->deadfile) { 198 ++mfp->mpf_cnt; 199 refinc = 1; 200 } 201 MUTEX_UNLOCK(env, mfp->mutex); 202 203 /* 204 * Test one last time to see if the file is dead -- it may have 205 * been removed. This happens when a checkpoint trying to open 206 * the file to flush a buffer races with the Db::remove method. 207 * The error will be ignored, so don't output an error message. 208 */ 209 if (mfp->deadfile) 210 return (EINVAL); 211 } 212 213 /* 214 * If there's no backing file, we can join existing files in the cache, 215 * but there's nothing to read from disk. 216 */ 217 if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { 218 /* Convert MP open flags to DB OS-layer open flags. */ 219 oflags = 0; 220 if (LF_ISSET(DB_CREATE)) 221 oflags |= DB_OSO_CREATE; 222 if (LF_ISSET(DB_DIRECT)) 223 oflags |= DB_OSO_DIRECT; 224 if (LF_ISSET(DB_RDONLY)) { 225 F_SET(dbmfp, MP_READONLY); 226 oflags |= DB_OSO_RDONLY; 227 } 228 229 /* 230 * XXX 231 * A grievous layering violation, the DB_DSYNC_DB flag 232 * was left in the ENV structure and not driven through 233 * the cache API. This needs to be fixed when the general 234 * API configuration is fixed. 235 */ 236 if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB)) 237 oflags |= DB_OSO_DSYNC; 238 239 /* 240 * Get the real name for this file and open it. 241 * 242 * Supply a page size so os_open can decide whether to 243 * turn buffering off if the DB_DIRECT_DB flag is set. 244 * 245 * Acquire the region lock if we're using a path from 246 * an underlying MPOOLFILE -- there's a race in accessing 247 * the path name stored in the region, __memp_nameop may 248 * be simultaneously renaming the file. 249 */ 250 if (mfp != NULL) { 251 MPOOL_SYSTEM_LOCK(env); 252 path = R_ADDR(dbmp->reginfo, mfp->path_off); 253 } 254 if ((ret = __db_appname(env, 255 DB_APP_DATA, path, dirp, &rpath)) == 0) 256 ret = __os_open(env, rpath, 257 (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp); 258 if (mfp != NULL) 259 MPOOL_SYSTEM_UNLOCK(env); 260 if (ret != 0) 261 goto err; 262 263 /* 264 * Cache file handles are shared, and have mutexes to 265 * protect the underlying file handle across seek and 266 * read/write calls. 267 */ 268 dbmfp->fhp->ref = 1; 269 if ((ret = __mutex_alloc(env, MTX_MPOOL_FH, 270 DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0) 271 goto err; 272 273 /* 274 * Figure out the file's size. 275 * 276 * !!! 277 * We can't use off_t's here, or in any code in the mainline 278 * library for that matter. (We have to use them in the 279 * os stubs, of course, as there are system calls that 280 * take them as arguments.) The reason is some customers 281 * build in environments where an off_t is 32-bits, but 282 * still run where offsets are 64-bits, and they pay us 283 * a lot of money. 284 */ 285 if ((ret = __os_ioinfo( 286 env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) { 287 __db_err(env, ret, "%s", rpath); 288 goto err; 289 } 290 291 /* 292 * Don't permit files that aren't a multiple of the pagesize, 293 * and find the number of the last page in the file, all the 294 * time being careful not to overflow 32 bits. 295 * 296 * During verify or recovery, we might have to cope with a 297 * truncated file; if the file size is not a multiple of the 298 * page size, round down to a page, we'll take care of the 299 * partial page outside the mpool system. 300 */ 301 DB_ASSERT(env, pagesize != 0); 302 if (bytes % pagesize != 0) { 303 if (LF_ISSET(DB_ODDFILESIZE)) 304 bytes -= (u_int32_t)(bytes % pagesize); 305 else { 306 __db_errx(env, 307 "%s: file size not a multiple of the pagesize", rpath); 308 ret = EINVAL; 309 goto err; 310 } 311 } 312 313 /* 314 * Get the file id if we weren't given one. Generated file id's 315 * don't use timestamps, otherwise there'd be no chance of any 316 * other process joining the party. Don't bother looking for 317 * this id in the hash table, its new. 318 */ 319 if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) { 320 if ((ret = 321 __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0) 322 goto err; 323 F_SET(dbmfp, MP_FILEID_SET); 324 goto alloc; 325 } 326 } 327 328 if (mfp != NULL) 329 goto have_mfp; 330 331 /* 332 * We can race with another process opening the same file when 333 * we allocate the mpoolfile structure. We will come back 334 * here and check the hash table again to see if it has appeared. 335 * For most files this is not a problem, since the name is locked 336 * at a higher layer but QUEUE extent files are not locked. 337 */ 338check: MUTEX_LOCK(env, hp->mtx_hash); 339 if ((ret = __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp) != 0)) 340 goto err; 341 342 if (alloc_mfp != NULL && mfp == NULL) { 343 mfp = alloc_mfp; 344 alloc_mfp = NULL; 345 SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile); 346 } else if (mfp != NULL) { 347 /* 348 * Some things about a file cannot be changed: the clear length, 349 * page size, or LSN location. However, if this is an attempt 350 * to open a named in-memory file, we may not yet have that 351 * information. so accept uninitialized entries. 352 * 353 * The file type can change if the application's pre- and post- 354 * processing needs change. For example, an application that 355 * created a hash subdatabase in a database that was previously 356 * all btree. 357 * 358 * !!! 359 * We do not check to see if the pgcookie information changed, 360 * or update it if it is. 361 */ 362 if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET && 363 mfp->clear_len != DB_CLEARLEN_NOTSET && 364 dbmfp->clear_len != mfp->clear_len) || 365 (pagesize != 0 && pagesize != mfp->stat.st_pagesize) || 366 (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET && 367 mfp->lsn_off != DB_LSN_OFF_NOTSET && 368 dbmfp->lsn_offset != mfp->lsn_off)) { 369 __db_errx(env, 370 "%s: clear length, page size or LSN location changed", 371 path); 372 MUTEX_UNLOCK(env, hp->mtx_hash); 373 ret = EINVAL; 374 goto err; 375 } 376 } 377 378 MUTEX_UNLOCK(env, hp->mtx_hash); 379 if (alloc_mfp != NULL) { 380 MUTEX_LOCK(env, alloc_mfp->mutex); 381 if ((ret = __memp_mf_discard(dbmp, alloc_mfp)) != 0) 382 goto err; 383 } 384 385 if (mfp == NULL) { 386 /* 387 * If we didn't find the file and this is an in-memory file, 388 * then the create flag should be set. 389 */ 390 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) && 391 !LF_ISSET(DB_CREATE)) { 392 ret = ENOENT; 393 goto err; 394 } 395 396alloc: /* 397 * Get the file ID if we weren't given one. Generated file 398 * ID's don't use timestamps, otherwise there'd be no 399 * chance of any other process joining the party. 400 */ 401 if (path != NULL && 402 !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) && 403 !F_ISSET(dbmfp, MP_FILEID_SET) && (ret = 404 __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0) 405 goto err; 406 407 if ((ret = __memp_mpf_alloc(dbmp, 408 dbmfp, path, pagesize, flags, &alloc_mfp)) != 0) 409 goto err; 410 411 /* 412 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a 413 * page get, we have to increment the last page in the file. 414 * Figure it out and save it away. 415 * 416 * Note correction: page numbers are zero-based, not 1-based. 417 */ 418 DB_ASSERT(env, pagesize != 0); 419 last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize)); 420 last_pgno += (db_pgno_t)(bytes / pagesize); 421 if (last_pgno != 0) 422 --last_pgno; 423 424 alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno = 425 alloc_mfp->last_pgno = last_pgno; 426 427 alloc_mfp->bucket = bucket; 428 429 /* Go back and see if someone else has opened the file. */ 430 if (path != NULL) 431 goto check; 432 433 mfp = alloc_mfp; 434 /* This is a temp, noone else can see it, put it at the end. */ 435 MUTEX_LOCK(env, hp->mtx_hash); 436 SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q); 437 MUTEX_UNLOCK(env, hp->mtx_hash); 438 } 439have_mfp: 440 /* 441 * We need to verify that all handles open a file either durable or not 442 * durable. This needs to be cross process and cross sub-databases, so 443 * mpool is the place to do it. 444 */ 445 if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) { 446 if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) { 447 if (LF_ISSET(DB_TXN_NOT_DURABLE)) 448 F_SET(mfp, MP_NOT_DURABLE); 449 F_CLR(mfp, MP_DURABLE_UNKNOWN); 450 } else if (!LF_ISSET(DB_TXN_NOT_DURABLE) != 451 !F_ISSET(mfp, MP_NOT_DURABLE)) { 452 __db_errx(env, 453 "Cannot open DURABLE and NOT DURABLE handles in the same file"); 454 ret = EINVAL; 455 goto err; 456 } 457 } 458 459 if (LF_ISSET(DB_MULTIVERSION)) { 460 ++mfp->multiversion; 461 F_SET(dbmfp, MP_MULTIVERSION); 462 } 463 464 /* 465 * All paths to here have initialized the mfp variable to reference 466 * the selected (or allocated) MPOOLFILE. 467 */ 468 dbmfp->mfp = mfp; 469 470 /* 471 * Check to see if we can mmap the file. If a file: 472 * + isn't temporary 473 * + is read-only 474 * + doesn't require any pgin/pgout support 475 * + the DB_NOMMAP flag wasn't set (in either the file open or 476 * the environment in which it was opened) 477 * + and is less than mp_mmapsize bytes in size 478 * 479 * we can mmap it instead of reading/writing buffers. Don't do error 480 * checking based on the mmap call failure. We want to do normal I/O 481 * on the file if the reason we failed was because the file was on an 482 * NFS mounted partition, and we can fail in buffer I/O just as easily 483 * as here. 484 * 485 * We'd like to test to see if the file is too big to mmap. Since we 486 * don't know what size or type off_t's or size_t's are, or the largest 487 * unsigned integral type is, or what random insanity the local C 488 * compiler will perpetrate, doing the comparison in a portable way is 489 * flatly impossible. Hope that mmap fails if the file is too large. 490 */ 491#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */ 492 if (F_ISSET(mfp, MP_CAN_MMAP)) { 493 maxmap = dbenv->mp_mmapsize == 0 ? 494 DB_MAXMMAPSIZE : dbenv->mp_mmapsize; 495 if (path == NULL || 496 FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 497 F_CLR(mfp, MP_CAN_MMAP); 498 else if (!F_ISSET(dbmfp, MP_READONLY)) 499 F_CLR(mfp, MP_CAN_MMAP); 500 else if (dbmfp->ftype != 0) 501 F_CLR(mfp, MP_CAN_MMAP); 502 else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) 503 F_CLR(mfp, MP_CAN_MMAP); 504 else { 505 MPOOL_SYSTEM_LOCK(env); 506 maxmap = mp->mp_mmapsize == 0 ? 507 DB_MAXMMAPSIZE : mp->mp_mmapsize; 508 MPOOL_SYSTEM_UNLOCK(env); 509 if (mbytes > maxmap / MEGABYTE || 510 (mbytes == maxmap / MEGABYTE && 511 bytes >= maxmap % MEGABYTE)) 512 F_CLR(mfp, MP_CAN_MMAP); 513 } 514 515 dbmfp->addr = NULL; 516 if (F_ISSET(mfp, MP_CAN_MMAP)) { 517 dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; 518 if (__os_mapfile(env, rpath, 519 dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) { 520 dbmfp->addr = NULL; 521 F_CLR(mfp, MP_CAN_MMAP); 522 } 523 } 524 } 525 526 F_SET(dbmfp, MP_OPEN_CALLED); 527 528 /* 529 * Share the underlying file descriptor if that's possible. 530 * 531 * Add the file to the process' list of DB_MPOOLFILEs. 532 */ 533 MUTEX_LOCK(env, dbmp->mutex); 534 535 if (dbmfp->fhp != NULL) 536 TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q) 537 if (dbmfp->mfp == tmp_dbmfp->mfp && 538 (F_ISSET(dbmfp, MP_READONLY) || 539 !F_ISSET(tmp_dbmfp, MP_READONLY))) { 540 (void)__mutex_free(env, &dbmfp->fhp->mtx_fh); 541 (void)__os_closehandle(env, dbmfp->fhp); 542 ++tmp_dbmfp->fhp->ref; 543 dbmfp->fhp = tmp_dbmfp->fhp; 544 break; 545 } 546 547 TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); 548 549 MUTEX_UNLOCK(env, dbmp->mutex); 550 551 if (0) { 552err: if (refinc) { 553 /* 554 * If mpf_cnt goes to zero here and unlink_on_close is 555 * set, then we missed the last close, but there was an 556 * error trying to open the file, so we probably cannot 557 * unlink it anyway. 558 */ 559 MUTEX_LOCK(env, mfp->mutex); 560 --mfp->mpf_cnt; 561 MUTEX_UNLOCK(env, mfp->mutex); 562 } 563 564 } 565 if (rpath != NULL) 566 __os_free(env, rpath); 567 return (ret); 568} 569 570/* 571 * __memp_mpf_find -- 572 * Search a hash bucket for a MPOOLFILE. 573 */ 574static int 575__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp) 576 ENV *env; 577 DB_MPOOLFILE *dbmfp; 578 DB_MPOOL_HASH *hp; 579 const char *path; 580 u_int32_t flags; 581 MPOOLFILE **mfpp; 582{ 583 DB_MPOOL *dbmp; 584 MPOOLFILE *mfp; 585 586 dbmp = env->mp_handle; 587 588 SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { 589 /* Skip dead files and temporary files. */ 590 if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) 591 continue; 592 593 /* 594 * Any remaining DB_MPOOL_NOFILE databases are in-memory 595 * named databases and need only match other in-memory 596 * databases with the same name. 597 */ 598 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { 599 if (!mfp->no_backing_file) 600 continue; 601 602 if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off))) 603 continue; 604 605 /* 606 * We matched an in-memory file; grab the fileid if 607 * it is set in the region, but not in the dbmfp. 608 */ 609 if (!F_ISSET(dbmfp, MP_FILEID_SET)) 610 (void)__memp_set_fileid(dbmfp, 611 R_ADDR(dbmp->reginfo, mfp->fileid_off)); 612 } else 613 if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, 614 mfp->fileid_off), DB_FILE_ID_LEN) != 0) 615 continue; 616 617 /* 618 * If the file is being truncated, remove it from the system 619 * and create a new entry. 620 * 621 * !!! 622 * We should be able to set mfp to NULL and break out of the 623 * loop, but I like the idea of checking all the entries. 624 */ 625 if (LF_ISSET(DB_TRUNCATE)) { 626 MUTEX_LOCK(env, mfp->mutex); 627 mfp->deadfile = 1; 628 MUTEX_UNLOCK(env, mfp->mutex); 629 continue; 630 } 631 632 /* 633 * Check to see if this file has died while we waited. 634 * 635 * We normally don't lock the deadfile field when we read it as 636 * we only care if the field is zero or non-zero. We do lock 637 * on read when searching for a matching MPOOLFILE so that two 638 * threads of control don't race between setting the deadfile 639 * bit and incrementing the reference count, that is, a thread 640 * of control decrementing the reference count and then setting 641 * deadfile because the reference count is 0 blocks us finding 642 * the file without knowing it's about to be marked dead. 643 */ 644 MUTEX_LOCK(env, mfp->mutex); 645 if (mfp->deadfile) { 646 MUTEX_UNLOCK(env, mfp->mutex); 647 continue; 648 } 649 ++mfp->mpf_cnt; 650 MUTEX_UNLOCK(env, mfp->mutex); 651 652 /* Initialize any fields that are not yet set. */ 653 if (dbmfp->ftype != 0) 654 mfp->ftype = dbmfp->ftype; 655 if (dbmfp->clear_len != DB_CLEARLEN_NOTSET) 656 mfp->clear_len = dbmfp->clear_len; 657 if (dbmfp->lsn_offset != -1) 658 mfp->lsn_off = dbmfp->lsn_offset; 659 660 break; 661 } 662 663 *mfpp = mfp; 664 return (0); 665} 666 667static int 668__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp) 669 DB_MPOOL *dbmp; 670 DB_MPOOLFILE *dbmfp; 671 const char *path; 672 u_int32_t pagesize; 673 u_int32_t flags; 674 MPOOLFILE **retmfp; 675{ 676 ENV *env; 677 MPOOLFILE *mfp; 678 int ret; 679 void *p; 680 681 env = dbmp->env; 682 ret = 0; 683 /* Allocate and initialize a new MPOOLFILE. */ 684 if ((ret = __memp_alloc(dbmp, 685 dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) 686 goto err; 687 memset(mfp, 0, sizeof(MPOOLFILE)); 688 mfp->mpf_cnt = 1; 689 mfp->ftype = dbmfp->ftype; 690 mfp->stat.st_pagesize = pagesize; 691 mfp->lsn_off = dbmfp->lsn_offset; 692 mfp->clear_len = dbmfp->clear_len; 693 mfp->priority = dbmfp->priority; 694 if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) { 695 mfp->maxpgno = (db_pgno_t) 696 (dbmfp->gbytes * (GIGABYTE / mfp->stat.st_pagesize)); 697 mfp->maxpgno += (db_pgno_t) 698 ((dbmfp->bytes + mfp->stat.st_pagesize - 1) / 699 mfp->stat.st_pagesize); 700 } 701 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 702 mfp->no_backing_file = 1; 703 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK)) 704 mfp->unlink_on_close = 1; 705 706 if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) 707 F_SET(mfp, MP_DURABLE_UNKNOWN); 708 if (LF_ISSET(DB_DIRECT)) 709 F_SET(mfp, MP_DIRECT); 710 if (LF_ISSET(DB_EXTENT)) 711 F_SET(mfp, MP_EXTENT); 712 if (LF_ISSET(DB_TXN_NOT_DURABLE)) 713 F_SET(mfp, MP_NOT_DURABLE); 714 F_SET(mfp, MP_CAN_MMAP); 715 716 /* 717 * An in-memory database with no name is a temp file. Named 718 * in-memory databases get an artificially bumped reference 719 * count so they don't disappear on close; they need a remove 720 * to make them disappear. 721 */ 722 if (path == NULL) 723 F_SET(mfp, MP_TEMP); 724 else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 725 mfp->mpf_cnt++; 726 727 /* Copy the file identification string into shared memory. */ 728 if (F_ISSET(dbmfp, MP_FILEID_SET)) { 729 if ((ret = __memp_alloc(dbmp, dbmp->reginfo, 730 NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) 731 goto err; 732 memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN); 733 } 734 735 /* Copy the file path into shared memory. */ 736 if (path != NULL) { 737 if ((ret = __memp_alloc(dbmp, dbmp->reginfo, 738 NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) 739 goto err; 740 memcpy(p, path, strlen(path) + 1); 741 } 742 743 /* Copy the page cookie into shared memory. */ 744 if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) { 745 mfp->pgcookie_len = 0; 746 mfp->pgcookie_off = 0; 747 } else { 748 if ((ret = __memp_alloc(dbmp, dbmp->reginfo, 749 NULL, dbmfp->pgcookie->size, 750 &mfp->pgcookie_off, &p)) != 0) 751 goto err; 752 memcpy(p, 753 dbmfp->pgcookie->data, dbmfp->pgcookie->size); 754 mfp->pgcookie_len = dbmfp->pgcookie->size; 755 } 756 757 if ((ret = __mutex_alloc(env, 758 MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0) 759 goto err; 760 *retmfp = mfp; 761 762err: return (ret); 763} 764 765/* 766 * memp_fclose_pp -- 767 * DB_MPOOLFILE->close pre/post processing. 768 * 769 * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t)); 770 */ 771int 772__memp_fclose_pp(dbmfp, flags) 773 DB_MPOOLFILE *dbmfp; 774 u_int32_t flags; 775{ 776 DB_THREAD_INFO *ip; 777 ENV *env; 778 int ret; 779 780 env = dbmfp->env; 781 782 /* 783 * Validate arguments, but as a handle destructor, we can't fail. 784 */ 785 if (flags != 0) 786 (void)__db_ferr(env, "DB_MPOOLFILE->close", 0); 787 788 ENV_ENTER(env, ip); 789 REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret); 790 ENV_LEAVE(env, ip); 791 return (ret); 792} 793 794/* 795 * __memp_fclose -- 796 * DB_MPOOLFILE->close. 797 * 798 * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t)); 799 */ 800int 801__memp_fclose(dbmfp, flags) 802 DB_MPOOLFILE *dbmfp; 803 u_int32_t flags; 804{ 805 DB_MPOOL *dbmp; 806 ENV *env; 807 MPOOLFILE *mfp; 808 char *rpath; 809 u_int32_t ref; 810 int deleted, ret, t_ret; 811 812 env = dbmfp->env; 813 dbmp = env->mp_handle; 814 ret = 0; 815 816 /* 817 * Remove the DB_MPOOLFILE from the process' list. 818 * 819 * It's possible the underlying mpool cache may never have been created. 820 * In that case, all we have is a structure, discard it. 821 * 822 * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE 823 * file list, check the MP_OPEN_CALLED flag to be sure. 824 */ 825 if (dbmp == NULL) 826 goto done; 827 828 MUTEX_LOCK(env, dbmp->mutex); 829 830 DB_ASSERT(env, dbmfp->ref >= 1); 831 if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED)) 832 TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); 833 834 /* 835 * Decrement the file descriptor's ref count -- if we're the last ref, 836 * we'll discard the file descriptor. 837 */ 838 if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0) 839 dbmfp->fhp = NULL; 840 MUTEX_UNLOCK(env, dbmp->mutex); 841 if (ref != 0) 842 return (0); 843 844 /* Complain if pinned blocks never returned. */ 845 if (dbmfp->pinref != 0) { 846 __db_errx(env, "%s: close: %lu blocks left pinned", 847 __memp_fn(dbmfp), (u_long)dbmfp->pinref); 848 ret = __env_panic(env, DB_RUNRECOVERY); 849 } 850 851 /* Discard any mmap information. */ 852 if (dbmfp->addr != NULL && 853 (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0) 854 __db_err(env, ret, "%s", __memp_fn(dbmfp)); 855 856 /* 857 * Close the file and discard the descriptor structure; temporary 858 * files may not yet have been created. 859 */ 860 if (dbmfp->fhp != NULL) { 861 if ((t_ret = 862 __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0) 863 ret = t_ret; 864 if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) { 865 __db_err(env, t_ret, "%s", __memp_fn(dbmfp)); 866 if (ret == 0) 867 ret = t_ret; 868 } 869 dbmfp->fhp = NULL; 870 } 871 872 /* 873 * Discard our reference on the underlying MPOOLFILE, and close it 874 * if it's no longer useful to anyone. It possible the open of the 875 * file never happened or wasn't successful, in which case, mpf will 876 * be NULL and MP_OPEN_CALLED will not be set. 877 */ 878 mfp = dbmfp->mfp; 879 DB_ASSERT(env, 880 (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) || 881 (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL)); 882 if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) 883 goto done; 884 885 /* 886 * If it's a temp file, all outstanding references belong to unflushed 887 * buffers. (A temp file can only be referenced by one DB_MPOOLFILE). 888 * We don't care about preserving any of those buffers, so mark the 889 * MPOOLFILE as dead so that even the dirty ones just get discarded 890 * when we try to flush them. 891 */ 892 deleted = 0; 893 if (!LF_ISSET(DB_MPOOL_NOLOCK)) 894 MUTEX_LOCK(env, mfp->mutex); 895 if (F_ISSET(dbmfp, MP_MULTIVERSION)) 896 --mfp->multiversion; 897 if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) { 898 if (LF_ISSET(DB_MPOOL_DISCARD) || 899 F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) { 900 mfp->deadfile = 1; 901 } 902 if (mfp->unlink_on_close) { 903 if ((t_ret = __db_appname(dbmp->env, DB_APP_DATA, 904 R_ADDR(dbmp->reginfo, mfp->path_off), NULL, 905 &rpath)) != 0 && ret == 0) 906 ret = t_ret; 907 if (t_ret == 0) { 908 if ((t_ret = __os_unlink( 909 dbmp->env, rpath, 0)) != 0 && ret == 0) 910 ret = t_ret; 911 __os_free(env, rpath); 912 } 913 } 914 if (mfp->mpf_cnt == 0) { 915 F_CLR(mfp, MP_NOT_DURABLE); 916 F_SET(mfp, MP_DURABLE_UNKNOWN); 917 } 918 if (mfp->block_cnt == 0) { 919 /* 920 * We should never discard this mp file if our caller 921 * is holding the lock on it. See comment in 922 * __memp_sync_file. 923 */ 924 DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK)); 925 if ((t_ret = 926 __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) 927 ret = t_ret; 928 deleted = 1; 929 } 930 } 931 if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK)) 932 MUTEX_UNLOCK(env, mfp->mutex); 933 934done: /* Discard the DB_MPOOLFILE structure. */ 935 if (dbmfp->pgcookie != NULL) { 936 __os_free(env, dbmfp->pgcookie->data); 937 __os_free(env, dbmfp->pgcookie); 938 } 939 __os_free(env, dbmfp); 940 941 return (ret); 942} 943 944/* 945 * __memp_mf_discard -- 946 * Discard an MPOOLFILE. 947 * 948 * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); 949 */ 950int 951__memp_mf_discard(dbmp, mfp) 952 DB_MPOOL *dbmp; 953 MPOOLFILE *mfp; 954{ 955 DB_MPOOL_HASH *hp; 956 ENV *env; 957#ifdef HAVE_STATISTICS 958 DB_MPOOL_STAT *sp; 959#endif 960 MPOOL *mp; 961 int need_sync, ret, t_ret; 962 963 env = dbmp->env; 964 mp = dbmp->reginfo[0].primary; 965 hp = R_ADDR(dbmp->reginfo, mp->ftab); 966 hp += mfp->bucket; 967 ret = 0; 968 969 /* 970 * Expects caller to be holding the MPOOLFILE mutex. 971 * 972 * When discarding a file, we have to flush writes from it to disk. 973 * The scenario is that dirty buffers from this file need to be 974 * flushed to satisfy a future checkpoint, but when the checkpoint 975 * calls mpool sync, the sync code won't know anything about them. 976 * Ignore files not written, discarded, or only temporary. 977 */ 978 need_sync = 979 mfp->file_written && !mfp->deadfile && !F_ISSET(mfp, MP_TEMP); 980 981 /* 982 * We have to release the MPOOLFILE mutex before acquiring the region 983 * mutex so we don't deadlock. Make sure nobody ever looks at this 984 * structure again. 985 */ 986 mfp->deadfile = 1; 987 988 /* Discard the mutex we're holding and return it too the pool. */ 989 MUTEX_UNLOCK(env, mfp->mutex); 990 if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0) 991 ret = t_ret; 992 993 /* Lock the bucket and delete from the list of MPOOLFILEs. */ 994 MUTEX_LOCK(env, hp->mtx_hash); 995 SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile); 996 MUTEX_UNLOCK(env, hp->mtx_hash); 997 998 /* Lock the region and collect stats and free the space. */ 999 MPOOL_SYSTEM_LOCK(env); 1000 if (need_sync && 1001 (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0) 1002 ret = t_ret; 1003 1004#ifdef HAVE_STATISTICS 1005 /* Copy the statistics into the region. */ 1006 sp = &mp->stat; 1007 sp->st_cache_hit += mfp->stat.st_cache_hit; 1008 sp->st_cache_miss += mfp->stat.st_cache_miss; 1009 sp->st_map += mfp->stat.st_map; 1010 sp->st_page_create += mfp->stat.st_page_create; 1011 sp->st_page_in += mfp->stat.st_page_in; 1012 sp->st_page_out += mfp->stat.st_page_out; 1013#endif 1014 1015 /* Free the space. */ 1016 if (mfp->path_off != 0) 1017 __memp_free(&dbmp->reginfo[0], 1018 R_ADDR(dbmp->reginfo, mfp->path_off)); 1019 if (mfp->fileid_off != 0) 1020 __memp_free(&dbmp->reginfo[0], 1021 R_ADDR(dbmp->reginfo, mfp->fileid_off)); 1022 if (mfp->pgcookie_off != 0) 1023 __memp_free(&dbmp->reginfo[0], 1024 R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); 1025 __memp_free(&dbmp->reginfo[0], mfp); 1026 1027 MPOOL_SYSTEM_UNLOCK(env); 1028 1029 return (ret); 1030} 1031 1032/* 1033 * __memp_inmemlist -- 1034 * Return a list of the named in-memory databases. 1035 * 1036 * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *)); 1037 */ 1038int 1039__memp_inmemlist(env, namesp, cntp) 1040 ENV *env; 1041 char ***namesp; 1042 int *cntp; 1043{ 1044 DB_MPOOL *dbmp; 1045 DB_MPOOL_HASH *hp; 1046 MPOOL *mp; 1047 MPOOLFILE *mfp; 1048 int arraysz, cnt, i, ret; 1049 char **names; 1050 1051 names = NULL; 1052 dbmp = env->mp_handle; 1053 mp = dbmp->reginfo[0].primary; 1054 hp = R_ADDR(dbmp->reginfo, mp->ftab); 1055 1056 arraysz = cnt = 0; 1057 for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) { 1058 MUTEX_LOCK(env, hp->mtx_hash); 1059 SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { 1060 /* Skip dead files and temporary files. */ 1061 if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) 1062 continue; 1063 1064 /* Skip entries that allow files. */ 1065 if (!mfp->no_backing_file) 1066 continue; 1067 1068 /* We found one. */ 1069 if (cnt >= arraysz) { 1070 arraysz += 100; 1071 if ((ret = __os_realloc(env, 1072 (u_int)arraysz * sizeof(names[0]), 1073 &names)) != 0) 1074 goto nomem; 1075 } 1076 if ((ret = __os_strdup(env, 1077 R_ADDR(dbmp->reginfo, mfp->path_off), 1078 &names[cnt])) != 0) 1079 goto nomem; 1080 1081 cnt++; 1082 } 1083 MUTEX_UNLOCK(env, hp->mtx_hash); 1084 } 1085 *namesp = names; 1086 *cntp = cnt; 1087 return (0); 1088 1089nomem: MUTEX_UNLOCK(env, hp->mtx_hash); 1090 if (names != NULL) { 1091 while (--cnt >= 0) 1092 __os_free(env, names[cnt]); 1093 __os_free(env, names); 1094 } 1095 1096 /* Make sure we don't return any garbage. */ 1097 *cntp = 0; 1098 *namesp = NULL; 1099 return (ret); 1100} 1101