1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: mp_fopen.c,v 12.50 2008/01/31 18:40:45 bostic Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/log.h" 13#include "dbinc/mp.h" 14#include "dbinc/db_page.h" 15#include "dbinc/hash.h" 16 17static int __memp_mpf_alloc __P((DB_MPOOL *, 18 DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **)); 19static int __memp_mpf_find __P((ENV *, 20 DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **)); 21 22/* 23 * __memp_fopen_pp -- 24 * DB_MPOOLFILE->open pre/post processing. 25 * 26 * PUBLIC: int __memp_fopen_pp 27 * PUBLIC: __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t)); 28 */ 29int 30__memp_fopen_pp(dbmfp, path, flags, mode, pagesize) 31 DB_MPOOLFILE *dbmfp; 32 const char *path; 33 u_int32_t flags; 34 int mode; 35 size_t pagesize; 36{ 37 DB_THREAD_INFO *ip; 38 ENV *env; 39 int ret; 40 41 env = dbmfp->env; 42 43 /* Validate arguments. */ 44 if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags, 45 DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION | 46 DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) 47 return (ret); 48 49 /* 50 * Require a non-zero, power-of-two pagesize, smaller than the 51 * clear length. 52 */ 53 if (pagesize == 0 || !POWER_OF_TWO(pagesize)) { 54 __db_errx(env, 55 "DB_MPOOLFILE->open: page sizes must be a power-of-2"); 56 return (EINVAL); 57 } 58 if (dbmfp->clear_len > pagesize) { 59 __db_errx(env, 60 "DB_MPOOLFILE->open: clear length larger than page size"); 61 return (EINVAL); 62 } 63 64 /* Read-only checks, and local flag. */ 65 if (LF_ISSET(DB_RDONLY) && path == NULL) { 66 __db_errx(env, 67 "DB_MPOOLFILE->open: temporary files can't be readonly"); 68 return (EINVAL); 69 } 70 71 if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) { 72 __db_errx(env, 73 "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions"); 74 return (EINVAL); 75 } 76 77 ENV_ENTER(env, ip); 78 REPLICATION_WRAP(env, 79 (__memp_fopen(dbmfp, NULL, path, flags, mode, pagesize)), 0, ret); 80 ENV_LEAVE(env, ip); 81 return (ret); 82} 83 84/* 85 * __memp_fopen -- 86 * DB_MPOOLFILE->open. 87 * 88 * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, 89 * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t)); 90 */ 91int 92__memp_fopen(dbmfp, mfp, path, flags, mode, pgsize) 93 DB_MPOOLFILE *dbmfp; 94 MPOOLFILE *mfp; 95 const char *path; 96 u_int32_t flags; 97 int mode; 98 size_t pgsize; 99{ 100 DB_ENV *dbenv; 101 DB_MPOOL *dbmp; 102 DB_MPOOLFILE *tmp_dbmfp; 103 DB_MPOOL_HASH *hp; 104 ENV *env; 105 MPOOL *mp; 106 MPOOLFILE *alloc_mfp; 107 size_t maxmap; 108 db_pgno_t last_pgno; 109 u_int32_t bucket, mbytes, bytes, oflags, pagesize; 110 int refinc, ret; 111 char *rpath; 112 113 /* If this handle is already open, return. */ 114 if (F_ISSET(dbmfp, MP_OPEN_CALLED)) 115 return (0); 116 117 env = dbmfp->env; 118 dbmp = env->mp_handle; 119 dbenv = env->dbenv; 120 mp = dbmp->reginfo[0].primary; 121 alloc_mfp = NULL; 122 mbytes = bytes = 0; 123 refinc = ret = 0; 124 rpath = NULL; 125 126 /* 127 * We're keeping the page size as a size_t in the public API, but 128 * it's a u_int32_t everywhere internally. 129 */ 130 pagesize = (u_int32_t)pgsize; 131 132 /* 133 * We're called internally with a specified mfp, in which case the 134 * path is NULL, but we'll get the path from the underlying region 135 * information. Otherwise, if the path is NULL, it's a temporary 136 * file -- we know we can't join any existing files, and we'll delay 137 * the open until we actually need to write the file. All temporary 138 * files will go into the first hash bucket. 139 */ 140 DB_ASSERT(env, mfp == NULL || path == NULL); 141 142 bucket = 0; 143 hp = R_ADDR(dbmp->reginfo, mp->ftab); 144 if (mfp == NULL) { 145 if (path == NULL) 146 goto alloc; 147 148 /* 149 * Hash to the proper file table entry and walk it. 150 * 151 * The fileID is a filesystem unique number (e.g., a 152 * UNIX dev/inode pair) plus a timestamp. If files are 153 * removed and created in less than a second, the fileID 154 * can be repeated. The problem with repetition happens 155 * when the file that previously had the fileID value still 156 * has pages in the pool, since we don't want to use them 157 * to satisfy requests for the new file. Because the 158 * DB_TRUNCATE flag reuses the dev/inode pair, repeated 159 * opens with that flag set guarantees matching fileIDs 160 * when the machine can open a file and then re-open 161 * with truncate within a second. For this reason, we 162 * pass that flag down, and, if we find a matching entry, 163 * we ensure that it's never found again, and we create 164 * a new entry for the current request. 165 */ 166 167 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 168 bucket = FNBUCKET(path, strlen(path)); 169 else 170 bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN); 171 hp += bucket; 172 173 /* 174 * If we are passed a FILEID find the MPOOLFILE and inc 175 * its ref count. That way it cannot go away while we 176 * open it. 177 */ 178 if (F_ISSET(dbmfp, MP_FILEID_SET)) { 179 MUTEX_LOCK(env, hp->mtx_hash); 180 ret = 181 __memp_mpf_find(env, dbmfp, hp, path, flags,&mfp); 182 MUTEX_UNLOCK(env, hp->mtx_hash); 183 if (ret != 0) 184 goto err; 185 if (mfp != NULL) 186 refinc = 1; 187 } 188 } else { 189 /* 190 * Deadfile can only be set if mpf_cnt goes to zero (or if we 191 * failed creating the file DB_AM_DISCARD). Increment the ref 192 * count so the file cannot become dead and be unlinked. 193 */ 194 MUTEX_LOCK(env, mfp->mutex); 195 if (!mfp->deadfile) { 196 ++mfp->mpf_cnt; 197 refinc = 1; 198 } 199 MUTEX_UNLOCK(env, mfp->mutex); 200 201 /* 202 * Test one last time to see if the file is dead -- it may have 203 * been removed. This happens when a checkpoint trying to open 204 * the file to flush a buffer races with the Db::remove method. 205 * The error will be ignored, so don't output an error message. 206 */ 207 if (mfp->deadfile) 208 return (EINVAL); 209 } 210 211 /* 212 * If there's no backing file, we can join existing files in the cache, 213 * but there's nothing to read from disk. 214 */ 215 if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { 216 /* Convert MP open flags to DB OS-layer open flags. */ 217 oflags = 0; 218 if (LF_ISSET(DB_CREATE)) 219 oflags |= DB_OSO_CREATE; 220 if (LF_ISSET(DB_DIRECT)) 221 oflags |= DB_OSO_DIRECT; 222 if (LF_ISSET(DB_RDONLY)) { 223 F_SET(dbmfp, MP_READONLY); 224 oflags |= DB_OSO_RDONLY; 225 } 226 227 /* 228 * XXX 229 * A grievous layering violation, the DB_DSYNC_DB flag 230 * was left in the ENV structure and not driven through 231 * the cache API. This needs to be fixed when the general 232 * API configuration is fixed. 233 */ 234 if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB)) 235 oflags |= DB_OSO_DSYNC; 236 237 /* 238 * Get the real name for this file and open it. 239 * 240 * Supply a page size so os_open can decide whether to 241 * turn buffering off if the DB_DIRECT_DB flag is set. 242 * 243 * Acquire the region lock if we're using a path from 244 * an underlying MPOOLFILE -- there's a race in accessing 245 * the path name stored in the region, __memp_nameop may 246 * be simultaneously renaming the file. 247 */ 248 if (mfp != NULL) { 249 MPOOL_SYSTEM_LOCK(env); 250 path = R_ADDR(dbmp->reginfo, mfp->path_off); 251 } 252 if ((ret = __db_appname(env, 253 DB_APP_DATA, path, 0, NULL, &rpath)) == 0) 254 ret = __os_open(env, rpath, 255 (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp); 256 if (mfp != NULL) 257 MPOOL_SYSTEM_UNLOCK(env); 258 if (ret != 0) 259 goto err; 260 261 /* 262 * Cache file handles are shared, and have mutexes to 263 * protect the underlying file handle across seek and 264 * read/write calls. 265 */ 266 dbmfp->fhp->ref = 1; 267 if ((ret = __mutex_alloc(env, MTX_MPOOL_FH, 268 DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0) 269 goto err; 270 271 /* 272 * Figure out the file's size. 273 * 274 * !!! 275 * We can't use off_t's here, or in any code in the mainline 276 * library for that matter. (We have to use them in the 277 * os stubs, of course, as there are system calls that 278 * take them as arguments.) The reason is some customers 279 * build in environments where an off_t is 32-bits, but 280 * still run where offsets are 64-bits, and they pay us 281 * a lot of money. 282 */ 283 if ((ret = __os_ioinfo( 284 env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) { 285 __db_err(env, ret, "%s", rpath); 286 goto err; 287 } 288 289 /* 290 * Don't permit files that aren't a multiple of the pagesize, 291 * and find the number of the last page in the file, all the 292 * time being careful not to overflow 32 bits. 293 * 294 * During verify or recovery, we might have to cope with a 295 * truncated file; if the file size is not a multiple of the 296 * page size, round down to a page, we'll take care of the 297 * partial page outside the mpool system. 298 */ 299 DB_ASSERT(env, pagesize != 0); 300 if (bytes % pagesize != 0) { 301 if (LF_ISSET(DB_ODDFILESIZE)) 302 bytes -= (u_int32_t)(bytes % pagesize); 303 else { 304 __db_errx(env, 305 "%s: file size not a multiple of the pagesize", rpath); 306 ret = EINVAL; 307 goto err; 308 } 309 } 310 311 /* 312 * Get the file id if we weren't given one. Generated file id's 313 * don't use timestamps, otherwise there'd be no chance of any 314 * other process joining the party. Don't bother looking for 315 * this id in the hash table, its new. 316 */ 317 if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) { 318 if ((ret = 319 __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0) 320 goto err; 321 F_SET(dbmfp, MP_FILEID_SET); 322 goto alloc; 323 } 324 } 325 326 if (mfp != NULL) 327 goto have_mfp; 328 329 /* 330 * We can race with another process opening the same file when 331 * we allocate the mpoolfile structure. We will come back 332 * here and check the hash table again to see if it has appeared. 333 * For most files this is not a problem, since the name is locked 334 * at a higher layer but QUEUE extent files are not locked. 335 */ 336check: MUTEX_LOCK(env, hp->mtx_hash); 337 if ((ret = __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp) != 0)) 338 goto err; 339 340 if (alloc_mfp != NULL && mfp == NULL) { 341 mfp = alloc_mfp; 342 alloc_mfp = NULL; 343 SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile); 344 } else if (mfp != NULL) { 345 /* 346 * Some things about a file cannot be changed: the clear length, 347 * page size, or LSN location. However, if this is an attempt 348 * to open a named in-memory file, we may not yet have that 349 * information. so accept uninitialized entries. 350 * 351 * The file type can change if the application's pre- and post- 352 * processing needs change. For example, an application that 353 * created a hash subdatabase in a database that was previously 354 * all btree. 355 * 356 * !!! 357 * We do not check to see if the pgcookie information changed, 358 * or update it if it is. 359 */ 360 if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET && 361 mfp->clear_len != DB_CLEARLEN_NOTSET && 362 dbmfp->clear_len != mfp->clear_len) || 363 (pagesize != 0 && pagesize != mfp->stat.st_pagesize) || 364 (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET && 365 mfp->lsn_off != DB_LSN_OFF_NOTSET && 366 dbmfp->lsn_offset != mfp->lsn_off)) { 367 __db_errx(env, 368 "%s: clear length, page size or LSN location changed", 369 path); 370 MUTEX_UNLOCK(env, hp->mtx_hash); 371 ret = EINVAL; 372 goto err; 373 } 374 } 375 376 MUTEX_UNLOCK(env, hp->mtx_hash); 377 if (alloc_mfp != NULL) { 378 MUTEX_LOCK(env, alloc_mfp->mutex); 379 if ((ret = __memp_mf_discard(dbmp, alloc_mfp)) != 0) 380 goto err; 381 } 382 383 if (mfp == NULL) { 384 /* 385 * If we didn't find the file and this is an in-memory file, 386 * then the create flag should be set. 387 */ 388 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) && 389 !LF_ISSET(DB_CREATE)) { 390 ret = ENOENT; 391 goto err; 392 } 393 394alloc: /* 395 * Get the file ID if we weren't given one. Generated file 396 * ID's don't use timestamps, otherwise there'd be no 397 * chance of any other process joining the party. 398 */ 399 if (path != NULL && 400 !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) && 401 !F_ISSET(dbmfp, MP_FILEID_SET) && (ret = 402 __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0) 403 goto err; 404 405 if ((ret = __memp_mpf_alloc(dbmp, 406 dbmfp, path, pagesize, flags, &alloc_mfp)) != 0) 407 goto err; 408 409 /* 410 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a 411 * page get, we have to increment the last page in the file. 412 * Figure it out and save it away. 413 * 414 * Note correction: page numbers are zero-based, not 1-based. 415 */ 416 DB_ASSERT(env, pagesize != 0); 417 last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize)); 418 last_pgno += (db_pgno_t)(bytes / pagesize); 419 if (last_pgno != 0) 420 --last_pgno; 421 422 alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno = 423 alloc_mfp->last_pgno = last_pgno; 424 425 alloc_mfp->bucket = bucket; 426 427 /* Go back and see if someone else has opened the file. */ 428 if (path != NULL) 429 goto check; 430 431 mfp = alloc_mfp; 432 /* This is a temp, noone else can see it, put it at the end. */ 433 MUTEX_LOCK(env, hp->mtx_hash); 434 SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q); 435 MUTEX_UNLOCK(env, hp->mtx_hash); 436 } 437have_mfp: 438 /* 439 * We need to verify that all handles open a file either durable or not 440 * durable. This needs to be cross process and cross sub-databases, so 441 * mpool is the place to do it. 442 */ 443 if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) { 444 if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) { 445 if (LF_ISSET(MP_NOT_DURABLE)) 446 F_SET(mfp, MP_NOT_DURABLE); 447 F_CLR(mfp, MP_DURABLE_UNKNOWN); 448 } else if (!LF_ISSET(DB_TXN_NOT_DURABLE) != 449 !F_ISSET(mfp, MP_NOT_DURABLE)) { 450 __db_errx(env, 451 "Cannot open DURABLE and NOT DURABLE handles in the same file"); 452 ret = EINVAL; 453 goto err; 454 } 455 } 456 457 if (LF_ISSET(DB_MULTIVERSION)) { 458 ++mfp->multiversion; 459 F_SET(dbmfp, MP_MULTIVERSION); 460 } 461 462 /* 463 * All paths to here have initialized the mfp variable to reference 464 * the selected (or allocated) MPOOLFILE. 465 */ 466 dbmfp->mfp = mfp; 467 468 /* 469 * Check to see if we can mmap the file. If a file: 470 * + isn't temporary 471 * + is read-only 472 * + doesn't require any pgin/pgout support 473 * + the DB_NOMMAP flag wasn't set (in either the file open or 474 * the environment in which it was opened) 475 * + and is less than mp_mmapsize bytes in size 476 * 477 * we can mmap it instead of reading/writing buffers. Don't do error 478 * checking based on the mmap call failure. We want to do normal I/O 479 * on the file if the reason we failed was because the file was on an 480 * NFS mounted partition, and we can fail in buffer I/O just as easily 481 * as here. 482 * 483 * We'd like to test to see if the file is too big to mmap. Since we 484 * don't know what size or type off_t's or size_t's are, or the largest 485 * unsigned integral type is, or what random insanity the local C 486 * compiler will perpetrate, doing the comparison in a portable way is 487 * flatly impossible. Hope that mmap fails if the file is too large. 488 */ 489#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */ 490 if (F_ISSET(mfp, MP_CAN_MMAP)) { 491 maxmap = dbenv->mp_mmapsize == 0 ? 492 DB_MAXMMAPSIZE : dbenv->mp_mmapsize; 493 if (path == NULL || 494 FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 495 F_CLR(mfp, MP_CAN_MMAP); 496 else if (!F_ISSET(dbmfp, MP_READONLY)) 497 F_CLR(mfp, MP_CAN_MMAP); 498 else if (dbmfp->ftype != 0) 499 F_CLR(mfp, MP_CAN_MMAP); 500 else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) 501 F_CLR(mfp, MP_CAN_MMAP); 502 else { 503 MPOOL_SYSTEM_LOCK(env); 504 maxmap = mp->mp_mmapsize == 0 ? 505 DB_MAXMMAPSIZE : mp->mp_mmapsize; 506 MPOOL_SYSTEM_UNLOCK(env); 507 if (mbytes > maxmap / MEGABYTE || 508 (mbytes == maxmap / MEGABYTE && 509 bytes >= maxmap % MEGABYTE)) 510 F_CLR(mfp, MP_CAN_MMAP); 511 } 512 513 dbmfp->addr = NULL; 514 if (F_ISSET(mfp, MP_CAN_MMAP)) { 515 dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; 516 if (__os_mapfile(env, rpath, 517 dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) { 518 dbmfp->addr = NULL; 519 F_CLR(mfp, MP_CAN_MMAP); 520 } 521 } 522 } 523 524 F_SET(dbmfp, MP_OPEN_CALLED); 525 526 /* 527 * Share the underlying file descriptor if that's possible. 528 * 529 * Add the file to the process' list of DB_MPOOLFILEs. 530 */ 531 MUTEX_LOCK(env, dbmp->mutex); 532 533 if (dbmfp->fhp != NULL) 534 TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q) 535 if (dbmfp->mfp == tmp_dbmfp->mfp && 536 (F_ISSET(dbmfp, MP_READONLY) || 537 !F_ISSET(tmp_dbmfp, MP_READONLY))) { 538 (void)__mutex_free(env, &dbmfp->fhp->mtx_fh); 539 (void)__os_closehandle(env, dbmfp->fhp); 540 ++tmp_dbmfp->fhp->ref; 541 dbmfp->fhp = tmp_dbmfp->fhp; 542 break; 543 } 544 545 TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); 546 547 MUTEX_UNLOCK(env, dbmp->mutex); 548 549 if (0) { 550err: if (refinc) { 551 /* 552 * If mpf_cnt goes to zero here and unlink_on_close is 553 * set, then we missed the last close, but there was an 554 * error trying to open the file, so we probably cannot 555 * unlink it anyway. 556 */ 557 MUTEX_LOCK(env, mfp->mutex); 558 --mfp->mpf_cnt; 559 MUTEX_UNLOCK(env, mfp->mutex); 560 } 561 562 } 563 if (rpath != NULL) 564 __os_free(env, rpath); 565 return (ret); 566} 567 568/* 569 * __memp_mpf_find -- 570 * Search a hash bucket for a MPOOLFILE. 571 */ 572static int 573__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp) 574 ENV *env; 575 DB_MPOOLFILE *dbmfp; 576 DB_MPOOL_HASH *hp; 577 const char *path; 578 u_int32_t flags; 579 MPOOLFILE **mfpp; 580{ 581 DB_MPOOL *dbmp; 582 MPOOLFILE *mfp; 583 584 dbmp = env->mp_handle; 585 586 SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { 587 /* Skip dead files and temporary files. */ 588 if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) 589 continue; 590 591 /* 592 * Any remaining DB_MPOOL_NOFILE databases are in-memory 593 * named databases and need only match other in-memory 594 * databases with the same name. 595 */ 596 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { 597 if (!mfp->no_backing_file) 598 continue; 599 600 if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off))) 601 continue; 602 603 /* 604 * We matched an in-memory file; grab the fileid if 605 * it is set in the region, but not in the dbmfp. 606 */ 607 if (!F_ISSET(dbmfp, MP_FILEID_SET)) 608 (void)__memp_set_fileid(dbmfp, 609 R_ADDR(dbmp->reginfo, mfp->fileid_off)); 610 } else 611 if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, 612 mfp->fileid_off), DB_FILE_ID_LEN) != 0) 613 continue; 614 615 /* 616 * If the file is being truncated, remove it from the system 617 * and create a new entry. 618 * 619 * !!! 620 * We should be able to set mfp to NULL and break out of the 621 * loop, but I like the idea of checking all the entries. 622 */ 623 if (LF_ISSET(DB_TRUNCATE)) { 624 MUTEX_LOCK(env, mfp->mutex); 625 mfp->deadfile = 1; 626 MUTEX_UNLOCK(env, mfp->mutex); 627 continue; 628 } 629 630 /* 631 * Check to see if this file has died while we waited. 632 * 633 * We normally don't lock the deadfile field when we read it as 634 * we only care if the field is zero or non-zero. We do lock 635 * on read when searching for a matching MPOOLFILE so that two 636 * threads of control don't race between setting the deadfile 637 * bit and incrementing the reference count, that is, a thread 638 * of control decrementing the reference count and then setting 639 * deadfile because the reference count is 0 blocks us finding 640 * the file without knowing it's about to be marked dead. 641 */ 642 MUTEX_LOCK(env, mfp->mutex); 643 if (mfp->deadfile) { 644 MUTEX_UNLOCK(env, mfp->mutex); 645 continue; 646 } 647 ++mfp->mpf_cnt; 648 MUTEX_UNLOCK(env, mfp->mutex); 649 650 /* Initialize any fields that are not yet set. */ 651 if (dbmfp->ftype != 0) 652 mfp->ftype = dbmfp->ftype; 653 if (dbmfp->clear_len != DB_CLEARLEN_NOTSET) 654 mfp->clear_len = dbmfp->clear_len; 655 if (dbmfp->lsn_offset != -1) 656 mfp->lsn_off = dbmfp->lsn_offset; 657 658 break; 659 } 660 661 *mfpp = mfp; 662 return (0); 663} 664 665static int 666__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp) 667 DB_MPOOL *dbmp; 668 DB_MPOOLFILE *dbmfp; 669 const char *path; 670 u_int32_t pagesize; 671 u_int32_t flags; 672 MPOOLFILE **retmfp; 673{ 674 ENV *env; 675 MPOOLFILE *mfp; 676 int ret; 677 void *p; 678 679 env = dbmp->env; 680 ret = 0; 681 /* Allocate and initialize a new MPOOLFILE. */ 682 if ((ret = __memp_alloc(dbmp, 683 dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) 684 goto err; 685 memset(mfp, 0, sizeof(MPOOLFILE)); 686 mfp->mpf_cnt = 1; 687 mfp->ftype = dbmfp->ftype; 688 mfp->stat.st_pagesize = pagesize; 689 mfp->lsn_off = dbmfp->lsn_offset; 690 mfp->clear_len = dbmfp->clear_len; 691 mfp->priority = dbmfp->priority; 692 if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) { 693 mfp->maxpgno = (db_pgno_t) 694 (dbmfp->gbytes * (GIGABYTE / mfp->stat.st_pagesize)); 695 mfp->maxpgno += (db_pgno_t) 696 ((dbmfp->bytes + mfp->stat.st_pagesize - 1) / 697 mfp->stat.st_pagesize); 698 } 699 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 700 mfp->no_backing_file = 1; 701 if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK)) 702 mfp->unlink_on_close = 1; 703 704 if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) 705 F_SET(mfp, MP_DURABLE_UNKNOWN); 706 if (LF_ISSET(DB_DIRECT)) 707 F_SET(mfp, MP_DIRECT); 708 if (LF_ISSET(DB_EXTENT)) 709 F_SET(mfp, MP_EXTENT); 710 if (LF_ISSET(DB_TXN_NOT_DURABLE)) 711 F_SET(mfp, MP_NOT_DURABLE); 712 F_SET(mfp, MP_CAN_MMAP); 713 714 /* 715 * An in-memory database with no name is a temp file. Named 716 * in-memory databases get an artificially bumped reference 717 * count so they don't disappear on close; they need a remove 718 * to make them disappear. 719 */ 720 if (path == NULL) 721 F_SET(mfp, MP_TEMP); 722 else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) 723 mfp->mpf_cnt++; 724 725 /* Copy the file identification string into shared memory. */ 726 if (F_ISSET(dbmfp, MP_FILEID_SET)) { 727 if ((ret = __memp_alloc(dbmp, dbmp->reginfo, 728 NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) 729 goto err; 730 memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN); 731 } 732 733 /* Copy the file path into shared memory. */ 734 if (path != NULL) { 735 if ((ret = __memp_alloc(dbmp, dbmp->reginfo, 736 NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) 737 goto err; 738 memcpy(p, path, strlen(path) + 1); 739 } 740 741 /* Copy the page cookie into shared memory. */ 742 if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) { 743 mfp->pgcookie_len = 0; 744 mfp->pgcookie_off = 0; 745 } else { 746 if ((ret = __memp_alloc(dbmp, dbmp->reginfo, 747 NULL, dbmfp->pgcookie->size, 748 &mfp->pgcookie_off, &p)) != 0) 749 goto err; 750 memcpy(p, 751 dbmfp->pgcookie->data, dbmfp->pgcookie->size); 752 mfp->pgcookie_len = dbmfp->pgcookie->size; 753 } 754 755 if ((ret = __mutex_alloc(env, 756 MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0) 757 goto err; 758 *retmfp = mfp; 759 760err: return (ret); 761} 762 763/* 764 * memp_fclose_pp -- 765 * DB_MPOOLFILE->close pre/post processing. 766 * 767 * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t)); 768 */ 769int 770__memp_fclose_pp(dbmfp, flags) 771 DB_MPOOLFILE *dbmfp; 772 u_int32_t flags; 773{ 774 DB_THREAD_INFO *ip; 775 ENV *env; 776 int ret; 777 778 env = dbmfp->env; 779 780 /* 781 * Validate arguments, but as a handle destructor, we can't fail. 782 */ 783 if (flags != 0) 784 (void)__db_ferr(env, "DB_MPOOLFILE->close", 0); 785 786 ENV_ENTER(env, ip); 787 REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret); 788 ENV_LEAVE(env, ip); 789 return (ret); 790} 791 792/* 793 * __memp_fclose -- 794 * DB_MPOOLFILE->close. 795 * 796 * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t)); 797 */ 798int 799__memp_fclose(dbmfp, flags) 800 DB_MPOOLFILE *dbmfp; 801 u_int32_t flags; 802{ 803 DB_MPOOL *dbmp; 804 ENV *env; 805 MPOOLFILE *mfp; 806 char *rpath; 807 u_int32_t ref; 808 int deleted, ret, t_ret; 809 810 env = dbmfp->env; 811 dbmp = env->mp_handle; 812 ret = 0; 813 814 /* 815 * Remove the DB_MPOOLFILE from the process' list. 816 * 817 * It's possible the underlying mpool cache may never have been created. 818 * In that case, all we have is a structure, discard it. 819 * 820 * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE 821 * file list, check the MP_OPEN_CALLED flag to be sure. 822 */ 823 if (dbmp == NULL) 824 goto done; 825 826 MUTEX_LOCK(env, dbmp->mutex); 827 828 DB_ASSERT(env, dbmfp->ref >= 1); 829 if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED)) 830 TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); 831 832 /* 833 * Decrement the file descriptor's ref count -- if we're the last ref, 834 * we'll discard the file descriptor. 835 */ 836 if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0) 837 dbmfp->fhp = NULL; 838 MUTEX_UNLOCK(env, dbmp->mutex); 839 if (ref != 0) 840 return (0); 841 842 /* Complain if pinned blocks never returned. */ 843 if (dbmfp->pinref != 0) { 844 __db_errx(env, "%s: close: %lu blocks left pinned", 845 __memp_fn(dbmfp), (u_long)dbmfp->pinref); 846 ret = __env_panic(env, DB_RUNRECOVERY); 847 } 848 849 /* Discard any mmap information. */ 850 if (dbmfp->addr != NULL && 851 (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0) 852 __db_err(env, ret, "%s", __memp_fn(dbmfp)); 853 854 /* 855 * Close the file and discard the descriptor structure; temporary 856 * files may not yet have been created. 857 */ 858 if (dbmfp->fhp != NULL) { 859 if ((t_ret = 860 __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0) 861 ret = t_ret; 862 if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) { 863 __db_err(env, t_ret, "%s", __memp_fn(dbmfp)); 864 if (ret == 0) 865 ret = t_ret; 866 } 867 dbmfp->fhp = NULL; 868 } 869 870 /* 871 * Discard our reference on the underlying MPOOLFILE, and close it 872 * if it's no longer useful to anyone. It possible the open of the 873 * file never happened or wasn't successful, in which case, mpf will 874 * be NULL and MP_OPEN_CALLED will not be set. 875 */ 876 mfp = dbmfp->mfp; 877 DB_ASSERT(env, 878 (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) || 879 (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL)); 880 if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) 881 goto done; 882 883 /* 884 * If it's a temp file, all outstanding references belong to unflushed 885 * buffers. (A temp file can only be referenced by one DB_MPOOLFILE). 886 * We don't care about preserving any of those buffers, so mark the 887 * MPOOLFILE as dead so that even the dirty ones just get discarded 888 * when we try to flush them. 889 */ 890 deleted = 0; 891 if (!LF_ISSET(DB_MPOOL_NOLOCK)) 892 MUTEX_LOCK(env, mfp->mutex); 893 if (F_ISSET(dbmfp, MP_MULTIVERSION)) 894 --mfp->multiversion; 895 if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) { 896 if (LF_ISSET(DB_MPOOL_DISCARD) || 897 F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) { 898 mfp->deadfile = 1; 899 } 900 if (mfp->unlink_on_close) { 901 if ((t_ret = __db_appname(dbmp->env, 902 DB_APP_DATA, R_ADDR(dbmp->reginfo, 903 mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0) 904 ret = t_ret; 905 if (t_ret == 0) { 906 if ((t_ret = __os_unlink( 907 dbmp->env, rpath, 0)) != 0 && ret == 0) 908 ret = t_ret; 909 __os_free(env, rpath); 910 } 911 } 912 if (mfp->block_cnt == 0) { 913 /* 914 * We should never discard this mp file if our caller 915 * is holding the lock on it. See comment in 916 * __memp_sync_file. 917 */ 918 DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK)); 919 if ((t_ret = 920 __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) 921 ret = t_ret; 922 deleted = 1; 923 } 924 } 925 if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK)) 926 MUTEX_UNLOCK(env, mfp->mutex); 927 928done: /* Discard the DB_MPOOLFILE structure. */ 929 if (dbmfp->pgcookie != NULL) { 930 __os_free(env, dbmfp->pgcookie->data); 931 __os_free(env, dbmfp->pgcookie); 932 } 933 __os_free(env, dbmfp); 934 935 return (ret); 936} 937 938/* 939 * __memp_mf_discard -- 940 * Discard an MPOOLFILE. 941 * 942 * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); 943 */ 944int 945__memp_mf_discard(dbmp, mfp) 946 DB_MPOOL *dbmp; 947 MPOOLFILE *mfp; 948{ 949 DB_MPOOL_HASH *hp; 950 ENV *env; 951#ifdef HAVE_STATISTICS 952 DB_MPOOL_STAT *sp; 953#endif 954 MPOOL *mp; 955 int need_sync, ret, t_ret; 956 957 env = dbmp->env; 958 mp = dbmp->reginfo[0].primary; 959 hp = R_ADDR(dbmp->reginfo, mp->ftab); 960 hp += mfp->bucket; 961 ret = 0; 962 963 /* 964 * Expects caller to be holding the MPOOLFILE mutex. 965 * 966 * When discarding a file, we have to flush writes from it to disk. 967 * The scenario is that dirty buffers from this file need to be 968 * flushed to satisfy a future checkpoint, but when the checkpoint 969 * calls mpool sync, the sync code won't know anything about them. 970 * Ignore files not written, discarded, or only temporary. 971 */ 972 need_sync = 973 mfp->file_written && !mfp->deadfile && !F_ISSET(mfp, MP_TEMP); 974 975 /* 976 * We have to release the MPOOLFILE mutex before acquiring the region 977 * mutex so we don't deadlock. Make sure nobody ever looks at this 978 * structure again. 979 */ 980 mfp->deadfile = 1; 981 982 /* Discard the mutex we're holding and return it too the pool. */ 983 MUTEX_UNLOCK(env, mfp->mutex); 984 if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0) 985 ret = t_ret; 986 987 /* Lock the bucket and delete from the list of MPOOLFILEs. */ 988 MUTEX_LOCK(env, hp->mtx_hash); 989 SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile); 990 MUTEX_UNLOCK(env, hp->mtx_hash); 991 992 /* Lock the region and collect stats and free the space. */ 993 MPOOL_SYSTEM_LOCK(env); 994 if (need_sync && 995 (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0) 996 ret = t_ret; 997 998#ifdef HAVE_STATISTICS 999 /* Copy the statistics into the region. */ 1000 sp = &mp->stat; 1001 sp->st_cache_hit += mfp->stat.st_cache_hit; 1002 sp->st_cache_miss += mfp->stat.st_cache_miss; 1003 sp->st_map += mfp->stat.st_map; 1004 sp->st_page_create += mfp->stat.st_page_create; 1005 sp->st_page_in += mfp->stat.st_page_in; 1006 sp->st_page_out += mfp->stat.st_page_out; 1007#endif 1008 1009 /* Free the space. */ 1010 if (mfp->path_off != 0) 1011 __memp_free(&dbmp->reginfo[0], NULL, 1012 R_ADDR(dbmp->reginfo, mfp->path_off)); 1013 if (mfp->fileid_off != 0) 1014 __memp_free(&dbmp->reginfo[0], NULL, 1015 R_ADDR(dbmp->reginfo, mfp->fileid_off)); 1016 if (mfp->pgcookie_off != 0) 1017 __memp_free(&dbmp->reginfo[0], NULL, 1018 R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); 1019 __memp_free(&dbmp->reginfo[0], NULL, mfp); 1020 1021 MPOOL_SYSTEM_UNLOCK(env); 1022 1023 return (ret); 1024} 1025 1026/* 1027 * __memp_inmemlist -- 1028 * Return a list of the named in-memory databases. 1029 * 1030 * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *)); 1031 */ 1032int 1033__memp_inmemlist(env, namesp, cntp) 1034 ENV *env; 1035 char ***namesp; 1036 int *cntp; 1037{ 1038 DB_MPOOL *dbmp; 1039 DB_MPOOL_HASH *hp; 1040 MPOOL *mp; 1041 MPOOLFILE *mfp; 1042 int arraysz, cnt, i, ret; 1043 char **names; 1044 1045 names = NULL; 1046 dbmp = env->mp_handle; 1047 mp = dbmp->reginfo[0].primary; 1048 hp = R_ADDR(dbmp->reginfo, mp->ftab); 1049 1050 arraysz = cnt = 0; 1051 for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) { 1052 MUTEX_LOCK(env, hp->mtx_hash); 1053 SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { 1054 /* Skip dead files and temporary files. */ 1055 if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) 1056 continue; 1057 1058 /* Skip entries that allow files. */ 1059 if (!mfp->no_backing_file) 1060 continue; 1061 1062 /* We found one. */ 1063 if (cnt >= arraysz) { 1064 arraysz += 100; 1065 if ((ret = __os_realloc(env, 1066 (u_int)arraysz * sizeof(names[0]), 1067 &names)) != 0) 1068 goto nomem; 1069 } 1070 if ((ret = __os_strdup(env, 1071 R_ADDR(dbmp->reginfo, mfp->path_off), 1072 &names[cnt])) != 0) 1073 goto nomem; 1074 1075 cnt++; 1076 } 1077 MUTEX_UNLOCK(env, hp->mtx_hash); 1078 } 1079 *namesp = names; 1080 *cntp = cnt; 1081 return (0); 1082 1083nomem: MUTEX_UNLOCK(env, hp->mtx_hash); 1084 if (names != NULL) { 1085 while (--cnt >= 0) 1086 __os_free(env, names[cnt]); 1087 __os_free(env, names); 1088 } 1089 1090 /* Make sure we don't return any garbage. */ 1091 *cntp = 0; 1092 *namesp = NULL; 1093 return (ret); 1094} 1095