1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: env_region.c,v 12.45 2008/01/31 18:40:43 bostic Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/mp.h" 13 14static int __env_des_get __P((ENV *, REGINFO *, REGINFO *, REGION **)); 15static int __env_faultmem __P((ENV *, void *, size_t, int)); 16static int __env_sys_attach __P((ENV *, REGINFO *, REGION *)); 17static int __env_sys_detach __P((ENV *, REGINFO *, int)); 18static void __env_des_destroy __P((ENV *, REGION *)); 19static void __env_remove_file __P((ENV *)); 20 21/* 22 * __env_attach 23 * Join/create the environment 24 * 25 * PUBLIC: int __env_attach __P((ENV *, u_int32_t *, int, int)); 26 */ 27int 28__env_attach(env, init_flagsp, create_ok, retry_ok) 29 ENV *env; 30 u_int32_t *init_flagsp; 31 int create_ok, retry_ok; 32{ 33 DB_ENV *dbenv; 34 REGENV *renv; 35 REGENV_REF ref; 36 REGINFO *infop; 37 REGION *rp, tregion; 38 size_t nrw, size; 39 u_int32_t bytes, i, mbytes, nregions, signature; 40 u_int retry_cnt; 41 int majver, minver, patchver, ret, segid; 42 char buf[sizeof(DB_REGION_FMT) + 20]; 43 44 /* Initialization */ 45 dbenv = env->dbenv; 46 retry_cnt = 0; 47 signature = __env_struct_sig(); 48 49 /* Repeated initialization. */ 50loop: renv = NULL; 51 52 /* Set up the ENV's REG_INFO structure. */ 53 if ((ret = __os_calloc(env, 1, sizeof(REGINFO), &infop)) != 0) 54 return (ret); 55 infop->env = env; 56 infop->type = REGION_TYPE_ENV; 57 infop->id = REGION_ID_ENV; 58 infop->flags = REGION_JOIN_OK; 59 if (create_ok) 60 F_SET(infop, REGION_CREATE_OK); 61 62 /* Build the region name. */ 63 if (F_ISSET(env, ENV_PRIVATE)) 64 ret = __os_strdup(env, "process-private", &infop->name); 65 else { 66 (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV); 67 ret = 68 __db_appname(env, DB_APP_NONE, buf, 0, NULL, &infop->name); 69 } 70 if (ret != 0) 71 goto err; 72 73 /* 74 * We have to single-thread the creation of the REGENV region. Once 75 * it exists, we can serialize using region mutexes, but until then 76 * we have to be the only player in the game. 77 * 78 * If this is a private environment, we are only called once and there 79 * are no possible race conditions. 80 * 81 * If this is a public environment, we use the filesystem to ensure 82 * the creation of the environment file is single-threaded. 83 * 84 * If the application has specified their own mapping functions, try 85 * and create the region. The application will have to let us know if 86 * it's actually a creation or not, and we'll have to fall-back to a 87 * join if it's not a create. 88 */ 89 if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL) 90 goto creation; 91 92 /* 93 * Try to create the file, if we have the authority. We have to ensure 94 * that multiple threads/processes attempting to simultaneously create 95 * the file are properly ordered. Open using the O_CREAT and O_EXCL 96 * flags so that multiple attempts to create the region will return 97 * failure in all but one. POSIX 1003.1 requires that EEXIST be the 98 * errno return value -- I sure hope they're right. 99 */ 100 if (create_ok) { 101 if ((ret = __os_open(env, infop->name, 0, 102 DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_REGION, 103 env->db_mode, &env->lockfhp)) == 0) 104 goto creation; 105 if (ret != EEXIST) { 106 __db_err(env, ret, "%s", infop->name); 107 goto err; 108 } 109 } 110 111 /* The region must exist, it's not okay to recreate it. */ 112 F_CLR(infop, REGION_CREATE_OK); 113 114 /* 115 * If we couldn't create the file, try and open it. (If that fails, 116 * we're done.) 117 */ 118 if ((ret = __os_open( 119 env, infop->name, 0, DB_OSO_REGION, 0, &env->lockfhp)) != 0) 120 goto err; 121 122 /* 123 * !!! 124 * The region may be in system memory not backed by the filesystem 125 * (more specifically, not backed by this file), and we're joining 126 * it. In that case, the process that created it will have written 127 * out a REGENV_REF structure as its only contents. We read that 128 * structure before we do anything further, e.g., we can't just map 129 * that file in and then figure out what's going on. 130 * 131 * All of this noise is because some systems don't have a coherent VM 132 * and buffer cache, and what's worse, when you mix operations on the 133 * VM and buffer cache, half the time you hang the system. 134 * 135 * If the file is the size of an REGENV_REF structure, then we know 136 * the real region is in some other memory. (The only way you get a 137 * file that size is to deliberately write it, as it's smaller than 138 * any possible disk sector created by writing a file or mapping the 139 * file into memory.) In which case, retrieve the structure from the 140 * file and use it to acquire the referenced memory. 141 * 142 * If the structure is larger than a REGENV_REF structure, then this 143 * file is backing the shared memory region, and we just map it into 144 * memory. 145 * 146 * And yes, this makes me want to take somebody and kill them. (I 147 * digress -- but you have no freakin' idea. This is unbelievably 148 * stupid and gross, and I've probably spent six months of my life, 149 * now, trying to make different versions of it work.) 150 */ 151 if ((ret = __os_ioinfo(env, infop->name, 152 env->lockfhp, &mbytes, &bytes, NULL)) != 0) { 153 __db_err(env, ret, "%s", infop->name); 154 goto err; 155 } 156 157 /* 158 * !!! 159 * A size_t is OK -- regions get mapped into memory, and so can't 160 * be larger than a size_t. 161 */ 162 size = mbytes * MEGABYTE + bytes; 163 164 /* 165 * If the size is less than the size of a REGENV_REF structure, the 166 * region (or, possibly, the REGENV_REF structure) has not yet been 167 * completely written. Shouldn't be possible, but there's no reason 168 * not to wait awhile and try again. 169 * 170 * Otherwise, if the size is the size of a REGENV_REF structure, 171 * read it into memory and use it as a reference to the real region. 172 */ 173 if (size <= sizeof(ref)) { 174 if (size != sizeof(ref)) 175 goto retry; 176 177 if ((ret = __os_read(env, env->lockfhp, &ref, 178 sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) { 179 if (ret == 0) 180 ret = EIO; 181 __db_err(env, ret, 182 "%s: unable to read system-memory information", 183 infop->name); 184 goto err; 185 } 186 size = ref.size; 187 segid = ref.segid; 188 189 F_SET(env, ENV_SYSTEM_MEM); 190 } else if (F_ISSET(env, ENV_SYSTEM_MEM)) { 191 ret = EINVAL; 192 __db_err(env, ret, 193 "%s: existing environment not created in system memory", 194 infop->name); 195 goto err; 196 } else 197 segid = INVALID_REGION_SEGID; 198 199#ifndef HAVE_MUTEX_FCNTL 200 /* 201 * If we're not doing fcntl locking, we can close the file handle. We 202 * no longer need it and the less contact between the buffer cache and 203 * the VM, the better. 204 */ 205 (void)__os_closehandle(env, env->lockfhp); 206 env->lockfhp = NULL; 207#endif 208 209 /* Call the region join routine to acquire the region. */ 210 memset(&tregion, 0, sizeof(tregion)); 211 tregion.size = (roff_t)size; 212 tregion.segid = segid; 213 if ((ret = __env_sys_attach(env, infop, &tregion)) != 0) 214 goto err; 215 216user_map_functions: 217 /* 218 * The environment's REGENV structure has to live at offset 0 instead 219 * of the usual alloc information. Set the primary reference and 220 * correct the "addr" value to reference the alloc region. Note, 221 * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted 222 * as well, but that should be fine. 223 */ 224 infop->primary = infop->addr; 225 infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV); 226 renv = infop->primary; 227 228 /* 229 * Make sure the region matches our build. Special case a region 230 * that's all nul bytes, just treat it like any other corruption. 231 */ 232 if (renv->majver != DB_VERSION_MAJOR || 233 renv->minver != DB_VERSION_MINOR) { 234 if (renv->majver != 0 || renv->minver != 0) { 235 __db_errx(env, 236 "Program version %d.%d doesn't match environment version %d.%d", 237 DB_VERSION_MAJOR, DB_VERSION_MINOR, 238 renv->majver, renv->minver); 239 ret = DB_VERSION_MISMATCH; 240 } else 241 ret = EINVAL; 242 goto err; 243 } 244 if (renv->signature != signature) { 245 __db_errx(env, "Build signature doesn't match environment"); 246 ret = DB_VERSION_MISMATCH; 247 goto err; 248 } 249 250 /* 251 * Check if the environment has had a catastrophic failure. 252 * 253 * Check the magic number to ensure the region is initialized. If the 254 * magic number isn't set, the lock may not have been initialized, and 255 * an attempt to use it could lead to random behavior. 256 * 257 * The panic and magic values aren't protected by any lock, so we never 258 * use them in any check that's more complex than set/not-set. 259 * 260 * !!! 261 * I'd rather play permissions games using the underlying file, but I 262 * can't because Windows/NT filesystems won't open files mode 0. 263 */ 264 if (renv->panic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) { 265 ret = __env_panic_msg(env); 266 goto err; 267 } 268 if (renv->magic != DB_REGION_MAGIC) 269 goto retry; 270 271 /* 272 * Get a reference to the underlying REGION information for this 273 * environment. 274 */ 275 if ((ret = __env_des_get(env, infop, infop, &rp)) != 0 || rp == NULL) 276 goto find_err; 277 infop->rp = rp; 278 279 /* 280 * There's still a possibility for inconsistent data. When we acquired 281 * the size of the region and attached to it, it might have still been 282 * growing as part of its creation. We can detect this by checking the 283 * size we originally found against the region's current size. (The 284 * region's current size has to be final, the creator finished growing 285 * it before setting the magic number in the region.) 286 * 287 * !!! 288 * Skip this test when the application specified its own map functions. 289 * The size of the region is essentially unknown in that case: some 290 * other process asked the application's map function for some bytes, 291 * but we were never told the final size of the region. We could get 292 * a size back from the map function, but for all we know, our process' 293 * map function only knows how to join regions, it has no clue how big 294 * those regions are. 295 */ 296 if (DB_GLOBAL(j_region_map) == NULL && rp->size != size) 297 goto retry; 298 299 /* 300 * Check our callers configuration flags, it's an error to configure 301 * incompatible or additional subsystems in an existing environment. 302 * Return the total set of flags to the caller so they initialize the 303 * correct set of subsystems. 304 */ 305 if (init_flagsp != NULL) { 306 FLD_CLR(*init_flagsp, renv->init_flags); 307 if (*init_flagsp != 0) { 308 __db_errx(env, 309 "configured environment flags incompatible with existing environment"); 310 ret = EINVAL; 311 goto err; 312 } 313 *init_flagsp = renv->init_flags; 314 } 315 316 /* 317 * Fault the pages into memory. Note, do this AFTER releasing the 318 * lock, because we're only reading the pages, not writing them. 319 */ 320 (void)__env_faultmem(env, infop->primary, rp->size, 0); 321 322 /* Everything looks good, we're done. */ 323 env->reginfo = infop; 324 return (0); 325 326creation: 327 /* Create the environment region. */ 328 F_SET(infop, REGION_CREATE); 329 330 /* 331 * Allocate room for REGION structures plus overhead. 332 * 333 * XXX 334 * Overhead is so high because encryption passwds, replication vote 335 * arrays and the thread control block table are all stored in the 336 * base environment region. This is a bug, at the least replication 337 * should have its own region. 338 * 339 * Allocate space for thread info blocks. Max is only advisory, 340 * so we allocate 25% more. 341 */ 342 memset(&tregion, 0, sizeof(tregion)); 343 nregions = __memp_max_regions(env) + 10; 344 size = nregions * sizeof(REGION); 345 size += dbenv->passwd_len; 346 size += (dbenv->thr_max + dbenv->thr_max / 4) * 347 __env_alloc_size(sizeof(DB_THREAD_INFO)); 348 size += env->thr_nbucket * __env_alloc_size(sizeof(DB_HASHTAB)); 349 size += 16 * 1024; 350 tregion.size = size; 351 tregion.segid = INVALID_REGION_SEGID; 352 if ((ret = __env_sys_attach(env, infop, &tregion)) != 0) 353 goto err; 354 355 /* 356 * If the application has specified its own mapping functions, we don't 357 * know until we get here if we are creating the region or not. The 358 * way we find out is underlying functions clear the REGION_CREATE flag. 359 */ 360 if (!F_ISSET(infop, REGION_CREATE)) 361 goto user_map_functions; 362 363 /* 364 * Fault the pages into memory. Note, do this BEFORE we initialize 365 * anything, because we're writing the pages, not just reading them. 366 */ 367 (void)__env_faultmem(env, infop->addr, tregion.size, 1); 368 369 /* 370 * The first object in the region is the REGENV structure. This is 371 * different from the other regions, and, from everything else in 372 * this region, where all objects are allocated from the pool, i.e., 373 * there aren't any fixed locations. The remaining space is made 374 * available for later allocation. 375 * 376 * The allocation space must be size_t aligned, because that's what 377 * the initialization routine is going to store there. To make sure 378 * that happens, the REGENV structure was padded with a final size_t. 379 * No other region needs to worry about it because all of them treat 380 * the entire region as allocation space. 381 * 382 * Set the primary reference and correct the "addr" value to reference 383 * the alloc region. Note, this requires that we "uncorrect" it at 384 * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be 385 * shifted as well, but that should be fine. 386 */ 387 infop->primary = infop->addr; 388 infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV); 389 __env_alloc_init(infop, tregion.size - sizeof(REGENV)); 390 391 /* 392 * Initialize the rest of the REGENV structure. (Don't set the magic 393 * number to the correct value, that would validate the environment). 394 */ 395 renv = infop->primary; 396 renv->magic = 0; 397 renv->panic = 0; 398 399 (void)db_version(&majver, &minver, &patchver); 400 renv->majver = (u_int32_t)majver; 401 renv->minver = (u_int32_t)minver; 402 renv->patchver = (u_int32_t)patchver; 403 renv->signature = signature; 404 405 (void)time(&renv->timestamp); 406 __os_unique_id(env, &renv->envid); 407 408 /* 409 * Initialize init_flags to store the flags that any other environment 410 * handle that uses DB_JOINENV to join this environment will need. 411 */ 412 renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp; 413 414 /* 415 * Set up the region array. We use an array rather than a linked list 416 * as we have to traverse this list after failure in some cases, and 417 * we don't want to infinitely loop should the application fail while 418 * we're manipulating the list. 419 */ 420 renv->region_cnt = nregions; 421 if ((ret = __env_alloc(infop, nregions * sizeof(REGION), &rp)) != 0) { 422 __db_err( 423 env, ret, "unable to create new master region array"); 424 goto err; 425 } 426 renv->region_off = R_OFFSET(infop, rp); 427 for (i = 0; i < nregions; ++i, ++rp) 428 rp->id = INVALID_REGION_ID; 429 430 renv->cipher_off = renv->thread_off = renv->rep_off = INVALID_ROFF; 431 renv->flags = 0; 432 renv->op_timestamp = renv->rep_timestamp = 0; 433 renv->mtx_regenv = MUTEX_INVALID; 434 435 /* 436 * Get the underlying REGION structure for this environment. Note, 437 * we created the underlying OS region before we acquired the REGION 438 * structure, which is backwards from the normal procedure. Update 439 * the REGION structure. 440 */ 441 if ((ret = __env_des_get(env, infop, infop, &rp)) != 0) { 442find_err: __db_errx(env, "%s: unable to find environment", infop->name); 443 if (ret == 0) 444 ret = EINVAL; 445 goto err; 446 } 447 infop->rp = rp; 448 rp->size = tregion.size; 449 rp->segid = tregion.segid; 450 451 /* 452 * !!! 453 * If we create an environment where regions are public and in system 454 * memory, we have to inform processes joining the environment how to 455 * attach to the shared memory segment. So, we write the shared memory 456 * identifier into the file, to be read by those other processes. 457 * 458 * XXX 459 * This is really OS-layer information, but I can't see any easy way 460 * to move it down there without passing down information that it has 461 * no right to know, e.g., that this is the one-and-only REGENV region 462 * and not some other random region. 463 */ 464 if (tregion.segid != INVALID_REGION_SEGID) { 465 ref.size = tregion.size; 466 ref.segid = tregion.segid; 467 if ((ret = __os_write( 468 env, env->lockfhp, &ref, sizeof(ref), &nrw)) != 0) { 469 __db_err(env, ret, 470 "%s: unable to write out public environment ID", 471 infop->name); 472 goto err; 473 } 474 } 475 476#ifndef HAVE_MUTEX_FCNTL 477 /* 478 * If we're not doing fcntl locking, we can close the file handle. We 479 * no longer need it and the less contact between the buffer cache and 480 * the VM, the better. 481 */ 482 if (env->lockfhp != NULL) { 483 (void)__os_closehandle(env, env->lockfhp); 484 env->lockfhp = NULL; 485 } 486#endif 487 488 /* Everything looks good, we're done. */ 489 env->reginfo = infop; 490 return (0); 491 492err: 493retry: /* Close any open file handle. */ 494 if (env->lockfhp != NULL) { 495 (void)__os_closehandle(env, env->lockfhp); 496 env->lockfhp = NULL; 497 } 498 499 /* 500 * If we joined or created the region, detach from it. If we created 501 * it, destroy it. Note, there's a path in the above code where we're 502 * using a temporary REGION structure because we haven't yet allocated 503 * the real one. In that case the region address (addr) will be filled 504 * in, but the REGION pointer (rp) won't. Fix it. 505 */ 506 if (infop->addr != NULL) { 507 if (infop->rp == NULL) 508 infop->rp = &tregion; 509 510 /* Reset the addr value that we "corrected" above. */ 511 infop->addr = infop->primary; 512 (void)__env_sys_detach(env, 513 infop, F_ISSET(infop, REGION_CREATE)); 514 } 515 516 /* Free the allocated name and/or REGINFO structure. */ 517 if (infop->name != NULL) 518 __os_free(env, infop->name); 519 __os_free(env, infop); 520 521 /* If we had a temporary error, wait awhile and try again. */ 522 if (ret == 0) { 523 if (!retry_ok || ++retry_cnt > 3) { 524 __db_errx(env, "unable to join the environment"); 525 ret = EAGAIN; 526 } else { 527 __os_yield(env, retry_cnt * 3, 0); 528 goto loop; 529 } 530 } 531 532 return (ret); 533} 534 535/* 536 * __env_turn_on -- 537 * Turn on the created environment. 538 * 539 * PUBLIC: int __env_turn_on __P((ENV *)); 540 */ 541int 542__env_turn_on(env) 543 ENV *env; 544{ 545 REGENV *renv; 546 REGINFO *infop; 547 548 infop = env->reginfo; 549 renv = infop->primary; 550 551 /* If we didn't create the region, there's no need for further work. */ 552 if (!F_ISSET(infop, REGION_CREATE)) 553 return (0); 554 555 /* 556 * Validate the file. All other threads of control are waiting 557 * on this value to be written -- "Let slip the hounds of war!" 558 */ 559 renv->magic = DB_REGION_MAGIC; 560 561 return (0); 562} 563 564/* 565 * __env_turn_off -- 566 * Turn off the environment. 567 * 568 * PUBLIC: int __env_turn_off __P((ENV *, u_int32_t)); 569 */ 570int 571__env_turn_off(env, flags) 572 ENV *env; 573 u_int32_t flags; 574{ 575 REGENV *renv; 576 REGINFO *infop; 577 int ret, t_ret; 578 579 ret = 0; 580 581 /* 582 * Connect to the environment: If we can't join the environment, we 583 * guess it's because it doesn't exist and we're done. 584 * 585 * If the environment exists, attach and lock the environment. 586 */ 587 if (__env_attach(env, NULL, 0, 1) != 0) 588 return (0); 589 590 infop = env->reginfo; 591 renv = infop->primary; 592 593 MUTEX_LOCK(env, renv->mtx_regenv); 594 595 /* 596 * If the environment is in use, we're done unless we're forcing the 597 * issue or the environment has panic'd. (If the environment panic'd, 598 * the thread holding the reference count may not have cleaned up, so 599 * we clean up. It's possible the application didn't plan on removing 600 * the environment in this particular call, but panic'd environments 601 * aren't useful to anyone.) 602 * 603 * Otherwise, panic the environment and overwrite the magic number so 604 * any thread of control attempting to connect (or racing with us) will 605 * back off and retry, or just die. 606 */ 607 if (renv->refcnt > 0 && !LF_ISSET(DB_FORCE) && !renv->panic) 608 ret = EBUSY; 609 else 610 renv->panic = 1; 611 612 /* 613 * Unlock the environment (nobody should need this lock because 614 * we've poisoned the pool) and detach from the environment. 615 */ 616 MUTEX_UNLOCK(env, renv->mtx_regenv); 617 618 if ((t_ret = __env_detach(env, 0)) != 0 && ret == 0) 619 ret = t_ret; 620 621 return (ret); 622} 623 624/* 625 * __env_panic_set -- 626 * Set/clear unrecoverable error. 627 * 628 * PUBLIC: void __env_panic_set __P((ENV *, int)); 629 */ 630void 631__env_panic_set(env, on) 632 ENV *env; 633 int on; 634{ 635 if (env != NULL && env->reginfo != NULL) 636 ((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0; 637} 638 639/* 640 * __env_ref_increment -- 641 * Increment the environment's reference count. 642 * 643 * PUBLIC: int __env_ref_increment __P((ENV *)); 644 */ 645int 646__env_ref_increment(env) 647 ENV *env; 648{ 649 REGENV *renv; 650 REGINFO *infop; 651 int ret; 652 653 infop = env->reginfo; 654 renv = infop->primary; 655 656 /* If we're creating the primary region, allocate a mutex. */ 657 if (F_ISSET(infop, REGION_CREATE)) { 658 if ((ret = __mutex_alloc( 659 env, MTX_ENV_REGION, 0, &renv->mtx_regenv)) != 0) 660 return (ret); 661 renv->refcnt = 1; 662 } else { 663 /* Lock the environment, increment the reference, unlock. */ 664 MUTEX_LOCK(env, renv->mtx_regenv); 665 ++renv->refcnt; 666 MUTEX_UNLOCK(env, renv->mtx_regenv); 667 } 668 669 F_SET(env, ENV_REF_COUNTED); 670 return (0); 671} 672 673/* 674 * __env_ref_decrement -- 675 * Decrement the environment's reference count. 676 * 677 * PUBLIC: int __env_ref_decrement __P((ENV *)); 678 */ 679int 680__env_ref_decrement(env) 681 ENV *env; 682{ 683 REGENV *renv; 684 REGINFO *infop; 685 686 /* Be cautious -- we may not have an environment. */ 687 if ((infop = env->reginfo) == NULL) 688 return (0); 689 690 renv = infop->primary; 691 692 /* Even if we have an environment, may not have reference counted it. */ 693 if (F_ISSET(env, ENV_REF_COUNTED)) { 694 /* Lock the environment, decrement the reference, unlock. */ 695 MUTEX_LOCK(env, renv->mtx_regenv); 696 if (renv->refcnt == 0) 697 __db_errx(env, 698 "environment reference count went negative"); 699 else 700 --renv->refcnt; 701 MUTEX_UNLOCK(env, renv->mtx_regenv); 702 703 F_CLR(env, ENV_REF_COUNTED); 704 } 705 706 /* If a private environment, we're done with the mutex, destroy it. */ 707 return (F_ISSET(env, ENV_PRIVATE) ? 708 __mutex_free(env, &renv->mtx_regenv) : 0); 709} 710 711/* 712 * __env_detach -- 713 * Detach from the environment. 714 * 715 * PUBLIC: int __env_detach __P((ENV *, int)); 716 */ 717int 718__env_detach(env, destroy) 719 ENV *env; 720 int destroy; 721{ 722 REGENV *renv; 723 REGINFO *infop; 724 REGION rp; 725 int ret, t_ret; 726 727 infop = env->reginfo; 728 renv = infop->primary; 729 ret = 0; 730 731 /* Close the locking file handle. */ 732 if (env->lockfhp != NULL) { 733 if ((t_ret = 734 __os_closehandle(env, env->lockfhp)) != 0 && ret == 0) 735 ret = t_ret; 736 env->lockfhp = NULL; 737 } 738 739 /* 740 * If a private region, return the memory to the heap. Not needed for 741 * filesystem-backed or system shared memory regions, that memory isn't 742 * owned by any particular process. 743 */ 744 if (destroy) { 745 /* 746 * Free the REGION array. 747 * 748 * The actual underlying region structure is allocated from the 749 * primary shared region, and we're about to free it. Save a 750 * copy on our stack for the REGINFO to reference when it calls 751 * down into the OS layer to release the shared memory segment. 752 */ 753 rp = *infop->rp; 754 infop->rp = &rp; 755 756 if (renv->region_off != INVALID_ROFF) 757 __env_alloc_free( 758 infop, R_ADDR(infop, renv->region_off)); 759 } 760 761 /* 762 * Set the ENV->reginfo field to NULL. BDB uses the ENV->reginfo 763 * field to decide if the underlying region can be accessed or needs 764 * cleanup. We're about to destroy what it references, so it needs to 765 * be cleared. 766 */ 767 env->reginfo = NULL; 768 769 /* Reset the addr value that we "corrected" above. */ 770 infop->addr = infop->primary; 771 772 if ((t_ret = __env_sys_detach(env, infop, destroy)) != 0 && ret == 0) 773 ret = t_ret; 774 if (infop->name != NULL) 775 __os_free(env, infop->name); 776 777 /* Discard the ENV->reginfo field's memory. */ 778 __os_free(env, infop); 779 780 return (ret); 781} 782 783/* 784 * __env_remove_env -- 785 * Remove an environment. 786 * 787 * PUBLIC: int __env_remove_env __P((ENV *)); 788 */ 789int 790__env_remove_env(env) 791 ENV *env; 792{ 793 DB_ENV *dbenv; 794 REGENV *renv; 795 REGINFO *infop, reginfo; 796 REGION *rp; 797 u_int32_t flags_orig, i; 798 799 dbenv = env->dbenv; 800 801 /* 802 * We do not want to hang on a mutex request, nor do we care about 803 * panics. 804 */ 805 flags_orig = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC); 806 F_SET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC); 807 808 /* 809 * This routine has to walk a nasty line between not looking into the 810 * environment (which may be corrupted after an app or system crash), 811 * and removing everything that needs removing. 812 * 813 * Connect to the environment: If we can't join the environment, we 814 * guess it's because it doesn't exist. Remove the underlying files, 815 * at least. 816 */ 817 if (__env_attach(env, NULL, 0, 0) != 0) 818 goto remfiles; 819 820 infop = env->reginfo; 821 renv = infop->primary; 822 823 /* 824 * Kill the environment, if it's not already dead. 825 */ 826 renv->panic = 1; 827 828 /* 829 * Walk the array of regions. Connect to each region and disconnect 830 * with the destroy flag set. This shouldn't cause any problems, even 831 * if the region is corrupted, because we never look inside the region 832 * (with the single exception of mutex regions on systems where we have 833 * to return resources to the underlying system). 834 */ 835 for (rp = R_ADDR(infop, renv->region_off), 836 i = 0; i < renv->region_cnt; ++i, ++rp) { 837 if (rp->id == INVALID_REGION_ID || rp->type == REGION_TYPE_ENV) 838 continue; 839 /* 840 * !!! 841 * The REGION_CREATE_OK flag is set for Windows/95 -- regions 842 * are zero'd out when the last reference to the region goes 843 * away, in which case the underlying OS region code requires 844 * callers be prepared to create the region in order to join it. 845 */ 846 memset(®info, 0, sizeof(reginfo)); 847 reginfo.id = rp->id; 848 reginfo.flags = REGION_CREATE_OK; 849 850 /* 851 * If we get here and can't attach and/or detach to the 852 * region, it's a mess. Ignore errors, there's nothing 853 * we can do about them. 854 */ 855 if (__env_region_attach(env, ®info, 0) != 0) 856 continue; 857 858#ifdef HAVE_MUTEX_SYSTEM_RESOURCES 859 /* 860 * If destroying the mutex region, return any system 861 * resources to the system. 862 */ 863 if (reginfo.type == REGION_TYPE_MUTEX) 864 __mutex_resource_return(env, ®info); 865#endif 866 (void)__env_region_detach(env, ®info, 1); 867 } 868 869 /* Detach from the environment's primary region. */ 870 (void)__env_detach(env, 1); 871 872remfiles: 873 /* 874 * Walk the list of files in the directory, unlinking files in the 875 * Berkeley DB name space. 876 */ 877 __env_remove_file(env); 878 879 F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC); 880 F_SET(dbenv, flags_orig); 881 882 return (0); 883} 884 885/* 886 * __env_remove_file -- 887 * Discard any region files in the filesystem. 888 */ 889static void 890__env_remove_file(env) 891 ENV *env; 892{ 893 int cnt, fcnt, lastrm, ret; 894 const char *dir; 895 char saved_char, *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20]; 896 897 /* Get the full path of a file in the environment. */ 898 (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV); 899 if ((ret = __db_appname(env, DB_APP_NONE, buf, 0, NULL, &path)) != 0) 900 return; 901 902 /* Get the parent directory for the environment. */ 903 if ((p = __db_rpath(path)) == NULL) { 904 p = path; 905 saved_char = *p; 906 907 dir = PATH_DOT; 908 } else { 909 saved_char = *p; 910 *p = '\0'; 911 912 dir = path; 913 } 914 915 /* Get the list of file names. */ 916 if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0) 917 __db_err(env, ret, "%s", dir); 918 919 /* Restore the path, and free it. */ 920 *p = saved_char; 921 __os_free(env, path); 922 923 if (ret != 0) 924 return; 925 926 /* 927 * Remove files from the region directory. 928 */ 929 for (lastrm = -1, cnt = fcnt; --cnt >= 0;) { 930 /* Skip anything outside our name space. */ 931 if (strncmp(names[cnt], 932 DB_REGION_PREFIX, sizeof(DB_REGION_PREFIX) - 1)) 933 continue; 934 935 /* Skip queue extent files. */ 936 if (strncmp(names[cnt], "__dbq.", 6) == 0) 937 continue; 938 939 /* Skip registry files. */ 940 if (strncmp(names[cnt], "__db.register", 13) == 0) 941 continue; 942 943 /* Skip replication files. */ 944 if (strncmp(names[cnt], "__db.rep", 8) == 0) 945 continue; 946 947 /* 948 * Remove the primary environment region last, because it's 949 * the key to this whole mess. 950 */ 951 if (strcmp(names[cnt], DB_REGION_ENV) == 0) { 952 lastrm = cnt; 953 continue; 954 } 955 956 /* Remove the file. */ 957 if (__db_appname(env, 958 DB_APP_NONE, names[cnt], 0, NULL, &path) == 0) { 959 /* 960 * Overwrite region files. Temporary files would have 961 * been maintained in encrypted format, so there's no 962 * reason to overwrite them. This is not an exact 963 * check on the file being a region file, but it's 964 * not likely to be wrong, and the worst thing that can 965 * happen is we overwrite a file that didn't need to be 966 * overwritten. 967 */ 968 (void)__os_unlink(env, path, 1); 969 __os_free(env, path); 970 } 971 } 972 973 if (lastrm != -1) 974 if (__db_appname(env, 975 DB_APP_NONE, names[lastrm], 0, NULL, &path) == 0) { 976 (void)__os_unlink(env, path, 1); 977 __os_free(env, path); 978 } 979 __os_dirfree(env, names, fcnt); 980} 981 982/* 983 * __env_region_attach 984 * Join/create a region. 985 * 986 * PUBLIC: int __env_region_attach __P((ENV *, REGINFO *, size_t)); 987 */ 988int 989__env_region_attach(env, infop, size) 990 ENV *env; 991 REGINFO *infop; 992 size_t size; 993{ 994 REGION *rp; 995 int ret; 996 char buf[sizeof(DB_REGION_FMT) + 20]; 997 998 /* 999 * Find or create a REGION structure for this region. If we create 1000 * it, the REGION_CREATE flag will be set in the infop structure. 1001 */ 1002 F_CLR(infop, REGION_CREATE); 1003 if ((ret = __env_des_get(env, env->reginfo, infop, &rp)) != 0) 1004 return (ret); 1005 infop->env = env; 1006 infop->rp = rp; 1007 infop->type = rp->type; 1008 infop->id = rp->id; 1009 1010 /* 1011 * __env_des_get may have created the region and reset the create 1012 * flag. If we're creating the region, set the desired size. 1013 */ 1014 if (F_ISSET(infop, REGION_CREATE)) 1015 rp->size = (roff_t)size; 1016 1017 /* Join/create the underlying region. */ 1018 (void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id); 1019 if ((ret = __db_appname(env, 1020 DB_APP_NONE, buf, 0, NULL, &infop->name)) != 0) 1021 goto err; 1022 if ((ret = __env_sys_attach(env, infop, rp)) != 0) 1023 goto err; 1024 1025 /* 1026 * Fault the pages into memory. Note, do this BEFORE we initialize 1027 * anything because we're writing pages in created regions, not just 1028 * reading them. 1029 */ 1030 (void)__env_faultmem(env, 1031 infop->addr, rp->size, F_ISSET(infop, REGION_CREATE)); 1032 1033 /* 1034 * !!! 1035 * The underlying layer may have just decided that we are going 1036 * to create the region. There are various system issues that 1037 * can result in a useless region that requires re-initialization. 1038 * 1039 * If we created the region, initialize it for allocation. 1040 */ 1041 if (F_ISSET(infop, REGION_CREATE)) 1042 __env_alloc_init(infop, rp->size); 1043 1044 return (0); 1045 1046err: /* Discard the underlying region. */ 1047 if (infop->addr != NULL) 1048 (void)__env_sys_detach(env, 1049 infop, F_ISSET(infop, REGION_CREATE)); 1050 infop->rp = NULL; 1051 infop->id = INVALID_REGION_ID; 1052 1053 /* Discard the REGION structure if we created it. */ 1054 if (F_ISSET(infop, REGION_CREATE)) { 1055 __env_des_destroy(env, rp); 1056 F_CLR(infop, REGION_CREATE); 1057 } 1058 1059 return (ret); 1060} 1061 1062/* 1063 * __env_region_detach -- 1064 * Detach from a region. 1065 * 1066 * PUBLIC: int __env_region_detach __P((ENV *, REGINFO *, int)); 1067 */ 1068int 1069__env_region_detach(env, infop, destroy) 1070 ENV *env; 1071 REGINFO *infop; 1072 int destroy; 1073{ 1074 REGION *rp; 1075 int ret; 1076 1077 rp = infop->rp; 1078 if (F_ISSET(env, ENV_PRIVATE)) 1079 destroy = 1; 1080 1081 /* 1082 * When discarding the regions as we shut down a database environment, 1083 * discard any allocated shared memory segments. This is the last time 1084 * we use them, and db_region_destroy is the last region-specific call 1085 * we make. 1086 */ 1087 if (F_ISSET(env, ENV_PRIVATE) && infop->primary != NULL) 1088 __env_alloc_free(infop, infop->primary); 1089 1090 /* Detach from the underlying OS region. */ 1091 ret = __env_sys_detach(env, infop, destroy); 1092 1093 /* If we destroyed the region, discard the REGION structure. */ 1094 if (destroy) 1095 __env_des_destroy(env, rp); 1096 1097 /* Destroy the structure. */ 1098 if (infop->name != NULL) 1099 __os_free(env, infop->name); 1100 1101 return (ret); 1102} 1103 1104/* 1105 * __env_sys_attach -- 1106 * Prep and call the underlying OS attach function. 1107 */ 1108static int 1109__env_sys_attach(env, infop, rp) 1110 ENV *env; 1111 REGINFO *infop; 1112 REGION *rp; 1113{ 1114 int ret; 1115 1116 /* 1117 * All regions are created on 8K boundaries out of sheer paranoia, 1118 * so we don't make some underlying VM unhappy. Make sure we don't 1119 * overflow or underflow. 1120 */ 1121#define OS_VMPAGESIZE (8 * 1024) 1122#define OS_VMROUNDOFF(i) { \ 1123 if ((i) < \ 1124 (UINT32_MAX - OS_VMPAGESIZE) + 1 || (i) < OS_VMPAGESIZE) \ 1125 (i) += OS_VMPAGESIZE - 1; \ 1126 (i) -= (i) % OS_VMPAGESIZE; \ 1127} 1128 OS_VMROUNDOFF(rp->size); 1129 1130#ifdef DB_REGIONSIZE_MAX 1131 /* Some architectures have hard limits on the maximum region size. */ 1132 if (rp->size > DB_REGIONSIZE_MAX) { 1133 __db_errx(env, "region size %lu is too large; maximum is %lu", 1134 (u_long)rp->size, (u_long)DB_REGIONSIZE_MAX); 1135 return (EINVAL); 1136 } 1137#endif 1138 1139 /* 1140 * If a region is private, malloc the memory. 1141 * 1142 * !!! 1143 * If this fails because the region is too large to malloc, mmap(2) 1144 * using the MAP_ANON or MAP_ANONYMOUS flags would be an alternative. 1145 * I don't know of any architectures (yet!) where malloc is a problem. 1146 */ 1147 if (F_ISSET(env, ENV_PRIVATE)) { 1148#if defined(HAVE_MUTEX_HPPA_MSEM_INIT) 1149 /* 1150 * !!! 1151 * There exist spinlocks that don't work in malloc memory, e.g., 1152 * the HP/UX msemaphore interface. If we don't have locks that 1153 * will work in malloc memory, we better not be private or not 1154 * be threaded. 1155 */ 1156 if (F_ISSET(env, ENV_THREAD)) { 1157 __db_errx(env, "%s", 1158 "architecture does not support locks inside process-local (malloc) memory"); 1159 __db_errx(env, "%s", 1160 "application may not specify both DB_PRIVATE and DB_THREAD"); 1161 return (EINVAL); 1162 } 1163#endif 1164 if ((ret = __os_malloc( 1165 env, sizeof(REGENV), &infop->addr)) != 0) 1166 return (ret); 1167 1168 infop->max_alloc = rp->size; 1169 } else 1170 if ((ret = __os_attach(env, infop, rp)) != 0) 1171 return (ret); 1172 1173 /* 1174 * We may require alignment the underlying system or heap allocation 1175 * library doesn't supply. Align the address if necessary, saving 1176 * the original values for restoration when the region is discarded. 1177 */ 1178 infop->addr_orig = infop->addr; 1179 infop->addr = ALIGNP_INC(infop->addr_orig, sizeof(size_t)); 1180 1181 rp->size_orig = rp->size; 1182 if (infop->addr != infop->addr_orig) 1183 rp->size -= (roff_t) 1184 ((u_int8_t *)infop->addr - (u_int8_t *)infop->addr_orig); 1185 1186 return (0); 1187} 1188 1189/* 1190 * __env_sys_detach -- 1191 * Prep and call the underlying OS detach function. 1192 */ 1193static int 1194__env_sys_detach(env, infop, destroy) 1195 ENV *env; 1196 REGINFO *infop; 1197 int destroy; 1198{ 1199 REGION *rp; 1200 1201 rp = infop->rp; 1202 1203 /* Restore any address/size altered for alignment reasons. */ 1204 if (infop->addr != infop->addr_orig) { 1205 infop->addr = infop->addr_orig; 1206 rp->size = rp->size_orig; 1207 } 1208 1209 /* If a region is private, free the memory. */ 1210 if (F_ISSET(env, ENV_PRIVATE)) { 1211 __os_free(env, infop->addr); 1212 return (0); 1213 } 1214 1215 return (__os_detach(env, infop, destroy)); 1216} 1217 1218/* 1219 * __env_des_get -- 1220 * Return a reference to the shared information for a REGION, 1221 * optionally creating a new entry. 1222 */ 1223static int 1224__env_des_get(env, env_infop, infop, rpp) 1225 ENV *env; 1226 REGINFO *env_infop, *infop; 1227 REGION **rpp; 1228{ 1229 REGENV *renv; 1230 REGION *rp, *empty_slot, *first_type; 1231 u_int32_t i, maxid; 1232 1233 *rpp = NULL; 1234 renv = env_infop->primary; 1235 1236 /* 1237 * If the caller wants to join a region, walk through the existing 1238 * regions looking for a matching ID (if ID specified) or matching 1239 * type (if type specified). If we return based on a matching type 1240 * return the "primary" region, that is, the first region that was 1241 * created of this type. 1242 * 1243 * Track the first empty slot and maximum region ID for new region 1244 * allocation. 1245 * 1246 * MaxID starts at REGION_ID_ENV, the ID of the primary environment. 1247 */ 1248 maxid = REGION_ID_ENV; 1249 empty_slot = first_type = NULL; 1250 for (rp = R_ADDR(env_infop, renv->region_off), 1251 i = 0; i < renv->region_cnt; ++i, ++rp) { 1252 if (rp->id == INVALID_REGION_ID) { 1253 if (empty_slot == NULL) 1254 empty_slot = rp; 1255 continue; 1256 } 1257 if (infop->id != INVALID_REGION_ID) { 1258 if (infop->id == rp->id) 1259 break; 1260 continue; 1261 } 1262 if (infop->type == rp->type && 1263 F_ISSET(infop, REGION_JOIN_OK) && 1264 (first_type == NULL || first_type->id > rp->id)) 1265 first_type = rp; 1266 1267 if (rp->id > maxid) 1268 maxid = rp->id; 1269 } 1270 1271 /* If we found a matching ID (or a matching type), return it. */ 1272 if (i >= renv->region_cnt) 1273 rp = first_type; 1274 if (rp != NULL) { 1275 *rpp = rp; 1276 return (0); 1277 } 1278 1279 /* 1280 * If we didn't find a region and we don't have permission to create 1281 * the region, fail. The caller generates any error message. 1282 */ 1283 if (!F_ISSET(infop, REGION_CREATE_OK)) 1284 return (ENOENT); 1285 1286 /* 1287 * If we didn't find a region and don't have room to create the region 1288 * fail with an error message, there's a sizing problem. 1289 */ 1290 if (empty_slot == NULL) { 1291 __db_errx(env, "no room remaining for additional REGIONs"); 1292 return (ENOENT); 1293 } 1294 1295 /* 1296 * Initialize a REGION structure for the caller. If id was set, use 1297 * that value, otherwise we use the next available ID. 1298 */ 1299 memset(empty_slot, 0, sizeof(REGION)); 1300 empty_slot->segid = INVALID_REGION_SEGID; 1301 1302 /* 1303 * Set the type and ID; if no region ID was specified, 1304 * allocate one. 1305 */ 1306 empty_slot->type = infop->type; 1307 empty_slot->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id; 1308 1309 F_SET(infop, REGION_CREATE); 1310 1311 *rpp = empty_slot; 1312 return (0); 1313} 1314 1315/* 1316 * __env_des_destroy -- 1317 * Destroy a reference to a REGION. 1318 */ 1319static void 1320__env_des_destroy(env, rp) 1321 ENV *env; 1322 REGION *rp; 1323{ 1324 COMPQUIET(env, NULL); 1325 1326 rp->id = INVALID_REGION_ID; 1327} 1328 1329/* 1330 * __env_faultmem -- 1331 * Fault the region into memory. 1332 */ 1333static int 1334__env_faultmem(env, addr, size, created) 1335 ENV *env; 1336 void *addr; 1337 size_t size; 1338 int created; 1339{ 1340 int ret; 1341 u_int8_t *p, *t; 1342 1343 /* Ignore heap regions. */ 1344 if (F_ISSET(env, ENV_PRIVATE)) 1345 return (0); 1346 1347 /* 1348 * It's sometimes significantly faster to page-fault in all of the 1349 * region's pages before we run the application, as we see nasty 1350 * side-effects when we page-fault while holding various locks, i.e., 1351 * the lock takes a long time to acquire because of the underlying 1352 * page fault, and the other threads convoy behind the lock holder. 1353 * 1354 * If we created the region, we write a non-zero value so that the 1355 * system can't cheat. If we're just joining the region, we can 1356 * only read the value and try to confuse the compiler sufficiently 1357 * that it doesn't figure out that we're never really using it. 1358 * 1359 * Touch every page (assuming pages are 512B, the smallest VM page 1360 * size used in any general purpose processor). 1361 */ 1362 ret = 0; 1363 if (F_ISSET(env->dbenv, DB_ENV_REGION_INIT)) { 1364 if (created) 1365 for (p = addr, 1366 t = (u_int8_t *)addr + size; p < t; p += 512) 1367 p[0] = 0xdb; 1368 else 1369 for (p = addr, 1370 t = (u_int8_t *)addr + size; p < t; p += 512) 1371 ret |= p[0]; 1372 } 1373 1374 return (ret); 1375} 1376