1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996,2008 Oracle. All rights reserved. 5 * 6 * $Id: os_map.c,v 12.26 2008/01/31 18:40:46 bostic Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12 13#ifdef HAVE_SYSTEM_INCLUDE_FILES 14#ifdef HAVE_MMAP 15#include <sys/mman.h> 16#endif 17 18#ifdef HAVE_SHMGET 19#include <sys/ipc.h> 20#include <sys/shm.h> 21#endif 22#endif 23 24#ifdef HAVE_MMAP 25static int __os_map __P((ENV *, char *, DB_FH *, size_t, int, int, void **)); 26#endif 27#ifdef HAVE_SHMGET 28static int __shm_mode __P((ENV *)); 29#else 30static int __no_system_mem __P((ENV *)); 31#endif 32 33/* 34 * __os_attach -- 35 * Create/join a shared memory region. 36 * 37 * PUBLIC: int __os_attach __P((ENV *, REGINFO *, REGION *)); 38 */ 39int 40__os_attach(env, infop, rp) 41 ENV *env; 42 REGINFO *infop; 43 REGION *rp; 44{ 45 DB_ENV *dbenv; 46 int create_ok, ret; 47 48 /* 49 * We pass a DB_ENV handle to the user's replacement map function, 50 * so there must be a valid handle. 51 */ 52 DB_ASSERT(env, env != NULL && env->dbenv != NULL); 53 dbenv = env->dbenv; 54 55 if (DB_GLOBAL(j_region_map) != NULL) { 56 /* 57 * We have to find out if the region is being created. Ask 58 * the underlying map function, and use the REGINFO structure 59 * to pass that information back to our caller. 60 */ 61 create_ok = F_ISSET(infop, REGION_CREATE) ? 1 : 0; 62 ret = DB_GLOBAL(j_region_map) 63 (dbenv, infop->name, rp->size, &create_ok, &infop->addr); 64 if (create_ok) 65 F_SET(infop, REGION_CREATE); 66 else 67 F_CLR(infop, REGION_CREATE); 68 return (ret); 69 } 70 71 if (F_ISSET(env, ENV_SYSTEM_MEM)) { 72 /* 73 * If the region is in system memory on UNIX, we use shmget(2). 74 * 75 * !!! 76 * There exist spinlocks that don't work in shmget memory, e.g., 77 * the HP/UX msemaphore interface. If we don't have locks that 78 * will work in shmget memory, we better be private and not be 79 * threaded. If we reach this point, we know we're public, so 80 * it's an error. 81 */ 82#if defined(HAVE_MUTEX_HPPA_MSEM_INIT) 83 __db_errx(env, 84 "architecture does not support locks inside system shared memory"); 85 return (EINVAL); 86#endif 87#if defined(HAVE_SHMGET) 88 { 89 key_t segid; 90 int id, mode; 91 92 /* 93 * We could potentially create based on REGION_CREATE_OK, but 94 * that's dangerous -- we might get crammed in sideways if 95 * some of the expected regions exist but others do not. Also, 96 * if the requested size differs from an existing region's 97 * actual size, then all sorts of nasty things can happen. 98 * Basing create solely on REGION_CREATE is much safer -- a 99 * recovery will get us straightened out. 100 */ 101 if (F_ISSET(infop, REGION_CREATE)) { 102 /* 103 * The application must give us a base System V IPC key 104 * value. Adjust that value based on the region's ID, 105 * and correct so the user's original value appears in 106 * the ipcs output. 107 */ 108 if (dbenv->shm_key == INVALID_REGION_SEGID) { 109 __db_errx(env, 110 "no base system shared memory ID specified"); 111 return (EINVAL); 112 } 113 114 /* 115 * !!! 116 * The BDB API takes a "long" as the base segment ID, 117 * then adds an unsigned 32-bit value and stores it 118 * in a key_t. Wrong, admittedly, but not worth an 119 * API change to fix. 120 */ 121 segid = (key_t) 122 ((u_long)dbenv->shm_key + (infop->id - 1)); 123 124 /* 125 * If map to an existing region, assume the application 126 * crashed and we're restarting. Delete the old region 127 * and re-try. If that fails, return an error, the 128 * application will have to select a different segment 129 * ID or clean up some other way. 130 */ 131 if ((id = shmget(segid, 0, 0)) != -1) { 132 (void)shmctl(id, IPC_RMID, NULL); 133 if ((id = shmget(segid, 0, 0)) != -1) { 134 __db_errx(env, 135 "shmget: key: %ld: shared system memory region already exists", 136 (long)segid); 137 return (EAGAIN); 138 } 139 } 140 141 /* 142 * Map the DbEnv::open method file mode permissions to 143 * shmget call permissions. 144 */ 145 mode = IPC_CREAT | __shm_mode(env); 146 if ((id = shmget(segid, rp->size, mode)) == -1) { 147 ret = __os_get_syserr(); 148 __db_syserr(env, ret, 149 "shmget: key: %ld: unable to create shared system memory region", 150 (long)segid); 151 return (__os_posix_err(ret)); 152 } 153 rp->segid = id; 154 } else 155 id = rp->segid; 156 157 if ((infop->addr = shmat(id, NULL, 0)) == (void *)-1) { 158 infop->addr = NULL; 159 ret = __os_get_syserr(); 160 __db_syserr(env, ret, 161 "shmat: id %d: unable to attach to shared system memory region", id); 162 return (__os_posix_err(ret)); 163 } 164 165 /* Optionally lock the memory down. */ 166 if (F_ISSET(env, ENV_LOCKDOWN)) { 167#ifdef HAVE_SHMCTL_SHM_LOCK 168 ret = shmctl( 169 id, SHM_LOCK, NULL) == 0 ? 0 : __os_get_syserr(); 170#else 171 ret = DB_OPNOTSUP; 172#endif 173 if (ret != 0) { 174 __db_syserr(env, ret, 175 "shmctl/SHM_LOCK: id %d: unable to lock down shared memory region", id); 176 return (__os_posix_err(ret)); 177 } 178 } 179 180 return (0); 181 } 182#else 183 return (__no_system_mem(env)); 184#endif 185 } 186 187#ifdef HAVE_MMAP 188 { 189 DB_FH *fhp; 190 191 fhp = NULL; 192 193 /* 194 * Try to open/create the shared region file. We DO NOT need to ensure 195 * that multiple threads/processes attempting to simultaneously create 196 * the region are properly ordered, our caller has already taken care 197 * of that. 198 */ 199 if ((ret = __os_open(env, infop->name, 0, 200 DB_OSO_REGION | 201 (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0), 202 env->db_mode, &fhp)) != 0) 203 __db_err(env, ret, "%s", infop->name); 204 205 /* 206 * If we created the file, grow it to its full size before mapping 207 * it in. We really want to avoid touching the buffer cache after 208 * mmap(2) is called, doing anything else confuses the hell out of 209 * systems without merged VM/buffer cache systems, or, more to the 210 * point, *badly* merged VM/buffer cache systems. 211 */ 212 if (ret == 0 && F_ISSET(infop, REGION_CREATE)) { 213 if (F_ISSET(dbenv, DB_ENV_REGION_INIT)) 214 ret = __db_file_write(env, fhp, 215 rp->size / MEGABYTE, rp->size % MEGABYTE, 0x00); 216 else 217 ret = __db_file_extend(env, fhp, rp->size); 218 } 219 220 /* Map the file in. */ 221 if (ret == 0) 222 ret = __os_map(env, 223 infop->name, fhp, rp->size, 1, 0, &infop->addr); 224 225 if (fhp != NULL) 226 (void)__os_closehandle(env, fhp); 227 228 return (ret); 229 } 230#else 231 COMPQUIET(infop, NULL); 232 COMPQUIET(rp, NULL); 233 __db_errx(env, 234 "architecture lacks mmap(2), shared environments not possible"); 235 return (DB_OPNOTSUP); 236#endif 237} 238 239/* 240 * __os_detach -- 241 * Detach from a shared memory region. 242 * 243 * PUBLIC: int __os_detach __P((ENV *, REGINFO *, int)); 244 */ 245int 246__os_detach(env, infop, destroy) 247 ENV *env; 248 REGINFO *infop; 249 int destroy; 250{ 251 DB_ENV *dbenv; 252 REGION *rp; 253 int ret; 254 255 /* 256 * We pass a DB_ENV handle to the user's replacement unmap function, 257 * so there must be a valid handle. 258 */ 259 DB_ASSERT(env, env != NULL && env->dbenv != NULL); 260 dbenv = env->dbenv; 261 262 rp = infop->rp; 263 264 /* If the user replaced the unmap call, call through their interface. */ 265 if (DB_GLOBAL(j_region_unmap) != NULL) 266 return (DB_GLOBAL(j_region_unmap)(dbenv, infop->addr)); 267 268 if (F_ISSET(env, ENV_SYSTEM_MEM)) { 269#ifdef HAVE_SHMGET 270 int segid; 271 272 /* 273 * We may be about to remove the memory referenced by rp, 274 * save the segment ID, and (optionally) wipe the original. 275 */ 276 segid = rp->segid; 277 if (destroy) 278 rp->segid = INVALID_REGION_SEGID; 279 280 if (shmdt(infop->addr) != 0) { 281 ret = __os_get_syserr(); 282 __db_syserr(env, ret, "shmdt"); 283 return (__os_posix_err(ret)); 284 } 285 286 if (destroy && shmctl(segid, IPC_RMID, 287 NULL) != 0 && (ret = __os_get_syserr()) != EINVAL) { 288 __db_syserr(env, ret, 289 "shmctl: id %d: unable to delete system shared memory region", 290 segid); 291 return (__os_posix_err(ret)); 292 } 293 294 return (0); 295#else 296 return (__no_system_mem(env)); 297#endif 298 } 299 300#ifdef HAVE_MMAP 301#ifdef HAVE_MUNLOCK 302 if (F_ISSET(env, ENV_LOCKDOWN)) 303 (void)munlock(infop->addr, rp->size); 304#endif 305 if (munmap(infop->addr, rp->size) != 0) { 306 ret = __os_get_syserr(); 307 __db_syserr(env, ret, "munmap"); 308 return (__os_posix_err(ret)); 309 } 310 311 if (destroy && (ret = __os_unlink(env, infop->name, 1)) != 0) 312 return (ret); 313 314 return (0); 315#else 316 COMPQUIET(destroy, 0); 317 COMPQUIET(ret, 0); 318 return (EINVAL); 319#endif 320} 321 322/* 323 * __os_mapfile -- 324 * Map in a shared memory file. 325 * 326 * PUBLIC: int __os_mapfile __P((ENV *, char *, DB_FH *, size_t, int, void **)); 327 */ 328int 329__os_mapfile(env, path, fhp, len, is_rdonly, addrp) 330 ENV *env; 331 char *path; 332 DB_FH *fhp; 333 int is_rdonly; 334 size_t len; 335 void **addrp; 336{ 337#if defined(HAVE_MMAP) && !defined(HAVE_QNX) 338 DB_ENV *dbenv; 339 340 /* If the user replaced the map call, call through their interface. */ 341 if (DB_GLOBAL(j_file_map) != NULL) { 342 /* 343 * We pass a DB_ENV handle to the user's replacement map 344 * function, so there must be a valid handle. 345 */ 346 DB_ASSERT(env, env != NULL && env->dbenv != NULL); 347 dbenv = env->dbenv; 348 349 return ( 350 DB_GLOBAL(j_file_map)(dbenv, path, len, is_rdonly, addrp)); 351 } 352 353 return (__os_map(env, path, fhp, len, 0, is_rdonly, addrp)); 354#else 355 COMPQUIET(env, NULL); 356 COMPQUIET(path, NULL); 357 COMPQUIET(fhp, NULL); 358 COMPQUIET(is_rdonly, 0); 359 COMPQUIET(len, 0); 360 COMPQUIET(addrp, NULL); 361 return (DB_OPNOTSUP); 362#endif 363} 364 365/* 366 * __os_unmapfile -- 367 * Unmap the shared memory file. 368 * 369 * PUBLIC: int __os_unmapfile __P((ENV *, void *, size_t)); 370 */ 371int 372__os_unmapfile(env, addr, len) 373 ENV *env; 374 void *addr; 375 size_t len; 376{ 377 DB_ENV *dbenv; 378 int ret; 379 380 /* 381 * We pass a DB_ENV handle to the user's replacement unmap function, 382 * so there must be a valid handle. 383 */ 384 DB_ASSERT(env, env != NULL && env->dbenv != NULL); 385 dbenv = env->dbenv; 386 387 if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL)) 388 __db_msg(env, "fileops: munmap"); 389 390 /* If the user replaced the map call, call through their interface. */ 391 if (DB_GLOBAL(j_file_unmap) != NULL) 392 return (DB_GLOBAL(j_file_unmap)(dbenv, addr)); 393 394#ifdef HAVE_MMAP 395#ifdef HAVE_MUNLOCK 396 if (F_ISSET(env, ENV_LOCKDOWN)) 397 RETRY_CHK((munlock(addr, len)), ret); 398 /* 399 * !!! 400 * The return value is ignored. 401 */ 402#else 403 COMPQUIET(env, NULL); 404#endif 405 RETRY_CHK((munmap(addr, len)), ret); 406 ret = __os_posix_err(ret); 407#else 408 COMPQUIET(env, NULL); 409 ret = EINVAL; 410#endif 411 return (ret); 412} 413 414#ifdef HAVE_MMAP 415/* 416 * __os_map -- 417 * Call the mmap(2) function. 418 */ 419static int 420__os_map(env, path, fhp, len, is_region, is_rdonly, addrp) 421 ENV *env; 422 char *path; 423 DB_FH *fhp; 424 int is_region, is_rdonly; 425 size_t len; 426 void **addrp; 427{ 428 DB_ENV *dbenv; 429 int flags, prot, ret; 430 void *p; 431 432 /* 433 * We pass a DB_ENV handle to the user's replacement map function, 434 * so there must be a valid handle. 435 */ 436 DB_ASSERT(env, env != NULL && env->dbenv != NULL); 437 dbenv = env->dbenv; 438 439 if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL)) 440 __db_msg(env, "fileops: mmap %s", path); 441 442 DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1); 443 444 /* 445 * If it's read-only, it's private, and if it's not, it's shared. 446 * Don't bother with an additional parameter. 447 */ 448 flags = is_rdonly ? MAP_PRIVATE : MAP_SHARED; 449 450#ifdef MAP_FILE 451 /* 452 * Historically, MAP_FILE was required for mapping regular files, 453 * even though it was the default. Some systems have it, some 454 * don't, some that have it set it to 0. 455 */ 456 flags |= MAP_FILE; 457#endif 458 459 /* 460 * I know of no systems that implement the flag to tell the system 461 * that the region contains semaphores, but it's not an unreasonable 462 * thing to do, and has been part of the design since forever. I 463 * don't think anyone will object, but don't set it for read-only 464 * files, it doesn't make sense. 465 */ 466#ifdef MAP_HASSEMAPHORE 467 if (is_region && !is_rdonly) 468 flags |= MAP_HASSEMAPHORE; 469#else 470 COMPQUIET(is_region, 0); 471#endif 472 473 /* 474 * FreeBSD: 475 * Causes data dirtied via this VM map to be flushed to physical media 476 * only when necessary (usually by the pager) rather then gratuitously. 477 * Typically this prevents the update daemons from flushing pages 478 * dirtied through such maps and thus allows efficient sharing of 479 * memory across unassociated processes using a file-backed shared 480 * memory map. 481 */ 482#ifdef MAP_NOSYNC 483 flags |= MAP_NOSYNC; 484#endif 485 486 prot = PROT_READ | (is_rdonly ? 0 : PROT_WRITE); 487 488 /* 489 * XXX 490 * Work around a bug in the VMS V7.1 mmap() implementation. To map 491 * a file into memory on VMS it needs to be opened in a certain way, 492 * originally. To get the file opened in that certain way, the VMS 493 * mmap() closes the file and re-opens it. When it does this, it 494 * doesn't flush any caches out to disk before closing. The problem 495 * this causes us is that when the memory cache doesn't get written 496 * out, the file isn't big enough to match the memory chunk and the 497 * mmap() call fails. This call to fsync() fixes the problem. DEC 498 * thinks this isn't a bug because of language in XPG5 discussing user 499 * responsibility for on-disk and in-memory synchronization. 500 */ 501#ifdef VMS 502 if (__os_fsync(env, fhp) == -1) 503 return (__os_posix_err(__os_get_syserr())); 504#endif 505 506 /* MAP_FAILED was not defined in early mmap implementations. */ 507#ifndef MAP_FAILED 508#define MAP_FAILED -1 509#endif 510 if ((p = mmap(NULL, 511 len, prot, flags, fhp->fd, (off_t)0)) == (void *)MAP_FAILED) { 512 ret = __os_get_syserr(); 513 __db_syserr(env, ret, "mmap"); 514 return (__os_posix_err(ret)); 515 } 516 517 /* 518 * If it's a region, we want to make sure that the memory isn't paged. 519 * For example, Solaris will page large mpools because it thinks that 520 * I/O buffer memory is more important than we are. The mlock system 521 * call may or may not succeed (mlock is restricted to the super-user 522 * on some systems). Currently, the only other use of mmap in DB is 523 * to map read-only databases -- we don't want them paged, either, so 524 * the call isn't conditional. 525 */ 526 if (F_ISSET(env, ENV_LOCKDOWN)) { 527#ifdef HAVE_MLOCK 528 ret = mlock(p, len) == 0 ? 0 : __os_get_syserr(); 529#else 530 ret = DB_OPNOTSUP; 531#endif 532 if (ret != 0) { 533 __db_syserr(env, ret, "mlock"); 534 return (__os_posix_err(ret)); 535 } 536 } 537 538 *addrp = p; 539 return (0); 540} 541#endif 542 543#ifdef HAVE_SHMGET 544#ifndef SHM_R 545#define SHM_R 0400 546#endif 547#ifndef SHM_W 548#define SHM_W 0200 549#endif 550 551/* 552 * __shm_mode -- 553 * Map the DbEnv::open method file mode permissions to shmget call 554 * permissions. 555 */ 556static int 557__shm_mode(env) 558 ENV *env; 559{ 560 int mode; 561 562 /* Default to r/w owner, r/w group. */ 563 if (env->db_mode == 0) 564 return (SHM_R | SHM_W | SHM_R >> 3 | SHM_W >> 3); 565 566 mode = 0; 567 if (env->db_mode & S_IRUSR) 568 mode |= SHM_R; 569 if (env->db_mode & S_IWUSR) 570 mode |= SHM_W; 571 if (env->db_mode & S_IRGRP) 572 mode |= SHM_R >> 3; 573 if (env->db_mode & S_IWGRP) 574 mode |= SHM_W >> 3; 575 if (env->db_mode & S_IROTH) 576 mode |= SHM_R >> 6; 577 if (env->db_mode & S_IWOTH) 578 mode |= SHM_W >> 6; 579 return (mode); 580} 581#else 582/* 583 * __no_system_mem -- 584 * No system memory environments error message. 585 */ 586static int 587__no_system_mem(env) 588 ENV *env; 589{ 590 __db_errx(env, 591 "architecture doesn't support environments in system memory"); 592 return (DB_OPNOTSUP); 593} 594#endif /* HAVE_SHMGET */ 595