1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: os_map.c,v 12.26 2008/01/31 18:40:46 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12
13#ifdef HAVE_SYSTEM_INCLUDE_FILES
14#ifdef HAVE_MMAP
15#include <sys/mman.h>
16#endif
17
18#ifdef HAVE_SHMGET
19#include <sys/ipc.h>
20#include <sys/shm.h>
21#endif
22#endif
23
24#ifdef HAVE_MMAP
25static int __os_map __P((ENV *, char *, DB_FH *, size_t, int, int, void **));
26#endif
27#ifdef HAVE_SHMGET
28static int __shm_mode __P((ENV *));
29#else
30static int __no_system_mem __P((ENV *));
31#endif
32
33/*
34 * __os_attach --
35 *	Create/join a shared memory region.
36 *
37 * PUBLIC: int __os_attach __P((ENV *, REGINFO *, REGION *));
38 */
39int
40__os_attach(env, infop, rp)
41	ENV *env;
42	REGINFO *infop;
43	REGION *rp;
44{
45	DB_ENV *dbenv;
46	int create_ok, ret;
47
48	/*
49	 * We pass a DB_ENV handle to the user's replacement map function,
50	 * so there must be a valid handle.
51	 */
52	DB_ASSERT(env, env != NULL && env->dbenv != NULL);
53	dbenv = env->dbenv;
54
55	if (DB_GLOBAL(j_region_map) != NULL) {
56		/*
57		 * We have to find out if the region is being created.  Ask
58		 * the underlying map function, and use the REGINFO structure
59		 * to pass that information back to our caller.
60		 */
61		create_ok = F_ISSET(infop, REGION_CREATE) ? 1 : 0;
62		ret = DB_GLOBAL(j_region_map)
63		    (dbenv, infop->name, rp->size, &create_ok, &infop->addr);
64		if (create_ok)
65			F_SET(infop, REGION_CREATE);
66		else
67			F_CLR(infop, REGION_CREATE);
68		return (ret);
69	}
70
71	if (F_ISSET(env, ENV_SYSTEM_MEM)) {
72		/*
73		 * If the region is in system memory on UNIX, we use shmget(2).
74		 *
75		 * !!!
76		 * There exist spinlocks that don't work in shmget memory, e.g.,
77		 * the HP/UX msemaphore interface.  If we don't have locks that
78		 * will work in shmget memory, we better be private and not be
79		 * threaded.  If we reach this point, we know we're public, so
80		 * it's an error.
81		 */
82#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
83		__db_errx(env,
84	    "architecture does not support locks inside system shared memory");
85		return (EINVAL);
86#endif
87#if defined(HAVE_SHMGET)
88		{
89		key_t segid;
90		int id, mode;
91
92		/*
93		 * We could potentially create based on REGION_CREATE_OK, but
94		 * that's dangerous -- we might get crammed in sideways if
95		 * some of the expected regions exist but others do not.  Also,
96		 * if the requested size differs from an existing region's
97		 * actual size, then all sorts of nasty things can happen.
98		 * Basing create solely on REGION_CREATE is much safer -- a
99		 * recovery will get us straightened out.
100		 */
101		if (F_ISSET(infop, REGION_CREATE)) {
102			/*
103			 * The application must give us a base System V IPC key
104			 * value.  Adjust that value based on the region's ID,
105			 * and correct so the user's original value appears in
106			 * the ipcs output.
107			 */
108			if (dbenv->shm_key == INVALID_REGION_SEGID) {
109				__db_errx(env,
110			    "no base system shared memory ID specified");
111				return (EINVAL);
112			}
113
114			/*
115			 * !!!
116			 * The BDB API takes a "long" as the base segment ID,
117			 * then adds an unsigned 32-bit value and stores it
118			 * in a key_t.  Wrong, admittedly, but not worth an
119			 * API change to fix.
120			 */
121			segid = (key_t)
122			    ((u_long)dbenv->shm_key + (infop->id - 1));
123
124			/*
125			 * If map to an existing region, assume the application
126			 * crashed and we're restarting.  Delete the old region
127			 * and re-try.  If that fails, return an error, the
128			 * application will have to select a different segment
129			 * ID or clean up some other way.
130			 */
131			if ((id = shmget(segid, 0, 0)) != -1) {
132				(void)shmctl(id, IPC_RMID, NULL);
133				if ((id = shmget(segid, 0, 0)) != -1) {
134					__db_errx(env,
135		"shmget: key: %ld: shared system memory region already exists",
136					    (long)segid);
137					return (EAGAIN);
138				}
139			}
140
141			/*
142			 * Map the DbEnv::open method file mode permissions to
143			 * shmget call permissions.
144			 */
145			mode = IPC_CREAT | __shm_mode(env);
146			if ((id = shmget(segid, rp->size, mode)) == -1) {
147				ret = __os_get_syserr();
148				__db_syserr(env, ret,
149	"shmget: key: %ld: unable to create shared system memory region",
150				    (long)segid);
151				return (__os_posix_err(ret));
152			}
153			rp->segid = id;
154		} else
155			id = rp->segid;
156
157		if ((infop->addr = shmat(id, NULL, 0)) == (void *)-1) {
158			infop->addr = NULL;
159			ret = __os_get_syserr();
160			__db_syserr(env, ret,
161	"shmat: id %d: unable to attach to shared system memory region", id);
162			return (__os_posix_err(ret));
163		}
164
165		/* Optionally lock the memory down. */
166		if (F_ISSET(env, ENV_LOCKDOWN)) {
167#ifdef HAVE_SHMCTL_SHM_LOCK
168			ret = shmctl(
169			    id, SHM_LOCK, NULL) == 0 ? 0 : __os_get_syserr();
170#else
171			ret = DB_OPNOTSUP;
172#endif
173			if (ret != 0) {
174				__db_syserr(env, ret,
175	"shmctl/SHM_LOCK: id %d: unable to lock down shared memory region", id);
176				return (__os_posix_err(ret));
177			}
178		}
179
180		return (0);
181		}
182#else
183		return (__no_system_mem(env));
184#endif
185	}
186
187#ifdef HAVE_MMAP
188	{
189	DB_FH *fhp;
190
191	fhp = NULL;
192
193	/*
194	 * Try to open/create the shared region file.  We DO NOT need to ensure
195	 * that multiple threads/processes attempting to simultaneously create
196	 * the region are properly ordered, our caller has already taken care
197	 * of that.
198	 */
199	if ((ret = __os_open(env, infop->name, 0,
200	    DB_OSO_REGION |
201	    (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0),
202	    env->db_mode, &fhp)) != 0)
203		__db_err(env, ret, "%s", infop->name);
204
205	/*
206	 * If we created the file, grow it to its full size before mapping
207	 * it in.  We really want to avoid touching the buffer cache after
208	 * mmap(2) is called, doing anything else confuses the hell out of
209	 * systems without merged VM/buffer cache systems, or, more to the
210	 * point, *badly* merged VM/buffer cache systems.
211	 */
212	if (ret == 0 && F_ISSET(infop, REGION_CREATE)) {
213		if (F_ISSET(dbenv, DB_ENV_REGION_INIT))
214			ret = __db_file_write(env, fhp,
215			    rp->size / MEGABYTE, rp->size % MEGABYTE, 0x00);
216		else
217			ret = __db_file_extend(env, fhp, rp->size);
218	}
219
220	/* Map the file in. */
221	if (ret == 0)
222		ret = __os_map(env,
223		    infop->name, fhp, rp->size, 1, 0, &infop->addr);
224
225	if (fhp != NULL)
226		(void)__os_closehandle(env, fhp);
227
228	return (ret);
229	}
230#else
231	COMPQUIET(infop, NULL);
232	COMPQUIET(rp, NULL);
233	__db_errx(env,
234	    "architecture lacks mmap(2), shared environments not possible");
235	return (DB_OPNOTSUP);
236#endif
237}
238
239/*
240 * __os_detach --
241 *	Detach from a shared memory region.
242 *
243 * PUBLIC: int __os_detach __P((ENV *, REGINFO *, int));
244 */
245int
246__os_detach(env, infop, destroy)
247	ENV *env;
248	REGINFO *infop;
249	int destroy;
250{
251	DB_ENV *dbenv;
252	REGION *rp;
253	int ret;
254
255	/*
256	 * We pass a DB_ENV handle to the user's replacement unmap function,
257	 * so there must be a valid handle.
258	 */
259	DB_ASSERT(env, env != NULL && env->dbenv != NULL);
260	dbenv = env->dbenv;
261
262	rp = infop->rp;
263
264	/* If the user replaced the unmap call, call through their interface. */
265	if (DB_GLOBAL(j_region_unmap) != NULL)
266		return (DB_GLOBAL(j_region_unmap)(dbenv, infop->addr));
267
268	if (F_ISSET(env, ENV_SYSTEM_MEM)) {
269#ifdef HAVE_SHMGET
270		int segid;
271
272		/*
273		 * We may be about to remove the memory referenced by rp,
274		 * save the segment ID, and (optionally) wipe the original.
275		 */
276		segid = rp->segid;
277		if (destroy)
278			rp->segid = INVALID_REGION_SEGID;
279
280		if (shmdt(infop->addr) != 0) {
281			ret = __os_get_syserr();
282			__db_syserr(env, ret, "shmdt");
283			return (__os_posix_err(ret));
284		}
285
286		if (destroy && shmctl(segid, IPC_RMID,
287		    NULL) != 0 && (ret = __os_get_syserr()) != EINVAL) {
288			__db_syserr(env, ret,
289	    "shmctl: id %d: unable to delete system shared memory region",
290			    segid);
291			return (__os_posix_err(ret));
292		}
293
294		return (0);
295#else
296		return (__no_system_mem(env));
297#endif
298	}
299
300#ifdef HAVE_MMAP
301#ifdef HAVE_MUNLOCK
302	if (F_ISSET(env, ENV_LOCKDOWN))
303		(void)munlock(infop->addr, rp->size);
304#endif
305	if (munmap(infop->addr, rp->size) != 0) {
306		ret = __os_get_syserr();
307		__db_syserr(env, ret, "munmap");
308		return (__os_posix_err(ret));
309	}
310
311	if (destroy && (ret = __os_unlink(env, infop->name, 1)) != 0)
312		return (ret);
313
314	return (0);
315#else
316	COMPQUIET(destroy, 0);
317	COMPQUIET(ret, 0);
318	return (EINVAL);
319#endif
320}
321
322/*
323 * __os_mapfile --
324 *	Map in a shared memory file.
325 *
326 * PUBLIC: int __os_mapfile __P((ENV *, char *, DB_FH *, size_t, int, void **));
327 */
328int
329__os_mapfile(env, path, fhp, len, is_rdonly, addrp)
330	ENV *env;
331	char *path;
332	DB_FH *fhp;
333	int is_rdonly;
334	size_t len;
335	void **addrp;
336{
337#if defined(HAVE_MMAP) && !defined(HAVE_QNX)
338	DB_ENV *dbenv;
339
340	/* If the user replaced the map call, call through their interface. */
341	if (DB_GLOBAL(j_file_map) != NULL) {
342		/*
343		 * We pass a DB_ENV handle to the user's replacement map
344		 * function, so there must be a valid handle.
345		 */
346		DB_ASSERT(env, env != NULL && env->dbenv != NULL);
347		dbenv = env->dbenv;
348
349		return (
350		    DB_GLOBAL(j_file_map)(dbenv, path, len, is_rdonly, addrp));
351	}
352
353	return (__os_map(env, path, fhp, len, 0, is_rdonly, addrp));
354#else
355	COMPQUIET(env, NULL);
356	COMPQUIET(path, NULL);
357	COMPQUIET(fhp, NULL);
358	COMPQUIET(is_rdonly, 0);
359	COMPQUIET(len, 0);
360	COMPQUIET(addrp, NULL);
361	return (DB_OPNOTSUP);
362#endif
363}
364
365/*
366 * __os_unmapfile --
367 *	Unmap the shared memory file.
368 *
369 * PUBLIC: int __os_unmapfile __P((ENV *, void *, size_t));
370 */
371int
372__os_unmapfile(env, addr, len)
373	ENV *env;
374	void *addr;
375	size_t len;
376{
377	DB_ENV *dbenv;
378	int ret;
379
380	/*
381	 * We pass a DB_ENV handle to the user's replacement unmap function,
382	 * so there must be a valid handle.
383	 */
384	DB_ASSERT(env, env != NULL && env->dbenv != NULL);
385	dbenv = env->dbenv;
386
387	if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
388		__db_msg(env, "fileops: munmap");
389
390	/* If the user replaced the map call, call through their interface. */
391	if (DB_GLOBAL(j_file_unmap) != NULL)
392		return (DB_GLOBAL(j_file_unmap)(dbenv, addr));
393
394#ifdef HAVE_MMAP
395#ifdef HAVE_MUNLOCK
396	if (F_ISSET(env, ENV_LOCKDOWN))
397		RETRY_CHK((munlock(addr, len)), ret);
398		/*
399		 * !!!
400		 * The return value is ignored.
401		 */
402#else
403	COMPQUIET(env, NULL);
404#endif
405	RETRY_CHK((munmap(addr, len)), ret);
406	ret = __os_posix_err(ret);
407#else
408	COMPQUIET(env, NULL);
409	ret = EINVAL;
410#endif
411	return (ret);
412}
413
#ifdef HAVE_MMAP
/*
 * __os_map --
 *	Call the mmap(2) function.
 *
 *	env		environment handle; ENV_LOCKDOWN requests mlock(2).
 *	path		file name, used only for the verbose message.
 *	fhp		open file handle whose descriptor is mapped.
 *	len		number of bytes to map, starting at offset 0.
 *	is_region	non-zero for a region: adds MAP_HASSEMAPHORE (where
 *			the platform defines it) to writable mappings.
 *	is_rdonly	non-zero maps PROT_READ/MAP_PRIVATE, zero maps
 *			PROT_READ|PROT_WRITE/MAP_SHARED.
 *	addrp		receives the mapped address, on success only.
 *
 *	Returns 0 on success, otherwise the system error translated by
 *	__os_posix_err.  No mapping is left behind on failure.
 */
static int
__os_map(env, path, fhp, len, is_region, is_rdonly, addrp)
	ENV *env;
	char *path;
	DB_FH *fhp;
	int is_region, is_rdonly;
	size_t len;
	void **addrp;
{
	DB_ENV *dbenv;
	int flags, prot, ret;
	void *p;

	/* The verbose setting lives in the DB_ENV, so one must exist. */
	DB_ASSERT(env, env != NULL && env->dbenv != NULL);
	dbenv = env->dbenv;

	if (FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
		__db_msg(env, "fileops: mmap %s", path);

	DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);

	/*
	 * If it's read-only, it's private, and if it's not, it's shared.
	 * Don't bother with an additional parameter.
	 */
	flags = is_rdonly ? MAP_PRIVATE : MAP_SHARED;

#ifdef MAP_FILE
	/*
	 * Historically, MAP_FILE was required for mapping regular files,
	 * even though it was the default.  Some systems have it, some
	 * don't, some that have it set it to 0.
	 */
	flags |= MAP_FILE;
#endif

	/*
	 * I know of no systems that implement the flag to tell the system
	 * that the region contains semaphores, but it's not an unreasonable
	 * thing to do, and has been part of the design since forever.  I
	 * don't think anyone will object, but don't set it for read-only
	 * files, it doesn't make sense.
	 */
#ifdef MAP_HASSEMAPHORE
	if (is_region && !is_rdonly)
		flags |= MAP_HASSEMAPHORE;
#else
	COMPQUIET(is_region, 0);
#endif

	/*
	 * FreeBSD:
	 * Causes data dirtied via this VM map to be flushed to physical media
	 * only when necessary (usually by the pager) rather than gratuitously.
	 * Typically this prevents the update daemons from flushing pages
	 * dirtied through such maps and thus allows efficient sharing of
	 * memory across unassociated processes using a file-backed shared
	 * memory map.
	 */
#ifdef MAP_NOSYNC
	flags |= MAP_NOSYNC;
#endif

	prot = PROT_READ | (is_rdonly ? 0 : PROT_WRITE);

	/*
	 * XXX
	 * Work around a bug in the VMS V7.1 mmap() implementation.  To map
	 * a file into memory on VMS it needs to be opened in a certain way,
	 * originally.  To get the file opened in that certain way, the VMS
	 * mmap() closes the file and re-opens it.  When it does this, it
	 * doesn't flush any caches out to disk before closing.  The problem
	 * this causes us is that when the memory cache doesn't get written
	 * out, the file isn't big enough to match the memory chunk and the
	 * mmap() call fails.  This call to fsync() fixes the problem.  DEC
	 * thinks this isn't a bug because of language in XPG5 discussing user
	 * responsibility for on-disk and in-memory synchronization.
	 */
#ifdef VMS
	if (__os_fsync(env, fhp) == -1)
		return (__os_posix_err(__os_get_syserr()));
#endif

	/* MAP_FAILED was not defined in early mmap implementations. */
#ifndef MAP_FAILED
#define	MAP_FAILED	-1
#endif
	if ((p = mmap(NULL,
	    len, prot, flags, fhp->fd, (off_t)0)) == (void *)MAP_FAILED) {
		ret = __os_get_syserr();
		__db_syserr(env, ret, "mmap");
		return (__os_posix_err(ret));
	}

	/*
	 * When the environment asked for lock-down we want to make sure the
	 * memory isn't paged.  For example, Solaris will page large mpools
	 * because it thinks that I/O buffer memory is more important than we
	 * are.  The mlock system call may or may not succeed (mlock is
	 * restricted to the super-user on some systems).  The only other use
	 * of mmap in DB is to map read-only databases -- we don't want them
	 * paged, either, so the lock is applied to both kinds of mapping.
	 */
	if (F_ISSET(env, ENV_LOCKDOWN)) {
#ifdef HAVE_MLOCK
		ret = mlock(p, len) == 0 ? 0 : __os_get_syserr();
#else
		ret = DB_OPNOTSUP;
#endif
		if (ret != 0) {
			__db_syserr(env, ret, "mlock");

			/*
			 * The address is never handed back on this path, so
			 * nobody else can release the mapping: drop it here
			 * rather than leak it.  The error was captured in ret
			 * above, so a failure of the cleanup itself is
			 * ignored.
			 */
			(void)munmap(p, len);
			return (__os_posix_err(ret));
		}
	}

	*addrp = p;
	return (0);
}
#endif
542
543#ifdef HAVE_SHMGET
544#ifndef SHM_R
545#define	SHM_R	0400
546#endif
547#ifndef SHM_W
548#define	SHM_W	0200
549#endif
550
551/*
552 * __shm_mode --
553 *	Map the DbEnv::open method file mode permissions to shmget call
554 *	permissions.
555 */
556static int
557__shm_mode(env)
558	ENV *env;
559{
560	int mode;
561
562	/* Default to r/w owner, r/w group. */
563	if (env->db_mode == 0)
564		return (SHM_R | SHM_W | SHM_R >> 3 | SHM_W >> 3);
565
566	mode = 0;
567	if (env->db_mode & S_IRUSR)
568		mode |= SHM_R;
569	if (env->db_mode & S_IWUSR)
570		mode |= SHM_W;
571	if (env->db_mode & S_IRGRP)
572		mode |= SHM_R >> 3;
573	if (env->db_mode & S_IWGRP)
574		mode |= SHM_W >> 3;
575	if (env->db_mode & S_IROTH)
576		mode |= SHM_R >> 6;
577	if (env->db_mode & S_IWOTH)
578		mode |= SHM_W >> 6;
579	return (mode);
580}
581#else
582/*
583 * __no_system_mem --
584 *	No system memory environments error message.
585 */
586static int
587__no_system_mem(env)
588	ENV *env;
589{
590	__db_errx(env,
591	    "architecture doesn't support environments in system memory");
592	return (DB_OPNOTSUP);
593}
594#endif /* HAVE_SHMGET */
595