1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1998,2008 Oracle.  All rights reserved.
5 *
6 * $Id: region.h,v 12.21 2008/05/07 12:35:10 bschmeck Exp $
7 */
8
9#ifndef _DB_REGION_H_
10#define	_DB_REGION_H_
11
12/*
13 * The DB environment consists of some number of "regions", which are described
14 * by the following four structures:
15 *
16 *	REGENV	   -- shared information about the environment
17 *	REGENV_REF -- file describing system memory version of REGENV
18 *	REGION	   -- shared information about a single region
19 *	REGINFO	   -- per-process information about a REGION
20 *
21 * There are three types of memory that hold regions:
22 *	per-process heap (malloc)
23 *	file mapped into memory (mmap, MapViewOfFile)
24 *	system memory (shmget, CreateFileMapping)
25 *
26 * By default, regions are created in filesystem-backed shared memory.  They
27 * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private
28 * to a process, in heap memory (DB_PRIVATE).
29 *
30 * Regions in the filesystem are named "__db.001", "__db.002" and so on.  If
31 * we're not using a private environment allocated in heap, "__db.001" will
32 * always exist, as we use it to synchronize on the regions, whether they are
33 * in filesystem-backed memory or system memory.
34 *
 * The file "__db.001" contains a REGENV structure and an array of REGION
 * structures.  Each REGION structure describes an underlying chunk of
 * shared memory.
38 *
39 *	__db.001
40 *	+---------+
 *	|REGENV   |
42 *	+---------+   +----------+
43 *	|REGION   |-> | __db.002 |
44 *	|	  |   +----------+
45 *	+---------+   +----------+
46 *	|REGION   |-> | __db.003 |
47 *	|	  |   +----------+
48 *	+---------+   +----------+
49 *	|REGION   |-> | __db.004 |
50 *	|	  |   +----------+
51 *	+---------+
52 *
53 * The tricky part about manipulating the regions is creating or joining the
54 * database environment.  We have to be sure only a single thread of control
55 * creates and/or recovers a database environment.  All other threads should
56 * then join without seeing inconsistent data.
57 *
58 * We do this in two parts: first, we use the underlying O_EXCL flag to the
59 * open system call to serialize creation of the __db.001 file.  The thread
60 * of control creating that file then proceeds to create the remaining
61 * regions in the environment, including the mutex region.  Once the mutex
62 * region has been created, the creating thread of control fills in the
63 * __db.001 file's magic number.  Other threads of control (the ones that
64 * didn't create the __db.001 file), wait on the initialization of the
65 * __db.001 file's magic number.  After it has been initialized, all threads
66 * of control can proceed, using normal shared mutex locking procedures for
67 * exclusion.
68 *
69 * REGIONs are not moved or removed during the life of the environment, and
70 * so processes can have long-lived references to them.
71 *
72 * One of the REGION structures describes the environment region itself.
73 *
74 * The REGION array is not locked in any way.  It's an array so we don't have
75 * to manipulate data structures after a crash -- on some systems, we have to
76 * join and clean up the mutex region after application failure.  Using an
77 * array means we don't have to worry about broken links or other nastiness
78 * after the failure.
79 *
80 * All requests to create or join a region return a REGINFO structure, which
81 * is held by the caller and used to open and subsequently close the reference
82 * to the region.  The REGINFO structure contains the per-process information
83 * that we need to access the region.
84 *
 * There is one remaining complication.  If the regions (including the
 * environment region) live in system memory, and the system memory isn't
 * "named" somehow in the filesystem name space, we need some way of finding
 * it.  Do this by writing the REGENV_REF structure into the "__db.001" file.
 * When we find a __db.001 file that is too small to be a real, on-disk
 * environment, we use the information it contains to redirect to the real
 * "__db.001" file/memory.  This currently only happens when the REGENV file
 * is in shared system memory.
92 *
93 * Although DB does not currently grow regions when they run out of memory, it
94 * would be possible to do so.  To grow a region, allocate a new region of the
95 * appropriate size, then copy the old region over it and insert the additional
96 * memory into the already existing shalloc arena.  Region users must reset
97 * their base addresses and any local pointers into the memory, of course.
98 * This failed in historic versions of DB because the region mutexes lived in
99 * the mapped memory, and when it was unmapped and remapped (or copied),
100 * threads could lose track of it.  Also, some systems didn't support mutex
101 * copying, e.g., from OSF1 V4.0:
102 *
103 *	The address of an msemaphore structure may be significant.  If the
104 *	msemaphore structure contains any value copied from an msemaphore
105 *	structure at a different address, the result is undefined.
106 *
107 * All mutexes are now maintained in a separate region which is never unmapped,
108 * so growing regions should be possible.
109 */
110
111#if defined(__cplusplus)
112extern "C" {
113#endif
114
/*
 * Region file names and region IDs.
 *
 * DB_REGION_FMT formatted with REGION_ID_ENV (1) yields DB_REGION_ENV, and
 * every name it produces starts with DB_REGION_PREFIX.
 */
#define	DB_REGION_PREFIX	"__db"		/* DB file name prefix. */
#define	DB_REGION_FMT		"__db.%03d"	/* Region file name format. */
#define	DB_REGION_ENV		"__db.001"	/* Primary environment name. */

#define	INVALID_REGION_ID	0	/* Out-of-band region ID. */
#define	REGION_ID_ENV		1	/* Primary environment ID. */
121
/*
 * Region type.
 *
 * INVALID_REGION_TYPE is pinned to 0 so it can serve as the out-of-band
 * value; the real types follow in declaration order starting at 1.  The
 * value is stored in the "type" field of the shared REGION structure.
 */
typedef enum {
	INVALID_REGION_TYPE=0,		/* Out-of-band region type. */
	REGION_TYPE_ENV,
	REGION_TYPE_LOCK,
	REGION_TYPE_LOG,
	REGION_TYPE_MPOOL,
	REGION_TYPE_MUTEX,
	REGION_TYPE_TXN
} reg_type_t;
130
/*
 * Segment IDs are either shmget(2) or Win16 segment identifiers.  They are
 * both stored in a "long", and we need an out-of-band value.
 *
 * The replacement list is parenthesized so the negative constant stays a
 * single operand wherever the macro is expanded.
 */
#define	INVALID_REGION_SEGID	(-1)

/*
 * Nothing can live at region offset 0, because, in all cases, that's where
 * we store *something*.  Lots of code needs an out-of-band value for region
 * offsets, so we use 0.
 */
#define	INVALID_ROFF		0
142
143/* Reference describing system memory version of REGENV. */
144typedef struct __db_reg_env_ref {
145	roff_t	   size;		/* Region size. */
146	long	   segid;		/* UNIX shmget ID, VxWorks ID. */
147} REGENV_REF;
148
149/* Per-environment region information. */
150typedef struct __db_reg_env {
151	/*
152	 * !!!
153	 * The magic, panic, version, envid and signature fields of the region
154	 * are fixed in size, the timestamp field is the first field which is
155	 * variable length.  These fields must never change in order, to
156	 * guarantee we can always read them, no matter what release we have.
157	 *
158	 * !!!
159	 * The magic and panic fields are NOT protected by any mutex, and for
160	 * this reason cannot be anything more complicated than zero/non-zero.
161	 */
162	u_int32_t magic;		/* Valid region magic number. */
163	u_int32_t panic;		/* Environment is dead. */
164
165	u_int32_t majver;		/* Major DB version number. */
166	u_int32_t minver;		/* Minor DB version number. */
167	u_int32_t patchver;		/* Patch DB version number. */
168
169	u_int32_t envid;		/* Unique environment ID. */
170
171	u_int32_t signature;		/* Structure signatures. */
172
173	time_t	  timestamp;		/* Creation time. */
174
175	u_int32_t init_flags;		/* Flags environment initialized with.*/
176
177	/*
178	 * The mtx_regenv mutex protects the environment reference count and
179	 * memory allocation from the primary shared region (the crypto, thread
180	 * control block and replication implementations allocate memory from
181	 * the primary shared region).
182	 *
183	 * The rest of the fields are initialized at creation time, and don't
184	 * need mutex protection.  The flags, op_timestamp and rep_timestamp
185	 * fields are used by replication only and are protected by the
186	 * replication mutex.  The rep_timestamp is is not protected when it
187	 * is used in recovery as that is already single threaded.
188	 */
189	db_mutex_t mtx_regenv;		/* Refcnt, region allocation mutex. */
190	u_int32_t  refcnt;		/* References to the environment. */
191
192	u_int32_t region_cnt;		/* Number of REGIONs. */
193	roff_t	  region_off;		/* Offset of region array */
194
195	roff_t	  cipher_off;		/* Offset of cipher area */
196
197	roff_t	  thread_off;		/* Offset of the thread area. */
198
199	roff_t	  rep_off;		/* Offset of the replication area. */
200#define	DB_REGENV_REPLOCKED	0x0001	/* Env locked for rep backup. */
201	u_int32_t flags;		/* Shared environment flags. */
202#define	DB_REGENV_TIMEOUT	30	/* Backup timeout. */
203	time_t	  op_timestamp;		/* Timestamp for operations. */
204	time_t	  rep_timestamp;	/* Timestamp for rep db handles. */
205
206	uintmax_t unused;		/* The ALLOC_LAYOUT structure follows
207					 * the REGENV structure in memory and
208					 * contains uintmax_t fields.  Force
209					 * proper alignment of that structure.
210					 */
211} REGENV;
212
213/* Per-region shared region information. */
214typedef struct __db_region {
215	u_int32_t	id;		/* Region id. */
216	reg_type_t	type;		/* Region type. */
217
218	roff_t	size_orig;		/* Region size in bytes (original). */
219	roff_t	size;			/* Region size in bytes (adjusted). */
220
221	roff_t	primary;		/* Primary data structure offset. */
222
223	long	segid;			/* UNIX shmget(2), Win16 segment ID. */
224} REGION;
225
226/*
227 * Per-process/per-attachment information about a single region.
228 */
229struct __db_reginfo_t {		/* __env_region_attach IN parameters. */
230	ENV	   *env;		/* Enclosing environment. */
231	reg_type_t  type;		/* Region type. */
232	u_int32_t   id;			/* Region id. */
233
234				/* env_region_attach OUT parameters. */
235	REGION	   *rp;			/* Shared region. */
236
237	char	   *name;		/* Region file name. */
238
239	void	   *addr_orig;		/* Region address (original). */
240	void	   *addr;		/* Region address (adjusted). */
241	void	   *primary;		/* Primary data structure address. */
242
243	size_t	    max_alloc;		/* Maximum bytes allocated. */
244	size_t	    allocated;		/* Bytes allocated. */
245
246#ifdef DB_WIN32
247	HANDLE	wnt_handle;		/* Win/NT HANDLE. */
248#endif
249
250#define	REGION_CREATE		0x01	/* Caller created region. */
251#define	REGION_CREATE_OK	0x02	/* Caller willing to create region. */
252#define	REGION_JOIN_OK		0x04	/* Caller is looking for a match. */
253	u_int32_t   flags;
254};
255
/*
 * R_ADDR	Return a per-process address for a shared region offset.
 * R_OFFSET	Return a shared region offset for a per-process address.
 *
 * When the environment has ENV_PRIVATE set, an "offset" is simply the
 * address itself and the conversion is a cast in either direction.
 * Otherwise offsets are byte distances from the region's (adjusted) base
 * address, reginfop->addr.
 *
 * Both macros may evaluate reginfop more than once; don't pass an argument
 * with side effects.
 */
#define	R_ADDR(reginfop, offset)					\
	(F_ISSET((reginfop)->env, ENV_PRIVATE) ?			\
	    (void *)(offset) :						\
	    (void *)((u_int8_t *)((reginfop)->addr) + (offset)))
#define	R_OFFSET(reginfop, p)						\
	(F_ISSET((reginfop)->env, ENV_PRIVATE) ?			\
	    (roff_t)(p) :						\
	    (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr))
268
/*
 * PANIC_ISSET, PANIC_CHECK:
 *	Check to see if the DB environment is dead.
 *
 * PANIC_ISSET is true only when env and env->reginfo are both non-NULL, the
 * panic field of the REGENV at env->reginfo->primary is non-zero, and
 * DB_ENV_NOPANIC is not set on env->dbenv.  The NULL tests come first, so
 * the macro is safe to use on an environment with no region attached.
 *
 * PANIC_CHECK returns __env_panic_msg(env) from the *calling* function when
 * the environment has panicked.  It expands to a bare "if" statement that
 * already carries its terminating semicolon, so use it only as a stand-alone
 * statement, never as the unbraced body of an if/else.
 */
#define	PANIC_ISSET(env)						\
	((env) != NULL && (env)->reginfo != NULL &&			\
	    ((REGENV *)(env)->reginfo->primary)->panic != 0 &&		\
	    !F_ISSET((env)->dbenv, DB_ENV_NOPANIC))

#define	PANIC_CHECK(env)						\
	if (PANIC_ISSET(env))						\
		return (__env_panic_msg(env));
281
282#if defined(__cplusplus)
283}
284#endif
285#endif /* !_DB_REGION_H_ */
286