1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1998,2008 Oracle. All rights reserved. 5 * 6 * $Id: region.h,v 12.21 2008/05/07 12:35:10 bschmeck Exp $ 7 */ 8 9#ifndef _DB_REGION_H_ 10#define _DB_REGION_H_ 11 12/* 13 * The DB environment consists of some number of "regions", which are described 14 * by the following four structures: 15 * 16 * REGENV -- shared information about the environment 17 * REGENV_REF -- file describing system memory version of REGENV 18 * REGION -- shared information about a single region 19 * REGINFO -- per-process information about a REGION 20 * 21 * There are three types of memory that hold regions: 22 * per-process heap (malloc) 23 * file mapped into memory (mmap, MapViewOfFile) 24 * system memory (shmget, CreateFileMapping) 25 * 26 * By default, regions are created in filesystem-backed shared memory. They 27 * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private 28 * to a process, in heap memory (DB_PRIVATE). 29 * 30 * Regions in the filesystem are named "__db.001", "__db.002" and so on. If 31 * we're not using a private environment allocated in heap, "__db.001" will 32 * always exist, as we use it to synchronize on the regions, whether they are 33 * in filesystem-backed memory or system memory. 34 * 35 * The file "__db.001" contains a REGENV structure and an array of REGION 36 * structures. Each REGION structures describes an underlying chunk of 37 * shared memory. 38 * 39 * __db.001 40 * +---------+ 41 * |REGENV | 42 * +---------+ +----------+ 43 * |REGION |-> | __db.002 | 44 * | | +----------+ 45 * +---------+ +----------+ 46 * |REGION |-> | __db.003 | 47 * | | +----------+ 48 * +---------+ +----------+ 49 * |REGION |-> | __db.004 | 50 * | | +----------+ 51 * +---------+ 52 * 53 * The tricky part about manipulating the regions is creating or joining the 54 * database environment. We have to be sure only a single thread of control 55 * creates and/or recovers a database environment. All other threads should 56 * then join without seeing inconsistent data. 57 * 58 * We do this in two parts: first, we use the underlying O_EXCL flag to the 59 * open system call to serialize creation of the __db.001 file. The thread 60 * of control creating that file then proceeds to create the remaining 61 * regions in the environment, including the mutex region. Once the mutex 62 * region has been created, the creating thread of control fills in the 63 * __db.001 file's magic number. Other threads of control (the ones that 64 * didn't create the __db.001 file), wait on the initialization of the 65 * __db.001 file's magic number. After it has been initialized, all threads 66 * of control can proceed, using normal shared mutex locking procedures for 67 * exclusion. 68 * 69 * REGIONs are not moved or removed during the life of the environment, and 70 * so processes can have long-lived references to them. 71 * 72 * One of the REGION structures describes the environment region itself. 73 * 74 * The REGION array is not locked in any way. It's an array so we don't have 75 * to manipulate data structures after a crash -- on some systems, we have to 76 * join and clean up the mutex region after application failure. Using an 77 * array means we don't have to worry about broken links or other nastiness 78 * after the failure. 79 * 80 * All requests to create or join a region return a REGINFO structure, which 81 * is held by the caller and used to open and subsequently close the reference 82 * to the region. The REGINFO structure contains the per-process information 83 * that we need to access the region. 84 * 85 * The one remaining complication. If the regions (including the environment 86 * region) live in system memory, and the system memory isn't "named" somehow 87 * in the filesystem name space, we need some way of finding it. Do this by 88 * by writing the REGENV_REF structure into the "__db.001" file. When we find 89 * a __db.001 file that is too small to be a real, on-disk environment, we use 90 * the information it contains to redirect to the real "__db.001" file/memory. 91 * This currently only happens when the REGENV file is in shared system memory. 92 * 93 * Although DB does not currently grow regions when they run out of memory, it 94 * would be possible to do so. To grow a region, allocate a new region of the 95 * appropriate size, then copy the old region over it and insert the additional 96 * memory into the already existing shalloc arena. Region users must reset 97 * their base addresses and any local pointers into the memory, of course. 98 * This failed in historic versions of DB because the region mutexes lived in 99 * the mapped memory, and when it was unmapped and remapped (or copied), 100 * threads could lose track of it. Also, some systems didn't support mutex 101 * copying, e.g., from OSF1 V4.0: 102 * 103 * The address of an msemaphore structure may be significant. If the 104 * msemaphore structure contains any value copied from an msemaphore 105 * structure at a different address, the result is undefined. 106 * 107 * All mutexes are now maintained in a separate region which is never unmapped, 108 * so growing regions should be possible. 109 */ 110 111#if defined(__cplusplus) 112extern "C" { 113#endif 114 115#define DB_REGION_PREFIX "__db" /* DB file name prefix. */ 116#define DB_REGION_FMT "__db.%03d" /* Region file name format. */ 117#define DB_REGION_ENV "__db.001" /* Primary environment name. */ 118 119#define INVALID_REGION_ID 0 /* Out-of-band region ID. */ 120#define REGION_ID_ENV 1 /* Primary environment ID. */ 121 122typedef enum { 123 INVALID_REGION_TYPE=0, /* Region type. */ 124 REGION_TYPE_ENV, 125 REGION_TYPE_LOCK, 126 REGION_TYPE_LOG, 127 REGION_TYPE_MPOOL, 128 REGION_TYPE_MUTEX, 129 REGION_TYPE_TXN } reg_type_t; 130 131#define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or 132 * Win16 segment identifiers. They are 133 * both stored in a "long", and we need 134 * an out-of-band value. 135 */ 136/* 137 * Nothing can live at region offset 0, because, in all cases, that's where 138 * we store *something*. Lots of code needs an out-of-band value for region 139 * offsets, so we use 0. 140 */ 141#define INVALID_ROFF 0 142 143/* Reference describing system memory version of REGENV. */ 144typedef struct __db_reg_env_ref { 145 roff_t size; /* Region size. */ 146 long segid; /* UNIX shmget ID, VxWorks ID. */ 147} REGENV_REF; 148 149/* Per-environment region information. */ 150typedef struct __db_reg_env { 151 /* 152 * !!! 153 * The magic, panic, version, envid and signature fields of the region 154 * are fixed in size, the timestamp field is the first field which is 155 * variable length. These fields must never change in order, to 156 * guarantee we can always read them, no matter what release we have. 157 * 158 * !!! 159 * The magic and panic fields are NOT protected by any mutex, and for 160 * this reason cannot be anything more complicated than zero/non-zero. 161 */ 162 u_int32_t magic; /* Valid region magic number. */ 163 u_int32_t panic; /* Environment is dead. */ 164 165 u_int32_t majver; /* Major DB version number. */ 166 u_int32_t minver; /* Minor DB version number. */ 167 u_int32_t patchver; /* Patch DB version number. */ 168 169 u_int32_t envid; /* Unique environment ID. */ 170 171 u_int32_t signature; /* Structure signatures. */ 172 173 time_t timestamp; /* Creation time. */ 174 175 u_int32_t init_flags; /* Flags environment initialized with.*/ 176 177 /* 178 * The mtx_regenv mutex protects the environment reference count and 179 * memory allocation from the primary shared region (the crypto, thread 180 * control block and replication implementations allocate memory from 181 * the primary shared region). 182 * 183 * The rest of the fields are initialized at creation time, and don't 184 * need mutex protection. The flags, op_timestamp and rep_timestamp 185 * fields are used by replication only and are protected by the 186 * replication mutex. The rep_timestamp is is not protected when it 187 * is used in recovery as that is already single threaded. 188 */ 189 db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */ 190 u_int32_t refcnt; /* References to the environment. */ 191 192 u_int32_t region_cnt; /* Number of REGIONs. */ 193 roff_t region_off; /* Offset of region array */ 194 195 roff_t cipher_off; /* Offset of cipher area */ 196 197 roff_t thread_off; /* Offset of the thread area. */ 198 199 roff_t rep_off; /* Offset of the replication area. */ 200#define DB_REGENV_REPLOCKED 0x0001 /* Env locked for rep backup. */ 201 u_int32_t flags; /* Shared environment flags. */ 202#define DB_REGENV_TIMEOUT 30 /* Backup timeout. */ 203 time_t op_timestamp; /* Timestamp for operations. */ 204 time_t rep_timestamp; /* Timestamp for rep db handles. */ 205 206 uintmax_t unused; /* The ALLOC_LAYOUT structure follows 207 * the REGENV structure in memory and 208 * contains uintmax_t fields. Force 209 * proper alignment of that structure. 210 */ 211} REGENV; 212 213/* Per-region shared region information. */ 214typedef struct __db_region { 215 u_int32_t id; /* Region id. */ 216 reg_type_t type; /* Region type. */ 217 218 roff_t size_orig; /* Region size in bytes (original). */ 219 roff_t size; /* Region size in bytes (adjusted). */ 220 221 roff_t primary; /* Primary data structure offset. */ 222 223 long segid; /* UNIX shmget(2), Win16 segment ID. */ 224} REGION; 225 226/* 227 * Per-process/per-attachment information about a single region. 228 */ 229struct __db_reginfo_t { /* __env_region_attach IN parameters. */ 230 ENV *env; /* Enclosing environment. */ 231 reg_type_t type; /* Region type. */ 232 u_int32_t id; /* Region id. */ 233 234 /* env_region_attach OUT parameters. */ 235 REGION *rp; /* Shared region. */ 236 237 char *name; /* Region file name. */ 238 239 void *addr_orig; /* Region address (original). */ 240 void *addr; /* Region address (adjusted). */ 241 void *primary; /* Primary data structure address. */ 242 243 size_t max_alloc; /* Maximum bytes allocated. */ 244 size_t allocated; /* Bytes allocated. */ 245 246#ifdef DB_WIN32 247 HANDLE wnt_handle; /* Win/NT HANDLE. */ 248#endif 249 250#define REGION_CREATE 0x01 /* Caller created region. */ 251#define REGION_CREATE_OK 0x02 /* Caller willing to create region. */ 252#define REGION_JOIN_OK 0x04 /* Caller is looking for a match. */ 253 u_int32_t flags; 254}; 255 256/* 257 * R_ADDR Return a per-process address for a shared region offset. 258 * R_OFFSET Return a shared region offset for a per-process address. 259 */ 260#define R_ADDR(reginfop, offset) \ 261 (F_ISSET((reginfop)->env, ENV_PRIVATE) ? \ 262 (void *)(offset) : \ 263 (void *)((u_int8_t *)((reginfop)->addr) + (offset))) 264#define R_OFFSET(reginfop, p) \ 265 (F_ISSET((reginfop)->env, ENV_PRIVATE) ? \ 266 (roff_t)(p) : \ 267 (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr)) 268 269/* 270 * PANIC_ISSET, PANIC_CHECK: 271 * Check to see if the DB environment is dead. 272 */ 273#define PANIC_ISSET(env) \ 274 ((env) != NULL && (env)->reginfo != NULL && \ 275 ((REGENV *)(env)->reginfo->primary)->panic != 0 && \ 276 !F_ISSET((env)->dbenv, DB_ENV_NOPANIC)) 277 278#define PANIC_CHECK(env) \ 279 if (PANIC_ISSET(env)) \ 280 return (__env_panic_msg(env)); 281 282#if defined(__cplusplus) 283} 284#endif 285#endif /* !_DB_REGION_H_ */ 286