1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2004,2008 Oracle. All rights reserved. 5 * 6 * $Id: env_register.c,v 1.42 2008/05/07 12:27:33 bschmeck Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12 13#define REGISTER_FILE "__db.register" 14 15#define PID_EMPTY "X 0\n" /* Unused PID entry */ 16#define PID_FMT "%24lu\n" /* PID entry format */ 17 /* Unused PID test */ 18#define PID_ISEMPTY(p) (memcmp(p, PID_EMPTY, PID_LEN) == 0) 19#define PID_LEN (25) /* PID entry length */ 20 21#define REGISTRY_LOCK(env, pos, nowait) \ 22 __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 1, nowait) 23#define REGISTRY_UNLOCK(env, pos) \ 24 __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 0, 0) 25#define REGISTRY_EXCL_LOCK(env, nowait) \ 26 REGISTRY_LOCK(env, 1, nowait) 27#define REGISTRY_EXCL_UNLOCK(env) \ 28 REGISTRY_UNLOCK(env, 1) 29 30static int __envreg_add __P((ENV *, int *)); 31 32/* 33 * Support for portable, multi-process database environment locking, based on 34 * the Subversion SR (#11511). 35 * 36 * The registry feature is configured by specifying the DB_REGISTER flag to the 37 * DbEnv.open method. If DB_REGISTER is specified, DB opens the registry file 38 * in the database environment home directory. The registry file is formatted 39 * as follows: 40 * 41 * 12345 # process ID slot 1 42 * X # empty slot 43 * 12346 # process ID slot 2 44 * X # empty slot 45 * 12347 # process ID slot 3 46 * 12348 # process ID slot 4 47 * X 12349 # empty slot 48 * X # empty slot 49 * 50 * All lines are fixed-length. All lines are process ID slots. Empty slots 51 * are marked with leading non-digit characters. 52 * 53 * To modify the file, you get an exclusive lock on the first byte of the file. 54 * 55 * While holding any DbEnv handle, each process has an exclusive lock on the 56 * first byte of a process ID slot. There is a restriction on having more 57 * than one DbEnv handle open at a time, because Berkeley DB uses per-process 58 * locking to implement this feature, that is, a process may never have more 59 * than a single slot locked. 60 * 61 * This work requires that if a process dies or the system crashes, locks held 62 * by the dying processes will be dropped. (We can't use system shared 63 * memory-backed or filesystem-backed locks because they're persistent when a 64 * process dies.) On POSIX systems, we use fcntl(2) locks; on Win32 we have 65 * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on 66 * Lockfile/UnlockFile. 67 * 68 * We could implement the same solution with flock locking instead of fcntl, 69 * but flock would require a separate file for each process of control (and 70 * probably each DbEnv handle) in the database environment, which is fairly 71 * ugly. 72 * 73 * Whenever a process opens a new DbEnv handle, it walks the registry file and 74 * verifies it CANNOT acquire the lock for any non-empty slot. If a lock for 75 * a non-empty slot is available, we know a process died holding an open handle, 76 * and recovery needs to be run. 77 * 78 * It's possible to get corruption in the registry file. If a write system 79 * call fails after partially completing, there can be corrupted entries in 80 * the registry file, or a partial entry at the end of the file. This is OK. 81 * A corrupted entry will be flagged as a non-empty line during the registry 82 * file walk. Since the line was corrupted by process failure, no process will 83 * hold a lock on the slot, which will lead to recovery being run. 84 * 85 * There can still be processes running in the environment when we recover it, 86 * and, in fact, there can still be processes running in the old environment 87 * after we're up and running in a new one. This is safe because performing 88 * recovery panics (and removes) the existing environment, so the window of 89 * vulnerability is small. Further, we check the panic flag in the DB API 90 * methods, when waking from spinning on a mutex, and whenever we're about to 91 * write to disk). The only window of corruption is if the write check of the 92 * panic were to complete, the region subsequently be recovered, and then the 93 * write continues. That's very, very unlikely to happen. This vulnerability 94 * already exists in Berkeley DB, too, the registry code doesn't make it any 95 * worse than it already is. 96 * 97 * The only way to avoid that window entirely is to ensure that all processes 98 * in the Berkeley DB environment exit before we run recovery. Applications 99 * can do that if they maintain their own process registry outside of Berkeley 100 * DB, but it's a little more difficult to do here. The obvious approach is 101 * to send signals to any process using the database environment as soon as we 102 * decide to run recovery, but there are problems with that approach: we might 103 * not have permission to send signals to the process, the process might have 104 * signal handlers installed, the cookie stored might not be the same as kill's 105 * argument, we may not be able to reliably tell if the process died, and there 106 * are probably other problems. However, if we can send a signal, it reduces 107 * the window, and so we include the code here. To configure it, turn on the 108 * DB_ENVREG_KILL_ALL #define. 109 */ 110#define DB_ENVREG_KILL_ALL 0 111 112/* 113 * __envreg_register -- 114 * Register a ENV handle. 115 * 116 * PUBLIC: int __envreg_register __P((ENV *, int *)); 117 */ 118int 119__envreg_register(env, need_recoveryp) 120 ENV *env; 121 int *need_recoveryp; 122{ 123 DB_ENV *dbenv; 124 pid_t pid; 125 u_int32_t bytes, mbytes; 126 int ret; 127 char *pp; 128 129 *need_recoveryp = 0; 130 131 dbenv = env->dbenv; 132 dbenv->thread_id(dbenv, &pid, NULL); 133 pp = NULL; 134 135 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 136 __db_msg(env, "%lu: register environment", (u_long)pid); 137 138 /* Build the path name and open the registry file. */ 139 if ((ret = 140 __db_appname(env, DB_APP_NONE, REGISTER_FILE, 0, NULL, &pp)) != 0) 141 goto err; 142 if ((ret = __os_open(env, pp, 0, 143 DB_OSO_CREATE, DB_MODE_660, &dbenv->registry)) != 0) 144 goto err; 145 146 /* 147 * Wait for an exclusive lock on the file. 148 * 149 * !!! 150 * We're locking bytes that don't yet exist, but that's OK as far as 151 * I know. 152 */ 153 if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0) 154 goto err; 155 156 /* 157 * If the file size is 0, initialize the file. 158 * 159 * Run recovery if we create the file, that means we can clean up the 160 * system by removing the registry file and restarting the application. 161 */ 162 if ((ret = __os_ioinfo( 163 env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0) 164 goto err; 165 if (mbytes == 0 && bytes == 0) { 166 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 167 __db_msg(env, "%lu: creating %s", (u_long)pid, pp); 168 *need_recoveryp = 1; 169 } 170 171 /* Register this process. */ 172 if ((ret = __envreg_add(env, need_recoveryp)) != 0) 173 goto err; 174 175 /* 176 * Release our exclusive lock if we don't need to run recovery. If 177 * we need to run recovery, ENV->open will call back into register 178 * code once recovery has completed. 179 */ 180 if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0) 181 goto err; 182 183 if (0) { 184err: *need_recoveryp = 0; 185 186 /* 187 * !!! 188 * Closing the file handle must release all of our locks. 189 */ 190 if (dbenv->registry != NULL) 191 (void)__os_closehandle(env, dbenv->registry); 192 dbenv->registry = NULL; 193 } 194 195 if (pp != NULL) 196 __os_free(env, pp); 197 198 return (ret); 199} 200 201/* 202 * __envreg_add -- 203 * Add the process' pid to the register. 204 */ 205static int 206__envreg_add(env, need_recoveryp) 207 ENV *env; 208 int *need_recoveryp; 209{ 210 DB_ENV *dbenv; 211 pid_t pid; 212 off_t end, pos; 213 size_t nr, nw; 214 u_int lcnt; 215 u_int32_t bytes, mbytes; 216 int need_recovery, ret; 217 char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10]; 218 219 dbenv = env->dbenv; 220 need_recovery = 0; 221 COMPQUIET(p, NULL); 222 223 /* Get a copy of our process ID. */ 224 dbenv->thread_id(dbenv, &pid, NULL); 225 snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid); 226 227 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 228 __db_msg(env, "%lu: adding self to registry", (u_long)pid); 229 230#if DB_ENVREG_KILL_ALL 231 if (0) { 232kill_all: /* 233 * A second pass through the file, this time killing any 234 * processes still running. 235 */ 236 if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0) 237 return (ret); 238 } 239#endif 240 241 /* 242 * Read the file. Skip empty slots, and check that a lock is held 243 * for any allocated slots. An allocated slot which we can lock 244 * indicates a process died holding a handle and recovery needs to 245 * be run. 246 */ 247 for (lcnt = 0;; ++lcnt) { 248 if ((ret = __os_read( 249 env, dbenv->registry, buf, PID_LEN, &nr)) != 0) 250 return (ret); 251 if (nr == 0) 252 break; 253 254 /* 255 * A partial record at the end of the file is possible if a 256 * previously un-registered process was interrupted while 257 * registering. 258 */ 259 if (nr != PID_LEN) { 260 need_recovery = 1; 261 break; 262 } 263 264 if (PID_ISEMPTY(buf)) { 265 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 266 __db_msg(env, "%02u: EMPTY", lcnt); 267 continue; 268 } 269 270 /* 271 * !!! 272 * DB_REGISTER is implemented using per-process locking, only 273 * a single ENV handle may be open per process. Enforce 274 * that restriction. 275 */ 276 if (memcmp(buf, pid_buf, PID_LEN) == 0) { 277 __db_errx(env, 278 "DB_REGISTER limits processes to one open DB_ENV handle per environment"); 279 return (EINVAL); 280 } 281 282 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) { 283 for (p = buf; *p == ' ';) 284 ++p; 285 buf[nr - 1] = '\0'; 286 } 287 288#if DB_ENVREG_KILL_ALL 289 if (need_recovery) { 290 pid = (pid_t)strtoul(buf, NULL, 10); 291 (void)kill(pid, SIGKILL); 292 293 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 294 __db_msg(env, "%02u: %s: KILLED", lcnt, p); 295 continue; 296 } 297#endif 298 pos = (off_t)lcnt * PID_LEN; 299 if (REGISTRY_LOCK(env, pos, 1) == 0) { 300 if ((ret = REGISTRY_UNLOCK(env, pos)) != 0) 301 return (ret); 302 303 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 304 __db_msg(env, "%02u: %s: FAILED", lcnt, p); 305 306 need_recovery = 1; 307#if DB_ENVREG_KILL_ALL 308 goto kill_all; 309#else 310 break; 311#endif 312 } else 313 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 314 __db_msg(env, "%02u: %s: LOCKED", lcnt, p); 315 } 316 317 /* 318 * If we have to perform recovery... 319 * 320 * Mark all slots empty. Registry ignores empty slots we can't lock, 321 * so it doesn't matter if any of the processes are in the middle of 322 * exiting Berkeley DB -- they'll discard their lock when they exit. 323 */ 324 if (need_recovery) { 325 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 326 __db_msg(env, "%lu: recovery required", (u_long)pid); 327 328 /* Figure out how big the file is. */ 329 if ((ret = __os_ioinfo( 330 env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0) 331 return (ret); 332 end = (off_t)mbytes * MEGABYTE + bytes; 333 334 /* 335 * Seek to the beginning of the file and overwrite slots to 336 * the end of the file. 337 * 338 * It's possible for there to be a partial entry at the end of 339 * the file if a process died when trying to register. If so, 340 * correct for it and overwrite it as well. 341 */ 342 if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0) 343 return (ret); 344 for (lcnt = (u_int)end / PID_LEN + 345 ((u_int)end % PID_LEN == 0 ? 0 : 1); lcnt > 0; --lcnt) 346 if ((ret = __os_write(env, 347 dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0) 348 return (ret); 349 } 350 351 /* 352 * Seek to the first process slot and add ourselves to the first empty 353 * slot we can lock. 354 */ 355 if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0) 356 return (ret); 357 for (lcnt = 0;; ++lcnt) { 358 if ((ret = __os_read( 359 env, dbenv->registry, buf, PID_LEN, &nr)) != 0) 360 return (ret); 361 if (nr == PID_LEN && !PID_ISEMPTY(buf)) 362 continue; 363 pos = (off_t)lcnt * PID_LEN; 364 if (REGISTRY_LOCK(env, pos, 1) == 0) { 365 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 366 __db_msg(env, 367 "%lu: locking slot %02u at offset %lu", 368 (u_long)pid, lcnt, (u_long)pos); 369 370 if ((ret = __os_seek(env, 371 dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 || 372 (ret = __os_write(env, 373 dbenv->registry, pid_buf, PID_LEN, &nw)) != 0) 374 return (ret); 375 dbenv->registry_off = (u_int32_t)pos; 376 break; 377 } 378 } 379 380 if (need_recovery) 381 *need_recoveryp = 1; 382 383 return (ret); 384} 385 386/* 387 * __envreg_unregister -- 388 * Unregister a ENV handle. 389 * 390 * PUBLIC: int __envreg_unregister __P((ENV *, int)); 391 */ 392int 393__envreg_unregister(env, recovery_failed) 394 ENV *env; 395 int recovery_failed; 396{ 397 DB_ENV *dbenv; 398 size_t nw; 399 int ret, t_ret; 400 401 dbenv = env->dbenv; 402 ret = 0; 403 404 /* 405 * If recovery failed, we want to drop our locks and return, but still 406 * make sure any subsequent process doesn't decide everything is just 407 * fine and try to get into the database environment. In the case of 408 * an error, discard our locks, but leave our slot filled-in. 409 */ 410 if (recovery_failed) 411 goto err; 412 413 /* 414 * Why isn't an exclusive lock necessary to discard a ENV handle? 415 * 416 * We mark our process ID slot empty before we discard the process slot 417 * lock, and threads of control reviewing the register file ignore any 418 * slots which they can't lock. 419 */ 420 if ((ret = __os_seek(env, 421 dbenv->registry, 0, 0, dbenv->registry_off)) != 0 || 422 (ret = __os_write( 423 env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0) 424 goto err; 425 426 /* 427 * !!! 428 * This code assumes that closing the file descriptor discards all 429 * held locks. 430 * 431 * !!! 432 * There is an ordering problem here -- in the case of a process that 433 * failed in recovery, we're unlocking both the exclusive lock and our 434 * slot lock. If the OS unlocked the exclusive lock and then allowed 435 * another thread of control to acquire the exclusive lock before also 436 * also releasing our slot lock, we could race. That can't happen, I 437 * don't think. 438 */ 439err: if ((t_ret = 440 __os_closehandle(env, dbenv->registry)) != 0 && ret == 0) 441 ret = t_ret; 442 443 dbenv->registry = NULL; 444 return (ret); 445} 446 447/* 448 * __envreg_xunlock -- 449 * Discard the exclusive lock held by the ENV handle. 450 * 451 * PUBLIC: int __envreg_xunlock __P((ENV *)); 452 */ 453int 454__envreg_xunlock(env) 455 ENV *env; 456{ 457 DB_ENV *dbenv; 458 pid_t pid; 459 int ret; 460 461 dbenv = env->dbenv; 462 463 dbenv->thread_id(dbenv, &pid, NULL); 464 465 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 466 __db_msg(env, 467 "%lu: recovery completed, unlocking", (u_long)pid); 468 469 if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0) 470 return (ret); 471 472 __db_err(env, ret, "%s: exclusive file unlock", REGISTER_FILE); 473 return (__env_panic(env, ret)); 474} 475