1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2004-2009 Oracle. All rights reserved. 5 * 6 * $Id$ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12 13#define REGISTER_FILE "__db.register" 14 15#define PID_EMPTY "X 0\n" /* Unused PID entry */ 16#define PID_FMT "%24lu\n" /* PID entry format */ 17 /* Unused PID test */ 18#define PID_ISEMPTY(p) (memcmp(p, PID_EMPTY, PID_LEN) == 0) 19#define PID_LEN (25) /* PID entry length */ 20 21#define REGISTRY_LOCK(env, pos, nowait) \ 22 __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 1, nowait) 23#define REGISTRY_UNLOCK(env, pos) \ 24 __os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 0, 0) 25#define REGISTRY_EXCL_LOCK(env, nowait) \ 26 REGISTRY_LOCK(env, 1, nowait) 27#define REGISTRY_EXCL_UNLOCK(env) \ 28 REGISTRY_UNLOCK(env, 1) 29 30static int __envreg_add __P((ENV *, int *, u_int32_t)); 31 32/* 33 * Support for portable, multi-process database environment locking, based on 34 * the Subversion SR (#11511). 35 * 36 * The registry feature is configured by specifying the DB_REGISTER flag to the 37 * DbEnv.open method. If DB_REGISTER is specified, DB opens the registry file 38 * in the database environment home directory. The registry file is formatted 39 * as follows: 40 * 41 * 12345 # process ID slot 1 42 * X # empty slot 43 * 12346 # process ID slot 2 44 * X # empty slot 45 * 12347 # process ID slot 3 46 * 12348 # process ID slot 4 47 * X 12349 # empty slot 48 * X # empty slot 49 * 50 * All lines are fixed-length. All lines are process ID slots. Empty slots 51 * are marked with leading non-digit characters. 52 * 53 * To modify the file, you get an exclusive lock on the first byte of the file. 54 * 55 * While holding any DbEnv handle, each process has an exclusive lock on the 56 * first byte of a process ID slot. There is a restriction on having more 57 * than one DbEnv handle open at a time, because Berkeley DB uses per-process 58 * locking to implement this feature, that is, a process may never have more 59 * than a single slot locked. 60 * 61 * This work requires that if a process dies or the system crashes, locks held 62 * by the dying processes will be dropped. (We can't use system shared 63 * memory-backed or filesystem-backed locks because they're persistent when a 64 * process dies.) On POSIX systems, we use fcntl(2) locks; on Win32 we have 65 * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on 66 * Lockfile/UnlockFile. 67 * 68 * We could implement the same solution with flock locking instead of fcntl, 69 * but flock would require a separate file for each process of control (and 70 * probably each DbEnv handle) in the database environment, which is fairly 71 * ugly. 72 * 73 * Whenever a process opens a new DbEnv handle, it walks the registry file and 74 * verifies it CANNOT acquire the lock for any non-empty slot. If a lock for 75 * a non-empty slot is available, we know a process died holding an open handle, 76 * and recovery needs to be run. 77 * 78 * It's possible to get corruption in the registry file. If a write system 79 * call fails after partially completing, there can be corrupted entries in 80 * the registry file, or a partial entry at the end of the file. This is OK. 81 * A corrupted entry will be flagged as a non-empty line during the registry 82 * file walk. Since the line was corrupted by process failure, no process will 83 * hold a lock on the slot, which will lead to recovery being run. 84 * 85 * There can still be processes running in the environment when we recover it, 86 * and, in fact, there can still be processes running in the old environment 87 * after we're up and running in a new one. This is safe because performing 88 * recovery panics (and removes) the existing environment, so the window of 89 * vulnerability is small. Further, we check the panic flag in the DB API 90 * methods, when waking from spinning on a mutex, and whenever we're about to 91 * write to disk). The only window of corruption is if the write check of the 92 * panic were to complete, the region subsequently be recovered, and then the 93 * write continues. That's very, very unlikely to happen. This vulnerability 94 * already exists in Berkeley DB, too, the registry code doesn't make it any 95 * worse than it already is. 96 * 97 * The only way to avoid that window entirely is to ensure that all processes 98 * in the Berkeley DB environment exit before we run recovery. Applications 99 * can do that if they maintain their own process registry outside of Berkeley 100 * DB, but it's a little more difficult to do here. The obvious approach is 101 * to send signals to any process using the database environment as soon as we 102 * decide to run recovery, but there are problems with that approach: we might 103 * not have permission to send signals to the process, the process might have 104 * signal handlers installed, the cookie stored might not be the same as kill's 105 * argument, we may not be able to reliably tell if the process died, and there 106 * are probably other problems. However, if we can send a signal, it reduces 107 * the window, and so we include the code here. To configure it, turn on the 108 * DB_ENVREG_KILL_ALL #define. 109 */ 110#define DB_ENVREG_KILL_ALL 0 111 112/* 113 * __envreg_register -- 114 * Register a ENV handle. 115 * 116 * PUBLIC: int __envreg_register __P((ENV *, int *, u_int32_t)); 117 */ 118int 119__envreg_register(env, need_recoveryp, flags) 120 ENV *env; 121 int *need_recoveryp; 122 u_int32_t flags; 123{ 124 DB_ENV *dbenv; 125 pid_t pid; 126 u_int32_t bytes, mbytes; 127 int ret; 128 char *pp; 129 130 *need_recoveryp = 0; 131 132 dbenv = env->dbenv; 133 dbenv->thread_id(dbenv, &pid, NULL); 134 pp = NULL; 135 136 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 137 __db_msg(env, "%lu: register environment", (u_long)pid); 138 139 /* Build the path name and open the registry file. */ 140 if ((ret = __db_appname(env, 141 DB_APP_NONE, REGISTER_FILE, NULL, &pp)) != 0) 142 goto err; 143 if ((ret = __os_open(env, pp, 0, 144 DB_OSO_CREATE, DB_MODE_660, &dbenv->registry)) != 0) 145 goto err; 146 147 /* 148 * Wait for an exclusive lock on the file. 149 * 150 * !!! 151 * We're locking bytes that don't yet exist, but that's OK as far as 152 * I know. 153 */ 154 if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0) 155 goto err; 156 157 /* 158 * If the file size is 0, initialize the file. 159 * 160 * Run recovery if we create the file, that means we can clean up the 161 * system by removing the registry file and restarting the application. 162 */ 163 if ((ret = __os_ioinfo( 164 env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0) 165 goto err; 166 if (mbytes == 0 && bytes == 0) { 167 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 168 __db_msg(env, "%lu: creating %s", (u_long)pid, pp); 169 *need_recoveryp = 1; 170 } 171 172 /* Register this process. */ 173 if ((ret = __envreg_add(env, need_recoveryp, flags) != 0)) 174 goto err; 175 176 /* 177 * Release our exclusive lock if we don't need to run recovery. If 178 * we need to run recovery, ENV->open will call back into register 179 * code once recovery has completed. 180 */ 181 if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0) 182 goto err; 183 184 if (0) { 185err: *need_recoveryp = 0; 186 187 /* 188 * !!! 189 * Closing the file handle must release all of our locks. 190 */ 191 if (dbenv->registry != NULL) 192 (void)__os_closehandle(env, dbenv->registry); 193 dbenv->registry = NULL; 194 } 195 196 if (pp != NULL) 197 __os_free(env, pp); 198 199 return (ret); 200} 201 202/* 203 * __envreg_add -- 204 * Add the process' pid to the register. 205 */ 206static int 207__envreg_add(env, need_recoveryp, flags) 208 ENV *env; 209 int *need_recoveryp; 210 u_int32_t flags; 211{ 212 DB_ENV *dbenv; 213 DB_THREAD_INFO *ip; 214 REGENV * renv; 215 REGINFO *infop; 216 pid_t pid; 217 off_t end, pos, dead; 218 size_t nr, nw; 219 u_int lcnt; 220 u_int32_t bytes, mbytes, orig_flags; 221 int need_recovery, ret, t_ret; 222 char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10]; 223 224 dbenv = env->dbenv; 225 need_recovery = 0; 226 COMPQUIET(dead, 0); 227 COMPQUIET(p, NULL); 228 ip = NULL; 229 230 /* Get a copy of our process ID. */ 231 dbenv->thread_id(dbenv, &pid, NULL); 232 snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid); 233 234 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 235 __db_msg(env, "%lu: adding self to registry", (u_long)pid); 236 237#if DB_ENVREG_KILL_ALL 238 if (0) { 239kill_all: /* 240 * A second pass through the file, this time killing any 241 * processes still running. 242 */ 243 if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0) 244 return (ret); 245 } 246#endif 247 248 /* 249 * Read the file. Skip empty slots, and check that a lock is held 250 * for any allocated slots. An allocated slot which we can lock 251 * indicates a process died holding a handle and recovery needs to 252 * be run. 253 */ 254 for (lcnt = 0;; ++lcnt) { 255 if ((ret = __os_read( 256 env, dbenv->registry, buf, PID_LEN, &nr)) != 0) 257 return (ret); 258 if (nr == 0) 259 break; 260 261 /* 262 * A partial record at the end of the file is possible if a 263 * previously un-registered process was interrupted while 264 * registering. 265 */ 266 if (nr != PID_LEN) { 267 need_recovery = 1; 268 break; 269 } 270 271 if (PID_ISEMPTY(buf)) { 272 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 273 __db_msg(env, "%02u: EMPTY", lcnt); 274 continue; 275 } 276 277 /* 278 * !!! 279 * DB_REGISTER is implemented using per-process locking, only 280 * a single ENV handle may be open per process. Enforce 281 * that restriction. 282 */ 283 if (memcmp(buf, pid_buf, PID_LEN) == 0) { 284 __db_errx(env, 285 "DB_REGISTER limits processes to one open DB_ENV handle per environment"); 286 return (EINVAL); 287 } 288 289 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) { 290 for (p = buf; *p == ' ';) 291 ++p; 292 buf[nr - 1] = '\0'; 293 } 294 295#if DB_ENVREG_KILL_ALL 296 if (need_recovery) { 297 pid = (pid_t)strtoul(buf, NULL, 10); 298 (void)kill(pid, SIGKILL); 299 300 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 301 __db_msg(env, "%02u: %s: KILLED", lcnt, p); 302 continue; 303 } 304#endif 305 pos = (off_t)lcnt * PID_LEN; 306 if (REGISTRY_LOCK(env, pos, 1) == 0) { 307 if ((ret = REGISTRY_UNLOCK(env, pos)) != 0) 308 return (ret); 309 310 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 311 __db_msg(env, "%02u: %s: FAILED", lcnt, p); 312 313 need_recovery = 1; 314 dead = pos; 315#if DB_ENVREG_KILL_ALL 316 goto kill_all; 317#else 318 break; 319#endif 320 } else 321 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 322 __db_msg(env, "%02u: %s: LOCKED", lcnt, p); 323 } 324 325 /* 326 * If we have to perform recovery... 327 * 328 * Mark all slots empty. Registry ignores empty slots we can't lock, 329 * so it doesn't matter if any of the processes are in the middle of 330 * exiting Berkeley DB -- they'll discard their lock when they exit. 331 */ 332 if (need_recovery) { 333 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 334 __db_msg(env, "%lu: recovery required", (u_long)pid); 335 336 if (LF_ISSET(DB_FAILCHK)) { 337 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 338 __db_msg(env, 339 "%lu: performing failchk", (u_long)pid); 340 /* The environment will already exist, so we do not 341 * want DB_CREATE set, nor do we want any recovery at 342 * this point. No need to put values back as flags is 343 * passed in by value. Save original dbenv flags in 344 * case we need to recover/remove existing environment. 345 * Set DB_ENV_FAILCHK before attach to help ensure we 346 * dont block on a mutex held by the dead process. 347 */ 348 LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL); 349 orig_flags = dbenv->flags; 350 F_SET(dbenv, DB_ENV_FAILCHK); 351 /* Attach to environment and subsystems. */ 352 if ((ret = __env_attach_regions( 353 dbenv, flags, orig_flags, 0)) != 0) 354 goto sig_proc; 355 if ((t_ret = 356 __env_set_state(env, &ip, THREAD_FAILCHK)) != 0 && 357 ret == 0) 358 ret = t_ret; 359 if ((t_ret = 360 __env_failchk_int(dbenv)) != 0 && ret == 0) 361 ret = t_ret; 362 /* Detach from environment and deregister thread. */ 363 if ((t_ret = 364 __env_refresh(dbenv, orig_flags, 0)) != 0 && 365 ret == 0) 366 ret = t_ret; 367 if (ret == 0) { 368 if ((ret = __os_seek(env, dbenv->registry, 369 0, 0,(u_int32_t)dead)) != 0 || 370 (ret = __os_write(env, dbenv->registry, 371 PID_EMPTY, PID_LEN, &nw)) != 0) 372 return (ret); 373 need_recovery = 0; 374 goto add; 375 } 376 377 } 378 /* If we can't attach, then we cannot set DB_REGISTER panic. */ 379sig_proc: if (__env_attach(env, NULL, 0, 0) == 0) { 380 infop = env->reginfo; 381 renv = infop->primary; 382 /* Indicate DB_REGSITER panic. Also, set environment 383 * panic as this is the panic trigger mechanism in 384 * the code that everything looks for. 385 */ 386 renv->reg_panic = 1; 387 renv->panic = 1; 388 (void)__env_detach(env, 0); 389 } 390 391 /* Wait for processes to see the panic and leave. */ 392 __os_yield(env, 0, dbenv->envreg_timeout); 393 394 /* FIGURE out how big the file is. */ 395 if ((ret = __os_ioinfo( 396 env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0) 397 return (ret); 398 end = (off_t)mbytes * MEGABYTE + bytes; 399 400 /* 401 * Seek to the beginning of the file and overwrite slots to 402 * the end of the file. 403 * 404 * It's possible for there to be a partial entry at the end of 405 * the file if a process died when trying to register. If so, 406 * correct for it and overwrite it as well. 407 */ 408 if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0) 409 return (ret); 410 for (lcnt = 0; lcnt < ((u_int)end / PID_LEN + 411 ((u_int)end % PID_LEN == 0 ? 0 : 1)); ++lcnt) { 412 413 if ((ret = __os_read( 414 env, dbenv->registry, buf, PID_LEN, &nr)) != 0) 415 return (ret); 416 417 pos = (off_t)lcnt * PID_LEN; 418 /* do not notify on dead process */ 419 if (pos != dead) { 420 pid = (pid_t)strtoul(buf, NULL, 10); 421 DB_EVENT(env, DB_EVENT_REG_ALIVE, &pid); 422 } 423 424 if ((ret = __os_seek(env, 425 dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 || 426 (ret = __os_write(env, 427 dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0) 428 return (ret); 429 } 430 /* wait one last time to get everyone out */ 431 __os_yield(env, 0, dbenv->envreg_timeout); 432 } 433 434 /* 435 * Seek to the first process slot and add ourselves to the first empty 436 * slot we can lock. 437 */ 438add: if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0) 439 return (ret); 440 for (lcnt = 0;; ++lcnt) { 441 if ((ret = __os_read( 442 env, dbenv->registry, buf, PID_LEN, &nr)) != 0) 443 return (ret); 444 if (nr == PID_LEN && !PID_ISEMPTY(buf)) 445 continue; 446 pos = (off_t)lcnt * PID_LEN; 447 if (REGISTRY_LOCK(env, pos, 1) == 0) { 448 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 449 __db_msg(env, 450 "%lu: locking slot %02u at offset %lu", 451 (u_long)pid, lcnt, (u_long)pos); 452 453 if ((ret = __os_seek(env, 454 dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 || 455 (ret = __os_write(env, 456 dbenv->registry, pid_buf, PID_LEN, &nw)) != 0) 457 return (ret); 458 dbenv->registry_off = (u_int32_t)pos; 459 break; 460 } 461 } 462 463 if (need_recovery) 464 *need_recoveryp = 1; 465 466 return (ret); 467} 468 469/* 470 * __envreg_unregister -- 471 * Unregister a ENV handle. 472 * 473 * PUBLIC: int __envreg_unregister __P((ENV *, int)); 474 */ 475int 476__envreg_unregister(env, recovery_failed) 477 ENV *env; 478 int recovery_failed; 479{ 480 DB_ENV *dbenv; 481 size_t nw; 482 int ret, t_ret; 483 484 dbenv = env->dbenv; 485 ret = 0; 486 487 /* 488 * If recovery failed, we want to drop our locks and return, but still 489 * make sure any subsequent process doesn't decide everything is just 490 * fine and try to get into the database environment. In the case of 491 * an error, discard our locks, but leave our slot filled-in. 492 */ 493 if (recovery_failed) 494 goto err; 495 496 /* 497 * Why isn't an exclusive lock necessary to discard a ENV handle? 498 * 499 * We mark our process ID slot empty before we discard the process slot 500 * lock, and threads of control reviewing the register file ignore any 501 * slots which they can't lock. 502 */ 503 if ((ret = __os_seek(env, 504 dbenv->registry, 0, 0, dbenv->registry_off)) != 0 || 505 (ret = __os_write( 506 env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0) 507 goto err; 508 509 /* 510 * !!! 511 * This code assumes that closing the file descriptor discards all 512 * held locks. 513 * 514 * !!! 515 * There is an ordering problem here -- in the case of a process that 516 * failed in recovery, we're unlocking both the exclusive lock and our 517 * slot lock. If the OS unlocked the exclusive lock and then allowed 518 * another thread of control to acquire the exclusive lock before also 519 * also releasing our slot lock, we could race. That can't happen, I 520 * don't think. 521 */ 522err: if ((t_ret = 523 __os_closehandle(env, dbenv->registry)) != 0 && ret == 0) 524 ret = t_ret; 525 526 dbenv->registry = NULL; 527 return (ret); 528} 529 530/* 531 * __envreg_xunlock -- 532 * Discard the exclusive lock held by the ENV handle. 533 * 534 * PUBLIC: int __envreg_xunlock __P((ENV *)); 535 */ 536int 537__envreg_xunlock(env) 538 ENV *env; 539{ 540 DB_ENV *dbenv; 541 pid_t pid; 542 int ret; 543 544 dbenv = env->dbenv; 545 546 dbenv->thread_id(dbenv, &pid, NULL); 547 548 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) 549 __db_msg(env, 550 "%lu: recovery completed, unlocking", (u_long)pid); 551 552 if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0) 553 return (ret); 554 555 __db_err(env, ret, "%s: exclusive file unlock", REGISTER_FILE); 556 return (__env_panic(env, ret)); 557} 558