1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2004-2009 Oracle.  All rights reserved.
5 *
6 * $Id$
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12
/* Name of the registry file (see the format description below). */
#define	REGISTER_FILE	"__db.register"

/*
 * Registry slots are fixed-length text records.  PID_FMT right-justifies the
 * process ID in 24 columns and appends a newline, i.e. PID_LEN (25) bytes in
 * total; PID_EMPTY is the record written into a slot that is not in use.
 * PID_ISEMPTY compares exactly PID_LEN bytes of the buffer against PID_EMPTY,
 * so the buffer does not need to be NUL-terminated.
 */
#define	PID_EMPTY	"X                      0\n"	/* Unused PID entry */
#define	PID_FMT		"%24lu\n"			/* PID entry format */
							/* Unused PID test */
#define	PID_ISEMPTY(p)	(memcmp(p, PID_EMPTY, PID_LEN) == 0)
#define	PID_LEN		(25)				/* PID entry length */

/*
 * Byte-range locking on the registry file handle via __os_fdlock.
 *
 * REGISTRY_LOCK/REGISTRY_UNLOCK operate on the byte at offset "pos"; the
 * fourth __os_fdlock argument is 1 to acquire and 0 to release, and "nowait"
 * is passed through unchanged (callers pass 1 when they only want to probe
 * a slot, 0 when they are prepared to block).  Callers treat a return of 0
 * as success.
 *
 * REGISTRY_EXCL_LOCK/REGISTRY_EXCL_UNLOCK use byte offset 1 for the
 * file-wide exclusive lock.  Slot locks are taken at multiples of PID_LEN
 * (see __envreg_add), so offset 1 never coincides with a slot lock.
 */
#define	REGISTRY_LOCK(env, pos, nowait)					\
	__os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 1, nowait)
#define	REGISTRY_UNLOCK(env, pos)					\
	__os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 0, 0)
#define	REGISTRY_EXCL_LOCK(env, nowait)					\
	REGISTRY_LOCK(env, 1, nowait)
#define	REGISTRY_EXCL_UNLOCK(env)					\
	REGISTRY_UNLOCK(env, 1)

/* File-local helper: scan the registry and claim a slot for this process. */
static  int __envreg_add __P((ENV *, int *, u_int32_t));
31
32/*
33 * Support for portable, multi-process database environment locking, based on
34 * the Subversion SR (#11511).
35 *
36 * The registry feature is configured by specifying the DB_REGISTER flag to the
37 * DbEnv.open method.  If DB_REGISTER is specified, DB opens the registry file
38 * in the database environment home directory.  The registry file is formatted
39 * as follows:
40 *
41 *	                    12345		# process ID slot 1
42 *	X		# empty slot
43 *	                    12346		# process ID slot 2
44 *	X		# empty slot
45 *	                    12347		# process ID slot 3
46 *	                    12348		# process ID slot 4
47 *	X                   12349		# empty slot
48 *	X		# empty slot
49 *
50 * All lines are fixed-length.  All lines are process ID slots.  Empty slots
51 * are marked with leading non-digit characters.
52 *
53 * To modify the file, you get an exclusive lock on the first byte of the file.
54 *
55 * While holding any DbEnv handle, each process has an exclusive lock on the
56 * first byte of a process ID slot.  There is a restriction on having more
57 * than one DbEnv handle open at a time, because Berkeley DB uses per-process
58 * locking to implement this feature, that is, a process may never have more
59 * than a single slot locked.
60 *
61 * This work requires that if a process dies or the system crashes, locks held
62 * by the dying processes will be dropped.  (We can't use system shared
63 * memory-backed or filesystem-backed locks because they're persistent when a
 * process dies.)  On POSIX systems, we use fcntl(2) locks; on Win32 we have
 * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on
 * LockFile/UnlockFile.
67 *
68 * We could implement the same solution with flock locking instead of fcntl,
69 * but flock would require a separate file for each process of control (and
70 * probably each DbEnv handle) in the database environment, which is fairly
71 * ugly.
72 *
73 * Whenever a process opens a new DbEnv handle, it walks the registry file and
74 * verifies it CANNOT acquire the lock for any non-empty slot.  If a lock for
75 * a non-empty slot is available, we know a process died holding an open handle,
76 * and recovery needs to be run.
77 *
78 * It's possible to get corruption in the registry file.  If a write system
79 * call fails after partially completing, there can be corrupted entries in
80 * the registry file, or a partial entry at the end of the file.  This is OK.
81 * A corrupted entry will be flagged as a non-empty line during the registry
82 * file walk.  Since the line was corrupted by process failure, no process will
83 * hold a lock on the slot, which will lead to recovery being run.
84 *
85 * There can still be processes running in the environment when we recover it,
86 * and, in fact, there can still be processes running in the old environment
87 * after we're up and running in a new one.  This is safe because performing
88 * recovery panics (and removes) the existing environment, so the window of
 * vulnerability is small.  Further, we check the panic flag in the DB API
 * methods, when waking from spinning on a mutex, and whenever we're about to
 * write to disk.  The only window of corruption is if the write check of the
 * panic were to complete, the region subsequently be recovered, and then the
 * write continues.  That's very, very unlikely to happen.  This vulnerability
 * already exists in Berkeley DB, too; the registry code doesn't make it any
 * worse than it already is.
96 *
97 * The only way to avoid that window entirely is to ensure that all processes
98 * in the Berkeley DB environment exit before we run recovery.   Applications
99 * can do that if they maintain their own process registry outside of Berkeley
100 * DB, but it's a little more difficult to do here.   The obvious approach is
101 * to send signals to any process using the database environment as soon as we
102 * decide to run recovery, but there are problems with that approach: we might
103 * not have permission to send signals to the process, the process might have
104 * signal handlers installed, the cookie stored might not be the same as kill's
105 * argument, we may not be able to reliably tell if the process died, and there
106 * are probably other problems.  However, if we can send a signal, it reduces
107 * the window, and so we include the code here.  To configure it, turn on the
108 * DB_ENVREG_KILL_ALL #define.
109 */
/*
 * Compile-time switch: when non-zero, __envreg_add makes a second pass over
 * the registry file and sends SIGKILL to every registered process once it has
 * decided recovery is required.  Disabled (0) by default.
 */
#define	DB_ENVREG_KILL_ALL	0
111
112/*
113 * __envreg_register --
114 *	Register a ENV handle.
115 *
116 * PUBLIC: int __envreg_register __P((ENV *, int *, u_int32_t));
117 */
118int
119__envreg_register(env, need_recoveryp, flags)
120	ENV *env;
121	int *need_recoveryp;
122	u_int32_t flags;
123{
124	DB_ENV *dbenv;
125	pid_t pid;
126	u_int32_t bytes, mbytes;
127	int ret;
128	char *pp;
129
130	*need_recoveryp = 0;
131
132	dbenv = env->dbenv;
133	dbenv->thread_id(dbenv, &pid, NULL);
134	pp = NULL;
135
136	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
137		__db_msg(env, "%lu: register environment", (u_long)pid);
138
139	/* Build the path name and open the registry file. */
140	if ((ret = __db_appname(env,
141	    DB_APP_NONE, REGISTER_FILE, NULL, &pp)) != 0)
142		goto err;
143	if ((ret = __os_open(env, pp, 0,
144	    DB_OSO_CREATE, DB_MODE_660, &dbenv->registry)) != 0)
145		goto err;
146
147	/*
148	 * Wait for an exclusive lock on the file.
149	 *
150	 * !!!
151	 * We're locking bytes that don't yet exist, but that's OK as far as
152	 * I know.
153	 */
154	if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0)
155		goto err;
156
157	/*
158	 * If the file size is 0, initialize the file.
159	 *
160	 * Run recovery if we create the file, that means we can clean up the
161	 * system by removing the registry file and restarting the application.
162	 */
163	if ((ret = __os_ioinfo(
164	    env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
165		goto err;
166	if (mbytes == 0 && bytes == 0) {
167		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
168			__db_msg(env, "%lu: creating %s", (u_long)pid, pp);
169		*need_recoveryp = 1;
170	}
171
172	/* Register this process. */
173	if ((ret = __envreg_add(env, need_recoveryp, flags) != 0))
174		goto err;
175
176	/*
177	 * Release our exclusive lock if we don't need to run recovery.  If
178	 * we need to run recovery, ENV->open will call back into register
179	 * code once recovery has completed.
180	 */
181	if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0)
182		goto err;
183
184	if (0) {
185err:		*need_recoveryp = 0;
186
187		/*
188		 * !!!
189		 * Closing the file handle must release all of our locks.
190		 */
191		if (dbenv->registry != NULL)
192			(void)__os_closehandle(env, dbenv->registry);
193		dbenv->registry = NULL;
194	}
195
196	if (pp != NULL)
197		__os_free(env, pp);
198
199	return (ret);
200}
201
202/*
203 * __envreg_add --
204 *	Add the process' pid to the register.
205 */
206static int
207__envreg_add(env, need_recoveryp, flags)
208	ENV *env;
209	int *need_recoveryp;
210	u_int32_t flags;
211{
212	DB_ENV *dbenv;
213	DB_THREAD_INFO *ip;
214	REGENV * renv;
215	REGINFO *infop;
216	pid_t pid;
217	off_t end, pos, dead;
218	size_t nr, nw;
219	u_int lcnt;
220	u_int32_t bytes, mbytes, orig_flags;
221	int need_recovery, ret, t_ret;
222	char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10];
223
224	dbenv = env->dbenv;
225	need_recovery = 0;
226	COMPQUIET(dead, 0);
227	COMPQUIET(p, NULL);
228	ip = NULL;
229
230	/* Get a copy of our process ID. */
231	dbenv->thread_id(dbenv, &pid, NULL);
232	snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid);
233
234	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
235		__db_msg(env, "%lu: adding self to registry", (u_long)pid);
236
237#if DB_ENVREG_KILL_ALL
238	if (0) {
239kill_all:	/*
240		 * A second pass through the file, this time killing any
241		 * processes still running.
242		 */
243		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
244			return (ret);
245	}
246#endif
247
248	/*
249	 * Read the file.  Skip empty slots, and check that a lock is held
250	 * for any allocated slots.  An allocated slot which we can lock
251	 * indicates a process died holding a handle and recovery needs to
252	 * be run.
253	 */
254	for (lcnt = 0;; ++lcnt) {
255		if ((ret = __os_read(
256		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
257			return (ret);
258		if (nr == 0)
259			break;
260
261		/*
262		 * A partial record at the end of the file is possible if a
263		 * previously un-registered process was interrupted while
264		 * registering.
265		 */
266		if (nr != PID_LEN) {
267			need_recovery = 1;
268			break;
269		}
270
271		if (PID_ISEMPTY(buf)) {
272			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
273				__db_msg(env, "%02u: EMPTY", lcnt);
274			continue;
275		}
276
277		/*
278		 * !!!
279		 * DB_REGISTER is implemented using per-process locking, only
280		 * a single ENV handle may be open per process.  Enforce
281		 * that restriction.
282		 */
283		if (memcmp(buf, pid_buf, PID_LEN) == 0) {
284			__db_errx(env,
285    "DB_REGISTER limits processes to one open DB_ENV handle per environment");
286			return (EINVAL);
287		}
288
289		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) {
290			for (p = buf; *p == ' ';)
291				++p;
292			buf[nr - 1] = '\0';
293		}
294
295#if DB_ENVREG_KILL_ALL
296		if (need_recovery) {
297			pid = (pid_t)strtoul(buf, NULL, 10);
298			(void)kill(pid, SIGKILL);
299
300			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
301				__db_msg(env, "%02u: %s: KILLED", lcnt, p);
302			continue;
303		}
304#endif
305		pos = (off_t)lcnt * PID_LEN;
306		if (REGISTRY_LOCK(env, pos, 1) == 0) {
307			if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
308				return (ret);
309
310			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
311				__db_msg(env, "%02u: %s: FAILED", lcnt, p);
312
313			need_recovery = 1;
314			dead = pos;
315#if DB_ENVREG_KILL_ALL
316			goto kill_all;
317#else
318			break;
319#endif
320		} else
321			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
322				__db_msg(env, "%02u: %s: LOCKED", lcnt, p);
323	}
324
325	/*
326	 * If we have to perform recovery...
327	 *
328	 * Mark all slots empty.  Registry ignores empty slots we can't lock,
329	 * so it doesn't matter if any of the processes are in the middle of
330	 * exiting Berkeley DB -- they'll discard their lock when they exit.
331	 */
332	if (need_recovery) {
333		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
334			__db_msg(env, "%lu: recovery required", (u_long)pid);
335
336		if (LF_ISSET(DB_FAILCHK)) {
337			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
338				__db_msg(env,
339				    "%lu: performing failchk", (u_long)pid);
340			/* The environment will already exist, so we do not
341			 * want DB_CREATE set, nor do we want any recovery at
342			 * this point.  No need to put values back as flags is
343			 * passed in by value.  Save original dbenv flags in
344			 * case we need to recover/remove existing environment.
345			 * Set DB_ENV_FAILCHK before attach to help ensure we
346			 * dont block on a mutex held by the dead process.
347			 */
348			LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL);
349			orig_flags = dbenv->flags;
350			F_SET(dbenv, DB_ENV_FAILCHK);
351			/* Attach to environment and subsystems. */
352			if ((ret = __env_attach_regions(
353			    dbenv, flags, orig_flags, 0)) != 0)
354				goto sig_proc;
355			if ((t_ret =
356			    __env_set_state(env, &ip, THREAD_FAILCHK)) != 0 &&
357			    ret == 0)
358				ret = t_ret;
359			if ((t_ret =
360			    __env_failchk_int(dbenv)) != 0 && ret == 0)
361				ret = t_ret;
362			/* Detach from environment and deregister thread. */
363			if ((t_ret =
364			    __env_refresh(dbenv, orig_flags, 0)) != 0 &&
365			    ret == 0)
366				ret = t_ret;
367			if (ret == 0) {
368				if ((ret = __os_seek(env, dbenv->registry,
369				    0, 0,(u_int32_t)dead)) != 0 ||
370				    (ret = __os_write(env, dbenv->registry,
371				    PID_EMPTY, PID_LEN, &nw)) != 0)
372					return (ret);
373				need_recovery = 0;
374				goto add;
375			}
376
377		}
378		/* If we can't attach, then we cannot set DB_REGISTER panic. */
379sig_proc:	if (__env_attach(env, NULL, 0, 0) == 0) {
380			infop = env->reginfo;
381			renv = infop->primary;
382			/* Indicate DB_REGSITER panic.  Also, set environment
383			 * panic as this is the panic trigger mechanism in
384			 * the code that everything looks for.
385			 */
386			renv->reg_panic = 1;
387			renv->panic = 1;
388			(void)__env_detach(env, 0);
389		}
390
391		/* Wait for processes to see the panic and leave. */
392		__os_yield(env, 0, dbenv->envreg_timeout);
393
394		/* FIGURE out how big the file is. */
395		if ((ret = __os_ioinfo(
396		    env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
397			return (ret);
398		end = (off_t)mbytes * MEGABYTE + bytes;
399
400		/*
401		 * Seek to the beginning of the file and overwrite slots to
402		 * the end of the file.
403		 *
404		 * It's possible for there to be a partial entry at the end of
405		 * the file if a process died when trying to register.  If so,
406		 * correct for it and overwrite it as well.
407		 */
408		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
409			return (ret);
410		for (lcnt = 0; lcnt < ((u_int)end / PID_LEN +
411		    ((u_int)end % PID_LEN == 0 ? 0 : 1)); ++lcnt) {
412
413			if ((ret = __os_read(
414			    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
415				return (ret);
416
417			pos = (off_t)lcnt * PID_LEN;
418			/* do not notify on dead process */
419			if (pos != dead) {
420				pid = (pid_t)strtoul(buf, NULL, 10);
421				DB_EVENT(env, DB_EVENT_REG_ALIVE, &pid);
422			}
423
424			if ((ret = __os_seek(env,
425			    dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
426			    (ret = __os_write(env,
427			    dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
428				return (ret);
429		}
430		/* wait one last time to get everyone out */
431		__os_yield(env, 0, dbenv->envreg_timeout);
432	}
433
434	/*
435	 * Seek to the first process slot and add ourselves to the first empty
436	 * slot we can lock.
437	 */
438add:	if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
439		return (ret);
440	for (lcnt = 0;; ++lcnt) {
441		if ((ret = __os_read(
442		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
443			return (ret);
444		if (nr == PID_LEN && !PID_ISEMPTY(buf))
445			continue;
446		pos = (off_t)lcnt * PID_LEN;
447		if (REGISTRY_LOCK(env, pos, 1) == 0) {
448			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
449				__db_msg(env,
450				    "%lu: locking slot %02u at offset %lu",
451				    (u_long)pid, lcnt, (u_long)pos);
452
453			if ((ret = __os_seek(env,
454			    dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
455			    (ret = __os_write(env,
456			    dbenv->registry, pid_buf, PID_LEN, &nw)) != 0)
457				return (ret);
458			dbenv->registry_off = (u_int32_t)pos;
459			break;
460		}
461	}
462
463	if (need_recovery)
464		*need_recoveryp = 1;
465
466	return (ret);
467}
468
469/*
470 * __envreg_unregister --
471 *	Unregister a ENV handle.
472 *
473 * PUBLIC: int __envreg_unregister __P((ENV *, int));
474 */
475int
476__envreg_unregister(env, recovery_failed)
477	ENV *env;
478	int recovery_failed;
479{
480	DB_ENV *dbenv;
481	size_t nw;
482	int ret, t_ret;
483
484	dbenv = env->dbenv;
485	ret = 0;
486
487	/*
488	 * If recovery failed, we want to drop our locks and return, but still
489	 * make sure any subsequent process doesn't decide everything is just
490	 * fine and try to get into the database environment.  In the case of
491	 * an error, discard our locks, but leave our slot filled-in.
492	 */
493	if (recovery_failed)
494		goto err;
495
496	/*
497	 * Why isn't an exclusive lock necessary to discard a ENV handle?
498	 *
499	 * We mark our process ID slot empty before we discard the process slot
500	 * lock, and threads of control reviewing the register file ignore any
501	 * slots which they can't lock.
502	 */
503	if ((ret = __os_seek(env,
504	    dbenv->registry, 0, 0, dbenv->registry_off)) != 0 ||
505	    (ret = __os_write(
506	    env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
507		goto err;
508
509	/*
510	 * !!!
511	 * This code assumes that closing the file descriptor discards all
512	 * held locks.
513	 *
514	 * !!!
515	 * There is an ordering problem here -- in the case of a process that
516	 * failed in recovery, we're unlocking both the exclusive lock and our
517	 * slot lock.  If the OS unlocked the exclusive lock and then allowed
518	 * another thread of control to acquire the exclusive lock before also
519	 * also releasing our slot lock, we could race.  That can't happen, I
520	 * don't think.
521	 */
522err:	if ((t_ret =
523	    __os_closehandle(env, dbenv->registry)) != 0 && ret == 0)
524		ret = t_ret;
525
526	dbenv->registry = NULL;
527	return (ret);
528}
529
530/*
531 * __envreg_xunlock --
532 *	Discard the exclusive lock held by the ENV handle.
533 *
534 * PUBLIC: int __envreg_xunlock __P((ENV *));
535 */
536int
537__envreg_xunlock(env)
538	ENV *env;
539{
540	DB_ENV *dbenv;
541	pid_t pid;
542	int ret;
543
544	dbenv = env->dbenv;
545
546	dbenv->thread_id(dbenv, &pid, NULL);
547
548	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
549		__db_msg(env,
550		    "%lu: recovery completed, unlocking", (u_long)pid);
551
552	if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0)
553		return (ret);
554
555	__db_err(env, ret, "%s: exclusive file unlock", REGISTER_FILE);
556	return (__env_panic(env, ret));
557}
558