1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2004,2008 Oracle.  All rights reserved.
5 *
6 * $Id: env_register.c,v 1.42 2008/05/07 12:27:33 bschmeck Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12
/* Name of the registry file. */
#define	REGISTER_FILE	"__db.register"

/*
 * Registry slot layout.
 *
 * Every slot in the registry file is exactly PID_LEN bytes long, including
 * its trailing newline.  PID_EMPTY is the image of an unused slot and PID_FMT
 * is the printf format used to write a process ID into a slot; both produce
 * PID_LEN bytes.  PID_ISEMPTY compares the first PID_LEN bytes at p with the
 * unused-slot image.
 */
#define	PID_LEN		(25)				/* PID entry length */
#define	PID_EMPTY	"X                      0\n"	/* Unused PID entry */
#define	PID_FMT		"%24lu\n"			/* PID entry format */
							/* Unused PID test */
#define	PID_ISEMPTY(p)	(memcmp((p), PID_EMPTY, PID_LEN) == 0)

/*
 * Registry file locking, implemented with __os_fdlock on the open registry
 * handle.  REGISTRY_LOCK/REGISTRY_UNLOCK operate on the lock at file offset
 * pos; the fourth __os_fdlock argument is 1 in the lock macros and 0 in the
 * unlock macros.  The "exclusive" registry lock is the lock at offset 1,
 * which is distinct from offset 0, where the first slot's lock lives.
 */
#define	REGISTRY_LOCK(env, pos, nowait)					\
	__os_fdlock((env), (env)->dbenv->registry, (off_t)(pos), 1, (nowait))
#define	REGISTRY_UNLOCK(env, pos)					\
	__os_fdlock((env), (env)->dbenv->registry, (off_t)(pos), 0, 0)
#define	REGISTRY_EXCL_LOCK(env, nowait)					\
	REGISTRY_LOCK((env), 1, (nowait))
#define	REGISTRY_EXCL_UNLOCK(env)					\
	REGISTRY_UNLOCK((env), 1)
29
30static  int __envreg_add __P((ENV *, int *));
31
32/*
33 * Support for portable, multi-process database environment locking, based on
34 * the Subversion SR (#11511).
35 *
36 * The registry feature is configured by specifying the DB_REGISTER flag to the
37 * DbEnv.open method.  If DB_REGISTER is specified, DB opens the registry file
38 * in the database environment home directory.  The registry file is formatted
39 * as follows:
40 *
41 *	                    12345		# process ID slot 1
42 *	X		# empty slot
43 *	                    12346		# process ID slot 2
44 *	X		# empty slot
45 *	                    12347		# process ID slot 3
46 *	                    12348		# process ID slot 4
47 *	X                   12349		# empty slot
48 *	X		# empty slot
49 *
50 * All lines are fixed-length.  All lines are process ID slots.  Empty slots
51 * are marked with leading non-digit characters.
52 *
53 * To modify the file, you get an exclusive lock on the first byte of the file.
54 *
55 * While holding any DbEnv handle, each process has an exclusive lock on the
56 * first byte of a process ID slot.  There is a restriction on having more
57 * than one DbEnv handle open at a time, because Berkeley DB uses per-process
58 * locking to implement this feature, that is, a process may never have more
59 * than a single slot locked.
60 *
61 * This work requires that if a process dies or the system crashes, locks held
62 * by the dying processes will be dropped.  (We can't use system shared
63 * memory-backed or filesystem-backed locks because they're persistent when a
 * process dies.)  On POSIX systems, we use fcntl(2) locks; on Win32 we have
 * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on
 * LockFile/UnlockFile.
67 *
68 * We could implement the same solution with flock locking instead of fcntl,
69 * but flock would require a separate file for each process of control (and
70 * probably each DbEnv handle) in the database environment, which is fairly
71 * ugly.
72 *
73 * Whenever a process opens a new DbEnv handle, it walks the registry file and
74 * verifies it CANNOT acquire the lock for any non-empty slot.  If a lock for
75 * a non-empty slot is available, we know a process died holding an open handle,
76 * and recovery needs to be run.
77 *
78 * It's possible to get corruption in the registry file.  If a write system
79 * call fails after partially completing, there can be corrupted entries in
80 * the registry file, or a partial entry at the end of the file.  This is OK.
81 * A corrupted entry will be flagged as a non-empty line during the registry
82 * file walk.  Since the line was corrupted by process failure, no process will
83 * hold a lock on the slot, which will lead to recovery being run.
84 *
85 * There can still be processes running in the environment when we recover it,
86 * and, in fact, there can still be processes running in the old environment
87 * after we're up and running in a new one.  This is safe because performing
88 * recovery panics (and removes) the existing environment, so the window of
 * vulnerability is small.  Further, we check the panic flag in the DB API
 * methods, when waking from spinning on a mutex, and whenever we're about to
 * write to disk.  The only window of corruption is if the write check of the
92 * panic were to complete, the region subsequently be recovered, and then the
93 * write continues.  That's very, very unlikely to happen.  This vulnerability
94 * already exists in Berkeley DB, too, the registry code doesn't make it any
95 * worse than it already is.
96 *
97 * The only way to avoid that window entirely is to ensure that all processes
98 * in the Berkeley DB environment exit before we run recovery.   Applications
99 * can do that if they maintain their own process registry outside of Berkeley
100 * DB, but it's a little more difficult to do here.   The obvious approach is
101 * to send signals to any process using the database environment as soon as we
102 * decide to run recovery, but there are problems with that approach: we might
103 * not have permission to send signals to the process, the process might have
104 * signal handlers installed, the cookie stored might not be the same as kill's
105 * argument, we may not be able to reliably tell if the process died, and there
106 * are probably other problems.  However, if we can send a signal, it reduces
107 * the window, and so we include the code here.  To configure it, turn on the
108 * DB_ENVREG_KILL_ALL #define.
109 */
/*
 * Compile-time switch.  When non-zero, __envreg_add makes a second pass over
 * the registry file, once it has decided recovery is required, and sends
 * SIGKILL to the processes listed in it.  Disabled (0) by default.
 */
#define	DB_ENVREG_KILL_ALL	0
111
112/*
113 * __envreg_register --
114 *	Register a ENV handle.
115 *
116 * PUBLIC: int __envreg_register __P((ENV *, int *));
117 */
118int
119__envreg_register(env, need_recoveryp)
120	ENV *env;
121	int *need_recoveryp;
122{
123	DB_ENV *dbenv;
124	pid_t pid;
125	u_int32_t bytes, mbytes;
126	int ret;
127	char *pp;
128
129	*need_recoveryp = 0;
130
131	dbenv = env->dbenv;
132	dbenv->thread_id(dbenv, &pid, NULL);
133	pp = NULL;
134
135	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
136		__db_msg(env, "%lu: register environment", (u_long)pid);
137
138	/* Build the path name and open the registry file. */
139	if ((ret =
140	    __db_appname(env, DB_APP_NONE, REGISTER_FILE, 0, NULL, &pp)) != 0)
141		goto err;
142	if ((ret = __os_open(env, pp, 0,
143	    DB_OSO_CREATE, DB_MODE_660, &dbenv->registry)) != 0)
144		goto err;
145
146	/*
147	 * Wait for an exclusive lock on the file.
148	 *
149	 * !!!
150	 * We're locking bytes that don't yet exist, but that's OK as far as
151	 * I know.
152	 */
153	if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0)
154		goto err;
155
156	/*
157	 * If the file size is 0, initialize the file.
158	 *
159	 * Run recovery if we create the file, that means we can clean up the
160	 * system by removing the registry file and restarting the application.
161	 */
162	if ((ret = __os_ioinfo(
163	    env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
164		goto err;
165	if (mbytes == 0 && bytes == 0) {
166		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
167			__db_msg(env, "%lu: creating %s", (u_long)pid, pp);
168		*need_recoveryp = 1;
169	}
170
171	/* Register this process. */
172	if ((ret = __envreg_add(env, need_recoveryp)) != 0)
173		goto err;
174
175	/*
176	 * Release our exclusive lock if we don't need to run recovery.  If
177	 * we need to run recovery, ENV->open will call back into register
178	 * code once recovery has completed.
179	 */
180	if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0)
181		goto err;
182
183	if (0) {
184err:		*need_recoveryp = 0;
185
186		/*
187		 * !!!
188		 * Closing the file handle must release all of our locks.
189		 */
190		if (dbenv->registry != NULL)
191			(void)__os_closehandle(env, dbenv->registry);
192		dbenv->registry = NULL;
193	}
194
195	if (pp != NULL)
196		__os_free(env, pp);
197
198	return (ret);
199}
200
201/*
202 * __envreg_add --
203 *	Add the process' pid to the register.
204 */
205static int
206__envreg_add(env, need_recoveryp)
207	ENV *env;
208	int *need_recoveryp;
209{
210	DB_ENV *dbenv;
211	pid_t pid;
212	off_t end, pos;
213	size_t nr, nw;
214	u_int lcnt;
215	u_int32_t bytes, mbytes;
216	int need_recovery, ret;
217	char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10];
218
219	dbenv = env->dbenv;
220	need_recovery = 0;
221	COMPQUIET(p, NULL);
222
223	/* Get a copy of our process ID. */
224	dbenv->thread_id(dbenv, &pid, NULL);
225	snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid);
226
227	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
228		__db_msg(env, "%lu: adding self to registry", (u_long)pid);
229
230#if DB_ENVREG_KILL_ALL
231	if (0) {
232kill_all:	/*
233		 * A second pass through the file, this time killing any
234		 * processes still running.
235		 */
236		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
237			return (ret);
238	}
239#endif
240
241	/*
242	 * Read the file.  Skip empty slots, and check that a lock is held
243	 * for any allocated slots.  An allocated slot which we can lock
244	 * indicates a process died holding a handle and recovery needs to
245	 * be run.
246	 */
247	for (lcnt = 0;; ++lcnt) {
248		if ((ret = __os_read(
249		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
250			return (ret);
251		if (nr == 0)
252			break;
253
254		/*
255		 * A partial record at the end of the file is possible if a
256		 * previously un-registered process was interrupted while
257		 * registering.
258		 */
259		if (nr != PID_LEN) {
260			need_recovery = 1;
261			break;
262		}
263
264		if (PID_ISEMPTY(buf)) {
265			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
266				__db_msg(env, "%02u: EMPTY", lcnt);
267			continue;
268		}
269
270		/*
271		 * !!!
272		 * DB_REGISTER is implemented using per-process locking, only
273		 * a single ENV handle may be open per process.  Enforce
274		 * that restriction.
275		 */
276		if (memcmp(buf, pid_buf, PID_LEN) == 0) {
277			__db_errx(env,
278    "DB_REGISTER limits processes to one open DB_ENV handle per environment");
279			return (EINVAL);
280		}
281
282		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) {
283			for (p = buf; *p == ' ';)
284				++p;
285			buf[nr - 1] = '\0';
286		}
287
288#if DB_ENVREG_KILL_ALL
289		if (need_recovery) {
290			pid = (pid_t)strtoul(buf, NULL, 10);
291			(void)kill(pid, SIGKILL);
292
293			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
294				__db_msg(env, "%02u: %s: KILLED", lcnt, p);
295			continue;
296		}
297#endif
298		pos = (off_t)lcnt * PID_LEN;
299		if (REGISTRY_LOCK(env, pos, 1) == 0) {
300			if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
301				return (ret);
302
303			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
304				__db_msg(env, "%02u: %s: FAILED", lcnt, p);
305
306			need_recovery = 1;
307#if DB_ENVREG_KILL_ALL
308			goto kill_all;
309#else
310			break;
311#endif
312		} else
313			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
314				__db_msg(env, "%02u: %s: LOCKED", lcnt, p);
315	}
316
317	/*
318	 * If we have to perform recovery...
319	 *
320	 * Mark all slots empty.  Registry ignores empty slots we can't lock,
321	 * so it doesn't matter if any of the processes are in the middle of
322	 * exiting Berkeley DB -- they'll discard their lock when they exit.
323	 */
324	if (need_recovery) {
325		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
326			__db_msg(env, "%lu: recovery required", (u_long)pid);
327
328		/* Figure out how big the file is. */
329		if ((ret = __os_ioinfo(
330		    env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
331			return (ret);
332		end = (off_t)mbytes * MEGABYTE + bytes;
333
334		/*
335		 * Seek to the beginning of the file and overwrite slots to
336		 * the end of the file.
337		 *
338		 * It's possible for there to be a partial entry at the end of
339		 * the file if a process died when trying to register.  If so,
340		 * correct for it and overwrite it as well.
341		 */
342		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
343			return (ret);
344		for (lcnt = (u_int)end / PID_LEN +
345		    ((u_int)end % PID_LEN == 0 ? 0 : 1); lcnt > 0; --lcnt)
346			if ((ret = __os_write(env,
347			    dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
348				return (ret);
349	}
350
351	/*
352	 * Seek to the first process slot and add ourselves to the first empty
353	 * slot we can lock.
354	 */
355	if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
356		return (ret);
357	for (lcnt = 0;; ++lcnt) {
358		if ((ret = __os_read(
359		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
360			return (ret);
361		if (nr == PID_LEN && !PID_ISEMPTY(buf))
362			continue;
363		pos = (off_t)lcnt * PID_LEN;
364		if (REGISTRY_LOCK(env, pos, 1) == 0) {
365			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
366				__db_msg(env,
367				    "%lu: locking slot %02u at offset %lu",
368				    (u_long)pid, lcnt, (u_long)pos);
369
370			if ((ret = __os_seek(env,
371			    dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
372			    (ret = __os_write(env,
373			    dbenv->registry, pid_buf, PID_LEN, &nw)) != 0)
374				return (ret);
375			dbenv->registry_off = (u_int32_t)pos;
376			break;
377		}
378	}
379
380	if (need_recovery)
381		*need_recoveryp = 1;
382
383	return (ret);
384}
385
386/*
387 * __envreg_unregister --
388 *	Unregister a ENV handle.
389 *
390 * PUBLIC: int __envreg_unregister __P((ENV *, int));
391 */
392int
393__envreg_unregister(env, recovery_failed)
394	ENV *env;
395	int recovery_failed;
396{
397	DB_ENV *dbenv;
398	size_t nw;
399	int ret, t_ret;
400
401	dbenv = env->dbenv;
402	ret = 0;
403
404	/*
405	 * If recovery failed, we want to drop our locks and return, but still
406	 * make sure any subsequent process doesn't decide everything is just
407	 * fine and try to get into the database environment.  In the case of
408	 * an error, discard our locks, but leave our slot filled-in.
409	 */
410	if (recovery_failed)
411		goto err;
412
413	/*
414	 * Why isn't an exclusive lock necessary to discard a ENV handle?
415	 *
416	 * We mark our process ID slot empty before we discard the process slot
417	 * lock, and threads of control reviewing the register file ignore any
418	 * slots which they can't lock.
419	 */
420	if ((ret = __os_seek(env,
421	    dbenv->registry, 0, 0, dbenv->registry_off)) != 0 ||
422	    (ret = __os_write(
423	    env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
424		goto err;
425
426	/*
427	 * !!!
428	 * This code assumes that closing the file descriptor discards all
429	 * held locks.
430	 *
431	 * !!!
432	 * There is an ordering problem here -- in the case of a process that
433	 * failed in recovery, we're unlocking both the exclusive lock and our
434	 * slot lock.  If the OS unlocked the exclusive lock and then allowed
435	 * another thread of control to acquire the exclusive lock before also
436	 * also releasing our slot lock, we could race.  That can't happen, I
437	 * don't think.
438	 */
439err:	if ((t_ret =
440	    __os_closehandle(env, dbenv->registry)) != 0 && ret == 0)
441		ret = t_ret;
442
443	dbenv->registry = NULL;
444	return (ret);
445}
446
447/*
448 * __envreg_xunlock --
449 *	Discard the exclusive lock held by the ENV handle.
450 *
451 * PUBLIC: int __envreg_xunlock __P((ENV *));
452 */
453int
454__envreg_xunlock(env)
455	ENV *env;
456{
457	DB_ENV *dbenv;
458	pid_t pid;
459	int ret;
460
461	dbenv = env->dbenv;
462
463	dbenv->thread_id(dbenv, &pid, NULL);
464
465	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
466		__db_msg(env,
467		    "%lu: recovery completed, unlocking", (u_long)pid);
468
469	if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0)
470		return (ret);
471
472	__db_err(env, ret, "%s: exclusive file unlock", REGISTER_FILE);
473	return (__env_panic(env, ret));
474}
475