1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001,2008 Oracle.  All rights reserved.
5 *
6 * $Id: fop_util.c,v 12.65 2008/05/07 12:27:34 bschmeck Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/db_am.h"
14#include "dbinc/hash.h"
15#include "dbinc/fop.h"
16#include "dbinc/lock.h"
17#include "dbinc/mp.h"
18#include "dbinc/log.h"
19#include "dbinc/txn.h"
20
21static int __fop_set_pgsize __P((DB *, DB_FH *, const char *));
22static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t));
23static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
24static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t));
25static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
26	       const char *, const char *, const char *, DB_LOCKER *));
27static int __fop_ondisk_dummy __P((DB *,
28	       DB_TXN *, const char *, u_int8_t *, u_int32_t));
29static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
30	     const char *, const char *, const char *, DB_LOCKER *, u_int32_t));
31
32/*
33 * Acquire the environment meta-data lock.  The parameters are the
34 * environment (ENV), the locker id to use in acquiring the lock (ID)
35 * and a pointer to a DB_LOCK.
36 *
37 * !!!
38 * Turn off locking for Critical Path.  The application must do its own
39 * synchronization of open/create.  Two threads creating and opening a
40 * file at the same time may have unpredictable results.
41 */
#ifndef CRITICALPATH_10266
/*
 * Normal build: when locking is configured for ENV, take a write lock on
 * the object whose key is the 32-bit value 1, on behalf of locker ID, and
 * store it through L.  On failure the macro leaves the error in the
 * caller's "ret" variable and jumps to the caller's "err" label, so both
 * must exist wherever it is expanded.
 */
#define	GET_ENVLOCK(ENV, ID, L) do {					\
	u_int32_t __envlock_val;					\
	DBT __envlock_obj;						\
									\
	if (LOCKING_ON((ENV))) {					\
		__envlock_val = 1;					\
		__envlock_obj.data = &__envlock_val;			\
		__envlock_obj.size = sizeof(__envlock_val);		\
		ret = __lock_get((ENV), (ID),				\
		    0, &__envlock_obj, DB_LOCK_WRITE, (L));		\
		if (ret != 0)						\
			goto err;					\
	}								\
} while (0)
#else
/* Critical Path build: no environment lock is ever taken. */
#define	GET_ENVLOCK(ENV, ID, L) (0)
#endif
59
/*
 * Close D's mpool file handle (passing flags F to the close, and ignoring
 * its result), mark D as not yet opened, and create a fresh mpool file
 * handle for it.  If the create fails, the error is left in the caller's
 * "ret" and control jumps to the caller's "err" label.
 */
#define	RESET_MPF(D, F) do {						\
	(void)__memp_fclose((D)->mpf, (F));				\
	(D)->mpf = NULL;						\
	F_CLR((D), DB_AM_OPEN_CALLED);					\
	ret = __memp_fcreate((D)->env, &(D)->mpf);			\
	if (ret != 0)							\
		goto err;						\
} while (0)
67
68/*
69 * If we open a file handle and our caller is doing fcntl(2) locking,
70 * we can't close the handle because that would discard the caller's
71 * lock. Save it until we close or refresh the DB handle.
72 */
/*
 * CLOSE_HANDLE(D, F): dispose of file handle F if it is set.  With
 * DB_FCNTL_LOCKING in the caller's "flags" the handle is parked in
 * D->saved_open_fhp; otherwise it is closed.  A close failure is recorded
 * in the caller's "ret" (unless an earlier error is already there) and
 * control jumps to the caller's "err" label with F still set.  Uses the
 * caller's "flags", "ret" and "t_ret" variables.
 */
#define	CLOSE_HANDLE(D, F) {						\
	if ((F) != NULL) {						\
		if (!LF_ISSET(DB_FCNTL_LOCKING)) {			\
			t_ret = __os_closehandle((D)->env, (F));	\
			if (t_ret != 0) {				\
				if (ret == 0)				\
					ret = t_ret;			\
				goto err;				\
			}						\
		} else							\
			(D)->saved_open_fhp = (F);			\
		(F) = NULL;						\
	}								\
}
86
87/*
88 * __fop_lock_handle --
89 *
90 * Get the handle lock for a database.  If the envlock is specified, do this
91 * as a lock_vec call that releases the environment lock before acquiring the
92 * handle lock.
93 *
94 * PUBLIC: int __fop_lock_handle __P((ENV *,
95 * PUBLIC:     DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
96 *
97 */
98int
99__fop_lock_handle(env, dbp, locker, mode, elockp, flags)
100	ENV *env;
101	DB *dbp;
102	DB_LOCKER *locker;
103	db_lockmode_t mode;
104	DB_LOCK *elockp;
105	u_int32_t flags;
106{
107	DBT fileobj;
108	DB_LOCKREQ reqs[2], *ereq;
109	DB_LOCK_ILOCK lock_desc;
110	int ret;
111
112	if (!LOCKING_ON(env) ||
113	    F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER))
114		return (0);
115
116	/*
117	 * If we are in recovery, the only locking we should be
118	 * doing is on the global environment.
119	 */
120	if (IS_RECOVERING(env))
121		return (elockp == NULL ? 0 : __ENV_LPUT(env, *elockp));
122
123	memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN);
124	lock_desc.pgno = dbp->meta_pgno;
125	lock_desc.type = DB_HANDLE_LOCK;
126
127	memset(&fileobj, 0, sizeof(fileobj));
128	fileobj.data = &lock_desc;
129	fileobj.size = sizeof(lock_desc);
130	DB_TEST_SUBLOCKS(env, flags);
131	if (elockp == NULL)
132		ret = __lock_get(env, locker,
133		    flags, &fileobj, mode, &dbp->handle_lock);
134	else {
135		reqs[0].op = DB_LOCK_PUT;
136		reqs[0].lock = *elockp;
137		reqs[1].op = DB_LOCK_GET;
138		reqs[1].mode = mode;
139		reqs[1].obj = &fileobj;
140		reqs[1].timeout = 0;
141		if ((ret = __lock_vec(env,
142		    locker, flags, reqs, 2, &ereq)) == 0) {
143			dbp->handle_lock = reqs[1].lock;
144			LOCK_INIT(*elockp);
145		} else if (ereq != reqs)
146			LOCK_INIT(*elockp);
147	}
148
149	dbp->cur_locker = locker;
150	return (ret);
151}
152
153/*
154 * __fop_file_setup --
155 *
156 * Perform all the needed checking and locking to open up or create a
157 * file.
158 *
159 * There's a reason we don't push this code down into the buffer cache.
160 * The problem is that there's no information external to the file that
161 * we can use as a unique ID.  UNIX has dev/inode pairs, but they are
162 * not necessarily unique after reboot, if the file was mounted via NFS.
163 * Windows has similar problems, as the FAT filesystem doesn't maintain
164 * dev/inode numbers across reboot.  So, we must get something from the
165 * file we can use to ensure that, even after a reboot, the file we're
166 * joining in the cache is the right file for us to join.  The solution
167 * we use is to maintain a file ID that's stored in the database, and
168 * that's why we have to open and read the file before calling into the
169 * buffer cache or obtaining a lock (we use this unique fileid to lock
170 * as well as to identify like files in the cache).
171 *
172 * There are a couple of idiosyncrasies that this code must support, in
173 * particular, DB_TRUNCATE and DB_FCNTL_LOCKING.  First, we disallow
174 * DB_TRUNCATE in the presence of transactions, since opening a file with
175 * O_TRUNC will result in data being lost in an unrecoverable fashion.
176 * We also disallow DB_TRUNCATE if locking is enabled, because even in
177 * the presence of locking, we cannot avoid race conditions, so allowing
178 * DB_TRUNCATE with locking would be misleading.  See SR [#7345] for more
179 * details.
180 *
181 * However, if you are running with neither locking nor transactions, then
182 * you can specify DB_TRUNCATE, and if you do so, we will truncate the file
183 * regardless of its contents.
184 *
185 * FCNTL locking introduces another set of complications.  First, the only
186 * reason we support the DB_FCNTL_LOCKING flag is for historic compatibility
187 * with programs like Sendmail and Postfix.  In these cases, the caller may
188 * already have a lock on the file; we need to make sure that any file handles
189 * we open remain open, because if we were to close them, the lock held by the
190 * caller would go away.  Furthermore, Sendmail and/or Postfix need the ability
191 * to create databases in empty files.  So, when you're doing FCNTL locking,
192 * it's reasonable that you are trying to create a database into a 0-length
193 * file and we allow it, while under normal conditions, we do not create
194 * databases if the files already exist and are not Berkeley DB files.
195 *
196 * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip,
197 * PUBLIC:     DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
198 */
199int
200__fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
201	DB *dbp;
202	DB_THREAD_INFO *ip;
203	DB_TXN *txn;
204	const char *name;
205	int mode;
206	u_int32_t flags, *retidp;
207{
208	DBTYPE save_type;
209	DB_FH *fhp;
210	DB_LOCK elock;
211	DB_LOCKER *locker;
212	DB_TXN *stxn;
213	ENV *env;
214	size_t len;
215	u_int32_t dflags, oflags;
216	u_int8_t mbuf[DBMETASIZE];
217	int created_locker, create_ok, ret, retries, t_ret, tmp_created;
218	int truncating, was_inval;
219	char *real_name, *real_tmpname, *tmpname;
220
221	*retidp = TXN_INVALID;
222
223	env = dbp->env;
224	fhp = NULL;
225	LOCK_INIT(elock);
226	stxn = NULL;
227	created_locker = tmp_created = truncating = was_inval = 0;
228	real_name = real_tmpname = tmpname = NULL;
229	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
230
231	ret = 0;
232	retries = 0;
233	save_type = dbp->type;
234
235	/*
236	 * Get a lockerid for this handle.  There are paths through queue
237	 * rename and remove where this dbp already has a locker, so make
238	 * sure we don't clobber it and conflict.
239	 */
240	if (LOCKING_ON(env) &&
241	    !F_ISSET(dbp, DB_AM_COMPENSATE) &&
242	    !F_ISSET(dbp, DB_AM_RECOVER) &&
243	    dbp->locker == DB_LOCK_INVALIDID) {
244		if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
245			goto err;
246		created_locker = 1;
247	}
248	LOCK_INIT(dbp->handle_lock);
249
250	locker = txn == NULL ? dbp->locker : txn->locker;
251
252	oflags = 0;
253	if (F_ISSET(dbp, DB_AM_INMEM))
254		real_name = (char *)name;
255	else {
256		/* Get the real backing file name. */
257		if ((ret = __db_appname(env,
258		    DB_APP_DATA, name, 0, NULL, &real_name)) != 0)
259			goto err;
260
261		/* Fill in the default file mode. */
262		if (mode == 0)
263			mode = DB_MODE_660;
264
265		if (LF_ISSET(DB_RDONLY))
266			oflags |= DB_OSO_RDONLY;
267		if (LF_ISSET(DB_TRUNCATE))
268			oflags |= DB_OSO_TRUNC;
269	}
270
271	retries = 0;
272	create_ok = LF_ISSET(DB_CREATE);
273	LF_CLR(DB_CREATE);
274
275retry:
276	/*
277	 * If we cannot create the file, only retry a few times.  We
278	 * think we might be in a race with another create, but it could
279	 * be that the backup filename exists (that is, is left over from
280	 * a previous crash).
281	 */
282	if (++retries > DB_RETRY) {
283		__db_errx(env, "__fop_file_setup:  Retry limit (%d) exceeded",
284		    DB_RETRY);
285		goto err;
286	}
287	if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER))
288		GET_ENVLOCK(env, locker, &elock);
289	if (name == NULL)
290		ret = ENOENT;
291	else if (F_ISSET(dbp, DB_AM_INMEM)) {
292		ret = __env_mpool(dbp, name, flags);
293		/*
294		 * We are using __env_open as a check for existence.
295		 * However, __env_mpool does an actual open and there
296		 * are scenarios where the object exists, but cannot be
297		 * opened, because our settings don't match those internally.
298		 * We need to check for that explicitly.  We'll need the
299		 * mpool open to read the meta-data page, so we're going to
300		 * have to temporarily turn this dbp into an UNKNOWN one.
301		 */
302		if (ret == EINVAL) {
303			was_inval = 1;
304			save_type = dbp->type;
305			dbp->type = DB_UNKNOWN;
306			ret = __env_mpool(dbp, name, flags);
307			dbp->type = save_type;
308		}
309	} else
310		ret = __os_exists(env, real_name, NULL);
311
312	if (ret == 0) {
313		/*
314		 * If the file exists, there are 5 possible cases:
315		 * 1. DB_EXCL was specified so this is an error, unless
316		 *	this is a file left around after a rename and we
317		 *	are in the same transaction.  This gets decomposed
318		 *	into several subcases, because we check for various
319		 *	errors before we know we're in rename.
320		 * 2. We are truncating, and it doesn't matter what kind
321		 *	of file it is, we should open/create it.
322		 * 3. It is 0-length, we are not doing transactions (i.e.,
323		 *      we are sendmail), we should open/create into it.
324		 *	-- on-disk files only!
325		 * 4. Is it a Berkeley DB file and we should simply open it.
326		 * 5. It is not a BDB file and we should return an error.
327		 */
328
329		/* Open file (if there is one). */
330reopen:		if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
331		    __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0)
332			goto err;
333
334		/* Case 2: DB_TRUNCATE: we must do the creation in place. */
335		if (LF_ISSET(DB_TRUNCATE)) {
336			if (LF_ISSET(DB_EXCL)) {
337				/* Case 1a: DB_EXCL and DB_TRUNCATE. */
338				ret = EEXIST;
339				goto err;
340			}
341			tmpname = (char *)name;
342			goto creat2;
343		}
344
345		/* Cases 1,3-5: we need to read the meta-data page. */
346		if (F_ISSET(dbp, DB_AM_INMEM))
347			ret = __fop_inmem_read_meta(dbp, txn, name, flags);
348		else {
349			ret = __fop_read_meta(env, real_name, mbuf,
350			    sizeof(mbuf), fhp,
351			    LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL ? 1 : 0,
352			    &len);
353
354			/* Case 3: 0-length, no txns. */
355			if (ret != 0 && len == 0 && txn == NULL) {
356				if (LF_ISSET(DB_EXCL)) {
357					/*
358					 * Case 1b: DB_EXCL and
359					 * 0-length file exists.
360					 */
361					ret = EEXIST;
362					goto err;
363				}
364				tmpname = (char *)name;
365				goto creat2;
366			}
367
368			/* Case 4: This is a valid file. */
369			if (ret == 0)
370				ret = __db_meta_setup(env, dbp,
371				    real_name, (DBMETA *)mbuf, flags, 1);
372
373		}
374
375		/* Case 5: Invalid file. */
376		if (ret != 0)
377			goto err;
378
379		/* Now, get our handle lock. */
380		if ((ret = __fop_lock_handle(env,
381		    dbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) == 0) {
382			if ((ret = __ENV_LPUT(env, elock)) != 0)
383				goto err;
384		} else if (ret != DB_LOCK_NOTGRANTED ||
385		    (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
386			goto err;
387		else {
388			/*
389			 * We were unable to acquire the handle lock without
390			 * blocking.  The fact that we are blocking might mean
391			 * that someone else is trying to delete the file.
392			 * Since some platforms cannot delete files while they
393			 * are open (Windows), we are going to have to close
394			 * the file.  This would be a problem if we were doing
395			 * FCNTL locking, because our closing the handle would
396			 * release the FCNTL locks.  Fortunately, if we are
397			 * doing FCNTL locking, then we should never fail to
398			 * acquire our handle lock, so we should never get here.
399			 * We assert it here to make sure we aren't destroying
400			 * any application level FCNTL semantics.
401			 */
402			DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
403			if (!F_ISSET(dbp, DB_AM_INMEM)) {
404				if ((ret = __os_closehandle(env, fhp)) != 0)
405					goto err;
406				fhp = NULL;
407			}
408			if ((ret = __fop_lock_handle(env,
409			    dbp, locker, DB_LOCK_READ, &elock, 0)) != 0) {
410				if (F_ISSET(dbp, DB_AM_INMEM))
411					RESET_MPF(dbp, 0);
412				goto err;
413			}
414
415			/*
416			 * It's possible that our DBP was initialized
417			 * with a different file last time we opened it.
418			 * Therefore, we need to reset the DBP type and then
419			 * re-read the meta-data page and reset any other
420			 * fields that __db_meta_setup initializes.  We
421			 * need to shut down this dbp and reopen for in-memory
422			 * named databases. Unfortunately __db_refresh is
423			 * pretty aggressive at the shutting down, so we need
424			 * to do a bunch of restoration.
425			 * XXX it would be nice to pull refresh apart into
426			 * the stuff you need to do to call __env_mpool
427			 * and the stuff you can really throw away.
428			 */
429			if (F_ISSET(dbp, DB_AM_INMEM)) {
430				if ((ret = __db_refresh(dbp,
431				    txn, DB_NOSYNC, NULL, 1)) != 0)
432					goto err;
433				ret = __env_mpool(dbp, name, flags);
434			} else
435				ret =
436				    __os_open(env, real_name, 0, 0, 0, &fhp);
437
438			if (ret != 0) {
439				if ((ret =
440				    __ENV_LPUT(env, dbp->handle_lock)) != 0) {
441					LOCK_INIT(dbp->handle_lock);
442					goto err;
443				}
444				goto retry;
445			}
446
447			dbp->type = save_type;
448			if (F_ISSET(dbp, DB_AM_INMEM))
449				ret = __fop_inmem_read_meta(dbp,
450				    txn, name, flags);
451			else if ((ret =
452			    __fop_read_meta(env, real_name, mbuf,
453			    sizeof(mbuf), fhp,
454			    LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL ? 1 : 0,
455			    &len)) != 0 ||
456			    (ret = __db_meta_setup(env, dbp, real_name,
457			    (DBMETA *)mbuf, flags, DB_CHK_META)) != 0)
458				goto err;
459
460		}
461
462		/* If we got here, then we have the handle lock. */
463
464		/*
465		 * Check for a file in the midst of a rename.  If we find that
466		 * the file is in the midst of a rename, it must be the case
467		 * that it is in our current transaction (else we would still
468		 * be blocking), so we can continue along and create a new file
469		 * with the same name.  In that case, we have to close the file
470		 * handle because we reuse it below.  This is a case where
471		 * a 'was_inval' above is OK.
472		 */
473		if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
474			was_inval = 0;
475			if (create_ok) {
476				if (F_ISSET(dbp, DB_AM_INMEM)) {
477					RESET_MPF(dbp, DB_MPOOL_DISCARD);
478				} else if ((ret =
479				    __os_closehandle(env, fhp)) != 0)
480					goto err;
481				LF_SET(DB_CREATE);
482				goto create;
483			} else {
484				ret = ENOENT;
485				goto err;
486			}
487		}
488
489		/* If we get here, a was_inval is bad. */
490		if (was_inval) {
491			ret = EINVAL;
492			goto err;
493		}
494
495		/*
496		 * Now, case 1: check for DB_EXCL, because the file that exists
497		 * is not in the middle of a rename, so we have an error.  This
498		 * is a weird case, but we need to make sure that we don't
499		 * continue to hold the handle lock, since technically, we
500		 * should not have been allowed to open it.
501		 */
502		if (LF_ISSET(DB_EXCL)) {
503			ret = __ENV_LPUT(env, dbp->handle_lock);
504			LOCK_INIT(dbp->handle_lock);
505			if (ret == 0)
506				ret = EEXIST;
507			goto err;
508		}
509		goto done;
510	}
511
512	/* File does not exist. */
513#ifdef	HAVE_VXWORKS
514	/*
515	 * VxWorks can return file-system specific error codes if the
516	 * file does not exist, not ENOENT.
517	 */
518	if (!create_ok)
519#else
520	if (!create_ok || ret != ENOENT)
521#endif
522		goto err;
523	LF_SET(DB_CREATE);
524	ret = 0;
525
526	/*
527	 * We need to create file, which means that we need to set up the file,
528	 * the fileid and the locks.  Then we need to call the appropriate
529	 * routines to create meta-data pages.  For in-memory files, we retain
530	 * the environment lock, while for on-disk files, we drop the env lock
531	 * and create into a temporary.
532	 */
533	if (!F_ISSET(dbp, DB_AM_INMEM) &&
534	    (ret = __ENV_LPUT(env, elock)) != 0)
535		goto err;
536
537create:	if (txn != NULL && IS_REP_CLIENT(env) &&
538	    !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
539		__db_errx(env,
540		    "Transactional create on replication client disallowed");
541		ret = EINVAL;
542		goto err;
543	}
544
545	if (F_ISSET(dbp, DB_AM_INMEM))
546		ret = __fop_inmem_create(dbp, name, txn, flags);
547	else {
548		if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0)
549			goto err;
550		if (TXN_ON(env) && txn != NULL &&
551		    (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
552			goto err;
553		if ((ret = __fop_create(env,
554		    stxn, &fhp, tmpname, DB_APP_DATA, mode, dflags)) != 0) {
555			/*
556			 * If no transactions, there is a race on creating the
557			 * backup file, as the backup file name is the same for
558			 * all processes.  Wait for the other process to finish
559			 * with the name.
560			 */
561			if (!TXN_ON(env) && ret == EEXIST) {
562				__os_free(env, tmpname);
563				tmpname = NULL;
564				__os_yield(env, 1, 0);
565				goto retry;
566			}
567			goto err;
568		}
569		tmp_created = 1;
570	}
571
572creat2:	if (!F_ISSET(dbp, DB_AM_INMEM)) {
573		if ((ret = __db_appname(env,
574		    DB_APP_DATA, tmpname, 0, NULL, &real_tmpname)) != 0)
575			goto err;
576
577		/* Set the pagesize if it isn't yet set. */
578		if (dbp->pgsize == 0 &&
579		    (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0)
580			goto errmsg;
581
582		/* Construct a file_id. */
583		if ((ret =
584		    __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0)
585			goto errmsg;
586	}
587
588	if ((ret = __db_new_file(dbp, ip,
589	    F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0)
590		goto err;
591
592	/*
593	 * We need to close the handle here on platforms where remove and
594	 * rename fail if a handle is open (including Windows).
595	 */
596	CLOSE_HANDLE(dbp, fhp);
597
598	/*
599	 * Now move the file into place unless we are creating in place (because
600	 * we created a database in a file that started out 0-length).  If
601	 * this is an in-memory file, we may or may not hold the environment
602	 * lock depending on how we got here.
603	 */
604	if (!F_ISSET(dbp, DB_AM_COMPENSATE) &&
605	    !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock))
606		GET_ENVLOCK(env, locker, &elock);
607
608	if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
609		F_CLR(dbp, DB_AM_IN_RENAME);
610		__txn_remrem(env, txn, real_name);
611	} else if (name == tmpname) {
612		/* We created it in place. */
613	} else if (!F_ISSET(dbp, DB_AM_INMEM) &&
614	    __os_exists(env, real_name, NULL) == 0) {
615		/*
616		 * Someone managed to create the file; remove our temp
617		 * and try to open the file that now exists.
618		 */
619		(void)__fop_remove(env,
620		    NULL, dbp->fileid, tmpname, DB_APP_DATA, dflags);
621		(void)__ENV_LPUT(env, dbp->handle_lock);
622		LOCK_INIT(dbp->handle_lock);
623
624		if (stxn != NULL) {
625			ret = __txn_abort(stxn);
626			stxn = NULL;
627		}
628		if (ret != 0)
629			goto err;
630		goto reopen;
631	}
632
633	if (name != NULL && (ret = __fop_lock_handle(env,
634	    dbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
635		goto err;
636	if (tmpname != NULL && tmpname != name && (ret = __fop_rename(env,
637	    stxn, tmpname, name, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
638		goto err;
639
640	if (stxn != NULL) {
641		*retidp = stxn->txnid;
642		ret = __txn_commit(stxn, 0);
643		stxn = NULL;
644	} else
645		*retidp = TXN_INVALID;
646
647	if (ret != 0)
648		goto err;
649
650	F_SET(dbp, DB_AM_CREATED);
651
652	if (0) {
653errmsg:		__db_err(env, ret, "%s", name);
654
655err:		CLOSE_HANDLE(dbp, fhp);
656		if (stxn != NULL)
657			(void)__txn_abort(stxn);
658		if (tmp_created && txn == NULL)
659			(void)__fop_remove(env,
660			    NULL, NULL, tmpname, DB_APP_DATA, dflags);
661		if (txn == NULL)
662			(void)__ENV_LPUT(env, dbp->handle_lock);
663		(void)__ENV_LPUT(env, elock);
664		if (created_locker) {
665			(void)__lock_id_free(env, dbp->locker);
666			dbp->locker = NULL;
667		}
668	}
669
670done:	/*
671	 * There are cases where real_name and tmpname take on the
672	 * exact same string, so we need to make sure that we do not
673	 * free twice.
674	 */
675	if (!truncating && tmpname != NULL && tmpname != name)
676		__os_free(env, tmpname);
677	if (real_name != name && real_name != NULL)
678		__os_free(env, real_name);
679	if (real_tmpname != NULL)
680		__os_free(env, real_tmpname);
681	CLOSE_HANDLE(dbp, fhp);
682
683	return (ret);
684}
685
686/*
687 * __fop_set_pgsize --
688 *	Set the page size based on file information.
689 */
690static int
691__fop_set_pgsize(dbp, fhp, name)
692	DB *dbp;
693	DB_FH *fhp;
694	const char *name;
695{
696	ENV *env;
697	u_int32_t iopsize;
698	int ret;
699
700	env = dbp->env;
701
702	/*
703	 * Use the filesystem's optimum I/O size as the pagesize if a pagesize
704	 * not specified.  Some filesystems have 64K as their optimum I/O size,
705	 * but as that results in fairly large default caches, we limit the
706	 * default pagesize to 16K.
707	 */
708	if ((ret = __os_ioinfo(env, name, fhp, NULL, NULL, &iopsize)) != 0) {
709		__db_err(env, ret, "%s", name);
710		return (ret);
711	}
712	if (iopsize < 512)
713		iopsize = 512;
714	if (iopsize > 16 * 1024)
715		iopsize = 16 * 1024;
716
717	/*
718	 * Sheer paranoia, but we don't want anything that's not a power-of-2
719	 * (we rely on that for alignment of various types on the pages), and
720	 * we want a multiple of the sector size as well.  If the value
721	 * we got out of __os_ioinfo looks bad, use a default instead.
722	 */
723	if (!IS_VALID_PAGESIZE(iopsize))
724		iopsize = DB_DEF_IOSIZE;
725
726	dbp->pgsize = iopsize;
727	F_SET(dbp, DB_AM_PGDEF);
728
729	return (0);
730}
731
732/*
733 * __fop_subdb_setup --
734 *
735 * Subdb setup is significantly simpler than file setup.  In terms of
736 * locking, for the duration of the operation/transaction, the locks on
737 * the meta-data page will suffice to protect us from simultaneous operations
738 * on the sub-database.  Before we complete the operation though, we'll get a
739 * handle lock on the subdatabase so that on one else can try to remove it
740 * while we've got it open.  We use an object that looks like the meta-data
741 * page lock with a different type (DB_HANDLE_LOCK) for the long-term handle.
742 * locks.
743 *
744 * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *,
745 * PUBLIC:     const char *, const char *, int, u_int32_t));
746 */
747int
748__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags)
749	DB *dbp;
750	DB_THREAD_INFO *ip;
751	DB_TXN *txn;
752	const char *mname, *name;
753	int mode;
754	u_int32_t flags;
755{
756	DB *mdbp;
757	ENV *env;
758	db_lockmode_t lkmode;
759	int ret, t_ret;
760
761	mdbp = NULL;
762	env = dbp->env;
763
764	if ((ret = __db_master_open(dbp,
765	    ip, txn, mname, flags, mode, &mdbp)) != 0)
766		return (ret);
767	/*
768	 * If we created this file, then we need to set the DISCARD flag so
769	 * that if we fail in the middle of this routine, we discard from the
770	 * mpool any pages that we just created.
771	 */
772	if (F_ISSET(mdbp, DB_AM_CREATED))
773		F_SET(mdbp, DB_AM_DISCARD);
774
775	/*
776	 * We are going to close this instance of the master, so we can
777	 * steal its handle instead of reopening a handle on the database.
778	 */
779	if (LF_ISSET(DB_FCNTL_LOCKING)) {
780		dbp->saved_open_fhp = mdbp->saved_open_fhp;
781		mdbp->saved_open_fhp = NULL;
782	}
783
784	/* Copy the pagesize and set the sub-database flag. */
785	dbp->pgsize = mdbp->pgsize;
786	F_SET(dbp, DB_AM_SUBDB);
787
788	if (name != NULL && (ret = __db_master_update(mdbp, dbp,
789	    ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0)
790		goto err;
791
792	/*
793	 * Hijack the master's locker ID as well, so that our locks don't
794	 * conflict with the master's.  Since we're closing the master,
795	 * that locker would just have been freed anyway.  Once we've gotten
796	 * the locker id, we need to acquire the handle lock for this
797	 * subdatabase.
798	 */
799	dbp->locker = mdbp->locker;
800	mdbp->locker = NULL;
801
802	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);
803
804	/*
805	 * We copy our fileid from our master so that we all open
806	 * the same file in mpool.  We'll use the meta-pgno to lock
807	 * so that we end up with different handle locks.
808	 */
809
810	memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
811	lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ?
812	    DB_LOCK_WRITE : DB_LOCK_READ;
813	if ((ret = __fop_lock_handle(env, dbp,
814	    txn == NULL ? dbp->locker : txn->locker, lkmode, NULL,
815	    NOWAIT_FLAG(txn))) != 0)
816		goto err;
817
818	if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) {
819		/*
820		 * If there was no transaction and we created this database,
821		 * then we need to undo the update of the master database.
822		 */
823		if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL)
824			(void)__db_master_update(mdbp, dbp,
825			    ip, txn, name, dbp->type, MU_REMOVE, NULL, 0);
826		F_CLR(dbp, DB_AM_CREATED);
827		goto err;
828	}
829
830	/*
831	 * XXX
832	 * This should have been done at the top of this routine.  The problem
833	 * is that __db_init_subdb() uses "standard" routines to process the
834	 * meta-data page and set information in the DB handle based on it.
835	 * Those routines have to deal with swapped pages and will normally set
836	 * the DB_AM_SWAP flag.  However, we use the master's metadata page and
837	 * that has already been swapped, so they get the is-swapped test wrong.
838	 */
839	F_CLR(dbp, DB_AM_SWAP);
840	F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP));
841
842	/*
843	 * In the file create case, these happen in separate places so we have
844	 * two different tests.  They end up in the same place for subdbs, but
845	 * for compatibility with file testing, we put them both here anyway.
846	 */
847	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname);
848	DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname);
849
850	/*
851	 * File exists and we have the appropriate locks; we should now
852	 * process a normal open.
853	 */
854	if (F_ISSET(mdbp, DB_AM_CREATED)) {
855		F_SET(dbp, DB_AM_CREATED_MSTR);
856		F_CLR(mdbp, DB_AM_DISCARD);
857	}
858
859	if (0) {
860err:
861DB_TEST_RECOVERY_LABEL
862		if (txn == NULL)
863			(void)__ENV_LPUT(env, dbp->handle_lock);
864	}
865
866	/*
867	 * The master's handle lock is under the control of the
868	 * subdb (it acquired the master's locker).  We want to
869	 * keep the master's handle lock so that no one can remove
870	 * the file while the subdb is open.  If we register the
871	 * trade event and then invalidate the copy of the lock
872	 * in the master's handle, that will accomplish this.  However,
873	 * before we register this event, we'd better remove any
874	 * events that we've already registered for the master.
875	 */
876	if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) {
877		/* Unregister old master events. */
878		 __txn_remlock(env,
879		    txn, &mdbp->handle_lock, DB_LOCK_INVALIDID);
880
881		/* Now register the new event. */
882		if ((t_ret = __txn_lockevent(env, txn, dbp,
883		    &mdbp->handle_lock, dbp->locker == NULL ?
884		    mdbp->locker : dbp->locker)) != 0 && ret == 0)
885			ret = t_ret;
886	}
887	LOCK_INIT(mdbp->handle_lock);
888
889	/*
890	 * If the master was created, we need to sync so that the metadata
891	 * page is correct on disk for recovery, since it isn't read through
892	 * mpool.  If we're opening a subdb in an existing file, we can skip
893	 * the sync.
894	 */
895	if (txn == NULL || F_ISSET(txn, TXN_CDSGROUP) ||
896	    F_ISSET(mdbp, DB_AM_RECOVER)) {
897		if ((t_ret = __db_close(mdbp, txn,
898		    F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 &&
899		    ret == 0)
900			ret = t_ret;
901	} else {
902		if (F_ISSET(dbp, DB_AM_CREATED_MSTR) &&
903		    (t_ret = __memp_fsync(mdbp->mpf)) != 0 && ret == 0)
904			ret = t_ret;
905
906		if ((t_ret =
907		     __txn_closeevent(env, txn, mdbp)) != 0 && ret == 0)
908			ret = t_ret;
909	}
910
911	return (ret);
912}
913
914/*
915 * __fop_remove_setup --
916 *	Open handle appropriately and lock for removal of a database file.
917 *
918 * PUBLIC: int __fop_remove_setup __P((DB *,
919 * PUBLIC:      DB_TXN *, const char *, u_int32_t));
920 */
921int
922__fop_remove_setup(dbp, txn, name, flags)
923	DB *dbp;
924	DB_TXN *txn;
925	const char *name;
926	u_int32_t flags;
927{
928	DB_FH *fhp;
929	DB_LOCK elock;
930	ENV *env;
931	u_int8_t mbuf[DBMETASIZE];
932	int ret;
933
934	COMPQUIET(flags, 0);
935
936	env = dbp->env;
937
938	LOCK_INIT(elock);
939	fhp = NULL;
940	ret = 0;
941
942	/* Create locker if necessary. */
943retry:	if (LOCKING_ON(env)) {
944		if (txn != NULL)
945			dbp->locker = txn->locker;
946		else if (dbp->locker == DB_LOCK_INVALIDID) {
947			if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
948				goto err;
949		}
950	}
951
952	/*
953	 * We are about to open a file handle and then possibly close it.
954	 * We cannot close handles if we are doing FCNTL locking.  However,
955	 * there is no way to pass the FCNTL flag into this routine via the
956	 * user API.  The only way we can get in here and be doing FCNTL
957	 * locking is if we are trying to clean up an open that was called
958	 * with FCNTL locking.  In that case, the save_fhp should already be
959	 * set.  So, we use that field to tell us if we need to make sure
960	 * that we shouldn't close the handle.
961	 */
962	fhp = dbp->saved_open_fhp;
963	DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL);
964
965	/*
966	 * Lock environment to protect file open.  That will enable us to
967	 * read the meta-data page and get the fileid so that we can lock
968	 * the handle.
969	 */
970	GET_ENVLOCK(env, dbp->locker, &elock);
971
972	/* Open database. */
973	if (F_ISSET(dbp, DB_AM_INMEM)) {
974		if ((ret = __env_mpool(dbp, name, flags)) == 0)
975			ret = __os_strdup(env, name, &dbp->dname);
976	} else if (fhp == NULL)
977		ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp);
978	if (ret != 0)
979		goto err;
980
981	/* Get meta-data */
982	if (F_ISSET(dbp, DB_AM_INMEM))
983		ret = __fop_inmem_read_meta(dbp, txn, name, flags);
984	else if ((ret = __fop_read_meta(env,
985	    name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0)
986		ret = __db_meta_setup(env, dbp,
987		    name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN);
988	if (ret != 0)
989		goto err;
990
991	/*
992	 * Now, get the handle lock.  We first try with NOWAIT, because if
993	 * we have to wait, we're going to have to close the file and reopen
994	 * it, so that if there is someone else removing it, our open doesn't
995	 * prevent that.
996	 */
997	if ((ret = __fop_lock_handle(env,
998	    dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) {
999		/*
1000		 * Close the file, block on the lock, clean up the dbp, and
1001		 * then start all over again.
1002		 */
1003		if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) {
1004			(void)__os_closehandle(env, fhp);
1005			fhp = NULL;
1006		}
1007		if (ret != DB_LOCK_NOTGRANTED ||
1008		    (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
1009			goto err;
1010		else if ((ret = __fop_lock_handle(env,
1011		    dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1012			goto err;
1013
1014		if (F_ISSET(dbp, DB_AM_INMEM)) {
1015			(void)__lock_put(env, &dbp->handle_lock);
1016			(void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1);
1017		} else {
1018			if (txn != NULL)
1019				dbp->locker = NULL;
1020			(void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0);
1021		}
1022		goto retry;
1023	} else if ((ret = __ENV_LPUT(env, elock)) != 0)
1024		goto err;
1025	else if (F_ISSET(dbp, DB_AM_IN_RENAME))
1026		ret = ENOENT;
1027
1028	if (0) {
1029err:		(void)__ENV_LPUT(env, elock);
1030	}
1031	if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING))
1032		(void)__os_closehandle(env, fhp);
1033	/*
1034	 * If this is a real file and we are going to proceed with the removal,
1035	 * then we need to make sure that we don't leave any pages around in the
1036	 * mpool since the file is closed and will be reopened again before
1037	 * access.  However, this might be an in-memory file, in which case
1038	 * we will handle the discard from the mpool later as it's the "real"
1039	 * removal of the database.
1040	 */
1041	if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM))
1042		F_SET(dbp, DB_AM_DISCARD);
1043	return (ret);
1044}
1045
1046/*
1047 * __fop_read_meta --
1048 *	Read the meta-data page from a file and return it in buf.
1049 *
1050 * PUBLIC: int __fop_read_meta __P((ENV *, const char *,
1051 * PUBLIC:     u_int8_t *, size_t, DB_FH *, int, size_t *));
1052 */
1053int
1054__fop_read_meta(env, name, buf, size, fhp, errok, nbytesp)
1055	ENV *env;
1056	const char *name;
1057	u_int8_t *buf;
1058	size_t size;
1059	DB_FH *fhp;
1060	int errok;
1061	size_t *nbytesp;
1062{
1063	size_t nr;
1064	int ret;
1065
1066	/*
1067	 * Our caller wants to know the number of bytes read, even if we
1068	 * return an error.
1069	 */
1070	if (nbytesp != NULL)
1071		*nbytesp = 0;
1072
1073	nr = 0;
1074	ret = __os_read(env, fhp, buf, size, &nr);
1075	if (nbytesp != NULL)
1076		*nbytesp = nr;
1077
1078	if (ret != 0) {
1079		if (!errok)
1080			__db_err(env, ret, "%s", name);
1081		goto err;
1082	}
1083
1084	if (nr != size) {
1085		if (!errok)
1086			__db_errx(env,
1087			    "%s: unexpected file type or format", name);
1088		ret = EINVAL;
1089	}
1090
1091err:
1092	return (ret);
1093}
1094
1095/*
1096 * __fop_dummy --
1097 *	This implements the creation and name swapping of dummy files that
1098 * we use for remove and rename (remove is simply a rename with a delayed
1099 * remove).
1100 *
1101 * PUBLIC: int __fop_dummy __P((DB *,
1102 * PUBLIC:     DB_TXN *, const char *, const char *, u_int32_t));
1103 */
1104int
1105__fop_dummy(dbp, txn, old, new, flags)
1106	DB *dbp;
1107	DB_TXN *txn;
1108	const char *old, *new;
1109	u_int32_t flags;
1110{
1111	DB *tmpdbp;
1112	DB_TXN *stxn;
1113	ENV *env;
1114	char *back;
1115	int ret, t_ret;
1116	u_int8_t mbuf[DBMETASIZE];
1117
1118	env = dbp->env;
1119	back = NULL;
1120	stxn = NULL;
1121	tmpdbp = NULL;
1122
1123	DB_ASSERT(env, txn != NULL);
1124
1125	/*
1126	 * Begin sub transaction to encapsulate the rename.  Note that we
1127	 * expect the inmem_swap calls to complete the sub-transaction,
1128	 * aborting on error and committing on success.
1129	 */
1130	if (TXN_ON(env) &&
1131	    (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
1132		goto err;
1133
1134	/* We need to create a dummy file as a place holder. */
1135	if ((ret = __db_backup_name(env, new, stxn, &back)) != 0)
1136		goto err;
1137	/* Create a dummy dbp handle. */
1138	if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1139		goto err;
1140
1141	memset(mbuf, 0, sizeof(mbuf));
1142	ret = F_ISSET(dbp, DB_AM_INMEM) ?
1143	    __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) :
1144	    __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf, flags);
1145
1146	if (ret != 0)
1147		goto err;
1148
1149	ret = F_ISSET(dbp, DB_AM_INMEM) ?
1150	    __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) :
1151	    __fop_ondisk_swap(dbp,
1152		tmpdbp, stxn, old, new, back, txn->locker, flags);
1153	stxn = NULL;
1154	if (ret != 0)
1155		goto err;
1156
1157err:	if (stxn != NULL)
1158		(void)__txn_abort(stxn);
1159	if (tmpdbp != NULL &&
1160	    (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1161		ret = t_ret;
1162	if (back != NULL)
1163		__os_free(env, back);
1164	return (ret);
1165}
1166
1167/*
1168 * __fop_dbrename --
1169 *	Do the appropriate file locking and file system operations
1170 * to effect a dbrename in the absence of transactions (__fop_dummy
1171 * and the subsequent calls in __db_rename do the work for the
1172 * transactional case).
1173 *
1174 * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
1175 */
1176int
1177__fop_dbrename(dbp, old, new)
1178	DB *dbp;
1179	const char *old, *new;
1180{
1181	DB_LOCK elock;
1182	ENV *env;
1183	char *real_new, *real_old;
1184	int ret, t_ret;
1185
1186	env = dbp->env;
1187	real_new = NULL;
1188	real_old = NULL;
1189	LOCK_INIT(elock);
1190
1191	if (F_ISSET(dbp, DB_AM_INMEM)) {
1192		real_new = (char *)new;
1193		real_old = (char *)old;
1194	} else {
1195		/* Get full names. */
1196		if ((ret = __db_appname(env,
1197		    DB_APP_DATA, new, 0, NULL, &real_new)) != 0)
1198			goto err;
1199
1200		if ((ret = __db_appname(env,
1201		    DB_APP_DATA, old, 0, NULL, &real_old)) != 0)
1202			goto err;
1203
1204	}
1205
1206	/*
1207	 * It is an error to rename a file over one that already exists,
1208	 * as that wouldn't be transaction-safe.  We check explicitly
1209	 * for ondisk files, but it's done memp_nameop for in-memory ones.
1210	 */
1211	GET_ENVLOCK(env, dbp->locker, &elock);
1212	ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT :
1213	    __os_exists(env, real_new, NULL);
1214
1215	if (ret == 0) {
1216		ret = EEXIST;
1217		__db_errx(env, "rename: file %s exists", real_new);
1218		goto err;
1219	}
1220
1221	ret = __memp_nameop(env,
1222	    dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM));
1223
1224err:	if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0)
1225		ret = t_ret;
1226	if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL)
1227		__os_free(env, real_old);
1228	if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL)
1229		__os_free(env, real_new);
1230	return (ret);
1231}
1232
1233static int
1234__fop_inmem_create(dbp, name, txn, flags)
1235	DB *dbp;
1236	const char *name;
1237	DB_TXN *txn;
1238	u_int32_t flags;
1239{
1240	DBT fid_dbt, name_dbt;
1241	DB_LSN lsn;
1242	ENV *env;
1243	int ret;
1244	int32_t lfid;
1245	u_int32_t *p32;
1246
1247	env = dbp->env;
1248
1249	MAKE_INMEM(dbp);
1250
1251	/* Set the pagesize if it isn't yet set. */
1252	if (dbp->pgsize == 0)
1253		dbp->pgsize = DB_DEF_IOSIZE;
1254
1255	/*
1256	 * Construct a file_id.
1257	 *
1258	 * If this file has no name, then we only need a fileid for locking.
1259	 * If this file has a name, we need the fileid both for locking and
1260	 * matching in the memory pool.  So, with unnamed in-memory databases,
1261	 * use a lock_id.  For named in-memory files, we need to find a value
1262	 * that we can use to uniquely identify a name/fid pair.  We use a
1263	 * combination of a unique id (__os_unique_id) and a hash of the
1264	 * original name.
1265	 */
1266	if (name == NULL) {
1267		if (LOCKING_ON(env) && (ret =
1268		    __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0)
1269			goto err;
1270	}  else {
1271		p32 = (u_int32_t *)(&dbp->fileid[0]);
1272		__os_unique_id(env, p32);
1273		p32++;
1274		(void)strncpy(
1275		    (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t));
1276		dbp->preserve_fid = 1;
1277
1278		if (DBENV_LOGGING(env) &&
1279#if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
1280		    txn != NULL &&
1281#endif
1282		    dbp->log_filename != NULL)
1283			memcpy(dbp->log_filename->ufid,
1284			    dbp->fileid, DB_FILE_ID_LEN);
1285	}
1286
1287	/* Now, set the fileid. */
1288	if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
1289		goto err;
1290
1291	if ((ret = __env_mpool(dbp, name, flags)) != 0)
1292		goto err;
1293
1294	if (DBENV_LOGGING(env) &&
1295#if !defined(DEBUG_WOP)
1296	    txn != NULL &&
1297#endif
1298	    name != NULL) {
1299		DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
1300		memset(&fid_dbt, 0, sizeof(fid_dbt));
1301		fid_dbt.data = dbp->fileid;
1302		fid_dbt.size = DB_FILE_ID_LEN;
1303		lfid = dbp->log_filename == NULL ?
1304		    DB_LOGFILEID_INVALID : dbp->log_filename->id;
1305		if ((ret = __crdel_inmem_create_log(env, txn,
1306		    &lsn, 0, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0)
1307			goto err;
1308	}
1309
1310	F_SET(dbp, DB_AM_CREATED);
1311
1312err:
1313	return (ret);
1314}
1315
1316static int
1317__fop_inmem_read_meta(dbp, txn, name, flags)
1318	DB *dbp;
1319	DB_TXN *txn;
1320	const char *name;
1321	u_int32_t flags;
1322{
1323	DBMETA *metap;
1324	DB_THREAD_INFO *ip;
1325	db_pgno_t pgno;
1326	int ret, t_ret;
1327
1328	if (txn == NULL)
1329		ENV_GET_THREAD_INFO(dbp->env, ip);
1330	else
1331		ip = txn->thread_info;
1332
1333	pgno  = PGNO_BASE_MD;
1334	if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0)
1335		return (ret);
1336	ret = __db_meta_setup(dbp->env, dbp, name, metap, flags, 1);
1337
1338	if ((t_ret =
1339	    __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0)
1340		ret = t_ret;
1341
1342	return (ret);
1343}
1344
1345static int
1346__fop_ondisk_dummy(dbp, txn, name, mbuf, flags)
1347	DB *dbp;
1348	DB_TXN *txn;
1349	const char *name;
1350	u_int8_t *mbuf;
1351	u_int32_t flags;
1352{
1353	ENV *env;
1354	int ret;
1355	char *realname;
1356	u_int32_t dflags;
1357
1358	realname = NULL;
1359	env = dbp->env;
1360	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1361
1362	if ((ret = __db_appname(env,
1363	    DB_APP_DATA, name, flags, NULL, &realname)) != 0)
1364		goto err;
1365
1366	if ((ret = __fop_create(env,
1367	    txn, NULL, name, DB_APP_DATA, 0, dflags)) != 0)
1368		goto err;
1369
1370	if ((ret =
1371	    __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0)
1372		goto err;
1373
1374	((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1375	if ((ret = __fop_write(env, txn, name,
1376	    DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
1377		goto err;
1378
1379	memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
1380
1381err:	if (realname != NULL)
1382		__os_free(env, realname);
1383
1384	return (ret);
1385}
1386
1387static int
1388__fop_inmem_dummy(dbp, txn, name, mbuf)
1389	DB *dbp;
1390	DB_TXN *txn;
1391	const char *name;
1392	u_int8_t *mbuf;
1393{
1394	DBMETA *metap;
1395	DB_THREAD_INFO *ip;
1396	db_pgno_t pgno;
1397	int ret, t_ret;
1398
1399	if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0)
1400		return (ret);
1401	if (txn == NULL)
1402		ENV_GET_THREAD_INFO(dbp->env, ip);
1403	else
1404		ip = txn->thread_info;
1405
1406	pgno  = PGNO_BASE_MD;
1407	if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn,
1408	    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0)
1409		return (ret);
1410	/* Check file existed. */
1411	if (metap->magic != 0)
1412		ret = EEXIST;
1413	else
1414		metap->magic = DB_RENAMEMAGIC;
1415
1416	/* Copy the fileid onto the meta-data page. */
1417	memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN);
1418
1419	if ((t_ret = __memp_fput(dbp->mpf, ip, metap,
1420	    ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0)
1421		ret = t_ret;
1422
1423	if (ret != 0)
1424		goto err;
1425
1426	((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1427
1428err:	return (ret);
1429}
1430
1431static int
1432__fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker, flags)
1433	DB *dbp, *tmpdbp;
1434	DB_TXN *txn;
1435	const char *old, *new, *back;
1436	DB_LOCKER *locker;
1437	u_int32_t flags;
1438{
1439	DBT fiddbt, namedbt, tmpdbt;
1440	DB_FH *fhp;
1441	DB_LOCK elock;
1442	DB_LSN lsn;
1443	DB_TXN *parent;
1444	ENV *env;
1445	u_int8_t mbuf[DBMETASIZE];
1446	u_int32_t child_txnid, dflags;
1447	int ret, t_ret;
1448	char *realold, *realnew;
1449
1450	env = dbp->env;
1451	DB_ASSERT(env, txn != NULL);
1452	DB_ASSERT(env, old != NULL);
1453
1454	realold = realnew = NULL;
1455	LOCK_INIT(elock);
1456	fhp = NULL;
1457	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1458
1459	if ((ret =
1460	    __db_appname(env, DB_APP_DATA, new, 0, NULL, &realnew)) != 0)
1461		goto err;
1462
1463	/* Now, lock the name space while we initialize this file. */
1464retry:	GET_ENVLOCK(env, locker, &elock);
1465	if (__os_exists(env, realnew, NULL) == 0) {
1466		/*
1467		 * It is possible that the only reason this file exists is
1468		 * because we've done a previous rename of it and we have
1469		 * left a placeholder here.  We need to check for that case
1470		 * and allow this rename to succeed if that's the case.
1471		 */
1472		if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0)
1473			goto err;
1474		if ((ret = __fop_read_meta(env,
1475		    realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 ||
1476		    (ret = __db_meta_setup(env,
1477		    tmpdbp, realnew, (DBMETA *)mbuf, 0, 1)) != 0) {
1478			ret = EEXIST;
1479			goto err;
1480		}
1481		ret = __os_closehandle(env, fhp);
1482		fhp = NULL;
1483		if (ret != 0)
1484			goto err;
1485
1486		/*
1487		 * Now, try to acquire the handle lock.  If the handle is locked
1488		 * by our current, transaction, then we'll get it and life is
1489		 * good.
1490		 *
1491		 * Alternately, it's not locked at all, we'll get the lock, but
1492		 * we will realize it exists and consider this an error.
1493		 *
1494		 * However, if it's held by another transaction, then there
1495		 * could be two different scenarios: 1) the file is in the
1496		 * midst of being created or deleted and when that transaction
1497		 * is over, we might be able to proceed. 2) the file is open
1498		 * and exists and we should report an error. In order to
1499		 * distinguish these two cases, we do the following. First, we
1500		 * try to acquire a READLOCK.  If the handle is in the midst of
1501		 * being created, then we'll block because a writelock is held.
1502		 * In that case, we should request a blocking write, and when we
1503		 * get the lock, we should then go back and check to see if the
1504		 * object exists and start all over again.
1505		 *
1506		 * If we got the READLOCK, then either no one is holding the
1507		 * lock or someone has an open handle and the fact that the file
1508		 * exists is problematic.  So, in this case, we request the
1509		 * WRITELOCK non-blocking -- if it succeeds, we're golden.  If
1510		 * it fails, then the file exists and we return EEXIST.
1511		 */
1512		if ((ret = __fop_lock_handle(env,
1513		    tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1514			/*
1515			 * Someone holds a write-lock.  Wait for the write-lock
1516			 * and after we get it, release it and start over.
1517			 */
1518			if ((ret = __fop_lock_handle(env, tmpdbp,
1519			    locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1520				goto err;
1521			if ((ret =
1522			    __lock_put(env, &tmpdbp->handle_lock)) != 0)
1523				goto err;
1524			if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0)
1525				goto err;
1526			goto retry;
1527		}
1528
1529		/* We got the read lock; try to upgrade it. */
1530		ret = __fop_lock_handle(env,
1531		    tmpdbp, locker, DB_LOCK_WRITE,
1532		    NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT);
1533		if (ret != 0) {
1534			/*
1535			 * We did not get the writelock, so someone
1536			 * has the handle open.  This is an error.
1537			 */
1538			(void)__lock_put(env, &tmpdbp->handle_lock);
1539			ret = EEXIST;
1540		} else  if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1541			/* We got the lock and are renaming it. */
1542			ret = 0;
1543		else { /* We got the lock, but the file exists. */
1544			(void)__lock_put(env, &tmpdbp->handle_lock);
1545			ret = EEXIST;
1546		}
1547		if (ret != 0)
1548			goto err;
1549	}
1550
1551	/*
1552	 * While we have the namespace locked, do the renames and then
1553	 * swap for the handle lock.
1554	 */
1555	if ((ret = __fop_rename(env,
1556	    txn, old, new, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
1557		goto err;
1558	if ((ret = __fop_rename(env,
1559	    txn, back, old, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
1560		goto err;
1561	if ((ret = __fop_lock_handle(env,
1562	    tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
1563		goto err;
1564
1565	/*
1566	 * We just acquired a transactional lock on the tmp handle.
1567	 * We need to null out the tmp handle's lock so that it
1568	 * doesn't create problems for us in the close path.
1569	 */
1570	LOCK_INIT(tmpdbp->handle_lock);
1571
1572	/* Commit the child. */
1573	child_txnid = txn->txnid;
1574	parent = txn->parent;
1575	ret = __txn_commit(txn, 0);
1576	txn = NULL;
1577
1578	/* Now log the child information in the parent. */
1579	memset(&fiddbt, 0, sizeof(fiddbt));
1580	fiddbt.data = dbp->fileid;
1581	fiddbt.size = DB_FILE_ID_LEN;
1582	memset(&tmpdbt, 0, sizeof(fiddbt));
1583	tmpdbt.data = tmpdbp->fileid;
1584	tmpdbt.size = DB_FILE_ID_LEN;
1585	DB_INIT_DBT(namedbt, old, strlen(old) + 1);
1586	if ((t_ret = __fop_file_remove_log(env,
1587	    parent, &lsn, 0, &fiddbt, &tmpdbt, &namedbt,
1588	    (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
1589		ret = t_ret;
1590
1591	/* This is a delayed delete of the dummy file. */
1592	if ((ret = __db_appname(env,
1593	    DB_APP_DATA, old, flags, NULL, &realold)) != 0)
1594		goto err;
1595
1596	if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0)
1597		goto err;
1598
1599err:	if (txn != NULL)	/* Ret must already be set, so void abort. */
1600		(void)__txn_abort(txn);
1601
1602	(void)__ENV_LPUT(env, elock);
1603
1604	if (fhp != NULL &&
1605	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
1606		ret = t_ret;
1607
1608	if (realnew != NULL)
1609		__os_free(env, realnew);
1610	if (realold != NULL)
1611		__os_free(env, realold);
1612	return (ret);
1613}
1614
1615static int
1616__fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker)
1617	DB *olddbp, *backdbp;
1618	DB_TXN *txn;
1619	const char *old, *new, *back;
1620	DB_LOCKER *locker;
1621{
1622	DB *tmpdbp;
1623	DBT fid_dbt, n1_dbt, n2_dbt;
1624	DB_LOCK elock;
1625	DB_LSN lsn;
1626	DB_TXN *parent;
1627	ENV *env;
1628	int ret, t_ret;
1629
1630	env = olddbp->env;
1631	parent = txn->parent;
1632retry:	LOCK_INIT(elock);
1633	if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1634		return (ret);
1635	MAKE_INMEM(tmpdbp);
1636
1637	GET_ENVLOCK(env, locker, &elock);
1638	if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) {
1639		/*
1640		 * It is possible that the only reason this database exists is
1641		 * because we've done a previous rename of it and we have
1642		 * left a placeholder here.  We need to check for that case
1643		 * and allow this rename to succeed if that's the case.
1644		 */
1645
1646		if ((ret = __fop_inmem_read_meta(tmpdbp, txn, new, 0)) != 0) {
1647			ret = EEXIST;
1648			goto err;
1649		}
1650
1651		/*
1652		 * Now, try to acquire the handle lock.  If it's from our txn,
1653		 * then we'll get the lock.  If it's not, then someone else has
1654		 * it locked.  See the comments in __fop_ondisk_swap for
1655		 * details.
1656		 */
1657		if ((ret = __fop_lock_handle(env,
1658		    tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1659			/*
1660			 * Someone holds a writelock.  Try for the WRITELOCK
1661			 * and after we get it, retry.
1662			 */
1663			if ((ret = __fop_lock_handle(env, tmpdbp,
1664			    locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1665				goto err;
1666
1667			/* We have the write lock; release it and start over. */
1668			(void)__lock_put(env, &tmpdbp->handle_lock);
1669			(void)__db_close(tmpdbp, NULL, DB_NOSYNC);
1670			(void)__ENV_LPUT(env, elock);
1671			goto retry;
1672		} else {
1673			(void)__lock_put(env, &tmpdbp->handle_lock);
1674			if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1675				ret = EEXIST;
1676		}
1677		if (ret != 0)
1678			goto err;
1679	}
1680
1681	/* Log the renames. */
1682	if (LOGGING_ON(env)
1683#ifndef DEBUG_WOP
1684	    && txn != NULL
1685#endif
1686	) {
1687		/* Rename old to new. */
1688		DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN);
1689		DB_INIT_DBT(n1_dbt, old, strlen(old) + 1);
1690		DB_INIT_DBT(n2_dbt, new, strlen(new) + 1);
1691		if ((ret = __crdel_inmem_rename_log(
1692		    env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0)
1693			goto err;
1694
1695		/* Rename back to old */
1696		fid_dbt.data = backdbp->fileid;
1697		DB_SET_DBT(n2_dbt, back, strlen(back) + 1);
1698		if ((ret = __crdel_inmem_rename_log(
1699		    env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0)
1700			goto err;
1701	}
1702
1703	/*
1704	 * While we have the namespace locked, do the renames and then
1705	 * swap for the handle lock.   If we ran into a file in the midst
1706	 * of rename, then we need to delete it first, else nameop is
1707	 * going to consider it an error.
1708	 */
1709	if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) {
1710		if ((ret = __memp_nameop(env,
1711		    tmpdbp->fileid, NULL, new, NULL, 1)) != 0)
1712			goto err;
1713		__txn_remrem(env, parent, new);
1714	}
1715
1716	if ((ret = __memp_nameop(
1717	    env, olddbp->fileid, new, old, new, 1)) != 0)
1718		goto err;
1719	if ((ret = __memp_nameop(
1720	    env, backdbp->fileid, old, back, old, 1)) != 0)
1721		goto err;
1722
1723	if ((ret = __fop_lock_handle(env,
1724	    tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1725		goto err;
1726
1727	/*
1728	 * We just acquired a transactional lock on the tmp handle.
1729	 * We need to null out the tmp handle's lock so that it
1730	 * doesn't create problems for us in the close path.
1731	 */
1732	LOCK_INIT(tmpdbp->handle_lock);
1733
1734	DB_ASSERT(env, txn != NULL);
1735
1736	/* Commit the child. */
1737	ret = __txn_commit(txn, 0);
1738	txn = NULL;
1739
1740	if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0)
1741		goto err;
1742
1743err:	(void)__ENV_LPUT(env, elock);
1744
1745	if (txn != NULL)
1746		(void)__txn_abort(txn);
1747
1748	if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1749		ret = t_ret;
1750
1751	return (ret);
1752}
1753