1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 */
6/*
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 *	Keith Bostic.  All rights reserved.
9 */
10/*
11 * Copyright (c) 1990, 1993, 1994, 1995
12 *	The Regents of the University of California.  All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * $Id: db.c,v 12.81 2008/02/18 19:11:59 bschmeck Exp $
39 */
40
41#include "db_config.h"
42
43#include "db_int.h"
44#include "dbinc/db_page.h"
45#include "dbinc/db_swap.h"
46#include "dbinc/btree.h"
47#include "dbinc/fop.h"
48#include "dbinc/hash.h"
49#include "dbinc/lock.h"
50#include "dbinc/log.h"
51#include "dbinc/mp.h"
52#include "dbinc/qam.h"
53#include "dbinc/txn.h"
54
55static int __db_disassociate __P((DB *));
56static int __db_disassociate_foreign __P ((DB *));
57
58#ifdef CONFIG_TEST
59static int __db_makecopy __P((ENV *, const char *, const char *));
60static int __db_testdocopy __P((ENV *, const char *));
61static int __qam_testdocopy __P((DB *, const char *));
62#endif
63
64/*
65 * DB.C --
66 *	This file contains the utility functions for the DBP layer.
67 */
68
69/*
70 * __db_master_open --
71 *	Open up a handle on a master database.
72 *
73 * PUBLIC: int __db_master_open __P((DB *, DB_THREAD_INFO *,
74 * PUBLIC:     DB_TXN *, const char *, u_int32_t, int, DB **));
75 */
76int
77__db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
78	DB *subdbp;
79	DB_THREAD_INFO *ip;
80	DB_TXN *txn;
81	const char *name;
82	u_int32_t flags;
83	int mode;
84	DB **dbpp;
85{
86	DB *dbp;
87	int ret;
88
89	*dbpp = NULL;
90
91	/* Open up a handle on the main database. */
92	if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0)
93		return (ret);
94
95	/*
96	 * It's always a btree.
97	 * Run in the transaction we've created.
98	 * Set the pagesize in case we're creating a new database.
99	 * Flag that we're creating a database with subdatabases.
100	 */
101	dbp->pgsize = subdbp->pgsize;
102	F_SET(dbp, DB_AM_SUBDB);
103	F_SET(dbp, F_ISSET(subdbp,
104	    DB_AM_RECOVER | DB_AM_SWAP |
105	    DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE));
106
107	/*
108	 * If there was a subdb specified, then we only want to apply
109	 * DB_EXCL to the subdb, not the actual file.  We only got here
110	 * because there was a subdb specified.
111	 */
112	LF_CLR(DB_EXCL);
113	LF_SET(DB_RDWRMASTER);
114	if ((ret = __db_open(dbp, ip,
115	    txn, name, NULL, DB_BTREE, flags, mode, PGNO_BASE_MD)) != 0)
116		goto err;
117
118	/*
119	 * The items in dbp are initialized from the master file's meta page.
120	 * Other items such as checksum and encryption are checked when we
121	 * read the meta-page, so we do not check those here.  However, if
122	 * the meta-page caused checksumming to be turned on and it wasn't
123	 * already, set it here.
124	 */
125	if (F_ISSET(dbp, DB_AM_CHKSUM))
126		F_SET(subdbp, DB_AM_CHKSUM);
127
128	/*
129	 * The user may have specified a page size for an existing file,
130	 * which we want to ignore.
131	 */
132	subdbp->pgsize = dbp->pgsize;
133	*dbpp = dbp;
134
135	if (0) {
136err:		if (!F_ISSET(dbp, DB_AM_DISCARD))
137			(void)__db_close(dbp, txn, 0);
138	}
139
140	return (ret);
141}
142
143/*
144 * __db_master_update --
145 *	Add/Open/Remove a subdatabase from a master database.
146 *
147 * PUBLIC: int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *,
148 * PUBLIC:      const char *, DBTYPE, mu_action, const char *, u_int32_t));
149 */
150int
151__db_master_update(mdbp, sdbp, ip, txn, subdb, type, action, newname, flags)
152	DB *mdbp, *sdbp;
153	DB_TXN *txn;
154	DB_THREAD_INFO *ip;
155	const char *subdb;
156	DBTYPE type;
157	mu_action action;
158	const char *newname;
159	u_int32_t flags;
160{
161	DBC *dbc, *ndbc;
162	DBT key, data, ndata;
163	ENV *env;
164	PAGE *p, *r;
165	db_pgno_t t_pgno;
166	int modify, ret, t_ret;
167
168	env = mdbp->env;
169	dbc = ndbc = NULL;
170	p = NULL;
171
172	/*
173	 * Open up a cursor.  If this is CDB and we're creating the database,
174	 * make it an update cursor.
175	 *
176	 * Might we modify the master database?  If so, we'll need to lock.
177	 */
178	modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0;
179
180	if ((ret = __db_cursor(mdbp, ip, txn, &dbc,
181	    (CDB_LOCKING(env) && modify) ? DB_WRITECURSOR : 0)) != 0)
182		return (ret);
183
184	/*
185	 * Point the cursor at the record.
186	 *
187	 * If we're removing or potentially creating an entry, lock the page
188	 * with DB_RMW.
189	 *
190	 * We do multiple cursor operations with the cursor in some cases and
191	 * subsequently access the data DBT information.  Set DB_DBT_MALLOC so
192	 * we don't risk modification of the data between our uses of it.
193	 *
194	 * !!!
195	 * We don't include the name's nul termination in the database.
196	 */
197	DB_INIT_DBT(key, subdb, strlen(subdb));
198	memset(&data, 0, sizeof(data));
199	F_SET(&data, DB_DBT_MALLOC);
200
201	ret = __dbc_get(dbc, &key, &data,
202	    DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0));
203
204	/*
205	 * What we do next--whether or not we found a record for the
206	 * specified subdatabase--depends on what the specified action is.
207	 * Handle ret appropriately as the first statement of each case.
208	 */
209	switch (action) {
210	case MU_REMOVE:
211		/*
212		 * We should have found something if we're removing it.  Note
213		 * that in the common case where the DB we're asking to remove
214		 * doesn't exist, we won't get this far;  __db_subdb_remove
215		 * will already have returned an error from __db_open.
216		 */
217		if (ret != 0)
218			goto err;
219
220		/*
221		 * Delete the subdatabase entry first;  if this fails,
222		 * we don't want to touch the actual subdb pages.
223		 */
224		if ((ret = __dbc_del(dbc, 0)) != 0)
225			goto err;
226
227		/*
228		 * We're handling actual data, not on-page meta-data,
229		 * so it hasn't been converted to/from opposite
230		 * endian architectures.  Do it explicitly, now.
231		 */
232		memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
233		DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
234		if ((ret = __memp_fget(mdbp->mpf, &sdbp->meta_pgno,
235		    ip, dbc->txn, DB_MPOOL_DIRTY, &p)) != 0)
236			goto err;
237
238		/* Free the root on the master db if it was created. */
239		if (TYPE(p) == P_BTREEMETA &&
240		    ((BTMETA *)p)->root != PGNO_INVALID) {
241			if ((ret = __memp_fget(mdbp->mpf,
242			    &((BTMETA *)p)->root, ip, dbc->txn,
243			    DB_MPOOL_DIRTY, &r)) != 0)
244				goto err;
245
246			/* Free and put the page. */
247			if ((ret = __db_free(dbc, r)) != 0) {
248				r = NULL;
249				goto err;
250			}
251		}
252		/* Free and put the page. */
253		if ((ret = __db_free(dbc, p)) != 0) {
254			p = NULL;
255			goto err;
256		}
257		p = NULL;
258		break;
259	case MU_RENAME:
260		/* We should have found something if we're renaming it. */
261		if (ret != 0)
262			goto err;
263
264		/*
265		 * Before we rename, we need to make sure we're not
266		 * overwriting another subdatabase, or else this operation
267		 * won't be undoable.  Open a second cursor and check
268		 * for the existence of newname;  it shouldn't appear under
269		 * us since we hold the metadata lock.
270		 */
271		if ((ret = __db_cursor(mdbp, ip, txn, &ndbc,
272		    CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
273			goto err;
274		DB_SET_DBT(key, newname, strlen(newname));
275
276		/*
277		 * We don't actually care what the meta page of the potentially-
278		 * overwritten DB is; we just care about existence.
279		 */
280		memset(&ndata, 0, sizeof(ndata));
281		F_SET(&ndata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
282
283		if ((ret = __dbc_get(ndbc, &key, &ndata, DB_SET)) == 0) {
284			/* A subdb called newname exists.  Bail. */
285			ret = EEXIST;
286			__db_errx(env, "rename: database %s exists", newname);
287			goto err;
288		} else if (ret != DB_NOTFOUND)
289			goto err;
290
291		/*
292		 * Now do the put first; we don't want to lose our only
293		 * reference to the subdb.  Use the second cursor so the
294		 * first one continues to point to the old record.
295		 */
296		if ((ret = __dbc_put(ndbc, &key, &data, DB_KEYFIRST)) != 0)
297			goto err;
298		if ((ret = __dbc_del(dbc, 0)) != 0) {
299			/*
300			 * If the delete fails, try to delete the record
301			 * we just put, in case we're not txn-protected.
302			 */
303			(void)__dbc_del(ndbc, 0);
304			goto err;
305		}
306
307		break;
308	case MU_OPEN:
309		/*
310		 * Get the subdatabase information.  If it already exists,
311		 * copy out the page number and we're done.
312		 */
313		switch (ret) {
314		case 0:
315			if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
316				ret = EEXIST;
317				goto err;
318			}
319			memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
320			DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
321			goto done;
322		case DB_NOTFOUND:
323			if (LF_ISSET(DB_CREATE))
324				break;
325			/*
326			 * No db_err, it is reasonable to remove a
327			 * nonexistent db.
328			 */
329			ret = ENOENT;
330			goto err;
331		default:
332			goto err;
333		}
334
335		/* Create a subdatabase. */
336		if ((ret = __db_new(dbc,
337		    type == DB_HASH ? P_HASHMETA : P_BTREEMETA, &p)) != 0)
338			goto err;
339		sdbp->meta_pgno = PGNO(p);
340
341		/*
342		 * XXX
343		 * We're handling actual data, not on-page meta-data, so it
344		 * hasn't been converted to/from opposite endian architectures.
345		 * Do it explicitly, now.
346		 */
347		t_pgno = PGNO(p);
348		DB_HTONL_SWAP(env, &t_pgno);
349		memset(&ndata, 0, sizeof(ndata));
350		ndata.data = &t_pgno;
351		ndata.size = sizeof(db_pgno_t);
352		if ((ret = __dbc_put(dbc, &key, &ndata, DB_KEYLAST)) != 0)
353			goto err;
354		F_SET(sdbp, DB_AM_CREATED);
355		break;
356	}
357
358err:
359done:	/*
360	 * If we allocated a page: if we're successful, mark the page dirty
361	 * and return it to the cache, otherwise, discard/free it.
362	 */
363	if (p != NULL && (t_ret = __memp_fput(mdbp->mpf,
364	     dbc->thread_info, p, dbc->priority)) != 0 && ret == 0)
365		ret = t_ret;
366
367	/* Discard the cursor(s) and data. */
368	if (data.data != NULL)
369		__os_ufree(env, data.data);
370	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
371		ret = t_ret;
372	if (ndbc != NULL && (t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
373		ret = t_ret;
374
375	return (ret);
376}
377
378/*
379 * __env_setup --
380 *	Set up the underlying environment during a db_open.
381 *
382 * PUBLIC: int __env_setup __P((DB *,
383 * PUBLIC:     DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
384 */
385int
386__env_setup(dbp, txn, fname, dname, id, flags)
387	DB *dbp;
388	DB_TXN *txn;
389	const char *fname, *dname;
390	u_int32_t id, flags;
391{
392	DB *ldbp;
393	DB_ENV *dbenv;
394	ENV *env;
395	u_int32_t maxid;
396	int ret;
397
398	env = dbp->env;
399	dbenv = env->dbenv;
400
401	/* If we don't yet have an environment, it's time to create it. */
402	if (!F_ISSET(env, ENV_OPEN_CALLED)) {
403		/* Make sure we have at least DB_MINCACHE pages in our cache. */
404		if (dbenv->mp_gbytes == 0 &&
405		    dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE &&
406		    (ret = __memp_set_cachesize(
407		    dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0)
408			return (ret);
409
410		if ((ret = __env_open(dbenv, NULL, DB_CREATE |
411		    DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0)
412			return (ret);
413	}
414
415	/* Join the underlying cache. */
416	if ((!F_ISSET(dbp, DB_AM_INMEM) || dname == NULL) &&
417	    (ret = __env_mpool(dbp, fname, flags)) != 0)
418		return (ret);
419
420	/* We may need a per-thread mutex. */
421	if (LF_ISSET(DB_THREAD) && (ret = __mutex_alloc(
422	    env, MTX_DB_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbp->mutex)) != 0)
423		return (ret);
424
425	/*
426	 * Set up a bookkeeping entry for this database in the log region,
427	 * if such a region exists.  Note that even if we're in recovery
428	 * or a replication client, where we won't log registries, we'll
429	 * still need an FNAME struct, so LOGGING_ON is the correct macro.
430	 */
431	if (LOGGING_ON(env) && dbp->log_filename == NULL
432#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
433	    && (txn != NULL || F_ISSET(dbp, DB_AM_RECOVER))
434#endif
435#if !defined(DEBUG_ROP)
436	    && !F_ISSET(dbp, DB_AM_RDONLY)
437#endif
438	    ) {
439		if ((ret = __dbreg_setup(dbp,
440		    F_ISSET(dbp, DB_AM_INMEM) ? dname : fname,
441		    F_ISSET(dbp, DB_AM_INMEM) ? NULL : dname, id)) != 0)
442			return (ret);
443
444		/*
445		 * If we're actively logging and our caller isn't a
446		 * recovery function that already did so, then assign
447		 * this dbp a log fileid.
448		 */
449		if (DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) &&
450		    (ret = __dbreg_new_id(dbp, txn)) != 0)
451			return (ret);
452	}
453
454	/*
455	 * Insert ourselves into the ENV's dblist.  We allocate a
456	 * unique ID to each {fileid, meta page number} pair, and to
457	 * each temporary file (since they all have a zero fileid).
458	 * This ID gives us something to use to tell which DB handles
459	 * go with which databases in all the cursor adjustment
460	 * routines, where we don't want to do a lot of ugly and
461	 * expensive memcmps.
462	 */
463	MUTEX_LOCK(env, env->mtx_dblist);
464	maxid = 0;
465	TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) {
466		/*
467		 * There are three cases: on-disk database (first clause),
468		 * named in-memory database (second clause), temporary database
469		 * (never matches; no clause).
470		 */
471		if (!F_ISSET(dbp, DB_AM_INMEM)) {
472			if (memcmp(ldbp->fileid, dbp->fileid, DB_FILE_ID_LEN)
473			    == 0 && ldbp->meta_pgno == dbp->meta_pgno)
474				break;
475		} else if (dname != NULL) {
476			if (F_ISSET(ldbp, DB_AM_INMEM) &&
477			    ldbp->dname != NULL &&
478			    strcmp(ldbp->dname, dname) == 0)
479				break;
480		}
481		if (ldbp->adj_fileid > maxid)
482			maxid = ldbp->adj_fileid;
483	}
484
485	/*
486	 * If ldbp is NULL, we didn't find a match. Assign the dbp an
487	 * adj_fileid one higher than the largest we found, and
488	 * insert it at the head of the master dbp list.
489	 *
490	 * If ldbp is not NULL, it is a match for our dbp.  Give dbp
491	 * the same ID that ldbp has, and add it after ldbp so they're
492	 * together in the list.
493	 */
494	if (ldbp == NULL) {
495		dbp->adj_fileid = maxid + 1;
496		TAILQ_INSERT_HEAD(&env->dblist, dbp, dblistlinks);
497	} else {
498		dbp->adj_fileid = ldbp->adj_fileid;
499		TAILQ_INSERT_AFTER(&env->dblist, ldbp, dbp, dblistlinks);
500	}
501	MUTEX_UNLOCK(env, env->mtx_dblist);
502
503	return (0);
504}
505
506/*
507 * __env_mpool --
508 *	Set up the underlying environment cache during a db_open.
509 *
510 * PUBLIC: int __env_mpool __P((DB *, const char *, u_int32_t));
511 */
512int
513__env_mpool(dbp, fname, flags)
514	DB *dbp;
515	const char *fname;
516	u_int32_t flags;
517{
518	DBT pgcookie;
519	DB_MPOOLFILE *mpf;
520	DB_PGINFO pginfo;
521	ENV *env;
522	int fidset, ftype, ret;
523	int32_t lsn_off;
524	u_int8_t nullfid[DB_FILE_ID_LEN];
525	u_int32_t clear_len;
526
527	env = dbp->env;
528
529	/* The LSN is the first entry on a DB page, byte offset 0. */
530	lsn_off = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LSN_OFF_NOTSET : 0;
531
532	/* It's possible that this database is already open. */
533	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
534		return (0);
535
536	/*
537	 * If we need to pre- or post-process a file's pages on I/O, set the
538	 * file type.  If it's a hash file, always call the pgin and pgout
539	 * routines.  This means that hash files can never be mapped into
540	 * process memory.  If it's a btree file and requires swapping, we
541	 * need to page the file in and out.  This has to be right -- we can't
542	 * mmap files that are being paged in and out.
543	 */
544	switch (dbp->type) {
545	case DB_BTREE:
546	case DB_RECNO:
547		ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
548		    ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
549		clear_len = CRYPTO_ON(env) ?
550		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
551		    DB_PAGE_DB_LEN;
552		break;
553	case DB_HASH:
554		ftype = DB_FTYPE_SET;
555		clear_len = CRYPTO_ON(env) ?
556		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
557		    DB_PAGE_DB_LEN;
558		break;
559	case DB_QUEUE:
560		ftype = F_ISSET(dbp,
561		    DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) ?
562		    DB_FTYPE_SET : DB_FTYPE_NOTSET;
563
564		/*
565		 * If we came in here without a pagesize set, then we need
566		 * to mark the in-memory handle as having clear_len not
567		 * set, because we don't really know the clear length or
568		 * the page size yet (since the file doesn't yet exist).
569		 */
570		clear_len = dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET;
571		break;
572	case DB_UNKNOWN:
573		/*
574		 * If we're running in the verifier, our database might
575		 * be corrupt and we might not know its type--but we may
576		 * still want to be able to verify and salvage.
577		 *
578		 * If we can't identify the type, it's not going to be safe
579		 * to call __db_pgin--we pretty much have to give up all
580		 * hope of salvaging cross-endianness.  Proceed anyway;
581		 * at worst, the database will just appear more corrupt
582		 * than it actually is, but at best, we may be able
583		 * to salvage some data even with no metadata page.
584		 */
585		if (F_ISSET(dbp, DB_AM_VERIFYING)) {
586			ftype = DB_FTYPE_NOTSET;
587			clear_len = DB_PAGE_DB_LEN;
588			break;
589		}
590
591		/*
592		 * This might be an in-memory file and we won't know its
593		 * file type until after we open it and read the meta-data
594		 * page.
595		 */
596		if (F_ISSET(dbp, DB_AM_INMEM)) {
597			clear_len = DB_CLEARLEN_NOTSET;
598			ftype = DB_FTYPE_NOTSET;
599			lsn_off = DB_LSN_OFF_NOTSET;
600			break;
601		}
602		/* FALLTHROUGH */
603	default:
604		return (__db_unknown_type(env, "DB->open", dbp->type));
605	}
606
607	mpf = dbp->mpf;
608
609	memset(nullfid, 0, DB_FILE_ID_LEN);
610	fidset = memcmp(nullfid, dbp->fileid, DB_FILE_ID_LEN);
611	if (fidset)
612		(void)__memp_set_fileid(mpf, dbp->fileid);
613
614	(void)__memp_set_clear_len(mpf, clear_len);
615	(void)__memp_set_ftype(mpf, ftype);
616	(void)__memp_set_lsn_offset(mpf, lsn_off);
617
618	pginfo.db_pagesize = dbp->pgsize;
619	pginfo.flags =
620	    F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
621	pginfo.type = dbp->type;
622	pgcookie.data = &pginfo;
623	pgcookie.size = sizeof(DB_PGINFO);
624	(void)__memp_set_pgcookie(mpf, &pgcookie);
625
626#ifndef DIAG_MVCC
627	if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
628#endif
629		if (F_ISSET(dbp, DB_AM_TXN) &&
630		    dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
631			LF_SET(DB_MULTIVERSION);
632
633	if ((ret = __memp_fopen(mpf, NULL, fname,
634	    LF_ISSET(DB_CREATE | DB_DURABLE_UNKNOWN | DB_MULTIVERSION |
635		DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE) |
636	    (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0) |
637	    (F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_TXN_NOT_DURABLE : 0),
638	    0, dbp->pgsize)) != 0) {
639		/*
640		 * The open didn't work; we need to reset the mpf,
641		 * retaining the in-memory semantics (if any).
642		 */
643		(void)__memp_fclose(dbp->mpf, 0);
644		(void)__memp_fcreate(env, &dbp->mpf);
645		if (F_ISSET(dbp, DB_AM_INMEM))
646			MAKE_INMEM(dbp);
647		return (ret);
648	}
649
650	/*
651	 * Set the open flag.  We use it to mean that the dbp has gone
652	 * through mpf setup, including dbreg_register.  Also, below,
653	 * the underlying access method open functions may want to do
654	 * things like acquire cursors, so the open flag has to be set
655	 * before calling them.
656	 */
657	F_SET(dbp, DB_AM_OPEN_CALLED);
658	if (!fidset && fname != NULL) {
659		(void)__memp_get_fileid(dbp->mpf, dbp->fileid);
660		dbp->preserve_fid = 1;
661	}
662
663	return (0);
664}
665
666/*
667 * __db_close --
668 *	DB->close method.
669 *
670 * PUBLIC: int __db_close __P((DB *, DB_TXN *, u_int32_t));
671 */
672int
673__db_close(dbp, txn, flags)
674	DB *dbp;
675	DB_TXN *txn;
676	u_int32_t flags;
677{
678	ENV *env;
679	int db_ref, deferred_close, ret, t_ret;
680
681	env = dbp->env;
682	deferred_close = ret = 0;
683
684	/*
685	 * Validate arguments, but as a DB handle destructor, we can't fail.
686	 *
687	 * Check for consistent transaction usage -- ignore errors.  Only
688	 * internal callers specify transactions, so it's a serious problem
689	 * if we get error messages.
690	 */
691	if (txn != NULL)
692		(void)__db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0);
693
694	/* Refresh the structure and close any underlying resources. */
695	ret = __db_refresh(dbp, txn, flags, &deferred_close, 0);
696
697	/*
698	 * If we've deferred the close because the logging of the close failed,
699	 * return our failure right away without destroying the handle.
700	 */
701	if (deferred_close)
702		return (ret);
703
704	/* !!!
705	 * This code has an apparent race between the moment we read and
706	 * decrement env->db_ref and the moment we check whether it's 0.
707	 * However, if the environment is DBLOCAL, the user shouldn't have a
708	 * reference to the env handle anyway;  the only way we can get
709	 * multiple dbps sharing a local env is if we open them internally
710	 * during something like a subdatabase open.  If any such thing is
711	 * going on while the user is closing the original dbp with a local
712	 * env, someone's already badly screwed up, so there's no reason
713	 * to bother engineering around this possibility.
714	 */
715	MUTEX_LOCK(env, env->mtx_dblist);
716	db_ref = --env->db_ref;
717	MUTEX_UNLOCK(env, env->mtx_dblist);
718	if (F_ISSET(env, ENV_DBLOCAL) && db_ref == 0 &&
719	    (t_ret = __env_close(env->dbenv, 0)) != 0 && ret == 0)
720		ret = t_ret;
721
722	/* Free the database handle. */
723	memset(dbp, CLEAR_BYTE, sizeof(*dbp));
724	__os_free(env, dbp);
725
726	return (ret);
727}
728
729/*
730 * __db_refresh --
731 *	Refresh the DB structure, releasing any allocated resources.
732 * This does most of the work of closing files now because refresh
733 * is what is used during abort processing (since we can't destroy
734 * the actual handle) and during abort processing, we may have a
735 * fully opened handle.
736 *
737 * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
738 */
739int
740__db_refresh(dbp, txn, flags, deferred_closep, reuse)
741	DB *dbp;
742	DB_TXN *txn;
743	u_int32_t flags;
744	int *deferred_closep, reuse;
745{
746	DB *sdbp;
747	DBC *dbc;
748	DB_FOREIGN_INFO *f_info, *tmp;
749	DB_LOCKER *locker;
750	DB_LOCKREQ lreq;
751	ENV *env;
752	REGENV *renv;
753	REGINFO *infop;
754	u_int32_t save_flags;
755	int resync, ret, t_ret;
756
757	ret = 0;
758
759	env = dbp->env;
760	infop = env->reginfo;
761	if (infop != NULL)
762		renv = infop->primary;
763	else
764		renv = NULL;
765
766	/*
767	 * If this dbp is not completely open, avoid trapping by trying to
768	 * sync without an mpool file.
769	 */
770	if (dbp->mpf == NULL)
771		LF_SET(DB_NOSYNC);
772
773	/* If never opened, or not currently open, it's easy. */
774	if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
775		goto never_opened;
776
777	/*
778	 * If we have any secondary indices, disassociate them from us.
779	 * We don't bother with the mutex here;  it only protects some
780	 * of the ops that will make us core-dump mid-close anyway, and
781	 * if you're trying to do something with a secondary *while* you're
782	 * closing the primary, you deserve what you get.  The disassociation
783	 * is mostly done just so we can close primaries and secondaries in
784	 * any order--but within one thread of control.
785	 */
786	LIST_FOREACH(sdbp, &dbp->s_secondaries, s_links) {
787		LIST_REMOVE(sdbp, s_links);
788		if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0)
789			ret = t_ret;
790	}
791
792	/*
793	 * Disassociate ourself from any databases using us as a foreign key
794	 * database by clearing the referring db's pointer.  Reclaim memory.
795	 */
796	f_info = LIST_FIRST(&dbp->f_primaries);
797	while (f_info != NULL) {
798		tmp = LIST_NEXT(f_info, f_links);
799		LIST_REMOVE(f_info, f_links);
800		f_info->dbp->s_foreign = NULL;
801		__os_free(env, f_info);
802		f_info = tmp;
803	}
804
805	if (dbp->s_foreign != NULL &&
806	    (t_ret = __db_disassociate_foreign(dbp)) != 0 && ret == 0)
807		ret = t_ret;
808
809	/*
810	 * Sync the underlying access method.  Do before closing the cursors
811	 * because DB->sync allocates cursors in order to write Recno backing
812	 * source text files.
813	 *
814	 * Sync is slow on some systems, notably Solaris filesystems where the
815	 * entire buffer cache is searched.  If we're in recovery, don't flush
816	 * the file, it's not necessary.
817	 */
818	if (!LF_ISSET(DB_NOSYNC) &&
819	    !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
820	    (t_ret = __db_sync(dbp)) != 0 && ret == 0)
821		ret = t_ret;
822
823	/*
824	 * Go through the active cursors and call the cursor recycle routine,
825	 * which resolves pending operations and moves the cursors onto the
826	 * free list.  Then, walk the free list and call the cursor destroy
827	 * routine.  Note that any failure on a close is considered "really
828	 * bad" and we just break out of the loop and force forward.
829	 */
830	resync = TAILQ_FIRST(&dbp->active_queue) == NULL ? 0 : 1;
831	while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
832		if ((t_ret = __dbc_close(dbc)) != 0) {
833			if (ret == 0)
834				ret = t_ret;
835			break;
836		}
837
838	while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
839		if ((t_ret = __dbc_destroy(dbc)) != 0) {
840			if (ret == 0)
841				ret = t_ret;
842			break;
843		}
844
845	/*
846	 * Close any outstanding join cursors.  Join cursors destroy themselves
847	 * on close and have no separate destroy routine.  We don't have to set
848	 * the resync flag here, because join cursors aren't write cursors.
849	 */
850	while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
851		if ((t_ret = __db_join_close(dbc)) != 0) {
852			if (ret == 0)
853				ret = t_ret;
854			break;
855		}
856
857	/*
858	 * Sync the memory pool, even though we've already called DB->sync,
859	 * because closing cursors can dirty pages by deleting items they
860	 * referenced.
861	 *
862	 * Sync is slow on some systems, notably Solaris filesystems where the
863	 * entire buffer cache is searched.  If we're in recovery, don't flush
864	 * the file, it's not necessary.
865	 */
866	if (resync && !LF_ISSET(DB_NOSYNC) &&
867	    !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
868	    (t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
869		ret = t_ret;
870
871never_opened:
872	/*
873	 * At this point, we haven't done anything to render the DB handle
874	 * unusable, at least by a transaction abort.  Take the opportunity
875	 * now to log the file close if we have initialized the logging
876	 * information.  If this log fails and we're in a transaction,
877	 * we have to bail out of the attempted close; we'll need a dbp in
878	 * order to successfully abort the transaction, and we can't conjure
879	 * a new one up because we haven't gotten out the dbreg_register
880	 * record that represents the close.  In this case, we put off
881	 * actually closing the dbp until we've performed the abort.
882	 */
883	if (!reuse && LOGGING_ON(dbp->env) && dbp->log_filename != NULL) {
884		/*
885		 * Discard the log file id, if any.  We want to log the close
886		 * if and only if this is not a recovery dbp or a client dbp,
887		 * or a dead dbp handle.
888		 */
889		DB_ASSERT(env, renv != NULL);
890		if (F_ISSET(dbp, DB_AM_RECOVER) || IS_REP_CLIENT(env) ||
891		    dbp->timestamp != renv->rep_timestamp) {
892			if ((t_ret = __dbreg_revoke_id(dbp,
893			    0, DB_LOGFILEID_INVALID)) == 0 && ret == 0)
894				ret = t_ret;
895			if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
896				ret = t_ret;
897		} else {
898			if ((t_ret = __dbreg_close_id(dbp,
899			    txn, DBREG_CLOSE)) != 0 && txn != NULL) {
900				/*
901				 * We're in a txn and the attempt to log the
902				 * close failed;  let the txn subsystem know
903				 * that we need to destroy this dbp once we're
904				 * done with the abort, then bail from the
905				 * close.
906				 *
907				 * Note that if the attempt to put off the
908				 * close -also- fails--which it won't unless
909				 * we're out of heap memory--we're really
910				 * screwed.  Panic.
911				 */
912				if ((ret =
913				    __txn_closeevent(env, txn, dbp)) != 0)
914					return (__env_panic(env, ret));
915				if (deferred_closep != NULL)
916					*deferred_closep = 1;
917				return (t_ret);
918			}
919			/*
920			 * If dbreg_close_id failed and we were not in a
921			 * transaction, then we need to finish this close
922			 * because the caller can't do anything with the
923			 * handle after we return an error.  We rely on
924			 * dbreg_close_id to mark the entry in some manner
925			 * so that we do not do a clean shutdown of this
926			 * environment.  If shutdown isn't clean, then the
927			 * application *must* run recovery and that will
928			 * generate the RCLOSE record.
929			 */
930		}
931
932	}
933
934	/* Close any handle we've been holding since the open.  */
935	if (dbp->saved_open_fhp != NULL &&
936	    (t_ret = __os_closehandle(env, dbp->saved_open_fhp)) != 0 &&
937	    ret == 0)
938		ret = t_ret;
939
940	/*
941	 * Remove this DB handle from the ENV's dblist, if it's been added.
942	 *
943	 * Close our reference to the underlying cache while locked, we don't
944	 * want to race with a thread searching for our underlying cache link
945	 * while opening a DB handle.
946	 *
947	 * The DB handle may not yet have been added to the ENV list, don't
948	 * blindly call the underlying TAILQ_REMOVE macro.  Explicitly reset
949	 * the field values to NULL so that we can't call TAILQ_REMOVE twice.
950	 */
951	MUTEX_LOCK(env, env->mtx_dblist);
952	if (!reuse &&
953	    (dbp->dblistlinks.tqe_next != NULL ||
954	    dbp->dblistlinks.tqe_prev != NULL)) {
955		TAILQ_REMOVE(&env->dblist, dbp, dblistlinks);
956		dbp->dblistlinks.tqe_next = NULL;
957		dbp->dblistlinks.tqe_prev = NULL;
958	}
959
960	/* Close the memory pool file handle. */
961	if (dbp->mpf != NULL) {
962		if ((t_ret = __memp_fclose(dbp->mpf,
963		    F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 &&
964		    ret == 0)
965			ret = t_ret;
966		dbp->mpf = NULL;
967		if (reuse &&
968		    (t_ret = __memp_fcreate(env, &dbp->mpf)) != 0 &&
969		    ret == 0)
970			ret = t_ret;
971	}
972
973	MUTEX_UNLOCK(env, env->mtx_dblist);
974
975	/*
976	 * Call the access specific close function.
977	 *
978	 * We do this here rather than in __db_close as we need to do this when
979	 * aborting an open so that file descriptors are closed and abort of
980	 * renames can succeed on platforms that lock open files (such as
981	 * Windows).  In particular, we need to ensure that all the extents
982	 * associated with a queue are closed so that queue renames can be
983	 * aborted.
984	 *
985	 * It is also important that we do this before releasing the handle
986	 * lock, because dbremove and dbrename assume that once they have the
987	 * handle lock, it is safe to modify the underlying file(s).
988	 *
989	 * !!!
990	 * Because of where these functions are called in the DB handle close
991	 * process, these routines can't do anything that would dirty pages or
992	 * otherwise affect closing down the database.  Specifically, we can't
993	 * abort and recover any of the information they control.
994	 */
995	if ((t_ret = __bam_db_close(dbp)) != 0 && ret == 0)
996		ret = t_ret;
997	if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0)
998		ret = t_ret;
999	if ((t_ret = __qam_db_close(dbp, dbp->flags)) != 0 && ret == 0)
1000		ret = t_ret;
1001
1002	/*
1003	 * !!!
1004	 * At this point, the access-method specific information has been
1005	 * freed.  From now on, we can use the dbp, but not touch any
1006	 * access-method specific data.
1007	 */
1008
1009	if (!reuse && dbp->locker != NULL) {
1010		/* We may have pending trade operations on this dbp. */
1011		if (txn == NULL)
1012			txn = dbp->cur_txn;
1013		if (IS_REAL_TXN(txn))
1014			__txn_remlock(env,
1015			     txn, &dbp->handle_lock, dbp->locker);
1016
1017		/* We may be holding the handle lock; release it. */
1018		lreq.op = DB_LOCK_PUT_ALL;
1019		lreq.obj = NULL;
1020		if ((t_ret = __lock_vec(env,
1021		    dbp->locker, 0, &lreq, 1, NULL)) != 0 && ret == 0)
1022			ret = t_ret;
1023
1024		if ((t_ret =
1025		     __lock_id_free(env, dbp->locker)) != 0 && ret == 0)
1026			ret = t_ret;
1027		dbp->locker = NULL;
1028		LOCK_INIT(dbp->handle_lock);
1029	}
1030
1031	/*
1032	 * If this is a temporary file (un-named in-memory file), then
1033	 * discard the locker ID allocated as the fileid.
1034	 */
1035	if (LOCKING_ON(env) &&
1036	    F_ISSET(dbp, DB_AM_INMEM) && !dbp->preserve_fid &&
1037	    *(u_int32_t *)dbp->fileid != DB_LOCK_INVALIDID) {
1038		if ((t_ret = __lock_getlocker(env->lk_handle,
1039		     *(u_int32_t *)dbp->fileid, 0, &locker)) == 0)
1040			t_ret = __lock_id_free(env, locker);
1041		if (ret == 0)
1042			ret = t_ret;
1043	}
1044
1045	if (reuse) {
1046		/*
1047		 * If we are reusing this dbp, then we're done now. Re-init
1048		 * the handle, preserving important flags, and then return.
1049		 * This code is borrowed from __db_init, which does more
1050		 * than we can do here.
1051		 */
1052		save_flags = F_ISSET(dbp, DB_AM_INMEM | DB_AM_TXN);
1053
1054		/*
1055		 * XXX If this is an XA handle, we'll want to specify
1056		 * DB_XA_CREATE.
1057		 */
1058		if ((ret = __bam_db_create(dbp)) != 0)
1059			return (ret);
1060		if ((ret = __ham_db_create(dbp)) != 0)
1061			return (ret);
1062		if ((ret = __qam_db_create(dbp)) != 0)
1063			return (ret);
1064
1065		/* Restore flags */
1066		dbp->flags = dbp->orig_flags | save_flags;
1067
1068		if (FLD_ISSET(save_flags, DB_AM_INMEM)) {
1069			/*
1070			 * If this is inmem, then it may have a fileid
1071			 * even if it was never opened, and we need to
1072			 * clear out that fileid.
1073			 */
1074			memset(dbp->fileid, 0, sizeof(dbp->fileid));
1075			MAKE_INMEM(dbp);
1076		}
1077		return (ret);
1078	}
1079
1080	dbp->type = DB_UNKNOWN;
1081
1082	/*
1083	 * The thread mutex may have been invalidated in __dbreg_close_id if the
1084	 * fname refcount did not go to 0. If not, discard the thread mutex.
1085	 */
1086	if ((t_ret = __mutex_free(env, &dbp->mutex)) != 0 && ret == 0)
1087		ret = t_ret;
1088
1089	/* Discard any memory allocated for the file and database names. */
1090	if (dbp->fname != NULL) {
1091		__os_free(dbp->env, dbp->fname);
1092		dbp->fname = NULL;
1093	}
1094	if (dbp->dname != NULL) {
1095		__os_free(dbp->env, dbp->dname);
1096		dbp->dname = NULL;
1097	}
1098
1099	/* Discard any memory used to store returned data. */
1100	if (dbp->my_rskey.data != NULL)
1101		__os_free(dbp->env, dbp->my_rskey.data);
1102	if (dbp->my_rkey.data != NULL)
1103		__os_free(dbp->env, dbp->my_rkey.data);
1104	if (dbp->my_rdata.data != NULL)
1105		__os_free(dbp->env, dbp->my_rdata.data);
1106
1107	/* For safety's sake;  we may refresh twice. */
1108	memset(&dbp->my_rskey, 0, sizeof(DBT));
1109	memset(&dbp->my_rkey, 0, sizeof(DBT));
1110	memset(&dbp->my_rdata, 0, sizeof(DBT));
1111
1112	/* Clear out fields that normally get set during open. */
1113	memset(dbp->fileid, 0, sizeof(dbp->fileid));
1114	dbp->adj_fileid = 0;
1115	dbp->meta_pgno = 0;
1116	dbp->cur_locker = NULL;
1117	dbp->cur_txn = NULL;
1118	dbp->associate_locker = NULL;
1119	dbp->cl_id = 0;
1120	dbp->open_flags = 0;
1121
1122	/*
1123	 * If we are being refreshed with a txn specified, then we need
1124	 * to make sure that we clear out the lock handle field, because
1125	 * releasing all the locks for this transaction will release this
1126	 * lock and we don't want close to stumble upon this handle and
1127	 * try to close it.
1128	 */
1129	if (txn != NULL)
1130		LOCK_INIT(dbp->handle_lock);
1131
1132	/* Reset flags to whatever the user configured. */
1133	dbp->flags = dbp->orig_flags;
1134
1135	return (ret);
1136}
1137
1138/*
1139 * __db_disassociate --
1140 *	Destroy the association between a given secondary and its primary.
1141 */
1142static int
1143__db_disassociate(sdbp)
1144	DB *sdbp;
1145{
1146	DBC *dbc;
1147	int ret, t_ret;
1148
1149	ret = 0;
1150
1151	sdbp->s_callback = NULL;
1152	sdbp->s_primary = NULL;
1153	sdbp->get = sdbp->stored_get;
1154	sdbp->close = sdbp->stored_close;
1155
1156	/*
1157	 * Complain, but proceed, if we have any active cursors.  (We're in
1158	 * the middle of a close, so there's really no turning back.)
1159	 */
1160	if (sdbp->s_refcnt != 1 ||
1161	    TAILQ_FIRST(&sdbp->active_queue) != NULL ||
1162	    TAILQ_FIRST(&sdbp->join_queue) != NULL) {
1163		__db_errx(sdbp->env,
1164    "Closing a primary DB while a secondary DB has active cursors is unsafe");
1165		ret = EINVAL;
1166	}
1167	sdbp->s_refcnt = 0;
1168
1169	while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
1170		if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0)
1171			ret = t_ret;
1172
1173	F_CLR(sdbp, DB_AM_SECONDARY);
1174	return (ret);
1175}
1176
1177/*
1178 * __db_disassociate_foreign --
1179 *     Destroy the association between a given secondary and its foreign.
1180 */
1181static int
1182__db_disassociate_foreign(sdbp)
1183	DB *sdbp;
1184{
1185	DB *fdbp;
1186	DB_FOREIGN_INFO *f_info, *tmp;
1187	int ret;
1188
1189	if (sdbp->s_foreign == NULL)
1190		return (0);
1191	if ((ret = __os_malloc(sdbp->env, sizeof(DB_FOREIGN_INFO), &tmp)) != 0)
1192		return (ret);
1193
1194	fdbp = sdbp->s_foreign;
1195	ret = 0;
1196	f_info = LIST_FIRST(&fdbp->f_primaries);
1197	while (f_info != NULL) {
1198		tmp = LIST_NEXT(f_info, f_links);
1199		if (f_info ->dbp == sdbp) {
1200			LIST_REMOVE(f_info, f_links);
1201			__os_free(sdbp->env, f_info);
1202		}
1203		f_info = tmp;
1204	}
1205
1206	return (ret);
1207}
1208
1209/*
1210 * __db_log_page
1211 *	Log a meta-data or root page during a subdatabase create operation.
1212 *
1213 * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
1214 */
1215int
1216__db_log_page(dbp, txn, lsn, pgno, page)
1217	DB *dbp;
1218	DB_TXN *txn;
1219	DB_LSN *lsn;
1220	db_pgno_t pgno;
1221	PAGE *page;
1222{
1223	DBT page_dbt;
1224	DB_LSN new_lsn;
1225	int ret;
1226
1227	if (!LOGGING_ON(dbp->env) || txn == NULL)
1228		return (0);
1229
1230	memset(&page_dbt, 0, sizeof(page_dbt));
1231	page_dbt.size = dbp->pgsize;
1232	page_dbt.data = page;
1233
1234	ret = __crdel_metasub_log(dbp, txn, &new_lsn, 0, pgno, &page_dbt, lsn);
1235
1236	if (ret == 0)
1237		page->lsn = new_lsn;
1238	return (ret);
1239}
1240
1241/*
1242 * __db_backup_name
1243 *	Create the backup file name for a given file.
1244 *
1245 * PUBLIC: int __db_backup_name __P((ENV *,
1246 * PUBLIC:     const char *, DB_TXN *, char **));
1247 */
1248#undef	BACKUP_PREFIX
1249#define	BACKUP_PREFIX	"__db."
1250
1251#undef	MAX_INT_TO_HEX
1252#define	MAX_INT_TO_HEX	8
1253
1254int
1255__db_backup_name(env, name, txn, backup)
1256	ENV *env;
1257	const char *name;
1258	DB_TXN *txn;
1259	char **backup;
1260{
1261	u_int32_t id;
1262	size_t len;
1263	int ret;
1264	char *p, *retp;
1265
1266	*backup = NULL;
1267
1268	/*
1269	 * Part of the name may be a full path, so we need to make sure that
1270	 * we allocate enough space for it, even in the case where we don't
1271	 * use the entire filename for the backup name.
1272	 */
1273	len = strlen(name) + strlen(BACKUP_PREFIX) + 2 * MAX_INT_TO_HEX + 1;
1274	if ((ret = __os_malloc(env, len, &retp)) != 0)
1275		return (ret);
1276
1277	/*
1278	 * Create the name.  Backup file names are in one of 2 forms: in a
1279	 * transactional env "__db.TXNID.ID", where ID is a random number,
1280	 * and in any other env "__db.FILENAME".
1281	 *
1282	 * In addition, the name passed may contain an env-relative path.
1283	 * In that case, put the "__db." in the right place (in the last
1284	 * component of the pathname).
1285	 *
1286	 * There are four cases here:
1287	 *	1. simple path w/out transaction
1288	 *	2. simple path + transaction
1289	 *	3. multi-component path w/out transaction
1290	 *	4. multi-component path + transaction
1291	 */
1292	p = __db_rpath(name);
1293	if (IS_REAL_TXN(txn)) {
1294		__os_unique_id(env, &id);
1295		if (p == NULL)				/* Case 2. */
1296			snprintf(retp, len, "%s%x.%x",
1297			    BACKUP_PREFIX, txn->txnid, id);
1298		else					/* Case 4. */
1299			snprintf(retp, len, "%.*s%x.%x",
1300			    (int)(p - name) + 1, name, txn->txnid, id);
1301	} else {
1302		if (p == NULL)				/* Case 1. */
1303			snprintf(retp, len, "%s%s", BACKUP_PREFIX, name);
1304		else					/* Case 3. */
1305			snprintf(retp, len, "%.*s%s%s",
1306			    (int)(p - name) + 1, name, BACKUP_PREFIX, p + 1);
1307	}
1308
1309	*backup = retp;
1310	return (0);
1311}
1312
1313#ifdef CONFIG_TEST
1314/*
1315 * __db_testcopy
1316 *	Create a copy of all backup files and our "main" DB.
1317 *
1318 * PUBLIC: #ifdef CONFIG_TEST
1319 * PUBLIC: int __db_testcopy __P((ENV *, DB *, const char *));
1320 * PUBLIC: #endif
1321 */
1322int
1323__db_testcopy(env, dbp, name)
1324	ENV *env;
1325	DB *dbp;
1326	const char *name;
1327{
1328	DB_MPOOL *dbmp;
1329	DB_MPOOLFILE *mpf;
1330
1331	DB_ASSERT(env, dbp != NULL || name != NULL);
1332
1333	if (name == NULL) {
1334		dbmp = env->mp_handle;
1335		mpf = dbp->mpf;
1336		name = R_ADDR(dbmp->reginfo, mpf->mfp->path_off);
1337	}
1338
1339	if (dbp != NULL && dbp->type == DB_QUEUE)
1340		return (__qam_testdocopy(dbp, name));
1341	else
1342		return (__db_testdocopy(env, name));
1343}
1344
1345static int
1346__qam_testdocopy(dbp, name)
1347	DB *dbp;
1348	const char *name;
1349{
1350	DB_THREAD_INFO *ip;
1351	QUEUE_FILELIST *filelist, *fp;
1352	int ret;
1353	char buf[DB_MAXPATHLEN], *dir;
1354
1355	filelist = NULL;
1356	if ((ret = __db_testdocopy(dbp->env, name)) != 0)
1357		return (ret);
1358
1359	/* Call ENV_GET_THREAD_INFO to get a valid DB_THREAD_INFO */
1360	ENV_GET_THREAD_INFO(dbp->env, ip);
1361	if (dbp->mpf != NULL &&
1362	    (ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
1363		goto done;
1364
1365	if (filelist == NULL)
1366		return (0);
1367	dir = ((QUEUE *)dbp->q_internal)->dir;
1368	for (fp = filelist; fp->mpf != NULL; fp++) {
1369		snprintf(buf, sizeof(buf),
1370		    QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id);
1371		if ((ret = __db_testdocopy(dbp->env, buf)) != 0)
1372			return (ret);
1373	}
1374
1375done:	__os_free(dbp->env, filelist);
1376	return (0);
1377}
1378
1379/*
1380 * __db_testdocopy
1381 *	Create a copy of all backup files and our "main" DB.
1382 */
1383static int
1384__db_testdocopy(env, name)
1385	ENV *env;
1386	const char *name;
1387{
1388	size_t len;
1389	int dircnt, i, ret;
1390	char *copy, **namesp, *p, *real_name;
1391
1392	dircnt = 0;
1393	copy = NULL;
1394	namesp = NULL;
1395
1396	/* Create the real backing file name. */
1397	if ((ret = __db_appname(env,
1398	    DB_APP_DATA, name, 0, NULL, &real_name)) != 0)
1399		return (ret);
1400
1401	/*
1402	 * !!!
1403	 * There are tests that attempt to copy non-existent files.  I'd guess
1404	 * it's a testing bug, but I don't have time to figure it out.  Block
1405	 * the case here.
1406	 */
1407	if (__os_exists(env, real_name, NULL) != 0) {
1408		__os_free(env, real_name);
1409		return (0);
1410	}
1411
1412	/*
1413	 * Copy the file itself.
1414	 *
1415	 * Allocate space for the file name, including adding an ".afterop" and
1416	 * trailing nul byte.
1417	 */
1418	len = strlen(real_name) + sizeof(".afterop");
1419	if ((ret = __os_malloc(env, len, &copy)) != 0)
1420		goto err;
1421	snprintf(copy, len, "%s.afterop", real_name);
1422	if ((ret = __db_makecopy(env, real_name, copy)) != 0)
1423		goto err;
1424
1425	/*
1426	 * Get the directory path to call __os_dirlist().
1427	 */
1428	if ((p = __db_rpath(real_name)) != NULL)
1429		*p = '\0';
1430	if ((ret = __os_dirlist(env, real_name, 0, &namesp, &dircnt)) != 0)
1431		goto err;
1432
1433	/*
1434	 * Walk the directory looking for backup files.  Backup file names in
1435	 * transactional environments are of the form:
1436	 *
1437	 *	BACKUP_PREFIX.TXNID.ID
1438	 */
1439	for (i = 0; i < dircnt; i++) {
1440		/* Check for a related backup file name. */
1441		if (strncmp(
1442		    namesp[i], BACKUP_PREFIX, sizeof(BACKUP_PREFIX) - 1) != 0)
1443			continue;
1444		p = namesp[i] + sizeof(BACKUP_PREFIX);
1445		p += strspn(p, "0123456789ABCDEFabcdef");
1446		if (*p != '.')
1447			continue;
1448		++p;
1449		p += strspn(p, "0123456789ABCDEFabcdef");
1450		if (*p != '\0')
1451			continue;
1452
1453		/*
1454		 * Copy the backup file.
1455		 *
1456		 * Allocate space for the file name, including adding a
1457		 * ".afterop" and trailing nul byte.
1458		 */
1459		if (real_name != NULL) {
1460			__os_free(env, real_name);
1461			real_name = NULL;
1462		}
1463		if ((ret = __db_appname(
1464		    env, DB_APP_DATA, namesp[i], 0, NULL, &real_name)) != 0)
1465			goto err;
1466		if (copy != NULL) {
1467			__os_free(env, copy);
1468			copy = NULL;
1469		}
1470		len = strlen(real_name) + sizeof(".afterop");
1471		if ((ret = __os_malloc(env, len, &copy)) != 0)
1472			goto err;
1473		snprintf(copy, len, "%s.afterop", real_name);
1474		if ((ret = __db_makecopy(env, real_name, copy)) != 0)
1475			goto err;
1476	}
1477
1478err:	if (namesp != NULL)
1479		__os_dirfree(env, namesp, dircnt);
1480	if (copy != NULL)
1481		__os_free(env, copy);
1482	if (real_name != NULL)
1483		__os_free(env, real_name);
1484	return (ret);
1485}
1486
1487static int
1488__db_makecopy(env, src, dest)
1489	ENV *env;
1490	const char *src, *dest;
1491{
1492	DB_FH *rfhp, *wfhp;
1493	size_t rcnt, wcnt;
1494	int ret;
1495	char *buf;
1496
1497	rfhp = wfhp = NULL;
1498
1499	if ((ret = __os_malloc(env, 64 * 1024, &buf)) != 0)
1500		goto err;
1501
1502	if ((ret = __os_open(env, src, 0,
1503	    DB_OSO_RDONLY, DB_MODE_600, &rfhp)) != 0)
1504		goto err;
1505	if ((ret = __os_open(env, dest, 0,
1506	    DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0)
1507		goto err;
1508
1509	for (;;) {
1510		if ((ret =
1511		    __os_read(env, rfhp, buf, sizeof(buf), &rcnt)) != 0)
1512			goto err;
1513		if (rcnt == 0)
1514			break;
1515		if ((ret =
1516		    __os_write(env, wfhp, buf, sizeof(buf), &wcnt)) != 0)
1517			goto err;
1518	}
1519
1520	if (0) {
1521err:		__db_err(env, ret, "__db_makecopy: %s -> %s", src, dest);
1522	}
1523
1524	if (buf != NULL)
1525		__os_free(env, buf);
1526	if (rfhp != NULL)
1527		(void)__os_closehandle(env, rfhp);
1528	if (wfhp != NULL)
1529		(void)__os_closehandle(env, wfhp);
1530	return (ret);
1531}
1532#endif
1533