1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: db_open.c,v 12.43 2008/01/08 20:58:10 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/db_page.h"
13#include "dbinc/db_swap.h"
14#include "dbinc/btree.h"
15#include "dbinc/crypto.h"
16#include "dbinc/hmac.h"
17#include "dbinc/fop.h"
18#include "dbinc/hash.h"
19#include "dbinc/lock.h"
20#include "dbinc/log.h"
21#include "dbinc/mp.h"
22#include "dbinc/qam.h"
23#include "dbinc/txn.h"
24
25/*
26 * __db_open --
27 *	DB->open method.
28 *
29 * This routine gets called in three different ways:
30 *
31 * 1. It can be called to open a file/database.  In this case, subdb will
32 *    be NULL and meta_pgno will be PGNO_BASE_MD.
33 * 2. It can be called to open a subdatabase during normal operation.  In
34 *    this case, name and subname will both be non-NULL and meta_pgno will
35 *    be PGNO_BASE_MD (also PGNO_INVALID).
36 * 3. It can be called to open an in-memory database (name == NULL;
37 *    subname = name).
38 * 4. It can be called during recovery to open a file/database, in which case
39 *    name will be non-NULL, subname will be NULL, and meta-pgno will be
40 *    PGNO_BASE_MD.
41 * 5. It can be called during recovery to open a subdatabase, in which case
42 *    name will be non-NULL, subname may be NULL and meta-pgno will be
43 *    a valid pgno (i.e., not PGNO_BASE_MD).
44 * 6. It can be called during recovery to open an in-memory database.
45 *
46 * PUBLIC: int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *,
47 * PUBLIC:     const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t));
48 */
49int
50__db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno)
51	DB *dbp;
52	DB_THREAD_INFO *ip;
53	DB_TXN *txn;
54	const char *fname, *dname;
55	DBTYPE type;
56	u_int32_t flags;
57	int mode;
58	db_pgno_t meta_pgno;
59{
60	ENV *env;
61	int ret;
62	u_int32_t id;
63
64	env = dbp->env;
65	id = TXN_INVALID;
66
67	DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, fname);
68
69	/*
70	 * If the environment was configured with threads, the DB handle
71	 * must also be free-threaded, so we force the DB_THREAD flag on.
72	 * (See SR #2033 for why this is a requirement--recovery needs
73	 * to be able to grab a dbp using __db_fileid_to_dbp, and it has
74	 * no way of knowing which dbp goes with which thread, so whichever
75	 * one it finds has to be usable in any of them.)
76	 */
77	if (F_ISSET(env, ENV_THREAD))
78		LF_SET(DB_THREAD);
79
80	/* Convert any DB->open flags. */
81	if (LF_ISSET(DB_RDONLY))
82		F_SET(dbp, DB_AM_RDONLY);
83	if (LF_ISSET(DB_READ_UNCOMMITTED))
84		F_SET(dbp, DB_AM_READ_UNCOMMITTED);
85
86	if (IS_REAL_TXN(txn))
87		F_SET(dbp, DB_AM_TXN);
88
89	/* Fill in the type. */
90	dbp->type = type;
91
92	/*
93	 * If both fname and subname are NULL, it's always a create, so make
94	 * sure that we have both DB_CREATE and a type specified.  It would
95	 * be nice if this checking were done in __db_open where most of the
96	 * interface checking is done, but this interface (__db_dbopen) is
97	 * used by the recovery and limbo system, so we need to safeguard
98	 * this interface as well.
99	 */
100	if (fname == NULL) {
101		if (dname == NULL) {
102			if (!LF_ISSET(DB_CREATE)) {
103				__db_errx(env,
104			    "DB_CREATE must be specified to create databases.");
105				return (ENOENT);
106			}
107
108			F_SET(dbp, DB_AM_INMEM);
109			F_SET(dbp, DB_AM_CREATED);
110
111			if (dbp->type == DB_UNKNOWN) {
112				__db_errx(env,
113				    "DBTYPE of unknown without existing file");
114				return (EINVAL);
115			}
116
117			if (dbp->pgsize == 0)
118				dbp->pgsize = DB_DEF_IOSIZE;
119
120			/*
121			 * If the file is a temporary file and we're
122			 * doing locking, then we have to create a
123			 * unique file ID.  We can't use our normal
124			 * dev/inode pair (or whatever this OS uses
125			 * in place of dev/inode pairs) because no
126			 * backing file will be created until the
127			 * mpool cache is filled forcing the buffers
128			 * to disk.  Grab a random locker ID to use
129			 * as a file ID.  The created ID must never
130			 * match a potential real file ID -- we know
131			 * it won't because real file IDs contain a
132			 * time stamp after the dev/inode pair, and
133			 * we're simply storing a 4-byte value.
134
135			 * !!!
136			 * Store the locker in the file id structure
137			 * -- we can get it from there as necessary,
138			 * and it saves having two copies.
139			*/
140			if (LOCKING_ON(env) && (ret = __lock_id(env,
141			    (u_int32_t *)dbp->fileid, NULL)) != 0)
142				return (ret);
143		} else
144			MAKE_INMEM(dbp);
145
146		/*
147		 * Normally we would do handle locking here, however, with
148		 * in-memory files, we cannot do any database manipulation
149		 * until the mpool is open, so it happens later.
150		 */
151	} else if (dname == NULL && meta_pgno == PGNO_BASE_MD) {
152		/* Open/create the underlying file.  Acquire locks. */
153		if ((ret = __fop_file_setup(dbp, ip,
154		    txn, fname, mode, flags, &id)) != 0)
155			return (ret);
156	} else {
157		if ((ret = __fop_subdb_setup(dbp, ip,
158		    txn, fname, dname, mode, flags)) != 0)
159			return (ret);
160		meta_pgno = dbp->meta_pgno;
161	}
162
163	/*
164	 * If we created the file, set the truncate flag for the mpool.  This
165	 * isn't for anything we've done, it's protection against stupid user
166	 * tricks: if the user deleted a file behind Berkeley DB's back, we
167	 * may still have pages in the mpool that match the file's "unique" ID.
168	 *
169	 * Note that if we're opening a subdatabase, we don't want to set
170	 * the TRUNCATE flag even if we just created the file--we already
171	 * opened and updated the master using access method interfaces,
172	 * so we don't want to get rid of any pages that are in the mpool.
173	 * If we created the file when we opened the master, we already hit
174	 * this check in a non-subdatabase context then.
175	 */
176	if (dname == NULL && F_ISSET(dbp, DB_AM_CREATED))
177		LF_SET(DB_TRUNCATE);
178
179	/* Set up the underlying environment. */
180	if ((ret = __env_setup(dbp, txn, fname, dname, id, flags)) != 0)
181		return (ret);
182
183	/* For in-memory databases, we now need to open/create the database. */
184	if (F_ISSET(dbp, DB_AM_INMEM)) {
185		if (dname == NULL)
186			ret = __db_new_file(dbp, ip, txn, NULL, NULL);
187		else {
188			id = TXN_INVALID;
189			if ((ret = __fop_file_setup(dbp, ip,
190			    txn, dname, mode, flags, &id)) == 0 &&
191			    DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER)
192#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
193			    && txn != NULL
194#endif
195#if !defined(DEBUG_ROP)
196			    && !F_ISSET(dbp, DB_AM_RDONLY)
197#endif
198			)
199				ret = __dbreg_log_id(dbp,
200				    txn, dbp->log_filename->id, 1);
201		}
202		if (ret != 0)
203			goto err;
204	}
205
206	switch (dbp->type) {
207		case DB_BTREE:
208			ret = __bam_open(dbp, ip, txn, fname, meta_pgno, flags);
209			break;
210		case DB_HASH:
211			ret = __ham_open(dbp, ip, txn, fname, meta_pgno, flags);
212			break;
213		case DB_RECNO:
214			ret = __ram_open(dbp, ip, txn, fname, meta_pgno, flags);
215			break;
216		case DB_QUEUE:
217			ret = __qam_open(
218			    dbp, ip, txn, fname, meta_pgno, mode, flags);
219			break;
220		case DB_UNKNOWN:
221			return (
222			    __db_unknown_type(env, "__db_dbopen", dbp->type));
223	}
224	if (ret != 0)
225		goto err;
226
227	DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, fname);
228
229	/*
230	 * Temporary files don't need handle locks, so we only have to check
231	 * for a handle lock downgrade or lockevent in the case of named
232	 * files.
233	 */
234	if (!F_ISSET(dbp, DB_AM_RECOVER) && (fname != NULL || dname != NULL) &&
235	    LOCK_ISSET(dbp->handle_lock)) {
236		if (IS_REAL_TXN(txn))
237			ret = __txn_lockevent(env,
238			    txn, dbp, &dbp->handle_lock, dbp->locker);
239		else if (LOCKING_ON(env))
240			/* Trade write handle lock for read handle lock. */
241			ret = __lock_downgrade(env,
242			    &dbp->handle_lock, DB_LOCK_READ, 0);
243	}
244DB_TEST_RECOVERY_LABEL
245err:
246	return (ret);
247}
248
249/*
250 * __db_get_open_flags --
251 *	Accessor for flags passed into DB->open call
252 *
253 * PUBLIC: int __db_get_open_flags __P((DB *, u_int32_t *));
254 */
255int
256__db_get_open_flags(dbp, flagsp)
257	DB *dbp;
258	u_int32_t *flagsp;
259{
260	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_open_flags");
261
262	*flagsp = dbp->open_flags;
263	return (0);
264}
265
266/*
267 * __db_new_file --
268 *	Create a new database file.
269 *
270 * PUBLIC: int __db_new_file __P((DB *,
271 * PUBLIC:      DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
272 */
273int
274__db_new_file(dbp, ip, txn, fhp, name)
275	DB *dbp;
276	DB_THREAD_INFO *ip;
277	DB_TXN *txn;
278	DB_FH *fhp;
279	const char *name;
280{
281	int ret;
282
283	switch (dbp->type) {
284	case DB_BTREE:
285	case DB_RECNO:
286		ret = __bam_new_file(dbp, ip, txn, fhp, name);
287		break;
288	case DB_HASH:
289		ret = __ham_new_file(dbp, ip, txn, fhp, name);
290		break;
291	case DB_QUEUE:
292		ret = __qam_new_file(dbp, ip, txn, fhp, name);
293		break;
294	case DB_UNKNOWN:
295	default:
296		__db_errx(dbp->env,
297		    "%s: Invalid type %d specified", name, dbp->type);
298		ret = EINVAL;
299		break;
300	}
301
302	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
303	/* Sync the file in preparation for moving it into place. */
304	if (ret == 0 && fhp != NULL)
305		ret = __os_fsync(dbp->env, fhp);
306
307	DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
308
309DB_TEST_RECOVERY_LABEL
310	return (ret);
311}
312
313/*
314 * __db_init_subdb --
315 *	Initialize the dbp for a subdb.
316 *
317 * PUBLIC: int __db_init_subdb __P((DB *,
318 * PUBLIC:       DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
319 */
320int
321__db_init_subdb(mdbp, dbp, name, ip, txn)
322	DB *mdbp, *dbp;
323	const char *name;
324	DB_THREAD_INFO *ip;
325	DB_TXN *txn;
326{
327	DBMETA *meta;
328	DB_MPOOLFILE *mpf;
329	int ret, t_ret;
330
331	ret = 0;
332	if (!F_ISSET(dbp, DB_AM_CREATED)) {
333		/* Subdb exists; read meta-data page and initialize. */
334		mpf = mdbp->mpf;
335		if  ((ret = __memp_fget(mpf, &dbp->meta_pgno,
336		    ip, txn, 0, &meta)) != 0)
337			goto err;
338		ret = __db_meta_setup(mdbp->env, dbp, name, meta, 0, 0);
339		if ((t_ret = __memp_fput(mpf,
340		    ip, meta, dbp->priority)) != 0 && ret == 0)
341			ret = t_ret;
342		/*
343		 * If __db_meta_setup found that the meta-page hadn't
344		 * been written out during recovery, we can just return.
345		 */
346		if (ret == ENOENT)
347			ret = 0;
348		goto err;
349	}
350
351	/* Handle the create case here. */
352	switch (dbp->type) {
353	case DB_BTREE:
354	case DB_RECNO:
355		ret = __bam_new_subdb(mdbp, dbp, ip, txn);
356		break;
357	case DB_HASH:
358		ret = __ham_new_subdb(mdbp, dbp, ip, txn);
359		break;
360	case DB_QUEUE:
361		ret = EINVAL;
362		break;
363	case DB_UNKNOWN:
364	default:
365		__db_errx(dbp->env,
366		    "Invalid subdatabase type %d specified", dbp->type);
367		return (EINVAL);
368	}
369
370err:	return (ret);
371}
372
373/*
374 * __db_chk_meta --
375 *	Take a buffer containing a meta-data page and check it for a valid LSN,
376 *	checksum (and verify the checksum if necessary) and possibly decrypt it.
377 *
378 *	Return 0 on success, >0 (errno) on error, -1 on checksum mismatch.
379 *
380 * PUBLIC: int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
381 */
382int
383__db_chk_meta(env, dbp, meta, flags)
384	ENV *env;
385	DB *dbp;
386	DBMETA *meta;
387	u_int32_t flags;
388{
389	DB_LSN swap_lsn;
390	int is_hmac, ret, swapped;
391	u_int32_t magic, orig_chk;
392	u_int8_t *chksum;
393
394	ret = 0;
395	swapped = 0;
396
397	if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) {
398		if (dbp != NULL)
399			F_SET(dbp, DB_AM_CHKSUM);
400
401		is_hmac = meta->encrypt_alg == 0 ? 0 : 1;
402		chksum = ((BTMETA *)meta)->chksum;
403
404		/*
405		 * If we need to swap, the checksum function overwrites the
406		 * original checksum with 0, so we need to save a copy of the
407		 * original for swapping later.
408		 */
409		orig_chk = *(u_int32_t *)chksum;
410
411		/*
412		 * We cannot add this to __db_metaswap because that gets done
413		 * later after we've verified the checksum or decrypted.
414		 */
415		if (LF_ISSET(DB_CHK_META)) {
416			swapped = 0;
417chk_retry:		if ((ret =
418			    __db_check_chksum(env, NULL, env->crypto_handle,
419			    chksum, meta, DBMETASIZE, is_hmac)) != 0) {
420				if (is_hmac || swapped)
421					return (ret);
422
423				M_32_SWAP(orig_chk);
424				swapped = 1;
425				*(u_int32_t *)chksum = orig_chk;
426				goto chk_retry;
427			}
428		}
429	} else if (dbp != NULL)
430		F_CLR(dbp, DB_AM_CHKSUM);
431
432#ifdef HAVE_CRYPTO
433	ret = __crypto_decrypt_meta(env,
434	     dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META));
435#endif
436
437	/* Now that we're decrypted, we can check LSN. */
438	if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) {
439		/*
440		 * This gets called both before and after swapping, so we
441		 * need to check ourselves.  If we already swapped it above,
442		 * we'll know that here.
443		 */
444
445		swap_lsn = meta->lsn;
446		magic = meta->magic;
447lsn_retry:
448		if (swapped) {
449			M_32_SWAP(swap_lsn.file);
450			M_32_SWAP(swap_lsn.offset);
451			M_32_SWAP(magic);
452		}
453		switch (magic) {
454		case DB_BTREEMAGIC:
455		case DB_HASHMAGIC:
456		case DB_QAMMAGIC:
457		case DB_RENAMEMAGIC:
458			break;
459		default:
460			if (swapped)
461				return (EINVAL);
462			swapped = 1;
463			goto lsn_retry;
464		}
465		if (!IS_REP_CLIENT(env) &&
466		    !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn))
467			/* Need to do check. */
468			ret = __log_check_page_lsn(env, dbp, &swap_lsn);
469	}
470	return (ret);
471}
472
473/*
474 * __db_meta_setup --
475 *
476 * Take a buffer containing a meta-data page and figure out if it's
477 * valid, and if so, initialize the dbp from the meta-data page.
478 *
479 * PUBLIC: int __db_meta_setup __P((ENV *,
480 * PUBLIC:     DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
481 */
482int
483__db_meta_setup(env, dbp, name, meta, oflags, flags)
484	ENV *env;
485	DB *dbp;
486	const char *name;
487	DBMETA *meta;
488	u_int32_t oflags;
489	u_int32_t flags;
490{
491	u_int32_t magic;
492	int ret;
493
494	ret = 0;
495
496	/*
497	 * Figure out what access method we're dealing with, and then
498	 * call access method specific code to check error conditions
499	 * based on conflicts between the found file and application
500	 * arguments.  A found file overrides some user information --
501	 * we don't consider it an error, for example, if the user set
502	 * an expected byte order and the found file doesn't match it.
503	 */
504	F_CLR(dbp, DB_AM_SWAP | DB_AM_IN_RENAME);
505	magic = meta->magic;
506
507swap_retry:
508	switch (magic) {
509	case DB_BTREEMAGIC:
510	case DB_HASHMAGIC:
511	case DB_QAMMAGIC:
512	case DB_RENAMEMAGIC:
513		break;
514	case 0:
515		/*
516		 * The only time this should be 0 is if we're in the
517		 * midst of opening a subdb during recovery and that
518		 * subdatabase had its meta-data page allocated, but
519		 * not yet initialized.
520		 */
521		if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(env) &&
522		    F_ISSET(env->lg_handle, DBLOG_FORCE_OPEN)) ||
523		    meta->pgno != PGNO_INVALID))
524			return (ENOENT);
525
526		goto bad_format;
527	default:
528		if (F_ISSET(dbp, DB_AM_SWAP))
529			goto bad_format;
530
531		M_32_SWAP(magic);
532		F_SET(dbp, DB_AM_SWAP);
533		goto swap_retry;
534	}
535
536	/*
537	 * We can only check the meta page if we are sure we have a meta page.
538	 * If it is random data, then this check can fail.  So only now can we
539	 * checksum and decrypt.  Don't distinguish between configuration and
540	 * checksum match errors here, because we haven't opened the database
541	 * and even a checksum error isn't a reason to panic the environment.
542	 */
543	if ((ret = __db_chk_meta(env, dbp, meta, flags)) != 0) {
544		if (ret == -1)
545			__db_errx(env,
546			    "%s: metadata page checksum error", name);
547		goto bad_format;
548	}
549
550	switch (magic) {
551	case DB_BTREEMAGIC:
552		if (dbp->type != DB_UNKNOWN &&
553		    dbp->type != DB_RECNO && dbp->type != DB_BTREE)
554			goto bad_format;
555
556		flags = meta->flags;
557		if (F_ISSET(dbp, DB_AM_SWAP))
558			M_32_SWAP(flags);
559		if (LF_ISSET(BTM_RECNO))
560			dbp->type = DB_RECNO;
561		else
562			dbp->type = DB_BTREE;
563		if ((oflags & DB_TRUNCATE) == 0 && (ret =
564		    __bam_metachk(dbp, name, (BTMETA *)meta)) != 0)
565			return (ret);
566		break;
567	case DB_HASHMAGIC:
568		if (dbp->type != DB_UNKNOWN && dbp->type != DB_HASH)
569			goto bad_format;
570
571		dbp->type = DB_HASH;
572		if ((oflags & DB_TRUNCATE) == 0 && (ret =
573		    __ham_metachk(dbp, name, (HMETA *)meta)) != 0)
574			return (ret);
575		break;
576	case DB_QAMMAGIC:
577		if (dbp->type != DB_UNKNOWN && dbp->type != DB_QUEUE)
578			goto bad_format;
579		dbp->type = DB_QUEUE;
580		if ((oflags & DB_TRUNCATE) == 0 && (ret =
581		    __qam_metachk(dbp, name, (QMETA *)meta)) != 0)
582			return (ret);
583		break;
584	case DB_RENAMEMAGIC:
585		F_SET(dbp, DB_AM_IN_RENAME);
586
587		/* Copy the file's ID. */
588		memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
589
590		break;
591	default:
592		goto bad_format;
593	}
594	return (0);
595
596bad_format:
597	if (F_ISSET(dbp, DB_AM_RECOVER))
598		ret = ENOENT;
599	else
600		__db_errx(env, "%s: unexpected file type or format", name);
601	return (ret == 0 ? EINVAL : ret);
602}
603