bt_open.c revision 17141
11573Srgrimes/*-
214287Spst * Copyright (c) 1990, 1993, 1994
31573Srgrimes *	The Regents of the University of California.  All rights reserved.
41573Srgrimes *
51573Srgrimes * This code is derived from software contributed to Berkeley by
61573Srgrimes * Mike Olson.
71573Srgrimes *
81573Srgrimes * Redistribution and use in source and binary forms, with or without
91573Srgrimes * modification, are permitted provided that the following conditions
101573Srgrimes * are met:
111573Srgrimes * 1. Redistributions of source code must retain the above copyright
121573Srgrimes *    notice, this list of conditions and the following disclaimer.
131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
141573Srgrimes *    notice, this list of conditions and the following disclaimer in the
151573Srgrimes *    documentation and/or other materials provided with the distribution.
161573Srgrimes * 3. All advertising materials mentioning features or use of this software
171573Srgrimes *    must display the following acknowledgement:
181573Srgrimes *	This product includes software developed by the University of
191573Srgrimes *	California, Berkeley and its contributors.
201573Srgrimes * 4. Neither the name of the University nor the names of its contributors
211573Srgrimes *    may be used to endorse or promote products derived from this software
221573Srgrimes *    without specific prior written permission.
231573Srgrimes *
241573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
251573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
261573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
271573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
281573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
291573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
301573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
311573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
321573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
331573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
341573Srgrimes * SUCH DAMAGE.
351573Srgrimes */
361573Srgrimes
371573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
3814287Spststatic char sccsid[] = "@(#)bt_open.c	8.10 (Berkeley) 8/17/94";
391573Srgrimes#endif /* LIBC_SCCS and not lint */
401573Srgrimes
411573Srgrimes/*
421573Srgrimes * Implementation of btree access method for 4.4BSD.
431573Srgrimes *
441573Srgrimes * The design here was originally based on that of the btree access method
451573Srgrimes * used in the Postgres database system at UC Berkeley.  This implementation
461573Srgrimes * is wholly independent of the Postgres code.
471573Srgrimes */
481573Srgrimes
491573Srgrimes#include <sys/param.h>
501573Srgrimes#include <sys/stat.h>
511573Srgrimes
521573Srgrimes#include <errno.h>
531573Srgrimes#include <fcntl.h>
541573Srgrimes#include <limits.h>
551573Srgrimes#include <signal.h>
561573Srgrimes#include <stdio.h>
571573Srgrimes#include <stdlib.h>
581573Srgrimes#include <string.h>
591573Srgrimes#include <unistd.h>
601573Srgrimes
611573Srgrimes#include <db.h>
621573Srgrimes#include "btree.h"
631573Srgrimes
6414287Spst#ifdef DEBUG
6514287Spst#undef	MINPSIZE
6614287Spst#define	MINPSIZE	128
6714287Spst#endif
6814287Spst
691573Srgrimesstatic int byteorder __P((void));
701573Srgrimesstatic int nroot __P((BTREE *));
711573Srgrimesstatic int tmp __P((void));
721573Srgrimes
731573Srgrimes/*
741573Srgrimes * __BT_OPEN -- Open a btree.
751573Srgrimes *
761573Srgrimes * Creates and fills a DB struct, and calls the routine that actually
771573Srgrimes * opens the btree.
781573Srgrimes *
791573Srgrimes * Parameters:
801573Srgrimes *	fname:	filename (NULL for in-memory trees)
811573Srgrimes *	flags:	open flag bits
821573Srgrimes *	mode:	open permission bits
831573Srgrimes *	b:	BTREEINFO pointer
841573Srgrimes *
851573Srgrimes * Returns:
861573Srgrimes *	NULL on failure, pointer to DB on success.
871573Srgrimes *
881573Srgrimes */
891573SrgrimesDB *
901573Srgrimes__bt_open(fname, flags, mode, openinfo, dflags)
911573Srgrimes	const char *fname;
921573Srgrimes	int flags, mode, dflags;
931573Srgrimes	const BTREEINFO *openinfo;
941573Srgrimes{
951573Srgrimes	struct stat sb;
961573Srgrimes	BTMETA m;
971573Srgrimes	BTREE *t;
981573Srgrimes	BTREEINFO b;
991573Srgrimes	DB *dbp;
1001573Srgrimes	pgno_t ncache;
1011573Srgrimes	ssize_t nr;
1021573Srgrimes	int machine_lorder;
1031573Srgrimes
1041573Srgrimes	t = NULL;
1051573Srgrimes
1061573Srgrimes	/*
1071573Srgrimes	 * Intention is to make sure all of the user's selections are okay
1081573Srgrimes	 * here and then use them without checking.  Can't be complete, since
1091573Srgrimes	 * we don't know the right page size, lorder or flags until the backing
1101573Srgrimes	 * file is opened.  Also, the file's page size can cause the cachesize
1111573Srgrimes	 * to change.
1121573Srgrimes	 */
1131573Srgrimes	machine_lorder = byteorder();
1141573Srgrimes	if (openinfo) {
1151573Srgrimes		b = *openinfo;
1161573Srgrimes
1171573Srgrimes		/* Flags: R_DUP. */
1181573Srgrimes		if (b.flags & ~(R_DUP))
1191573Srgrimes			goto einval;
1201573Srgrimes
1211573Srgrimes		/*
1221573Srgrimes		 * Page size must be indx_t aligned and >= MINPSIZE.  Default
1231573Srgrimes		 * page size is set farther on, based on the underlying file
1241573Srgrimes		 * transfer size.
1251573Srgrimes		 */
1261573Srgrimes		if (b.psize &&
1271573Srgrimes		    (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
12817141Sjkh		    b.psize & (sizeof(indx_t) - 1) ))
1291573Srgrimes			goto einval;
1301573Srgrimes
1311573Srgrimes		/* Minimum number of keys per page; absolute minimum is 2. */
1321573Srgrimes		if (b.minkeypage) {
1331573Srgrimes			if (b.minkeypage < 2)
1341573Srgrimes				goto einval;
1351573Srgrimes		} else
1361573Srgrimes			b.minkeypage = DEFMINKEYPAGE;
1371573Srgrimes
1381573Srgrimes		/* If no comparison, use default comparison and prefix. */
1391573Srgrimes		if (b.compare == NULL) {
1401573Srgrimes			b.compare = __bt_defcmp;
1411573Srgrimes			if (b.prefix == NULL)
1421573Srgrimes				b.prefix = __bt_defpfx;
1431573Srgrimes		}
1441573Srgrimes
1451573Srgrimes		if (b.lorder == 0)
1461573Srgrimes			b.lorder = machine_lorder;
1471573Srgrimes	} else {
1481573Srgrimes		b.compare = __bt_defcmp;
1491573Srgrimes		b.cachesize = 0;
1501573Srgrimes		b.flags = 0;
1511573Srgrimes		b.lorder = machine_lorder;
1521573Srgrimes		b.minkeypage = DEFMINKEYPAGE;
1531573Srgrimes		b.prefix = __bt_defpfx;
1541573Srgrimes		b.psize = 0;
1551573Srgrimes	}
1561573Srgrimes
1571573Srgrimes	/* Check for the ubiquitous PDP-11. */
1581573Srgrimes	if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
1591573Srgrimes		goto einval;
1601573Srgrimes
1611573Srgrimes	/* Allocate and initialize DB and BTREE structures. */
1621573Srgrimes	if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
1631573Srgrimes		goto err;
1641573Srgrimes	memset(t, 0, sizeof(BTREE));
1651573Srgrimes	t->bt_fd = -1;			/* Don't close unopened fd on error. */
1661573Srgrimes	t->bt_lorder = b.lorder;
1671573Srgrimes	t->bt_order = NOT;
1681573Srgrimes	t->bt_cmp = b.compare;
1691573Srgrimes	t->bt_pfx = b.prefix;
1701573Srgrimes	t->bt_rfd = -1;
1711573Srgrimes
1721573Srgrimes	if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
1731573Srgrimes		goto err;
17414287Spst	memset(t->bt_dbp, 0, sizeof(DB));
1751573Srgrimes	if (t->bt_lorder != machine_lorder)
17614287Spst		F_SET(t, B_NEEDSWAP);
1771573Srgrimes
1781573Srgrimes	dbp->type = DB_BTREE;
1791573Srgrimes	dbp->internal = t;
1801573Srgrimes	dbp->close = __bt_close;
1811573Srgrimes	dbp->del = __bt_delete;
1821573Srgrimes	dbp->fd = __bt_fd;
1831573Srgrimes	dbp->get = __bt_get;
1841573Srgrimes	dbp->put = __bt_put;
1851573Srgrimes	dbp->seq = __bt_seq;
1861573Srgrimes	dbp->sync = __bt_sync;
1871573Srgrimes
1881573Srgrimes	/*
1891573Srgrimes	 * If no file name was supplied, this is an in-memory btree and we
1901573Srgrimes	 * open a backing temporary file.  Otherwise, it's a disk-based tree.
1911573Srgrimes	 */
1921573Srgrimes	if (fname) {
19314287Spst		switch (flags & O_ACCMODE) {
1941573Srgrimes		case O_RDONLY:
19514287Spst			F_SET(t, B_RDONLY);
1961573Srgrimes			break;
1971573Srgrimes		case O_RDWR:
1981573Srgrimes			break;
1991573Srgrimes		case O_WRONLY:
2001573Srgrimes		default:
2011573Srgrimes			goto einval;
2021573Srgrimes		}
20314287Spst
2041573Srgrimes		if ((t->bt_fd = open(fname, flags, mode)) < 0)
2051573Srgrimes			goto err;
2061573Srgrimes
2071573Srgrimes	} else {
2081573Srgrimes		if ((flags & O_ACCMODE) != O_RDWR)
2091573Srgrimes			goto einval;
2101573Srgrimes		if ((t->bt_fd = tmp()) == -1)
2111573Srgrimes			goto err;
21214287Spst		F_SET(t, B_INMEM);
2131573Srgrimes	}
2141573Srgrimes
2151573Srgrimes	if (fcntl(t->bt_fd, F_SETFD, 1) == -1)
2161573Srgrimes		goto err;
2171573Srgrimes
2181573Srgrimes	if (fstat(t->bt_fd, &sb))
2191573Srgrimes		goto err;
2201573Srgrimes	if (sb.st_size) {
22114287Spst		if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
2221573Srgrimes			goto err;
2231573Srgrimes		if (nr != sizeof(BTMETA))
2241573Srgrimes			goto eftype;
2251573Srgrimes
2261573Srgrimes		/*
2271573Srgrimes		 * Read in the meta-data.  This can change the notion of what
2281573Srgrimes		 * the lorder, page size and flags are, and, when the page size
2291573Srgrimes		 * changes, the cachesize value can change too.  If the user
2301573Srgrimes		 * specified the wrong byte order for an existing database, we
2311573Srgrimes		 * don't bother to return an error, we just clear the NEEDSWAP
2321573Srgrimes		 * bit.
2331573Srgrimes		 */
23414287Spst		if (m.magic == BTREEMAGIC)
23514287Spst			F_CLR(t, B_NEEDSWAP);
2361573Srgrimes		else {
23714287Spst			F_SET(t, B_NEEDSWAP);
23814287Spst			M_32_SWAP(m.magic);
23914287Spst			M_32_SWAP(m.version);
24014287Spst			M_32_SWAP(m.psize);
24114287Spst			M_32_SWAP(m.free);
24214287Spst			M_32_SWAP(m.nrecs);
24314287Spst			M_32_SWAP(m.flags);
2441573Srgrimes		}
24514287Spst		if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
2461573Srgrimes			goto eftype;
24714287Spst		if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
24817141Sjkh		    m.psize & (sizeof(indx_t) - 1) )
2491573Srgrimes			goto eftype;
25014287Spst		if (m.flags & ~SAVEMETA)
2511573Srgrimes			goto eftype;
25214287Spst		b.psize = m.psize;
25314287Spst		F_SET(t, m.flags);
25414287Spst		t->bt_free = m.free;
25514287Spst		t->bt_nrecs = m.nrecs;
2561573Srgrimes	} else {
2571573Srgrimes		/*
2581573Srgrimes		 * Set the page size to the best value for I/O to this file.
2591573Srgrimes		 * Don't overflow the page offset type.
2601573Srgrimes		 */
2611573Srgrimes		if (b.psize == 0) {
2621573Srgrimes			b.psize = sb.st_blksize;
2631573Srgrimes			if (b.psize < MINPSIZE)
2641573Srgrimes				b.psize = MINPSIZE;
2651573Srgrimes			if (b.psize > MAX_PAGE_OFFSET + 1)
2661573Srgrimes				b.psize = MAX_PAGE_OFFSET + 1;
2671573Srgrimes		}
2681573Srgrimes
2691573Srgrimes		/* Set flag if duplicates permitted. */
2701573Srgrimes		if (!(b.flags & R_DUP))
27114287Spst			F_SET(t, B_NODUPS);
2721573Srgrimes
2731573Srgrimes		t->bt_free = P_INVALID;
2741573Srgrimes		t->bt_nrecs = 0;
27514287Spst		F_SET(t, B_METADIRTY);
2761573Srgrimes	}
2771573Srgrimes
2781573Srgrimes	t->bt_psize = b.psize;
2791573Srgrimes
2801573Srgrimes	/* Set the cache size; must be a multiple of the page size. */
28117141Sjkh	if (b.cachesize && b.cachesize & (b.psize - 1) )
28217141Sjkh		b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1;
2831573Srgrimes	if (b.cachesize < b.psize * MINCACHE)
2841573Srgrimes		b.cachesize = b.psize * MINCACHE;
2851573Srgrimes
2861573Srgrimes	/* Calculate number of pages to cache. */
2871573Srgrimes	ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
2881573Srgrimes
2891573Srgrimes	/*
2901573Srgrimes	 * The btree data structure requires that at least two keys can fit on
2911573Srgrimes	 * a page, but other than that there's no fixed requirement.  The user
2921573Srgrimes	 * specified a minimum number per page, and we translated that into the
2931573Srgrimes	 * number of bytes a key/data pair can use before being placed on an
2941573Srgrimes	 * overflow page.  This calculation includes the page header, the size
2951573Srgrimes	 * of the index referencing the leaf item and the size of the leaf item
2961573Srgrimes	 * structure.  Also, don't let the user specify a minkeypage such that
2971573Srgrimes	 * a key/data pair won't fit even if both key and data are on overflow
2981573Srgrimes	 * pages.
2991573Srgrimes	 */
3001573Srgrimes	t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
3011573Srgrimes	    (sizeof(indx_t) + NBLEAFDBT(0, 0));
3021573Srgrimes	if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
3031573Srgrimes		t->bt_ovflsize =
3041573Srgrimes		    NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
3051573Srgrimes
3061573Srgrimes	/* Initialize the buffer pool. */
3071573Srgrimes	if ((t->bt_mp =
3081573Srgrimes	    mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
3091573Srgrimes		goto err;
31014287Spst	if (!F_ISSET(t, B_INMEM))
3111573Srgrimes		mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
3121573Srgrimes
3131573Srgrimes	/* Create a root page if new tree. */
3141573Srgrimes	if (nroot(t) == RET_ERROR)
3151573Srgrimes		goto err;
3161573Srgrimes
3171573Srgrimes	/* Global flags. */
3181573Srgrimes	if (dflags & DB_LOCK)
31914287Spst		F_SET(t, B_DB_LOCK);
3201573Srgrimes	if (dflags & DB_SHMEM)
32114287Spst		F_SET(t, B_DB_SHMEM);
3221573Srgrimes	if (dflags & DB_TXN)
32314287Spst		F_SET(t, B_DB_TXN);
3241573Srgrimes
3251573Srgrimes	return (dbp);
3261573Srgrimes
3271573Srgrimeseinval:	errno = EINVAL;
3281573Srgrimes	goto err;
3291573Srgrimes
3301573Srgrimeseftype:	errno = EFTYPE;
3311573Srgrimes	goto err;
3321573Srgrimes
3331573Srgrimeserr:	if (t) {
3341573Srgrimes		if (t->bt_dbp)
3351573Srgrimes			free(t->bt_dbp);
3361573Srgrimes		if (t->bt_fd != -1)
3371573Srgrimes			(void)close(t->bt_fd);
3381573Srgrimes		free(t);
3391573Srgrimes	}
3401573Srgrimes	return (NULL);
3411573Srgrimes}
3421573Srgrimes
3431573Srgrimes/*
3441573Srgrimes * NROOT -- Create the root of a new tree.
3451573Srgrimes *
3461573Srgrimes * Parameters:
3471573Srgrimes *	t:	tree
3481573Srgrimes *
3491573Srgrimes * Returns:
3501573Srgrimes *	RET_ERROR, RET_SUCCESS
3511573Srgrimes */
3521573Srgrimesstatic int
3531573Srgrimesnroot(t)
3541573Srgrimes	BTREE *t;
3551573Srgrimes{
3561573Srgrimes	PAGE *meta, *root;
3571573Srgrimes	pgno_t npg;
3581573Srgrimes
3591573Srgrimes	if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
3601573Srgrimes		mpool_put(t->bt_mp, meta, 0);
3611573Srgrimes		return (RET_SUCCESS);
3621573Srgrimes	}
36314287Spst	if (errno != EINVAL)		/* It's OK to not exist. */
3641573Srgrimes		return (RET_ERROR);
36514287Spst	errno = 0;
3661573Srgrimes
3671573Srgrimes	if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
3681573Srgrimes		return (RET_ERROR);
3691573Srgrimes
3701573Srgrimes	if ((root = mpool_new(t->bt_mp, &npg)) == NULL)
3711573Srgrimes		return (RET_ERROR);
3721573Srgrimes
3731573Srgrimes	if (npg != P_ROOT)
3741573Srgrimes		return (RET_ERROR);
3751573Srgrimes	root->pgno = npg;
3761573Srgrimes	root->prevpg = root->nextpg = P_INVALID;
3771573Srgrimes	root->lower = BTDATAOFF;
3781573Srgrimes	root->upper = t->bt_psize;
3791573Srgrimes	root->flags = P_BLEAF;
3801573Srgrimes	memset(meta, 0, t->bt_psize);
3811573Srgrimes	mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
3821573Srgrimes	mpool_put(t->bt_mp, root, MPOOL_DIRTY);
3831573Srgrimes	return (RET_SUCCESS);
3841573Srgrimes}
3851573Srgrimes
3861573Srgrimesstatic int
3871573Srgrimestmp()
3881573Srgrimes{
3891573Srgrimes	sigset_t set, oset;
3901573Srgrimes	int fd;
3911573Srgrimes	char *envtmp;
3921573Srgrimes	char path[MAXPATHLEN];
3931573Srgrimes
3941573Srgrimes	envtmp = getenv("TMPDIR");
3951573Srgrimes	(void)snprintf(path,
3961573Srgrimes	    sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp");
3971573Srgrimes
3981573Srgrimes	(void)sigfillset(&set);
3991573Srgrimes	(void)sigprocmask(SIG_BLOCK, &set, &oset);
4001573Srgrimes	if ((fd = mkstemp(path)) != -1)
4011573Srgrimes		(void)unlink(path);
4021573Srgrimes	(void)sigprocmask(SIG_SETMASK, &oset, NULL);
4031573Srgrimes	return(fd);
4041573Srgrimes}
4051573Srgrimes
4061573Srgrimesstatic int
4071573Srgrimesbyteorder()
4081573Srgrimes{
4091573Srgrimes	u_int32_t x;
4101573Srgrimes	u_char *p;
4111573Srgrimes
4121573Srgrimes	x = 0x01020304;
4131573Srgrimes	p = (u_char *)&x;
4141573Srgrimes	switch (*p) {
4151573Srgrimes	case 1:
4161573Srgrimes		return (BIG_ENDIAN);
4171573Srgrimes	case 4:
4181573Srgrimes		return (LITTLE_ENDIAN);
4191573Srgrimes	default:
4201573Srgrimes		return (0);
4211573Srgrimes	}
4221573Srgrimes}
4231573Srgrimes
4241573Srgrimesint
4251573Srgrimes__bt_fd(dbp)
4261573Srgrimes        const DB *dbp;
4271573Srgrimes{
4281573Srgrimes	BTREE *t;
4291573Srgrimes
4301573Srgrimes	t = dbp->internal;
4311573Srgrimes
4321573Srgrimes	/* Toss any page pinned across calls. */
4331573Srgrimes	if (t->bt_pinned != NULL) {
4341573Srgrimes		mpool_put(t->bt_mp, t->bt_pinned, 0);
4351573Srgrimes		t->bt_pinned = NULL;
4361573Srgrimes	}
4371573Srgrimes
4381573Srgrimes	/* In-memory database can't have a file descriptor. */
43914287Spst	if (F_ISSET(t, B_INMEM)) {
4401573Srgrimes		errno = ENOENT;
4411573Srgrimes		return (-1);
4421573Srgrimes	}
4431573Srgrimes	return (t->bt_fd);
4441573Srgrimes}
445