11573Srgrimes/*-
214287Spst * Copyright (c) 1990, 1993, 1994
31573Srgrimes *	The Regents of the University of California.  All rights reserved.
41573Srgrimes *
51573Srgrimes * This code is derived from software contributed to Berkeley by
61573Srgrimes * Mike Olson.
71573Srgrimes *
81573Srgrimes * Redistribution and use in source and binary forms, with or without
91573Srgrimes * modification, are permitted provided that the following conditions
101573Srgrimes * are met:
111573Srgrimes * 1. Redistributions of source code must retain the above copyright
121573Srgrimes *    notice, this list of conditions and the following disclaimer.
131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
141573Srgrimes *    notice, this list of conditions and the following disclaimer in the
151573Srgrimes *    documentation and/or other materials provided with the distribution.
161573Srgrimes * 4. Neither the name of the University nor the names of its contributors
171573Srgrimes *    may be used to endorse or promote products derived from this software
181573Srgrimes *    without specific prior written permission.
191573Srgrimes *
201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
231573Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
301573Srgrimes * SUCH DAMAGE.
311573Srgrimes */
321573Srgrimes
331573Srgrimes#if defined(LIBC_SCCS) && !defined(lint)
3414287Spststatic char sccsid[] = "@(#)bt_open.c	8.10 (Berkeley) 8/17/94";
351573Srgrimes#endif /* LIBC_SCCS and not lint */
3692986Sobrien#include <sys/cdefs.h>
3792986Sobrien__FBSDID("$FreeBSD$");
381573Srgrimes
391573Srgrimes/*
401573Srgrimes * Implementation of btree access method for 4.4BSD.
411573Srgrimes *
421573Srgrimes * The design here was originally based on that of the btree access method
431573Srgrimes * used in the Postgres database system at UC Berkeley.  This implementation
441573Srgrimes * is wholly independent of the Postgres code.
451573Srgrimes */
461573Srgrimes
4771579Sdeischen#include "namespace.h"
481573Srgrimes#include <sys/param.h>
491573Srgrimes#include <sys/stat.h>
501573Srgrimes
511573Srgrimes#include <errno.h>
521573Srgrimes#include <fcntl.h>
531573Srgrimes#include <limits.h>
541573Srgrimes#include <signal.h>
551573Srgrimes#include <stdio.h>
561573Srgrimes#include <stdlib.h>
571573Srgrimes#include <string.h>
581573Srgrimes#include <unistd.h>
5971579Sdeischen#include "un-namespace.h"
60287292Skib#include "libc_private.h"
611573Srgrimes
621573Srgrimes#include <db.h>
631573Srgrimes#include "btree.h"
641573Srgrimes
6514287Spst#ifdef DEBUG
6614287Spst#undef	MINPSIZE
6714287Spst#define	MINPSIZE	128
6814287Spst#endif
6914287Spst
7092905Sobrienstatic int byteorder(void);
7192905Sobrienstatic int nroot(BTREE *);
7292905Sobrienstatic int tmp(void);
731573Srgrimes
741573Srgrimes/*
751573Srgrimes * __BT_OPEN -- Open a btree.
761573Srgrimes *
771573Srgrimes * Creates and fills a DB struct, and calls the routine that actually
781573Srgrimes * opens the btree.
791573Srgrimes *
801573Srgrimes * Parameters:
811573Srgrimes *	fname:	filename (NULL for in-memory trees)
821573Srgrimes *	flags:	open flag bits
831573Srgrimes *	mode:	open permission bits
841573Srgrimes *	b:	BTREEINFO pointer
851573Srgrimes *
861573Srgrimes * Returns:
871573Srgrimes *	NULL on failure, pointer to DB on success.
881573Srgrimes *
891573Srgrimes */
901573SrgrimesDB *
91189291Sdelphij__bt_open(const char *fname, int flags, int mode, const BTREEINFO *openinfo, int dflags)
921573Srgrimes{
931573Srgrimes	struct stat sb;
941573Srgrimes	BTMETA m;
951573Srgrimes	BTREE *t;
961573Srgrimes	BTREEINFO b;
971573Srgrimes	DB *dbp;
981573Srgrimes	pgno_t ncache;
991573Srgrimes	ssize_t nr;
100190344Sdelphij	int machine_lorder, saved_errno;
1011573Srgrimes
1021573Srgrimes	t = NULL;
1031573Srgrimes
1041573Srgrimes	/*
1051573Srgrimes	 * Intention is to make sure all of the user's selections are okay
1061573Srgrimes	 * here and then use them without checking.  Can't be complete, since
1071573Srgrimes	 * we don't know the right page size, lorder or flags until the backing
1081573Srgrimes	 * file is opened.  Also, the file's page size can cause the cachesize
1091573Srgrimes	 * to change.
1101573Srgrimes	 */
1111573Srgrimes	machine_lorder = byteorder();
1121573Srgrimes	if (openinfo) {
1131573Srgrimes		b = *openinfo;
1141573Srgrimes
1151573Srgrimes		/* Flags: R_DUP. */
1161573Srgrimes		if (b.flags & ~(R_DUP))
1171573Srgrimes			goto einval;
1181573Srgrimes
1191573Srgrimes		/*
1201573Srgrimes		 * Page size must be indx_t aligned and >= MINPSIZE.  Default
1211573Srgrimes		 * page size is set farther on, based on the underlying file
1221573Srgrimes		 * transfer size.
1231573Srgrimes		 */
1241573Srgrimes		if (b.psize &&
1251573Srgrimes		    (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
12617141Sjkh		    b.psize & (sizeof(indx_t) - 1) ))
1271573Srgrimes			goto einval;
1281573Srgrimes
1291573Srgrimes		/* Minimum number of keys per page; absolute minimum is 2. */
1301573Srgrimes		if (b.minkeypage) {
1311573Srgrimes			if (b.minkeypage < 2)
1321573Srgrimes				goto einval;
1331573Srgrimes		} else
1341573Srgrimes			b.minkeypage = DEFMINKEYPAGE;
1351573Srgrimes
1361573Srgrimes		/* If no comparison, use default comparison and prefix. */
1371573Srgrimes		if (b.compare == NULL) {
1381573Srgrimes			b.compare = __bt_defcmp;
1391573Srgrimes			if (b.prefix == NULL)
1401573Srgrimes				b.prefix = __bt_defpfx;
1411573Srgrimes		}
1421573Srgrimes
1431573Srgrimes		if (b.lorder == 0)
1441573Srgrimes			b.lorder = machine_lorder;
1451573Srgrimes	} else {
1461573Srgrimes		b.compare = __bt_defcmp;
1471573Srgrimes		b.cachesize = 0;
1481573Srgrimes		b.flags = 0;
1491573Srgrimes		b.lorder = machine_lorder;
1501573Srgrimes		b.minkeypage = DEFMINKEYPAGE;
1511573Srgrimes		b.prefix = __bt_defpfx;
1521573Srgrimes		b.psize = 0;
1531573Srgrimes	}
1541573Srgrimes
1551573Srgrimes	/* Check for the ubiquitous PDP-11. */
1561573Srgrimes	if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
1571573Srgrimes		goto einval;
1581573Srgrimes
1591573Srgrimes	/* Allocate and initialize DB and BTREE structures. */
160190482Sdelphij	if ((t = (BTREE *)calloc(1, sizeof(BTREE))) == NULL)
1611573Srgrimes		goto err;
1621573Srgrimes	t->bt_fd = -1;			/* Don't close unopened fd on error. */
1631573Srgrimes	t->bt_lorder = b.lorder;
1641573Srgrimes	t->bt_order = NOT;
1651573Srgrimes	t->bt_cmp = b.compare;
1661573Srgrimes	t->bt_pfx = b.prefix;
1671573Srgrimes	t->bt_rfd = -1;
1681573Srgrimes
169190482Sdelphij	if ((t->bt_dbp = dbp = (DB *)calloc(1, sizeof(DB))) == NULL)
1701573Srgrimes		goto err;
1711573Srgrimes	if (t->bt_lorder != machine_lorder)
17214287Spst		F_SET(t, B_NEEDSWAP);
1731573Srgrimes
1741573Srgrimes	dbp->type = DB_BTREE;
1751573Srgrimes	dbp->internal = t;
1761573Srgrimes	dbp->close = __bt_close;
1771573Srgrimes	dbp->del = __bt_delete;
1781573Srgrimes	dbp->fd = __bt_fd;
1791573Srgrimes	dbp->get = __bt_get;
1801573Srgrimes	dbp->put = __bt_put;
1811573Srgrimes	dbp->seq = __bt_seq;
1821573Srgrimes	dbp->sync = __bt_sync;
1831573Srgrimes
1841573Srgrimes	/*
1851573Srgrimes	 * If no file name was supplied, this is an in-memory btree and we
1861573Srgrimes	 * open a backing temporary file.  Otherwise, it's a disk-based tree.
1871573Srgrimes	 */
1881573Srgrimes	if (fname) {
18914287Spst		switch (flags & O_ACCMODE) {
1901573Srgrimes		case O_RDONLY:
19114287Spst			F_SET(t, B_RDONLY);
1921573Srgrimes			break;
1931573Srgrimes		case O_RDWR:
1941573Srgrimes			break;
1951573Srgrimes		case O_WRONLY:
1961573Srgrimes		default:
1971573Srgrimes			goto einval;
1981573Srgrimes		}
199189327Sdelphij
200254289Sjilles		if ((t->bt_fd = _open(fname, flags | O_CLOEXEC, mode)) < 0)
2011573Srgrimes			goto err;
2021573Srgrimes
2031573Srgrimes	} else {
2041573Srgrimes		if ((flags & O_ACCMODE) != O_RDWR)
2051573Srgrimes			goto einval;
2061573Srgrimes		if ((t->bt_fd = tmp()) == -1)
2071573Srgrimes			goto err;
20814287Spst		F_SET(t, B_INMEM);
2091573Srgrimes	}
2101573Srgrimes
21171579Sdeischen	if (_fstat(t->bt_fd, &sb))
2121573Srgrimes		goto err;
2131573Srgrimes	if (sb.st_size) {
21456698Sjasone		if ((nr = _read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
2151573Srgrimes			goto err;
2161573Srgrimes		if (nr != sizeof(BTMETA))
2171573Srgrimes			goto eftype;
2181573Srgrimes
2191573Srgrimes		/*
2201573Srgrimes		 * Read in the meta-data.  This can change the notion of what
2211573Srgrimes		 * the lorder, page size and flags are, and, when the page size
2221573Srgrimes		 * changes, the cachesize value can change too.  If the user
2231573Srgrimes		 * specified the wrong byte order for an existing database, we
2241573Srgrimes		 * don't bother to return an error, we just clear the NEEDSWAP
2251573Srgrimes		 * bit.
2261573Srgrimes		 */
22714287Spst		if (m.magic == BTREEMAGIC)
22814287Spst			F_CLR(t, B_NEEDSWAP);
2291573Srgrimes		else {
23014287Spst			F_SET(t, B_NEEDSWAP);
23114287Spst			M_32_SWAP(m.magic);
23214287Spst			M_32_SWAP(m.version);
23314287Spst			M_32_SWAP(m.psize);
23414287Spst			M_32_SWAP(m.free);
23514287Spst			M_32_SWAP(m.nrecs);
23614287Spst			M_32_SWAP(m.flags);
2371573Srgrimes		}
23814287Spst		if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
2391573Srgrimes			goto eftype;
24014287Spst		if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
24117141Sjkh		    m.psize & (sizeof(indx_t) - 1) )
2421573Srgrimes			goto eftype;
24314287Spst		if (m.flags & ~SAVEMETA)
2441573Srgrimes			goto eftype;
24514287Spst		b.psize = m.psize;
24614287Spst		F_SET(t, m.flags);
24714287Spst		t->bt_free = m.free;
24814287Spst		t->bt_nrecs = m.nrecs;
2491573Srgrimes	} else {
2501573Srgrimes		/*
2511573Srgrimes		 * Set the page size to the best value for I/O to this file.
2521573Srgrimes		 * Don't overflow the page offset type.
2531573Srgrimes		 */
2541573Srgrimes		if (b.psize == 0) {
2551573Srgrimes			b.psize = sb.st_blksize;
2561573Srgrimes			if (b.psize < MINPSIZE)
2571573Srgrimes				b.psize = MINPSIZE;
2581573Srgrimes			if (b.psize > MAX_PAGE_OFFSET + 1)
2591573Srgrimes				b.psize = MAX_PAGE_OFFSET + 1;
2601573Srgrimes		}
2611573Srgrimes
2621573Srgrimes		/* Set flag if duplicates permitted. */
2631573Srgrimes		if (!(b.flags & R_DUP))
26414287Spst			F_SET(t, B_NODUPS);
2651573Srgrimes
2661573Srgrimes		t->bt_free = P_INVALID;
2671573Srgrimes		t->bt_nrecs = 0;
26814287Spst		F_SET(t, B_METADIRTY);
2691573Srgrimes	}
2701573Srgrimes
2711573Srgrimes	t->bt_psize = b.psize;
2721573Srgrimes
2731573Srgrimes	/* Set the cache size; must be a multiple of the page size. */
27417141Sjkh	if (b.cachesize && b.cachesize & (b.psize - 1) )
27517141Sjkh		b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1;
2761573Srgrimes	if (b.cachesize < b.psize * MINCACHE)
2771573Srgrimes		b.cachesize = b.psize * MINCACHE;
2781573Srgrimes
2791573Srgrimes	/* Calculate number of pages to cache. */
280298600Spfg	ncache = howmany(b.cachesize, t->bt_psize);
2811573Srgrimes
2821573Srgrimes	/*
2831573Srgrimes	 * The btree data structure requires that at least two keys can fit on
2841573Srgrimes	 * a page, but other than that there's no fixed requirement.  The user
2851573Srgrimes	 * specified a minimum number per page, and we translated that into the
2861573Srgrimes	 * number of bytes a key/data pair can use before being placed on an
2871573Srgrimes	 * overflow page.  This calculation includes the page header, the size
2881573Srgrimes	 * of the index referencing the leaf item and the size of the leaf item
2891573Srgrimes	 * structure.  Also, don't let the user specify a minkeypage such that
2901573Srgrimes	 * a key/data pair won't fit even if both key and data are on overflow
2911573Srgrimes	 * pages.
2921573Srgrimes	 */
2931573Srgrimes	t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
2941573Srgrimes	    (sizeof(indx_t) + NBLEAFDBT(0, 0));
2951573Srgrimes	if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
2961573Srgrimes		t->bt_ovflsize =
2971573Srgrimes		    NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
2981573Srgrimes
2991573Srgrimes	/* Initialize the buffer pool. */
3001573Srgrimes	if ((t->bt_mp =
3011573Srgrimes	    mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
3021573Srgrimes		goto err;
30314287Spst	if (!F_ISSET(t, B_INMEM))
3041573Srgrimes		mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
3051573Srgrimes
3061573Srgrimes	/* Create a root page if new tree. */
3071573Srgrimes	if (nroot(t) == RET_ERROR)
3081573Srgrimes		goto err;
3091573Srgrimes
3101573Srgrimes	/* Global flags. */
3111573Srgrimes	if (dflags & DB_LOCK)
31214287Spst		F_SET(t, B_DB_LOCK);
3131573Srgrimes	if (dflags & DB_SHMEM)
31414287Spst		F_SET(t, B_DB_SHMEM);
3151573Srgrimes	if (dflags & DB_TXN)
31614287Spst		F_SET(t, B_DB_TXN);
3171573Srgrimes
3181573Srgrimes	return (dbp);
3191573Srgrimes
3201573Srgrimeseinval:	errno = EINVAL;
3211573Srgrimes	goto err;
3221573Srgrimes
3231573Srgrimeseftype:	errno = EFTYPE;
3241573Srgrimes	goto err;
3251573Srgrimes
326190344Sdelphijerr:	saved_errno = errno;
327190344Sdelphij	if (t) {
3281573Srgrimes		if (t->bt_dbp)
3291573Srgrimes			free(t->bt_dbp);
3301573Srgrimes		if (t->bt_fd != -1)
33156698Sjasone			(void)_close(t->bt_fd);
3321573Srgrimes		free(t);
3331573Srgrimes	}
334190344Sdelphij	errno = saved_errno;
3351573Srgrimes	return (NULL);
3361573Srgrimes}
3371573Srgrimes
3381573Srgrimes/*
3391573Srgrimes * NROOT -- Create the root of a new tree.
3401573Srgrimes *
3411573Srgrimes * Parameters:
3421573Srgrimes *	t:	tree
3431573Srgrimes *
3441573Srgrimes * Returns:
3451573Srgrimes *	RET_ERROR, RET_SUCCESS
3461573Srgrimes */
3471573Srgrimesstatic int
348189291Sdelphijnroot(BTREE *t)
3491573Srgrimes{
3501573Srgrimes	PAGE *meta, *root;
3511573Srgrimes	pgno_t npg;
3521573Srgrimes
353190498Sdelphij	if ((root = mpool_get(t->bt_mp, 1, 0)) != NULL) {
354190498Sdelphij		if (root->lower == 0 &&
355190498Sdelphij		    root->pgno == 0 &&
356190498Sdelphij		    root->linp[0] == 0) {
357190498Sdelphij			mpool_delete(t->bt_mp, root);
358190498Sdelphij			errno = EINVAL;
359190498Sdelphij		} else {
360190498Sdelphij			mpool_put(t->bt_mp, root, 0);
361190498Sdelphij			return (RET_SUCCESS);
362190498Sdelphij		}
3631573Srgrimes	}
36414287Spst	if (errno != EINVAL)		/* It's OK to not exist. */
3651573Srgrimes		return (RET_ERROR);
36614287Spst	errno = 0;
3671573Srgrimes
368190498Sdelphij	if ((meta = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL)
3691573Srgrimes		return (RET_ERROR);
3701573Srgrimes
371190498Sdelphij	if ((root = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL)
3721573Srgrimes		return (RET_ERROR);
3731573Srgrimes
3741573Srgrimes	if (npg != P_ROOT)
3751573Srgrimes		return (RET_ERROR);
3761573Srgrimes	root->pgno = npg;
3771573Srgrimes	root->prevpg = root->nextpg = P_INVALID;
3781573Srgrimes	root->lower = BTDATAOFF;
3791573Srgrimes	root->upper = t->bt_psize;
3801573Srgrimes	root->flags = P_BLEAF;
3811573Srgrimes	memset(meta, 0, t->bt_psize);
3821573Srgrimes	mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
3831573Srgrimes	mpool_put(t->bt_mp, root, MPOOL_DIRTY);
3841573Srgrimes	return (RET_SUCCESS);
3851573Srgrimes}
3861573Srgrimes
3871573Srgrimesstatic int
388189291Sdelphijtmp(void)
3891573Srgrimes{
3901573Srgrimes	sigset_t set, oset;
391190485Sdelphij	int fd, len;
39239058Simp	char *envtmp = NULL;
3931573Srgrimes	char path[MAXPATHLEN];
3941573Srgrimes
39539058Simp	if (issetugid() == 0)
39639058Simp		envtmp = getenv("TMPDIR");
397190485Sdelphij	len = snprintf(path,
39868234Skris	    sizeof(path), "%s/bt.XXXXXXXXXX", envtmp ? envtmp : "/tmp");
399190485Sdelphij	if (len < 0 || len >= (int)sizeof(path)) {
400190485Sdelphij		errno = ENAMETOOLONG;
401190485Sdelphij		return(-1);
402190485Sdelphij	}
4031573Srgrimes
4041573Srgrimes	(void)sigfillset(&set);
405287292Skib	(void)__libc_sigprocmask(SIG_BLOCK, &set, &oset);
406254289Sjilles	if ((fd = mkostemp(path, O_CLOEXEC)) != -1)
4071573Srgrimes		(void)unlink(path);
408287292Skib	(void)__libc_sigprocmask(SIG_SETMASK, &oset, NULL);
4091573Srgrimes	return(fd);
4101573Srgrimes}
4111573Srgrimes
4121573Srgrimesstatic int
413189291Sdelphijbyteorder(void)
4141573Srgrimes{
4151573Srgrimes	u_int32_t x;
4161573Srgrimes	u_char *p;
4171573Srgrimes
4181573Srgrimes	x = 0x01020304;
4191573Srgrimes	p = (u_char *)&x;
4201573Srgrimes	switch (*p) {
4211573Srgrimes	case 1:
4221573Srgrimes		return (BIG_ENDIAN);
4231573Srgrimes	case 4:
4241573Srgrimes		return (LITTLE_ENDIAN);
4251573Srgrimes	default:
4261573Srgrimes		return (0);
4271573Srgrimes	}
4281573Srgrimes}
4291573Srgrimes
4301573Srgrimesint
431189291Sdelphij__bt_fd(const DB *dbp)
4321573Srgrimes{
4331573Srgrimes	BTREE *t;
4341573Srgrimes
4351573Srgrimes	t = dbp->internal;
4361573Srgrimes
4371573Srgrimes	/* Toss any page pinned across calls. */
4381573Srgrimes	if (t->bt_pinned != NULL) {
4391573Srgrimes		mpool_put(t->bt_mp, t->bt_pinned, 0);
4401573Srgrimes		t->bt_pinned = NULL;
4411573Srgrimes	}
4421573Srgrimes
4431573Srgrimes	/* In-memory database can't have a file descriptor. */
44414287Spst	if (F_ISSET(t, B_INMEM)) {
4451573Srgrimes		errno = ENOENT;
4461573Srgrimes		return (-1);
4471573Srgrimes	}
4481573Srgrimes	return (t->bt_fd);
4491573Srgrimes}
450