bt_open.c revision 190344
1262569Simp/*-
2262569Simp * Copyright (c) 1990, 1993, 1994
3262569Simp *	The Regents of the University of California.  All rights reserved.
4262569Simp *
5262569Simp * This code is derived from software contributed to Berkeley by
6262569Simp * Mike Olson.
7262569Simp *
8262569Simp * Redistribution and use in source and binary forms, with or without
9262569Simp * modification, are permitted provided that the following conditions
10262569Simp * are met:
11270864Simp * 1. Redistributions of source code must retain the above copyright
12270864Simp *    notice, this list of conditions and the following disclaimer.
13262569Simp * 2. Redistributions in binary form must reproduce the above copyright
14262569Simp *    notice, this list of conditions and the following disclaimer in the
15262569Simp *    documentation and/or other materials provided with the distribution.
16262569Simp * 4. Neither the name of the University nor the names of its contributors
17262569Simp *    may be used to endorse or promote products derived from this software
18262569Simp *    without specific prior written permission.
19262569Simp *
20262569Simp * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21262569Simp * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22262569Simp * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23279385Simp * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24279385Simp * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25279385Simp * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26279385Simp * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27279385Simp * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28279385Simp * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29279385Simp * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30270864Simp * SUCH DAMAGE.
31270864Simp */
32270864Simp
33270864Simp#if defined(LIBC_SCCS) && !defined(lint)
34270864Simpstatic char sccsid[] = "@(#)bt_open.c	8.10 (Berkeley) 8/17/94";
35270864Simp#endif /* LIBC_SCCS and not lint */
36270864Simp#include <sys/cdefs.h>
37270864Simp__FBSDID("$FreeBSD: head/lib/libc/db/btree/bt_open.c 190344 2009-03-23 23:43:07Z delphij $");
38270864Simp
39270864Simp/*
40270864Simp * Implementation of btree access method for 4.4BSD.
41270864Simp *
42262569Simp * The design here was originally based on that of the btree access method
43262569Simp * used in the Postgres database system at UC Berkeley.  This implementation
44270864Simp * is wholly independent of the Postgres code.
45270864Simp */
46262569Simp
47270864Simp#include "namespace.h"
48262569Simp#include <sys/param.h>
49270864Simp#include <sys/stat.h>
50262569Simp
51262569Simp#include <errno.h>
52262569Simp#include <fcntl.h>
53262569Simp#include <limits.h>
54262569Simp#include <signal.h>
55295436Sandrew#include <stdio.h>
56262569Simp#include <stdlib.h>
57262569Simp#include <string.h>
58270864Simp#include <unistd.h>
59262569Simp#include "un-namespace.h"
60270864Simp
61262569Simp#include <db.h>
62262569Simp#include "btree.h"
63262569Simp
64262569Simp#ifdef DEBUG
65262569Simp#undef	MINPSIZE
66295436Sandrew#define	MINPSIZE	128
67262569Simp#endif
68270864Simp
69270864Simpstatic int byteorder(void);
70270864Simpstatic int nroot(BTREE *);
71270864Simpstatic int tmp(void);
72270864Simp
73270864Simp/*
74270864Simp * __BT_OPEN -- Open a btree.
75270864Simp *
76270864Simp * Creates and fills a DB struct, and calls the routine that actually
77270864Simp * opens the btree.
78270864Simp *
79270864Simp * Parameters:
80270864Simp *	fname:	filename (NULL for in-memory trees)
81270864Simp *	flags:	open flag bits
82270864Simp *	mode:	open permission bits
83270864Simp *	b:	BTREEINFO pointer
84270864Simp *
85270864Simp * Returns:
86279385Simp *	NULL on failure, pointer to DB on success.
87279385Simp *
88279385Simp */
89279385SimpDB *
90279385Simp__bt_open(const char *fname, int flags, int mode, const BTREEINFO *openinfo, int dflags)
91279385Simp{
92279385Simp	struct stat sb;
93279385Simp	BTMETA m;
94262569Simp	BTREE *t;
95270864Simp	BTREEINFO b;
96270864Simp	DB *dbp;
97270864Simp	pgno_t ncache;
98270864Simp	ssize_t nr;
99270864Simp	int machine_lorder, saved_errno;
100270864Simp
101270864Simp	t = NULL;
102270864Simp
103270864Simp	/*
104270864Simp	 * Intention is to make sure all of the user's selections are okay
105270864Simp	 * here and then use them without checking.  Can't be complete, since
106270864Simp	 * we don't know the right page size, lorder or flags until the backing
107270864Simp	 * file is opened.  Also, the file's page size can cause the cachesize
108270864Simp	 * to change.
109270864Simp	 */
110270864Simp	machine_lorder = byteorder();
111262569Simp	if (openinfo) {
112262569Simp		b = *openinfo;
113270864Simp
114270864Simp		/* Flags: R_DUP. */
115270864Simp		if (b.flags & ~(R_DUP))
116270864Simp			goto einval;
117270864Simp
118270864Simp		/*
119262569Simp		 * Page size must be indx_t aligned and >= MINPSIZE.  Default
120262569Simp		 * page size is set farther on, based on the underlying file
121262569Simp		 * transfer size.
122262569Simp		 */
123270864Simp		if (b.psize &&
124262569Simp		    (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
125262569Simp		    b.psize & (sizeof(indx_t) - 1) ))
126262569Simp			goto einval;
127262569Simp
128262569Simp		/* Minimum number of keys per page; absolute minimum is 2. */
129295436Sandrew		if (b.minkeypage) {
130262569Simp			if (b.minkeypage < 2)
131262569Simp				goto einval;
132262569Simp		} else
133262569Simp			b.minkeypage = DEFMINKEYPAGE;
134262569Simp
135262569Simp		/* If no comparison, use default comparison and prefix. */
136270864Simp		if (b.compare == NULL) {
137270864Simp			b.compare = __bt_defcmp;
138270864Simp			if (b.prefix == NULL)
139262569Simp				b.prefix = __bt_defpfx;
140262569Simp		}
141262569Simp
142262569Simp		if (b.lorder == 0)
143270864Simp			b.lorder = machine_lorder;
144270864Simp	} else {
145270864Simp		b.compare = __bt_defcmp;
146270864Simp		b.cachesize = 0;
147270864Simp		b.flags = 0;
148270864Simp		b.lorder = machine_lorder;
149270864Simp		b.minkeypage = DEFMINKEYPAGE;
150270864Simp		b.prefix = __bt_defpfx;
151270864Simp		b.psize = 0;
152270864Simp	}
153270864Simp
154270864Simp	/* Check for the ubiquitous PDP-11. */
155270864Simp	if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
156270864Simp		goto einval;
157270864Simp
158270864Simp	/* Allocate and initialize DB and BTREE structures. */
159270864Simp	if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
160270864Simp		goto err;
161270864Simp	memset(t, 0, sizeof(BTREE));
162270864Simp	t->bt_fd = -1;			/* Don't close unopened fd on error. */
163270864Simp	t->bt_lorder = b.lorder;
164270864Simp	t->bt_order = NOT;
165270864Simp	t->bt_cmp = b.compare;
166270864Simp	t->bt_pfx = b.prefix;
167270864Simp	t->bt_rfd = -1;
168270864Simp
169270864Simp	if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
170270864Simp		goto err;
171270864Simp	memset(t->bt_dbp, 0, sizeof(DB));
172270864Simp	if (t->bt_lorder != machine_lorder)
173270864Simp		F_SET(t, B_NEEDSWAP);
174270864Simp
175270864Simp	dbp->type = DB_BTREE;
176270864Simp	dbp->internal = t;
177270864Simp	dbp->close = __bt_close;
178270864Simp	dbp->del = __bt_delete;
179270864Simp	dbp->fd = __bt_fd;
180270864Simp	dbp->get = __bt_get;
181270864Simp	dbp->put = __bt_put;
182270864Simp	dbp->seq = __bt_seq;
183270864Simp	dbp->sync = __bt_sync;
184270864Simp
185270864Simp	/*
186270864Simp	 * If no file name was supplied, this is an in-memory btree and we
187270864Simp	 * open a backing temporary file.  Otherwise, it's a disk-based tree.
188270864Simp	 */
189270864Simp	if (fname) {
190270864Simp		switch (flags & O_ACCMODE) {
191270864Simp		case O_RDONLY:
192270864Simp			F_SET(t, B_RDONLY);
193270864Simp			break;
194270864Simp		case O_RDWR:
195270864Simp			break;
196270864Simp		case O_WRONLY:
197270864Simp		default:
198270864Simp			goto einval;
199270864Simp		}
200270864Simp
201270864Simp		if ((t->bt_fd = _open(fname, flags, mode)) < 0)
202270864Simp			goto err;
203270864Simp
204270864Simp	} else {
205270864Simp		if ((flags & O_ACCMODE) != O_RDWR)
206270864Simp			goto einval;
207270864Simp		if ((t->bt_fd = tmp()) == -1)
208270864Simp			goto err;
209270864Simp		F_SET(t, B_INMEM);
210270864Simp	}
211270864Simp
212270864Simp	if (_fcntl(t->bt_fd, F_SETFD, 1) == -1)
213270864Simp		goto err;
214270864Simp
215270864Simp	if (_fstat(t->bt_fd, &sb))
216270864Simp		goto err;
217270864Simp	if (sb.st_size) {
218270864Simp		if ((nr = _read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
219270864Simp			goto err;
220270864Simp		if (nr != sizeof(BTMETA))
221270864Simp			goto eftype;
222270864Simp
223270864Simp		/*
224270864Simp		 * Read in the meta-data.  This can change the notion of what
225270864Simp		 * the lorder, page size and flags are, and, when the page size
226270864Simp		 * changes, the cachesize value can change too.  If the user
227270864Simp		 * specified the wrong byte order for an existing database, we
228270864Simp		 * don't bother to return an error, we just clear the NEEDSWAP
229270864Simp		 * bit.
230270864Simp		 */
231270864Simp		if (m.magic == BTREEMAGIC)
232270864Simp			F_CLR(t, B_NEEDSWAP);
233270864Simp		else {
234270864Simp			F_SET(t, B_NEEDSWAP);
235270864Simp			M_32_SWAP(m.magic);
236270864Simp			M_32_SWAP(m.version);
237270864Simp			M_32_SWAP(m.psize);
238270864Simp			M_32_SWAP(m.free);
239270864Simp			M_32_SWAP(m.nrecs);
240270864Simp			M_32_SWAP(m.flags);
241270864Simp		}
242270864Simp		if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
243270864Simp			goto eftype;
244270864Simp		if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
245270864Simp		    m.psize & (sizeof(indx_t) - 1) )
246270864Simp			goto eftype;
247270864Simp		if (m.flags & ~SAVEMETA)
248270864Simp			goto eftype;
249270864Simp		b.psize = m.psize;
250270864Simp		F_SET(t, m.flags);
251270864Simp		t->bt_free = m.free;
252270864Simp		t->bt_nrecs = m.nrecs;
253270864Simp	} else {
254270864Simp		/*
255270864Simp		 * Set the page size to the best value for I/O to this file.
256270864Simp		 * Don't overflow the page offset type.
257270864Simp		 */
258270864Simp		if (b.psize == 0) {
259270864Simp			b.psize = sb.st_blksize;
260270864Simp			if (b.psize < MINPSIZE)
261270864Simp				b.psize = MINPSIZE;
262270864Simp			if (b.psize > MAX_PAGE_OFFSET + 1)
263270864Simp				b.psize = MAX_PAGE_OFFSET + 1;
264270864Simp		}
265270864Simp
266270864Simp		/* Set flag if duplicates permitted. */
267270864Simp		if (!(b.flags & R_DUP))
268270864Simp			F_SET(t, B_NODUPS);
269270864Simp
270270864Simp		t->bt_free = P_INVALID;
271262569Simp		t->bt_nrecs = 0;
272262569Simp		F_SET(t, B_METADIRTY);
273262569Simp	}
274262569Simp
275270864Simp	t->bt_psize = b.psize;
276262569Simp
277262569Simp	/* Set the cache size; must be a multiple of the page size. */
278262569Simp	if (b.cachesize && b.cachesize & (b.psize - 1) )
279262569Simp		b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1;
280262569Simp	if (b.cachesize < b.psize * MINCACHE)
281262569Simp		b.cachesize = b.psize * MINCACHE;
282262569Simp
283262569Simp	/* Calculate number of pages to cache. */
284262569Simp	ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
285270864Simp
286262569Simp	/*
287262569Simp	 * The btree data structure requires that at least two keys can fit on
288270864Simp	 * a page, but other than that there's no fixed requirement.  The user
289270864Simp	 * specified a minimum number per page, and we translated that into the
290270864Simp	 * number of bytes a key/data pair can use before being placed on an
291270864Simp	 * overflow page.  This calculation includes the page header, the size
292270864Simp	 * of the index referencing the leaf item and the size of the leaf item
293270864Simp	 * structure.  Also, don't let the user specify a minkeypage such that
294270864Simp	 * a key/data pair won't fit even if both key and data are on overflow
295270864Simp	 * pages.
296270864Simp	 */
297270864Simp	t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
298270864Simp	    (sizeof(indx_t) + NBLEAFDBT(0, 0));
299270864Simp	if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
300270864Simp		t->bt_ovflsize =
301270864Simp		    NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
302270864Simp
303270864Simp	/* Initialize the buffer pool. */
304270864Simp	if ((t->bt_mp =
305270864Simp	    mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
306270864Simp		goto err;
307270864Simp	if (!F_ISSET(t, B_INMEM))
308270864Simp		mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
309270864Simp
310270864Simp	/* Create a root page if new tree. */
311270864Simp	if (nroot(t) == RET_ERROR)
312270864Simp		goto err;
313270864Simp
314270864Simp	/* Global flags. */
315270864Simp	if (dflags & DB_LOCK)
316270864Simp		F_SET(t, B_DB_LOCK);
317270864Simp	if (dflags & DB_SHMEM)
318270864Simp		F_SET(t, B_DB_SHMEM);
319270864Simp	if (dflags & DB_TXN)
320270864Simp		F_SET(t, B_DB_TXN);
321270864Simp
322270864Simp	return (dbp);
323270864Simp
324270864Simpeinval:	errno = EINVAL;
325270864Simp	goto err;
326270864Simp
327270864Simpeftype:	errno = EFTYPE;
328270864Simp	goto err;
329270864Simp
330270864Simperr:	saved_errno = errno;
331270864Simp	if (t) {
332270864Simp		if (t->bt_dbp)
333270864Simp			free(t->bt_dbp);
334270864Simp		if (t->bt_fd != -1)
335270864Simp			(void)_close(t->bt_fd);
336270864Simp		free(t);
337270864Simp	}
338270864Simp	errno = saved_errno;
339270864Simp	return (NULL);
340270864Simp}
341270864Simp
342270864Simp/*
343270864Simp * NROOT -- Create the root of a new tree.
344270864Simp *
345270864Simp * Parameters:
346270864Simp *	t:	tree
347270864Simp *
348270864Simp * Returns:
349270864Simp *	RET_ERROR, RET_SUCCESS
350270864Simp */
351270864Simpstatic int
352270864Simpnroot(BTREE *t)
353270864Simp{
354270864Simp	PAGE *meta, *root;
355270864Simp	pgno_t npg;
356270864Simp
357270864Simp	if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
358270864Simp		mpool_put(t->bt_mp, meta, 0);
359270864Simp		return (RET_SUCCESS);
360279385Simp	}
361279385Simp	if (errno != EINVAL)		/* It's OK to not exist. */
362279385Simp		return (RET_ERROR);
363279385Simp	errno = 0;
364279385Simp
365279385Simp	if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
366279385Simp		return (RET_ERROR);
367279385Simp
368279385Simp	if ((root = mpool_new(t->bt_mp, &npg)) == NULL)
369279385Simp		return (RET_ERROR);
370279385Simp
371279385Simp	if (npg != P_ROOT)
372279385Simp		return (RET_ERROR);
373279385Simp	root->pgno = npg;
374279385Simp	root->prevpg = root->nextpg = P_INVALID;
375279385Simp	root->lower = BTDATAOFF;
376279385Simp	root->upper = t->bt_psize;
377279385Simp	root->flags = P_BLEAF;
378279385Simp	memset(meta, 0, t->bt_psize);
379279385Simp	mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
380279385Simp	mpool_put(t->bt_mp, root, MPOOL_DIRTY);
381279385Simp	return (RET_SUCCESS);
382279385Simp}
383279385Simp
384279385Simpstatic int
385279385Simptmp(void)
386279385Simp{
387279385Simp	sigset_t set, oset;
388279385Simp	int fd;
389279385Simp	char *envtmp = NULL;
390279385Simp	char path[MAXPATHLEN];
391279385Simp
392279385Simp	if (issetugid() == 0)
393279385Simp		envtmp = getenv("TMPDIR");
394279385Simp	(void)snprintf(path,
395279385Simp	    sizeof(path), "%s/bt.XXXXXXXXXX", envtmp ? envtmp : "/tmp");
396279385Simp
397279385Simp	(void)sigfillset(&set);
398279385Simp	(void)_sigprocmask(SIG_BLOCK, &set, &oset);
399279385Simp	if ((fd = mkstemp(path)) != -1)
400279385Simp		(void)unlink(path);
401279385Simp	(void)_sigprocmask(SIG_SETMASK, &oset, NULL);
402279385Simp	return(fd);
403279385Simp}
404279385Simp
405270864Simpstatic int
406270864Simpbyteorder(void)
407270864Simp{
408270864Simp	u_int32_t x;
409270864Simp	u_char *p;
410270864Simp
411270864Simp	x = 0x01020304;
412270864Simp	p = (u_char *)&x;
413270864Simp	switch (*p) {
414270864Simp	case 1:
415270864Simp		return (BIG_ENDIAN);
416270864Simp	case 4:
417270864Simp		return (LITTLE_ENDIAN);
418270864Simp	default:
419270864Simp		return (0);
420270864Simp	}
421270864Simp}
422270864Simp
423270864Simpint
424270864Simp__bt_fd(const DB *dbp)
425270864Simp{
426270864Simp	BTREE *t;
427270864Simp
428270864Simp	t = dbp->internal;
429270864Simp
430270864Simp	/* Toss any page pinned across calls. */
431270864Simp	if (t->bt_pinned != NULL) {
432270864Simp		mpool_put(t->bt_mp, t->bt_pinned, 0);
433270864Simp		t->bt_pinned = NULL;
434270864Simp	}
435270864Simp
436270864Simp	/* In-memory database can't have a file descriptor. */
437270864Simp	if (F_ISSET(t, B_INMEM)) {
438270864Simp		errno = ENOENT;
439270864Simp		return (-1);
440270864Simp	}
441270864Simp	return (t->bt_fd);
442270864Simp}
443270864Simp