bt_open.c revision 190344
1285242Sachim/*-
2285242Sachim * Copyright (c) 1990, 1993, 1994
3285242Sachim *	The Regents of the University of California.  All rights reserved.
4285242Sachim *
5285242Sachim * This code is derived from software contributed to Berkeley by
6285242Sachim * Mike Olson.
7285242Sachim *
8285242Sachim * Redistribution and use in source and binary forms, with or without
9285242Sachim * modification, are permitted provided that the following conditions
10285242Sachim * are met:
11285242Sachim * 1. Redistributions of source code must retain the above copyright
12285242Sachim *    notice, this list of conditions and the following disclaimer.
13285242Sachim * 2. Redistributions in binary form must reproduce the above copyright
14285242Sachim *    notice, this list of conditions and the following disclaimer in the
15285242Sachim *    documentation and/or other materials provided with the distribution.
16285242Sachim * 4. Neither the name of the University nor the names of its contributors
17285242Sachim *    may be used to endorse or promote products derived from this software
18285242Sachim *    without specific prior written permission.
19285242Sachim *
20285242Sachim * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21285242Sachim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22285242Sachim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23285242Sachim * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24285242Sachim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25285242Sachim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26285242Sachim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27285242Sachim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28285242Sachim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29285242Sachim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30285242Sachim * SUCH DAMAGE.
31285242Sachim */
32285242Sachim
33285242Sachim#if defined(LIBC_SCCS) && !defined(lint)
34285242Sachimstatic char sccsid[] = "@(#)bt_open.c	8.10 (Berkeley) 8/17/94";
35285242Sachim#endif /* LIBC_SCCS and not lint */
36285242Sachim#include <sys/cdefs.h>
37285242Sachim__FBSDID("$FreeBSD: head/lib/libc/db/btree/bt_open.c 190344 2009-03-23 23:43:07Z delphij $");
38285242Sachim
39285242Sachim/*
40285242Sachim * Implementation of btree access method for 4.4BSD.
41285242Sachim *
42285242Sachim * The design here was originally based on that of the btree access method
43285242Sachim * used in the Postgres database system at UC Berkeley.  This implementation
44285242Sachim * is wholly independent of the Postgres code.
45285242Sachim */
46285242Sachim
47285242Sachim#include "namespace.h"
48285242Sachim#include <sys/param.h>
49285242Sachim#include <sys/stat.h>
50285242Sachim
51285242Sachim#include <errno.h>
52285242Sachim#include <fcntl.h>
53285242Sachim#include <limits.h>
54285242Sachim#include <signal.h>
55285242Sachim#include <stdio.h>
56285242Sachim#include <stdlib.h>
57285242Sachim#include <string.h>
58285242Sachim#include <unistd.h>
59285242Sachim#include "un-namespace.h"
60285242Sachim
61285242Sachim#include <db.h>
62285242Sachim#include "btree.h"
63285242Sachim
64285242Sachim#ifdef DEBUG
65285242Sachim#undef	MINPSIZE
66285242Sachim#define	MINPSIZE	128
67285242Sachim#endif
68285242Sachim
69285242Sachimstatic int byteorder(void);
70285242Sachimstatic int nroot(BTREE *);
71285242Sachimstatic int tmp(void);
72285242Sachim
73285242Sachim/*
74285242Sachim * __BT_OPEN -- Open a btree.
75285242Sachim *
76285242Sachim * Creates and fills a DB struct, and calls the routine that actually
77285242Sachim * opens the btree.
78285242Sachim *
79285242Sachim * Parameters:
80285242Sachim *	fname:	filename (NULL for in-memory trees)
81285242Sachim *	flags:	open flag bits
82285242Sachim *	mode:	open permission bits
83285242Sachim *	b:	BTREEINFO pointer
84285242Sachim *
85285242Sachim * Returns:
86285242Sachim *	NULL on failure, pointer to DB on success.
87285242Sachim *
88285242Sachim */
89285242SachimDB *
90285242Sachim__bt_open(const char *fname, int flags, int mode, const BTREEINFO *openinfo, int dflags)
91285242Sachim{
92285242Sachim	struct stat sb;
93285242Sachim	BTMETA m;
94285242Sachim	BTREE *t;
95285242Sachim	BTREEINFO b;
96285242Sachim	DB *dbp;
97285242Sachim	pgno_t ncache;
98285242Sachim	ssize_t nr;
99285242Sachim	int machine_lorder, saved_errno;
100285242Sachim
101285242Sachim	t = NULL;
102285242Sachim
103285242Sachim	/*
104285242Sachim	 * Intention is to make sure all of the user's selections are okay
105285242Sachim	 * here and then use them without checking.  Can't be complete, since
106285242Sachim	 * we don't know the right page size, lorder or flags until the backing
107285242Sachim	 * file is opened.  Also, the file's page size can cause the cachesize
108285242Sachim	 * to change.
109285242Sachim	 */
110285242Sachim	machine_lorder = byteorder();
111285242Sachim	if (openinfo) {
112285242Sachim		b = *openinfo;
113285242Sachim
114285242Sachim		/* Flags: R_DUP. */
115285242Sachim		if (b.flags & ~(R_DUP))
116285242Sachim			goto einval;
117285242Sachim
118285242Sachim		/*
119285242Sachim		 * Page size must be indx_t aligned and >= MINPSIZE.  Default
120285242Sachim		 * page size is set farther on, based on the underlying file
121285242Sachim		 * transfer size.
122285242Sachim		 */
123285242Sachim		if (b.psize &&
124285242Sachim		    (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
125285242Sachim		    b.psize & (sizeof(indx_t) - 1) ))
126285242Sachim			goto einval;
127285242Sachim
128285242Sachim		/* Minimum number of keys per page; absolute minimum is 2. */
129285242Sachim		if (b.minkeypage) {
130285242Sachim			if (b.minkeypage < 2)
131285242Sachim				goto einval;
132285242Sachim		} else
133285242Sachim			b.minkeypage = DEFMINKEYPAGE;
134285242Sachim
135285242Sachim		/* If no comparison, use default comparison and prefix. */
136285242Sachim		if (b.compare == NULL) {
137285242Sachim			b.compare = __bt_defcmp;
138285242Sachim			if (b.prefix == NULL)
139285242Sachim				b.prefix = __bt_defpfx;
140285242Sachim		}
141285242Sachim
142285242Sachim		if (b.lorder == 0)
143285242Sachim			b.lorder = machine_lorder;
144285242Sachim	} else {
145285242Sachim		b.compare = __bt_defcmp;
146285242Sachim		b.cachesize = 0;
147285242Sachim		b.flags = 0;
148285242Sachim		b.lorder = machine_lorder;
149285242Sachim		b.minkeypage = DEFMINKEYPAGE;
150285242Sachim		b.prefix = __bt_defpfx;
151285242Sachim		b.psize = 0;
152285242Sachim	}
153285242Sachim
154285242Sachim	/* Check for the ubiquitous PDP-11. */
155285242Sachim	if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
156285242Sachim		goto einval;
157285242Sachim
158285242Sachim	/* Allocate and initialize DB and BTREE structures. */
159285242Sachim	if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
160285242Sachim		goto err;
161285242Sachim	memset(t, 0, sizeof(BTREE));
162285242Sachim	t->bt_fd = -1;			/* Don't close unopened fd on error. */
163285242Sachim	t->bt_lorder = b.lorder;
164285242Sachim	t->bt_order = NOT;
165285242Sachim	t->bt_cmp = b.compare;
166285242Sachim	t->bt_pfx = b.prefix;
167285242Sachim	t->bt_rfd = -1;
168285242Sachim
169285242Sachim	if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
170285242Sachim		goto err;
171285242Sachim	memset(t->bt_dbp, 0, sizeof(DB));
172285242Sachim	if (t->bt_lorder != machine_lorder)
173285242Sachim		F_SET(t, B_NEEDSWAP);
174285242Sachim
175285242Sachim	dbp->type = DB_BTREE;
176285242Sachim	dbp->internal = t;
177285242Sachim	dbp->close = __bt_close;
178285242Sachim	dbp->del = __bt_delete;
179285242Sachim	dbp->fd = __bt_fd;
180285242Sachim	dbp->get = __bt_get;
181285242Sachim	dbp->put = __bt_put;
182285242Sachim	dbp->seq = __bt_seq;
183285242Sachim	dbp->sync = __bt_sync;
184285242Sachim
185285242Sachim	/*
186285242Sachim	 * If no file name was supplied, this is an in-memory btree and we
187285242Sachim	 * open a backing temporary file.  Otherwise, it's a disk-based tree.
188285242Sachim	 */
189285242Sachim	if (fname) {
190285242Sachim		switch (flags & O_ACCMODE) {
191285242Sachim		case O_RDONLY:
192285242Sachim			F_SET(t, B_RDONLY);
193285242Sachim			break;
194285242Sachim		case O_RDWR:
195285242Sachim			break;
196285242Sachim		case O_WRONLY:
197285242Sachim		default:
198285242Sachim			goto einval;
199285242Sachim		}
200285242Sachim
201285242Sachim		if ((t->bt_fd = _open(fname, flags, mode)) < 0)
202285242Sachim			goto err;
203285242Sachim
204285242Sachim	} else {
205285242Sachim		if ((flags & O_ACCMODE) != O_RDWR)
206285242Sachim			goto einval;
207285242Sachim		if ((t->bt_fd = tmp()) == -1)
208285242Sachim			goto err;
209285242Sachim		F_SET(t, B_INMEM);
210285242Sachim	}
211285242Sachim
212285242Sachim	if (_fcntl(t->bt_fd, F_SETFD, 1) == -1)
213285242Sachim		goto err;
214285242Sachim
215285242Sachim	if (_fstat(t->bt_fd, &sb))
216285242Sachim		goto err;
217285242Sachim	if (sb.st_size) {
218285242Sachim		if ((nr = _read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
219285242Sachim			goto err;
220285242Sachim		if (nr != sizeof(BTMETA))
221285242Sachim			goto eftype;
222285242Sachim
223285242Sachim		/*
224285242Sachim		 * Read in the meta-data.  This can change the notion of what
225285242Sachim		 * the lorder, page size and flags are, and, when the page size
226285242Sachim		 * changes, the cachesize value can change too.  If the user
227285242Sachim		 * specified the wrong byte order for an existing database, we
228285242Sachim		 * don't bother to return an error, we just clear the NEEDSWAP
229285242Sachim		 * bit.
230285242Sachim		 */
231285242Sachim		if (m.magic == BTREEMAGIC)
232285242Sachim			F_CLR(t, B_NEEDSWAP);
233285242Sachim		else {
234285242Sachim			F_SET(t, B_NEEDSWAP);
235285242Sachim			M_32_SWAP(m.magic);
236285242Sachim			M_32_SWAP(m.version);
237285242Sachim			M_32_SWAP(m.psize);
238285242Sachim			M_32_SWAP(m.free);
239285242Sachim			M_32_SWAP(m.nrecs);
240285242Sachim			M_32_SWAP(m.flags);
241285242Sachim		}
242285242Sachim		if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
243285242Sachim			goto eftype;
244285242Sachim		if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
245285242Sachim		    m.psize & (sizeof(indx_t) - 1) )
246285242Sachim			goto eftype;
247285242Sachim		if (m.flags & ~SAVEMETA)
248285242Sachim			goto eftype;
249285242Sachim		b.psize = m.psize;
250285242Sachim		F_SET(t, m.flags);
251285242Sachim		t->bt_free = m.free;
252285242Sachim		t->bt_nrecs = m.nrecs;
253285242Sachim	} else {
254285242Sachim		/*
255285242Sachim		 * Set the page size to the best value for I/O to this file.
256285242Sachim		 * Don't overflow the page offset type.
257285242Sachim		 */
258285242Sachim		if (b.psize == 0) {
259285242Sachim			b.psize = sb.st_blksize;
260285242Sachim			if (b.psize < MINPSIZE)
261285242Sachim				b.psize = MINPSIZE;
262285242Sachim			if (b.psize > MAX_PAGE_OFFSET + 1)
263285242Sachim				b.psize = MAX_PAGE_OFFSET + 1;
264285242Sachim		}
265285242Sachim
266285242Sachim		/* Set flag if duplicates permitted. */
267285242Sachim		if (!(b.flags & R_DUP))
268285242Sachim			F_SET(t, B_NODUPS);
269285242Sachim
270285242Sachim		t->bt_free = P_INVALID;
271285242Sachim		t->bt_nrecs = 0;
272285242Sachim		F_SET(t, B_METADIRTY);
273285242Sachim	}
274285242Sachim
275285242Sachim	t->bt_psize = b.psize;
276285242Sachim
277285242Sachim	/* Set the cache size; must be a multiple of the page size. */
278285242Sachim	if (b.cachesize && b.cachesize & (b.psize - 1) )
279285242Sachim		b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1;
280285242Sachim	if (b.cachesize < b.psize * MINCACHE)
281285242Sachim		b.cachesize = b.psize * MINCACHE;
282285242Sachim
283285242Sachim	/* Calculate number of pages to cache. */
284285242Sachim	ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
285285242Sachim
286285242Sachim	/*
287285242Sachim	 * The btree data structure requires that at least two keys can fit on
288285242Sachim	 * a page, but other than that there's no fixed requirement.  The user
289285242Sachim	 * specified a minimum number per page, and we translated that into the
290285242Sachim	 * number of bytes a key/data pair can use before being placed on an
291285242Sachim	 * overflow page.  This calculation includes the page header, the size
292285242Sachim	 * of the index referencing the leaf item and the size of the leaf item
293285242Sachim	 * structure.  Also, don't let the user specify a minkeypage such that
294285242Sachim	 * a key/data pair won't fit even if both key and data are on overflow
295285242Sachim	 * pages.
296285242Sachim	 */
297285242Sachim	t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
298285242Sachim	    (sizeof(indx_t) + NBLEAFDBT(0, 0));
299285242Sachim	if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
300285242Sachim		t->bt_ovflsize =
301285242Sachim		    NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
302285242Sachim
303285242Sachim	/* Initialize the buffer pool. */
304285242Sachim	if ((t->bt_mp =
305285242Sachim	    mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
306285242Sachim		goto err;
307285242Sachim	if (!F_ISSET(t, B_INMEM))
308285242Sachim		mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
309285242Sachim
310285242Sachim	/* Create a root page if new tree. */
311285242Sachim	if (nroot(t) == RET_ERROR)
312285242Sachim		goto err;
313285242Sachim
314285242Sachim	/* Global flags. */
315285242Sachim	if (dflags & DB_LOCK)
316285242Sachim		F_SET(t, B_DB_LOCK);
317285242Sachim	if (dflags & DB_SHMEM)
318285242Sachim		F_SET(t, B_DB_SHMEM);
319285242Sachim	if (dflags & DB_TXN)
320285242Sachim		F_SET(t, B_DB_TXN);
321285242Sachim
322285242Sachim	return (dbp);
323285242Sachim
324285242Sachimeinval:	errno = EINVAL;
325285242Sachim	goto err;
326285242Sachim
327285242Sachimeftype:	errno = EFTYPE;
328285242Sachim	goto err;
329285242Sachim
330285242Sachimerr:	saved_errno = errno;
331285242Sachim	if (t) {
332285242Sachim		if (t->bt_dbp)
333285242Sachim			free(t->bt_dbp);
334285242Sachim		if (t->bt_fd != -1)
335285242Sachim			(void)_close(t->bt_fd);
336285242Sachim		free(t);
337285242Sachim	}
338285242Sachim	errno = saved_errno;
339285242Sachim	return (NULL);
340285242Sachim}
341285242Sachim
342285242Sachim/*
343285242Sachim * NROOT -- Create the root of a new tree.
344285242Sachim *
345285242Sachim * Parameters:
346285242Sachim *	t:	tree
347285242Sachim *
348285242Sachim * Returns:
349285242Sachim *	RET_ERROR, RET_SUCCESS
350285242Sachim */
351285242Sachimstatic int
352285242Sachimnroot(BTREE *t)
353285242Sachim{
354285242Sachim	PAGE *meta, *root;
355285242Sachim	pgno_t npg;
356285242Sachim
357285242Sachim	if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
358285242Sachim		mpool_put(t->bt_mp, meta, 0);
359285242Sachim		return (RET_SUCCESS);
360285242Sachim	}
361285242Sachim	if (errno != EINVAL)		/* It's OK to not exist. */
362285242Sachim		return (RET_ERROR);
363285242Sachim	errno = 0;
364285242Sachim
365285242Sachim	if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
366285242Sachim		return (RET_ERROR);
367285242Sachim
368285242Sachim	if ((root = mpool_new(t->bt_mp, &npg)) == NULL)
369285242Sachim		return (RET_ERROR);
370285242Sachim
371285242Sachim	if (npg != P_ROOT)
372285242Sachim		return (RET_ERROR);
373285242Sachim	root->pgno = npg;
374285242Sachim	root->prevpg = root->nextpg = P_INVALID;
375285242Sachim	root->lower = BTDATAOFF;
376285242Sachim	root->upper = t->bt_psize;
377285242Sachim	root->flags = P_BLEAF;
378285242Sachim	memset(meta, 0, t->bt_psize);
379285242Sachim	mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
380285242Sachim	mpool_put(t->bt_mp, root, MPOOL_DIRTY);
381285242Sachim	return (RET_SUCCESS);
382285242Sachim}
383285242Sachim
384285242Sachimstatic int
385285242Sachimtmp(void)
386285242Sachim{
387285242Sachim	sigset_t set, oset;
388285242Sachim	int fd;
389285242Sachim	char *envtmp = NULL;
390285242Sachim	char path[MAXPATHLEN];
391285242Sachim
392285242Sachim	if (issetugid() == 0)
393285242Sachim		envtmp = getenv("TMPDIR");
394285242Sachim	(void)snprintf(path,
395285242Sachim	    sizeof(path), "%s/bt.XXXXXXXXXX", envtmp ? envtmp : "/tmp");
396285242Sachim
397285242Sachim	(void)sigfillset(&set);
398285242Sachim	(void)_sigprocmask(SIG_BLOCK, &set, &oset);
399285242Sachim	if ((fd = mkstemp(path)) != -1)
400285242Sachim		(void)unlink(path);
401285242Sachim	(void)_sigprocmask(SIG_SETMASK, &oset, NULL);
402285242Sachim	return(fd);
403285242Sachim}
404285242Sachim
405285242Sachimstatic int
406285242Sachimbyteorder(void)
407285242Sachim{
408285242Sachim	u_int32_t x;
409285242Sachim	u_char *p;
410285242Sachim
411285242Sachim	x = 0x01020304;
412285242Sachim	p = (u_char *)&x;
413285242Sachim	switch (*p) {
414285242Sachim	case 1:
415285242Sachim		return (BIG_ENDIAN);
416285242Sachim	case 4:
417285242Sachim		return (LITTLE_ENDIAN);
418285242Sachim	default:
419285242Sachim		return (0);
420285242Sachim	}
421285242Sachim}
422285242Sachim
423285242Sachimint
424285242Sachim__bt_fd(const DB *dbp)
425285242Sachim{
426285242Sachim	BTREE *t;
427285242Sachim
428285242Sachim	t = dbp->internal;
429285242Sachim
430285242Sachim	/* Toss any page pinned across calls. */
431285242Sachim	if (t->bt_pinned != NULL) {
432285242Sachim		mpool_put(t->bt_mp, t->bt_pinned, 0);
433285242Sachim		t->bt_pinned = NULL;
434285242Sachim	}
435285242Sachim
436285242Sachim	/* In-memory database can't have a file descriptor. */
437285242Sachim	if (F_ISSET(t, B_INMEM)) {
438285242Sachim		errno = ENOENT;
439285242Sachim		return (-1);
440285242Sachim	}
441285242Sachim	return (t->bt_fd);
442285242Sachim}
443285242Sachim