bt_open.c revision 17141
11573Srgrimes/*- 214287Spst * Copyright (c) 1990, 1993, 1994 31573Srgrimes * The Regents of the University of California. All rights reserved. 41573Srgrimes * 51573Srgrimes * This code is derived from software contributed to Berkeley by 61573Srgrimes * Mike Olson. 71573Srgrimes * 81573Srgrimes * Redistribution and use in source and binary forms, with or without 91573Srgrimes * modification, are permitted provided that the following conditions 101573Srgrimes * are met: 111573Srgrimes * 1. Redistributions of source code must retain the above copyright 121573Srgrimes * notice, this list of conditions and the following disclaimer. 131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141573Srgrimes * notice, this list of conditions and the following disclaimer in the 151573Srgrimes * documentation and/or other materials provided with the distribution. 161573Srgrimes * 3. All advertising materials mentioning features or use of this software 171573Srgrimes * must display the following acknowledgement: 181573Srgrimes * This product includes software developed by the University of 191573Srgrimes * California, Berkeley and its contributors. 201573Srgrimes * 4. Neither the name of the University nor the names of its contributors 211573Srgrimes * may be used to endorse or promote products derived from this software 221573Srgrimes * without specific prior written permission. 231573Srgrimes * 241573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 251573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 261573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 271573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 281573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 291573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 301573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 311573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 321573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 331573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 341573Srgrimes * SUCH DAMAGE. 351573Srgrimes */ 361573Srgrimes 371573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 3814287Spststatic char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94"; 391573Srgrimes#endif /* LIBC_SCCS and not lint */ 401573Srgrimes 411573Srgrimes/* 421573Srgrimes * Implementation of btree access method for 4.4BSD. 431573Srgrimes * 441573Srgrimes * The design here was originally based on that of the btree access method 451573Srgrimes * used in the Postgres database system at UC Berkeley. This implementation 461573Srgrimes * is wholly independent of the Postgres code. 471573Srgrimes */ 481573Srgrimes 491573Srgrimes#include <sys/param.h> 501573Srgrimes#include <sys/stat.h> 511573Srgrimes 521573Srgrimes#include <errno.h> 531573Srgrimes#include <fcntl.h> 541573Srgrimes#include <limits.h> 551573Srgrimes#include <signal.h> 561573Srgrimes#include <stdio.h> 571573Srgrimes#include <stdlib.h> 581573Srgrimes#include <string.h> 591573Srgrimes#include <unistd.h> 601573Srgrimes 611573Srgrimes#include <db.h> 621573Srgrimes#include "btree.h" 631573Srgrimes 6414287Spst#ifdef DEBUG 6514287Spst#undef MINPSIZE 6614287Spst#define MINPSIZE 128 6714287Spst#endif 6814287Spst 691573Srgrimesstatic int byteorder __P((void)); 701573Srgrimesstatic int nroot __P((BTREE *)); 711573Srgrimesstatic int tmp __P((void)); 721573Srgrimes 731573Srgrimes/* 741573Srgrimes * __BT_OPEN -- Open a btree. 751573Srgrimes * 761573Srgrimes * Creates and fills a DB struct, and calls the routine that actually 771573Srgrimes * opens the btree. 781573Srgrimes * 791573Srgrimes * Parameters: 801573Srgrimes * fname: filename (NULL for in-memory trees) 811573Srgrimes * flags: open flag bits 821573Srgrimes * mode: open permission bits 831573Srgrimes * b: BTREEINFO pointer 841573Srgrimes * 851573Srgrimes * Returns: 861573Srgrimes * NULL on failure, pointer to DB on success. 871573Srgrimes * 881573Srgrimes */ 891573SrgrimesDB * 901573Srgrimes__bt_open(fname, flags, mode, openinfo, dflags) 911573Srgrimes const char *fname; 921573Srgrimes int flags, mode, dflags; 931573Srgrimes const BTREEINFO *openinfo; 941573Srgrimes{ 951573Srgrimes struct stat sb; 961573Srgrimes BTMETA m; 971573Srgrimes BTREE *t; 981573Srgrimes BTREEINFO b; 991573Srgrimes DB *dbp; 1001573Srgrimes pgno_t ncache; 1011573Srgrimes ssize_t nr; 1021573Srgrimes int machine_lorder; 1031573Srgrimes 1041573Srgrimes t = NULL; 1051573Srgrimes 1061573Srgrimes /* 1071573Srgrimes * Intention is to make sure all of the user's selections are okay 1081573Srgrimes * here and then use them without checking. Can't be complete, since 1091573Srgrimes * we don't know the right page size, lorder or flags until the backing 1101573Srgrimes * file is opened. Also, the file's page size can cause the cachesize 1111573Srgrimes * to change. 1121573Srgrimes */ 1131573Srgrimes machine_lorder = byteorder(); 1141573Srgrimes if (openinfo) { 1151573Srgrimes b = *openinfo; 1161573Srgrimes 1171573Srgrimes /* Flags: R_DUP. */ 1181573Srgrimes if (b.flags & ~(R_DUP)) 1191573Srgrimes goto einval; 1201573Srgrimes 1211573Srgrimes /* 1221573Srgrimes * Page size must be indx_t aligned and >= MINPSIZE. Default 1231573Srgrimes * page size is set farther on, based on the underlying file 1241573Srgrimes * transfer size. 1251573Srgrimes */ 1261573Srgrimes if (b.psize && 1271573Srgrimes (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || 12817141Sjkh b.psize & (sizeof(indx_t) - 1) )) 1291573Srgrimes goto einval; 1301573Srgrimes 1311573Srgrimes /* Minimum number of keys per page; absolute minimum is 2. */ 1321573Srgrimes if (b.minkeypage) { 1331573Srgrimes if (b.minkeypage < 2) 1341573Srgrimes goto einval; 1351573Srgrimes } else 1361573Srgrimes b.minkeypage = DEFMINKEYPAGE; 1371573Srgrimes 1381573Srgrimes /* If no comparison, use default comparison and prefix. */ 1391573Srgrimes if (b.compare == NULL) { 1401573Srgrimes b.compare = __bt_defcmp; 1411573Srgrimes if (b.prefix == NULL) 1421573Srgrimes b.prefix = __bt_defpfx; 1431573Srgrimes } 1441573Srgrimes 1451573Srgrimes if (b.lorder == 0) 1461573Srgrimes b.lorder = machine_lorder; 1471573Srgrimes } else { 1481573Srgrimes b.compare = __bt_defcmp; 1491573Srgrimes b.cachesize = 0; 1501573Srgrimes b.flags = 0; 1511573Srgrimes b.lorder = machine_lorder; 1521573Srgrimes b.minkeypage = DEFMINKEYPAGE; 1531573Srgrimes b.prefix = __bt_defpfx; 1541573Srgrimes b.psize = 0; 1551573Srgrimes } 1561573Srgrimes 1571573Srgrimes /* Check for the ubiquitous PDP-11. */ 1581573Srgrimes if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) 1591573Srgrimes goto einval; 1601573Srgrimes 1611573Srgrimes /* Allocate and initialize DB and BTREE structures. */ 1621573Srgrimes if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL) 1631573Srgrimes goto err; 1641573Srgrimes memset(t, 0, sizeof(BTREE)); 1651573Srgrimes t->bt_fd = -1; /* Don't close unopened fd on error. */ 1661573Srgrimes t->bt_lorder = b.lorder; 1671573Srgrimes t->bt_order = NOT; 1681573Srgrimes t->bt_cmp = b.compare; 1691573Srgrimes t->bt_pfx = b.prefix; 1701573Srgrimes t->bt_rfd = -1; 1711573Srgrimes 1721573Srgrimes if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL) 1731573Srgrimes goto err; 17414287Spst memset(t->bt_dbp, 0, sizeof(DB)); 1751573Srgrimes if (t->bt_lorder != machine_lorder) 17614287Spst F_SET(t, B_NEEDSWAP); 1771573Srgrimes 1781573Srgrimes dbp->type = DB_BTREE; 1791573Srgrimes dbp->internal = t; 1801573Srgrimes dbp->close = __bt_close; 1811573Srgrimes dbp->del = __bt_delete; 1821573Srgrimes dbp->fd = __bt_fd; 1831573Srgrimes dbp->get = __bt_get; 1841573Srgrimes dbp->put = __bt_put; 1851573Srgrimes dbp->seq = __bt_seq; 1861573Srgrimes dbp->sync = __bt_sync; 1871573Srgrimes 1881573Srgrimes /* 1891573Srgrimes * If no file name was supplied, this is an in-memory btree and we 1901573Srgrimes * open a backing temporary file. Otherwise, it's a disk-based tree. 1911573Srgrimes */ 1921573Srgrimes if (fname) { 19314287Spst switch (flags & O_ACCMODE) { 1941573Srgrimes case O_RDONLY: 19514287Spst F_SET(t, B_RDONLY); 1961573Srgrimes break; 1971573Srgrimes case O_RDWR: 1981573Srgrimes break; 1991573Srgrimes case O_WRONLY: 2001573Srgrimes default: 2011573Srgrimes goto einval; 2021573Srgrimes } 20314287Spst 2041573Srgrimes if ((t->bt_fd = open(fname, flags, mode)) < 0) 2051573Srgrimes goto err; 2061573Srgrimes 2071573Srgrimes } else { 2081573Srgrimes if ((flags & O_ACCMODE) != O_RDWR) 2091573Srgrimes goto einval; 2101573Srgrimes if ((t->bt_fd = tmp()) == -1) 2111573Srgrimes goto err; 21214287Spst F_SET(t, B_INMEM); 2131573Srgrimes } 2141573Srgrimes 2151573Srgrimes if (fcntl(t->bt_fd, F_SETFD, 1) == -1) 2161573Srgrimes goto err; 2171573Srgrimes 2181573Srgrimes if (fstat(t->bt_fd, &sb)) 2191573Srgrimes goto err; 2201573Srgrimes if (sb.st_size) { 22114287Spst if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0) 2221573Srgrimes goto err; 2231573Srgrimes if (nr != sizeof(BTMETA)) 2241573Srgrimes goto eftype; 2251573Srgrimes 2261573Srgrimes /* 2271573Srgrimes * Read in the meta-data. This can change the notion of what 2281573Srgrimes * the lorder, page size and flags are, and, when the page size 2291573Srgrimes * changes, the cachesize value can change too. If the user 2301573Srgrimes * specified the wrong byte order for an existing database, we 2311573Srgrimes * don't bother to return an error, we just clear the NEEDSWAP 2321573Srgrimes * bit. 2331573Srgrimes */ 23414287Spst if (m.magic == BTREEMAGIC) 23514287Spst F_CLR(t, B_NEEDSWAP); 2361573Srgrimes else { 23714287Spst F_SET(t, B_NEEDSWAP); 23814287Spst M_32_SWAP(m.magic); 23914287Spst M_32_SWAP(m.version); 24014287Spst M_32_SWAP(m.psize); 24114287Spst M_32_SWAP(m.free); 24214287Spst M_32_SWAP(m.nrecs); 24314287Spst M_32_SWAP(m.flags); 2441573Srgrimes } 24514287Spst if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) 2461573Srgrimes goto eftype; 24714287Spst if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || 24817141Sjkh m.psize & (sizeof(indx_t) - 1) ) 2491573Srgrimes goto eftype; 25014287Spst if (m.flags & ~SAVEMETA) 2511573Srgrimes goto eftype; 25214287Spst b.psize = m.psize; 25314287Spst F_SET(t, m.flags); 25414287Spst t->bt_free = m.free; 25514287Spst t->bt_nrecs = m.nrecs; 2561573Srgrimes } else { 2571573Srgrimes /* 2581573Srgrimes * Set the page size to the best value for I/O to this file. 2591573Srgrimes * Don't overflow the page offset type. 2601573Srgrimes */ 2611573Srgrimes if (b.psize == 0) { 2621573Srgrimes b.psize = sb.st_blksize; 2631573Srgrimes if (b.psize < MINPSIZE) 2641573Srgrimes b.psize = MINPSIZE; 2651573Srgrimes if (b.psize > MAX_PAGE_OFFSET + 1) 2661573Srgrimes b.psize = MAX_PAGE_OFFSET + 1; 2671573Srgrimes } 2681573Srgrimes 2691573Srgrimes /* Set flag if duplicates permitted. */ 2701573Srgrimes if (!(b.flags & R_DUP)) 27114287Spst F_SET(t, B_NODUPS); 2721573Srgrimes 2731573Srgrimes t->bt_free = P_INVALID; 2741573Srgrimes t->bt_nrecs = 0; 27514287Spst F_SET(t, B_METADIRTY); 2761573Srgrimes } 2771573Srgrimes 2781573Srgrimes t->bt_psize = b.psize; 2791573Srgrimes 2801573Srgrimes /* Set the cache size; must be a multiple of the page size. */ 28117141Sjkh if (b.cachesize && b.cachesize & (b.psize - 1) ) 28217141Sjkh b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1; 2831573Srgrimes if (b.cachesize < b.psize * MINCACHE) 2841573Srgrimes b.cachesize = b.psize * MINCACHE; 2851573Srgrimes 2861573Srgrimes /* Calculate number of pages to cache. */ 2871573Srgrimes ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; 2881573Srgrimes 2891573Srgrimes /* 2901573Srgrimes * The btree data structure requires that at least two keys can fit on 2911573Srgrimes * a page, but other than that there's no fixed requirement. The user 2921573Srgrimes * specified a minimum number per page, and we translated that into the 2931573Srgrimes * number of bytes a key/data pair can use before being placed on an 2941573Srgrimes * overflow page. This calculation includes the page header, the size 2951573Srgrimes * of the index referencing the leaf item and the size of the leaf item 2961573Srgrimes * structure. Also, don't let the user specify a minkeypage such that 2971573Srgrimes * a key/data pair won't fit even if both key and data are on overflow 2981573Srgrimes * pages. 2991573Srgrimes */ 3001573Srgrimes t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - 3011573Srgrimes (sizeof(indx_t) + NBLEAFDBT(0, 0)); 3021573Srgrimes if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) 3031573Srgrimes t->bt_ovflsize = 3041573Srgrimes NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); 3051573Srgrimes 3061573Srgrimes /* Initialize the buffer pool. */ 3071573Srgrimes if ((t->bt_mp = 3081573Srgrimes mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) 3091573Srgrimes goto err; 31014287Spst if (!F_ISSET(t, B_INMEM)) 3111573Srgrimes mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); 3121573Srgrimes 3131573Srgrimes /* Create a root page if new tree. */ 3141573Srgrimes if (nroot(t) == RET_ERROR) 3151573Srgrimes goto err; 3161573Srgrimes 3171573Srgrimes /* Global flags. */ 3181573Srgrimes if (dflags & DB_LOCK) 31914287Spst F_SET(t, B_DB_LOCK); 3201573Srgrimes if (dflags & DB_SHMEM) 32114287Spst F_SET(t, B_DB_SHMEM); 3221573Srgrimes if (dflags & DB_TXN) 32314287Spst F_SET(t, B_DB_TXN); 3241573Srgrimes 3251573Srgrimes return (dbp); 3261573Srgrimes 3271573Srgrimeseinval: errno = EINVAL; 3281573Srgrimes goto err; 3291573Srgrimes 3301573Srgrimeseftype: errno = EFTYPE; 3311573Srgrimes goto err; 3321573Srgrimes 3331573Srgrimeserr: if (t) { 3341573Srgrimes if (t->bt_dbp) 3351573Srgrimes free(t->bt_dbp); 3361573Srgrimes if (t->bt_fd != -1) 3371573Srgrimes (void)close(t->bt_fd); 3381573Srgrimes free(t); 3391573Srgrimes } 3401573Srgrimes return (NULL); 3411573Srgrimes} 3421573Srgrimes 3431573Srgrimes/* 3441573Srgrimes * NROOT -- Create the root of a new tree. 3451573Srgrimes * 3461573Srgrimes * Parameters: 3471573Srgrimes * t: tree 3481573Srgrimes * 3491573Srgrimes * Returns: 3501573Srgrimes * RET_ERROR, RET_SUCCESS 3511573Srgrimes */ 3521573Srgrimesstatic int 3531573Srgrimesnroot(t) 3541573Srgrimes BTREE *t; 3551573Srgrimes{ 3561573Srgrimes PAGE *meta, *root; 3571573Srgrimes pgno_t npg; 3581573Srgrimes 3591573Srgrimes if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) { 3601573Srgrimes mpool_put(t->bt_mp, meta, 0); 3611573Srgrimes return (RET_SUCCESS); 3621573Srgrimes } 36314287Spst if (errno != EINVAL) /* It's OK to not exist. */ 3641573Srgrimes return (RET_ERROR); 36514287Spst errno = 0; 3661573Srgrimes 3671573Srgrimes if ((meta = mpool_new(t->bt_mp, &npg)) == NULL) 3681573Srgrimes return (RET_ERROR); 3691573Srgrimes 3701573Srgrimes if ((root = mpool_new(t->bt_mp, &npg)) == NULL) 3711573Srgrimes return (RET_ERROR); 3721573Srgrimes 3731573Srgrimes if (npg != P_ROOT) 3741573Srgrimes return (RET_ERROR); 3751573Srgrimes root->pgno = npg; 3761573Srgrimes root->prevpg = root->nextpg = P_INVALID; 3771573Srgrimes root->lower = BTDATAOFF; 3781573Srgrimes root->upper = t->bt_psize; 3791573Srgrimes root->flags = P_BLEAF; 3801573Srgrimes memset(meta, 0, t->bt_psize); 3811573Srgrimes mpool_put(t->bt_mp, meta, MPOOL_DIRTY); 3821573Srgrimes mpool_put(t->bt_mp, root, MPOOL_DIRTY); 3831573Srgrimes return (RET_SUCCESS); 3841573Srgrimes} 3851573Srgrimes 3861573Srgrimesstatic int 3871573Srgrimestmp() 3881573Srgrimes{ 3891573Srgrimes sigset_t set, oset; 3901573Srgrimes int fd; 3911573Srgrimes char *envtmp; 3921573Srgrimes char path[MAXPATHLEN]; 3931573Srgrimes 3941573Srgrimes envtmp = getenv("TMPDIR"); 3951573Srgrimes (void)snprintf(path, 3961573Srgrimes sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp"); 3971573Srgrimes 3981573Srgrimes (void)sigfillset(&set); 3991573Srgrimes (void)sigprocmask(SIG_BLOCK, &set, &oset); 4001573Srgrimes if ((fd = mkstemp(path)) != -1) 4011573Srgrimes (void)unlink(path); 4021573Srgrimes (void)sigprocmask(SIG_SETMASK, &oset, NULL); 4031573Srgrimes return(fd); 4041573Srgrimes} 4051573Srgrimes 4061573Srgrimesstatic int 4071573Srgrimesbyteorder() 4081573Srgrimes{ 4091573Srgrimes u_int32_t x; 4101573Srgrimes u_char *p; 4111573Srgrimes 4121573Srgrimes x = 0x01020304; 4131573Srgrimes p = (u_char *)&x; 4141573Srgrimes switch (*p) { 4151573Srgrimes case 1: 4161573Srgrimes return (BIG_ENDIAN); 4171573Srgrimes case 4: 4181573Srgrimes return (LITTLE_ENDIAN); 4191573Srgrimes default: 4201573Srgrimes return (0); 4211573Srgrimes } 4221573Srgrimes} 4231573Srgrimes 4241573Srgrimesint 4251573Srgrimes__bt_fd(dbp) 4261573Srgrimes const DB *dbp; 4271573Srgrimes{ 4281573Srgrimes BTREE *t; 4291573Srgrimes 4301573Srgrimes t = dbp->internal; 4311573Srgrimes 4321573Srgrimes /* Toss any page pinned across calls. */ 4331573Srgrimes if (t->bt_pinned != NULL) { 4341573Srgrimes mpool_put(t->bt_mp, t->bt_pinned, 0); 4351573Srgrimes t->bt_pinned = NULL; 4361573Srgrimes } 4371573Srgrimes 4381573Srgrimes /* In-memory database can't have a file descriptor. */ 43914287Spst if (F_ISSET(t, B_INMEM)) { 4401573Srgrimes errno = ENOENT; 4411573Srgrimes return (-1); 4421573Srgrimes } 4431573Srgrimes return (t->bt_fd); 4441573Srgrimes} 445