bt_open.c revision 68234
16059Samurai/*- 26059Samurai * Copyright (c) 1990, 1993, 1994 36059Samurai * The Regents of the University of California. All rights reserved. 46059Samurai * 56059Samurai * This code is derived from software contributed to Berkeley by 66059Samurai * Mike Olson. 76059Samurai * 86059Samurai * Redistribution and use in source and binary forms, with or without 96059Samurai * modification, are permitted provided that the following conditions 106059Samurai * are met: 116059Samurai * 1. Redistributions of source code must retain the above copyright 126059Samurai * notice, this list of conditions and the following disclaimer. 136059Samurai * 2. Redistributions in binary form must reproduce the above copyright 146059Samurai * notice, this list of conditions and the following disclaimer in the 156059Samurai * documentation and/or other materials provided with the distribution. 166059Samurai * 3. All advertising materials mentioning features or use of this software 176059Samurai * must display the following acknowledgement: 186059Samurai * This product includes software developed by the University of 196059Samurai * California, Berkeley and its contributors. 2032860Sbrian * 4. Neither the name of the University nor the names of its contributors 218857Srgrimes * may be used to endorse or promote products derived from this software 226059Samurai * without specific prior written permission. 236059Samurai * 246059Samurai * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 256059Samurai * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2630715Sbrian * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2731195Sbrian * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2831195Sbrian * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2930715Sbrian * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 3032663Sbrian * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 3132663Sbrian * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 3230715Sbrian * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3330715Sbrian * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3430715Sbrian * SUCH DAMAGE. 3530715Sbrian * 3630715Sbrian * $FreeBSD: head/lib/libc/db/btree/bt_open.c 68234 2000-11-02 10:14:09Z kris $ 3730715Sbrian */ 386059Samurai 3911336Samurai#if defined(LIBC_SCCS) && !defined(lint) 4030715Sbrianstatic char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94"; 4130715Sbrian#endif /* LIBC_SCCS and not lint */ 4230715Sbrian 4330715Sbrian/* 446059Samurai * Implementation of btree access method for 4.4BSD. 4530715Sbrian * 466059Samurai * The design here was originally based on that of the btree access method 4718786Sjkh * used in the Postgres database system at UC Berkeley. This implementation 4830715Sbrian * is wholly independent of the Postgres code. 4931343Sbrian */ 5030715Sbrian 5130715Sbrian#include <sys/param.h> 5230715Sbrian#include <sys/stat.h> 5331061Sbrian 5430715Sbrian#include <errno.h> 5530715Sbrian#include <fcntl.h> 566059Samurai#include <limits.h> 576059Samurai#include <signal.h> 586059Samurai#include <stdio.h> 5931514Sbrian#include <stdlib.h> 6013389Sphk#include <string.h> 616059Samurai#include <unistd.h> 6226142Sbrian 636059Samurai#include <db.h> 646735Samurai#include "btree.h" 657001Samurai 6613389Sphk#ifdef DEBUG 6713389Sphk#undef MINPSIZE 6823840Sbrian#define MINPSIZE 128 6926940Sbrian#endif 7030715Sbrian 7130715Sbrianstatic int byteorder __P((void)); 7230715Sbrianstatic int nroot __P((BTREE *)); 7331158Sbrianstatic int tmp __P((void)); 7431195Sbrian 7531343Sbrian/* 766059Samurai * __BT_OPEN -- Open a btree. 776735Samurai * 786735Samurai * Creates and fills a DB struct, and calls the routine that actually 796735Samurai * opens the btree. 806735Samurai * 816735Samurai * Parameters: 826735Samurai * fname: filename (NULL for in-memory trees) 8330715Sbrian * flags: open flag bits 8430715Sbrian * mode: open permission bits 856059Samurai * b: BTREEINFO pointer 8628679Sbrian * 8728679Sbrian * Returns: 8820813Sjkh * NULL on failure, pointer to DB on success. 8925634Sbrian * 9027061Sbrian */ 916059SamuraiDB * 9230715Sbrian__bt_open(fname, flags, mode, openinfo, dflags) 9330715Sbrian const char *fname; 9431343Sbrian int flags, mode, dflags; 9530715Sbrian const BTREEINFO *openinfo; 966059Samurai{ 9726858Sbrian struct stat sb; 986059Samurai BTMETA m; 996059Samurai BTREE *t; 1006059Samurai BTREEINFO b; 1016059Samurai DB *dbp; 10232129Sbrian pgno_t ncache; 10325630Sbrian ssize_t nr; 10428679Sbrian int machine_lorder; 10532129Sbrian 10625630Sbrian t = NULL; 1076059Samurai 10828679Sbrian /* 1096059Samurai * Intention is to make sure all of the user's selections are okay 1106059Samurai * here and then use them without checking. Can't be complete, since 1116059Samurai * we don't know the right page size, lorder or flags until the backing 11226858Sbrian * file is opened. Also, the file's page size can cause the cachesize 11326858Sbrian * to change. 1146059Samurai */ 1156059Samurai machine_lorder = byteorder(); 1166059Samurai if (openinfo) { 11732129Sbrian b = *openinfo; 1186059Samurai 1196059Samurai /* Flags: R_DUP. */ 1206059Samurai if (b.flags & ~(R_DUP)) 1216059Samurai goto einval; 1226059Samurai 1236059Samurai /* 12410528Samurai * Page size must be indx_t aligned and >= MINPSIZE. Default 12528679Sbrian * page size is set farther on, based on the underlying file 1266059Samurai * transfer size. 1276059Samurai */ 1286059Samurai if (b.psize && 1296059Samurai (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || 1306059Samurai b.psize & (sizeof(indx_t) - 1) )) 1316059Samurai goto einval; 13232129Sbrian 13328679Sbrian /* Minimum number of keys per page; absolute minimum is 2. */ 1346059Samurai if (b.minkeypage) { 1356059Samurai if (b.minkeypage < 2) 13632129Sbrian goto einval; 13732129Sbrian } else 13825630Sbrian b.minkeypage = DEFMINKEYPAGE; 13928679Sbrian 14032129Sbrian /* If no comparison, use default comparison and prefix. */ 14125630Sbrian if (b.compare == NULL) { 1426059Samurai b.compare = __bt_defcmp; 14328679Sbrian if (b.prefix == NULL) 14428679Sbrian b.prefix = __bt_defpfx; 1456059Samurai } 1466059Samurai 1476059Samurai if (b.lorder == 0) 1486059Samurai b.lorder = machine_lorder; 1496059Samurai } else { 1506059Samurai b.compare = __bt_defcmp; 1516059Samurai b.cachesize = 0; 1526059Samurai b.flags = 0; 1536059Samurai b.lorder = machine_lorder; 1546059Samurai b.minkeypage = DEFMINKEYPAGE; 15532129Sbrian b.prefix = __bt_defpfx; 15632129Sbrian b.psize = 0; 15725630Sbrian } 15828679Sbrian 15932129Sbrian /* Check for the ubiquitous PDP-11. */ 16025630Sbrian if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) 1616059Samurai goto einval; 1626059Samurai 1636059Samurai /* Allocate and initialize DB and BTREE structures. */ 1646059Samurai if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL) 16510528Samurai goto err; 1666059Samurai memset(t, 0, sizeof(BTREE)); 1676059Samurai t->bt_fd = -1; /* Don't close unopened fd on error. */ 1686059Samurai t->bt_lorder = b.lorder; 16932129Sbrian t->bt_order = NOT; 17025630Sbrian t->bt_cmp = b.compare; 17128679Sbrian t->bt_pfx = b.prefix; 17232129Sbrian t->bt_rfd = -1; 17325630Sbrian 17432129Sbrian if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL) 17510528Samurai goto err; 17610528Samurai memset(t->bt_dbp, 0, sizeof(DB)); 17710528Samurai if (t->bt_lorder != machine_lorder) 17828679Sbrian F_SET(t, B_NEEDSWAP); 17910528Samurai 18032021Sbrian dbp->type = DB_BTREE; 18131081Sbrian dbp->internal = t; 18230825Sbrian dbp->close = __bt_close; 18330825Sbrian dbp->del = __bt_delete; 18430697Sbrian dbp->fd = __bt_fd; 18531121Sbrian dbp->get = __bt_get; 18631061Sbrian dbp->put = __bt_put; 18723863Sbrian dbp->seq = __bt_seq; 18823863Sbrian dbp->sync = __bt_sync; 18928679Sbrian 19028679Sbrian /* 19128679Sbrian * If no file name was supplied, this is an in-memory btree and we 19223863Sbrian * open a backing temporary file. Otherwise, it's a disk-based tree. 19328679Sbrian */ 19423863Sbrian if (fname) { 19523863Sbrian switch (flags & O_ACCMODE) { 19628679Sbrian case O_RDONLY: 19710528Samurai F_SET(t, B_RDONLY); 19831061Sbrian break; 1996059Samurai case O_RDWR: 2006059Samurai break; 2016059Samurai case O_WRONLY: 2026059Samurai default: 2036059Samurai goto einval; 20428679Sbrian } 2056059Samurai 20626858Sbrian if ((t->bt_fd = _open(fname, flags, mode)) < 0) 20731121Sbrian goto err; 20827157Sbrian 20928679Sbrian } else { 21028684Sbrian if ((flags & O_ACCMODE) != O_RDWR) 21128684Sbrian goto einval; 21230715Sbrian if ((t->bt_fd = tmp()) == -1) 21331121Sbrian goto err; 2146059Samurai F_SET(t, B_INMEM); 2156059Samurai } 2166059Samurai 21728679Sbrian if (_fcntl(t->bt_fd, F_SETFD, 1) == -1) 2186059Samurai goto err; 21928679Sbrian 22028679Sbrian if (fstat(t->bt_fd, &sb)) 22128679Sbrian goto err; 22228679Sbrian if (sb.st_size) { 22328679Sbrian if ((nr = _read(t->bt_fd, &m, sizeof(BTMETA))) < 0) 22428679Sbrian goto err; 22528679Sbrian if (nr != sizeof(BTMETA)) 22628679Sbrian goto eftype; 2276059Samurai 2286059Samurai /* 22910528Samurai * Read in the meta-data. This can change the notion of what 23031343Sbrian * the lorder, page size and flags are, and, when the page size 23110528Samurai * changes, the cachesize value can change too. If the user 23223840Sbrian * specified the wrong byte order for an existing database, we 23323840Sbrian * don't bother to return an error, we just clear the NEEDSWAP 23432129Sbrian * bit. 23510528Samurai */ 23610528Samurai if (m.magic == BTREEMAGIC) 23710528Samurai F_CLR(t, B_NEEDSWAP); 23828679Sbrian else { 23910528Samurai F_SET(t, B_NEEDSWAP); 24023840Sbrian M_32_SWAP(m.magic); 24110528Samurai M_32_SWAP(m.version); 24223840Sbrian M_32_SWAP(m.psize); 24310528Samurai M_32_SWAP(m.free); 24410528Samurai M_32_SWAP(m.nrecs); 24510528Samurai M_32_SWAP(m.flags); 24626940Sbrian } 24728679Sbrian if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) 24826940Sbrian goto eftype; 24926940Sbrian if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || 25028679Sbrian m.psize & (sizeof(indx_t) - 1) ) 25131081Sbrian goto eftype; 25231081Sbrian if (m.flags & ~SAVEMETA) 25328679Sbrian goto eftype; 25429083Sbrian b.psize = m.psize; 25529083Sbrian F_SET(t, m.flags); 25626940Sbrian t->bt_free = m.free; 25726940Sbrian t->bt_nrecs = m.nrecs; 25831081Sbrian } else { 25931081Sbrian /* 26031081Sbrian * Set the page size to the best value for I/O to this file. 26131081Sbrian * Don't overflow the page offset type. 26231081Sbrian */ 26331081Sbrian if (b.psize == 0) { 26431081Sbrian b.psize = sb.st_blksize; 26531081Sbrian if (b.psize < MINPSIZE) 26631343Sbrian b.psize = MINPSIZE; 26725908Sbrian if (b.psize > MAX_PAGE_OFFSET + 1) 26825908Sbrian b.psize = MAX_PAGE_OFFSET + 1; 26925908Sbrian } 27031343Sbrian 27131343Sbrian /* Set flag if duplicates permitted. */ 27231343Sbrian if (!(b.flags & R_DUP)) 27331343Sbrian F_SET(t, B_NODUPS); 27410528Samurai 27531962Sbrian t->bt_free = P_INVALID; 27625908Sbrian t->bt_nrecs = 0; 27725908Sbrian F_SET(t, B_METADIRTY); 27825908Sbrian } 27925908Sbrian 28025908Sbrian t->bt_psize = b.psize; 28130715Sbrian 28231343Sbrian /* Set the cache size; must be a multiple of the page size. */ 2836059Samurai if (b.cachesize && b.cachesize & (b.psize - 1) ) 28420120Snate b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1; 28531343Sbrian if (b.cachesize < b.psize * MINCACHE) 28631343Sbrian b.cachesize = b.psize * MINCACHE; 28731343Sbrian 28831343Sbrian /* Calculate number of pages to cache. */ 28931343Sbrian ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; 2906059Samurai 2916059Samurai /* 2926059Samurai * The btree data structure requires that at least two keys can fit on 29331197Sbrian * a page, but other than that there's no fixed requirement. The user 2946059Samurai * specified a minimum number per page, and we translated that into the 2956059Samurai * number of bytes a key/data pair can use before being placed on an 2966059Samurai * overflow page. This calculation includes the page header, the size 2976059Samurai * of the index referencing the leaf item and the size of the leaf item 2986059Samurai * structure. Also, don't let the user specify a minkeypage such that 2996059Samurai * a key/data pair won't fit even if both key and data are on overflow 30031121Sbrian * pages. 3016059Samurai */ 3026059Samurai t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - 30331121Sbrian (sizeof(indx_t) + NBLEAFDBT(0, 0)); 3046059Samurai if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) 30531121Sbrian t->bt_ovflsize = 30631121Sbrian NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); 30731121Sbrian 30831121Sbrian /* Initialize the buffer pool. */ 30931121Sbrian if ((t->bt_mp = 3106059Samurai mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) 31131121Sbrian goto err; 31231121Sbrian if (!F_ISSET(t, B_INMEM)) 3136059Samurai mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); 31431121Sbrian 31531121Sbrian /* Create a root page if new tree. */ 31631121Sbrian if (nroot(t) == RET_ERROR) 31731121Sbrian goto err; 31831343Sbrian 31931121Sbrian /* Global flags. */ 32026142Sbrian if (dflags & DB_LOCK) 32128679Sbrian F_SET(t, B_DB_LOCK); 32226142Sbrian if (dflags & DB_SHMEM) 32328679Sbrian F_SET(t, B_DB_SHMEM); 32428679Sbrian if (dflags & DB_TXN) 32531343Sbrian F_SET(t, B_DB_TXN); 32628679Sbrian 3276059Samurai return (dbp); 3286059Samurai 32928679Sbrianeinval: errno = EINVAL; 33028679Sbrian goto err; 3316059Samurai 3326059Samuraieftype: errno = EFTYPE; 3336059Samurai goto err; 3346059Samurai 3356059Samuraierr: if (t) { 3366059Samurai if (t->bt_dbp) 3376059Samurai free(t->bt_dbp); 3386059Samurai if (t->bt_fd != -1) 3396059Samurai (void)_close(t->bt_fd); 3406059Samurai free(t); 34131197Sbrian } 34231197Sbrian return (NULL); 3436059Samurai} 3446059Samurai 34526940Sbrian/* 34628679Sbrian * NROOT -- Create the root of a new tree. 3476059Samurai * 34825707Sbrian * Parameters: 34931197Sbrian * t: tree 35031823Sbrian * 35126516Sbrian * Returns: 35231823Sbrian * RET_ERROR, RET_SUCCESS 35331823Sbrian */ 35431823Sbrianstatic int 35531823Sbriannroot(t) 35631823Sbrian BTREE *t; 35731823Sbrian{ 35831823Sbrian PAGE *meta, *root; 35931823Sbrian pgno_t npg; 36031823Sbrian 36131823Sbrian if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) { 36226551Sbrian mpool_put(t->bt_mp, meta, 0); 36330715Sbrian return (RET_SUCCESS); 36428679Sbrian } 36526516Sbrian if (errno != EINVAL) /* It's OK to not exist. */ 36632129Sbrian return (RET_ERROR); 36732129Sbrian errno = 0; 36828679Sbrian 36928679Sbrian if ((meta = mpool_new(t->bt_mp, &npg)) == NULL) 37031197Sbrian return (RET_ERROR); 37131121Sbrian 37226551Sbrian if ((root = mpool_new(t->bt_mp, &npg)) == NULL) 37331121Sbrian return (RET_ERROR); 37431121Sbrian 37531158Sbrian if (npg != P_ROOT) 37631158Sbrian return (RET_ERROR); 37731158Sbrian root->pgno = npg; 37831158Sbrian root->prevpg = root->nextpg = P_INVALID; 37931158Sbrian root->lower = BTDATAOFF; 38031158Sbrian root->upper = t->bt_psize; 38131158Sbrian root->flags = P_BLEAF; 38231158Sbrian memset(meta, 0, t->bt_psize); 38331158Sbrian mpool_put(t->bt_mp, meta, MPOOL_DIRTY); 38431158Sbrian mpool_put(t->bt_mp, root, MPOOL_DIRTY); 38531158Sbrian return (RET_SUCCESS); 38631158Sbrian} 38731158Sbrian 38831158Sbrianstatic int 38931158Sbriantmp() 39031197Sbrian{ 39131121Sbrian sigset_t set, oset; 39231157Sbrian int fd; 39331157Sbrian char *envtmp = NULL; 39431197Sbrian char path[MAXPATHLEN]; 39531157Sbrian 39631157Sbrian if (issetugid() == 0) 39731157Sbrian envtmp = getenv("TMPDIR"); 39831121Sbrian (void)snprintf(path, 39929083Sbrian sizeof(path), "%s/bt.XXXXXXXXXX", envtmp ? envtmp : "/tmp"); 40031121Sbrian 40131196Sbrian (void)sigfillset(&set); 40231196Sbrian (void)sigprocmask(SIG_BLOCK, &set, &oset); 40332833Sbrian if ((fd = mkstemp(path)) != -1) 4046059Samurai (void)unlink(path); 4056059Samurai (void)sigprocmask(SIG_SETMASK, &oset, NULL); 40631285Sbrian return(fd); 40731285Sbrian} 40831285Sbrian 40926516Sbrianstatic int 41026516Sbrianbyteorder() 4116059Samurai{ 4126059Samurai u_int32_t x; 41331690Sbrian u_char *p; 41426940Sbrian 4156059Samurai x = 0x01020304; 41632351Sbrian p = (u_char *)&x; 41732833Sbrian switch (*p) { 41831197Sbrian case 1: 41926516Sbrian return (BIG_ENDIAN); 42028679Sbrian case 4: 42128679Sbrian return (LITTLE_ENDIAN); 42226940Sbrian default: 4236059Samurai return (0); 42431121Sbrian } 42527157Sbrian} 42623840Sbrian 42727157Sbrianint 42823840Sbrian__bt_fd(dbp) 4296735Samurai const DB *dbp; 43024753Sache{ 4316735Samurai BTREE *t; 4326735Samurai 43323840Sbrian t = dbp->internal; 4346735Samurai 43528679Sbrian /* Toss any page pinned across calls. */ 43610528Samurai if (t->bt_pinned != NULL) { 43726940Sbrian mpool_put(t->bt_mp, t->bt_pinned, 0); 43810528Samurai t->bt_pinned = NULL; 43910528Samurai } 44026940Sbrian 44110528Samurai /* In-memory database can't have a file descriptor. */ 44210528Samurai if (F_ISSET(t, B_INMEM)) { 44326940Sbrian errno = ENOENT; 44410528Samurai return (-1); 44526940Sbrian } 44631121Sbrian return (t->bt_fd); 44726940Sbrian} 44826940Sbrian