11573Srgrimes/*- 214287Spst * Copyright (c) 1990, 1993, 1994 31573Srgrimes * The Regents of the University of California. All rights reserved. 41573Srgrimes * 51573Srgrimes * This code is derived from software contributed to Berkeley by 61573Srgrimes * Mike Olson. 71573Srgrimes * 81573Srgrimes * Redistribution and use in source and binary forms, with or without 91573Srgrimes * modification, are permitted provided that the following conditions 101573Srgrimes * are met: 111573Srgrimes * 1. Redistributions of source code must retain the above copyright 121573Srgrimes * notice, this list of conditions and the following disclaimer. 131573Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141573Srgrimes * notice, this list of conditions and the following disclaimer in the 151573Srgrimes * documentation and/or other materials provided with the distribution. 161573Srgrimes * 4. Neither the name of the University nor the names of its contributors 171573Srgrimes * may be used to endorse or promote products derived from this software 181573Srgrimes * without specific prior written permission. 191573Srgrimes * 201573Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 211573Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 221573Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 231573Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241573Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251573Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261573Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271573Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281573Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291573Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301573Srgrimes * SUCH DAMAGE. 311573Srgrimes */ 321573Srgrimes 331573Srgrimes#if defined(LIBC_SCCS) && !defined(lint) 3414287Spststatic char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94"; 351573Srgrimes#endif /* LIBC_SCCS and not lint */ 3692986Sobrien#include <sys/cdefs.h> 3792986Sobrien__FBSDID("$FreeBSD$"); 381573Srgrimes 391573Srgrimes/* 401573Srgrimes * Implementation of btree access method for 4.4BSD. 411573Srgrimes * 421573Srgrimes * The design here was originally based on that of the btree access method 431573Srgrimes * used in the Postgres database system at UC Berkeley. This implementation 441573Srgrimes * is wholly independent of the Postgres code. 451573Srgrimes */ 461573Srgrimes 4771579Sdeischen#include "namespace.h" 481573Srgrimes#include <sys/param.h> 491573Srgrimes#include <sys/stat.h> 501573Srgrimes 511573Srgrimes#include <errno.h> 521573Srgrimes#include <fcntl.h> 531573Srgrimes#include <limits.h> 541573Srgrimes#include <signal.h> 551573Srgrimes#include <stdio.h> 561573Srgrimes#include <stdlib.h> 571573Srgrimes#include <string.h> 581573Srgrimes#include <unistd.h> 5971579Sdeischen#include "un-namespace.h" 60287292Skib#include "libc_private.h" 611573Srgrimes 621573Srgrimes#include <db.h> 631573Srgrimes#include "btree.h" 641573Srgrimes 6514287Spst#ifdef DEBUG 6614287Spst#undef MINPSIZE 6714287Spst#define MINPSIZE 128 6814287Spst#endif 6914287Spst 7092905Sobrienstatic int byteorder(void); 7192905Sobrienstatic int nroot(BTREE *); 7292905Sobrienstatic int tmp(void); 731573Srgrimes 741573Srgrimes/* 751573Srgrimes * __BT_OPEN -- Open a btree. 761573Srgrimes * 771573Srgrimes * Creates and fills a DB struct, and calls the routine that actually 781573Srgrimes * opens the btree. 791573Srgrimes * 801573Srgrimes * Parameters: 811573Srgrimes * fname: filename (NULL for in-memory trees) 821573Srgrimes * flags: open flag bits 831573Srgrimes * mode: open permission bits 841573Srgrimes * b: BTREEINFO pointer 851573Srgrimes * 861573Srgrimes * Returns: 871573Srgrimes * NULL on failure, pointer to DB on success. 881573Srgrimes * 891573Srgrimes */ 901573SrgrimesDB * 91189291Sdelphij__bt_open(const char *fname, int flags, int mode, const BTREEINFO *openinfo, int dflags) 921573Srgrimes{ 931573Srgrimes struct stat sb; 941573Srgrimes BTMETA m; 951573Srgrimes BTREE *t; 961573Srgrimes BTREEINFO b; 971573Srgrimes DB *dbp; 981573Srgrimes pgno_t ncache; 991573Srgrimes ssize_t nr; 100190344Sdelphij int machine_lorder, saved_errno; 1011573Srgrimes 1021573Srgrimes t = NULL; 1031573Srgrimes 1041573Srgrimes /* 1051573Srgrimes * Intention is to make sure all of the user's selections are okay 1061573Srgrimes * here and then use them without checking. Can't be complete, since 1071573Srgrimes * we don't know the right page size, lorder or flags until the backing 1081573Srgrimes * file is opened. Also, the file's page size can cause the cachesize 1091573Srgrimes * to change. 1101573Srgrimes */ 1111573Srgrimes machine_lorder = byteorder(); 1121573Srgrimes if (openinfo) { 1131573Srgrimes b = *openinfo; 1141573Srgrimes 1151573Srgrimes /* Flags: R_DUP. */ 1161573Srgrimes if (b.flags & ~(R_DUP)) 1171573Srgrimes goto einval; 1181573Srgrimes 1191573Srgrimes /* 1201573Srgrimes * Page size must be indx_t aligned and >= MINPSIZE. Default 1211573Srgrimes * page size is set farther on, based on the underlying file 1221573Srgrimes * transfer size. 1231573Srgrimes */ 1241573Srgrimes if (b.psize && 1251573Srgrimes (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || 12617141Sjkh b.psize & (sizeof(indx_t) - 1) )) 1271573Srgrimes goto einval; 1281573Srgrimes 1291573Srgrimes /* Minimum number of keys per page; absolute minimum is 2. */ 1301573Srgrimes if (b.minkeypage) { 1311573Srgrimes if (b.minkeypage < 2) 1321573Srgrimes goto einval; 1331573Srgrimes } else 1341573Srgrimes b.minkeypage = DEFMINKEYPAGE; 1351573Srgrimes 1361573Srgrimes /* If no comparison, use default comparison and prefix. */ 1371573Srgrimes if (b.compare == NULL) { 1381573Srgrimes b.compare = __bt_defcmp; 1391573Srgrimes if (b.prefix == NULL) 1401573Srgrimes b.prefix = __bt_defpfx; 1411573Srgrimes } 1421573Srgrimes 1431573Srgrimes if (b.lorder == 0) 1441573Srgrimes b.lorder = machine_lorder; 1451573Srgrimes } else { 1461573Srgrimes b.compare = __bt_defcmp; 1471573Srgrimes b.cachesize = 0; 1481573Srgrimes b.flags = 0; 1491573Srgrimes b.lorder = machine_lorder; 1501573Srgrimes b.minkeypage = DEFMINKEYPAGE; 1511573Srgrimes b.prefix = __bt_defpfx; 1521573Srgrimes b.psize = 0; 1531573Srgrimes } 1541573Srgrimes 1551573Srgrimes /* Check for the ubiquitous PDP-11. */ 1561573Srgrimes if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) 1571573Srgrimes goto einval; 1581573Srgrimes 1591573Srgrimes /* Allocate and initialize DB and BTREE structures. */ 160190482Sdelphij if ((t = (BTREE *)calloc(1, sizeof(BTREE))) == NULL) 1611573Srgrimes goto err; 1621573Srgrimes t->bt_fd = -1; /* Don't close unopened fd on error. */ 1631573Srgrimes t->bt_lorder = b.lorder; 1641573Srgrimes t->bt_order = NOT; 1651573Srgrimes t->bt_cmp = b.compare; 1661573Srgrimes t->bt_pfx = b.prefix; 1671573Srgrimes t->bt_rfd = -1; 1681573Srgrimes 169190482Sdelphij if ((t->bt_dbp = dbp = (DB *)calloc(1, sizeof(DB))) == NULL) 1701573Srgrimes goto err; 1711573Srgrimes if (t->bt_lorder != machine_lorder) 17214287Spst F_SET(t, B_NEEDSWAP); 1731573Srgrimes 1741573Srgrimes dbp->type = DB_BTREE; 1751573Srgrimes dbp->internal = t; 1761573Srgrimes dbp->close = __bt_close; 1771573Srgrimes dbp->del = __bt_delete; 1781573Srgrimes dbp->fd = __bt_fd; 1791573Srgrimes dbp->get = __bt_get; 1801573Srgrimes dbp->put = __bt_put; 1811573Srgrimes dbp->seq = __bt_seq; 1821573Srgrimes dbp->sync = __bt_sync; 1831573Srgrimes 1841573Srgrimes /* 1851573Srgrimes * If no file name was supplied, this is an in-memory btree and we 1861573Srgrimes * open a backing temporary file. Otherwise, it's a disk-based tree. 1871573Srgrimes */ 1881573Srgrimes if (fname) { 18914287Spst switch (flags & O_ACCMODE) { 1901573Srgrimes case O_RDONLY: 19114287Spst F_SET(t, B_RDONLY); 1921573Srgrimes break; 1931573Srgrimes case O_RDWR: 1941573Srgrimes break; 1951573Srgrimes case O_WRONLY: 1961573Srgrimes default: 1971573Srgrimes goto einval; 1981573Srgrimes } 199189327Sdelphij 200254289Sjilles if ((t->bt_fd = _open(fname, flags | O_CLOEXEC, mode)) < 0) 2011573Srgrimes goto err; 2021573Srgrimes 2031573Srgrimes } else { 2041573Srgrimes if ((flags & O_ACCMODE) != O_RDWR) 2051573Srgrimes goto einval; 2061573Srgrimes if ((t->bt_fd = tmp()) == -1) 2071573Srgrimes goto err; 20814287Spst F_SET(t, B_INMEM); 2091573Srgrimes } 2101573Srgrimes 21171579Sdeischen if (_fstat(t->bt_fd, &sb)) 2121573Srgrimes goto err; 2131573Srgrimes if (sb.st_size) { 21456698Sjasone if ((nr = _read(t->bt_fd, &m, sizeof(BTMETA))) < 0) 2151573Srgrimes goto err; 2161573Srgrimes if (nr != sizeof(BTMETA)) 2171573Srgrimes goto eftype; 2181573Srgrimes 2191573Srgrimes /* 2201573Srgrimes * Read in the meta-data. This can change the notion of what 2211573Srgrimes * the lorder, page size and flags are, and, when the page size 2221573Srgrimes * changes, the cachesize value can change too. If the user 2231573Srgrimes * specified the wrong byte order for an existing database, we 2241573Srgrimes * don't bother to return an error, we just clear the NEEDSWAP 2251573Srgrimes * bit. 2261573Srgrimes */ 22714287Spst if (m.magic == BTREEMAGIC) 22814287Spst F_CLR(t, B_NEEDSWAP); 2291573Srgrimes else { 23014287Spst F_SET(t, B_NEEDSWAP); 23114287Spst M_32_SWAP(m.magic); 23214287Spst M_32_SWAP(m.version); 23314287Spst M_32_SWAP(m.psize); 23414287Spst M_32_SWAP(m.free); 23514287Spst M_32_SWAP(m.nrecs); 23614287Spst M_32_SWAP(m.flags); 2371573Srgrimes } 23814287Spst if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) 2391573Srgrimes goto eftype; 24014287Spst if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || 24117141Sjkh m.psize & (sizeof(indx_t) - 1) ) 2421573Srgrimes goto eftype; 24314287Spst if (m.flags & ~SAVEMETA) 2441573Srgrimes goto eftype; 24514287Spst b.psize = m.psize; 24614287Spst F_SET(t, m.flags); 24714287Spst t->bt_free = m.free; 24814287Spst t->bt_nrecs = m.nrecs; 2491573Srgrimes } else { 2501573Srgrimes /* 2511573Srgrimes * Set the page size to the best value for I/O to this file. 2521573Srgrimes * Don't overflow the page offset type. 2531573Srgrimes */ 2541573Srgrimes if (b.psize == 0) { 2551573Srgrimes b.psize = sb.st_blksize; 2561573Srgrimes if (b.psize < MINPSIZE) 2571573Srgrimes b.psize = MINPSIZE; 2581573Srgrimes if (b.psize > MAX_PAGE_OFFSET + 1) 2591573Srgrimes b.psize = MAX_PAGE_OFFSET + 1; 2601573Srgrimes } 2611573Srgrimes 2621573Srgrimes /* Set flag if duplicates permitted. */ 2631573Srgrimes if (!(b.flags & R_DUP)) 26414287Spst F_SET(t, B_NODUPS); 2651573Srgrimes 2661573Srgrimes t->bt_free = P_INVALID; 2671573Srgrimes t->bt_nrecs = 0; 26814287Spst F_SET(t, B_METADIRTY); 2691573Srgrimes } 2701573Srgrimes 2711573Srgrimes t->bt_psize = b.psize; 2721573Srgrimes 2731573Srgrimes /* Set the cache size; must be a multiple of the page size. */ 27417141Sjkh if (b.cachesize && b.cachesize & (b.psize - 1) ) 27517141Sjkh b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1; 2761573Srgrimes if (b.cachesize < b.psize * MINCACHE) 2771573Srgrimes b.cachesize = b.psize * MINCACHE; 2781573Srgrimes 2791573Srgrimes /* Calculate number of pages to cache. */ 280298600Spfg ncache = howmany(b.cachesize, t->bt_psize); 2811573Srgrimes 2821573Srgrimes /* 2831573Srgrimes * The btree data structure requires that at least two keys can fit on 2841573Srgrimes * a page, but other than that there's no fixed requirement. The user 2851573Srgrimes * specified a minimum number per page, and we translated that into the 2861573Srgrimes * number of bytes a key/data pair can use before being placed on an 2871573Srgrimes * overflow page. This calculation includes the page header, the size 2881573Srgrimes * of the index referencing the leaf item and the size of the leaf item 2891573Srgrimes * structure. Also, don't let the user specify a minkeypage such that 2901573Srgrimes * a key/data pair won't fit even if both key and data are on overflow 2911573Srgrimes * pages. 2921573Srgrimes */ 2931573Srgrimes t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - 2941573Srgrimes (sizeof(indx_t) + NBLEAFDBT(0, 0)); 2951573Srgrimes if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) 2961573Srgrimes t->bt_ovflsize = 2971573Srgrimes NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); 2981573Srgrimes 2991573Srgrimes /* Initialize the buffer pool. */ 3001573Srgrimes if ((t->bt_mp = 3011573Srgrimes mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) 3021573Srgrimes goto err; 30314287Spst if (!F_ISSET(t, B_INMEM)) 3041573Srgrimes mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); 3051573Srgrimes 3061573Srgrimes /* Create a root page if new tree. */ 3071573Srgrimes if (nroot(t) == RET_ERROR) 3081573Srgrimes goto err; 3091573Srgrimes 3101573Srgrimes /* Global flags. */ 3111573Srgrimes if (dflags & DB_LOCK) 31214287Spst F_SET(t, B_DB_LOCK); 3131573Srgrimes if (dflags & DB_SHMEM) 31414287Spst F_SET(t, B_DB_SHMEM); 3151573Srgrimes if (dflags & DB_TXN) 31614287Spst F_SET(t, B_DB_TXN); 3171573Srgrimes 3181573Srgrimes return (dbp); 3191573Srgrimes 3201573Srgrimeseinval: errno = EINVAL; 3211573Srgrimes goto err; 3221573Srgrimes 3231573Srgrimeseftype: errno = EFTYPE; 3241573Srgrimes goto err; 3251573Srgrimes 326190344Sdelphijerr: saved_errno = errno; 327190344Sdelphij if (t) { 3281573Srgrimes if (t->bt_dbp) 3291573Srgrimes free(t->bt_dbp); 3301573Srgrimes if (t->bt_fd != -1) 33156698Sjasone (void)_close(t->bt_fd); 3321573Srgrimes free(t); 3331573Srgrimes } 334190344Sdelphij errno = saved_errno; 3351573Srgrimes return (NULL); 3361573Srgrimes} 3371573Srgrimes 3381573Srgrimes/* 3391573Srgrimes * NROOT -- Create the root of a new tree. 3401573Srgrimes * 3411573Srgrimes * Parameters: 3421573Srgrimes * t: tree 3431573Srgrimes * 3441573Srgrimes * Returns: 3451573Srgrimes * RET_ERROR, RET_SUCCESS 3461573Srgrimes */ 3471573Srgrimesstatic int 348189291Sdelphijnroot(BTREE *t) 3491573Srgrimes{ 3501573Srgrimes PAGE *meta, *root; 3511573Srgrimes pgno_t npg; 3521573Srgrimes 353190498Sdelphij if ((root = mpool_get(t->bt_mp, 1, 0)) != NULL) { 354190498Sdelphij if (root->lower == 0 && 355190498Sdelphij root->pgno == 0 && 356190498Sdelphij root->linp[0] == 0) { 357190498Sdelphij mpool_delete(t->bt_mp, root); 358190498Sdelphij errno = EINVAL; 359190498Sdelphij } else { 360190498Sdelphij mpool_put(t->bt_mp, root, 0); 361190498Sdelphij return (RET_SUCCESS); 362190498Sdelphij } 3631573Srgrimes } 36414287Spst if (errno != EINVAL) /* It's OK to not exist. */ 3651573Srgrimes return (RET_ERROR); 36614287Spst errno = 0; 3671573Srgrimes 368190498Sdelphij if ((meta = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) 3691573Srgrimes return (RET_ERROR); 3701573Srgrimes 371190498Sdelphij if ((root = mpool_new(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) 3721573Srgrimes return (RET_ERROR); 3731573Srgrimes 3741573Srgrimes if (npg != P_ROOT) 3751573Srgrimes return (RET_ERROR); 3761573Srgrimes root->pgno = npg; 3771573Srgrimes root->prevpg = root->nextpg = P_INVALID; 3781573Srgrimes root->lower = BTDATAOFF; 3791573Srgrimes root->upper = t->bt_psize; 3801573Srgrimes root->flags = P_BLEAF; 3811573Srgrimes memset(meta, 0, t->bt_psize); 3821573Srgrimes mpool_put(t->bt_mp, meta, MPOOL_DIRTY); 3831573Srgrimes mpool_put(t->bt_mp, root, MPOOL_DIRTY); 3841573Srgrimes return (RET_SUCCESS); 3851573Srgrimes} 3861573Srgrimes 3871573Srgrimesstatic int 388189291Sdelphijtmp(void) 3891573Srgrimes{ 3901573Srgrimes sigset_t set, oset; 391190485Sdelphij int fd, len; 39239058Simp char *envtmp = NULL; 3931573Srgrimes char path[MAXPATHLEN]; 3941573Srgrimes 39539058Simp if (issetugid() == 0) 39639058Simp envtmp = getenv("TMPDIR"); 397190485Sdelphij len = snprintf(path, 39868234Skris sizeof(path), "%s/bt.XXXXXXXXXX", envtmp ? envtmp : "/tmp"); 399190485Sdelphij if (len < 0 || len >= (int)sizeof(path)) { 400190485Sdelphij errno = ENAMETOOLONG; 401190485Sdelphij return(-1); 402190485Sdelphij } 4031573Srgrimes 4041573Srgrimes (void)sigfillset(&set); 405287292Skib (void)__libc_sigprocmask(SIG_BLOCK, &set, &oset); 406254289Sjilles if ((fd = mkostemp(path, O_CLOEXEC)) != -1) 4071573Srgrimes (void)unlink(path); 408287292Skib (void)__libc_sigprocmask(SIG_SETMASK, &oset, NULL); 4091573Srgrimes return(fd); 4101573Srgrimes} 4111573Srgrimes 4121573Srgrimesstatic int 413189291Sdelphijbyteorder(void) 4141573Srgrimes{ 4151573Srgrimes u_int32_t x; 4161573Srgrimes u_char *p; 4171573Srgrimes 4181573Srgrimes x = 0x01020304; 4191573Srgrimes p = (u_char *)&x; 4201573Srgrimes switch (*p) { 4211573Srgrimes case 1: 4221573Srgrimes return (BIG_ENDIAN); 4231573Srgrimes case 4: 4241573Srgrimes return (LITTLE_ENDIAN); 4251573Srgrimes default: 4261573Srgrimes return (0); 4271573Srgrimes } 4281573Srgrimes} 4291573Srgrimes 4301573Srgrimesint 431189291Sdelphij__bt_fd(const DB *dbp) 4321573Srgrimes{ 4331573Srgrimes BTREE *t; 4341573Srgrimes 4351573Srgrimes t = dbp->internal; 4361573Srgrimes 4371573Srgrimes /* Toss any page pinned across calls. */ 4381573Srgrimes if (t->bt_pinned != NULL) { 4391573Srgrimes mpool_put(t->bt_mp, t->bt_pinned, 0); 4401573Srgrimes t->bt_pinned = NULL; 4411573Srgrimes } 4421573Srgrimes 4431573Srgrimes /* In-memory database can't have a file descriptor. */ 44414287Spst if (F_ISSET(t, B_INMEM)) { 4451573Srgrimes errno = ENOENT; 4461573Srgrimes return (-1); 4471573Srgrimes } 4481573Srgrimes return (t->bt_fd); 4491573Srgrimes} 450