bt_open.c revision 190344
1285242Sachim/*- 2285242Sachim * Copyright (c) 1990, 1993, 1994 3285242Sachim * The Regents of the University of California. All rights reserved. 4285242Sachim * 5285242Sachim * This code is derived from software contributed to Berkeley by 6285242Sachim * Mike Olson. 7285242Sachim * 8285242Sachim * Redistribution and use in source and binary forms, with or without 9285242Sachim * modification, are permitted provided that the following conditions 10285242Sachim * are met: 11285242Sachim * 1. Redistributions of source code must retain the above copyright 12285242Sachim * notice, this list of conditions and the following disclaimer. 13285242Sachim * 2. Redistributions in binary form must reproduce the above copyright 14285242Sachim * notice, this list of conditions and the following disclaimer in the 15285242Sachim * documentation and/or other materials provided with the distribution. 16285242Sachim * 4. Neither the name of the University nor the names of its contributors 17285242Sachim * may be used to endorse or promote products derived from this software 18285242Sachim * without specific prior written permission. 19285242Sachim * 20285242Sachim * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21285242Sachim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22285242Sachim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23285242Sachim * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24285242Sachim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25285242Sachim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26285242Sachim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27285242Sachim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28285242Sachim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29285242Sachim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30285242Sachim * SUCH DAMAGE. 31285242Sachim */ 32285242Sachim 33285242Sachim#if defined(LIBC_SCCS) && !defined(lint) 34285242Sachimstatic char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94"; 35285242Sachim#endif /* LIBC_SCCS and not lint */ 36285242Sachim#include <sys/cdefs.h> 37285242Sachim__FBSDID("$FreeBSD: head/lib/libc/db/btree/bt_open.c 190344 2009-03-23 23:43:07Z delphij $"); 38285242Sachim 39285242Sachim/* 40285242Sachim * Implementation of btree access method for 4.4BSD. 41285242Sachim * 42285242Sachim * The design here was originally based on that of the btree access method 43285242Sachim * used in the Postgres database system at UC Berkeley. This implementation 44285242Sachim * is wholly independent of the Postgres code. 45285242Sachim */ 46285242Sachim 47285242Sachim#include "namespace.h" 48285242Sachim#include <sys/param.h> 49285242Sachim#include <sys/stat.h> 50285242Sachim 51285242Sachim#include <errno.h> 52285242Sachim#include <fcntl.h> 53285242Sachim#include <limits.h> 54285242Sachim#include <signal.h> 55285242Sachim#include <stdio.h> 56285242Sachim#include <stdlib.h> 57285242Sachim#include <string.h> 58285242Sachim#include <unistd.h> 59285242Sachim#include "un-namespace.h" 60285242Sachim 61285242Sachim#include <db.h> 62285242Sachim#include "btree.h" 63285242Sachim 64285242Sachim#ifdef DEBUG 65285242Sachim#undef MINPSIZE 66285242Sachim#define MINPSIZE 128 67285242Sachim#endif 68285242Sachim 69285242Sachimstatic int byteorder(void); 70285242Sachimstatic int nroot(BTREE *); 71285242Sachimstatic int tmp(void); 72285242Sachim 73285242Sachim/* 74285242Sachim * __BT_OPEN -- Open a btree. 75285242Sachim * 76285242Sachim * Creates and fills a DB struct, and calls the routine that actually 77285242Sachim * opens the btree. 78285242Sachim * 79285242Sachim * Parameters: 80285242Sachim * fname: filename (NULL for in-memory trees) 81285242Sachim * flags: open flag bits 82285242Sachim * mode: open permission bits 83285242Sachim * b: BTREEINFO pointer 84285242Sachim * 85285242Sachim * Returns: 86285242Sachim * NULL on failure, pointer to DB on success. 87285242Sachim * 88285242Sachim */ 89285242SachimDB * 90285242Sachim__bt_open(const char *fname, int flags, int mode, const BTREEINFO *openinfo, int dflags) 91285242Sachim{ 92285242Sachim struct stat sb; 93285242Sachim BTMETA m; 94285242Sachim BTREE *t; 95285242Sachim BTREEINFO b; 96285242Sachim DB *dbp; 97285242Sachim pgno_t ncache; 98285242Sachim ssize_t nr; 99285242Sachim int machine_lorder, saved_errno; 100285242Sachim 101285242Sachim t = NULL; 102285242Sachim 103285242Sachim /* 104285242Sachim * Intention is to make sure all of the user's selections are okay 105285242Sachim * here and then use them without checking. Can't be complete, since 106285242Sachim * we don't know the right page size, lorder or flags until the backing 107285242Sachim * file is opened. Also, the file's page size can cause the cachesize 108285242Sachim * to change. 109285242Sachim */ 110285242Sachim machine_lorder = byteorder(); 111285242Sachim if (openinfo) { 112285242Sachim b = *openinfo; 113285242Sachim 114285242Sachim /* Flags: R_DUP. */ 115285242Sachim if (b.flags & ~(R_DUP)) 116285242Sachim goto einval; 117285242Sachim 118285242Sachim /* 119285242Sachim * Page size must be indx_t aligned and >= MINPSIZE. Default 120285242Sachim * page size is set farther on, based on the underlying file 121285242Sachim * transfer size. 122285242Sachim */ 123285242Sachim if (b.psize && 124285242Sachim (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || 125285242Sachim b.psize & (sizeof(indx_t) - 1) )) 126285242Sachim goto einval; 127285242Sachim 128285242Sachim /* Minimum number of keys per page; absolute minimum is 2. */ 129285242Sachim if (b.minkeypage) { 130285242Sachim if (b.minkeypage < 2) 131285242Sachim goto einval; 132285242Sachim } else 133285242Sachim b.minkeypage = DEFMINKEYPAGE; 134285242Sachim 135285242Sachim /* If no comparison, use default comparison and prefix. */ 136285242Sachim if (b.compare == NULL) { 137285242Sachim b.compare = __bt_defcmp; 138285242Sachim if (b.prefix == NULL) 139285242Sachim b.prefix = __bt_defpfx; 140285242Sachim } 141285242Sachim 142285242Sachim if (b.lorder == 0) 143285242Sachim b.lorder = machine_lorder; 144285242Sachim } else { 145285242Sachim b.compare = __bt_defcmp; 146285242Sachim b.cachesize = 0; 147285242Sachim b.flags = 0; 148285242Sachim b.lorder = machine_lorder; 149285242Sachim b.minkeypage = DEFMINKEYPAGE; 150285242Sachim b.prefix = __bt_defpfx; 151285242Sachim b.psize = 0; 152285242Sachim } 153285242Sachim 154285242Sachim /* Check for the ubiquitous PDP-11. */ 155285242Sachim if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) 156285242Sachim goto einval; 157285242Sachim 158285242Sachim /* Allocate and initialize DB and BTREE structures. */ 159285242Sachim if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL) 160285242Sachim goto err; 161285242Sachim memset(t, 0, sizeof(BTREE)); 162285242Sachim t->bt_fd = -1; /* Don't close unopened fd on error. */ 163285242Sachim t->bt_lorder = b.lorder; 164285242Sachim t->bt_order = NOT; 165285242Sachim t->bt_cmp = b.compare; 166285242Sachim t->bt_pfx = b.prefix; 167285242Sachim t->bt_rfd = -1; 168285242Sachim 169285242Sachim if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL) 170285242Sachim goto err; 171285242Sachim memset(t->bt_dbp, 0, sizeof(DB)); 172285242Sachim if (t->bt_lorder != machine_lorder) 173285242Sachim F_SET(t, B_NEEDSWAP); 174285242Sachim 175285242Sachim dbp->type = DB_BTREE; 176285242Sachim dbp->internal = t; 177285242Sachim dbp->close = __bt_close; 178285242Sachim dbp->del = __bt_delete; 179285242Sachim dbp->fd = __bt_fd; 180285242Sachim dbp->get = __bt_get; 181285242Sachim dbp->put = __bt_put; 182285242Sachim dbp->seq = __bt_seq; 183285242Sachim dbp->sync = __bt_sync; 184285242Sachim 185285242Sachim /* 186285242Sachim * If no file name was supplied, this is an in-memory btree and we 187285242Sachim * open a backing temporary file. Otherwise, it's a disk-based tree. 188285242Sachim */ 189285242Sachim if (fname) { 190285242Sachim switch (flags & O_ACCMODE) { 191285242Sachim case O_RDONLY: 192285242Sachim F_SET(t, B_RDONLY); 193285242Sachim break; 194285242Sachim case O_RDWR: 195285242Sachim break; 196285242Sachim case O_WRONLY: 197285242Sachim default: 198285242Sachim goto einval; 199285242Sachim } 200285242Sachim 201285242Sachim if ((t->bt_fd = _open(fname, flags, mode)) < 0) 202285242Sachim goto err; 203285242Sachim 204285242Sachim } else { 205285242Sachim if ((flags & O_ACCMODE) != O_RDWR) 206285242Sachim goto einval; 207285242Sachim if ((t->bt_fd = tmp()) == -1) 208285242Sachim goto err; 209285242Sachim F_SET(t, B_INMEM); 210285242Sachim } 211285242Sachim 212285242Sachim if (_fcntl(t->bt_fd, F_SETFD, 1) == -1) 213285242Sachim goto err; 214285242Sachim 215285242Sachim if (_fstat(t->bt_fd, &sb)) 216285242Sachim goto err; 217285242Sachim if (sb.st_size) { 218285242Sachim if ((nr = _read(t->bt_fd, &m, sizeof(BTMETA))) < 0) 219285242Sachim goto err; 220285242Sachim if (nr != sizeof(BTMETA)) 221285242Sachim goto eftype; 222285242Sachim 223285242Sachim /* 224285242Sachim * Read in the meta-data. This can change the notion of what 225285242Sachim * the lorder, page size and flags are, and, when the page size 226285242Sachim * changes, the cachesize value can change too. If the user 227285242Sachim * specified the wrong byte order for an existing database, we 228285242Sachim * don't bother to return an error, we just clear the NEEDSWAP 229285242Sachim * bit. 230285242Sachim */ 231285242Sachim if (m.magic == BTREEMAGIC) 232285242Sachim F_CLR(t, B_NEEDSWAP); 233285242Sachim else { 234285242Sachim F_SET(t, B_NEEDSWAP); 235285242Sachim M_32_SWAP(m.magic); 236285242Sachim M_32_SWAP(m.version); 237285242Sachim M_32_SWAP(m.psize); 238285242Sachim M_32_SWAP(m.free); 239285242Sachim M_32_SWAP(m.nrecs); 240285242Sachim M_32_SWAP(m.flags); 241285242Sachim } 242285242Sachim if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) 243285242Sachim goto eftype; 244285242Sachim if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || 245285242Sachim m.psize & (sizeof(indx_t) - 1) ) 246285242Sachim goto eftype; 247285242Sachim if (m.flags & ~SAVEMETA) 248285242Sachim goto eftype; 249285242Sachim b.psize = m.psize; 250285242Sachim F_SET(t, m.flags); 251285242Sachim t->bt_free = m.free; 252285242Sachim t->bt_nrecs = m.nrecs; 253285242Sachim } else { 254285242Sachim /* 255285242Sachim * Set the page size to the best value for I/O to this file. 256285242Sachim * Don't overflow the page offset type. 257285242Sachim */ 258285242Sachim if (b.psize == 0) { 259285242Sachim b.psize = sb.st_blksize; 260285242Sachim if (b.psize < MINPSIZE) 261285242Sachim b.psize = MINPSIZE; 262285242Sachim if (b.psize > MAX_PAGE_OFFSET + 1) 263285242Sachim b.psize = MAX_PAGE_OFFSET + 1; 264285242Sachim } 265285242Sachim 266285242Sachim /* Set flag if duplicates permitted. */ 267285242Sachim if (!(b.flags & R_DUP)) 268285242Sachim F_SET(t, B_NODUPS); 269285242Sachim 270285242Sachim t->bt_free = P_INVALID; 271285242Sachim t->bt_nrecs = 0; 272285242Sachim F_SET(t, B_METADIRTY); 273285242Sachim } 274285242Sachim 275285242Sachim t->bt_psize = b.psize; 276285242Sachim 277285242Sachim /* Set the cache size; must be a multiple of the page size. */ 278285242Sachim if (b.cachesize && b.cachesize & (b.psize - 1) ) 279285242Sachim b.cachesize += (~b.cachesize & (b.psize - 1) ) + 1; 280285242Sachim if (b.cachesize < b.psize * MINCACHE) 281285242Sachim b.cachesize = b.psize * MINCACHE; 282285242Sachim 283285242Sachim /* Calculate number of pages to cache. */ 284285242Sachim ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; 285285242Sachim 286285242Sachim /* 287285242Sachim * The btree data structure requires that at least two keys can fit on 288285242Sachim * a page, but other than that there's no fixed requirement. The user 289285242Sachim * specified a minimum number per page, and we translated that into the 290285242Sachim * number of bytes a key/data pair can use before being placed on an 291285242Sachim * overflow page. This calculation includes the page header, the size 292285242Sachim * of the index referencing the leaf item and the size of the leaf item 293285242Sachim * structure. Also, don't let the user specify a minkeypage such that 294285242Sachim * a key/data pair won't fit even if both key and data are on overflow 295285242Sachim * pages. 296285242Sachim */ 297285242Sachim t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - 298285242Sachim (sizeof(indx_t) + NBLEAFDBT(0, 0)); 299285242Sachim if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) 300285242Sachim t->bt_ovflsize = 301285242Sachim NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); 302285242Sachim 303285242Sachim /* Initialize the buffer pool. */ 304285242Sachim if ((t->bt_mp = 305285242Sachim mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) 306285242Sachim goto err; 307285242Sachim if (!F_ISSET(t, B_INMEM)) 308285242Sachim mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); 309285242Sachim 310285242Sachim /* Create a root page if new tree. */ 311285242Sachim if (nroot(t) == RET_ERROR) 312285242Sachim goto err; 313285242Sachim 314285242Sachim /* Global flags. */ 315285242Sachim if (dflags & DB_LOCK) 316285242Sachim F_SET(t, B_DB_LOCK); 317285242Sachim if (dflags & DB_SHMEM) 318285242Sachim F_SET(t, B_DB_SHMEM); 319285242Sachim if (dflags & DB_TXN) 320285242Sachim F_SET(t, B_DB_TXN); 321285242Sachim 322285242Sachim return (dbp); 323285242Sachim 324285242Sachimeinval: errno = EINVAL; 325285242Sachim goto err; 326285242Sachim 327285242Sachimeftype: errno = EFTYPE; 328285242Sachim goto err; 329285242Sachim 330285242Sachimerr: saved_errno = errno; 331285242Sachim if (t) { 332285242Sachim if (t->bt_dbp) 333285242Sachim free(t->bt_dbp); 334285242Sachim if (t->bt_fd != -1) 335285242Sachim (void)_close(t->bt_fd); 336285242Sachim free(t); 337285242Sachim } 338285242Sachim errno = saved_errno; 339285242Sachim return (NULL); 340285242Sachim} 341285242Sachim 342285242Sachim/* 343285242Sachim * NROOT -- Create the root of a new tree. 344285242Sachim * 345285242Sachim * Parameters: 346285242Sachim * t: tree 347285242Sachim * 348285242Sachim * Returns: 349285242Sachim * RET_ERROR, RET_SUCCESS 350285242Sachim */ 351285242Sachimstatic int 352285242Sachimnroot(BTREE *t) 353285242Sachim{ 354285242Sachim PAGE *meta, *root; 355285242Sachim pgno_t npg; 356285242Sachim 357285242Sachim if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) { 358285242Sachim mpool_put(t->bt_mp, meta, 0); 359285242Sachim return (RET_SUCCESS); 360285242Sachim } 361285242Sachim if (errno != EINVAL) /* It's OK to not exist. */ 362285242Sachim return (RET_ERROR); 363285242Sachim errno = 0; 364285242Sachim 365285242Sachim if ((meta = mpool_new(t->bt_mp, &npg)) == NULL) 366285242Sachim return (RET_ERROR); 367285242Sachim 368285242Sachim if ((root = mpool_new(t->bt_mp, &npg)) == NULL) 369285242Sachim return (RET_ERROR); 370285242Sachim 371285242Sachim if (npg != P_ROOT) 372285242Sachim return (RET_ERROR); 373285242Sachim root->pgno = npg; 374285242Sachim root->prevpg = root->nextpg = P_INVALID; 375285242Sachim root->lower = BTDATAOFF; 376285242Sachim root->upper = t->bt_psize; 377285242Sachim root->flags = P_BLEAF; 378285242Sachim memset(meta, 0, t->bt_psize); 379285242Sachim mpool_put(t->bt_mp, meta, MPOOL_DIRTY); 380285242Sachim mpool_put(t->bt_mp, root, MPOOL_DIRTY); 381285242Sachim return (RET_SUCCESS); 382285242Sachim} 383285242Sachim 384285242Sachimstatic int 385285242Sachimtmp(void) 386285242Sachim{ 387285242Sachim sigset_t set, oset; 388285242Sachim int fd; 389285242Sachim char *envtmp = NULL; 390285242Sachim char path[MAXPATHLEN]; 391285242Sachim 392285242Sachim if (issetugid() == 0) 393285242Sachim envtmp = getenv("TMPDIR"); 394285242Sachim (void)snprintf(path, 395285242Sachim sizeof(path), "%s/bt.XXXXXXXXXX", envtmp ? envtmp : "/tmp"); 396285242Sachim 397285242Sachim (void)sigfillset(&set); 398285242Sachim (void)_sigprocmask(SIG_BLOCK, &set, &oset); 399285242Sachim if ((fd = mkstemp(path)) != -1) 400285242Sachim (void)unlink(path); 401285242Sachim (void)_sigprocmask(SIG_SETMASK, &oset, NULL); 402285242Sachim return(fd); 403285242Sachim} 404285242Sachim 405285242Sachimstatic int 406285242Sachimbyteorder(void) 407285242Sachim{ 408285242Sachim u_int32_t x; 409285242Sachim u_char *p; 410285242Sachim 411285242Sachim x = 0x01020304; 412285242Sachim p = (u_char *)&x; 413285242Sachim switch (*p) { 414285242Sachim case 1: 415285242Sachim return (BIG_ENDIAN); 416285242Sachim case 4: 417285242Sachim return (LITTLE_ENDIAN); 418285242Sachim default: 419285242Sachim return (0); 420285242Sachim } 421285242Sachim} 422285242Sachim 423285242Sachimint 424285242Sachim__bt_fd(const DB *dbp) 425285242Sachim{ 426285242Sachim BTREE *t; 427285242Sachim 428285242Sachim t = dbp->internal; 429285242Sachim 430285242Sachim /* Toss any page pinned across calls. */ 431285242Sachim if (t->bt_pinned != NULL) { 432285242Sachim mpool_put(t->bt_mp, t->bt_pinned, 0); 433285242Sachim t->bt_pinned = NULL; 434285242Sachim } 435285242Sachim 436285242Sachim /* In-memory database can't have a file descriptor. */ 437285242Sachim if (F_ISSET(t, B_INMEM)) { 438285242Sachim errno = ENOENT; 439285242Sachim return (-1); 440285242Sachim } 441285242Sachim return (t->bt_fd); 442285242Sachim} 443285242Sachim