1/*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23/*
24 * Database functions
25 * Copyright (C) Andrew Tridgell 1999
26 *
27 * Redistribution and use in source and binary forms are permitted
28 * provided that the above copyright notice and this paragraph are
29 * duplicated in all such forms AND provided that this software or
30 * any derived work is only used as part of the PPP daemon (pppd)
31 * and related utilities.
32 * The name of the author may not be used to endorse or promote products
33 * derived from this software without specific prior written permission.
34 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
35 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
36 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
37 *
38 * Note: this software is also available under the Gnu Public License
39 * version 2 or later.
40 */
41
42#include <stdlib.h>
43#include <stdio.h>
44#include <fcntl.h>
45#include <unistd.h>
46#include <string.h>
47#include <fcntl.h>
48#include <errno.h>
49#include <sys/mman.h>
50#include <sys/stat.h>
51#include "tdb.h"
52
53#define TDB_VERSION (0x26011967 + 1)
54#define TDB_MAGIC (0x26011999U)
55#define TDB_FREE_MAGIC (~TDB_MAGIC)
56#define TDB_ALIGN 4
57#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGN)
58#define DEFAULT_HASH_SIZE 128
59#define TDB_PAGE_SIZE 0x2000
60#define TDB_LEN_MULTIPLIER 10
61#define FREELIST_TOP (sizeof(struct tdb_header))
62
63#define LOCK_SET 1
64#define LOCK_CLEAR 0
65
66/* lock offsets */
67#define GLOBAL_LOCK 0
68#define ACTIVE_LOCK 4
69#define LIST_LOCK_BASE 1024
70
71#define BUCKET(hash) ((hash) % tdb->header.hash_size)
72
73#ifndef MAP_FILE
74#define MAP_FILE 0
75#endif
76
77/* the body of the database is made of one list_struct for the free space
78   plus a separate data list for each hash value */
79struct list_struct {
80	tdb_len rec_len; /* total byte length of record */
81	tdb_off next; /* offset of the next record in the list */
82	tdb_len key_len; /* byte length of key */
83	tdb_len data_len; /* byte length of data */
84	unsigned full_hash; /* the full 32 bit hash of the key */
85	unsigned magic;   /* try to catch errors */
86	/*
87	   the following union is implied
88	   union {
89              char record[rec_len];
90	      struct {
91	        char key[key_len];
92		char data[data_len];
93	      }
94           }
95	*/
96};
97
98/* a null data record - useful for error returns */
99static TDB_DATA null_data;
100
101/* a byte range locking function - return 0 on success
102   this functions locks/unlocks 1 byte at the specified offset */
103static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
104		      int set, int rw_type, int lck_type)
105{
106#if NOLOCK
107	return 0;
108#else
109	struct flock fl;
110
111        if (tdb->fd == -1) return 0;   /* for in memory tdb */
112
113	if (tdb->read_only) return -1;
114
115	fl.l_type = set==LOCK_SET?rw_type:F_UNLCK;
116	fl.l_whence = SEEK_SET;
117	fl.l_start = offset;
118	fl.l_len = 1;
119	fl.l_pid = 0;
120
121	if (fcntl(tdb->fd, lck_type, &fl) != 0) {
122#if TDB_DEBUG
123		if (lck_type == F_SETLKW) {
124			printf("lock %d failed at %d (%s)\n",
125			       set, offset, strerror(errno));
126		}
127#endif
128		tdb->ecode = TDB_ERR_LOCK;
129		return -1;
130	}
131	return 0;
132#endif
133}
134
135/* lock a list in the database. list -1 is the alloc list */
136static int tdb_lock(TDB_CONTEXT *tdb, int list)
137{
138	if (list < -1 || list >= (int)tdb->header.hash_size) {
139#if TDB_DEBUG
140		printf("bad list %d\n", list);
141#endif
142		return -1;
143	}
144	if (tdb->locked[list+1] == 0) {
145		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_SET,
146			       F_WRLCK, F_SETLKW) != 0) {
147			return -1;
148		}
149	}
150	tdb->locked[list+1]++;
151	return 0;
152}
153
154/* unlock the database. */
155static int tdb_unlock(TDB_CONTEXT *tdb, int list)
156{
157	if (list < -1 || list >= (int)tdb->header.hash_size) {
158#if TDB_DEBUG
159		printf("bad unlock list %d\n", list);
160#endif
161		return -1;
162	}
163
164	if (tdb->locked[list+1] == 0) {
165#if TDB_DEBUG
166		printf("not locked %d\n", list);
167#endif
168		tdb->ecode = TDB_ERR_LOCK;
169		return -1;
170	}
171	if (tdb->locked[list+1] == 1) {
172		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_CLEAR,
173			       F_WRLCK, F_SETLKW) != 0) {
174			return -1;
175		}
176	}
177	tdb->locked[list+1]--;
178	return 0;
179}
180
181/* the hash algorithm - turn a key into an integer
182   This is based on the hash agorithm from gdbm */
183static unsigned tdb_hash(TDB_DATA *key)
184{
185	unsigned value;	/* Used to compute the hash value.  */
186	unsigned   i;	/* Used to cycle through random values. */
187
188	/* Set the initial value from the key size. */
189	value = 0x238F13AF * key->dsize;
190	for (i=0; i < key->dsize; i++) {
191		value = (value + (key->dptr[i] << (i*5 % 24)));
192	}
193
194	value = (1103515243 * value + 12345);
195
196	return value;
197}
198
199/* find the top of the hash chain for an open database */
200static tdb_off tdb_hash_top(TDB_CONTEXT *tdb, unsigned hash)
201{
202	tdb_off ret;
203	hash = BUCKET(hash);
204	ret = FREELIST_TOP + (hash+1)*sizeof(tdb_off);
205	return ret;
206}
207
208
209/* check for an out of bounds access - if it is out of bounds then
210   see if the database has been expanded by someone else and expand
211   if necessary */
212static int tdb_oob(TDB_CONTEXT *tdb, tdb_off offset)
213{
214	struct stat st;
215	if ((offset <= tdb->map_size) || (tdb->fd == -1)) return 0;
216
217	fstat(tdb->fd, &st);
218	if (st.st_size <= (ssize_t)offset) {
219		tdb->ecode = TDB_ERR_IO;
220		return -1;
221	}
222
223#if HAVE_MMAP
224	if (tdb->map_ptr) {
225		munmap(tdb->map_ptr, tdb->map_size);
226		tdb->map_ptr = NULL;
227	}
228#endif
229
230	tdb->map_size = st.st_size;
231#if HAVE_MMAP
232	tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
233				    tdb->read_only?PROT_READ:PROT_READ|PROT_WRITE,
234				    MAP_SHARED | MAP_FILE, tdb->fd, 0);
235	if (tdb->map_ptr == MAP_FAILED) {
236	    tdb->map_ptr = NULL;
237	}
238#endif
239	return 0;
240}
241
242
243/* write a lump of data at a specified offset */
244static int tdb_write(TDB_CONTEXT *tdb, tdb_off offset, const char *buf, tdb_len len)
245{
246	if (tdb_oob(tdb, offset + len) != 0) {
247		/* oops - trying to write beyond the end of the database! */
248		return -1;
249	}
250
251	if (tdb->map_ptr) {
252		memcpy(offset + (char *)tdb->map_ptr, buf, len);
253	} else {
254		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
255		    write(tdb->fd, buf, len) != (ssize_t)len) {
256			tdb->ecode = TDB_ERR_IO;
257			return -1;
258		}
259	}
260	return 0;
261}
262
263/* read a lump of data at a specified offset */
264static int tdb_read(TDB_CONTEXT *tdb, tdb_off offset, char *buf, tdb_len len)
265{
266	if (tdb_oob(tdb, offset + len) != 0) {
267		/* oops - trying to read beyond the end of the database! */
268		return -1;
269	}
270
271	if (tdb->map_ptr) {
272		memcpy(buf, offset + (char *)tdb->map_ptr, len);
273	} else {
274		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
275		    read(tdb->fd, buf, len) != (ssize_t)len) {
276			tdb->ecode = TDB_ERR_IO;
277			return -1;
278		}
279	}
280	return 0;
281}
282
283
284/* read a lump of data, allocating the space for it */
285static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
286{
287	char *buf;
288
289	buf = (char *)malloc(len);
290
291	if (!buf) {
292		tdb->ecode = TDB_ERR_OOM;
293		return NULL;
294	}
295
296	if (tdb_read(tdb, offset, buf, len) == -1) {
297		free(buf);
298		return NULL;
299	}
300
301	return buf;
302}
303
304/* convenience routine for writing a record */
305static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
306{
307	return tdb_write(tdb, offset, (char *)rec, sizeof(*rec));
308}
309
310/* convenience routine for writing a tdb_off */
311static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
312{
313	return tdb_write(tdb, offset, (char *)d, sizeof(*d));
314}
315
316/* read a tdb_off from the store */
317static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
318{
319	return tdb_read(tdb, offset, (char *)d, sizeof(*d));
320}
321
322/* read a record and check for simple errors */
323static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
324{
325	if (tdb_read(tdb, offset, (char *)rec, sizeof(*rec)) == -1) return -1;
326	if (rec->magic != TDB_MAGIC) {
327#if TDB_DEBUG
328		printf("bad magic 0x%08x at offset %d\n",
329		       rec->magic, offset);
330#endif
331		tdb->ecode = TDB_ERR_CORRUPT;
332		return -1;
333	}
334	if (tdb_oob(tdb, rec->next) != 0) {
335		return -1;
336	}
337	return 0;
338}
339
340/* expand the database at least length bytes by expanding the
341   underlying file and doing the mmap again if necessary */
342static int tdb_expand(TDB_CONTEXT *tdb, tdb_off length)
343{
344	struct list_struct rec;
345	tdb_off offset, ptr;
346	char b = 0;
347
348	tdb_lock(tdb,-1);
349
350	/* make sure we know about any previous expansions by another
351           process */
352	tdb_oob(tdb,tdb->map_size + 1);
353
354	/* always make room for at least 10 more records */
355	length *= TDB_LEN_MULTIPLIER;
356
357	/* and round the database up to a multiple of TDB_PAGE_SIZE */
358	length = ((tdb->map_size + length + TDB_PAGE_SIZE) & ~(TDB_PAGE_SIZE - 1)) - tdb->map_size;
359
360	/* expand the file itself */
361        if (tdb->fd != -1) {
362            lseek(tdb->fd, tdb->map_size + length - 1, SEEK_SET);
363            if (write(tdb->fd, &b, 1) != 1) goto fail;
364        }
365
366	/* form a new freelist record */
367	offset = FREELIST_TOP;
368	rec.rec_len = length - sizeof(rec);
369	rec.magic = TDB_FREE_MAGIC;
370	if (ofs_read(tdb, offset, &rec.next) == -1) {
371		goto fail;
372	}
373
374#if HAVE_MMAP
375	if (tdb->fd != -1 && tdb->map_ptr) {
376		munmap(tdb->map_ptr, tdb->map_size);
377		tdb->map_ptr = NULL;
378	}
379#endif
380
381	tdb->map_size += length;
382
383        if (tdb->fd == -1) {
384            tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
385        }
386
387	/* write it out */
388	if (rec_write(tdb, tdb->map_size - length, &rec) == -1) {
389		goto fail;
390	}
391
392	/* link it into the free list */
393	ptr = tdb->map_size - length;
394	if (ofs_write(tdb, offset, &ptr) == -1) goto fail;
395
396#if HAVE_MMAP
397        if (tdb->fd != -1) {
398            tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
399                                        PROT_READ|PROT_WRITE,
400                                        MAP_SHARED | MAP_FILE, tdb->fd, 0);
401	    if (tdb->map_ptr == MAP_FAILED) {
402		tdb->map_ptr = NULL;
403	    }
404        }
405#endif
406
407	tdb_unlock(tdb, -1);
408	return 0;
409
410 fail:
411	tdb_unlock(tdb,-1);
412	return -1;
413}
414
415/* allocate some space from the free list. The offset returned points
416   to a unconnected list_struct within the database with room for at
417   least length bytes of total data
418
419   0 is returned if the space could not be allocated
420 */
421static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length)
422{
423	tdb_off offset, rec_ptr, last_ptr;
424	struct list_struct rec, lastrec, newrec;
425
426	tdb_lock(tdb, -1);
427
428 again:
429	last_ptr = 0;
430	offset = FREELIST_TOP;
431
432	/* read in the freelist top */
433	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
434		goto fail;
435	}
436
437	/* keep looking until we find a freelist record that is big
438           enough */
439	while (rec_ptr) {
440		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
441			goto fail;
442		}
443
444		if (rec.magic != TDB_FREE_MAGIC) {
445#if TDB_DEBUG
446			printf("bad magic 0x%08x in free list\n", rec.magic);
447#endif
448			goto fail;
449		}
450
451		if (rec.rec_len >= length) {
452			/* found it - now possibly split it up  */
453			if (rec.rec_len > length + MIN_REC_SIZE) {
454				length = (length + TDB_ALIGN) & ~(TDB_ALIGN-1);
455
456				newrec.rec_len = rec.rec_len - (sizeof(rec) + length);
457				newrec.next = rec.next;
458				newrec.magic = TDB_FREE_MAGIC;
459
460				rec.rec_len = length;
461				rec.next = rec_ptr + sizeof(rec) + rec.rec_len;
462
463				if (rec_write(tdb, rec.next, &newrec) == -1) {
464					goto fail;
465				}
466
467				if (rec_write(tdb, rec_ptr, &rec) == -1) {
468					goto fail;
469				}
470			}
471
472			/* remove it from the list */
473			if (last_ptr == 0) {
474				offset = FREELIST_TOP;
475
476				if (ofs_write(tdb, offset, &rec.next) == -1) {
477					goto fail;
478				}
479			} else {
480				lastrec.next = rec.next;
481				if (rec_write(tdb, last_ptr, &lastrec) == -1) {
482					goto fail;
483				}
484			}
485
486			/* all done - return the new record offset */
487			tdb_unlock(tdb, -1);
488			return rec_ptr;
489		}
490
491		/* move to the next record */
492		lastrec = rec;
493		last_ptr = rec_ptr;
494		rec_ptr = rec.next;
495	}
496
497	/* we didn't find enough space. See if we can expand the
498	   database and if we can then try again */
499	if (tdb_expand(tdb, length + sizeof(rec)) == 0) goto again;
500
501 fail:
502#if TDB_DEBUG
503	printf("tdb_allocate failed for size %u\n", length);
504#endif
505	tdb_unlock(tdb, -1);
506	return 0;
507}
508
509/* initialise a new database with a specified hash size */
510static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
511{
512	struct tdb_header header;
513	tdb_off offset;
514	int i, size = 0;
515	tdb_off buf[16];
516
517        /* create the header */
518        memset(&header, 0, sizeof(header));
519        memcpy(header.magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
520        header.version = TDB_VERSION;
521        header.hash_size = hash_size;
522        lseek(tdb->fd, 0, SEEK_SET);
523        ftruncate(tdb->fd, 0);
524
525        if (tdb->fd != -1 && write(tdb->fd, &header, sizeof(header)) !=
526            sizeof(header)) {
527            tdb->ecode = TDB_ERR_IO;
528            return -1;
529        } else size += sizeof(header);
530
531        /* the freelist and hash pointers */
532        offset = 0;
533        memset(buf, 0, sizeof(buf));
534
535        for (i=0;(hash_size+1)-i >= 16; i += 16) {
536            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(buf)) !=
537                sizeof(buf)) {
538                tdb->ecode = TDB_ERR_IO;
539                return -1;
540            } else size += sizeof(buf);
541        }
542
543        for (;i<hash_size+1; i++) {
544            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(tdb_off)) !=
545                sizeof(tdb_off)) {
546                tdb->ecode = TDB_ERR_IO;
547                return -1;
548            } else size += sizeof(tdb_off);
549        }
550
551        if (tdb->fd == -1) {
552            tdb->map_ptr = calloc(size, 1);
553            tdb->map_size = size;
554            if (tdb->map_ptr == NULL) {
555                tdb->ecode = TDB_ERR_IO;
556                return -1;
557            }
558            memcpy(&tdb->header, &header, sizeof(header));
559        }
560
561#if TDB_DEBUG
562	printf("initialised database of hash_size %u\n",
563	       hash_size);
564#endif
565	return 0;
566}
567
568/* Returns 0 on fail.  On success, return offset of record, and fills
569   in rec */
570static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, unsigned int hash,
571			struct list_struct *rec)
572{
573	tdb_off offset, rec_ptr;
574
575	/* find the top of the hash chain */
576	offset = tdb_hash_top(tdb, hash);
577
578	/* read in the hash top */
579	if (ofs_read(tdb, offset, &rec_ptr) == -1)
580		return 0;
581
582	/* keep looking until we find the right record */
583	while (rec_ptr) {
584		if (rec_read(tdb, rec_ptr, rec) == -1)
585			return 0;
586
587		if (hash == rec->full_hash && key.dsize == rec->key_len) {
588			char *k;
589			/* a very likely hit - read the key */
590			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*rec),
591					   rec->key_len);
592
593			if (!k)
594				return 0;
595
596			if (memcmp(key.dptr, k, key.dsize) == 0) {
597				free(k);
598				return rec_ptr;
599			}
600			free(k);
601		}
602
603		/* move to the next record */
604		rec_ptr = rec->next;
605	}
606	return 0;
607}
608
609/*
610   return an error string for the last tdb error
611*/
612char *tdb_error(TDB_CONTEXT *tdb)
613{
614	int i;
615	static struct {
616		enum TDB_ERROR ecode;
617		char *estring;
618	} emap[] = {
619		{TDB_SUCCESS, "Success"},
620		{TDB_ERR_CORRUPT, "Corrupt database"},
621		{TDB_ERR_IO, "IO Error"},
622		{TDB_ERR_LOCK, "Locking error"},
623		{TDB_ERR_OOM, "Out of memory"},
624		{TDB_ERR_EXISTS, "Record exists"},
625		{-1, NULL}};
626        if (tdb != NULL) {
627            for (i=0;emap[i].estring;i++) {
628		if (tdb->ecode == emap[i].ecode) return emap[i].estring;
629            }
630        } else {
631            return "Invalid tdb context";
632        }
633	return "Invalid error code";
634}
635
636
637/* update an entry in place - this only works if the new data size
638   is <= the old data size and the key exists.
639   on failure return -1
640*/
641int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
642{
643	unsigned hash;
644	struct list_struct rec;
645	tdb_off rec_ptr;
646	int ret = -1;
647
648        if (tdb == NULL) {
649#ifdef TDB_DEBUG
650            printf("tdb_update() called with null context\n");
651#endif
652            return -1;
653        }
654
655	/* find which hash bucket it is in */
656	hash = tdb_hash(&key);
657
658	tdb_lock(tdb, BUCKET(hash));
659	rec_ptr = tdb_find(tdb, key, hash, &rec);
660
661	if (!rec_ptr)
662		goto out;
663
664	/* must be long enough */
665	if (rec.rec_len < key.dsize + dbuf.dsize)
666		goto out;
667
668	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
669		      dbuf.dptr, dbuf.dsize) == -1)
670		goto out;
671
672	if (dbuf.dsize != rec.data_len) {
673		/* update size */
674		rec.data_len = dbuf.dsize;
675		ret = rec_write(tdb, rec_ptr, &rec);
676	} else
677		ret = 0;
678
679 out:
680	tdb_unlock(tdb, BUCKET(hash));
681	return ret;
682}
683
684/* find an entry in the database given a key */
685TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
686{
687	unsigned hash;
688	tdb_off rec_ptr;
689	struct list_struct rec;
690	TDB_DATA ret = null_data;
691
692        if (tdb == NULL) {
693#ifdef TDB_DEBUG
694            printf("tdb_fetch() called with null context\n");
695#endif
696            return null_data;
697        }
698
699	/* find which hash bucket it is in */
700	hash = tdb_hash(&key);
701
702	tdb_lock(tdb, BUCKET(hash));
703	rec_ptr = tdb_find(tdb, key, hash, &rec);
704
705	if (rec_ptr) {
706		ret.dptr = tdb_alloc_read(tdb,
707					  rec_ptr + sizeof(rec) + rec.key_len,
708					  rec.data_len);
709		ret.dsize = rec.data_len;
710	}
711
712	tdb_unlock(tdb, BUCKET(hash));
713	return ret;
714}
715
716/* check if an entry in the database exists
717
718   note that 1 is returned if the key is found and 0 is returned if not found
719   this doesn't match the conventions in the rest of this module, but is
720   compatible with gdbm
721*/
722int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
723{
724	unsigned hash;
725	tdb_off rec_ptr;
726	struct list_struct rec;
727
728        if (tdb == NULL) {
729#ifdef TDB_DEBUG
730            printf("tdb_exists() called with null context\n");
731#endif
732            return 0;
733        }
734
735	/* find which hash bucket it is in */
736	hash = tdb_hash(&key);
737
738	tdb_lock(tdb, BUCKET(hash));
739	rec_ptr = tdb_find(tdb, key, hash, &rec);
740	tdb_unlock(tdb, BUCKET(hash));
741
742	return rec_ptr != 0;
743}
744
745/* traverse the entire database - calling fn(tdb, key, data) on each element.
746   return -1 on error or the record count traversed
747   if fn is NULL then it is not called
748   a non-zero return value from fn() indicates that the traversal should stop
749  */
750int tdb_traverse(TDB_CONTEXT *tdb, int (*fn)(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, void* state), void* state)
751{
752	int count = 0;
753	unsigned h;
754	tdb_off offset, rec_ptr;
755	struct list_struct rec;
756	char *data;
757	TDB_DATA key, dbuf;
758
759        if (tdb == NULL) {
760#ifdef TDB_DEBUG
761            printf("tdb_traverse() called with null context\n");
762#endif
763            return -1;
764        }
765
766	/* loop over all hash chains */
767	for (h = 0; h < tdb->header.hash_size; h++) {
768		tdb_lock(tdb, BUCKET(h));
769
770		/* read in the hash top */
771		offset = tdb_hash_top(tdb, h);
772		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
773			goto fail;
774		}
775
776		/* traverse all records for this hash */
777		while (rec_ptr) {
778			if (rec_read(tdb, rec_ptr, &rec) == -1) {
779				goto fail;
780			}
781
782			/* now read the full record */
783			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
784					     rec.key_len + rec.data_len);
785			if (!data) {
786				goto fail;
787			}
788
789			key.dptr = data;
790			key.dsize = rec.key_len;
791			dbuf.dptr = data + rec.key_len;
792			dbuf.dsize = rec.data_len;
793			count++;
794
795			if (fn && fn(tdb, key, dbuf, state) != 0) {
796				/* they want us to stop traversing */
797				free(data);
798				tdb_unlock(tdb, BUCKET(h));
799				return count;
800			}
801
802			/* a miss - drat */
803			free(data);
804
805			/* move to the next record */
806			rec_ptr = rec.next;
807		}
808		tdb_unlock(tdb, BUCKET(h));
809	}
810
811	/* return the number traversed */
812	return count;
813
814 fail:
815	tdb_unlock(tdb, BUCKET(h));
816	return -1;
817}
818
819
820/* find the first entry in the database and return its key */
821TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
822{
823	tdb_off offset, rec_ptr;
824	struct list_struct rec;
825	unsigned hash;
826	TDB_DATA ret;
827
828        if (tdb == NULL) {
829#ifdef TDB_DEBUG
830            printf("tdb_firstkey() called with null context\n");
831#endif
832            return null_data;
833        }
834
835	/* look for a non-empty hash chain */
836	for (hash = 0, rec_ptr = 0;
837	     hash < tdb->header.hash_size;
838	     hash++) {
839		/* find the top of the hash chain */
840		offset = tdb_hash_top(tdb, hash);
841
842		tdb_lock(tdb, BUCKET(hash));
843
844		/* read in the hash top */
845		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
846			goto fail;
847		}
848
849		if (rec_ptr) break;
850
851		tdb_unlock(tdb, BUCKET(hash));
852	}
853
854	if (rec_ptr == 0) return null_data;
855
856	/* we've found a non-empty chain, now read the record */
857	if (rec_read(tdb, rec_ptr, &rec) == -1) {
858		goto fail;
859	}
860
861	/* allocate and read the key space */
862	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
863	ret.dsize = rec.key_len;
864	tdb_unlock(tdb, BUCKET(hash));
865	return ret;
866
867 fail:
868	tdb_unlock(tdb, BUCKET(hash));
869	return null_data;
870}
871
872/* find the next entry in the database, returning its key */
873TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key)
874{
875	unsigned hash, hbucket;
876	tdb_off rec_ptr, offset;
877	struct list_struct rec;
878	TDB_DATA ret;
879
880        if (tdb == NULL) {
881#ifdef TDB_DEBUG
882            printf("tdb_nextkey() called with null context\n");
883#endif
884            return null_data;
885        }
886
887	/* find which hash bucket it is in */
888	hash = tdb_hash(&key);
889	hbucket = BUCKET(hash);
890
891	tdb_lock(tdb, hbucket);
892	rec_ptr = tdb_find(tdb, key, hash, &rec);
893	if (rec_ptr) {
894		/* we want the next record after this one */
895		rec_ptr = rec.next;
896	}
897
898	/* not found or last in hash: look for next non-empty hash chain */
899	while (rec_ptr == 0) {
900		tdb_unlock(tdb, hbucket);
901
902		if (++hbucket >= tdb->header.hash_size - 1)
903			return null_data;
904
905		offset = tdb_hash_top(tdb, hbucket);
906		tdb_lock(tdb, hbucket);
907		/* read in the hash top */
908		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
909			tdb_unlock(tdb, hbucket);
910			return null_data;
911		}
912	}
913
914	/* Read the record. */
915	if (rec_read(tdb, rec_ptr, &rec) == -1) {
916		tdb_unlock(tdb, hbucket);
917		return null_data;
918	}
919	/* allocate and read the key */
920	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
921	ret.dsize = rec.key_len;
922	tdb_unlock(tdb, hbucket);
923
924	return ret;
925}
926
927/* delete an entry in the database given a key */
928int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
929{
930	unsigned hash;
931	tdb_off offset, rec_ptr, last_ptr;
932	struct list_struct rec, lastrec;
933	char *data = NULL;
934
935        if (tdb == NULL) {
936#ifdef TDB_DEBUG
937            printf("tdb_delete() called with null context\n");
938#endif
939            return -1;
940        }
941
942	/* find which hash bucket it is in */
943	hash = tdb_hash(&key);
944
945	tdb_lock(tdb, BUCKET(hash));
946
947	/* find the top of the hash chain */
948	offset = tdb_hash_top(tdb, hash);
949
950	/* read in the hash top */
951	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
952		goto fail;
953	}
954
955	last_ptr = 0;
956
957	/* keep looking until we find the right record */
958	while (rec_ptr) {
959		if (rec_read(tdb, rec_ptr, &rec) == -1) {
960			goto fail;
961		}
962
963		if (hash == rec.full_hash && key.dsize == rec.key_len) {
964			/* a very likely hit - read the record and full key */
965			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
966					     rec.key_len);
967			if (!data) {
968				goto fail;
969			}
970
971			if (memcmp(key.dptr, data, key.dsize) == 0) {
972				/* a definite match - delete it */
973				if (last_ptr == 0) {
974					offset = tdb_hash_top(tdb, hash);
975					if (ofs_write(tdb, offset, &rec.next) == -1) {
976						goto fail;
977					}
978				} else {
979					lastrec.next = rec.next;
980					if (rec_write(tdb, last_ptr, &lastrec) == -1) {
981						goto fail;
982					}
983				}
984				tdb_unlock(tdb, BUCKET(hash));
985				tdb_lock(tdb, -1);
986				/* and recover the space */
987				offset = FREELIST_TOP;
988				if (ofs_read(tdb, offset, &rec.next) == -1) {
989					goto fail2;
990				}
991				rec.magic = TDB_FREE_MAGIC;
992				if (rec_write(tdb, rec_ptr, &rec) == -1) {
993					goto fail2;
994				}
995				if (ofs_write(tdb, offset, &rec_ptr) == -1) {
996					goto fail2;
997				}
998
999				/* yipee - all done */
1000				free(data);
1001				tdb_unlock(tdb, -1);
1002				return 0;
1003			}
1004
1005			/* a miss - drat */
1006			free(data);
1007			data = NULL;
1008		}
1009
1010		/* move to the next record */
1011		last_ptr = rec_ptr;
1012		lastrec = rec;
1013		rec_ptr = rec.next;
1014	}
1015
1016 fail:
1017	if (data) free(data);
1018	tdb_unlock(tdb, BUCKET(hash));
1019	return -1;
1020
1021 fail2:
1022	if (data) free(data);
1023	tdb_unlock(tdb, -1);
1024	return -1;
1025}
1026
1027
1028/* store an element in the database, replacing any existing element
1029   with the same key
1030
1031   return 0 on success, -1 on failure
1032*/
1033int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1034{
1035	struct list_struct rec;
1036	unsigned hash;
1037	tdb_off rec_ptr, offset;
1038	char *p = NULL;
1039
1040        if (tdb == NULL) {
1041#ifdef TDB_DEBUG
1042            printf("tdb_store() called with null context\n");
1043#endif
1044            return -1;
1045        }
1046
1047	/* find which hash bucket it is in */
1048	hash = tdb_hash(&key);
1049
1050	/* check for it existing */
1051	if (flag == TDB_INSERT && tdb_exists(tdb, key)) {
1052		tdb->ecode = TDB_ERR_EXISTS;
1053		return -1;
1054	}
1055
1056	/* first try in-place update */
1057	if (flag != TDB_INSERT && tdb_update(tdb, key, dbuf) == 0) {
1058		return 0;
1059	}
1060
1061	rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize);
1062	if (rec_ptr == 0) {
1063		return -1;
1064	}
1065
1066	tdb_lock(tdb, BUCKET(hash));
1067
1068	/* delete any existing record - if it doesn't exist we don't care */
1069	if (flag != TDB_INSERT) {
1070		tdb_delete(tdb, key);
1071	}
1072
1073	/* read the newly created record */
1074	if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
1075		goto fail;
1076	}
1077
1078	if (rec.magic != TDB_FREE_MAGIC) goto fail;
1079
1080	/* find the top of the hash chain */
1081	offset = tdb_hash_top(tdb, hash);
1082
1083	/* read in the hash top diretcly into our next pointer */
1084	if (ofs_read(tdb, offset, &rec.next) == -1) {
1085		goto fail;
1086	}
1087
1088	rec.key_len = key.dsize;
1089	rec.data_len = dbuf.dsize;
1090	rec.full_hash = hash;
1091	rec.magic = TDB_MAGIC;
1092
1093	p = (char *)malloc(sizeof(rec) + key.dsize + dbuf.dsize);
1094	if (!p) {
1095		tdb->ecode = TDB_ERR_OOM;
1096		goto fail;
1097	}
1098
1099	memcpy(p, &rec, sizeof(rec));
1100	memcpy(p+sizeof(rec), key.dptr, key.dsize);
1101	memcpy(p+sizeof(rec)+key.dsize, dbuf.dptr, dbuf.dsize);
1102
1103	if (tdb_write(tdb, rec_ptr, p, sizeof(rec)+key.dsize+dbuf.dsize) == -1)
1104		goto fail;
1105
1106	free(p);
1107	p = NULL;
1108
1109	/* and point the top of the hash chain at it */
1110	if (ofs_write(tdb, offset, &rec_ptr) == -1) goto fail;
1111
1112	tdb_unlock(tdb, BUCKET(hash));
1113	return 0;
1114
1115 fail:
1116#if TDB_DEBUG
1117	printf("store failed for hash 0x%08x in bucket %u\n", hash, BUCKET(hash));
1118#endif
1119	if (p) free(p);
1120	tdb_unlock(tdb, BUCKET(hash));
1121	return -1;
1122}
1123
1124
1125/* open the database, creating it if necessary
1126
1127   The open_flags and mode are passed straight to the open call on the database
1128   file. A flags value of O_WRONLY is invalid
1129
1130   The hash size is advisory, use zero for a default value.
1131
1132   return is NULL on error
1133*/
1134TDB_CONTEXT *tdb_open(char *name, int hash_size, int tdb_flags,
1135		      int open_flags, mode_t mode)
1136{
1137	TDB_CONTEXT tdb, *ret;
1138	struct stat st;
1139
1140	memset(&tdb, 0, sizeof(tdb));
1141
1142	tdb.fd = -1;
1143	tdb.name = NULL;
1144	tdb.map_ptr = NULL;
1145
1146	if ((open_flags & O_ACCMODE) == O_WRONLY) {
1147		goto fail;
1148	}
1149
1150	if (hash_size == 0) hash_size = DEFAULT_HASH_SIZE;
1151
1152	tdb.read_only = ((open_flags & O_ACCMODE) == O_RDONLY);
1153
1154        if (name != NULL) {
1155            tdb.fd = open(name, open_flags, mode);
1156            if (tdb.fd == -1) {
1157		goto fail;
1158            }
1159	    (void) fcntl(tdb.fd, F_SETFD, FD_CLOEXEC);
1160        }
1161
1162	/* ensure there is only one process initialising at once */
1163	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_SET, F_WRLCK, F_SETLKW);
1164
1165	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1166		/* we need to zero the database if we are the only
1167		   one with it open */
1168		if (tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_WRLCK, F_SETLK) == 0) {
1169			ftruncate(tdb.fd, 0);
1170			tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLK);
1171		}
1172	}
1173
1174	/* leave this lock in place */
1175	tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_RDLCK, F_SETLKW);
1176
1177	if (read(tdb.fd, &tdb.header, sizeof(tdb.header)) != sizeof(tdb.header) ||
1178	    strcmp(tdb.header.magic_food, TDB_MAGIC_FOOD) != 0 ||
1179	    tdb.header.version != TDB_VERSION) {
1180		/* its not a valid database - possibly initialise it */
1181		if (!(open_flags & O_CREAT)) {
1182			goto fail;
1183		}
1184		if (tdb_new_database(&tdb, hash_size) == -1) goto fail;
1185
1186		lseek(tdb.fd, 0, SEEK_SET);
1187		if (tdb.fd != -1 && read(tdb.fd, &tdb.header,
1188                                         sizeof(tdb.header)) !=
1189                                         sizeof(tdb.header))
1190                    goto fail;
1191	}
1192
1193        if (tdb.fd != -1) {
1194            fstat(tdb.fd, &st);
1195
1196            /* map the database and fill in the return structure */
1197            tdb.name = (char *)strdup(name);
1198            tdb.map_size = st.st_size;
1199        }
1200
1201        tdb.locked = (int *)calloc(tdb.header.hash_size+1,
1202                                   sizeof(tdb.locked[0]));
1203        if (!tdb.locked) {
1204            goto fail;
1205        }
1206
1207#if HAVE_MMAP
1208        if (tdb.fd != -1) {
1209            tdb.map_ptr = (void *)mmap(NULL, st.st_size,
1210                                       tdb.read_only? PROT_READ : PROT_READ|PROT_WRITE,
1211                                       MAP_SHARED | MAP_FILE, tdb.fd, 0);
1212	    if (tdb.map_ptr == MAP_FAILED) {
1213		tdb.map_ptr = NULL;
1214	    }
1215        }
1216#endif
1217
1218	ret = (TDB_CONTEXT *)malloc(sizeof(tdb));
1219	if (!ret) goto fail;
1220
1221	*ret = tdb;
1222
1223#if TDB_DEBUG
1224	printf("mapped database of hash_size %u map_size=%u\n",
1225	       hash_size, tdb.map_size);
1226#endif
1227
1228	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLKW);
1229	return ret;
1230
1231 fail:
1232        if (tdb.name) free(tdb.name);
1233	if (tdb.fd != -1) close(tdb.fd);
1234	if (tdb.map_ptr) munmap(tdb.map_ptr, tdb.map_size);
1235
1236	return NULL;
1237}
1238
1239/* close a database */
1240int tdb_close(TDB_CONTEXT *tdb)
1241{
1242	if (!tdb) return -1;
1243
1244	if (tdb->name) free(tdb->name);
1245	if (tdb->fd != -1) close(tdb->fd);
1246	if (tdb->locked) free(tdb->locked);
1247
1248	if (tdb->map_ptr) {
1249            if (tdb->fd != -1) {
1250                munmap(tdb->map_ptr, tdb->map_size);
1251            } else {
1252                free(tdb->map_ptr);
1253            }
1254        }
1255
1256	memset(tdb, 0, sizeof(*tdb));
1257	free(tdb);
1258
1259	return 0;
1260}
1261
1262/* lock the database. If we already have it locked then don't do anything */
1263int tdb_writelock(TDB_CONTEXT *tdb)
1264{
1265        if (tdb == NULL) {
1266#ifdef TDB_DEBUG
1267            printf("tdb_writelock() called with null context\n");
1268#endif
1269            return -1;
1270        }
1271
1272	return tdb_lock(tdb, -1);
1273}
1274
1275/* unlock the database. */
1276int tdb_writeunlock(TDB_CONTEXT *tdb)
1277{
1278        if (tdb == NULL) {
1279#ifdef TDB_DEBUG
1280            printf("tdb_writeunlock() called with null context\n");
1281#endif
1282            return -1;
1283        }
1284
1285	return tdb_unlock(tdb, -1);
1286}
1287
1288/* lock one hash chain. This is meant to be used to reduce locking
1289   contention - it cannot guarantee how many records will be locked */
1290int tdb_lockchain(TDB_CONTEXT *tdb, TDB_DATA key)
1291{
1292        if (tdb == NULL) {
1293#ifdef TDB_DEBUG
1294            printf("tdb_lockchain() called with null context\n");
1295#endif
1296            return -1;
1297        }
1298
1299	return tdb_lock(tdb, BUCKET(tdb_hash(&key)));
1300}
1301
1302
1303/* unlock one hash chain */
1304int tdb_unlockchain(TDB_CONTEXT *tdb, TDB_DATA key)
1305{
1306        if (tdb == NULL) {
1307#ifdef TDB_DEBUG
1308            printf("tdb_unlockchain() called with null context\n");
1309#endif
1310            return -1;
1311        }
1312
1313	return tdb_unlock(tdb, BUCKET(tdb_hash(&key)));
1314}
1315