1/*
2 * Database functions
3 * Copyright (C) Andrew Tridgell 1999
4 *
5 * Redistribution and use in source and binary forms are permitted
6 * provided that the above copyright notice and this paragraph are
7 * duplicated in all such forms AND provided that this software or
8 * any derived work is only used as part of the PPP daemon (pppd)
9 * and related utilities.
10 * The name of the author may not be used to endorse or promote products
11 * derived from this software without specific prior written permission.
12 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
13 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
14 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
15 *
16 * Note: this software is also available under the Gnu Public License
17 * version 2 or later.
18 */
19
20#include <stdlib.h>
21#include <stdio.h>
22#include <fcntl.h>
23#include <unistd.h>
24#include <string.h>
25#include <fcntl.h>
26#include <errno.h>
27#include <sys/mman.h>
28#include <sys/stat.h>
29#include "tdb.h"
30
31#define TDB_VERSION (0x26011967 + 1)
32#define TDB_MAGIC (0x26011999U)
33#define TDB_FREE_MAGIC (~TDB_MAGIC)
34#define TDB_ALIGN 4
35#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGN)
36#define DEFAULT_HASH_SIZE 128
37#define TDB_PAGE_SIZE 0x2000
38#define TDB_LEN_MULTIPLIER 10
39#define FREELIST_TOP (sizeof(struct tdb_header))
40
41#define LOCK_SET 1
42#define LOCK_CLEAR 0
43
44/* lock offsets */
45#define GLOBAL_LOCK 0
46#define ACTIVE_LOCK 4
47#define LIST_LOCK_BASE 1024
48
49#define BUCKET(hash) ((hash) % tdb->header.hash_size)
50
51#ifndef MAP_FILE
52#define MAP_FILE 0
53#endif
54
55/* the body of the database is made of one list_struct for the free space
56   plus a separate data list for each hash value */
57struct list_struct {
58	tdb_len rec_len; /* total byte length of record */
59	tdb_off next; /* offset of the next record in the list */
60	tdb_len key_len; /* byte length of key */
61	tdb_len data_len; /* byte length of data */
62	unsigned full_hash; /* the full 32 bit hash of the key */
63	unsigned magic;   /* try to catch errors */
64	/*
65	   the following union is implied
66	   union {
67              char record[rec_len];
68	      struct {
69	        char key[key_len];
70		char data[data_len];
71	      }
72           }
73	*/
74};
75
76/* a null data record - useful for error returns */
77static TDB_DATA null_data;
78
79/* a byte range locking function - return 0 on success
80   this functions locks/unlocks 1 byte at the specified offset */
81static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
82		      int set, int rw_type, int lck_type)
83{
84#if NOLOCK
85	return 0;
86#else
87	struct flock fl;
88
89        if (tdb->fd == -1) return 0;   /* for in memory tdb */
90
91	if (tdb->read_only) return -1;
92
93	fl.l_type = set==LOCK_SET?rw_type:F_UNLCK;
94	fl.l_whence = SEEK_SET;
95	fl.l_start = offset;
96	fl.l_len = 1;
97	fl.l_pid = 0;
98
99	if (fcntl(tdb->fd, lck_type, &fl) != 0) {
100#if TDB_DEBUG
101		if (lck_type == F_SETLKW) {
102			printf("lock %d failed at %d (%s)\n",
103			       set, offset, strerror(errno));
104		}
105#endif
106		tdb->ecode = TDB_ERR_LOCK;
107		return -1;
108	}
109	return 0;
110#endif
111}
112
113/* lock a list in the database. list -1 is the alloc list */
114static int tdb_lock(TDB_CONTEXT *tdb, int list)
115{
116	if (list < -1 || list >= (int)tdb->header.hash_size) {
117#if TDB_DEBUG
118		printf("bad list %d\n", list);
119#endif
120		return -1;
121	}
122	if (tdb->locked[list+1] == 0) {
123		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_SET,
124			       F_WRLCK, F_SETLKW) != 0) {
125			return -1;
126		}
127	}
128	tdb->locked[list+1]++;
129	return 0;
130}
131
132/* unlock the database. */
133static int tdb_unlock(TDB_CONTEXT *tdb, int list)
134{
135	if (list < -1 || list >= (int)tdb->header.hash_size) {
136#if TDB_DEBUG
137		printf("bad unlock list %d\n", list);
138#endif
139		return -1;
140	}
141
142	if (tdb->locked[list+1] == 0) {
143#if TDB_DEBUG
144		printf("not locked %d\n", list);
145#endif
146		tdb->ecode = TDB_ERR_LOCK;
147		return -1;
148	}
149	if (tdb->locked[list+1] == 1) {
150		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_CLEAR,
151			       F_WRLCK, F_SETLKW) != 0) {
152			return -1;
153		}
154	}
155	tdb->locked[list+1]--;
156	return 0;
157}
158
159/* the hash algorithm - turn a key into an integer
160   This is based on the hash agorithm from gdbm */
161static unsigned tdb_hash(TDB_DATA *key)
162{
163	unsigned value;	/* Used to compute the hash value.  */
164	unsigned   i;	/* Used to cycle through random values. */
165
166	/* Set the initial value from the key size. */
167	value = 0x238F13AF * key->dsize;
168	for (i=0; i < key->dsize; i++) {
169		value = (value + (key->dptr[i] << (i*5 % 24)));
170	}
171
172	value = (1103515243 * value + 12345);
173
174	return value;
175}
176
177/* find the top of the hash chain for an open database */
178static tdb_off tdb_hash_top(TDB_CONTEXT *tdb, unsigned hash)
179{
180	tdb_off ret;
181	hash = BUCKET(hash);
182	ret = FREELIST_TOP + (hash+1)*sizeof(tdb_off);
183	return ret;
184}
185
186
187/* check for an out of bounds access - if it is out of bounds then
188   see if the database has been expanded by someone else and expand
189   if necessary */
190static int tdb_oob(TDB_CONTEXT *tdb, tdb_off offset)
191{
192	struct stat st;
193	if ((offset <= tdb->map_size) || (tdb->fd == -1)) return 0;
194
195	fstat(tdb->fd, &st);
196	if (st.st_size <= (ssize_t)offset) {
197		tdb->ecode = TDB_ERR_IO;
198		return -1;
199	}
200
201#if HAVE_MMAP
202	if (tdb->map_ptr) {
203		munmap(tdb->map_ptr, tdb->map_size);
204		tdb->map_ptr = NULL;
205	}
206#endif
207
208	tdb->map_size = st.st_size;
209#if HAVE_MMAP
210	tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
211				    tdb->read_only?PROT_READ:PROT_READ|PROT_WRITE,
212				    MAP_SHARED | MAP_FILE, tdb->fd, 0);
213#endif
214	return 0;
215}
216
217
218/* write a lump of data at a specified offset */
219static int tdb_write(TDB_CONTEXT *tdb, tdb_off offset, const char *buf, tdb_len len)
220{
221	if (tdb_oob(tdb, offset + len) != 0) {
222		/* oops - trying to write beyond the end of the database! */
223		return -1;
224	}
225
226	if (tdb->map_ptr) {
227		memcpy(offset + (char *)tdb->map_ptr, buf, len);
228	} else {
229		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
230		    write(tdb->fd, buf, len) != (ssize_t)len) {
231			tdb->ecode = TDB_ERR_IO;
232			return -1;
233		}
234	}
235	return 0;
236}
237
238/* read a lump of data at a specified offset */
239static int tdb_read(TDB_CONTEXT *tdb, tdb_off offset, char *buf, tdb_len len)
240{
241	if (tdb_oob(tdb, offset + len) != 0) {
242		/* oops - trying to read beyond the end of the database! */
243		return -1;
244	}
245
246	if (tdb->map_ptr) {
247		memcpy(buf, offset + (char *)tdb->map_ptr, len);
248	} else {
249		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
250		    read(tdb->fd, buf, len) != (ssize_t)len) {
251			tdb->ecode = TDB_ERR_IO;
252			return -1;
253		}
254	}
255	return 0;
256}
257
258
259/* read a lump of data, allocating the space for it */
260static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
261{
262	char *buf;
263
264	buf = (char *)malloc(len);
265
266	if (!buf) {
267		tdb->ecode = TDB_ERR_OOM;
268		return NULL;
269	}
270
271	if (tdb_read(tdb, offset, buf, len) == -1) {
272		free(buf);
273		return NULL;
274	}
275
276	return buf;
277}
278
279/* convenience routine for writing a record */
280static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
281{
282	return tdb_write(tdb, offset, (char *)rec, sizeof(*rec));
283}
284
285/* convenience routine for writing a tdb_off */
286static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
287{
288	return tdb_write(tdb, offset, (char *)d, sizeof(*d));
289}
290
291/* read a tdb_off from the store */
292static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
293{
294	return tdb_read(tdb, offset, (char *)d, sizeof(*d));
295}
296
297/* read a record and check for simple errors */
298static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
299{
300	if (tdb_read(tdb, offset, (char *)rec, sizeof(*rec)) == -1) return -1;
301	if (rec->magic != TDB_MAGIC) {
302#if TDB_DEBUG
303		printf("bad magic 0x%08x at offset %d\n",
304		       rec->magic, offset);
305#endif
306		tdb->ecode = TDB_ERR_CORRUPT;
307		return -1;
308	}
309	if (tdb_oob(tdb, rec->next) != 0) {
310		return -1;
311	}
312	return 0;
313}
314
315/* expand the database at least length bytes by expanding the
316   underlying file and doing the mmap again if necessary */
317static int tdb_expand(TDB_CONTEXT *tdb, tdb_off length)
318{
319	struct list_struct rec;
320	tdb_off offset, ptr;
321	char b = 0;
322
323	tdb_lock(tdb,-1);
324
325	/* make sure we know about any previous expansions by another
326           process */
327	tdb_oob(tdb,tdb->map_size + 1);
328
329	/* always make room for at least 10 more records */
330	length *= TDB_LEN_MULTIPLIER;
331
332	/* and round the database up to a multiple of TDB_PAGE_SIZE */
333	length = ((tdb->map_size + length + TDB_PAGE_SIZE) & ~(TDB_PAGE_SIZE - 1)) - tdb->map_size;
334
335	/* expand the file itself */
336        if (tdb->fd != -1) {
337            lseek(tdb->fd, tdb->map_size + length - 1, SEEK_SET);
338            if (write(tdb->fd, &b, 1) != 1) goto fail;
339        }
340
341	/* form a new freelist record */
342	offset = FREELIST_TOP;
343	rec.rec_len = length - sizeof(rec);
344	rec.magic = TDB_FREE_MAGIC;
345	if (ofs_read(tdb, offset, &rec.next) == -1) {
346		goto fail;
347	}
348
349#if HAVE_MMAP
350	if (tdb->fd != -1 && tdb->map_ptr) {
351		munmap(tdb->map_ptr, tdb->map_size);
352		tdb->map_ptr = NULL;
353	}
354#endif
355
356	tdb->map_size += length;
357
358        if (tdb->fd == -1) {
359            tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
360        }
361
362	/* write it out */
363	if (rec_write(tdb, tdb->map_size - length, &rec) == -1) {
364		goto fail;
365	}
366
367	/* link it into the free list */
368	ptr = tdb->map_size - length;
369	if (ofs_write(tdb, offset, &ptr) == -1) goto fail;
370
371#if HAVE_MMAP
372        if (tdb->fd != -1) {
373            tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
374                                        PROT_READ|PROT_WRITE,
375                                        MAP_SHARED | MAP_FILE, tdb->fd, 0);
376        }
377#endif
378
379	tdb_unlock(tdb, -1);
380	return 0;
381
382 fail:
383	tdb_unlock(tdb,-1);
384	return -1;
385}
386
387/* allocate some space from the free list. The offset returned points
388   to a unconnected list_struct within the database with room for at
389   least length bytes of total data
390
391   0 is returned if the space could not be allocated
392 */
393static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length)
394{
395	tdb_off offset, rec_ptr, last_ptr;
396	struct list_struct rec, lastrec, newrec;
397
398	tdb_lock(tdb, -1);
399
400 again:
401	last_ptr = 0;
402	offset = FREELIST_TOP;
403
404	/* read in the freelist top */
405	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
406		goto fail;
407	}
408
409	/* keep looking until we find a freelist record that is big
410           enough */
411	while (rec_ptr) {
412		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
413			goto fail;
414		}
415
416		if (rec.magic != TDB_FREE_MAGIC) {
417#if TDB_DEBUG
418			printf("bad magic 0x%08x in free list\n", rec.magic);
419#endif
420			goto fail;
421		}
422
423		if (rec.rec_len >= length) {
424			/* found it - now possibly split it up  */
425			if (rec.rec_len > length + MIN_REC_SIZE) {
426				length = (length + TDB_ALIGN) & ~(TDB_ALIGN-1);
427
428				newrec.rec_len = rec.rec_len - (sizeof(rec) + length);
429				newrec.next = rec.next;
430				newrec.magic = TDB_FREE_MAGIC;
431
432				rec.rec_len = length;
433				rec.next = rec_ptr + sizeof(rec) + rec.rec_len;
434
435				if (rec_write(tdb, rec.next, &newrec) == -1) {
436					goto fail;
437				}
438
439				if (rec_write(tdb, rec_ptr, &rec) == -1) {
440					goto fail;
441				}
442			}
443
444			/* remove it from the list */
445			if (last_ptr == 0) {
446				offset = FREELIST_TOP;
447
448				if (ofs_write(tdb, offset, &rec.next) == -1) {
449					goto fail;
450				}
451			} else {
452				lastrec.next = rec.next;
453				if (rec_write(tdb, last_ptr, &lastrec) == -1) {
454					goto fail;
455				}
456			}
457
458			/* all done - return the new record offset */
459			tdb_unlock(tdb, -1);
460			return rec_ptr;
461		}
462
463		/* move to the next record */
464		lastrec = rec;
465		last_ptr = rec_ptr;
466		rec_ptr = rec.next;
467	}
468
469	/* we didn't find enough space. See if we can expand the
470	   database and if we can then try again */
471	if (tdb_expand(tdb, length + sizeof(rec)) == 0) goto again;
472
473 fail:
474#if TDB_DEBUG
475	printf("tdb_allocate failed for size %u\n", length);
476#endif
477	tdb_unlock(tdb, -1);
478	return 0;
479}
480
481/* initialise a new database with a specified hash size */
482static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
483{
484	struct tdb_header header;
485	tdb_off offset;
486	int i, size = 0;
487	tdb_off buf[16];
488
489        /* create the header */
490        memset(&header, 0, sizeof(header));
491        memcpy(header.magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
492        header.version = TDB_VERSION;
493        header.hash_size = hash_size;
494        lseek(tdb->fd, 0, SEEK_SET);
495        ftruncate(tdb->fd, 0);
496
497        if (tdb->fd != -1 && write(tdb->fd, &header, sizeof(header)) !=
498            sizeof(header)) {
499            tdb->ecode = TDB_ERR_IO;
500            return -1;
501        } else size += sizeof(header);
502
503        /* the freelist and hash pointers */
504        offset = 0;
505        memset(buf, 0, sizeof(buf));
506
507        for (i=0;(hash_size+1)-i >= 16; i += 16) {
508            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(buf)) !=
509                sizeof(buf)) {
510                tdb->ecode = TDB_ERR_IO;
511                return -1;
512            } else size += sizeof(buf);
513        }
514
515        for (;i<hash_size+1; i++) {
516            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(tdb_off)) !=
517                sizeof(tdb_off)) {
518                tdb->ecode = TDB_ERR_IO;
519                return -1;
520            } else size += sizeof(tdb_off);
521        }
522
523        if (tdb->fd == -1) {
524            tdb->map_ptr = calloc(size, 1);
525            tdb->map_size = size;
526            if (tdb->map_ptr == NULL) {
527                tdb->ecode = TDB_ERR_IO;
528                return -1;
529            }
530            memcpy(&tdb->header, &header, sizeof(header));
531        }
532
533#if TDB_DEBUG
534	printf("initialised database of hash_size %u\n",
535	       hash_size);
536#endif
537	return 0;
538}
539
540/* Returns 0 on fail.  On success, return offset of record, and fills
541   in rec */
542static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, unsigned int hash,
543			struct list_struct *rec)
544{
545	tdb_off offset, rec_ptr;
546
547	/* find the top of the hash chain */
548	offset = tdb_hash_top(tdb, hash);
549
550	/* read in the hash top */
551	if (ofs_read(tdb, offset, &rec_ptr) == -1)
552		return 0;
553
554	/* keep looking until we find the right record */
555	while (rec_ptr) {
556		if (rec_read(tdb, rec_ptr, rec) == -1)
557			return 0;
558
559		if (hash == rec->full_hash && key.dsize == rec->key_len) {
560			char *k;
561			/* a very likely hit - read the key */
562			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*rec),
563					   rec->key_len);
564
565			if (!k)
566				return 0;
567
568			if (memcmp(key.dptr, k, key.dsize) == 0) {
569				free(k);
570				return rec_ptr;
571			}
572			free(k);
573		}
574
575		/* move to the next record */
576		rec_ptr = rec->next;
577	}
578	return 0;
579}
580
581/*
582   return an error string for the last tdb error
583*/
584char *tdb_error(TDB_CONTEXT *tdb)
585{
586	int i;
587	static struct {
588		enum TDB_ERROR ecode;
589		char *estring;
590	} emap[] = {
591		{TDB_SUCCESS, "Success"},
592		{TDB_ERR_CORRUPT, "Corrupt database"},
593		{TDB_ERR_IO, "IO Error"},
594		{TDB_ERR_LOCK, "Locking error"},
595		{TDB_ERR_OOM, "Out of memory"},
596		{TDB_ERR_EXISTS, "Record exists"},
597		{-1, NULL}};
598        if (tdb != NULL) {
599            for (i=0;emap[i].estring;i++) {
600		if (tdb->ecode == emap[i].ecode) return emap[i].estring;
601            }
602        } else {
603            return "Invalid tdb context";
604        }
605	return "Invalid error code";
606}
607
608
609/* update an entry in place - this only works if the new data size
610   is <= the old data size and the key exists.
611   on failure return -1
612*/
613int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
614{
615	unsigned hash;
616	struct list_struct rec;
617	tdb_off rec_ptr;
618	int ret = -1;
619
620        if (tdb == NULL) {
621#ifdef TDB_DEBUG
622            printf("tdb_update() called with null context\n");
623#endif
624            return -1;
625        }
626
627	/* find which hash bucket it is in */
628	hash = tdb_hash(&key);
629
630	tdb_lock(tdb, BUCKET(hash));
631	rec_ptr = tdb_find(tdb, key, hash, &rec);
632
633	if (!rec_ptr)
634		goto out;
635
636	/* must be long enough */
637	if (rec.rec_len < key.dsize + dbuf.dsize)
638		goto out;
639
640	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
641		      dbuf.dptr, dbuf.dsize) == -1)
642		goto out;
643
644	if (dbuf.dsize != rec.data_len) {
645		/* update size */
646		rec.data_len = dbuf.dsize;
647		ret = rec_write(tdb, rec_ptr, &rec);
648	} else
649		ret = 0;
650
651 out:
652	tdb_unlock(tdb, BUCKET(hash));
653	return ret;
654}
655
656/* find an entry in the database given a key */
657TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
658{
659	unsigned hash;
660	tdb_off rec_ptr;
661	struct list_struct rec;
662	TDB_DATA ret = null_data;
663
664        if (tdb == NULL) {
665#ifdef TDB_DEBUG
666            printf("tdb_fetch() called with null context\n");
667#endif
668            return null_data;
669        }
670
671	/* find which hash bucket it is in */
672	hash = tdb_hash(&key);
673
674	tdb_lock(tdb, BUCKET(hash));
675	rec_ptr = tdb_find(tdb, key, hash, &rec);
676
677	if (rec_ptr) {
678		ret.dptr = tdb_alloc_read(tdb,
679					  rec_ptr + sizeof(rec) + rec.key_len,
680					  rec.data_len);
681		ret.dsize = rec.data_len;
682	}
683
684	tdb_unlock(tdb, BUCKET(hash));
685	return ret;
686}
687
688/* check if an entry in the database exists
689
690   note that 1 is returned if the key is found and 0 is returned if not found
691   this doesn't match the conventions in the rest of this module, but is
692   compatible with gdbm
693*/
694int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
695{
696	unsigned hash;
697	tdb_off rec_ptr;
698	struct list_struct rec;
699
700        if (tdb == NULL) {
701#ifdef TDB_DEBUG
702            printf("tdb_exists() called with null context\n");
703#endif
704            return 0;
705        }
706
707	/* find which hash bucket it is in */
708	hash = tdb_hash(&key);
709
710	tdb_lock(tdb, BUCKET(hash));
711	rec_ptr = tdb_find(tdb, key, hash, &rec);
712	tdb_unlock(tdb, BUCKET(hash));
713
714	return rec_ptr != 0;
715}
716
717/* traverse the entire database - calling fn(tdb, key, data) on each element.
718   return -1 on error or the record count traversed
719   if fn is NULL then it is not called
720   a non-zero return value from fn() indicates that the traversal should stop
721  */
722int tdb_traverse(TDB_CONTEXT *tdb, int (*fn)(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, void* state), void* state)
723{
724	int count = 0;
725	unsigned h;
726	tdb_off offset, rec_ptr;
727	struct list_struct rec;
728	char *data;
729	TDB_DATA key, dbuf;
730
731        if (tdb == NULL) {
732#ifdef TDB_DEBUG
733            printf("tdb_traverse() called with null context\n");
734#endif
735            return -1;
736        }
737
738	/* loop over all hash chains */
739	for (h = 0; h < tdb->header.hash_size; h++) {
740		tdb_lock(tdb, BUCKET(h));
741
742		/* read in the hash top */
743		offset = tdb_hash_top(tdb, h);
744		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
745			goto fail;
746		}
747
748		/* traverse all records for this hash */
749		while (rec_ptr) {
750			if (rec_read(tdb, rec_ptr, &rec) == -1) {
751				goto fail;
752			}
753
754			/* now read the full record */
755			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
756					     rec.key_len + rec.data_len);
757			if (!data) {
758				goto fail;
759			}
760
761			key.dptr = data;
762			key.dsize = rec.key_len;
763			dbuf.dptr = data + rec.key_len;
764			dbuf.dsize = rec.data_len;
765			count++;
766
767			if (fn && fn(tdb, key, dbuf, state) != 0) {
768				/* they want us to stop traversing */
769				free(data);
770				tdb_unlock(tdb, BUCKET(h));
771				return count;
772			}
773
774			/* a miss - drat */
775			free(data);
776
777			/* move to the next record */
778			rec_ptr = rec.next;
779		}
780		tdb_unlock(tdb, BUCKET(h));
781	}
782
783	/* return the number traversed */
784	return count;
785
786 fail:
787	tdb_unlock(tdb, BUCKET(h));
788	return -1;
789}
790
791
792/* find the first entry in the database and return its key */
793TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
794{
795	tdb_off offset, rec_ptr;
796	struct list_struct rec;
797	unsigned hash;
798	TDB_DATA ret;
799
800        if (tdb == NULL) {
801#ifdef TDB_DEBUG
802            printf("tdb_firstkey() called with null context\n");
803#endif
804            return null_data;
805        }
806
807	/* look for a non-empty hash chain */
808	for (hash = 0, rec_ptr = 0;
809	     hash < tdb->header.hash_size;
810	     hash++) {
811		/* find the top of the hash chain */
812		offset = tdb_hash_top(tdb, hash);
813
814		tdb_lock(tdb, BUCKET(hash));
815
816		/* read in the hash top */
817		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
818			goto fail;
819		}
820
821		if (rec_ptr) break;
822
823		tdb_unlock(tdb, BUCKET(hash));
824	}
825
826	if (rec_ptr == 0) return null_data;
827
828	/* we've found a non-empty chain, now read the record */
829	if (rec_read(tdb, rec_ptr, &rec) == -1) {
830		goto fail;
831	}
832
833	/* allocate and read the key space */
834	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
835	ret.dsize = rec.key_len;
836	tdb_unlock(tdb, BUCKET(hash));
837	return ret;
838
839 fail:
840	tdb_unlock(tdb, BUCKET(hash));
841	return null_data;
842}
843
844/* find the next entry in the database, returning its key */
845TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key)
846{
847	unsigned hash, hbucket;
848	tdb_off rec_ptr, offset;
849	struct list_struct rec;
850	TDB_DATA ret;
851
852        if (tdb == NULL) {
853#ifdef TDB_DEBUG
854            printf("tdb_nextkey() called with null context\n");
855#endif
856            return null_data;
857        }
858
859	/* find which hash bucket it is in */
860	hash = tdb_hash(&key);
861	hbucket = BUCKET(hash);
862
863	tdb_lock(tdb, hbucket);
864	rec_ptr = tdb_find(tdb, key, hash, &rec);
865	if (rec_ptr) {
866		/* we want the next record after this one */
867		rec_ptr = rec.next;
868	}
869
870	/* not found or last in hash: look for next non-empty hash chain */
871	while (rec_ptr == 0) {
872		tdb_unlock(tdb, hbucket);
873
874		if (++hbucket >= tdb->header.hash_size - 1)
875			return null_data;
876
877		offset = tdb_hash_top(tdb, hbucket);
878		tdb_lock(tdb, hbucket);
879		/* read in the hash top */
880		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
881			tdb_unlock(tdb, hbucket);
882			return null_data;
883		}
884	}
885
886	/* Read the record. */
887	if (rec_read(tdb, rec_ptr, &rec) == -1) {
888		tdb_unlock(tdb, hbucket);
889		return null_data;
890	}
891	/* allocate and read the key */
892	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
893	ret.dsize = rec.key_len;
894	tdb_unlock(tdb, hbucket);
895
896	return ret;
897}
898
899/* delete an entry in the database given a key */
900int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
901{
902	unsigned hash;
903	tdb_off offset, rec_ptr, last_ptr;
904	struct list_struct rec, lastrec;
905	char *data = NULL;
906
907        if (tdb == NULL) {
908#ifdef TDB_DEBUG
909            printf("tdb_delete() called with null context\n");
910#endif
911            return -1;
912        }
913
914	/* find which hash bucket it is in */
915	hash = tdb_hash(&key);
916
917	tdb_lock(tdb, BUCKET(hash));
918
919	/* find the top of the hash chain */
920	offset = tdb_hash_top(tdb, hash);
921
922	/* read in the hash top */
923	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
924		goto fail;
925	}
926
927	last_ptr = 0;
928
929	/* keep looking until we find the right record */
930	while (rec_ptr) {
931		if (rec_read(tdb, rec_ptr, &rec) == -1) {
932			goto fail;
933		}
934
935		if (hash == rec.full_hash && key.dsize == rec.key_len) {
936			/* a very likely hit - read the record and full key */
937			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
938					     rec.key_len);
939			if (!data) {
940				goto fail;
941			}
942
943			if (memcmp(key.dptr, data, key.dsize) == 0) {
944				/* a definite match - delete it */
945				if (last_ptr == 0) {
946					offset = tdb_hash_top(tdb, hash);
947					if (ofs_write(tdb, offset, &rec.next) == -1) {
948						goto fail;
949					}
950				} else {
951					lastrec.next = rec.next;
952					if (rec_write(tdb, last_ptr, &lastrec) == -1) {
953						goto fail;
954					}
955				}
956				tdb_unlock(tdb, BUCKET(hash));
957				tdb_lock(tdb, -1);
958				/* and recover the space */
959				offset = FREELIST_TOP;
960				if (ofs_read(tdb, offset, &rec.next) == -1) {
961					goto fail2;
962				}
963				rec.magic = TDB_FREE_MAGIC;
964				if (rec_write(tdb, rec_ptr, &rec) == -1) {
965					goto fail2;
966				}
967				if (ofs_write(tdb, offset, &rec_ptr) == -1) {
968					goto fail2;
969				}
970
971				/* yipee - all done */
972				free(data);
973				tdb_unlock(tdb, -1);
974				return 0;
975			}
976
977			/* a miss - drat */
978			free(data);
979			data = NULL;
980		}
981
982		/* move to the next record */
983		last_ptr = rec_ptr;
984		lastrec = rec;
985		rec_ptr = rec.next;
986	}
987
988 fail:
989	if (data) free(data);
990	tdb_unlock(tdb, BUCKET(hash));
991	return -1;
992
993 fail2:
994	if (data) free(data);
995	tdb_unlock(tdb, -1);
996	return -1;
997}
998
999
1000/* store an element in the database, replacing any existing element
1001   with the same key
1002
1003   return 0 on success, -1 on failure
1004*/
1005int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1006{
1007	struct list_struct rec;
1008	unsigned hash;
1009	tdb_off rec_ptr, offset;
1010	char *p = NULL;
1011
1012        if (tdb == NULL) {
1013#ifdef TDB_DEBUG
1014            printf("tdb_store() called with null context\n");
1015#endif
1016            return -1;
1017        }
1018
1019	/* find which hash bucket it is in */
1020	hash = tdb_hash(&key);
1021
1022	/* check for it existing */
1023	if (flag == TDB_INSERT && tdb_exists(tdb, key)) {
1024		tdb->ecode = TDB_ERR_EXISTS;
1025		return -1;
1026	}
1027
1028	/* first try in-place update */
1029	if (flag != TDB_INSERT && tdb_update(tdb, key, dbuf) == 0) {
1030		return 0;
1031	}
1032
1033	rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize);
1034	if (rec_ptr == 0) {
1035		return -1;
1036	}
1037
1038	tdb_lock(tdb, BUCKET(hash));
1039
1040	/* delete any existing record - if it doesn't exist we don't care */
1041	if (flag != TDB_INSERT) {
1042		tdb_delete(tdb, key);
1043	}
1044
1045	/* read the newly created record */
1046	if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
1047		goto fail;
1048	}
1049
1050	if (rec.magic != TDB_FREE_MAGIC) goto fail;
1051
1052	/* find the top of the hash chain */
1053	offset = tdb_hash_top(tdb, hash);
1054
1055	/* read in the hash top diretcly into our next pointer */
1056	if (ofs_read(tdb, offset, &rec.next) == -1) {
1057		goto fail;
1058	}
1059
1060	rec.key_len = key.dsize;
1061	rec.data_len = dbuf.dsize;
1062	rec.full_hash = hash;
1063	rec.magic = TDB_MAGIC;
1064
1065	p = (char *)malloc(sizeof(rec) + key.dsize + dbuf.dsize);
1066	if (!p) {
1067		tdb->ecode = TDB_ERR_OOM;
1068		goto fail;
1069	}
1070
1071	memcpy(p, &rec, sizeof(rec));
1072	memcpy(p+sizeof(rec), key.dptr, key.dsize);
1073	memcpy(p+sizeof(rec)+key.dsize, dbuf.dptr, dbuf.dsize);
1074
1075	if (tdb_write(tdb, rec_ptr, p, sizeof(rec)+key.dsize+dbuf.dsize) == -1)
1076		goto fail;
1077
1078	free(p);
1079	p = NULL;
1080
1081	/* and point the top of the hash chain at it */
1082	if (ofs_write(tdb, offset, &rec_ptr) == -1) goto fail;
1083
1084	tdb_unlock(tdb, BUCKET(hash));
1085	return 0;
1086
1087 fail:
1088#if TDB_DEBUG
1089	printf("store failed for hash 0x%08x in bucket %u\n", hash, BUCKET(hash));
1090#endif
1091	if (p) free(p);
1092	tdb_unlock(tdb, BUCKET(hash));
1093	return -1;
1094}
1095
1096
1097/* open the database, creating it if necessary
1098
1099   The open_flags and mode are passed straight to the open call on the database
1100   file. A flags value of O_WRONLY is invalid
1101
1102   The hash size is advisory, use zero for a default value.
1103
1104   return is NULL on error
1105*/
1106TDB_CONTEXT *tdb_open(char *name, int hash_size, int tdb_flags,
1107		      int open_flags, mode_t mode)
1108{
1109	TDB_CONTEXT tdb, *ret;
1110	struct stat st;
1111
1112	memset(&tdb, 0, sizeof(tdb));
1113
1114	tdb.fd = -1;
1115	tdb.name = NULL;
1116	tdb.map_ptr = NULL;
1117
1118	if ((open_flags & O_ACCMODE) == O_WRONLY) {
1119		goto fail;
1120	}
1121
1122	if (hash_size == 0) hash_size = DEFAULT_HASH_SIZE;
1123
1124	tdb.read_only = ((open_flags & O_ACCMODE) == O_RDONLY);
1125
1126        if (name != NULL) {
1127            tdb.fd = open(name, open_flags, mode);
1128            if (tdb.fd == -1) {
1129		goto fail;
1130            }
1131        }
1132
1133	/* ensure there is only one process initialising at once */
1134	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_SET, F_WRLCK, F_SETLKW);
1135
1136	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1137		/* we need to zero the database if we are the only
1138		   one with it open */
1139		if (tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_WRLCK, F_SETLK) == 0) {
1140			ftruncate(tdb.fd, 0);
1141			tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLK);
1142		}
1143	}
1144
1145	/* leave this lock in place */
1146	tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_RDLCK, F_SETLKW);
1147
1148	if (read(tdb.fd, &tdb.header, sizeof(tdb.header)) != sizeof(tdb.header) ||
1149	    strcmp(tdb.header.magic_food, TDB_MAGIC_FOOD) != 0 ||
1150	    tdb.header.version != TDB_VERSION) {
1151		/* its not a valid database - possibly initialise it */
1152		if (!(open_flags & O_CREAT)) {
1153			goto fail;
1154		}
1155		if (tdb_new_database(&tdb, hash_size) == -1) goto fail;
1156
1157		lseek(tdb.fd, 0, SEEK_SET);
1158		if (tdb.fd != -1 && read(tdb.fd, &tdb.header,
1159                                         sizeof(tdb.header)) !=
1160                                         sizeof(tdb.header))
1161                    goto fail;
1162	}
1163
1164        if (tdb.fd != -1) {
1165            fstat(tdb.fd, &st);
1166
1167            /* map the database and fill in the return structure */
1168            tdb.name = (char *)strdup(name);
1169            tdb.map_size = st.st_size;
1170        }
1171
1172        tdb.locked = (int *)calloc(tdb.header.hash_size+1,
1173                                   sizeof(tdb.locked[0]));
1174        if (!tdb.locked) {
1175            goto fail;
1176        }
1177
1178#if HAVE_MMAP
1179        if (tdb.fd != -1) {
1180            tdb.map_ptr = (void *)mmap(NULL, st.st_size,
1181                                       tdb.read_only? PROT_READ : PROT_READ|PROT_WRITE,
1182                                       MAP_SHARED | MAP_FILE, tdb.fd, 0);
1183        }
1184#endif
1185
1186	ret = (TDB_CONTEXT *)malloc(sizeof(tdb));
1187	if (!ret) goto fail;
1188
1189	*ret = tdb;
1190
1191#if TDB_DEBUG
1192	printf("mapped database of hash_size %u map_size=%u\n",
1193	       hash_size, tdb.map_size);
1194#endif
1195
1196	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLKW);
1197	return ret;
1198
1199 fail:
1200        if (tdb.name) free(tdb.name);
1201	if (tdb.fd != -1) close(tdb.fd);
1202	if (tdb.map_ptr) munmap(tdb.map_ptr, tdb.map_size);
1203
1204	return NULL;
1205}
1206
1207/* close a database */
1208int tdb_close(TDB_CONTEXT *tdb)
1209{
1210	if (!tdb) return -1;
1211
1212	if (tdb->name) free(tdb->name);
1213	if (tdb->fd != -1) close(tdb->fd);
1214	if (tdb->locked) free(tdb->locked);
1215
1216	if (tdb->map_ptr) {
1217            if (tdb->fd != -1) {
1218                munmap(tdb->map_ptr, tdb->map_size);
1219            } else {
1220                free(tdb->map_ptr);
1221            }
1222        }
1223
1224	memset(tdb, 0, sizeof(*tdb));
1225	free(tdb);
1226
1227	return 0;
1228}
1229
1230/* lock the database. If we already have it locked then don't do anything */
1231int tdb_writelock(TDB_CONTEXT *tdb)
1232{
1233        if (tdb == NULL) {
1234#ifdef TDB_DEBUG
1235            printf("tdb_writelock() called with null context\n");
1236#endif
1237            return -1;
1238        }
1239
1240	return tdb_lock(tdb, -1);
1241}
1242
1243/* unlock the database. */
1244int tdb_writeunlock(TDB_CONTEXT *tdb)
1245{
1246        if (tdb == NULL) {
1247#ifdef TDB_DEBUG
1248            printf("tdb_writeunlock() called with null context\n");
1249#endif
1250            return -1;
1251        }
1252
1253	return tdb_unlock(tdb, -1);
1254}
1255
1256/* lock one hash chain. This is meant to be used to reduce locking
1257   contention - it cannot guarantee how many records will be locked */
1258int tdb_lockchain(TDB_CONTEXT *tdb, TDB_DATA key)
1259{
1260        if (tdb == NULL) {
1261#ifdef TDB_DEBUG
1262            printf("tdb_lockchain() called with null context\n");
1263#endif
1264            return -1;
1265        }
1266
1267	return tdb_lock(tdb, BUCKET(tdb_hash(&key)));
1268}
1269
1270
1271/* unlock one hash chain */
1272int tdb_unlockchain(TDB_CONTEXT *tdb, TDB_DATA key)
1273{
1274        if (tdb == NULL) {
1275#ifdef TDB_DEBUG
1276            printf("tdb_unlockchain() called with null context\n");
1277#endif
1278            return -1;
1279        }
1280
1281	return tdb_unlock(tdb, BUCKET(tdb_hash(&key)));
1282}
1283