1/*
2 * Database functions
3 * Copyright (C) Andrew Tridgell 1999
4 *
5 * Redistribution and use in source and binary forms are permitted
6 * provided that the above copyright notice and this paragraph are
7 * duplicated in all such forms AND provided that this software or
8 * any derived work is only used as part of the PPP daemon (pppd)
9 * and related utilities.
10 * The name of the author may not be used to endorse or promote products
11 * derived from this software without specific prior written permission.
12 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
13 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
14 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
15 *
16 * Note: this software is also available under the Gnu Public License
17 * version 2 or later.
18 */
19
20#include <stdlib.h>
21#include <stdio.h>
22#include <fcntl.h>
23#include <unistd.h>
24#include <string.h>
25#include <fcntl.h>
26#include <errno.h>
27#include <sys/mman.h>
28#include <sys/stat.h>
29#include "tdb.h"
30
31#define TDB_VERSION (0x26011967 + 1)
32#define TDB_MAGIC (0x26011999U)
33#define TDB_FREE_MAGIC (~TDB_MAGIC)
34#define TDB_ALIGN 4
35#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGN)
36#define DEFAULT_HASH_SIZE 128
37#define TDB_PAGE_SIZE 0x2000
38#define TDB_LEN_MULTIPLIER 10
39#define FREELIST_TOP (sizeof(struct tdb_header))
40
41#define LOCK_SET 1
42#define LOCK_CLEAR 0
43
44/* lock offsets */
45#define GLOBAL_LOCK 0
46#define ACTIVE_LOCK 4
47#define LIST_LOCK_BASE 1024
48
49#define BUCKET(hash) ((hash) % tdb->header.hash_size)
50
51#ifndef MAP_FILE
52#define MAP_FILE 0
53#endif
54
55/* the body of the database is made of one list_struct for the free space
56   plus a separate data list for each hash value */
57struct list_struct {
58	tdb_len rec_len; /* total byte length of record */
59	tdb_off next; /* offset of the next record in the list */
60	tdb_len key_len; /* byte length of key */
61	tdb_len data_len; /* byte length of data */
62	unsigned full_hash; /* the full 32 bit hash of the key */
63	unsigned magic;   /* try to catch errors */
64	/*
65	   the following union is implied
66	   union {
67              char record[rec_len];
68	      struct {
69	        char key[key_len];
70		char data[data_len];
71	      }
72           }
73	*/
74};
75
76/* a null data record - useful for error returns */
77static TDB_DATA null_data;
78
79/* a byte range locking function - return 0 on success
80   this functions locks/unlocks 1 byte at the specified offset */
81static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
82		      int set, int rw_type, int lck_type)
83{
84#if NOLOCK
85	return 0;
86#else
87	struct flock fl;
88
89        if (tdb->fd == -1) return 0;   /* for in memory tdb */
90
91	if (tdb->read_only) return -1;
92
93	fl.l_type = set==LOCK_SET?rw_type:F_UNLCK;
94	fl.l_whence = SEEK_SET;
95	fl.l_start = offset;
96	fl.l_len = 1;
97	fl.l_pid = 0;
98
99	if (fcntl(tdb->fd, lck_type, &fl) != 0) {
100#if TDB_DEBUG
101		if (lck_type == F_SETLKW) {
102			printf("lock %d failed at %d (%s)\n",
103			       set, offset, strerror(errno));
104		}
105#endif
106		tdb->ecode = TDB_ERR_LOCK;
107		return -1;
108	}
109	return 0;
110#endif
111}
112
113/* lock a list in the database. list -1 is the alloc list */
114static int tdb_lock(TDB_CONTEXT *tdb, int list)
115{
116	if (list < -1 || list >= (int)tdb->header.hash_size) {
117#if TDB_DEBUG
118		printf("bad list %d\n", list);
119#endif
120		return -1;
121	}
122	if (tdb->locked[list+1] == 0) {
123		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_SET,
124			       F_WRLCK, F_SETLKW) != 0) {
125			return -1;
126		}
127	}
128	tdb->locked[list+1]++;
129	return 0;
130}
131
132/* unlock the database. */
133static int tdb_unlock(TDB_CONTEXT *tdb, int list)
134{
135	if (list < -1 || list >= (int)tdb->header.hash_size) {
136#if TDB_DEBUG
137		printf("bad unlock list %d\n", list);
138#endif
139		return -1;
140	}
141
142	if (tdb->locked[list+1] == 0) {
143#if TDB_DEBUG
144		printf("not locked %d\n", list);
145#endif
146		tdb->ecode = TDB_ERR_LOCK;
147		return -1;
148	}
149	if (tdb->locked[list+1] == 1) {
150		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_CLEAR,
151			       F_WRLCK, F_SETLKW) != 0) {
152			return -1;
153		}
154	}
155	tdb->locked[list+1]--;
156	return 0;
157}
158
159/* the hash algorithm - turn a key into an integer
160   This is based on the hash agorithm from gdbm */
161static unsigned tdb_hash(TDB_DATA *key)
162{
163	unsigned value;	/* Used to compute the hash value.  */
164	unsigned   i;	/* Used to cycle through random values. */
165
166	/* Set the initial value from the key size. */
167	value = 0x238F13AF * key->dsize;
168	for (i=0; i < key->dsize; i++) {
169		value = (value + (key->dptr[i] << (i*5 % 24)));
170	}
171
172	value = (1103515243 * value + 12345);
173
174	return value;
175}
176
177/* find the top of the hash chain for an open database */
178static tdb_off tdb_hash_top(TDB_CONTEXT *tdb, unsigned hash)
179{
180	tdb_off ret;
181	hash = BUCKET(hash);
182	ret = FREELIST_TOP + (hash+1)*sizeof(tdb_off);
183	return ret;
184}
185
186
187/* check for an out of bounds access - if it is out of bounds then
188   see if the database has been expanded by someone else and expand
189   if necessary */
190static int tdb_oob(TDB_CONTEXT *tdb, tdb_off offset)
191{
192	struct stat st;
193	if ((offset <= tdb->map_size) || (tdb->fd == -1)) return 0;
194
195	fstat(tdb->fd, &st);
196	if (st.st_size <= (ssize_t)offset) {
197		tdb->ecode = TDB_ERR_IO;
198		return -1;
199	}
200
201#if HAVE_MMAP
202	if (tdb->map_ptr) {
203		munmap(tdb->map_ptr, tdb->map_size);
204		tdb->map_ptr = NULL;
205	}
206#endif
207
208	tdb->map_size = st.st_size;
209#if HAVE_MMAP
210	tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
211				    tdb->read_only?PROT_READ:PROT_READ|PROT_WRITE,
212				    MAP_SHARED | MAP_FILE, tdb->fd, 0);
213	if (tdb->map_ptr == MAP_FAILED) {
214	    tdb->map_ptr = NULL;
215	}
216#endif
217	return 0;
218}
219
220
221/* write a lump of data at a specified offset */
222static int tdb_write(TDB_CONTEXT *tdb, tdb_off offset, const char *buf, tdb_len len)
223{
224	if (tdb_oob(tdb, offset + len) != 0) {
225		/* oops - trying to write beyond the end of the database! */
226		return -1;
227	}
228
229	if (tdb->map_ptr) {
230		memcpy(offset + (char *)tdb->map_ptr, buf, len);
231	} else {
232		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
233		    write(tdb->fd, buf, len) != (ssize_t)len) {
234			tdb->ecode = TDB_ERR_IO;
235			return -1;
236		}
237	}
238	return 0;
239}
240
241/* read a lump of data at a specified offset */
242static int tdb_read(TDB_CONTEXT *tdb, tdb_off offset, char *buf, tdb_len len)
243{
244	if (tdb_oob(tdb, offset + len) != 0) {
245		/* oops - trying to read beyond the end of the database! */
246		return -1;
247	}
248
249	if (tdb->map_ptr) {
250		memcpy(buf, offset + (char *)tdb->map_ptr, len);
251	} else {
252		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
253		    read(tdb->fd, buf, len) != (ssize_t)len) {
254			tdb->ecode = TDB_ERR_IO;
255			return -1;
256		}
257	}
258	return 0;
259}
260
261
262/* read a lump of data, allocating the space for it */
263static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
264{
265	char *buf;
266
267	buf = (char *)malloc(len);
268
269	if (!buf) {
270		tdb->ecode = TDB_ERR_OOM;
271		return NULL;
272	}
273
274	if (tdb_read(tdb, offset, buf, len) == -1) {
275		free(buf);
276		return NULL;
277	}
278
279	return buf;
280}
281
282/* convenience routine for writing a record */
283static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
284{
285	return tdb_write(tdb, offset, (char *)rec, sizeof(*rec));
286}
287
288/* convenience routine for writing a tdb_off */
289static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
290{
291	return tdb_write(tdb, offset, (char *)d, sizeof(*d));
292}
293
294/* read a tdb_off from the store */
295static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
296{
297	return tdb_read(tdb, offset, (char *)d, sizeof(*d));
298}
299
300/* read a record and check for simple errors */
301static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
302{
303	if (tdb_read(tdb, offset, (char *)rec, sizeof(*rec)) == -1) return -1;
304	if (rec->magic != TDB_MAGIC) {
305#if TDB_DEBUG
306		printf("bad magic 0x%08x at offset %d\n",
307		       rec->magic, offset);
308#endif
309		tdb->ecode = TDB_ERR_CORRUPT;
310		return -1;
311	}
312	if (tdb_oob(tdb, rec->next) != 0) {
313		return -1;
314	}
315	return 0;
316}
317
318/* expand the database at least length bytes by expanding the
319   underlying file and doing the mmap again if necessary */
320static int tdb_expand(TDB_CONTEXT *tdb, tdb_off length)
321{
322	struct list_struct rec;
323	tdb_off offset, ptr;
324	char b = 0;
325
326	tdb_lock(tdb,-1);
327
328	/* make sure we know about any previous expansions by another
329           process */
330	tdb_oob(tdb,tdb->map_size + 1);
331
332	/* always make room for at least 10 more records */
333	length *= TDB_LEN_MULTIPLIER;
334
335	/* and round the database up to a multiple of TDB_PAGE_SIZE */
336	length = ((tdb->map_size + length + TDB_PAGE_SIZE) & ~(TDB_PAGE_SIZE - 1)) - tdb->map_size;
337
338	/* expand the file itself */
339        if (tdb->fd != -1) {
340            lseek(tdb->fd, tdb->map_size + length - 1, SEEK_SET);
341            if (write(tdb->fd, &b, 1) != 1) goto fail;
342        }
343
344	/* form a new freelist record */
345	offset = FREELIST_TOP;
346	rec.rec_len = length - sizeof(rec);
347	rec.magic = TDB_FREE_MAGIC;
348	if (ofs_read(tdb, offset, &rec.next) == -1) {
349		goto fail;
350	}
351
352#if HAVE_MMAP
353	if (tdb->fd != -1 && tdb->map_ptr) {
354		munmap(tdb->map_ptr, tdb->map_size);
355		tdb->map_ptr = NULL;
356	}
357#endif
358
359	tdb->map_size += length;
360
361        if (tdb->fd == -1) {
362            tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
363        }
364
365	/* write it out */
366	if (rec_write(tdb, tdb->map_size - length, &rec) == -1) {
367		goto fail;
368	}
369
370	/* link it into the free list */
371	ptr = tdb->map_size - length;
372	if (ofs_write(tdb, offset, &ptr) == -1) goto fail;
373
374#if HAVE_MMAP
375        if (tdb->fd != -1) {
376            tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
377                                        PROT_READ|PROT_WRITE,
378                                        MAP_SHARED | MAP_FILE, tdb->fd, 0);
379	    if (tdb->map_ptr == MAP_FAILED) {
380		tdb->map_ptr = NULL;
381	    }
382        }
383#endif
384
385	tdb_unlock(tdb, -1);
386	return 0;
387
388 fail:
389	tdb_unlock(tdb,-1);
390	return -1;
391}
392
393/* allocate some space from the free list. The offset returned points
394   to a unconnected list_struct within the database with room for at
395   least length bytes of total data
396
397   0 is returned if the space could not be allocated
398 */
399static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length)
400{
401	tdb_off offset, rec_ptr, last_ptr;
402	struct list_struct rec, lastrec, newrec;
403
404	tdb_lock(tdb, -1);
405
406 again:
407	last_ptr = 0;
408	offset = FREELIST_TOP;
409
410	/* read in the freelist top */
411	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
412		goto fail;
413	}
414
415	/* keep looking until we find a freelist record that is big
416           enough */
417	while (rec_ptr) {
418		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
419			goto fail;
420		}
421
422		if (rec.magic != TDB_FREE_MAGIC) {
423#if TDB_DEBUG
424			printf("bad magic 0x%08x in free list\n", rec.magic);
425#endif
426			goto fail;
427		}
428
429		if (rec.rec_len >= length) {
430			/* found it - now possibly split it up  */
431			if (rec.rec_len > length + MIN_REC_SIZE) {
432				length = (length + TDB_ALIGN) & ~(TDB_ALIGN-1);
433
434				newrec.rec_len = rec.rec_len - (sizeof(rec) + length);
435				newrec.next = rec.next;
436				newrec.magic = TDB_FREE_MAGIC;
437
438				rec.rec_len = length;
439				rec.next = rec_ptr + sizeof(rec) + rec.rec_len;
440
441				if (rec_write(tdb, rec.next, &newrec) == -1) {
442					goto fail;
443				}
444
445				if (rec_write(tdb, rec_ptr, &rec) == -1) {
446					goto fail;
447				}
448			}
449
450			/* remove it from the list */
451			if (last_ptr == 0) {
452				offset = FREELIST_TOP;
453
454				if (ofs_write(tdb, offset, &rec.next) == -1) {
455					goto fail;
456				}
457			} else {
458				lastrec.next = rec.next;
459				if (rec_write(tdb, last_ptr, &lastrec) == -1) {
460					goto fail;
461				}
462			}
463
464			/* all done - return the new record offset */
465			tdb_unlock(tdb, -1);
466			return rec_ptr;
467		}
468
469		/* move to the next record */
470		lastrec = rec;
471		last_ptr = rec_ptr;
472		rec_ptr = rec.next;
473	}
474
475	/* we didn't find enough space. See if we can expand the
476	   database and if we can then try again */
477	if (tdb_expand(tdb, length + sizeof(rec)) == 0) goto again;
478
479 fail:
480#if TDB_DEBUG
481	printf("tdb_allocate failed for size %u\n", length);
482#endif
483	tdb_unlock(tdb, -1);
484	return 0;
485}
486
487/* initialise a new database with a specified hash size */
488static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
489{
490	struct tdb_header header;
491	tdb_off offset;
492	int i, size = 0;
493	tdb_off buf[16];
494
495        /* create the header */
496        memset(&header, 0, sizeof(header));
497        memcpy(header.magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
498        header.version = TDB_VERSION;
499        header.hash_size = hash_size;
500        lseek(tdb->fd, 0, SEEK_SET);
501        ftruncate(tdb->fd, 0);
502
503        if (tdb->fd != -1 && write(tdb->fd, &header, sizeof(header)) !=
504            sizeof(header)) {
505            tdb->ecode = TDB_ERR_IO;
506            return -1;
507        } else size += sizeof(header);
508
509        /* the freelist and hash pointers */
510        offset = 0;
511        memset(buf, 0, sizeof(buf));
512
513        for (i=0;(hash_size+1)-i >= 16; i += 16) {
514            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(buf)) !=
515                sizeof(buf)) {
516                tdb->ecode = TDB_ERR_IO;
517                return -1;
518            } else size += sizeof(buf);
519        }
520
521        for (;i<hash_size+1; i++) {
522            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(tdb_off)) !=
523                sizeof(tdb_off)) {
524                tdb->ecode = TDB_ERR_IO;
525                return -1;
526            } else size += sizeof(tdb_off);
527        }
528
529        if (tdb->fd == -1) {
530            tdb->map_ptr = calloc(size, 1);
531            tdb->map_size = size;
532            if (tdb->map_ptr == NULL) {
533                tdb->ecode = TDB_ERR_IO;
534                return -1;
535            }
536            memcpy(&tdb->header, &header, sizeof(header));
537        }
538
539#if TDB_DEBUG
540	printf("initialised database of hash_size %u\n",
541	       hash_size);
542#endif
543	return 0;
544}
545
546/* Returns 0 on fail.  On success, return offset of record, and fills
547   in rec */
548static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, unsigned int hash,
549			struct list_struct *rec)
550{
551	tdb_off offset, rec_ptr;
552
553	/* find the top of the hash chain */
554	offset = tdb_hash_top(tdb, hash);
555
556	/* read in the hash top */
557	if (ofs_read(tdb, offset, &rec_ptr) == -1)
558		return 0;
559
560	/* keep looking until we find the right record */
561	while (rec_ptr) {
562		if (rec_read(tdb, rec_ptr, rec) == -1)
563			return 0;
564
565		if (hash == rec->full_hash && key.dsize == rec->key_len) {
566			char *k;
567			/* a very likely hit - read the key */
568			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*rec),
569					   rec->key_len);
570
571			if (!k)
572				return 0;
573
574			if (memcmp(key.dptr, k, key.dsize) == 0) {
575				free(k);
576				return rec_ptr;
577			}
578			free(k);
579		}
580
581		/* move to the next record */
582		rec_ptr = rec->next;
583	}
584	return 0;
585}
586
587/*
588   return an error string for the last tdb error
589*/
590char *tdb_error(TDB_CONTEXT *tdb)
591{
592	int i;
593	static struct {
594		enum TDB_ERROR ecode;
595		char *estring;
596	} emap[] = {
597		{TDB_SUCCESS, "Success"},
598		{TDB_ERR_CORRUPT, "Corrupt database"},
599		{TDB_ERR_IO, "IO Error"},
600		{TDB_ERR_LOCK, "Locking error"},
601		{TDB_ERR_OOM, "Out of memory"},
602		{TDB_ERR_EXISTS, "Record exists"},
603		{-1, NULL}};
604        if (tdb != NULL) {
605            for (i=0;emap[i].estring;i++) {
606		if (tdb->ecode == emap[i].ecode) return emap[i].estring;
607            }
608        } else {
609            return "Invalid tdb context";
610        }
611	return "Invalid error code";
612}
613
614
615/* update an entry in place - this only works if the new data size
616   is <= the old data size and the key exists.
617   on failure return -1
618*/
619int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
620{
621	unsigned hash;
622	struct list_struct rec;
623	tdb_off rec_ptr;
624	int ret = -1;
625
626        if (tdb == NULL) {
627#ifdef TDB_DEBUG
628            printf("tdb_update() called with null context\n");
629#endif
630            return -1;
631        }
632
633	/* find which hash bucket it is in */
634	hash = tdb_hash(&key);
635
636	tdb_lock(tdb, BUCKET(hash));
637	rec_ptr = tdb_find(tdb, key, hash, &rec);
638
639	if (!rec_ptr)
640		goto out;
641
642	/* must be long enough */
643	if (rec.rec_len < key.dsize + dbuf.dsize)
644		goto out;
645
646	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
647		      dbuf.dptr, dbuf.dsize) == -1)
648		goto out;
649
650	if (dbuf.dsize != rec.data_len) {
651		/* update size */
652		rec.data_len = dbuf.dsize;
653		ret = rec_write(tdb, rec_ptr, &rec);
654	} else
655		ret = 0;
656
657 out:
658	tdb_unlock(tdb, BUCKET(hash));
659	return ret;
660}
661
662/* find an entry in the database given a key */
663TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
664{
665	unsigned hash;
666	tdb_off rec_ptr;
667	struct list_struct rec;
668	TDB_DATA ret = null_data;
669
670        if (tdb == NULL) {
671#ifdef TDB_DEBUG
672            printf("tdb_fetch() called with null context\n");
673#endif
674            return null_data;
675        }
676
677	/* find which hash bucket it is in */
678	hash = tdb_hash(&key);
679
680	tdb_lock(tdb, BUCKET(hash));
681	rec_ptr = tdb_find(tdb, key, hash, &rec);
682
683	if (rec_ptr) {
684		ret.dptr = tdb_alloc_read(tdb,
685					  rec_ptr + sizeof(rec) + rec.key_len,
686					  rec.data_len);
687		ret.dsize = rec.data_len;
688	}
689
690	tdb_unlock(tdb, BUCKET(hash));
691	return ret;
692}
693
694/* check if an entry in the database exists
695
696   note that 1 is returned if the key is found and 0 is returned if not found
697   this doesn't match the conventions in the rest of this module, but is
698   compatible with gdbm
699*/
700int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
701{
702	unsigned hash;
703	tdb_off rec_ptr;
704	struct list_struct rec;
705
706        if (tdb == NULL) {
707#ifdef TDB_DEBUG
708            printf("tdb_exists() called with null context\n");
709#endif
710            return 0;
711        }
712
713	/* find which hash bucket it is in */
714	hash = tdb_hash(&key);
715
716	tdb_lock(tdb, BUCKET(hash));
717	rec_ptr = tdb_find(tdb, key, hash, &rec);
718	tdb_unlock(tdb, BUCKET(hash));
719
720	return rec_ptr != 0;
721}
722
723/* traverse the entire database - calling fn(tdb, key, data) on each element.
724   return -1 on error or the record count traversed
725   if fn is NULL then it is not called
726   a non-zero return value from fn() indicates that the traversal should stop
727  */
728int tdb_traverse(TDB_CONTEXT *tdb, int (*fn)(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, void* state), void* state)
729{
730	int count = 0;
731	unsigned h;
732	tdb_off offset, rec_ptr;
733	struct list_struct rec;
734	char *data;
735	TDB_DATA key, dbuf;
736
737        if (tdb == NULL) {
738#ifdef TDB_DEBUG
739            printf("tdb_traverse() called with null context\n");
740#endif
741            return -1;
742        }
743
744	/* loop over all hash chains */
745	for (h = 0; h < tdb->header.hash_size; h++) {
746		tdb_lock(tdb, BUCKET(h));
747
748		/* read in the hash top */
749		offset = tdb_hash_top(tdb, h);
750		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
751			goto fail;
752		}
753
754		/* traverse all records for this hash */
755		while (rec_ptr) {
756			if (rec_read(tdb, rec_ptr, &rec) == -1) {
757				goto fail;
758			}
759
760			/* now read the full record */
761			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
762					     rec.key_len + rec.data_len);
763			if (!data) {
764				goto fail;
765			}
766
767			key.dptr = data;
768			key.dsize = rec.key_len;
769			dbuf.dptr = data + rec.key_len;
770			dbuf.dsize = rec.data_len;
771			count++;
772
773			if (fn && fn(tdb, key, dbuf, state) != 0) {
774				/* they want us to stop traversing */
775				free(data);
776				tdb_unlock(tdb, BUCKET(h));
777				return count;
778			}
779
780			/* a miss - drat */
781			free(data);
782
783			/* move to the next record */
784			rec_ptr = rec.next;
785		}
786		tdb_unlock(tdb, BUCKET(h));
787	}
788
789	/* return the number traversed */
790	return count;
791
792 fail:
793	tdb_unlock(tdb, BUCKET(h));
794	return -1;
795}
796
797
798/* find the first entry in the database and return its key */
799TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
800{
801	tdb_off offset, rec_ptr;
802	struct list_struct rec;
803	unsigned hash;
804	TDB_DATA ret;
805
806        if (tdb == NULL) {
807#ifdef TDB_DEBUG
808            printf("tdb_firstkey() called with null context\n");
809#endif
810            return null_data;
811        }
812
813	/* look for a non-empty hash chain */
814	for (hash = 0, rec_ptr = 0;
815	     hash < tdb->header.hash_size;
816	     hash++) {
817		/* find the top of the hash chain */
818		offset = tdb_hash_top(tdb, hash);
819
820		tdb_lock(tdb, BUCKET(hash));
821
822		/* read in the hash top */
823		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
824			goto fail;
825		}
826
827		if (rec_ptr) break;
828
829		tdb_unlock(tdb, BUCKET(hash));
830	}
831
832	if (rec_ptr == 0) return null_data;
833
834	/* we've found a non-empty chain, now read the record */
835	if (rec_read(tdb, rec_ptr, &rec) == -1) {
836		goto fail;
837	}
838
839	/* allocate and read the key space */
840	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
841	ret.dsize = rec.key_len;
842	tdb_unlock(tdb, BUCKET(hash));
843	return ret;
844
845 fail:
846	tdb_unlock(tdb, BUCKET(hash));
847	return null_data;
848}
849
850/* find the next entry in the database, returning its key */
851TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key)
852{
853	unsigned hash, hbucket;
854	tdb_off rec_ptr, offset;
855	struct list_struct rec;
856	TDB_DATA ret;
857
858        if (tdb == NULL) {
859#ifdef TDB_DEBUG
860            printf("tdb_nextkey() called with null context\n");
861#endif
862            return null_data;
863        }
864
865	/* find which hash bucket it is in */
866	hash = tdb_hash(&key);
867	hbucket = BUCKET(hash);
868
869	tdb_lock(tdb, hbucket);
870	rec_ptr = tdb_find(tdb, key, hash, &rec);
871	if (rec_ptr) {
872		/* we want the next record after this one */
873		rec_ptr = rec.next;
874	}
875
876	/* not found or last in hash: look for next non-empty hash chain */
877	while (rec_ptr == 0) {
878		tdb_unlock(tdb, hbucket);
879
880		if (++hbucket >= tdb->header.hash_size - 1)
881			return null_data;
882
883		offset = tdb_hash_top(tdb, hbucket);
884		tdb_lock(tdb, hbucket);
885		/* read in the hash top */
886		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
887			tdb_unlock(tdb, hbucket);
888			return null_data;
889		}
890	}
891
892	/* Read the record. */
893	if (rec_read(tdb, rec_ptr, &rec) == -1) {
894		tdb_unlock(tdb, hbucket);
895		return null_data;
896	}
897	/* allocate and read the key */
898	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
899	ret.dsize = rec.key_len;
900	tdb_unlock(tdb, hbucket);
901
902	return ret;
903}
904
905/* delete an entry in the database given a key */
906int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
907{
908	unsigned hash;
909	tdb_off offset, rec_ptr, last_ptr;
910	struct list_struct rec, lastrec;
911	char *data = NULL;
912
913        if (tdb == NULL) {
914#ifdef TDB_DEBUG
915            printf("tdb_delete() called with null context\n");
916#endif
917            return -1;
918        }
919
920	/* find which hash bucket it is in */
921	hash = tdb_hash(&key);
922
923	tdb_lock(tdb, BUCKET(hash));
924
925	/* find the top of the hash chain */
926	offset = tdb_hash_top(tdb, hash);
927
928	/* read in the hash top */
929	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
930		goto fail;
931	}
932
933	last_ptr = 0;
934
935	/* keep looking until we find the right record */
936	while (rec_ptr) {
937		if (rec_read(tdb, rec_ptr, &rec) == -1) {
938			goto fail;
939		}
940
941		if (hash == rec.full_hash && key.dsize == rec.key_len) {
942			/* a very likely hit - read the record and full key */
943			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
944					     rec.key_len);
945			if (!data) {
946				goto fail;
947			}
948
949			if (memcmp(key.dptr, data, key.dsize) == 0) {
950				/* a definite match - delete it */
951				if (last_ptr == 0) {
952					offset = tdb_hash_top(tdb, hash);
953					if (ofs_write(tdb, offset, &rec.next) == -1) {
954						goto fail;
955					}
956				} else {
957					lastrec.next = rec.next;
958					if (rec_write(tdb, last_ptr, &lastrec) == -1) {
959						goto fail;
960					}
961				}
962				tdb_unlock(tdb, BUCKET(hash));
963				tdb_lock(tdb, -1);
964				/* and recover the space */
965				offset = FREELIST_TOP;
966				if (ofs_read(tdb, offset, &rec.next) == -1) {
967					goto fail2;
968				}
969				rec.magic = TDB_FREE_MAGIC;
970				if (rec_write(tdb, rec_ptr, &rec) == -1) {
971					goto fail2;
972				}
973				if (ofs_write(tdb, offset, &rec_ptr) == -1) {
974					goto fail2;
975				}
976
977				/* yipee - all done */
978				free(data);
979				tdb_unlock(tdb, -1);
980				return 0;
981			}
982
983			/* a miss - drat */
984			free(data);
985			data = NULL;
986		}
987
988		/* move to the next record */
989		last_ptr = rec_ptr;
990		lastrec = rec;
991		rec_ptr = rec.next;
992	}
993
994 fail:
995	if (data) free(data);
996	tdb_unlock(tdb, BUCKET(hash));
997	return -1;
998
999 fail2:
1000	if (data) free(data);
1001	tdb_unlock(tdb, -1);
1002	return -1;
1003}
1004
1005
1006/* store an element in the database, replacing any existing element
1007   with the same key
1008
1009   return 0 on success, -1 on failure
1010*/
1011int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1012{
1013	struct list_struct rec;
1014	unsigned hash;
1015	tdb_off rec_ptr, offset;
1016	char *p = NULL;
1017
1018        if (tdb == NULL) {
1019#ifdef TDB_DEBUG
1020            printf("tdb_store() called with null context\n");
1021#endif
1022            return -1;
1023        }
1024
1025	/* find which hash bucket it is in */
1026	hash = tdb_hash(&key);
1027
1028	/* check for it existing */
1029	if (flag == TDB_INSERT && tdb_exists(tdb, key)) {
1030		tdb->ecode = TDB_ERR_EXISTS;
1031		return -1;
1032	}
1033
1034	/* first try in-place update */
1035	if (flag != TDB_INSERT && tdb_update(tdb, key, dbuf) == 0) {
1036		return 0;
1037	}
1038
1039	rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize);
1040	if (rec_ptr == 0) {
1041		return -1;
1042	}
1043
1044	tdb_lock(tdb, BUCKET(hash));
1045
1046	/* delete any existing record - if it doesn't exist we don't care */
1047	if (flag != TDB_INSERT) {
1048		tdb_delete(tdb, key);
1049	}
1050
1051	/* read the newly created record */
1052	if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
1053		goto fail;
1054	}
1055
1056	if (rec.magic != TDB_FREE_MAGIC) goto fail;
1057
1058	/* find the top of the hash chain */
1059	offset = tdb_hash_top(tdb, hash);
1060
1061	/* read in the hash top diretcly into our next pointer */
1062	if (ofs_read(tdb, offset, &rec.next) == -1) {
1063		goto fail;
1064	}
1065
1066	rec.key_len = key.dsize;
1067	rec.data_len = dbuf.dsize;
1068	rec.full_hash = hash;
1069	rec.magic = TDB_MAGIC;
1070
1071	p = (char *)malloc(sizeof(rec) + key.dsize + dbuf.dsize);
1072	if (!p) {
1073		tdb->ecode = TDB_ERR_OOM;
1074		goto fail;
1075	}
1076
1077	memcpy(p, &rec, sizeof(rec));
1078	memcpy(p+sizeof(rec), key.dptr, key.dsize);
1079	memcpy(p+sizeof(rec)+key.dsize, dbuf.dptr, dbuf.dsize);
1080
1081	if (tdb_write(tdb, rec_ptr, p, sizeof(rec)+key.dsize+dbuf.dsize) == -1)
1082		goto fail;
1083
1084	free(p);
1085	p = NULL;
1086
1087	/* and point the top of the hash chain at it */
1088	if (ofs_write(tdb, offset, &rec_ptr) == -1) goto fail;
1089
1090	tdb_unlock(tdb, BUCKET(hash));
1091	return 0;
1092
1093 fail:
1094#if TDB_DEBUG
1095	printf("store failed for hash 0x%08x in bucket %u\n", hash, BUCKET(hash));
1096#endif
1097	if (p) free(p);
1098	tdb_unlock(tdb, BUCKET(hash));
1099	return -1;
1100}
1101
1102
1103/* open the database, creating it if necessary
1104
1105   The open_flags and mode are passed straight to the open call on the database
1106   file. A flags value of O_WRONLY is invalid
1107
1108   The hash size is advisory, use zero for a default value.
1109
1110   return is NULL on error
1111*/
1112TDB_CONTEXT *tdb_open(char *name, int hash_size, int tdb_flags,
1113		      int open_flags, mode_t mode)
1114{
1115	TDB_CONTEXT tdb, *ret;
1116	struct stat st;
1117
1118	memset(&tdb, 0, sizeof(tdb));
1119
1120	tdb.fd = -1;
1121	tdb.name = NULL;
1122	tdb.map_ptr = NULL;
1123
1124	if ((open_flags & O_ACCMODE) == O_WRONLY) {
1125		goto fail;
1126	}
1127
1128	if (hash_size == 0) hash_size = DEFAULT_HASH_SIZE;
1129
1130	tdb.read_only = ((open_flags & O_ACCMODE) == O_RDONLY);
1131
1132        if (name != NULL) {
1133            tdb.fd = open(name, open_flags, mode);
1134            if (tdb.fd == -1) {
1135		goto fail;
1136            }
1137	    (void) fcntl(tdb.fd, F_SETFD, FD_CLOEXEC);
1138        }
1139
1140	/* ensure there is only one process initialising at once */
1141	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_SET, F_WRLCK, F_SETLKW);
1142
1143	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1144		/* we need to zero the database if we are the only
1145		   one with it open */
1146		if (tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_WRLCK, F_SETLK) == 0) {
1147			ftruncate(tdb.fd, 0);
1148			tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLK);
1149		}
1150	}
1151
1152	/* leave this lock in place */
1153	tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_RDLCK, F_SETLKW);
1154
1155	if (read(tdb.fd, &tdb.header, sizeof(tdb.header)) != sizeof(tdb.header) ||
1156	    strcmp(tdb.header.magic_food, TDB_MAGIC_FOOD) != 0 ||
1157	    tdb.header.version != TDB_VERSION) {
1158		/* its not a valid database - possibly initialise it */
1159		if (!(open_flags & O_CREAT)) {
1160			goto fail;
1161		}
1162		if (tdb_new_database(&tdb, hash_size) == -1) goto fail;
1163
1164		lseek(tdb.fd, 0, SEEK_SET);
1165		if (tdb.fd != -1 && read(tdb.fd, &tdb.header,
1166                                         sizeof(tdb.header)) !=
1167                                         sizeof(tdb.header))
1168                    goto fail;
1169	}
1170
1171        if (tdb.fd != -1) {
1172            fstat(tdb.fd, &st);
1173
1174            /* map the database and fill in the return structure */
1175            tdb.name = (char *)strdup(name);
1176            tdb.map_size = st.st_size;
1177        }
1178
1179        tdb.locked = (int *)calloc(tdb.header.hash_size+1,
1180                                   sizeof(tdb.locked[0]));
1181        if (!tdb.locked) {
1182            goto fail;
1183        }
1184
1185#if HAVE_MMAP
1186        if (tdb.fd != -1) {
1187            tdb.map_ptr = (void *)mmap(NULL, st.st_size,
1188                                       tdb.read_only? PROT_READ : PROT_READ|PROT_WRITE,
1189                                       MAP_SHARED | MAP_FILE, tdb.fd, 0);
1190	    if (tdb.map_ptr == MAP_FAILED) {
1191		tdb.map_ptr = NULL;
1192	    }
1193        }
1194#endif
1195
1196	ret = (TDB_CONTEXT *)malloc(sizeof(tdb));
1197	if (!ret) goto fail;
1198
1199	*ret = tdb;
1200
1201#if TDB_DEBUG
1202	printf("mapped database of hash_size %u map_size=%u\n",
1203	       hash_size, tdb.map_size);
1204#endif
1205
1206	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLKW);
1207	return ret;
1208
1209 fail:
1210        if (tdb.name) free(tdb.name);
1211	if (tdb.fd != -1) close(tdb.fd);
1212	if (tdb.map_ptr) munmap(tdb.map_ptr, tdb.map_size);
1213
1214	return NULL;
1215}
1216
1217/* close a database */
1218int tdb_close(TDB_CONTEXT *tdb)
1219{
1220	if (!tdb) return -1;
1221
1222	if (tdb->name) free(tdb->name);
1223	if (tdb->fd != -1) close(tdb->fd);
1224	if (tdb->locked) free(tdb->locked);
1225
1226	if (tdb->map_ptr) {
1227            if (tdb->fd != -1) {
1228                munmap(tdb->map_ptr, tdb->map_size);
1229            } else {
1230                free(tdb->map_ptr);
1231            }
1232        }
1233
1234	memset(tdb, 0, sizeof(*tdb));
1235	free(tdb);
1236
1237	return 0;
1238}
1239
1240/* lock the database. If we already have it locked then don't do anything */
1241int tdb_writelock(TDB_CONTEXT *tdb)
1242{
1243        if (tdb == NULL) {
1244#ifdef TDB_DEBUG
1245            printf("tdb_writelock() called with null context\n");
1246#endif
1247            return -1;
1248        }
1249
1250	return tdb_lock(tdb, -1);
1251}
1252
1253/* unlock the database. */
1254int tdb_writeunlock(TDB_CONTEXT *tdb)
1255{
1256        if (tdb == NULL) {
1257#ifdef TDB_DEBUG
1258            printf("tdb_writeunlock() called with null context\n");
1259#endif
1260            return -1;
1261        }
1262
1263	return tdb_unlock(tdb, -1);
1264}
1265
1266/* lock one hash chain. This is meant to be used to reduce locking
1267   contention - it cannot guarantee how many records will be locked */
1268int tdb_lockchain(TDB_CONTEXT *tdb, TDB_DATA key)
1269{
1270        if (tdb == NULL) {
1271#ifdef TDB_DEBUG
1272            printf("tdb_lockchain() called with null context\n");
1273#endif
1274            return -1;
1275        }
1276
1277	return tdb_lock(tdb, BUCKET(tdb_hash(&key)));
1278}
1279
1280
1281/* unlock one hash chain */
1282int tdb_unlockchain(TDB_CONTEXT *tdb, TDB_DATA key)
1283{
1284        if (tdb == NULL) {
1285#ifdef TDB_DEBUG
1286            printf("tdb_unlockchain() called with null context\n");
1287#endif
1288            return -1;
1289        }
1290
1291	return tdb_unlock(tdb, BUCKET(tdb_hash(&key)));
1292}
1293