1/*	NetBSD: tdb.c,v 1.7 2013/10/19 17:16:38 christos Exp 	*/
2
3/*
4 * Database functions
5 * Copyright (C) Andrew Tridgell 1999
6 *
7 * Redistribution and use in source and binary forms are permitted
8 * provided that the above copyright notice and this paragraph are
9 * duplicated in all such forms AND provided that this software or
10 * any derived work is only used as part of the PPP daemon (pppd)
11 * and related utilities.
12 * The name of the author may not be used to endorse or promote products
13 * derived from this software without specific prior written permission.
14 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
16 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
17 *
18 * Note: this software is also available under the Gnu Public License
19 * version 2 or later.
20 */
21
22#include <sys/cdefs.h>
23#ifndef lint
24__RCSID("NetBSD: tdb.c,v 1.7 2013/10/19 17:16:38 christos Exp ");
25#endif
26
27#include <stdlib.h>
28#include <stdio.h>
29#include <fcntl.h>
30#include <unistd.h>
31#include <string.h>
32#include <errno.h>
33#include <sys/mman.h>
34#include <sys/stat.h>
35#include "tdb.h"
36
37#define TDB_VERSION (0x26011967 + 1)
38#define TDB_MAGIC (0x26011999U)
39#define TDB_FREE_MAGIC (~TDB_MAGIC)
40#define TDB_ALIGN 4
41#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGN)
42#define DEFAULT_HASH_SIZE 128
43#define TDB_PAGE_SIZE 0x2000
44#define TDB_LEN_MULTIPLIER 10
45#define FREELIST_TOP (sizeof(struct tdb_header))
46
47#define LOCK_SET 1
48#define LOCK_CLEAR 0
49
50/* lock offsets */
51#define GLOBAL_LOCK 0
52#define ACTIVE_LOCK 4
53#define LIST_LOCK_BASE 1024
54
55#define BUCKET(hash) ((hash) % tdb->header.hash_size)
56
57#ifndef MAP_FILE
58#define MAP_FILE 0
59#endif
60
61/* the body of the database is made of one list_struct for the free space
62   plus a separate data list for each hash value */
63struct list_struct {
64	tdb_len rec_len; /* total byte length of record */
65	tdb_off next; /* offset of the next record in the list */
66	tdb_len key_len; /* byte length of key */
67	tdb_len data_len; /* byte length of data */
68	unsigned full_hash; /* the full 32 bit hash of the key */
69	unsigned magic;   /* try to catch errors */
70	/*
71	   the following union is implied
72	   union {
73              char record[rec_len];
74	      struct {
75	        char key[key_len];
76		char data[data_len];
77	      }
78           }
79	*/
80};
81
82/* a null data record - useful for error returns */
83static TDB_DATA null_data;
84
85static int tdb_update __P((TDB_CONTEXT *, TDB_DATA, TDB_DATA));
86#ifdef notyet
87static int tdb_lockchain __P((TDB_CONTEXT *, TDB_DATA));
88static int tdb_unlockchain __P((TDB_CONTEXT *, TDB_DATA));
89#endif
90
91/* a byte range locking function - return 0 on success
92   this functions locks/unlocks 1 byte at the specified offset */
93static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
94		      int set, int rw_type, int lck_type)
95{
96#if NOLOCK
97	return 0;
98#else
99	struct flock fl;
100
101        if (tdb->fd == -1) return 0;   /* for in memory tdb */
102
103	if (tdb->read_only) return -1;
104
105	fl.l_type = set==LOCK_SET?rw_type:F_UNLCK;
106	fl.l_whence = SEEK_SET;
107	fl.l_start = offset;
108	fl.l_len = 1;
109	fl.l_pid = 0;
110
111	if (fcntl(tdb->fd, lck_type, &fl) != 0) {
112#if TDB_DEBUG
113		if (lck_type == F_SETLKW) {
114			printf("lock %d failed at %d (%s)\n",
115			       set, offset, strerror(errno));
116		}
117#endif
118		tdb->ecode = TDB_ERR_LOCK;
119		return -1;
120	}
121	return 0;
122#endif
123}
124
125/* lock a list in the database. list -1 is the alloc list */
126static int tdb_lock(TDB_CONTEXT *tdb, int list)
127{
128	if (list < -1 || list >= (int)tdb->header.hash_size) {
129#if TDB_DEBUG
130		printf("bad list %d\n", list);
131#endif
132		return -1;
133	}
134	if (tdb->locked[list+1] == 0) {
135		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_SET,
136			       F_WRLCK, F_SETLKW) != 0) {
137			return -1;
138		}
139	}
140	tdb->locked[list+1]++;
141	return 0;
142}
143
144/* unlock the database. */
145static int tdb_unlock(TDB_CONTEXT *tdb, int list)
146{
147	if (list < -1 || list >= (int)tdb->header.hash_size) {
148#if TDB_DEBUG
149		printf("bad unlock list %d\n", list);
150#endif
151		return -1;
152	}
153
154	if (tdb->locked[list+1] == 0) {
155#if TDB_DEBUG
156		printf("not locked %d\n", list);
157#endif
158		tdb->ecode = TDB_ERR_LOCK;
159		return -1;
160	}
161	if (tdb->locked[list+1] == 1) {
162		if (tdb_brlock(tdb, LIST_LOCK_BASE + 4*list, LOCK_CLEAR,
163			       F_WRLCK, F_SETLKW) != 0) {
164			return -1;
165		}
166	}
167	tdb->locked[list+1]--;
168	return 0;
169}
170
171/* the hash algorithm - turn a key into an integer
172   This is based on the hash agorithm from gdbm */
173static unsigned tdb_hash(TDB_DATA *key)
174{
175	unsigned value;	/* Used to compute the hash value.  */
176	unsigned   i;	/* Used to cycle through random values. */
177
178	/* Set the initial value from the key size. */
179	value = 0x238F13AF * key->dsize;
180	for (i=0; i < key->dsize; i++) {
181		value = (value + (key->dptr[i] << (i*5 % 24)));
182	}
183
184	value = (1103515243 * value + 12345);
185
186	return value;
187}
188
189/* find the top of the hash chain for an open database */
190static tdb_off tdb_hash_top(TDB_CONTEXT *tdb, unsigned hash)
191{
192	tdb_off ret;
193	hash = BUCKET(hash);
194	ret = FREELIST_TOP + (hash+1)*sizeof(tdb_off);
195	return ret;
196}
197
198
199/* check for an out of bounds access - if it is out of bounds then
200   see if the database has been expanded by someone else and expand
201   if necessary */
202static int tdb_oob(TDB_CONTEXT *tdb, tdb_off offset)
203{
204	struct stat st;
205	if ((offset <= tdb->map_size) || (tdb->fd == -1)) return 0;
206
207	fstat(tdb->fd, &st);
208	if (st.st_size <= (ssize_t)offset) {
209		tdb->ecode = TDB_ERR_IO;
210		return -1;
211	}
212
213#if HAVE_MMAP
214	if (tdb->map_ptr) {
215		munmap(tdb->map_ptr, tdb->map_size);
216		tdb->map_ptr = NULL;
217	}
218#endif
219
220	tdb->map_size = st.st_size;
221#if HAVE_MMAP
222	tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
223				    tdb->read_only?PROT_READ:PROT_READ|PROT_WRITE,
224				    MAP_SHARED | MAP_FILE, tdb->fd, 0);
225	if (tdb->map_ptr == MAP_FAILED)
226		tdb->map_ptr = NULL;
227#endif
228	return 0;
229}
230
231
232/* write a lump of data at a specified offset */
233static int tdb_write(TDB_CONTEXT *tdb, tdb_off offset, const char *buf, tdb_len len)
234{
235	if (tdb_oob(tdb, offset + len) != 0) {
236		/* oops - trying to write beyond the end of the database! */
237		return -1;
238	}
239
240	if (tdb->map_ptr) {
241		memcpy(offset + (char *)tdb->map_ptr, buf, len);
242	} else {
243		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
244		    write(tdb->fd, buf, len) != (ssize_t)len) {
245			tdb->ecode = TDB_ERR_IO;
246			return -1;
247		}
248	}
249	return 0;
250}
251
252/* read a lump of data at a specified offset */
253static int tdb_read(TDB_CONTEXT *tdb, tdb_off offset, char *buf, tdb_len len)
254{
255	if (tdb_oob(tdb, offset + len) != 0) {
256		/* oops - trying to read beyond the end of the database! */
257		return -1;
258	}
259
260	if (tdb->map_ptr) {
261		memcpy(buf, offset + (char *)tdb->map_ptr, len);
262	} else {
263		if (lseek(tdb->fd, offset, SEEK_SET) != offset ||
264		    read(tdb->fd, buf, len) != (ssize_t)len) {
265			tdb->ecode = TDB_ERR_IO;
266			return -1;
267		}
268	}
269	return 0;
270}
271
272
273/* read a lump of data, allocating the space for it */
274static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
275{
276	char *buf;
277
278	buf = (char *)malloc(len);
279
280	if (!buf) {
281		tdb->ecode = TDB_ERR_OOM;
282		return NULL;
283	}
284
285	if (tdb_read(tdb, offset, buf, len) == -1) {
286		free(buf);
287		return NULL;
288	}
289
290	return buf;
291}
292
293/* convenience routine for writing a record */
294static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
295{
296	return tdb_write(tdb, offset, (char *)rec, sizeof(*rec));
297}
298
299/* convenience routine for writing a tdb_off */
300static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
301{
302	return tdb_write(tdb, offset, (char *)d, sizeof(*d));
303}
304
305/* read a tdb_off from the store */
306static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
307{
308	return tdb_read(tdb, offset, (char *)d, sizeof(*d));
309}
310
311/* read a record and check for simple errors */
312static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
313{
314	if (tdb_read(tdb, offset, (char *)rec, sizeof(*rec)) == -1) return -1;
315	if (rec->magic != TDB_MAGIC) {
316#if TDB_DEBUG
317		printf("bad magic 0x%08x at offset %d\n",
318		       rec->magic, offset);
319#endif
320		tdb->ecode = TDB_ERR_CORRUPT;
321		return -1;
322	}
323	if (tdb_oob(tdb, rec->next) != 0) {
324		return -1;
325	}
326	return 0;
327}
328
329/* expand the database at least length bytes by expanding the
330   underlying file and doing the mmap again if necessary */
331static int tdb_expand(TDB_CONTEXT *tdb, tdb_off length)
332{
333	struct list_struct rec;
334	tdb_off offset, ptr;
335	char b = 0;
336
337	tdb_lock(tdb,-1);
338
339	/* make sure we know about any previous expansions by another
340           process */
341	tdb_oob(tdb,tdb->map_size + 1);
342
343	/* always make room for at least 10 more records */
344	length *= TDB_LEN_MULTIPLIER;
345
346	/* and round the database up to a multiple of TDB_PAGE_SIZE */
347	length = ((tdb->map_size + length + TDB_PAGE_SIZE) & ~(TDB_PAGE_SIZE - 1)) - tdb->map_size;
348
349	/* expand the file itself */
350        if (tdb->fd != -1) {
351            lseek(tdb->fd, tdb->map_size + length - 1, SEEK_SET);
352            if (write(tdb->fd, &b, 1) != 1) goto fail;
353        }
354
355	/* form a new freelist record */
356	offset = FREELIST_TOP;
357	rec.rec_len = length - sizeof(rec);
358	rec.magic = TDB_FREE_MAGIC;
359	if (ofs_read(tdb, offset, &rec.next) == -1) {
360		goto fail;
361	}
362
363#if HAVE_MMAP
364	if (tdb->fd != -1 && tdb->map_ptr) {
365		munmap(tdb->map_ptr, tdb->map_size);
366		tdb->map_ptr = NULL;
367	}
368#endif
369
370	tdb->map_size += length;
371
372        if (tdb->fd == -1) {
373		void *n;
374		n = realloc(tdb->map_ptr, tdb->map_size);
375		if (!n)
376			goto fail;
377		tdb->map_ptr = n;
378        }
379
380	/* write it out */
381	if (rec_write(tdb, tdb->map_size - length, &rec) == -1) {
382		goto fail;
383	}
384
385	/* link it into the free list */
386	ptr = tdb->map_size - length;
387	if (ofs_write(tdb, offset, &ptr) == -1) goto fail;
388
389#if HAVE_MMAP
390        if (tdb->fd != -1) {
391            tdb->map_ptr = (void *)mmap(NULL, tdb->map_size,
392                                        PROT_READ|PROT_WRITE,
393                                        MAP_SHARED | MAP_FILE, tdb->fd, 0);
394	    if (tdb->map_ptr == MAP_FAILED)
395		    tdb->map_ptr = NULL;
396        }
397#endif
398
399	tdb_unlock(tdb, -1);
400	return 0;
401
402 fail:
403	tdb_unlock(tdb,-1);
404	return -1;
405}
406
407/* allocate some space from the free list. The offset returned points
408   to a unconnected list_struct within the database with room for at
409   least length bytes of total data
410
411   0 is returned if the space could not be allocated
412 */
413static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length)
414{
415	tdb_off offset, rec_ptr, last_ptr;
416	struct list_struct rec, lastrec, newrec;
417
418	tdb_lock(tdb, -1);
419
420 again:
421	last_ptr = 0;
422	offset = FREELIST_TOP;
423
424	/* read in the freelist top */
425	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
426		goto fail;
427	}
428
429	/* keep looking until we find a freelist record that is big
430           enough */
431	while (rec_ptr) {
432		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
433			goto fail;
434		}
435
436		if (rec.magic != TDB_FREE_MAGIC) {
437#if TDB_DEBUG
438			printf("bad magic 0x%08x in free list\n", rec.magic);
439#endif
440			goto fail;
441		}
442
443		if (rec.rec_len >= length) {
444			/* found it - now possibly split it up  */
445			if (rec.rec_len > length + MIN_REC_SIZE) {
446				length = (length + TDB_ALIGN) & ~(TDB_ALIGN-1);
447
448				newrec.rec_len = rec.rec_len - (sizeof(rec) + length);
449				newrec.next = rec.next;
450				newrec.magic = TDB_FREE_MAGIC;
451
452				rec.rec_len = length;
453				rec.next = rec_ptr + sizeof(rec) + rec.rec_len;
454
455				if (rec_write(tdb, rec.next, &newrec) == -1) {
456					goto fail;
457				}
458
459				if (rec_write(tdb, rec_ptr, &rec) == -1) {
460					goto fail;
461				}
462			}
463
464			/* remove it from the list */
465			if (last_ptr == 0) {
466				offset = FREELIST_TOP;
467
468				if (ofs_write(tdb, offset, &rec.next) == -1) {
469					goto fail;
470				}
471			} else {
472				lastrec.next = rec.next;
473				if (rec_write(tdb, last_ptr, &lastrec) == -1) {
474					goto fail;
475				}
476			}
477
478			/* all done - return the new record offset */
479			tdb_unlock(tdb, -1);
480			return rec_ptr;
481		}
482
483		/* move to the next record */
484		lastrec = rec;
485		last_ptr = rec_ptr;
486		rec_ptr = rec.next;
487	}
488
489	/* we didn't find enough space. See if we can expand the
490	   database and if we can then try again */
491	if (tdb_expand(tdb, length + sizeof(rec)) == 0) goto again;
492
493 fail:
494#if TDB_DEBUG
495	printf("tdb_allocate failed for size %u\n", length);
496#endif
497	tdb_unlock(tdb, -1);
498	return 0;
499}
500
501/* initialise a new database with a specified hash size */
502static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
503{
504	struct tdb_header header;
505	int i, size = 0;
506	tdb_off buf[16];
507
508        /* create the header */
509        memset(&header, 0, sizeof(header));
510        memcpy(header.magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
511        header.version = TDB_VERSION;
512        header.hash_size = hash_size;
513        lseek(tdb->fd, 0, SEEK_SET);
514        ftruncate(tdb->fd, 0);
515
516        if (tdb->fd != -1 && write(tdb->fd, &header, sizeof(header)) !=
517            sizeof(header)) {
518            tdb->ecode = TDB_ERR_IO;
519            return -1;
520        } else size += sizeof(header);
521
522        /* the freelist and hash pointers */
523        memset(buf, 0, sizeof(buf));
524
525        for (i=0;(hash_size+1)-i >= 16; i += 16) {
526            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(buf)) !=
527                sizeof(buf)) {
528                tdb->ecode = TDB_ERR_IO;
529                return -1;
530            } else size += sizeof(buf);
531        }
532
533        for (;i<hash_size+1; i++) {
534            if (tdb->fd != -1 && write(tdb->fd, buf, sizeof(tdb_off)) !=
535                sizeof(tdb_off)) {
536                tdb->ecode = TDB_ERR_IO;
537                return -1;
538            } else size += sizeof(tdb_off);
539        }
540
541        if (tdb->fd == -1) {
542            tdb->map_ptr = calloc(size, 1);
543            tdb->map_size = size;
544            if (tdb->map_ptr == NULL) {
545                tdb->ecode = TDB_ERR_IO;
546                return -1;
547            }
548            memcpy(&tdb->header, &header, sizeof(header));
549        }
550
551#if TDB_DEBUG
552	printf("initialised database of hash_size %u\n",
553	       hash_size);
554#endif
555	return 0;
556}
557
558/* Returns 0 on fail.  On success, return offset of record, and fills
559   in rec */
560static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, unsigned int hash,
561			struct list_struct *rec)
562{
563	tdb_off offset, rec_ptr;
564
565	/* find the top of the hash chain */
566	offset = tdb_hash_top(tdb, hash);
567
568	/* read in the hash top */
569	if (ofs_read(tdb, offset, &rec_ptr) == -1)
570		return 0;
571
572	/* keep looking until we find the right record */
573	while (rec_ptr) {
574		if (rec_read(tdb, rec_ptr, rec) == -1)
575			return 0;
576
577		if (hash == rec->full_hash && key.dsize == rec->key_len) {
578			char *k;
579			/* a very likely hit - read the key */
580			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*rec),
581					   rec->key_len);
582
583			if (!k)
584				return 0;
585
586			if (memcmp(key.dptr, k, key.dsize) == 0) {
587				free(k);
588				return rec_ptr;
589			}
590			free(k);
591		}
592
593		/* move to the next record */
594		rec_ptr = rec->next;
595	}
596	return 0;
597}
598
599/*
600   return an error string for the last tdb error
601*/
602char *tdb_errorstr(TDB_CONTEXT *tdb)
603{
604	int i;
605	static struct {
606		enum TDB_ERROR ecode;
607		char *estring;
608	} emap[] = {
609		{TDB_SUCCESS, "Success"},
610		{TDB_ERR_CORRUPT, "Corrupt database"},
611		{TDB_ERR_IO, "IO Error"},
612		{TDB_ERR_LOCK, "Locking error"},
613		{TDB_ERR_OOM, "Out of memory"},
614		{TDB_ERR_EXISTS, "Record exists"},
615		{-1, NULL}};
616        if (tdb != NULL) {
617            for (i=0;emap[i].estring;i++) {
618		if (tdb->ecode == emap[i].ecode) return emap[i].estring;
619            }
620        } else {
621            return "Invalid tdb context";
622        }
623	return "Invalid error code";
624}
625
626
627/* update an entry in place - this only works if the new data size
628   is <= the old data size and the key exists.
629   on failure return -1
630*/
631static int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
632{
633	unsigned hash;
634	struct list_struct rec;
635	tdb_off rec_ptr;
636	int ret = -1;
637
638        if (tdb == NULL) {
639#ifdef TDB_DEBUG
640            printf("tdb_update() called with null context\n");
641#endif
642            return -1;
643        }
644
645	/* find which hash bucket it is in */
646	hash = tdb_hash(&key);
647
648	tdb_lock(tdb, BUCKET(hash));
649	rec_ptr = tdb_find(tdb, key, hash, &rec);
650
651	if (!rec_ptr)
652		goto out;
653
654	/* must be long enough */
655	if (rec.rec_len < key.dsize + dbuf.dsize)
656		goto out;
657
658	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
659		      dbuf.dptr, dbuf.dsize) == -1)
660		goto out;
661
662	if (dbuf.dsize != rec.data_len) {
663		/* update size */
664		rec.data_len = dbuf.dsize;
665		ret = rec_write(tdb, rec_ptr, &rec);
666	} else
667		ret = 0;
668
669 out:
670	tdb_unlock(tdb, BUCKET(hash));
671	return ret;
672}
673
674/* find an entry in the database given a key */
675TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
676{
677	unsigned hash;
678	tdb_off rec_ptr;
679	struct list_struct rec;
680	TDB_DATA ret = null_data;
681
682        if (tdb == NULL) {
683#ifdef TDB_DEBUG
684            printf("tdb_fetch() called with null context\n");
685#endif
686            return null_data;
687        }
688
689	/* find which hash bucket it is in */
690	hash = tdb_hash(&key);
691
692	tdb_lock(tdb, BUCKET(hash));
693	rec_ptr = tdb_find(tdb, key, hash, &rec);
694
695	if (rec_ptr) {
696		ret.dptr = tdb_alloc_read(tdb,
697					  rec_ptr + sizeof(rec) + rec.key_len,
698					  rec.data_len);
699		ret.dsize = rec.data_len;
700	}
701
702	tdb_unlock(tdb, BUCKET(hash));
703	return ret;
704}
705
706/* check if an entry in the database exists
707
708   note that 1 is returned if the key is found and 0 is returned if not found
709   this doesn't match the conventions in the rest of this module, but is
710   compatible with gdbm
711*/
712int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
713{
714	unsigned hash;
715	tdb_off rec_ptr;
716	struct list_struct rec;
717
718        if (tdb == NULL) {
719#ifdef TDB_DEBUG
720            printf("tdb_exists() called with null context\n");
721#endif
722            return 0;
723        }
724
725	/* find which hash bucket it is in */
726	hash = tdb_hash(&key);
727
728	tdb_lock(tdb, BUCKET(hash));
729	rec_ptr = tdb_find(tdb, key, hash, &rec);
730	tdb_unlock(tdb, BUCKET(hash));
731
732	return rec_ptr != 0;
733}
734
735/* traverse the entire database - calling fn(tdb, key, data) on each element.
736   return -1 on error or the record count traversed
737   if fn is NULL then it is not called
738   a non-zero return value from fn() indicates that the traversal should stop
739  */
740int tdb_traverse(TDB_CONTEXT *tdb, int (*fn)(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, void* state), void* state)
741{
742	int count = 0;
743	unsigned h;
744	tdb_off offset, rec_ptr;
745	struct list_struct rec;
746	char *data;
747	TDB_DATA key, dbuf;
748
749        if (tdb == NULL) {
750#ifdef TDB_DEBUG
751            printf("tdb_traverse() called with null context\n");
752#endif
753            return -1;
754        }
755
756	/* loop over all hash chains */
757	for (h = 0; h < tdb->header.hash_size; h++) {
758		tdb_lock(tdb, BUCKET(h));
759
760		/* read in the hash top */
761		offset = tdb_hash_top(tdb, h);
762		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
763			goto fail;
764		}
765
766		/* traverse all records for this hash */
767		while (rec_ptr) {
768			if (rec_read(tdb, rec_ptr, &rec) == -1) {
769				goto fail;
770			}
771
772			/* now read the full record */
773			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
774					     rec.key_len + rec.data_len);
775			if (!data) {
776				goto fail;
777			}
778
779			key.dptr = data;
780			key.dsize = rec.key_len;
781			dbuf.dptr = data + rec.key_len;
782			dbuf.dsize = rec.data_len;
783			count++;
784
785			if (fn && fn(tdb, key, dbuf, state) != 0) {
786				/* they want us to stop traversing */
787				free(data);
788				tdb_unlock(tdb, BUCKET(h));
789				return count;
790			}
791
792			/* a miss - drat */
793			free(data);
794
795			/* move to the next record */
796			rec_ptr = rec.next;
797		}
798		tdb_unlock(tdb, BUCKET(h));
799	}
800
801	/* return the number traversed */
802	return count;
803
804 fail:
805	tdb_unlock(tdb, BUCKET(h));
806	return -1;
807}
808
809
810/* find the first entry in the database and return its key */
811TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
812{
813	tdb_off offset, rec_ptr;
814	struct list_struct rec;
815	unsigned hash;
816	TDB_DATA ret;
817
818        if (tdb == NULL) {
819#ifdef TDB_DEBUG
820            printf("tdb_firstkey() called with null context\n");
821#endif
822            return null_data;
823        }
824
825	/* look for a non-empty hash chain */
826	for (hash = 0, rec_ptr = 0;
827	     hash < tdb->header.hash_size;
828	     hash++) {
829		/* find the top of the hash chain */
830		offset = tdb_hash_top(tdb, hash);
831
832		tdb_lock(tdb, BUCKET(hash));
833
834		/* read in the hash top */
835		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
836			goto fail;
837		}
838
839		if (rec_ptr) break;
840
841		tdb_unlock(tdb, BUCKET(hash));
842	}
843
844	if (rec_ptr == 0) return null_data;
845
846	/* we've found a non-empty chain, now read the record */
847	if (rec_read(tdb, rec_ptr, &rec) == -1) {
848		goto fail;
849	}
850
851	/* allocate and read the key space */
852	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
853	ret.dsize = rec.key_len;
854	tdb_unlock(tdb, BUCKET(hash));
855	return ret;
856
857 fail:
858	tdb_unlock(tdb, BUCKET(hash));
859	return null_data;
860}
861
862/* find the next entry in the database, returning its key */
863TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key)
864{
865	unsigned hash, hbucket;
866	tdb_off rec_ptr, offset;
867	struct list_struct rec;
868	TDB_DATA ret;
869
870        if (tdb == NULL) {
871#ifdef TDB_DEBUG
872            printf("tdb_nextkey() called with null context\n");
873#endif
874            return null_data;
875        }
876
877	/* find which hash bucket it is in */
878	hash = tdb_hash(&key);
879	hbucket = BUCKET(hash);
880
881	tdb_lock(tdb, hbucket);
882	rec_ptr = tdb_find(tdb, key, hash, &rec);
883	if (rec_ptr) {
884		/* we want the next record after this one */
885		rec_ptr = rec.next;
886	}
887
888	/* not found or last in hash: look for next non-empty hash chain */
889	while (rec_ptr == 0) {
890		tdb_unlock(tdb, hbucket);
891
892		if (++hbucket >= tdb->header.hash_size - 1)
893			return null_data;
894
895		offset = tdb_hash_top(tdb, hbucket);
896		tdb_lock(tdb, hbucket);
897		/* read in the hash top */
898		if (ofs_read(tdb, offset, &rec_ptr) == -1) {
899			tdb_unlock(tdb, hbucket);
900			return null_data;
901		}
902	}
903
904	/* Read the record. */
905	if (rec_read(tdb, rec_ptr, &rec) == -1) {
906		tdb_unlock(tdb, hbucket);
907		return null_data;
908	}
909	/* allocate and read the key */
910	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec), rec.key_len);
911	ret.dsize = rec.key_len;
912	tdb_unlock(tdb, hbucket);
913
914	return ret;
915}
916
917/* delete an entry in the database given a key */
918int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
919{
920	unsigned hash;
921	tdb_off offset, rec_ptr, last_ptr;
922	struct list_struct rec, lastrec;
923	char *data = NULL;
924
925        if (tdb == NULL) {
926#ifdef TDB_DEBUG
927            printf("tdb_delete() called with null context\n");
928#endif
929            return -1;
930        }
931
932	/* find which hash bucket it is in */
933	hash = tdb_hash(&key);
934
935	tdb_lock(tdb, BUCKET(hash));
936
937	/* find the top of the hash chain */
938	offset = tdb_hash_top(tdb, hash);
939
940	/* read in the hash top */
941	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
942		goto fail;
943	}
944
945	last_ptr = 0;
946
947	/* keep looking until we find the right record */
948	while (rec_ptr) {
949		if (rec_read(tdb, rec_ptr, &rec) == -1) {
950			goto fail;
951		}
952
953		if (hash == rec.full_hash && key.dsize == rec.key_len) {
954			/* a very likely hit - read the record and full key */
955			data = tdb_alloc_read(tdb, rec_ptr + sizeof(rec),
956					     rec.key_len);
957			if (!data) {
958				goto fail;
959			}
960
961			if (memcmp(key.dptr, data, key.dsize) == 0) {
962				/* a definite match - delete it */
963				if (last_ptr == 0) {
964					offset = tdb_hash_top(tdb, hash);
965					if (ofs_write(tdb, offset, &rec.next) == -1) {
966						goto fail;
967					}
968				} else {
969					lastrec.next = rec.next;
970					if (rec_write(tdb, last_ptr, &lastrec) == -1) {
971						goto fail;
972					}
973				}
974				tdb_unlock(tdb, BUCKET(hash));
975				tdb_lock(tdb, -1);
976				/* and recover the space */
977				offset = FREELIST_TOP;
978				if (ofs_read(tdb, offset, &rec.next) == -1) {
979					goto fail2;
980				}
981				rec.magic = TDB_FREE_MAGIC;
982				if (rec_write(tdb, rec_ptr, &rec) == -1) {
983					goto fail2;
984				}
985				if (ofs_write(tdb, offset, &rec_ptr) == -1) {
986					goto fail2;
987				}
988
989				/* yipee - all done */
990				free(data);
991				tdb_unlock(tdb, -1);
992				return 0;
993			}
994
995			/* a miss - drat */
996			free(data);
997			data = NULL;
998		}
999
1000		/* move to the next record */
1001		last_ptr = rec_ptr;
1002		lastrec = rec;
1003		rec_ptr = rec.next;
1004	}
1005
1006 fail:
1007	if (data) free(data);
1008	tdb_unlock(tdb, BUCKET(hash));
1009	return -1;
1010
1011 fail2:
1012	if (data) free(data);
1013	tdb_unlock(tdb, -1);
1014	return -1;
1015}
1016
1017
1018/* store an element in the database, replacing any existing element
1019   with the same key
1020
1021   return 0 on success, -1 on failure
1022*/
1023int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1024{
1025	struct list_struct rec;
1026	unsigned hash;
1027	tdb_off rec_ptr, offset;
1028	char *p = NULL;
1029
1030        if (tdb == NULL) {
1031#ifdef TDB_DEBUG
1032            printf("tdb_store() called with null context\n");
1033#endif
1034            return -1;
1035        }
1036
1037	/* find which hash bucket it is in */
1038	hash = tdb_hash(&key);
1039
1040	/* check for it existing */
1041	if (flag == TDB_INSERT && tdb_exists(tdb, key)) {
1042		tdb->ecode = TDB_ERR_EXISTS;
1043		return -1;
1044	}
1045
1046	/* first try in-place update */
1047	if (flag != TDB_INSERT && tdb_update(tdb, key, dbuf) == 0) {
1048		return 0;
1049	}
1050
1051	rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize);
1052	if (rec_ptr == 0) {
1053		return -1;
1054	}
1055
1056	tdb_lock(tdb, BUCKET(hash));
1057
1058	/* delete any existing record - if it doesn't exist we don't care */
1059	if (flag != TDB_INSERT) {
1060		tdb_delete(tdb, key);
1061	}
1062
1063	/* read the newly created record */
1064	if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec)) == -1) {
1065		goto fail;
1066	}
1067
1068	if (rec.magic != TDB_FREE_MAGIC) goto fail;
1069
1070	/* find the top of the hash chain */
1071	offset = tdb_hash_top(tdb, hash);
1072
1073	/* read in the hash top diretcly into our next pointer */
1074	if (ofs_read(tdb, offset, &rec.next) == -1) {
1075		goto fail;
1076	}
1077
1078	rec.key_len = key.dsize;
1079	rec.data_len = dbuf.dsize;
1080	rec.full_hash = hash;
1081	rec.magic = TDB_MAGIC;
1082
1083	p = (char *)malloc(sizeof(rec) + key.dsize + dbuf.dsize);
1084	if (!p) {
1085		tdb->ecode = TDB_ERR_OOM;
1086		goto fail;
1087	}
1088
1089	memcpy(p, &rec, sizeof(rec));
1090	memcpy(p+sizeof(rec), key.dptr, key.dsize);
1091	memcpy(p+sizeof(rec)+key.dsize, dbuf.dptr, dbuf.dsize);
1092
1093	if (tdb_write(tdb, rec_ptr, p, sizeof(rec)+key.dsize+dbuf.dsize) == -1)
1094		goto fail;
1095
1096	free(p);
1097	p = NULL;
1098
1099	/* and point the top of the hash chain at it */
1100	if (ofs_write(tdb, offset, &rec_ptr) == -1) goto fail;
1101
1102	tdb_unlock(tdb, BUCKET(hash));
1103	return 0;
1104
1105 fail:
1106#if TDB_DEBUG
1107	printf("store failed for hash 0x%08x in bucket %u\n", hash, BUCKET(hash));
1108#endif
1109	if (p) free(p);
1110	tdb_unlock(tdb, BUCKET(hash));
1111	return -1;
1112}
1113
1114
1115/* open the database, creating it if necessary
1116
1117   The open_flags and mode are passed straight to the open call on the database
1118   file. A flags value of O_WRONLY is invalid
1119
1120   The hash size is advisory, use zero for a default value.
1121
1122   return is NULL on error
1123*/
1124TDB_CONTEXT *tdb_open(char *name, int hash_size, int tdb_flags,
1125		      int open_flags, mode_t mode)
1126{
1127	TDB_CONTEXT tdb, *ret;
1128	struct stat st;
1129
1130	memset(&tdb, 0, sizeof(tdb));
1131
1132	tdb.fd = -1;
1133	tdb.name = NULL;
1134	tdb.map_ptr = NULL;
1135
1136	if ((open_flags & O_ACCMODE) == O_WRONLY) {
1137		goto fail;
1138	}
1139
1140	if (hash_size == 0) hash_size = DEFAULT_HASH_SIZE;
1141
1142	tdb.read_only = ((open_flags & O_ACCMODE) == O_RDONLY);
1143
1144        if (name != NULL) {
1145            tdb.fd = open(name, open_flags, mode);
1146            if (tdb.fd == -1) {
1147		goto fail;
1148            }
1149        }
1150
1151	/* ensure there is only one process initialising at once */
1152	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_SET, F_WRLCK, F_SETLKW);
1153
1154	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1155		/* we need to zero the database if we are the only
1156		   one with it open */
1157		if (tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_WRLCK, F_SETLK) == 0) {
1158			ftruncate(tdb.fd, 0);
1159			tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLK);
1160		}
1161	}
1162
1163	/* leave this lock in place */
1164	tdb_brlock(&tdb, ACTIVE_LOCK, LOCK_SET, F_RDLCK, F_SETLKW);
1165
1166	if (read(tdb.fd, &tdb.header, sizeof(tdb.header)) != sizeof(tdb.header) ||
1167	    strcmp(tdb.header.magic_food, TDB_MAGIC_FOOD) != 0 ||
1168	    tdb.header.version != TDB_VERSION) {
1169		/* its not a valid database - possibly initialise it */
1170		if (!(open_flags & O_CREAT)) {
1171			goto fail;
1172		}
1173		if (tdb_new_database(&tdb, hash_size) == -1) goto fail;
1174
1175		lseek(tdb.fd, 0, SEEK_SET);
1176		if (tdb.fd != -1 && read(tdb.fd, &tdb.header,
1177                                         sizeof(tdb.header)) !=
1178                                         sizeof(tdb.header))
1179                    goto fail;
1180	}
1181
1182        if (tdb.fd != -1) {
1183            fstat(tdb.fd, &st);
1184
1185            /* map the database and fill in the return structure */
1186            tdb.name = name ? strdup(name) : NULL;
1187            tdb.map_size = st.st_size;
1188        }
1189
1190        tdb.locked = (int *)calloc(tdb.header.hash_size+1,
1191                                   sizeof(tdb.locked[0]));
1192        if (!tdb.locked) {
1193            goto fail;
1194        }
1195
1196#if HAVE_MMAP
1197        if (tdb.fd != -1) {
1198            tdb.map_ptr = (void *)mmap(NULL, st.st_size,
1199                                       tdb.read_only? PROT_READ : PROT_READ|PROT_WRITE,
1200                                       MAP_SHARED | MAP_FILE, tdb.fd, 0);
1201	    if (tdb->map_ptr == MAP_FAILED)
1202		    tdb->map_ptr = NULL;
1203        }
1204#endif
1205
1206	ret = (TDB_CONTEXT *)malloc(sizeof(tdb));
1207	if (!ret) goto fail;
1208
1209	*ret = tdb;
1210
1211#if TDB_DEBUG
1212	printf("mapped database of hash_size %u map_size=%u\n",
1213	       hash_size, tdb.map_size);
1214#endif
1215
1216	tdb_brlock(&tdb, GLOBAL_LOCK, LOCK_CLEAR, F_WRLCK, F_SETLKW);
1217	return ret;
1218
1219 fail:
1220        if (tdb.name) free(tdb.name);
1221	if (tdb.fd != -1) close(tdb.fd);
1222	if (tdb.map_ptr) munmap(tdb.map_ptr, tdb.map_size);
1223
1224	return NULL;
1225}
1226
1227/* close a database */
1228int tdb_close(TDB_CONTEXT *tdb)
1229{
1230	if (!tdb) return -1;
1231
1232	if (tdb->name) free(tdb->name);
1233	if (tdb->fd != -1) close(tdb->fd);
1234	if (tdb->locked) free(tdb->locked);
1235
1236	if (tdb->map_ptr) {
1237            if (tdb->fd != -1) {
1238                munmap(tdb->map_ptr, tdb->map_size);
1239            } else {
1240                free(tdb->map_ptr);
1241            }
1242        }
1243
1244	memset(tdb, 0, sizeof(*tdb));
1245	free(tdb);
1246
1247	return 0;
1248}
1249
1250/* lock the database. If we already have it locked then don't do anything */
1251int tdb_writelock(TDB_CONTEXT *tdb)
1252{
1253        if (tdb == NULL) {
1254#ifdef TDB_DEBUG
1255            printf("tdb_writelock() called with null context\n");
1256#endif
1257            return -1;
1258        }
1259
1260	return tdb_lock(tdb, -1);
1261}
1262
1263/* unlock the database. */
1264int tdb_writeunlock(TDB_CONTEXT *tdb)
1265{
1266        if (tdb == NULL) {
1267#ifdef TDB_DEBUG
1268            printf("tdb_writeunlock() called with null context\n");
1269#endif
1270            return -1;
1271        }
1272
1273	return tdb_unlock(tdb, -1);
1274}
1275
1276int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1277{
1278        if (tdb == NULL) {
1279#ifdef TDB_DEBUG
1280            printf("tdb_lockchain() called with null context\n");
1281#endif
1282            return -1;
1283        }
1284
1285	return tdb_lock(tdb, BUCKET(tdb_hash(&key)));
1286}
1287
1288
1289int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1290{
1291        if (tdb == NULL) {
1292#ifdef TDB_DEBUG
1293            printf("tdb_unlockchain() called with null context\n");
1294#endif
1295            return -1;
1296        }
1297
1298	return tdb_unlock(tdb, BUCKET(tdb_hash(&key)));
1299}
1300