1 /*
2   Unix SMB/CIFS implementation.
3
4   trivial database library
5
6   Copyright (C) Andrew Tridgell              1999-2004
7   Copyright (C) Paul `Rusty' Russell		   2000
8   Copyright (C) Jeremy Allison			   2000-2003
9
10     ** NOTE! The following LGPL license applies to the tdb
11     ** library. This does NOT imply that all of Samba is released
12     ** under the LGPL
13
14   This library is free software; you can redistribute it and/or
15   modify it under the terms of the GNU Lesser General Public
16   License as published by the Free Software Foundation; either
17   version 2 of the License, or (at your option) any later version.
18
19   This library is distributed in the hope that it will be useful,
20   but WITHOUT ANY WARRANTY; without even the implied warranty of
21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22   Lesser General Public License for more details.
23
24   You should have received a copy of the GNU Lesser General Public
25   License along with this library; if not, write to the Free Software
26   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
27*/
28
29
30/* NOTE: If you use tdbs under valgrind, and in particular if you run
31 * tdbtorture, you may get spurious "uninitialized value" warnings.  I
32 * think this is because valgrind doesn't understand that the mmap'd
33 * area may be written to by other processes.  Memory can, from the
34 * point of view of the grinded process, spontaneously become
35 * initialized.
36 *
37 * I can think of a few solutions.  [mbp 20030311]
38 *
39 * 1 - Write suppressions for Valgrind so that it doesn't complain
40 * about this.  Probably the most reasonable but people need to
41 * remember to use them.
42 *
43 * 2 - Use IO not mmap when running under valgrind.  Not so nice.
44 *
45 * 3 - Use the special valgrind macros to mark memory as valid at the
46 * right time.  Probably too hard -- the process just doesn't know.
47 */
48
49#ifdef STANDALONE
50#if HAVE_CONFIG_H
51#include <config.h>
52#endif
53
54#include <stdlib.h>
55#include <stdio.h>
56#include <fcntl.h>
57#include <unistd.h>
58#include <string.h>
59#include <fcntl.h>
60#include <errno.h>
61#include <sys/mman.h>
62#include <sys/stat.h>
63#include <signal.h>
64#include "tdb.h"
65#include "spinlock.h"
66#else
67#include "includes.h"
68
69#if defined(PARANOID_MALLOC_CHECKER)
70#ifdef malloc
71#undef malloc
72#endif
73
74#ifdef realloc
75#undef realloc
76#endif
77
78#ifdef calloc
79#undef calloc
80#endif
81
82#ifdef strdup
83#undef strdup
84#endif
85
86#ifdef strndup
87#undef strndup
88#endif
89
90#endif
91
92#endif
93
/* String copied (with its terminating NUL) into the magic_food field of
   a freshly created header by tdb_new_database(). */
#define TDB_MAGIC_FOOD "TDB file\n"
/* Value stored in the header's version field by tdb_new_database(). */
#define TDB_VERSION (0x26011967 + 6)
/* Record magic values: TDB_MAGIC marks an allocated record (set by
   tdb_allocate), TDB_FREE_MAGIC a record on the free list (set by
   tdb_free), TDB_DEAD_MAGIC is the value TDB_DEAD() tests for. */
#define TDB_MAGIC (0x26011999U)
#define TDB_FREE_MAGIC (~TDB_MAGIC)
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
/* Allocation lengths are rounded up to a multiple of this in
   tdb_allocate(). */
#define TDB_ALIGNMENT 4
/* tdb_allocate() only splits a free record when the leftover would be
   larger than this. */
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
/* NOTE(review): not referenced in the part of the file visible here -
   presumably the hash size used when the caller passes none; confirm. */
#define DEFAULT_HASH_SIZE 131
/* tdb_expand() rounds the database size up to a multiple of this. */
#define TDB_PAGE_SIZE 0x2000
/* File offset of the free list head: it sits directly after the header. */
#define FREELIST_TOP (sizeof(struct tdb_header))
/* Round x up to a multiple of a.  Only correct when a is a power of two. */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* Reverse the byte order of a 32 bit value.  x is evaluated four times. */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
/* True when the record magic is neither TDB_MAGIC nor TDB_DEAD_MAGIC. */
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
/* File offset of the chain head for a hash value.  Goes through BUCKET(),
   so it needs a local variable called "tdb" and reduces its argument
   modulo the hash size. */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
/* Offset just past the last hash chain head plus TDB_SPINLOCK_SIZE()
   bytes (that macro is defined outside this file section); tdb_free()
   uses it as the lower bound when looking for a left neighbour. */
#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
110
111
/* NB assumes there is a local variable called "tdb" that is the
 * current context, also takes doubly-parenthesized print-style
 * argument.  The log function is only called when tdb->log_fn is
 * non-NULL; the expression evaluates to 0 either way. */
#define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)

/* lock offsets */
/* NOTE(review): neither offset is used in the part of the file visible
 * here - presumably byte offsets handed to tdb_brlock(); confirm. */
#define GLOBAL_LOCK 0
#define ACTIVE_LOCK 4

/* Fallback definitions for platforms whose headers lack these names. */
#ifndef MAP_FILE
#define MAP_FILE 0
#endif

#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif

/* free memory if the pointer is valid and zero the pointer */
#ifndef SAFE_FREE
#define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
#endif

/* Hash bucket index for a hash value.  Needs a local variable called
   "tdb". */
#define BUCKET(hash) ((hash) % tdb->header.hash_size)
/* File-scope object, so zero-initialised; tdb_fetch() returns it when
   the key cannot be found. */
TDB_DATA tdb_null;

/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static TDB_CONTEXT *tdbs = NULL;
139
140static int tdb_munmap(TDB_CONTEXT *tdb)
141{
142	if (tdb->flags & TDB_INTERNAL)
143		return 0;
144
145#ifdef HAVE_MMAP
146	if (tdb->map_ptr) {
147		int ret = munmap(tdb->map_ptr, tdb->map_size);
148		if (ret != 0)
149			return ret;
150	}
151#endif
152	tdb->map_ptr = NULL;
153	return 0;
154}
155
156static void tdb_mmap(TDB_CONTEXT *tdb)
157{
158	if (tdb->flags & TDB_INTERNAL)
159		return;
160
161#ifdef HAVE_MMAP
162	if (!(tdb->flags & TDB_NOMMAP)) {
163		tdb->map_ptr = mmap(NULL, tdb->map_size,
164				    PROT_READ|(tdb->read_only? 0:PROT_WRITE),
165				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
166
167		/*
168		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
169		 */
170
171		if (tdb->map_ptr == MAP_FAILED) {
172			tdb->map_ptr = NULL;
173			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
174				 tdb->map_size, strerror(errno)));
175		}
176	} else {
177		tdb->map_ptr = NULL;
178	}
179#else
180	tdb->map_ptr = NULL;
181#endif
182}
183
184/* Endian conversion: we only ever deal with 4 byte quantities */
185static void *convert(void *buf, u32 size)
186{
187	u32 i, *p = buf;
188	for (i = 0; i < size / 4; i++)
189		p[i] = TDB_BYTEREV(p[i]);
190	return buf;
191}
192#define DOCONV() (tdb->flags & TDB_CONVERT)
193#define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
194
/* the body of the database is made of one list_struct for the free space
   plus a separate data list for each hash value.

   This is the record header as stored in the file.  rec_read() and
   rec_write() pass the whole struct through convert() when TDB_CONVERT
   is set, i.e. it is treated as a sequence of 32 bit words. */
struct list_struct {
	tdb_off next; /* offset of the next record in the list */
	tdb_len rec_len; /* total byte length of record */
	tdb_len key_len; /* byte length of key */
	tdb_len data_len; /* byte length of data */
	u32 full_hash; /* the full 32 bit hash of the key */
	u32 magic;   /* try to catch errors: TDB_MAGIC, TDB_FREE_MAGIC or TDB_DEAD_MAGIC */
	/* the following union is implied:
		union {
			char record[rec_len];
			struct {
				char key[key_len];
				char data[data_len];
			}
			u32 totalsize; (tailer)
		}
	   The tailer occupies the last sizeof(tdb_off) bytes of the
	   record and holds sizeof(struct list_struct) + rec_len (see
	   update_tailer).
	*/
};
215
/***************************************************************
 Allow a caller to set a "alarm" flag that tdb can check to abort
 a blocking lock on SIGALRM.

 Only the pointer is stored; tdb_brlock() reads *palarm whenever
 fcntl() is interrupted with EINTR and stops retrying once it is
 non-zero.  Passing NULL disables the check.
***************************************************************/

static sig_atomic_t *palarm_fired;

void tdb_set_lock_alarm(sig_atomic_t *palarm)
{
	palarm_fired = palarm;
}
227
228/* a byte range locking function - return 0 on success
229   this functions locks/unlocks 1 byte at the specified offset.
230
231   On error, errno is also set so that errors are passed back properly
232   through tdb_open(). */
233static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
234		      int rw_type, int lck_type, int probe)
235{
236	struct flock fl;
237	int ret;
238
239	if (tdb->flags & TDB_NOLOCK)
240		return 0;
241	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
242		errno = EACCES;
243		return -1;
244	}
245
246	fl.l_type = rw_type;
247	fl.l_whence = SEEK_SET;
248	fl.l_start = offset;
249	fl.l_len = 1;
250	fl.l_pid = 0;
251
252	do {
253		ret = fcntl(tdb->fd,lck_type,&fl);
254		if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
255			break;
256	} while (ret == -1 && errno == EINTR);
257
258	if (ret == -1) {
259		if (!probe && lck_type != F_SETLK) {
260			/* Ensure error code is set for log fun to examine. */
261			if (errno == EINTR && palarm_fired && *palarm_fired)
262				tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
263			else
264				tdb->ecode = TDB_ERR_LOCK;
265			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
266				 tdb->fd, offset, rw_type, lck_type));
267		}
268		/* Was it an alarm timeout ? */
269		if (errno == EINTR && palarm_fired && *palarm_fired) {
270			TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
271				 tdb->fd, offset, rw_type, lck_type));
272			return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
273		}
274		/* Otherwise - generic lock error. errno set by fcntl.
275		 * EAGAIN is an expected return from non-blocking
276		 * locks. */
277		if (errno != EAGAIN) {
278			TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
279				 tdb->fd, offset, rw_type, lck_type,
280				 strerror(errno)));
281		}
282		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
283	}
284	return 0;
285}
286
287/* lock a list in the database. list -1 is the alloc list */
288static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
289{
290	if (list < -1 || list >= (int)tdb->header.hash_size) {
291		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
292			   list, ltype));
293		return -1;
294	}
295	if (tdb->flags & TDB_NOLOCK)
296		return 0;
297
298	/* Since fcntl locks don't nest, we do a lock for the first one,
299	   and simply bump the count for future ones */
300	if (tdb->locked[list+1].count == 0) {
301		if (!tdb->read_only && tdb->header.rwlocks) {
302			if (tdb_spinlock(tdb, list, ltype)) {
303				TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list %d ltype=%d\n",
304					   list, ltype));
305				return -1;
306			}
307		} else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
308			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
309					   list, ltype, strerror(errno)));
310			return -1;
311		}
312		tdb->locked[list+1].ltype = ltype;
313	}
314	tdb->locked[list+1].count++;
315	return 0;
316}
317
318/* unlock the database: returns void because it's too late for errors. */
319	/* changed to return int it may be interesting to know there
320	   has been an error  --simo */
321static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
322{
323	int ret = -1;
324
325	if (tdb->flags & TDB_NOLOCK)
326		return 0;
327
328	/* Sanity checks */
329	if (list < -1 || list >= (int)tdb->header.hash_size) {
330		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
331		return ret;
332	}
333
334	if (tdb->locked[list+1].count==0) {
335		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
336		return ret;
337	}
338
339	if (tdb->locked[list+1].count == 1) {
340		/* Down to last nested lock: unlock underneath */
341		if (!tdb->read_only && tdb->header.rwlocks) {
342			ret = tdb_spinunlock(tdb, list, ltype);
343		} else {
344			ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
345		}
346	} else {
347		ret = 0;
348	}
349	tdb->locked[list+1].count--;
350
351	if (ret)
352		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
353	return ret;
354}
355
356/* check for an out of bounds access - if it is out of bounds then
357   see if the database has been expanded by someone else and expand
358   if necessary
359   note that "len" is the minimum length needed for the db
360*/
361static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
362{
363	struct stat st;
364	if (len <= tdb->map_size)
365		return 0;
366	if (tdb->flags & TDB_INTERNAL) {
367		if (!probe) {
368			/* Ensure ecode is set for log fn. */
369			tdb->ecode = TDB_ERR_IO;
370			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
371				 (int)len, (int)tdb->map_size));
372		}
373		return TDB_ERRCODE(TDB_ERR_IO, -1);
374	}
375
376	if (fstat(tdb->fd, &st) == -1)
377		return TDB_ERRCODE(TDB_ERR_IO, -1);
378
379	if (st.st_size < (size_t)len) {
380		if (!probe) {
381			/* Ensure ecode is set for log fn. */
382			tdb->ecode = TDB_ERR_IO;
383			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
384				 (int)len, (int)st.st_size));
385		}
386		return TDB_ERRCODE(TDB_ERR_IO, -1);
387	}
388
389	/* Unmap, update size, remap */
390	if (tdb_munmap(tdb) == -1)
391		return TDB_ERRCODE(TDB_ERR_IO, -1);
392	tdb->map_size = st.st_size;
393	tdb_mmap(tdb);
394	return 0;
395}
396
397/* write a lump of data at a specified offset */
398static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
399{
400	if (tdb_oob(tdb, off + len, 0) != 0)
401		return -1;
402
403	if (tdb->map_ptr)
404		memcpy(off + (char *)tdb->map_ptr, buf, len);
405#ifdef HAVE_PWRITE
406	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
407#else
408	else if (lseek(tdb->fd, off, SEEK_SET) != off
409		 || write(tdb->fd, buf, len) != (ssize_t)len) {
410#endif
411		/* Ensure ecode is set for log fn. */
412		tdb->ecode = TDB_ERR_IO;
413		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
414			   off, len, strerror(errno)));
415		return TDB_ERRCODE(TDB_ERR_IO, -1);
416	}
417	return 0;
418}
419
420/* read a lump of data at a specified offset, maybe convert */
421static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
422{
423	if (tdb_oob(tdb, off + len, 0) != 0)
424		return -1;
425
426	if (tdb->map_ptr)
427		memcpy(buf, off + (char *)tdb->map_ptr, len);
428#ifdef HAVE_PREAD
429	else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
430#else
431	else if (lseek(tdb->fd, off, SEEK_SET) != off
432		 || read(tdb->fd, buf, len) != (ssize_t)len) {
433#endif
434		/* Ensure ecode is set for log fn. */
435		tdb->ecode = TDB_ERR_IO;
436		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
437			   off, len, strerror(errno)));
438		return TDB_ERRCODE(TDB_ERR_IO, -1);
439	}
440	if (cv)
441		convert(buf, len);
442	return 0;
443}
444
445/* read a lump of data, allocating the space for it */
446static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
447{
448	char *buf;
449
450	if (!(buf = malloc(len))) {
451		/* Ensure ecode is set for log fn. */
452		tdb->ecode = TDB_ERR_OOM;
453		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
454			   len, strerror(errno)));
455		return TDB_ERRCODE(TDB_ERR_OOM, buf);
456	}
457	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
458		SAFE_FREE(buf);
459		return NULL;
460	}
461	return buf;
462}
463
464/* read/write a tdb_off */
465static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
466{
467	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
468}
469static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
470{
471	tdb_off off = *d;
472	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
473}
474
475/* read/write a record */
476static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
477{
478	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
479		return -1;
480	if (TDB_BAD_MAGIC(rec)) {
481		/* Ensure ecode is set for log fn. */
482		tdb->ecode = TDB_ERR_CORRUPT;
483		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
484		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
485	}
486	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
487}
488static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
489{
490	struct list_struct r = *rec;
491	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
492}
493
494/* read a freelist record and check for simple errors */
495static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
496{
497	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
498		return -1;
499
500	if (rec->magic == TDB_MAGIC) {
501		/* this happens when a app is showdown while deleting a record - we should
502		   not completely fail when this happens */
503		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
504			 rec->magic, off));
505		rec->magic = TDB_FREE_MAGIC;
506		if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
507			return -1;
508	}
509
510	if (rec->magic != TDB_FREE_MAGIC) {
511		/* Ensure ecode is set for log fn. */
512		tdb->ecode = TDB_ERR_CORRUPT;
513		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
514			   rec->magic, off));
515		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
516	}
517	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
518		return -1;
519	return 0;
520}
521
522/* update a record tailer (must hold allocation lock) */
523static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
524			 const struct list_struct *rec)
525{
526	tdb_off totalsize;
527
528	/* Offset of tailer from record header */
529	totalsize = sizeof(*rec) + rec->rec_len;
530	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
531			 &totalsize);
532}
533
534static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
535{
536	struct list_struct rec;
537	tdb_off tailer_ofs, tailer;
538
539	if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
540		printf("ERROR: failed to read record at %u\n", offset);
541		return 0;
542	}
543
544	printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
545	       offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
546
547	tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
548	if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
549		printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
550		return rec.next;
551	}
552
553	if (tailer != rec.rec_len + sizeof(rec)) {
554		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
555				(unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
556	}
557	return rec.next;
558}
559
560static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
561{
562	tdb_off rec_ptr, top;
563
564	top = TDB_HASH_TOP(i);
565
566	if (tdb_lock(tdb, i, F_WRLCK) != 0)
567		return -1;
568
569	if (ofs_read(tdb, top, &rec_ptr) == -1)
570		return tdb_unlock(tdb, i, F_WRLCK);
571
572	if (rec_ptr)
573		printf("hash=%d\n", i);
574
575	while (rec_ptr) {
576		rec_ptr = tdb_dump_record(tdb, rec_ptr);
577	}
578
579	return tdb_unlock(tdb, i, F_WRLCK);
580}
581
582void tdb_dump_all(TDB_CONTEXT *tdb)
583{
584	int i;
585	for (i=0;i<tdb->header.hash_size;i++) {
586		tdb_dump_chain(tdb, i);
587	}
588	printf("freelist:\n");
589	tdb_dump_chain(tdb, -1);
590}
591
592int tdb_printfreelist(TDB_CONTEXT *tdb)
593{
594	int ret;
595	long total_free = 0;
596	tdb_off offset, rec_ptr;
597	struct list_struct rec;
598
599	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
600		return ret;
601
602	offset = FREELIST_TOP;
603
604	/* read in the freelist top */
605	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
606		tdb_unlock(tdb, -1, F_WRLCK);
607		return 0;
608	}
609
610	printf("freelist top=[0x%08x]\n", rec_ptr );
611	while (rec_ptr) {
612		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
613			tdb_unlock(tdb, -1, F_WRLCK);
614			return -1;
615		}
616
617		if (rec.magic != TDB_FREE_MAGIC) {
618			printf("bad magic 0x%08x in free list\n", rec.magic);
619			tdb_unlock(tdb, -1, F_WRLCK);
620			return -1;
621		}
622
623		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
624		total_free += rec.rec_len;
625
626		/* move to the next record */
627		rec_ptr = rec.next;
628	}
629	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
630               (int)total_free);
631
632	return tdb_unlock(tdb, -1, F_WRLCK);
633}
634
635/* Remove an element from the freelist.  Must have alloc lock. */
636static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
637{
638	tdb_off last_ptr, i;
639
640	/* read in the freelist top */
641	last_ptr = FREELIST_TOP;
642	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
643		if (i == off) {
644			/* We've found it! */
645			return ofs_write(tdb, last_ptr, &next);
646		}
647		/* Follow chain (next offset is at start of record) */
648		last_ptr = i;
649	}
650	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
651	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
652}
653
654/* Add an element into the freelist. Merge adjacent records if
655   neccessary. */
656static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
657{
658	tdb_off right, left;
659
660	/* Allocation and tailer lock */
661	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
662		return -1;
663
664	/* set an initial tailer, so if we fail we don't leave a bogus record */
665	if (update_tailer(tdb, offset, rec) != 0) {
666		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
667		goto fail;
668	}
669
670	/* Look right first (I'm an Australian, dammit) */
671	right = offset + sizeof(*rec) + rec->rec_len;
672	if (right + sizeof(*rec) <= tdb->map_size) {
673		struct list_struct r;
674
675		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
676			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
677			goto left;
678		}
679
680		/* If it's free, expand to include it. */
681		if (r.magic == TDB_FREE_MAGIC) {
682			if (remove_from_freelist(tdb, right, r.next) == -1) {
683				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
684				goto left;
685			}
686			rec->rec_len += sizeof(r) + r.rec_len;
687		}
688	}
689
690left:
691	/* Look left */
692	left = offset - sizeof(tdb_off);
693	if (left > TDB_DATA_START(tdb->header.hash_size)) {
694		struct list_struct l;
695		tdb_off leftsize;
696
697		/* Read in tailer and jump back to header */
698		if (ofs_read(tdb, left, &leftsize) == -1) {
699			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
700			goto update;
701		}
702		left = offset - leftsize;
703
704		/* Now read in record */
705		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
706			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
707			goto update;
708		}
709
710		/* If it's free, expand to include it. */
711		if (l.magic == TDB_FREE_MAGIC) {
712			if (remove_from_freelist(tdb, left, l.next) == -1) {
713				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
714				goto update;
715			} else {
716				offset = left;
717				rec->rec_len += leftsize;
718			}
719		}
720	}
721
722update:
723	if (update_tailer(tdb, offset, rec) == -1) {
724		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
725		goto fail;
726	}
727
728	/* Now, prepend to free list */
729	rec->magic = TDB_FREE_MAGIC;
730
731	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
732	    rec_write(tdb, offset, rec) == -1 ||
733	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
734		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
735		goto fail;
736	}
737
738	/* And we're done. */
739	tdb_unlock(tdb, -1, F_WRLCK);
740	return 0;
741
742 fail:
743	tdb_unlock(tdb, -1, F_WRLCK);
744	return -1;
745}
746
747
748/* expand a file.  we prefer to use ftruncate, as that is what posix
749  says to use for mmap expansion */
750static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
751{
752	char buf[1024];
753#if HAVE_FTRUNCATE_EXTEND
754	if (ftruncate(tdb->fd, size+addition) != 0) {
755		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
756			   size+addition, strerror(errno)));
757		return -1;
758	}
759#else
760	char b = 0;
761
762#ifdef HAVE_PWRITE
763	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
764#else
765	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
766	    write(tdb->fd, &b, 1) != 1) {
767#endif
768		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
769			   size+addition, strerror(errno)));
770		return -1;
771	}
772#endif
773
774	/* now fill the file with something. This ensures that the file isn't sparse, which would be
775	   very bad if we ran out of disk. This must be done with write, not via mmap */
776	memset(buf, 0x42, sizeof(buf));
777	while (addition) {
778		int n = addition>sizeof(buf)?sizeof(buf):addition;
779#ifdef HAVE_PWRITE
780		int ret = pwrite(tdb->fd, buf, n, size);
781#else
782		int ret;
783		if (lseek(tdb->fd, size, SEEK_SET) != size)
784			return -1;
785		ret = write(tdb->fd, buf, n);
786#endif
787		if (ret != n) {
788			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
789				   n, strerror(errno)));
790			return -1;
791		}
792		addition -= n;
793		size += n;
794	}
795	return 0;
796}
797
798
799/* expand the database at least size bytes by expanding the underlying
800   file and doing the mmap again if necessary */
801static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
802{
803	struct list_struct rec;
804	tdb_off offset;
805
806	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
807		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
808		return -1;
809	}
810
811	/* must know about any previous expansions by another process */
812	tdb_oob(tdb, tdb->map_size + 1, 1);
813
814	/* always make room for at least 10 more records, and round
815           the database up to a multiple of TDB_PAGE_SIZE */
816	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
817
818	if (!(tdb->flags & TDB_INTERNAL))
819		tdb_munmap(tdb);
820
821	/*
822	 * We must ensure the file is unmapped before doing this
823	 * to ensure consistency with systems like OpenBSD where
824	 * writes and mmaps are not consistent.
825	 */
826
827	/* expand the file itself */
828	if (!(tdb->flags & TDB_INTERNAL)) {
829		if (expand_file(tdb, tdb->map_size, size) != 0)
830			goto fail;
831	}
832
833	tdb->map_size += size;
834
835	if (tdb->flags & TDB_INTERNAL)
836		tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
837	else {
838		/*
839		 * We must ensure the file is remapped before adding the space
840		 * to ensure consistency with systems like OpenBSD where
841		 * writes and mmaps are not consistent.
842		 */
843
844		/* We're ok if the mmap fails as we'll fallback to read/write */
845		tdb_mmap(tdb);
846	}
847
848	/* form a new freelist record */
849	memset(&rec,'\0',sizeof(rec));
850	rec.rec_len = size - sizeof(rec);
851
852	/* link it into the free list */
853	offset = tdb->map_size - size;
854	if (tdb_free(tdb, offset, &rec) == -1)
855		goto fail;
856
857	tdb_unlock(tdb, -1, F_WRLCK);
858	return 0;
859 fail:
860	tdb_unlock(tdb, -1, F_WRLCK);
861	return -1;
862}
863
/* allocate some space from the free list. The offset returned points
   to a unconnected list_struct within the database with room for at
   least length bytes of total data

   0 is returned if the space could not be allocated

   The free list is searched first-fit.  When no record is large enough
   the database is grown with tdb_expand() and the search restarts.  On
   success *rec holds the header of the returned record (magic set to
   TDB_MAGIC); on the way there it is also used as scratch space for
   every free record examined.  Takes and releases the alloc lock
   itself.
 */
static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
			    struct list_struct *rec)
{
	tdb_off rec_ptr, last_ptr, newrec_ptr;
	struct list_struct newrec;

	memset(&newrec, '\0', sizeof(newrec));

	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
		return 0;

	/* Extra bytes required for tailer */
	length += sizeof(tdb_off);

 again:
	/* last_ptr is the file offset holding the pointer to rec_ptr, so
	   it can be rewritten to unlink the chosen record */
	last_ptr = FREELIST_TOP;

	/* read in the freelist top */
	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
		goto fail;

	/* keep looking until we find a freelist record big enough */
	while (rec_ptr) {
		if (rec_free_read(tdb, rec_ptr, rec) == -1)
			goto fail;

		if (rec->rec_len >= length) {
			/* found it - now possibly split it up  */
			if (rec->rec_len > length + MIN_REC_SIZE) {
				/* Length of left piece */
				length = TDB_ALIGN(length, TDB_ALIGNMENT);

				/* Right piece to go on free list */
				newrec.rec_len = rec->rec_len
					- (sizeof(*rec) + length);
				newrec_ptr = rec_ptr + sizeof(*rec) + length;

				/* And left record is shortened */
				rec->rec_len = length;
			} else
				newrec_ptr = 0; /* too small to split: hand out the whole record */

			/* Remove allocated record from the free list */
			if (ofs_write(tdb, last_ptr, &rec->next) == -1)
				goto fail;

			/* Update header: do this before we drop alloc
                           lock, otherwise tdb_free() might try to
                           merge with us, thinking we're free.
                           (Thanks Jeremy Allison). */
			rec->magic = TDB_MAGIC;
			if (rec_write(tdb, rec_ptr, rec) == -1)
				goto fail;

			/* Did we create new block? */
			if (newrec_ptr) {
				/* Update allocated record tailer (we
                                   shortened it). */
				if (update_tailer(tdb, rec_ptr, rec) == -1)
					goto fail;

				/* Free new record */
				if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
					goto fail;
			}

			/* all done - return the new record offset */
			tdb_unlock(tdb, -1, F_WRLCK);
			return rec_ptr;
		}
		/* move to the next record */
		last_ptr = rec_ptr;
		rec_ptr = rec->next;
	}
	/* we didn't find enough space. See if we can expand the
	   database and if we can then try again */
	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
		goto again;
 fail:
	tdb_unlock(tdb, -1, F_WRLCK);
	return 0;
}
952
953/* initialise a new database with a specified hash size */
954static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
955{
956	struct tdb_header *newdb;
957	int size, ret = -1;
958
959	/* We make it up in memory, then write it out if not internal */
960	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
961	if (!(newdb = calloc(size, 1)))
962		return TDB_ERRCODE(TDB_ERR_OOM, -1);
963
964	/* Fill in the header */
965	newdb->version = TDB_VERSION;
966	newdb->hash_size = hash_size;
967#ifdef USE_SPINLOCKS
968	newdb->rwlocks = size;
969#endif
970	if (tdb->flags & TDB_INTERNAL) {
971		tdb->map_size = size;
972		tdb->map_ptr = (char *)newdb;
973		memcpy(&tdb->header, newdb, sizeof(tdb->header));
974		/* Convert the `ondisk' version if asked. */
975		CONVERT(*newdb);
976		return 0;
977	}
978	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
979		goto fail;
980
981	if (ftruncate(tdb->fd, 0) == -1)
982		goto fail;
983
984	/* This creates an endian-converted header, as if read from disk */
985	CONVERT(*newdb);
986	memcpy(&tdb->header, newdb, sizeof(tdb->header));
987	/* Don't endian-convert the magic food! */
988	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
989	if (write(tdb->fd, newdb, size) != size)
990		ret = -1;
991	else
992		ret = tdb_create_rwlocks(tdb->fd, hash_size);
993
994  fail:
995	SAFE_FREE(newdb);
996	return ret;
997}
998
999/* Returns 0 on fail.  On success, return offset of record, and fills
1000   in rec */
1001static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
1002			struct list_struct *r)
1003{
1004	tdb_off rec_ptr;
1005
1006	/* read in the hash top */
1007	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
1008		return 0;
1009
1010	/* keep looking until we find the right record */
1011	while (rec_ptr) {
1012		if (rec_read(tdb, rec_ptr, r) == -1)
1013			return 0;
1014
1015		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
1016			char *k;
1017			/* a very likely hit - read the key */
1018			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
1019					   r->key_len);
1020			if (!k)
1021				return 0;
1022
1023			if (memcmp(key.dptr, k, key.dsize) == 0) {
1024				SAFE_FREE(k);
1025				return rec_ptr;
1026			}
1027			SAFE_FREE(k);
1028		}
1029		rec_ptr = r->next;
1030	}
1031	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1032}
1033
1034/* As tdb_find, but if you succeed, keep the lock */
1035static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1036			     struct list_struct *rec)
1037{
1038	u32 rec_ptr;
1039
1040	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1041		return 0;
1042	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1043		tdb_unlock(tdb, BUCKET(hash), locktype);
1044	return rec_ptr;
1045}
1046
/* Return the error code currently stored in the context (tdb->ecode). */
enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
{
	return tdb->ecode;
}
1051
1052static struct tdb_errname {
1053	enum TDB_ERROR ecode; const char *estring;
1054} emap[] = { {TDB_SUCCESS, "Success"},
1055	     {TDB_ERR_CORRUPT, "Corrupt database"},
1056	     {TDB_ERR_IO, "IO Error"},
1057	     {TDB_ERR_LOCK, "Locking error"},
1058	     {TDB_ERR_OOM, "Out of memory"},
1059	     {TDB_ERR_EXISTS, "Record exists"},
1060	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1061	     {TDB_ERR_NOEXIST, "Record does not exist"} };
1062
1063/* Error string for the last tdb error */
1064const char *tdb_errorstr(TDB_CONTEXT *tdb)
1065{
1066	u32 i;
1067	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1068		if (tdb->ecode == emap[i].ecode)
1069			return emap[i].estring;
1070	return "Invalid error code";
1071}
1072
1073/* update an entry in place - this only works if the new data size
1074   is <= the old data size and the key exists.
1075   on failure return -1.
1076*/
1077
1078static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1079{
1080	struct list_struct rec;
1081	tdb_off rec_ptr;
1082
1083	/* find entry */
1084	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1085		return -1;
1086
1087	/* must be long enough key, data and tailer */
1088	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1089		tdb->ecode = TDB_SUCCESS; /* Not really an error */
1090		return -1;
1091	}
1092
1093	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1094		      dbuf.dptr, dbuf.dsize) == -1)
1095		return -1;
1096
1097	if (dbuf.dsize != rec.data_len) {
1098		/* update size */
1099		rec.data_len = dbuf.dsize;
1100		return rec_write(tdb, rec_ptr, &rec);
1101	}
1102
1103	return 0;
1104}
1105
1106/* find an entry in the database given a key */
1107/* If an entry doesn't exist tdb_err will be set to
1108 * TDB_ERR_NOEXIST. If a key has no data attached
1109 * tdb_err will not be set. Both will return a
1110 * zero pptr and zero dsize.
1111 */
1112
1113TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1114{
1115	tdb_off rec_ptr;
1116	struct list_struct rec;
1117	TDB_DATA ret;
1118	u32 hash;
1119
1120	/* find which hash bucket it is in */
1121	hash = tdb->hash_fn(&key);
1122	if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1123		return tdb_null;
1124
1125	if (rec.data_len)
1126		ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1127					  rec.data_len);
1128	else
1129		ret.dptr = NULL;
1130	ret.dsize = rec.data_len;
1131	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1132	return ret;
1133}
1134
1135/* check if an entry in the database exists
1136
1137   note that 1 is returned if the key is found and 0 is returned if not found
1138   this doesn't match the conventions in the rest of this module, but is
1139   compatible with gdbm
1140*/
1141static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1142{
1143	struct list_struct rec;
1144
1145	if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1146		return 0;
1147	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1148	return 1;
1149}
1150
1151int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1152{
1153	u32 hash = tdb->hash_fn(&key);
1154	return tdb_exists_hash(tdb, key, hash);
1155}
1156
1157/* record lock stops delete underneath */
1158static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1159{
1160	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1161}
1162/*
1163  Write locks override our own fcntl readlocks, so check it here.
1164  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1165  an error to fail to get the lock here.
1166*/
1167
1168static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1169{
1170	struct tdb_traverse_lock *i;
1171	for (i = &tdb->travlocks; i; i = i->next)
1172		if (i->off == off)
1173			return -1;
1174	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1175}
1176
1177/*
1178  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1179  an error to fail to get the lock here.
1180*/
1181
1182static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1183{
1184	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1185}
1186/* fcntl locks don't stack: avoid unlocking someone else's */
1187static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1188{
1189	struct tdb_traverse_lock *i;
1190	u32 count = 0;
1191
1192	if (off == 0)
1193		return 0;
1194	for (i = &tdb->travlocks; i; i = i->next)
1195		if (i->off == off)
1196			count++;
1197	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1198}
1199
1200/* actually delete an entry in the database given the offset */
1201static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1202{
1203	tdb_off last_ptr, i;
1204	struct list_struct lastrec;
1205
1206	if (tdb->read_only) return -1;
1207
1208	if (write_lock_record(tdb, rec_ptr) == -1) {
1209		/* Someone traversing here: mark it as dead */
1210		rec->magic = TDB_DEAD_MAGIC;
1211		return rec_write(tdb, rec_ptr, rec);
1212	}
1213	if (write_unlock_record(tdb, rec_ptr) != 0)
1214		return -1;
1215
1216	/* find previous record in hash chain */
1217	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1218		return -1;
1219	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1220		if (rec_read(tdb, i, &lastrec) == -1)
1221			return -1;
1222
1223	/* unlink it: next ptr is at start of record. */
1224	if (last_ptr == 0)
1225		last_ptr = TDB_HASH_TOP(rec->full_hash);
1226	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1227		return -1;
1228
1229	/* recover the space */
1230	if (tdb_free(tdb, rec_ptr, rec) == -1)
1231		return -1;
1232	return 0;
1233}
1234
/* Advance a traversal cursor to the next live record.

   tlock->hash is the chain the cursor is on and tlock->off the record
   it currently sits on (0 when it is not on a record).  Starting from
   that position the remaining chains are walked in order; dead records
   met on the way are deleted unless the database is read-only.

   Return value:
     > 0  offset of the live record found, with its header in *rec.
          The chain lock on tlock->hash (F_WRLCK) and a record lock on
          the returned offset are both still held.
       0  all chains were walked without finding a live record.
      -1  error.  On the `fail` path tlock->off is reset to 0 and the
          chain lock is released; if taking the chain lock itself
          failed, -1 is returned without touching tlock->off.

   NOTE(review): the offset is returned through an int, so an offset
   that does not fit in a positive int would be taken for an error by
   callers testing "> 0" -- confirm the range of tdb_off. */
static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
			 struct list_struct *rec)
{
	/* A non-zero offset on entry means the cursor sits on a record
	   that was already handed out, so we must step past it first. */
	int want_next = (tlock->off != 0);

	/* Lock each chain from the start one. */
	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
			return -1;

		/* No previous record?  Start at top of chain. */
		if (!tlock->off) {
			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
				     &tlock->off) == -1)
				goto fail;
		} else {
			/* Otherwise unlock the previous record. */
			if (unlock_record(tdb, tlock->off) != 0)
				goto fail;
		}

		if (want_next) {
			/* We have offset of old record: grab next */
			if (rec_read(tdb, tlock->off, rec) == -1)
				goto fail;
			tlock->off = rec->next;
		}

		/* Iterate through chain */
		while( tlock->off) {
			tdb_off current;
			if (rec_read(tdb, tlock->off, rec) == -1)
				goto fail;
			if (!TDB_DEAD(rec)) {
				/* Woohoo: we found one! */
				if (lock_record(tdb, tlock->off) != 0)
					goto fail;
				/* chain lock deliberately kept for the caller */
				return tlock->off;
			}

			/* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
			if (tlock->off == rec->next) {
				TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
				goto fail;
			}

			/* Try to clean dead ones from old traverses.  The
			   cursor is moved on before the delete so it never
			   points at the record being removed. */
			current = tlock->off;
			tlock->off = rec->next;
			if (!tdb->read_only &&
			    do_delete(tdb, current, rec) != 0)
				goto fail;
		}
		/* End of this chain: release it and start the next one
		   from its top (tlock->off is 0 here). */
		tdb_unlock(tdb, tlock->hash, F_WRLCK);
		want_next = 0;
	}
	/* We finished iteration without finding anything */
	return TDB_ERRCODE(TDB_SUCCESS, 0);

 fail:
	/* Forget the cursor position and drop the chain lock taken at
	   the top of the current loop iteration. */
	tlock->off = 0;
	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
	return -1;
}
1301
1302/* traverse the entire database - calling fn(tdb, key, data) on each element.
1303   return -1 on error or the record count traversed
1304   if fn is NULL then it is not called
1305   a non-zero return value from fn() indicates that the traversal should stop
1306  */
1307int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1308{
1309	TDB_DATA key, dbuf;
1310	struct list_struct rec;
1311	struct tdb_traverse_lock tl = { NULL, 0, 0 };
1312	int ret, count = 0;
1313
1314	/* This was in the initializaton, above, but the IRIX compiler
1315	 * did not like it.  crh
1316	 */
1317	tl.next = tdb->travlocks.next;
1318
1319	/* fcntl locks don't stack: beware traverse inside traverse */
1320	tdb->travlocks.next = &tl;
1321
1322	/* tdb_next_lock places locks on the record returned, and its chain */
1323	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1324		count++;
1325		/* now read the full record */
1326		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1327					  rec.key_len + rec.data_len);
1328		if (!key.dptr) {
1329			ret = -1;
1330			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1331				goto out;
1332			if (unlock_record(tdb, tl.off) != 0)
1333				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1334			goto out;
1335		}
1336		key.dsize = rec.key_len;
1337		dbuf.dptr = key.dptr + rec.key_len;
1338		dbuf.dsize = rec.data_len;
1339
1340		/* Drop chain lock, call out */
1341		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1342			ret = -1;
1343			goto out;
1344		}
1345		if (fn && fn(tdb, key, dbuf, private)) {
1346			/* They want us to terminate traversal */
1347			ret = count;
1348			if (unlock_record(tdb, tl.off) != 0) {
1349				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1350				ret = -1;
1351			}
1352			tdb->travlocks.next = tl.next;
1353			SAFE_FREE(key.dptr);
1354			return count;
1355		}
1356		SAFE_FREE(key.dptr);
1357	}
1358out:
1359	tdb->travlocks.next = tl.next;
1360	if (ret < 0)
1361		return -1;
1362	else
1363		return count;
1364}
1365
1366/* find the first entry in the database and return its key */
1367TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1368{
1369	TDB_DATA key;
1370	struct list_struct rec;
1371
1372	/* release any old lock */
1373	if (unlock_record(tdb, tdb->travlocks.off) != 0)
1374		return tdb_null;
1375	tdb->travlocks.off = tdb->travlocks.hash = 0;
1376
1377	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1378		return tdb_null;
1379	/* now read the key */
1380	key.dsize = rec.key_len;
1381	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1382	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1383		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1384	return key;
1385}
1386
1387/* find the next entry in the database, returning its key */
1388TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1389{
1390	u32 oldhash;
1391	TDB_DATA key = tdb_null;
1392	struct list_struct rec;
1393	char *k = NULL;
1394
1395	/* Is locked key the old key?  If so, traverse will be reliable. */
1396	if (tdb->travlocks.off) {
1397		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1398			return tdb_null;
1399		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1400		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1401					    rec.key_len))
1402		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1403			/* No, it wasn't: unlock it and start from scratch */
1404			if (unlock_record(tdb, tdb->travlocks.off) != 0)
1405				return tdb_null;
1406			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1407				return tdb_null;
1408			tdb->travlocks.off = 0;
1409		}
1410
1411		SAFE_FREE(k);
1412	}
1413
1414	if (!tdb->travlocks.off) {
1415		/* No previous element: do normal find, and lock record */
1416		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1417		if (!tdb->travlocks.off)
1418			return tdb_null;
1419		tdb->travlocks.hash = BUCKET(rec.full_hash);
1420		if (lock_record(tdb, tdb->travlocks.off) != 0) {
1421			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1422			return tdb_null;
1423		}
1424	}
1425	oldhash = tdb->travlocks.hash;
1426
1427	/* Grab next record: locks chain and returned record,
1428	   unlocks old record */
1429	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1430		key.dsize = rec.key_len;
1431		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1432					  key.dsize);
1433		/* Unlock the chain of this new record */
1434		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1435			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1436	}
1437	/* Unlock the chain of old record */
1438	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1439		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1440	return key;
1441}
1442
1443/* delete an entry in the database given a key */
1444static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1445{
1446	tdb_off rec_ptr;
1447	struct list_struct rec;
1448	int ret;
1449
1450	if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1451		return -1;
1452	ret = do_delete(tdb, rec_ptr, &rec);
1453	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1454		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1455	return ret;
1456}
1457
1458int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1459{
1460	u32 hash = tdb->hash_fn(&key);
1461	return tdb_delete_hash(tdb, key, hash);
1462}
1463
1464/* store an element in the database, replacing any existing element
1465   with the same key
1466
1467   return 0 on success, -1 on failure
1468*/
1469int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1470{
1471	struct list_struct rec;
1472	u32 hash;
1473	tdb_off rec_ptr;
1474	char *p = NULL;
1475	int ret = 0;
1476
1477	/* find which hash bucket it is in */
1478	hash = tdb->hash_fn(&key);
1479	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1480		return -1;
1481
1482	/* check for it existing, on insert. */
1483	if (flag == TDB_INSERT) {
1484		if (tdb_exists_hash(tdb, key, hash)) {
1485			tdb->ecode = TDB_ERR_EXISTS;
1486			goto fail;
1487		}
1488	} else {
1489		/* first try in-place update, on modify or replace. */
1490		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1491			goto out;
1492		if (tdb->ecode == TDB_ERR_NOEXIST &&
1493		    flag == TDB_MODIFY) {
1494			/* if the record doesn't exist and we are in TDB_MODIFY mode then
1495			 we should fail the store */
1496			goto fail;
1497	}
1498	}
1499	/* reset the error code potentially set by the tdb_update() */
1500	tdb->ecode = TDB_SUCCESS;
1501
1502	/* delete any existing record - if it doesn't exist we don't
1503           care.  Doing this first reduces fragmentation, and avoids
1504           coalescing with `allocated' block before it's updated. */
1505	if (flag != TDB_INSERT)
1506		tdb_delete_hash(tdb, key, hash);
1507
1508	/* Copy key+value *before* allocating free space in case malloc
1509	   fails and we are left with a dead spot in the tdb. */
1510
1511	if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1512		tdb->ecode = TDB_ERR_OOM;
1513		goto fail;
1514	}
1515
1516	memcpy(p, key.dptr, key.dsize);
1517	if (dbuf.dsize)
1518		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1519
1520	/* we have to allocate some space */
1521	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1522		goto fail;
1523
1524	/* Read hash top into next ptr */
1525	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1526		goto fail;
1527
1528	rec.key_len = key.dsize;
1529	rec.data_len = dbuf.dsize;
1530	rec.full_hash = hash;
1531	rec.magic = TDB_MAGIC;
1532
1533	/* write out and point the top of the hash chain at it */
1534	if (rec_write(tdb, rec_ptr, &rec) == -1
1535	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1536	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1537		/* Need to tdb_unallocate() here */
1538		goto fail;
1539	}
1540 out:
1541	SAFE_FREE(p);
1542	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1543	return ret;
1544fail:
1545	ret = -1;
1546	goto out;
1547}
1548
1549/* Attempt to append data to an entry in place - this only works if the new data size
1550   is <= the old data size and the key exists.
1551   on failure return -1. Record must be locked before calling.
1552*/
1553static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1554{
1555	struct list_struct rec;
1556	tdb_off rec_ptr;
1557
1558	/* find entry */
1559	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1560		return -1;
1561
1562	/* Append of 0 is always ok. */
1563	if (new_dbuf.dsize == 0)
1564		return 0;
1565
1566	/* must be long enough for key, old data + new data and tailer */
1567	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1568		/* No room. */
1569		tdb->ecode = TDB_SUCCESS; /* Not really an error */
1570		return -1;
1571	}
1572
1573	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1574		      new_dbuf.dptr, new_dbuf.dsize) == -1)
1575		return -1;
1576
1577	/* update size */
1578	rec.data_len += new_dbuf.dsize;
1579	return rec_write(tdb, rec_ptr, &rec);
1580}
1581
1582/* Append to an entry. Create if not exist. */
1583
1584int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1585{
1586	struct list_struct rec;
1587	u32 hash;
1588	tdb_off rec_ptr;
1589	char *p = NULL;
1590	int ret = 0;
1591	size_t new_data_size = 0;
1592
1593	/* find which hash bucket it is in */
1594	hash = tdb->hash_fn(&key);
1595	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1596		return -1;
1597
1598	/* first try in-place. */
1599	if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1600		goto out;
1601
1602	/* reset the error code potentially set by the tdb_append_inplace() */
1603	tdb->ecode = TDB_SUCCESS;
1604
1605	/* find entry */
1606	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1607		if (tdb->ecode != TDB_ERR_NOEXIST)
1608			goto fail;
1609
1610		/* Not found - create. */
1611
1612		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1613		goto out;
1614	}
1615
1616	new_data_size = rec.data_len + new_dbuf.dsize;
1617
1618	/* Copy key+old_value+value *before* allocating free space in case malloc
1619	   fails and we are left with a dead spot in the tdb. */
1620
1621	if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1622		tdb->ecode = TDB_ERR_OOM;
1623		goto fail;
1624	}
1625
1626	/* Copy the key in place. */
1627	memcpy(p, key.dptr, key.dsize);
1628
1629	/* Now read the old data into place. */
1630	if (rec.data_len &&
1631		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1632			goto fail;
1633
1634	/* Finally append the new data. */
1635	if (new_dbuf.dsize)
1636		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1637
1638	/* delete any existing record - if it doesn't exist we don't
1639           care.  Doing this first reduces fragmentation, and avoids
1640           coalescing with `allocated' block before it's updated. */
1641
1642	tdb_delete_hash(tdb, key, hash);
1643
1644	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1645		goto fail;
1646
1647	/* Read hash top into next ptr */
1648	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1649		goto fail;
1650
1651	rec.key_len = key.dsize;
1652	rec.data_len = new_data_size;
1653	rec.full_hash = hash;
1654	rec.magic = TDB_MAGIC;
1655
1656	/* write out and point the top of the hash chain at it */
1657	if (rec_write(tdb, rec_ptr, &rec) == -1
1658	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1659	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1660		/* Need to tdb_unallocate() here */
1661		goto fail;
1662	}
1663
1664 out:
1665	SAFE_FREE(p);
1666	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1667	return ret;
1668
1669fail:
1670	ret = -1;
1671	goto out;
1672}
1673
1674static int tdb_already_open(dev_t device,
1675			    ino_t ino)
1676{
1677	TDB_CONTEXT *i;
1678
1679	for (i = tdbs; i; i = i->next) {
1680		if (i->device == device && i->inode == ino) {
1681			return 1;
1682		}
1683	}
1684
1685	return 0;
1686}
1687
1688/* This is based on the hash algorithm from gdbm */
1689static u32 default_tdb_hash(TDB_DATA *key)
1690{
1691	u32 value;	/* Used to compute the hash value.  */
1692	u32   i;	/* Used to cycle through random values. */
1693
1694	/* Set the initial value from the key size. */
1695	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
1696		value = (value + (key->dptr[i] << (i*5 % 24)));
1697
1698	return (1103515243 * value + 12345);
1699}
1700
1701/* open the database, creating it if necessary
1702
1703   The open_flags and mode are passed straight to the open call on the
1704   database file. A flags value of O_WRONLY is invalid. The hash size
1705   is advisory, use zero for a default value.
1706
1707   Return is NULL on error, in which case errno is also set.  Don't
1708   try to call tdb_error or tdb_errname, just do strerror(errno).
1709
1710   @param name may be NULL for internal databases. */
1711TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1712		      int open_flags, mode_t mode)
1713{
1714	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
1715}
1716
1717
/* Open (and if necessary create) a database, with an optional logging
   function and an optional key hash function.

   name        path of the database file
   hash_size   number of hash chains for a newly created database;
               0 selects DEFAULT_HASH_SIZE
   tdb_flags   TDB_* behaviour flags (TDB_INTERNAL, TDB_CLEAR_IF_FIRST,
               TDB_CONVERT, ... are examined below)
   open_flags  flags for open(2); O_WRONLY is rejected with EINVAL
   mode        permission bits for open(2)
   log_fn      logging callback stored in the context, may be NULL
   hash_fn     key hash function; NULL selects default_tdb_hash

   On success the new context is linked into the global `tdbs` list and
   returned.  On failure everything acquired so far is released, NULL
   is returned and errno describes the problem. */
TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
			 int open_flags, mode_t mode,
			 tdb_log_func log_fn,
			 tdb_hash_func hash_fn)
{
	TDB_CONTEXT *tdb;
	struct stat st;
	int rev = 0, locked = 0;
	unsigned char *vp;
	u32 vertest;

	if (!(tdb = calloc(1, sizeof *tdb))) {
		/* Can't log this */
		errno = ENOMEM;
		goto fail;
	}
	/* Put the context into a state the `fail` path can clean up. */
	tdb->fd = -1;
	tdb->name = NULL;
	tdb->map_ptr = NULL;
	tdb->flags = tdb_flags;
	tdb->open_flags = open_flags;
	tdb->log_fn = log_fn;
	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;

	if ((open_flags & O_ACCMODE) == O_WRONLY) {
		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
			 name));
		errno = EINVAL;
		goto fail;
	}

	if (hash_size == 0)
		hash_size = DEFAULT_HASH_SIZE;
	if ((open_flags & O_ACCMODE) == O_RDONLY) {
		tdb->read_only = 1;
		/* read only databases don't do locking or clear if first */
		tdb->flags |= TDB_NOLOCK;
		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
	}

	/* internal databases don't mmap or lock, and start off cleared */
	if (tdb->flags & TDB_INTERNAL) {
		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
		if (tdb_new_database(tdb, hash_size) != 0) {
			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
			goto fail;
		}
		goto internal;
	}

	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
			 name, strerror(errno)));
		goto fail;	/* errno set by open(2) */
	}

	/* ensure there is only one process initialising at once */
	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
			 name, strerror(errno)));
		goto fail;	/* errno set by tdb_brlock */
	}

	/* we need to zero database if we are the only one with it open.
	   `locked` records whether we obtained the exclusive ACTIVE_LOCK.
	   NOTE(review): this tests the caller's tdb_flags rather than
	   tdb->flags, so the TDB_CLEAR_IF_FIRST bit that was cleared above
	   for read-only opens is still honoured here -- confirm that this
	   is intended. */
	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
		open_flags |= O_CREAT;
		if (ftruncate(tdb->fd, 0) == -1) {
			TDB_LOG((tdb, 0, "tdb_open_ex: "
				 "failed to truncate %s: %s\n",
				 name, strerror(errno)));
			goto fail; /* errno set by ftruncate */
		}
	}

	/* Read and validate the header: a short read, a wrong magic string
	   or a version that matches neither TDB_VERSION nor its
	   byte-reversed form means there is no usable database yet.
	   `rev` is set when the version matched only after byte reversal. */
	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
	    || (tdb->header.version != TDB_VERSION
		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
		/* it's not a valid database - possibly initialise it */
		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
			errno = EIO; /* ie bad format or something */
			goto fail;
		}
		rev = (tdb->flags & TDB_CONVERT);
	}
	/* Assemble the stored version bytes most-significant-first; if that
	   gives TDB_VERSION the TDB_BIGENDIAN flag is set. */
	vp = (unsigned char *)&tdb->header.version;
	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
		  (((u32)vp[2]) << 8) | (u32)vp[3];
	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
	if (!rev)
		tdb->flags &= ~TDB_CONVERT;
	else {
		tdb->flags |= TDB_CONVERT;
		convert(&tdb->header, sizeof(tdb->header));
	}
	if (fstat(tdb->fd, &st) == -1)
		goto fail;

	/* Is it already in the open list?  If so, fail. */
	if (tdb_already_open(st.st_dev, st.st_ino)) {
		TDB_LOG((tdb, 2, "tdb_open_ex: "
			 "%s (%d,%d) is already open in this process\n",
			 name, (int)st.st_dev, (int)st.st_ino));
		errno = EBUSY;
		goto fail;
	}

	if (!(tdb->name = (char *)strdup(name))) {
		errno = ENOMEM;
		goto fail;
	}

	tdb->map_size = st.st_size;
	tdb->device = st.st_dev;
	tdb->inode = st.st_ino;
	/* one entry per hash chain plus one extra */
	tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
	if (!tdb->locked) {
		TDB_LOG((tdb, 2, "tdb_open_ex: "
			 "failed to allocate lock structure for %s\n",
			 name));
		errno = ENOMEM;
		goto fail;
	}
	tdb_mmap(tdb);
	if (locked) {
		if (!tdb->read_only)
			if (tdb_clear_spinlocks(tdb) != 0) {
				TDB_LOG((tdb, 0, "tdb_open_ex: "
				"failed to clear spinlock\n"));
				goto fail;
			}
		/* Give up the exclusive ACTIVE_LOCK taken above.
		   NOTE(review): the message below says "take" although
		   this call releases the lock (F_UNLCK). */
		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
			TDB_LOG((tdb, 0, "tdb_open_ex: "
				 "failed to take ACTIVE_LOCK on %s: %s\n",
				 name, strerror(errno)));
			goto fail;
		}

	}

	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
	   we didn't get the initial exclusive lock as we need to let all other
	   users know we're using it. */

	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
		/* leave this lock in place to indicate it's in use */
		if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
			goto fail;
	}


 internal:
	/* Internal (memory-only) databases skip all the code above to
	 * do with disk files, and resume here by releasing their
	 * global lock and hooking into the active list.
	 * (Internal databases never took the global lock; presumably
	 * tdb_brlock does nothing under TDB_NOLOCK -- TODO confirm.) */
	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
		goto fail;
	tdb->next = tdbs;
	tdbs = tdb;
	return tdb;

 fail:
	/* Common error exit: errno is saved first so that the cleanup
	   calls below cannot clobber the value reported to the caller. */
	{ int save_errno = errno;

	if (!tdb)
		return NULL;

	if (tdb->map_ptr) {
		if (tdb->flags & TDB_INTERNAL)
			SAFE_FREE(tdb->map_ptr);
		else
			tdb_munmap(tdb);
	}
	SAFE_FREE(tdb->name);
	if (tdb->fd != -1)
		if (close(tdb->fd) != 0)
			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
	SAFE_FREE(tdb->locked);
	SAFE_FREE(tdb);
	errno = save_errno;
	return NULL;
	}
}
1903
1904/**
1905 * Close a database.
1906 *
1907 * @returns -1 for error; 0 for success.
1908 **/
1909int tdb_close(TDB_CONTEXT *tdb)
1910{
1911	TDB_CONTEXT **i;
1912	int ret = 0;
1913
1914	if (tdb->map_ptr) {
1915		if (tdb->flags & TDB_INTERNAL)
1916			SAFE_FREE(tdb->map_ptr);
1917		else
1918			tdb_munmap(tdb);
1919	}
1920	SAFE_FREE(tdb->name);
1921	if (tdb->fd != -1)
1922		ret = close(tdb->fd);
1923	SAFE_FREE(tdb->locked);
1924
1925	/* Remove from contexts list */
1926	for (i = &tdbs; *i; i = &(*i)->next) {
1927		if (*i == tdb) {
1928			*i = tdb->next;
1929			break;
1930		}
1931	}
1932
1933	memset(tdb, 0, sizeof(*tdb));
1934	SAFE_FREE(tdb);
1935
1936	return ret;
1937}
1938
1939/* lock/unlock entire database */
1940int tdb_lockall(TDB_CONTEXT *tdb)
1941{
1942	u32 i;
1943
1944	/* There are no locks on read-only dbs */
1945	if (tdb->read_only)
1946		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1947	for (i = 0; i < tdb->header.hash_size; i++)
1948		if (tdb_lock(tdb, i, F_WRLCK))
1949			break;
1950
1951	/* If error, release locks we have... */
1952	if (i < tdb->header.hash_size) {
1953		u32 j;
1954
1955		for ( j = 0; j < i; j++)
1956			tdb_unlock(tdb, j, F_WRLCK);
1957		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1958	}
1959
1960	return 0;
1961}
1962void tdb_unlockall(TDB_CONTEXT *tdb)
1963{
1964	u32 i;
1965	for (i=0; i < tdb->header.hash_size; i++)
1966		tdb_unlock(tdb, i, F_WRLCK);
1967}
1968
1969/* lock/unlock one hash chain. This is meant to be used to reduce
1970   contention - it cannot guarantee how many records will be locked */
1971int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1972{
1973	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1974}
1975
1976int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1977{
1978	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1979}
1980
1981int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1982{
1983	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1984}
1985
1986int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1987{
1988	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1989}
1990
1991
1992/* register a loging function */
1993void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1994{
1995	tdb->log_fn = fn;
1996}
1997
1998/* reopen a tdb - this can be used after a fork to ensure that we have an independent
1999   seek pointer from our parent and to re-establish locks */
2000int tdb_reopen(TDB_CONTEXT *tdb)
2001{
2002	struct stat st;
2003
2004	if (tdb->flags & TDB_INTERNAL)
2005		return 0; /* Nothing to do. */
2006	if (tdb_munmap(tdb) != 0) {
2007		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
2008		goto fail;
2009	}
2010	if (close(tdb->fd) != 0)
2011		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
2012	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
2013	if (tdb->fd == -1) {
2014		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
2015		goto fail;
2016	}
2017	if (fstat(tdb->fd, &st) != 0) {
2018		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
2019		goto fail;
2020	}
2021	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
2022		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
2023		goto fail;
2024	}
2025	tdb_mmap(tdb);
2026	if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
2027		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
2028		goto fail;
2029	}
2030
2031	return 0;
2032
2033fail:
2034	tdb_close(tdb);
2035	return -1;
2036}
2037
2038/* reopen all tdb's */
2039int tdb_reopen_all(void)
2040{
2041	TDB_CONTEXT *tdb;
2042
2043	for (tdb=tdbs; tdb; tdb = tdb->next) {
2044		/* Ensure no clear-if-first. */
2045		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2046		if (tdb_reopen(tdb) != 0)
2047			return -1;
2048	}
2049
2050	return 0;
2051}
2052