1 /*
2   Unix SMB/CIFS implementation.
3
4   trivial database library
5
6   Copyright (C) Andrew Tridgell              1999-2004
7   Copyright (C) Paul `Rusty' Russell		   2000
8   Copyright (C) Jeremy Allison			   2000-2003
9
10     ** NOTE! The following LGPL license applies to the tdb
11     ** library. This does NOT imply that all of Samba is released
12     ** under the LGPL
13
14   This library is free software; you can redistribute it and/or
15   modify it under the terms of the GNU Lesser General Public
16   License as published by the Free Software Foundation; either
17   version 2 of the License, or (at your option) any later version.
18
19   This library is distributed in the hope that it will be useful,
20   but WITHOUT ANY WARRANTY; without even the implied warranty of
21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22   Lesser General Public License for more details.
23
24   You should have received a copy of the GNU Lesser General Public
25   License along with this library; if not, write to the Free Software
26   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
27*/
28
29
30/* NOTE: If you use tdbs under valgrind, and in particular if you run
31 * tdbtorture, you may get spurious "uninitialized value" warnings.  I
32 * think this is because valgrind doesn't understand that the mmap'd
33 * area may be written to by other processes.  Memory can, from the
34 * point of view of the grinded process, spontaneously become
35 * initialized.
36 *
37 * I can think of a few solutions.  [mbp 20030311]
38 *
39 * 1 - Write suppressions for Valgrind so that it doesn't complain
40 * about this.  Probably the most reasonable but people need to
41 * remember to use them.
42 *
43 * 2 - Use IO not mmap when running under valgrind.  Not so nice.
44 *
45 * 3 - Use the special valgrind macros to mark memory as valid at the
46 * right time.  Probably too hard -- the process just doesn't know.
47 */
48
49#include <stdlib.h>
50#include <stdio.h>
51#include <fcntl.h>
52#include <unistd.h>
53#include <string.h>
54#include <fcntl.h>
55#include <errno.h>
56#include <sys/mman.h>
57#include <sys/stat.h>
58#include <signal.h>
59#include "tdb.h"
60#include "spinlock.h"
61
/* String copied into the magic_food field of a new database header */
#define TDB_MAGIC_FOOD "TDB file\n"
/* Value stored in the version field of a new database header */
#define TDB_VERSION (0x26011967 + 6)
/* Record magic values: in use, on the free list, and dead */
#define TDB_MAGIC (0x26011999U)
#define TDB_FREE_MAGIC (~TDB_MAGIC)
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
/* tdb_allocate() rounds the length up to a multiple of this when it
   splits a free record */
#define TDB_ALIGNMENT 4
/* tdb_allocate() only splits a free record when more than this many
   bytes would be left over */
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
/* NOTE(review): not used in this part of the file; presumably the hash
   size chosen when the caller does not give one - confirm in tdb_open */
#define DEFAULT_HASH_SIZE 131
/* tdb_expand() rounds the database size up to a multiple of this */
#define TDB_PAGE_SIZE 0x2000
/* File offset of the free list head: directly after the header */
#define FREELIST_TOP (sizeof(struct tdb_header))
/* Round x up to a multiple of a; only correct when a is a power of two */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* Reverse the byte order of a 32 bit value; x is evaluated several times */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
/* Magic tests on a struct list_struct pointer r */
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
/* File offset of the chain head for a hash value.  Expands BUCKET(), so
   it needs a local variable called "tdb" */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
/* Offset of the last chain head plus the spinlock area; tdb_free() only
   looks for a left-hand neighbour when the tailer lies beyond this */
#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
78
79
/* NB assumes there is a local variable called "tdb" that is the
 * current context, also takes doubly-parenthesized print-style
 * argument.  Nothing is called when tdb->log_fn is NULL; the
 * expression always evaluates to 0. */
#define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)

/* lock offsets */
/* NOTE(review): neither offset is used in this part of the file;
   presumably byte offsets for tdb_brlock() - confirm at the callers */
#define GLOBAL_LOCK 0
#define ACTIVE_LOCK 4

/* fallback for platforms whose <sys/mman.h> lacks MAP_FILE */
#ifndef MAP_FILE
#define MAP_FILE 0
#endif

/* fallback for platforms whose <sys/mman.h> lacks MAP_FAILED */
#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif

/* free memory if the pointer is valid and zero the pointer */
#ifndef SAFE_FREE
#define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
#endif

/* Reduce a hash value to a chain index; needs a local variable called
   "tdb" */
#define BUCKET(hash) ((hash) % tdb->header.hash_size)
/* Zero-initialised TDB_DATA; tdb_fetch() returns it when the key is not
   found */
TDB_DATA tdb_null;

/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static TDB_CONTEXT *tdbs = NULL;
107
108static int tdb_munmap(TDB_CONTEXT *tdb)
109{
110	if (tdb->flags & TDB_INTERNAL)
111		return 0;
112
113#ifdef HAVE_MMAP
114	if (tdb->map_ptr) {
115		int ret = munmap(tdb->map_ptr, tdb->map_size);
116		if (ret != 0)
117			return ret;
118	}
119#endif
120	tdb->map_ptr = NULL;
121	return 0;
122}
123
124static void tdb_mmap(TDB_CONTEXT *tdb)
125{
126	if (tdb->flags & TDB_INTERNAL)
127		return;
128
129#ifdef HAVE_MMAP
130	if (!(tdb->flags & TDB_NOMMAP)) {
131		tdb->map_ptr = mmap(NULL, tdb->map_size,
132				    PROT_READ|(tdb->read_only? 0:PROT_WRITE),
133				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
134
135		/*
136		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
137		 */
138
139		if (tdb->map_ptr == MAP_FAILED) {
140			tdb->map_ptr = NULL;
141			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
142				 tdb->map_size, strerror(errno)));
143		}
144	} else {
145		tdb->map_ptr = NULL;
146	}
147#else
148	tdb->map_ptr = NULL;
149#endif
150}
151
152/* Endian conversion: we only ever deal with 4 byte quantities */
153static void *convert(void *buf, u32 size)
154{
155	u32 i, *p = buf;
156	for (i = 0; i < size / 4; i++)
157		p[i] = TDB_BYTEREV(p[i]);
158	return buf;
159}
160#define DOCONV() (tdb->flags & TDB_CONVERT)
161#define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
162
/* the body of the database is made of one list_struct for the free space
   plus a separate data list for each hash value.

   This is the header that sits in front of every record; it is read
   and written as a run of 4 byte words (see convert()). */
struct list_struct {
	tdb_off next; /* offset of the next record in the list */
	tdb_len rec_len; /* total byte length of record */
	tdb_len key_len; /* byte length of key */
	tdb_len data_len; /* byte length of data */
	u32 full_hash; /* the full 32 bit hash of the key */
	u32 magic;   /* try to catch errors: TDB_MAGIC, TDB_FREE_MAGIC
			or TDB_DEAD_MAGIC */
	/* the following union is implied:
		union {
			char record[rec_len];
			struct {
				char key[key_len];
				char data[data_len];
			}
			u32 totalsize; (tailer)
		}
	   update_tailer() stores sizeof(struct list_struct) + rec_len in
	   the last tdb_off of the record.
	*/
};
183
/***************************************************************
 Allow a caller to set an "alarm" flag that tdb can check to abort
 a blocking lock on SIGALRM.

 Only the pointer is stored, so the flag itself stays owned by the
 caller.  tdb_brlock() stops retrying an interrupted fcntl() once
 *palarm is non-zero; while the pointer is NULL it retries on every
 EINTR.
***************************************************************/

static sig_atomic_t *palarm_fired;

void tdb_set_lock_alarm(sig_atomic_t *palarm)
{
	palarm_fired = palarm;
}
195
196/* a byte range locking function - return 0 on success
197   this functions locks/unlocks 1 byte at the specified offset.
198
199   On error, errno is also set so that errors are passed back properly
200   through tdb_open(). */
201static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
202		      int rw_type, int lck_type, int probe)
203{
204	struct flock fl;
205	int ret;
206
207	if (tdb->flags & TDB_NOLOCK)
208		return 0;
209	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
210		errno = EACCES;
211		return -1;
212	}
213
214	fl.l_type = rw_type;
215	fl.l_whence = SEEK_SET;
216	fl.l_start = offset;
217	fl.l_len = 1;
218	fl.l_pid = 0;
219
220	do {
221		ret = fcntl(tdb->fd,lck_type,&fl);
222		if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
223			break;
224	} while (ret == -1 && errno == EINTR);
225
226	if (ret == -1) {
227		if (!probe && lck_type != F_SETLK) {
228			/* Ensure error code is set for log fun to examine. */
229			if (errno == EINTR && palarm_fired && *palarm_fired)
230				tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
231			else
232				tdb->ecode = TDB_ERR_LOCK;
233			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
234				 tdb->fd, offset, rw_type, lck_type));
235		}
236		/* Was it an alarm timeout ? */
237		if (errno == EINTR && palarm_fired && *palarm_fired) {
238			TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
239				 tdb->fd, offset, rw_type, lck_type));
240			return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
241		}
242		/* Otherwise - generic lock error. errno set by fcntl.
243		 * EAGAIN is an expected return from non-blocking
244		 * locks. */
245		if (errno != EAGAIN) {
246			TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
247				 tdb->fd, offset, rw_type, lck_type,
248				 strerror(errno)));
249		}
250		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
251	}
252	return 0;
253}
254
255/* lock a list in the database. list -1 is the alloc list */
256static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
257{
258	if (list < -1 || list >= (int)tdb->header.hash_size) {
259		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
260			   list, ltype));
261		return -1;
262	}
263	if (tdb->flags & TDB_NOLOCK)
264		return 0;
265
266	/* Since fcntl locks don't nest, we do a lock for the first one,
267	   and simply bump the count for future ones */
268	if (tdb->locked[list+1].count == 0) {
269		if (!tdb->read_only && tdb->header.rwlocks) {
270			if (tdb_spinlock(tdb, list, ltype)) {
271				TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list %d ltype=%d\n",
272					   list, ltype));
273				return -1;
274			}
275		} else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
276			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
277					   list, ltype, strerror(errno)));
278			return -1;
279		}
280		tdb->locked[list+1].ltype = ltype;
281	}
282	tdb->locked[list+1].count++;
283	return 0;
284}
285
286/* unlock the database: returns void because it's too late for errors. */
287	/* changed to return int it may be interesting to know there
288	   has been an error  --simo */
289static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
290{
291	int ret = -1;
292
293	if (tdb->flags & TDB_NOLOCK)
294		return 0;
295
296	/* Sanity checks */
297	if (list < -1 || list >= (int)tdb->header.hash_size) {
298		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
299		return ret;
300	}
301
302	if (tdb->locked[list+1].count==0) {
303		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
304		return ret;
305	}
306
307	if (tdb->locked[list+1].count == 1) {
308		/* Down to last nested lock: unlock underneath */
309		if (!tdb->read_only && tdb->header.rwlocks) {
310			ret = tdb_spinunlock(tdb, list, ltype);
311		} else {
312			ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
313		}
314	} else {
315		ret = 0;
316	}
317	tdb->locked[list+1].count--;
318
319	if (ret)
320		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
321	return ret;
322}
323
324/* check for an out of bounds access - if it is out of bounds then
325   see if the database has been expanded by someone else and expand
326   if necessary
327   note that "len" is the minimum length needed for the db
328*/
329static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
330{
331	struct stat st;
332	if (len <= tdb->map_size)
333		return 0;
334	if (tdb->flags & TDB_INTERNAL) {
335		if (!probe) {
336			/* Ensure ecode is set for log fn. */
337			tdb->ecode = TDB_ERR_IO;
338			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
339				 (int)len, (int)tdb->map_size));
340		}
341		return TDB_ERRCODE(TDB_ERR_IO, -1);
342	}
343
344	if (fstat(tdb->fd, &st) == -1)
345		return TDB_ERRCODE(TDB_ERR_IO, -1);
346
347	if (st.st_size < (size_t)len) {
348		if (!probe) {
349			/* Ensure ecode is set for log fn. */
350			tdb->ecode = TDB_ERR_IO;
351			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
352				 (int)len, (int)st.st_size));
353		}
354		return TDB_ERRCODE(TDB_ERR_IO, -1);
355	}
356
357	/* Unmap, update size, remap */
358	if (tdb_munmap(tdb) == -1)
359		return TDB_ERRCODE(TDB_ERR_IO, -1);
360	tdb->map_size = st.st_size;
361	tdb_mmap(tdb);
362	return 0;
363}
364
365/* write a lump of data at a specified offset */
366static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
367{
368	if (tdb_oob(tdb, off + len, 0) != 0)
369		return -1;
370
371	if (tdb->map_ptr)
372		memcpy(off + (char *)tdb->map_ptr, buf, len);
373#ifdef HAVE_PWRITE
374	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
375#else
376	else if (lseek(tdb->fd, off, SEEK_SET) != off
377		 || write(tdb->fd, buf, len) != (ssize_t)len) {
378#endif
379		/* Ensure ecode is set for log fn. */
380		tdb->ecode = TDB_ERR_IO;
381		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
382			   off, len, strerror(errno)));
383		return TDB_ERRCODE(TDB_ERR_IO, -1);
384	}
385	return 0;
386}
387
388/* read a lump of data at a specified offset, maybe convert */
389static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
390{
391	if (tdb_oob(tdb, off + len, 0) != 0)
392		return -1;
393
394	if (tdb->map_ptr)
395		memcpy(buf, off + (char *)tdb->map_ptr, len);
396#ifdef HAVE_PREAD
397	else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
398#else
399	else if (lseek(tdb->fd, off, SEEK_SET) != off
400		 || read(tdb->fd, buf, len) != (ssize_t)len) {
401#endif
402		/* Ensure ecode is set for log fn. */
403		tdb->ecode = TDB_ERR_IO;
404		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
405			   off, len, strerror(errno)));
406		return TDB_ERRCODE(TDB_ERR_IO, -1);
407	}
408	if (cv)
409		convert(buf, len);
410	return 0;
411}
412
413/* read a lump of data, allocating the space for it */
414static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
415{
416	char *buf;
417
418	if (!(buf = malloc(len))) {
419		/* Ensure ecode is set for log fn. */
420		tdb->ecode = TDB_ERR_OOM;
421		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
422			   len, strerror(errno)));
423		return TDB_ERRCODE(TDB_ERR_OOM, buf);
424	}
425	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
426		SAFE_FREE(buf);
427		return NULL;
428	}
429	return buf;
430}
431
432/* read/write a tdb_off */
433static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
434{
435	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
436}
437static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
438{
439	tdb_off off = *d;
440	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
441}
442
443/* read/write a record */
444static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
445{
446	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
447		return -1;
448	if (TDB_BAD_MAGIC(rec)) {
449		/* Ensure ecode is set for log fn. */
450		tdb->ecode = TDB_ERR_CORRUPT;
451		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
452		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
453	}
454	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
455}
456static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
457{
458	struct list_struct r = *rec;
459	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
460}
461
462/* read a freelist record and check for simple errors */
463static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
464{
465	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
466		return -1;
467
468	if (rec->magic == TDB_MAGIC) {
469		/* this happens when a app is showdown while deleting a record - we should
470		   not completely fail when this happens */
471		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
472			 rec->magic, off));
473		rec->magic = TDB_FREE_MAGIC;
474		if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
475			return -1;
476	}
477
478	if (rec->magic != TDB_FREE_MAGIC) {
479		/* Ensure ecode is set for log fn. */
480		tdb->ecode = TDB_ERR_CORRUPT;
481		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
482			   rec->magic, off));
483		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
484	}
485	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
486		return -1;
487	return 0;
488}
489
490/* update a record tailer (must hold allocation lock) */
491static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
492			 const struct list_struct *rec)
493{
494	tdb_off totalsize;
495
496	/* Offset of tailer from record header */
497	totalsize = sizeof(*rec) + rec->rec_len;
498	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
499			 &totalsize);
500}
501
502static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
503{
504	struct list_struct rec;
505	tdb_off tailer_ofs, tailer;
506
507	if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
508		printf("ERROR: failed to read record at %u\n", offset);
509		return 0;
510	}
511
512	printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
513	       offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
514
515	tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
516	if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
517		printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
518		return rec.next;
519	}
520
521	if (tailer != rec.rec_len + sizeof(rec)) {
522		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
523				(unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
524	}
525	return rec.next;
526}
527
528static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
529{
530	tdb_off rec_ptr, top;
531
532	top = TDB_HASH_TOP(i);
533
534	if (tdb_lock(tdb, i, F_WRLCK) != 0)
535		return -1;
536
537	if (ofs_read(tdb, top, &rec_ptr) == -1)
538		return tdb_unlock(tdb, i, F_WRLCK);
539
540	if (rec_ptr)
541		printf("hash=%d\n", i);
542
543	while (rec_ptr) {
544		rec_ptr = tdb_dump_record(tdb, rec_ptr);
545	}
546
547	return tdb_unlock(tdb, i, F_WRLCK);
548}
549
550void tdb_dump_all(TDB_CONTEXT *tdb)
551{
552	int i;
553	for (i=0;i<tdb->header.hash_size;i++) {
554		tdb_dump_chain(tdb, i);
555	}
556	printf("freelist:\n");
557	tdb_dump_chain(tdb, -1);
558}
559
560int tdb_printfreelist(TDB_CONTEXT *tdb)
561{
562	int ret;
563	long total_free = 0;
564	tdb_off offset, rec_ptr;
565	struct list_struct rec;
566
567	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
568		return ret;
569
570	offset = FREELIST_TOP;
571
572	/* read in the freelist top */
573	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
574		tdb_unlock(tdb, -1, F_WRLCK);
575		return 0;
576	}
577
578	printf("freelist top=[0x%08x]\n", rec_ptr );
579	while (rec_ptr) {
580		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
581			tdb_unlock(tdb, -1, F_WRLCK);
582			return -1;
583		}
584
585		if (rec.magic != TDB_FREE_MAGIC) {
586			printf("bad magic 0x%08x in free list\n", rec.magic);
587			tdb_unlock(tdb, -1, F_WRLCK);
588			return -1;
589		}
590
591		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
592		total_free += rec.rec_len;
593
594		/* move to the next record */
595		rec_ptr = rec.next;
596	}
597	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
598               (int)total_free);
599
600	return tdb_unlock(tdb, -1, F_WRLCK);
601}
602
603/* Remove an element from the freelist.  Must have alloc lock. */
604static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
605{
606	tdb_off last_ptr, i;
607
608	/* read in the freelist top */
609	last_ptr = FREELIST_TOP;
610	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
611		if (i == off) {
612			/* We've found it! */
613			return ofs_write(tdb, last_ptr, &next);
614		}
615		/* Follow chain (next offset is at start of record) */
616		last_ptr = i;
617	}
618	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
619	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
620}
621
622/* Add an element into the freelist. Merge adjacent records if
623   neccessary. */
624static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
625{
626	tdb_off right, left;
627
628	/* Allocation and tailer lock */
629	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
630		return -1;
631
632	/* set an initial tailer, so if we fail we don't leave a bogus record */
633	if (update_tailer(tdb, offset, rec) != 0) {
634		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
635		goto fail;
636	}
637
638	/* Look right first (I'm an Australian, dammit) */
639	right = offset + sizeof(*rec) + rec->rec_len;
640	if (right + sizeof(*rec) <= tdb->map_size) {
641		struct list_struct r;
642
643		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
644			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
645			goto left;
646		}
647
648		/* If it's free, expand to include it. */
649		if (r.magic == TDB_FREE_MAGIC) {
650			if (remove_from_freelist(tdb, right, r.next) == -1) {
651				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
652				goto left;
653			}
654			rec->rec_len += sizeof(r) + r.rec_len;
655		}
656	}
657
658left:
659	/* Look left */
660	left = offset - sizeof(tdb_off);
661	if (left > TDB_DATA_START(tdb->header.hash_size)) {
662		struct list_struct l;
663		tdb_off leftsize;
664
665		/* Read in tailer and jump back to header */
666		if (ofs_read(tdb, left, &leftsize) == -1) {
667			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
668			goto update;
669		}
670		left = offset - leftsize;
671
672		/* Now read in record */
673		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
674			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
675			goto update;
676		}
677
678		/* If it's free, expand to include it. */
679		if (l.magic == TDB_FREE_MAGIC) {
680			if (remove_from_freelist(tdb, left, l.next) == -1) {
681				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
682				goto update;
683			} else {
684				offset = left;
685				rec->rec_len += leftsize;
686			}
687		}
688	}
689
690update:
691	if (update_tailer(tdb, offset, rec) == -1) {
692		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
693		goto fail;
694	}
695
696	/* Now, prepend to free list */
697	rec->magic = TDB_FREE_MAGIC;
698
699	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
700	    rec_write(tdb, offset, rec) == -1 ||
701	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
702		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
703		goto fail;
704	}
705
706	/* And we're done. */
707	tdb_unlock(tdb, -1, F_WRLCK);
708	return 0;
709
710 fail:
711	tdb_unlock(tdb, -1, F_WRLCK);
712	return -1;
713}
714
715
716/* expand a file.  we prefer to use ftruncate, as that is what posix
717  says to use for mmap expansion */
718static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
719{
720	char buf[1024];
721#if HAVE_FTRUNCATE_EXTEND
722	if (ftruncate(tdb->fd, size+addition) != 0) {
723		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
724			   size+addition, strerror(errno)));
725		return -1;
726	}
727#else
728	char b = 0;
729
730#ifdef HAVE_PWRITE
731	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
732#else
733	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
734	    write(tdb->fd, &b, 1) != 1) {
735#endif
736		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
737			   size+addition, strerror(errno)));
738		return -1;
739	}
740#endif
741
742	/* now fill the file with something. This ensures that the file isn't sparse, which would be
743	   very bad if we ran out of disk. This must be done with write, not via mmap */
744	memset(buf, 0x42, sizeof(buf));
745	while (addition) {
746		int n = addition>sizeof(buf)?sizeof(buf):addition;
747#ifdef HAVE_PWRITE
748		int ret = pwrite(tdb->fd, buf, n, size);
749#else
750		int ret;
751		if (lseek(tdb->fd, size, SEEK_SET) != size)
752			return -1;
753		ret = write(tdb->fd, buf, n);
754#endif
755		if (ret != n) {
756			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
757				   n, strerror(errno)));
758			return -1;
759		}
760		addition -= n;
761		size += n;
762	}
763	return 0;
764}
765
766
767/* expand the database at least size bytes by expanding the underlying
768   file and doing the mmap again if necessary */
769static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
770{
771	struct list_struct rec;
772	tdb_off offset;
773
774	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
775		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
776		return -1;
777	}
778
779	/* must know about any previous expansions by another process */
780	tdb_oob(tdb, tdb->map_size + 1, 1);
781
782	/* always make room for at least 10 more records, and round
783           the database up to a multiple of TDB_PAGE_SIZE */
784	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
785
786	if (!(tdb->flags & TDB_INTERNAL))
787		tdb_munmap(tdb);
788
789	/*
790	 * We must ensure the file is unmapped before doing this
791	 * to ensure consistency with systems like OpenBSD where
792	 * writes and mmaps are not consistent.
793	 */
794
795	/* expand the file itself */
796	if (!(tdb->flags & TDB_INTERNAL)) {
797		if (expand_file(tdb, tdb->map_size, size) != 0)
798			goto fail;
799	}
800
801	tdb->map_size += size;
802
803	if (tdb->flags & TDB_INTERNAL)
804		tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
805	else {
806		/*
807		 * We must ensure the file is remapped before adding the space
808		 * to ensure consistency with systems like OpenBSD where
809		 * writes and mmaps are not consistent.
810		 */
811
812		/* We're ok if the mmap fails as we'll fallback to read/write */
813		tdb_mmap(tdb);
814	}
815
816	/* form a new freelist record */
817	memset(&rec,'\0',sizeof(rec));
818	rec.rec_len = size - sizeof(rec);
819
820	/* link it into the free list */
821	offset = tdb->map_size - size;
822	if (tdb_free(tdb, offset, &rec) == -1)
823		goto fail;
824
825	tdb_unlock(tdb, -1, F_WRLCK);
826	return 0;
827 fail:
828	tdb_unlock(tdb, -1, F_WRLCK);
829	return -1;
830}
831
832/* allocate some space from the free list. The offset returned points
833   to a unconnected list_struct within the database with room for at
834   least length bytes of total data
835
836   0 is returned if the space could not be allocated
837 */
838static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
839			    struct list_struct *rec)
840{
841	tdb_off rec_ptr, last_ptr, newrec_ptr;
842	struct list_struct newrec;
843
844	memset(&newrec, '\0', sizeof(newrec));
845
846	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
847		return 0;
848
849	/* Extra bytes required for tailer */
850	length += sizeof(tdb_off);
851
852 again:
853	last_ptr = FREELIST_TOP;
854
855	/* read in the freelist top */
856	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
857		goto fail;
858
859	/* keep looking until we find a freelist record big enough */
860	while (rec_ptr) {
861		if (rec_free_read(tdb, rec_ptr, rec) == -1)
862			goto fail;
863
864		if (rec->rec_len >= length) {
865			/* found it - now possibly split it up  */
866			if (rec->rec_len > length + MIN_REC_SIZE) {
867				/* Length of left piece */
868				length = TDB_ALIGN(length, TDB_ALIGNMENT);
869
870				/* Right piece to go on free list */
871				newrec.rec_len = rec->rec_len
872					- (sizeof(*rec) + length);
873				newrec_ptr = rec_ptr + sizeof(*rec) + length;
874
875				/* And left record is shortened */
876				rec->rec_len = length;
877			} else
878				newrec_ptr = 0;
879
880			/* Remove allocated record from the free list */
881			if (ofs_write(tdb, last_ptr, &rec->next) == -1)
882				goto fail;
883
884			/* Update header: do this before we drop alloc
885                           lock, otherwise tdb_free() might try to
886                           merge with us, thinking we're free.
887                           (Thanks Jeremy Allison). */
888			rec->magic = TDB_MAGIC;
889			if (rec_write(tdb, rec_ptr, rec) == -1)
890				goto fail;
891
892			/* Did we create new block? */
893			if (newrec_ptr) {
894				/* Update allocated record tailer (we
895                                   shortened it). */
896				if (update_tailer(tdb, rec_ptr, rec) == -1)
897					goto fail;
898
899				/* Free new record */
900				if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
901					goto fail;
902			}
903
904			/* all done - return the new record offset */
905			tdb_unlock(tdb, -1, F_WRLCK);
906			return rec_ptr;
907		}
908		/* move to the next record */
909		last_ptr = rec_ptr;
910		rec_ptr = rec->next;
911	}
912	/* we didn't find enough space. See if we can expand the
913	   database and if we can then try again */
914	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
915		goto again;
916 fail:
917	tdb_unlock(tdb, -1, F_WRLCK);
918	return 0;
919}
920
921/* initialise a new database with a specified hash size */
922static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
923{
924	struct tdb_header *newdb;
925	int size, ret = -1;
926
927	/* We make it up in memory, then write it out if not internal */
928	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
929	if (!(newdb = calloc(size, 1)))
930		return TDB_ERRCODE(TDB_ERR_OOM, -1);
931
932	/* Fill in the header */
933	newdb->version = TDB_VERSION;
934	newdb->hash_size = hash_size;
935	if (tdb->flags & TDB_INTERNAL) {
936		tdb->map_size = size;
937		tdb->map_ptr = (char *)newdb;
938		memcpy(&tdb->header, newdb, sizeof(tdb->header));
939		/* Convert the `ondisk' version if asked. */
940		CONVERT(*newdb);
941		return 0;
942	}
943	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
944		goto fail;
945
946	if (ftruncate(tdb->fd, 0) == -1)
947		goto fail;
948
949	/* This creates an endian-converted header, as if read from disk */
950	CONVERT(*newdb);
951	memcpy(&tdb->header, newdb, sizeof(tdb->header));
952	/* Don't endian-convert the magic food! */
953	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
954	if (write(tdb->fd, newdb, size) != size)
955		ret = -1;
956	else
957		ret = tdb_create_rwlocks(tdb->fd, hash_size);
958
959  fail:
960	SAFE_FREE(newdb);
961	return ret;
962}
963
964/* Returns 0 on fail.  On success, return offset of record, and fills
965   in rec */
966static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
967			struct list_struct *r)
968{
969	tdb_off rec_ptr;
970
971	/* read in the hash top */
972	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
973		return 0;
974
975	/* keep looking until we find the right record */
976	while (rec_ptr) {
977		if (rec_read(tdb, rec_ptr, r) == -1)
978			return 0;
979
980		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
981			char *k;
982			/* a very likely hit - read the key */
983			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
984					   r->key_len);
985			if (!k)
986				return 0;
987
988			if (memcmp(key.dptr, k, key.dsize) == 0) {
989				SAFE_FREE(k);
990				return rec_ptr;
991			}
992			SAFE_FREE(k);
993		}
994		rec_ptr = r->next;
995	}
996	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
997}
998
999/* As tdb_find, but if you succeed, keep the lock */
1000static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1001			     struct list_struct *rec)
1002{
1003	u32 rec_ptr;
1004
1005	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1006		return 0;
1007	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1008		tdb_unlock(tdb, BUCKET(hash), locktype);
1009	return rec_ptr;
1010}
1011
/* Return the error code currently stored in the context. */
enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
{
	return tdb->ecode;
}
1016
1017static struct tdb_errname {
1018	enum TDB_ERROR ecode; const char *estring;
1019} emap[] = { {TDB_SUCCESS, "Success"},
1020	     {TDB_ERR_CORRUPT, "Corrupt database"},
1021	     {TDB_ERR_IO, "IO Error"},
1022	     {TDB_ERR_LOCK, "Locking error"},
1023	     {TDB_ERR_OOM, "Out of memory"},
1024	     {TDB_ERR_EXISTS, "Record exists"},
1025	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1026	     {TDB_ERR_NOEXIST, "Record does not exist"} };
1027
1028/* Error string for the last tdb error */
1029const char *tdb_errorstr(TDB_CONTEXT *tdb)
1030{
1031	u32 i;
1032	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1033		if (tdb->ecode == emap[i].ecode)
1034			return emap[i].estring;
1035	return "Invalid error code";
1036}
1037
1038/* update an entry in place - this only works if the new data size
1039   is <= the old data size and the key exists.
1040   on failure return -1.
1041*/
1042
1043static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1044{
1045	struct list_struct rec;
1046	tdb_off rec_ptr;
1047
1048	/* find entry */
1049	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1050		return -1;
1051
1052	/* must be long enough key, data and tailer */
1053	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1054		tdb->ecode = TDB_SUCCESS; /* Not really an error */
1055		return -1;
1056	}
1057
1058	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1059		      dbuf.dptr, dbuf.dsize) == -1)
1060		return -1;
1061
1062	if (dbuf.dsize != rec.data_len) {
1063		/* update size */
1064		rec.data_len = dbuf.dsize;
1065		return rec_write(tdb, rec_ptr, &rec);
1066	}
1067
1068	return 0;
1069}
1070
1071/* find an entry in the database given a key */
1072/* If an entry doesn't exist tdb_err will be set to
1073 * TDB_ERR_NOEXIST. If a key has no data attached
1074 * tdb_err will not be set. Both will return a
1075 * zero pptr and zero dsize.
1076 */
1077
1078TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1079{
1080	tdb_off rec_ptr;
1081	struct list_struct rec;
1082	TDB_DATA ret;
1083	u32 hash;
1084
1085	/* find which hash bucket it is in */
1086	hash = tdb->hash_fn(&key);
1087	if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1088		return tdb_null;
1089
1090	if (rec.data_len)
1091		ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1092					  rec.data_len);
1093	else
1094		ret.dptr = NULL;
1095	ret.dsize = rec.data_len;
1096	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1097	return ret;
1098}
1099
1100/* check if an entry in the database exists
1101
1102   note that 1 is returned if the key is found and 0 is returned if not found
1103   this doesn't match the conventions in the rest of this module, but is
1104   compatible with gdbm
1105*/
1106static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1107{
1108	struct list_struct rec;
1109
1110	if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1111		return 0;
1112	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1113	return 1;
1114}
1115
1116int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1117{
1118	u32 hash = tdb->hash_fn(&key);
1119	return tdb_exists_hash(tdb, key, hash);
1120}
1121
1122/* record lock stops delete underneath */
1123static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1124{
1125	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1126}
1127/*
1128  Write locks override our own fcntl readlocks, so check it here.
1129  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1130  an error to fail to get the lock here.
1131*/
1132
1133static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1134{
1135	struct tdb_traverse_lock *i;
1136	for (i = &tdb->travlocks; i; i = i->next)
1137		if (i->off == off)
1138			return -1;
1139	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1140}
1141
1142/*
1143  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1144  an error to fail to get the lock here.
1145*/
1146
1147static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1148{
1149	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1150}
1151/* fcntl locks don't stack: avoid unlocking someone else's */
1152static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1153{
1154	struct tdb_traverse_lock *i;
1155	u32 count = 0;
1156
1157	if (off == 0)
1158		return 0;
1159	for (i = &tdb->travlocks; i; i = i->next)
1160		if (i->off == off)
1161			count++;
1162	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1163}
1164
1165/* actually delete an entry in the database given the offset */
1166static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1167{
1168	tdb_off last_ptr, i;
1169	struct list_struct lastrec;
1170
1171	if (tdb->read_only) return -1;
1172
1173	if (write_lock_record(tdb, rec_ptr) == -1) {
1174		/* Someone traversing here: mark it as dead */
1175		rec->magic = TDB_DEAD_MAGIC;
1176		return rec_write(tdb, rec_ptr, rec);
1177	}
1178	if (write_unlock_record(tdb, rec_ptr) != 0)
1179		return -1;
1180
1181	/* find previous record in hash chain */
1182	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1183		return -1;
1184	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1185		if (rec_read(tdb, i, &lastrec) == -1)
1186			return -1;
1187
1188	/* unlink it: next ptr is at start of record. */
1189	if (last_ptr == 0)
1190		last_ptr = TDB_HASH_TOP(rec->full_hash);
1191	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1192		return -1;
1193
1194	/* recover the space */
1195	if (tdb_free(tdb, rec_ptr, rec) == -1)
1196		return -1;
1197	return 0;
1198}
1199
/* Advance a traversal to the next live record.

   Uses traverse lock: 0 = finish, -1 = error, other = record offset.

   tlock->hash / tlock->off describe where the traversal currently is;
   a zero tlock->off means "start at the top of chain tlock->hash".
   On a positive return the record header is in *rec, the record is
   read-locked via lock_record() and the write lock on chain
   tlock->hash is still held - the caller is expected to release the
   chain lock.  On error tlock->off is reset to 0 and the chain lock
   is released here. */
static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
			 struct list_struct *rec)
{
	/* non-zero offset: we are resuming after a record returned earlier,
	   so the first thing to do is step past it */
	int want_next = (tlock->off != 0);

	/* Lock each chain from the start one. */
	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
		/* no chain lock is held yet, so a plain return is enough */
		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
			return -1;

		/* No previous record?  Start at top of chain. */
		if (!tlock->off) {
			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
				     &tlock->off) == -1)
				goto fail;
		} else {
			/* Otherwise unlock the previous record. */
			if (unlock_record(tdb, tlock->off) != 0)
				goto fail;
		}

		if (want_next) {
			/* We have offset of old record: grab next */
			if (rec_read(tdb, tlock->off, rec) == -1)
				goto fail;
			tlock->off = rec->next;
		}

		/* Iterate through chain */
		while( tlock->off) {
			tdb_off current;
			if (rec_read(tdb, tlock->off, rec) == -1)
				goto fail;
			if (!TDB_DEAD(rec)) {
				/* Woohoo: we found one! */
				if (lock_record(tdb, tlock->off) != 0)
					goto fail;
				/* chain lock intentionally left held */
				return tlock->off;
			}
			/* Try to clean dead ones from old traverses */
			/* step to the successor first: do_delete() may free
			   the record at 'current' */
			current = tlock->off;
			tlock->off = rec->next;
			if (!tdb->read_only &&
			    do_delete(tdb, current, rec) != 0)
				goto fail;
		}
		/* chain exhausted: release it and start the next chain from its top */
		tdb_unlock(tdb, tlock->hash, F_WRLCK);
		want_next = 0;
	}
	/* We finished iteration without finding anything */
	return TDB_ERRCODE(TDB_SUCCESS, 0);

 fail:
	/* forget the position and drop the chain lock taken in this pass */
	tlock->off = 0;
	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
	return -1;
}
1259
1260/* traverse the entire database - calling fn(tdb, key, data) on each element.
1261   return -1 on error or the record count traversed
1262   if fn is NULL then it is not called
1263   a non-zero return value from fn() indicates that the traversal should stop
1264  */
1265int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1266{
1267	TDB_DATA key, dbuf;
1268	struct list_struct rec;
1269	struct tdb_traverse_lock tl = { NULL, 0, 0 };
1270	int ret, count = 0;
1271
1272	/* This was in the initializaton, above, but the IRIX compiler
1273	 * did not like it.  crh
1274	 */
1275	tl.next = tdb->travlocks.next;
1276
1277	/* fcntl locks don't stack: beware traverse inside traverse */
1278	tdb->travlocks.next = &tl;
1279
1280	/* tdb_next_lock places locks on the record returned, and its chain */
1281	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1282		count++;
1283		/* now read the full record */
1284		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1285					  rec.key_len + rec.data_len);
1286		if (!key.dptr) {
1287			ret = -1;
1288			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1289				goto out;
1290			if (unlock_record(tdb, tl.off) != 0)
1291				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1292			goto out;
1293		}
1294		key.dsize = rec.key_len;
1295		dbuf.dptr = key.dptr + rec.key_len;
1296		dbuf.dsize = rec.data_len;
1297
1298		/* Drop chain lock, call out */
1299		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1300			ret = -1;
1301			goto out;
1302		}
1303		if (fn && fn(tdb, key, dbuf, private)) {
1304			/* They want us to terminate traversal */
1305			ret = count;
1306			if (unlock_record(tdb, tl.off) != 0) {
1307				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1308				ret = -1;
1309			}
1310			tdb->travlocks.next = tl.next;
1311			SAFE_FREE(key.dptr);
1312			return count;
1313		}
1314		SAFE_FREE(key.dptr);
1315	}
1316out:
1317	tdb->travlocks.next = tl.next;
1318	if (ret < 0)
1319		return -1;
1320	else
1321		return count;
1322}
1323
1324/* find the first entry in the database and return its key */
1325TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1326{
1327	TDB_DATA key;
1328	struct list_struct rec;
1329
1330	/* release any old lock */
1331	if (unlock_record(tdb, tdb->travlocks.off) != 0)
1332		return tdb_null;
1333	tdb->travlocks.off = tdb->travlocks.hash = 0;
1334
1335	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1336		return tdb_null;
1337	/* now read the key */
1338	key.dsize = rec.key_len;
1339	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1340	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1341		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1342	return key;
1343}
1344
1345/* find the next entry in the database, returning its key */
1346TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1347{
1348	u32 oldhash;
1349	TDB_DATA key = tdb_null;
1350	struct list_struct rec;
1351	char *k = NULL;
1352
1353	/* Is locked key the old key?  If so, traverse will be reliable. */
1354	if (tdb->travlocks.off) {
1355		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1356			return tdb_null;
1357		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1358		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1359					    rec.key_len))
1360		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1361			/* No, it wasn't: unlock it and start from scratch */
1362			if (unlock_record(tdb, tdb->travlocks.off) != 0)
1363				return tdb_null;
1364			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1365				return tdb_null;
1366			tdb->travlocks.off = 0;
1367		}
1368
1369		SAFE_FREE(k);
1370	}
1371
1372	if (!tdb->travlocks.off) {
1373		/* No previous element: do normal find, and lock record */
1374		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1375		if (!tdb->travlocks.off)
1376			return tdb_null;
1377		tdb->travlocks.hash = BUCKET(rec.full_hash);
1378		if (lock_record(tdb, tdb->travlocks.off) != 0) {
1379			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1380			return tdb_null;
1381		}
1382	}
1383	oldhash = tdb->travlocks.hash;
1384
1385	/* Grab next record: locks chain and returned record,
1386	   unlocks old record */
1387	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1388		key.dsize = rec.key_len;
1389		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1390					  key.dsize);
1391		/* Unlock the chain of this new record */
1392		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1393			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1394	}
1395	/* Unlock the chain of old record */
1396	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1397		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1398	return key;
1399}
1400
1401/* delete an entry in the database given a key */
1402static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1403{
1404	tdb_off rec_ptr;
1405	struct list_struct rec;
1406	int ret;
1407
1408	if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1409		return -1;
1410	ret = do_delete(tdb, rec_ptr, &rec);
1411	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1412		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1413	return ret;
1414}
1415
1416int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1417{
1418	u32 hash = tdb->hash_fn(&key);
1419	return tdb_delete_hash(tdb, key, hash);
1420}
1421
/* store an element in the database, replacing any existing element
   with the same key

   flag selects the mode:
     TDB_INSERT - fail with TDB_ERR_EXISTS if the key is already there
     TDB_MODIFY - fail if the key is not there yet
     any other value - replace an existing record or create a new one

   The chain for the key's bucket is write-locked for the whole
   operation and unlocked again on every exit path after the lock has
   been taken.

   return 0 on success, -1 on failure
*/
int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
{
	struct list_struct rec;
	u32 hash;
	tdb_off rec_ptr;
	char *p = NULL;	/* scratch copy of key+data, freed at out: */
	int ret = 0;

	/* find which hash bucket it is in */
	hash = tdb->hash_fn(&key);
	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
		return -1;

	/* check for it existing, on insert. */
	if (flag == TDB_INSERT) {
		if (tdb_exists_hash(tdb, key, hash)) {
			tdb->ecode = TDB_ERR_EXISTS;
			goto fail;
		}
	} else {
		/* first try in-place update, on modify or replace. */
		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
			goto out;
		if (tdb->ecode == TDB_ERR_NOEXIST &&
		    flag == TDB_MODIFY) {
			/* if the record doesn't exist and we are in TDB_MODIFY mode then
			 we should fail the store */
			goto fail;
	}
	}
	/* reset the error code potentially set by the tdb_update() */
	tdb->ecode = TDB_SUCCESS;

	/* delete any existing record - if it doesn't exist we don't
	   care.  Doing this first reduces fragmentation, and avoids
	   coalescing with `allocated' block before it's updated. */
	if (flag != TDB_INSERT)
		tdb_delete_hash(tdb, key, hash);

	/* Copy key+value *before* allocating free space in case malloc
	   fails and we are left with a dead spot in the tdb. */

	if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
		tdb->ecode = TDB_ERR_OOM;
		goto fail;
	}

	/* lay out the key immediately followed by the data */
	memcpy(p, key.dptr, key.dsize);
	if (dbuf.dsize)
		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);

	/* we have to allocate some space */
	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
		goto fail;

	/* Read hash top into next ptr */
	/* i.e. the new record will be linked in at the head of its chain */
	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
		goto fail;

	rec.key_len = key.dsize;
	rec.data_len = dbuf.dsize;
	rec.full_hash = hash;
	rec.magic = TDB_MAGIC;

	/* write out and point the top of the hash chain at it */
	/* header first, then key+data, and only then the chain head */
	if (rec_write(tdb, rec_ptr, &rec) == -1
	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
		/* Need to tdb_unallocate() here */
		goto fail;
	}
 out:
	/* common exit: free the scratch buffer and release the chain lock */
	SAFE_FREE(p);
	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
	return ret;
fail:
	ret = -1;
	goto out;
}
1506
1507/* Attempt to append data to an entry in place - this only works if the new data size
1508   is <= the old data size and the key exists.
1509   on failure return -1. Record must be locked before calling.
1510*/
1511static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1512{
1513	struct list_struct rec;
1514	tdb_off rec_ptr;
1515
1516	/* find entry */
1517	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1518		return -1;
1519
1520	/* Append of 0 is always ok. */
1521	if (new_dbuf.dsize == 0)
1522		return 0;
1523
1524	/* must be long enough for key, old data + new data and tailer */
1525	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1526		/* No room. */
1527		tdb->ecode = TDB_SUCCESS; /* Not really an error */
1528		return -1;
1529	}
1530
1531	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1532		      new_dbuf.dptr, new_dbuf.dsize) == -1)
1533		return -1;
1534
1535	/* update size */
1536	rec.data_len += new_dbuf.dsize;
1537	return rec_write(tdb, rec_ptr, &rec);
1538}
1539
/* Append to an entry. Create if not exist.

   An in-place append is tried first.  If that fails, a missing key is
   created through tdb_store(TDB_INSERT); an existing record is read
   back, deleted and rewritten as a new record holding the old data
   followed by new_dbuf.

   The chain for the key's bucket is write-locked for the whole
   operation.  Returns 0 on success, -1 on failure. */

int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
{
	struct list_struct rec;
	u32 hash;
	tdb_off rec_ptr;
	char *p = NULL;	/* scratch copy of key+old data+new data, freed at out: */
	int ret = 0;
	size_t new_data_size = 0;

	/* find which hash bucket it is in */
	hash = tdb->hash_fn(&key);
	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
		return -1;

	/* first try in-place. */
	if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
		goto out;

	/* reset the error code potentially set by the tdb_append_inplace() */
	tdb->ecode = TDB_SUCCESS;

	/* find entry */
	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
		/* anything other than "no such record" is a real failure */
		if (tdb->ecode != TDB_ERR_NOEXIST)
			goto fail;

		/* Not found - create. */

		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
		goto out;
	}

	new_data_size = rec.data_len + new_dbuf.dsize;

	/* Copy key+old_value+value *before* allocating free space in case malloc
	   fails and we are left with a dead spot in the tdb. */

	if (!(p = (char *)malloc(key.dsize + new_data_size))) {
		tdb->ecode = TDB_ERR_OOM;
		goto fail;
	}

	/* Copy the key in place. */
	memcpy(p, key.dptr, key.dsize);

	/* Now read the old data into place. */
	if (rec.data_len &&
		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
			goto fail;

	/* Finally append the new data. */
	if (new_dbuf.dsize)
		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);

	/* delete any existing record - if it doesn't exist we don't
	   care.  Doing this first reduces fragmentation, and avoids
	   coalescing with `allocated' block before it's updated. */

	tdb_delete_hash(tdb, key, hash);

	/* from here on rec describes the newly allocated record */
	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
		goto fail;

	/* Read hash top into next ptr */
	/* i.e. the new record will be linked in at the head of its chain */
	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
		goto fail;

	rec.key_len = key.dsize;
	rec.data_len = new_data_size;
	rec.full_hash = hash;
	rec.magic = TDB_MAGIC;

	/* write out and point the top of the hash chain at it */
	/* header first, then key+data, and only then the chain head */
	if (rec_write(tdb, rec_ptr, &rec) == -1
	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
		/* Need to tdb_unallocate() here */
		goto fail;
	}

 out:
	/* common exit: free the scratch buffer and release the chain lock */
	SAFE_FREE(p);
	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
	return ret;

fail:
	ret = -1;
	goto out;
}
1631
1632static int tdb_already_open(dev_t device,
1633			    ino_t ino)
1634{
1635	TDB_CONTEXT *i;
1636
1637	for (i = tdbs; i; i = i->next) {
1638		if (i->device == device && i->inode == ino) {
1639			return 1;
1640		}
1641	}
1642
1643	return 0;
1644}
1645
1646/* This is based on the hash algorithm from gdbm */
1647static u32 default_tdb_hash(TDB_DATA *key)
1648{
1649	u32 value;	/* Used to compute the hash value.  */
1650	u32   i;	/* Used to cycle through random values. */
1651
1652	/* Set the initial value from the key size. */
1653	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
1654		value = (value + (key->dptr[i] << (i*5 % 24)));
1655
1656	return (1103515243 * value + 12345);
1657}
1658
1659/* open the database, creating it if necessary
1660
1661   The open_flags and mode are passed straight to the open call on the
1662   database file. A flags value of O_WRONLY is invalid. The hash size
1663   is advisory, use zero for a default value.
1664
1665   Return is NULL on error, in which case errno is also set.  Don't
1666   try to call tdb_error or tdb_errname, just do strerror(errno).
1667
1668   @param name may be NULL for internal databases. */
1669TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1670		      int open_flags, mode_t mode)
1671{
1672	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
1673}
1674
1675
1676TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1677			 int open_flags, mode_t mode,
1678			 tdb_log_func log_fn,
1679			 tdb_hash_func hash_fn)
1680{
1681	TDB_CONTEXT *tdb;
1682	struct stat st;
1683	int rev = 0, locked = 0;
1684	unsigned char *vp;
1685	u32 vertest;
1686
1687	if (!(tdb = calloc(1, sizeof *tdb))) {
1688		/* Can't log this */
1689		errno = ENOMEM;
1690		goto fail;
1691	}
1692	tdb->fd = -1;
1693	tdb->name = NULL;
1694	tdb->map_ptr = NULL;
1695	tdb->flags = tdb_flags;
1696	tdb->open_flags = open_flags;
1697	tdb->log_fn = log_fn;
1698	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1699
1700	if ((open_flags & O_ACCMODE) == O_WRONLY) {
1701		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1702			 name));
1703		errno = EINVAL;
1704		goto fail;
1705	}
1706
1707	if (hash_size == 0)
1708		hash_size = DEFAULT_HASH_SIZE;
1709	if ((open_flags & O_ACCMODE) == O_RDONLY) {
1710		tdb->read_only = 1;
1711		/* read only databases don't do locking or clear if first */
1712		tdb->flags |= TDB_NOLOCK;
1713		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1714	}
1715
1716	/* internal databases don't mmap or lock, and start off cleared */
1717	if (tdb->flags & TDB_INTERNAL) {
1718		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1719		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1720		if (tdb_new_database(tdb, hash_size) != 0) {
1721			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1722			goto fail;
1723		}
1724		goto internal;
1725	}
1726
1727	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1728		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1729			 name, strerror(errno)));
1730		goto fail;	/* errno set by open(2) */
1731	}
1732
1733	/* ensure there is only one process initialising at once */
1734	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1735		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1736			 name, strerror(errno)));
1737		goto fail;	/* errno set by tdb_brlock */
1738	}
1739
1740	/* we need to zero database if we are the only one with it open */
1741	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1742		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1743		open_flags |= O_CREAT;
1744		if (ftruncate(tdb->fd, 0) == -1) {
1745			TDB_LOG((tdb, 0, "tdb_open_ex: "
1746				 "failed to truncate %s: %s\n",
1747				 name, strerror(errno)));
1748			goto fail; /* errno set by ftruncate */
1749		}
1750	}
1751
1752	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1753	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1754	    || (tdb->header.version != TDB_VERSION
1755		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1756		/* its not a valid database - possibly initialise it */
1757		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1758			errno = EIO; /* ie bad format or something */
1759			goto fail;
1760		}
1761		rev = (tdb->flags & TDB_CONVERT);
1762	}
1763	vp = (unsigned char *)&tdb->header.version;
1764	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1765		  (((u32)vp[2]) << 8) | (u32)vp[3];
1766	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1767	if (!rev)
1768		tdb->flags &= ~TDB_CONVERT;
1769	else {
1770		tdb->flags |= TDB_CONVERT;
1771		convert(&tdb->header, sizeof(tdb->header));
1772	}
1773	if (fstat(tdb->fd, &st) == -1)
1774		goto fail;
1775
1776	/* Is it already in the open list?  If so, fail. */
1777	if (tdb_already_open(st.st_dev, st.st_ino)) {
1778		TDB_LOG((tdb, 2, "tdb_open_ex: "
1779			 "%s (%d,%d) is already open in this process\n",
1780			 name, (int)st.st_dev, (int)st.st_ino));
1781		errno = EBUSY;
1782		goto fail;
1783	}
1784
1785	if (!(tdb->name = (char *)strdup(name))) {
1786		errno = ENOMEM;
1787		goto fail;
1788	}
1789
1790	tdb->map_size = st.st_size;
1791	tdb->device = st.st_dev;
1792	tdb->inode = st.st_ino;
1793	tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1794	if (!tdb->locked) {
1795		TDB_LOG((tdb, 2, "tdb_open_ex: "
1796			 "failed to allocate lock structure for %s\n",
1797			 name));
1798		errno = ENOMEM;
1799		goto fail;
1800	}
1801	tdb_mmap(tdb);
1802	if (locked) {
1803		if (!tdb->read_only)
1804			if (tdb_clear_spinlocks(tdb) != 0) {
1805				TDB_LOG((tdb, 0, "tdb_open_ex: "
1806				"failed to clear spinlock\n"));
1807				goto fail;
1808			}
1809		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1810			TDB_LOG((tdb, 0, "tdb_open_ex: "
1811				 "failed to take ACTIVE_LOCK on %s: %s\n",
1812				 name, strerror(errno)));
1813			goto fail;
1814		}
1815
1816	}
1817
1818	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1819	   we didn't get the initial exclusive lock as we need to let all other
1820	   users know we're using it. */
1821
1822	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1823		/* leave this lock in place to indicate it's in use */
1824		if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1825			goto fail;
1826	}
1827
1828
1829 internal:
1830	/* Internal (memory-only) databases skip all the code above to
1831	 * do with disk files, and resume here by releasing their
1832	 * global lock and hooking into the active list. */
1833	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1834		goto fail;
1835	tdb->next = tdbs;
1836	tdbs = tdb;
1837	return tdb;
1838
1839 fail:
1840	{ int save_errno = errno;
1841
1842	if (!tdb)
1843		return NULL;
1844
1845	if (tdb->map_ptr) {
1846		if (tdb->flags & TDB_INTERNAL)
1847			SAFE_FREE(tdb->map_ptr);
1848		else
1849			tdb_munmap(tdb);
1850	}
1851	SAFE_FREE(tdb->name);
1852	if (tdb->fd != -1)
1853		if (close(tdb->fd) != 0)
1854			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1855	SAFE_FREE(tdb->locked);
1856	SAFE_FREE(tdb);
1857	errno = save_errno;
1858	return NULL;
1859	}
1860}
1861
1862/**
1863 * Close a database.
1864 *
1865 * @returns -1 for error; 0 for success.
1866 **/
1867int tdb_close(TDB_CONTEXT *tdb)
1868{
1869	TDB_CONTEXT **i;
1870	int ret = 0;
1871
1872	if (tdb->map_ptr) {
1873		if (tdb->flags & TDB_INTERNAL)
1874			SAFE_FREE(tdb->map_ptr);
1875		else
1876			tdb_munmap(tdb);
1877	}
1878	SAFE_FREE(tdb->name);
1879	if (tdb->fd != -1)
1880		ret = close(tdb->fd);
1881	SAFE_FREE(tdb->locked);
1882
1883	/* Remove from contexts list */
1884	for (i = &tdbs; *i; i = &(*i)->next) {
1885		if (*i == tdb) {
1886			*i = tdb->next;
1887			break;
1888		}
1889	}
1890
1891	memset(tdb, 0, sizeof(*tdb));
1892	SAFE_FREE(tdb);
1893
1894	return ret;
1895}
1896
1897/* lock/unlock entire database */
1898int tdb_lockall(TDB_CONTEXT *tdb)
1899{
1900	u32 i;
1901
1902	/* There are no locks on read-only dbs */
1903	if (tdb->read_only)
1904		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1905	for (i = 0; i < tdb->header.hash_size; i++)
1906		if (tdb_lock(tdb, i, F_WRLCK))
1907			break;
1908
1909	/* If error, release locks we have... */
1910	if (i < tdb->header.hash_size) {
1911		u32 j;
1912
1913		for ( j = 0; j < i; j++)
1914			tdb_unlock(tdb, j, F_WRLCK);
1915		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1916	}
1917
1918	return 0;
1919}
1920void tdb_unlockall(TDB_CONTEXT *tdb)
1921{
1922	u32 i;
1923	for (i=0; i < tdb->header.hash_size; i++)
1924		tdb_unlock(tdb, i, F_WRLCK);
1925}
1926
1927/* lock/unlock one hash chain. This is meant to be used to reduce
1928   contention - it cannot guarantee how many records will be locked */
1929int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1930{
1931	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1932}
1933
1934int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1935{
1936	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1937}
1938
1939int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1940{
1941	return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1942}
1943
1944int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1945{
1946	return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1947}
1948
1949
1950/* register a loging function */
1951void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1952{
1953	tdb->log_fn = fn;
1954}
1955
1956/* reopen a tdb - this can be used after a fork to ensure that we have an independent
1957   seek pointer from our parent and to re-establish locks */
1958int tdb_reopen(TDB_CONTEXT *tdb)
1959{
1960	struct stat st;
1961
1962	if (tdb->flags & TDB_INTERNAL)
1963		return 0; /* Nothing to do. */
1964	if (tdb_munmap(tdb) != 0) {
1965		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
1966		goto fail;
1967	}
1968	if (close(tdb->fd) != 0)
1969		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
1970	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
1971	if (tdb->fd == -1) {
1972		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
1973		goto fail;
1974	}
1975	if (fstat(tdb->fd, &st) != 0) {
1976		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
1977		goto fail;
1978	}
1979	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
1980		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
1981		goto fail;
1982	}
1983	tdb_mmap(tdb);
1984	if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
1985		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
1986		goto fail;
1987	}
1988
1989	return 0;
1990
1991fail:
1992	tdb_close(tdb);
1993	return -1;
1994}
1995
1996/* reopen all tdb's */
1997int tdb_reopen_all(void)
1998{
1999	TDB_CONTEXT *tdb;
2000
2001	for (tdb=tdbs; tdb; tdb = tdb->next) {
2002		/* Ensure no clear-if-first. */
2003		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2004		if (tdb_reopen(tdb) != 0)
2005			return -1;
2006	}
2007
2008	return 0;
2009}
2010