• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /netgear-WNDR4500v2-V1.0.0.60_1.0.38/ap/gpl/timemachine/netatalk-2.2.5/libatalk/tdb/
1 /*
2   Unix SMB/CIFS implementation.
3   Samba database functions
4   Copyright (C) Andrew Tridgell              1999-2000
5   Copyright (C) Luke Kenneth Casson Leighton      2000
6   Copyright (C) Paul `Rusty' Russell		   2000
7   Copyright (C) Jeremy Allison			   2000-2003
8
9   This program is free software; you can redistribute it and/or modify
10   it under the terms of the GNU General Public License as published by
11   the Free Software Foundation; either version 2 of the License, or
12   (at your option) any later version.
13
14   This program is distributed in the hope that it will be useful,
15   but WITHOUT ANY WARRANTY; without even the implied warranty of
16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   GNU General Public License for more details.
18
19   You should have received a copy of the GNU General Public License
20   along with this program; if not, write to the Free Software
21   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22*/
23
24#define STANDALONE 1
25
26/* NOTE: If you use tdbs under valgrind, and in particular if you run
27 * tdbtorture, you may get spurious "uninitialized value" warnings.  I
28 * think this is because valgrind doesn't understand that the mmap'd
29 * area may be written to by other processes.  Memory can, from the
30 * point of view of the grinded process, spontaneously become
31 * initialized.
32 *
33 * I can think of a few solutions.  [mbp 20030311]
34 *
35 * 1 - Write suppressions for Valgrind so that it doesn't complain
36 * about this.  Probably the most reasonable but people need to
37 * remember to use them.
38 *
39 * 2 - Use IO not mmap when running under valgrind.  Not so nice.
40 *
41 * 3 - Use the special valgrind macros to mark memory as valid at the
42 * right time.  Probably too hard -- the process just doesn't know.
43 */
44
45#ifdef STANDALONE
46#if HAVE_CONFIG_H
47#include <config.h>
48#endif
49
50#define _XOPEN_SOURCE 600
51#include <unistd.h>
52#include <stdlib.h>
53#include <stdio.h>
54#include <fcntl.h>
55#include <unistd.h>
56#include <string.h>
57#include <fcntl.h>
58#include <errno.h>
59#include <sys/mman.h>
60#include <sys/stat.h>
61#include <signal.h>
62#include "spinlock.h"
63#else
64#include "includes.h"
65#endif
66
67#define TDB_MAGIC_FOOD "TDB file\n"
68#define TDB_VERSION (0x26011967 + 6)
69#define TDB_MAGIC (0x26011999U)
70#define TDB_FREE_MAGIC (~TDB_MAGIC)
71#define TDB_DEAD_MAGIC (0xFEE1DEAD)
72#define TDB_ALIGNMENT 4
73#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
74#define DEFAULT_HASH_SIZE 131
75#define TDB_PAGE_SIZE 0x2000
76#define FREELIST_TOP (sizeof(struct tdb_header))
77#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
78#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
79#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
80#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
81#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
82
83/* NB assumes there is a local variable called "tdb" that is the
84 * current context, also takes doubly-parenthesized print-style
85 * argument. */
86#define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
87
88/* lock offsets */
89#define GLOBAL_LOCK 0
90#define ACTIVE_LOCK 4
91
92#ifndef MAP_FILE
93#define MAP_FILE 0
94#endif
95
96#ifndef MAP_FAILED
97#define MAP_FAILED ((void *)-1)
98#endif
99
100/* free memory if the pointer is valid and zero the pointer */
101#ifndef SAFE_FREE
102#define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
103#endif
104
105#define BUCKET(hash) ((hash) % tdb->header.hash_size)
106TDB_DATA tdb_null;
107
108/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
109static TDB_CONTEXT *tdbs = NULL;
110
111static int tdb_munmap(TDB_CONTEXT *tdb)
112{
113	if (tdb->flags & TDB_INTERNAL)
114		return 0;
115
116#ifdef HAVE_MMAP
117	if (tdb->map_ptr) {
118		int ret = munmap(tdb->map_ptr, tdb->map_size);
119		if (ret != 0)
120			return ret;
121	}
122#endif
123	tdb->map_ptr = NULL;
124	return 0;
125}
126
127static void tdb_mmap(TDB_CONTEXT *tdb)
128{
129	if (tdb->flags & TDB_INTERNAL)
130		return;
131
132#ifdef HAVE_MMAP
133	if (!(tdb->flags & TDB_NOMMAP)) {
134		tdb->map_ptr = mmap(NULL, tdb->map_size,
135				    PROT_READ|(tdb->read_only? 0:PROT_WRITE),
136				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
137
138		/*
139		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
140		 */
141
142		if (tdb->map_ptr == MAP_FAILED) {
143			tdb->map_ptr = NULL;
144			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
145				 tdb->map_size, strerror(errno)));
146		}
147	} else {
148		tdb->map_ptr = NULL;
149	}
150#else
151	tdb->map_ptr = NULL;
152#endif
153}
154
155/* Endian conversion: we only ever deal with 4 byte quantities */
156static void *convert(void *buf, u32 size)
157{
158	u32 i, *p = buf;
159	for (i = 0; i < size / 4; i++)
160		p[i] = TDB_BYTEREV(p[i]);
161	return buf;
162}
163#define DOCONV() (tdb->flags & TDB_CONVERT)
164#define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
165
/* the body of the database is made of one list_struct for the free space
   plus a separate data list for each hash value.

   This is the fixed-size header that starts every record.  All fields
   are 4-byte quantities, which is what lets convert() byte-swap a whole
   header in one pass. */
struct list_struct {
	tdb_off next; /* offset of the next record in the list */
	tdb_len rec_len; /* total byte length of record */
	tdb_len key_len; /* byte length of key */
	tdb_len data_len; /* byte length of data */
	u32 full_hash; /* the full 32 bit hash of the key */
	u32 magic;   /* try to catch errors */
	/* the following union is implied:
		union {
			char record[rec_len];
			struct {
				char key[key_len];
				char data[data_len];
			}
			u32 totalsize; (tailer)
		}
	   i.e. rec_len bytes follow the header; the last tdb_off of them
	   is the tailer written by update_tailer(), holding
	   sizeof(header) + rec_len.
	*/
};
186
/***************************************************************
 Let a caller register an "alarm fired" flag.  tdb_brlock() consults
 the flag when fcntl() is interrupted (EINTR) and gives up on the
 lock instead of retrying once the flag is non-zero.
***************************************************************/

/* caller-owned flag; NULL until tdb_set_lock_alarm() is called */
static sig_atomic_t *palarm_fired;

void tdb_set_lock_alarm(sig_atomic_t *palarm)
{
	palarm_fired = palarm;
}
198
199/* a byte range locking function - return 0 on success
200   this functions locks/unlocks 1 byte at the specified offset.
201
202   On error, errno is also set so that errors are passed back properly
203   through tdb_open(). */
204static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
205		      int rw_type, int lck_type, int probe)
206{
207	struct flock fl;
208	int ret;
209
210	if (tdb->flags & TDB_NOLOCK)
211		return 0;
212	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
213		errno = EACCES;
214		return -1;
215	}
216
217	fl.l_type = rw_type;
218	fl.l_whence = SEEK_SET;
219	fl.l_start = offset;
220	fl.l_len = 1;
221	fl.l_pid = 0;
222
223	do {
224		ret = fcntl(tdb->fd,lck_type,&fl);
225		if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
226			break;
227	} while (ret == -1 && errno == EINTR);
228
229	if (ret == -1) {
230		if (!probe && lck_type != F_SETLK) {
231			/* Ensure error code is set for log fun to examine. */
232			if (errno == EINTR && palarm_fired && *palarm_fired)
233				tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
234			else
235				tdb->ecode = TDB_ERR_LOCK;
236			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
237				 tdb->fd, offset, rw_type, lck_type));
238		}
239		/* Was it an alarm timeout ? */
240		if (errno == EINTR && palarm_fired && *palarm_fired)
241			return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
242		/* Otherwise - generic lock error. */
243		/* errno set by fcntl */
244		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
245	}
246	return 0;
247}
248
249/* lock a list in the database. list -1 is the alloc list */
250static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
251{
252	if (list < -1 || list >= (int)tdb->header.hash_size) {
253		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
254			   list, ltype));
255		return -1;
256	}
257	if (tdb->flags & TDB_NOLOCK)
258		return 0;
259
260	/* Since fcntl locks don't nest, we do a lock for the first one,
261	   and simply bump the count for future ones */
262	if (tdb->locked[list+1].count == 0) {
263		if (!tdb->read_only && tdb->header.rwlocks) {
264			if (tdb_spinlock(tdb, list, ltype)) {
265				TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list ltype=%d\n",
266					   list, ltype));
267				return -1;
268			}
269		} else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
270			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
271					   list, ltype, strerror(errno)));
272			return -1;
273		}
274		tdb->locked[list+1].ltype = ltype;
275	}
276	tdb->locked[list+1].count++;
277	return 0;
278}
279
280/* unlock the database: returns void because it's too late for errors. */
281	/* changed to return int it may be interesting to know there
282	   has been an error  --simo */
283static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
284{
285	int ret = -1;
286
287	if (tdb->flags & TDB_NOLOCK)
288		return 0;
289
290	/* Sanity checks */
291	if (list < -1 || list >= (int)tdb->header.hash_size) {
292		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
293		return ret;
294	}
295
296	if (tdb->locked[list+1].count==0) {
297		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
298		return ret;
299	}
300
301	if (tdb->locked[list+1].count == 1) {
302		/* Down to last nested lock: unlock underneath */
303		if (!tdb->read_only && tdb->header.rwlocks) {
304			ret = tdb_spinunlock(tdb, list, ltype);
305		} else {
306			ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
307		}
308	} else {
309		ret = 0;
310	}
311	tdb->locked[list+1].count--;
312
313	if (ret)
314		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
315	return ret;
316}
317
318/* This is based on the hash algorithm from gdbm */
319static u32 tdb_hash(TDB_DATA *key)
320{
321	u32 value;	/* Used to compute the hash value.  */
322	u32   i;	/* Used to cycle through random values. */
323
324	/* Set the initial value from the key size. */
325	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
326		value = (value + (key->dptr[i] << (i*5 % 24)));
327
328	return (1103515243 * value + 12345);
329}
330
331/* check for an out of bounds access - if it is out of bounds then
332   see if the database has been expanded by someone else and expand
333   if necessary
334   note that "len" is the minimum length needed for the db
335*/
336static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
337{
338	struct stat st;
339	if (len <= tdb->map_size)
340		return 0;
341	if (tdb->flags & TDB_INTERNAL) {
342		if (!probe) {
343			/* Ensure ecode is set for log fn. */
344			tdb->ecode = TDB_ERR_IO;
345			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
346				 (int)len, (int)tdb->map_size));
347		}
348		return TDB_ERRCODE(TDB_ERR_IO, -1);
349	}
350
351	if (fstat(tdb->fd, &st) == -1)
352		return TDB_ERRCODE(TDB_ERR_IO, -1);
353
354	if (st.st_size < (size_t)len) {
355		if (!probe) {
356			/* Ensure ecode is set for log fn. */
357			tdb->ecode = TDB_ERR_IO;
358			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
359				 (int)len, (int)st.st_size));
360		}
361		return TDB_ERRCODE(TDB_ERR_IO, -1);
362	}
363
364	/* Unmap, update size, remap */
365	if (tdb_munmap(tdb) == -1)
366		return TDB_ERRCODE(TDB_ERR_IO, -1);
367	tdb->map_size = st.st_size;
368	tdb_mmap(tdb);
369	return 0;
370}
371
372/* write a lump of data at a specified offset */
373static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
374{
375	if (tdb_oob(tdb, off + len, 0) != 0)
376		return -1;
377
378	if (tdb->map_ptr)
379		memcpy(off + (char *)tdb->map_ptr, buf, len);
380#ifdef HAVE_PWRITE
381	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
382#else
383	else if (lseek(tdb->fd, off, SEEK_SET) != off
384		 || write(tdb->fd, buf, len) != (ssize_t)len) {
385#endif
386		/* Ensure ecode is set for log fn. */
387		tdb->ecode = TDB_ERR_IO;
388		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
389			   off, len, strerror(errno)));
390		return TDB_ERRCODE(TDB_ERR_IO, -1);
391	}
392	return 0;
393}
394
395/* read a lump of data at a specified offset, maybe convert */
396static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
397{
398	if (tdb_oob(tdb, off + len, 0) != 0)
399		return -1;
400
401	if (tdb->map_ptr)
402		memcpy(buf, off + (char *)tdb->map_ptr, len);
403#ifdef HAVE_PREAD
404	else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
405#else
406	else if (lseek(tdb->fd, off, SEEK_SET) != off
407		 || read(tdb->fd, buf, len) != (ssize_t)len) {
408#endif
409		/* Ensure ecode is set for log fn. */
410		tdb->ecode = TDB_ERR_IO;
411		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
412			   off, len, strerror(errno)));
413		return TDB_ERRCODE(TDB_ERR_IO, -1);
414	}
415	if (cv)
416		convert(buf, len);
417	return 0;
418}
419
420/* read a lump of data, allocating the space for it */
421static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
422{
423	char *buf;
424
425	if (!(buf = malloc(len))) {
426		/* Ensure ecode is set for log fn. */
427		tdb->ecode = TDB_ERR_OOM;
428		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
429			   len, strerror(errno)));
430		return TDB_ERRCODE(TDB_ERR_OOM, buf);
431	}
432	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
433		SAFE_FREE(buf);
434		return NULL;
435	}
436	return buf;
437}
438
439/* read/write a tdb_off */
440static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
441{
442	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
443}
444static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
445{
446	tdb_off off = *d;
447	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
448}
449
450/* read/write a record */
451static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
452{
453	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
454		return -1;
455	if (TDB_BAD_MAGIC(rec)) {
456		/* Ensure ecode is set for log fn. */
457		tdb->ecode = TDB_ERR_CORRUPT;
458		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
459		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
460	}
461	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
462}
463static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
464{
465	struct list_struct r = *rec;
466	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
467}
468
469/* read a freelist record and check for simple errors */
470static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
471{
472	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
473		return -1;
474
475	if (rec->magic == TDB_MAGIC) {
476		/* this happens when a app is showdown while deleting a record - we should
477		   not completely fail when this happens */
478		TDB_LOG((tdb, 0,"rec_free_read non-free magic at offset=%d - fixing\n",
479			 rec->magic, off));
480		rec->magic = TDB_FREE_MAGIC;
481		if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
482			return -1;
483	}
484
485	if (rec->magic != TDB_FREE_MAGIC) {
486		/* Ensure ecode is set for log fn. */
487		tdb->ecode = TDB_ERR_CORRUPT;
488		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
489			   rec->magic, off));
490		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
491	}
492	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
493		return -1;
494	return 0;
495}
496
497/* update a record tailer (must hold allocation lock) */
498static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
499			 const struct list_struct *rec)
500{
501	tdb_off totalsize;
502
503	/* Offset of tailer from record header */
504	totalsize = sizeof(*rec) + rec->rec_len;
505	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
506			 &totalsize);
507}
508
509static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
510{
511	struct list_struct rec;
512	tdb_off tailer_ofs, tailer;
513
514	if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
515		printf("ERROR: failed to read record at %u\n", offset);
516		return 0;
517	}
518
519	printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
520	       offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
521
522	tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
523	if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
524		printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
525		return rec.next;
526	}
527
528	if (tailer != rec.rec_len + sizeof(rec)) {
529		printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
530				(unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
531	}
532	return rec.next;
533}
534
535static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
536{
537	tdb_off rec_ptr, top;
538
539	top = TDB_HASH_TOP(i);
540
541	if (tdb_lock(tdb, i, F_WRLCK) != 0)
542		return -1;
543
544	if (ofs_read(tdb, top, &rec_ptr) == -1)
545		return tdb_unlock(tdb, i, F_WRLCK);
546
547	if (rec_ptr)
548		printf("hash=%d\n", i);
549
550	while (rec_ptr) {
551		rec_ptr = tdb_dump_record(tdb, rec_ptr);
552	}
553
554	return tdb_unlock(tdb, i, F_WRLCK);
555}
556
557void tdb_dump_all(TDB_CONTEXT *tdb)
558{
559	unsigned int i;
560	for (i=0;i<tdb->header.hash_size;i++) {
561		tdb_dump_chain(tdb, i);
562	}
563	printf("freelist:\n");
564	tdb_dump_chain(tdb, -1);
565}
566
567int tdb_printfreelist(TDB_CONTEXT *tdb)
568{
569	int ret;
570	long total_free = 0;
571	tdb_off offset, rec_ptr;
572	struct list_struct rec;
573
574	if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
575		return ret;
576
577	offset = FREELIST_TOP;
578
579	/* read in the freelist top */
580	if (ofs_read(tdb, offset, &rec_ptr) == -1) {
581		tdb_unlock(tdb, -1, F_WRLCK);
582		return 0;
583	}
584
585	printf("freelist top=[0x%08x]\n", rec_ptr );
586	while (rec_ptr) {
587		if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
588			tdb_unlock(tdb, -1, F_WRLCK);
589			return -1;
590		}
591
592		if (rec.magic != TDB_FREE_MAGIC) {
593			printf("bad magic 0x%08x in free list\n", rec.magic);
594			tdb_unlock(tdb, -1, F_WRLCK);
595			return -1;
596		}
597
598		printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
599		total_free += rec.rec_len;
600
601		/* move to the next record */
602		rec_ptr = rec.next;
603	}
604	printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
605               (int)total_free);
606
607	return tdb_unlock(tdb, -1, F_WRLCK);
608}
609
610/* Remove an element from the freelist.  Must have alloc lock. */
611static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
612{
613	tdb_off last_ptr, i;
614
615	/* read in the freelist top */
616	last_ptr = FREELIST_TOP;
617	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
618		if (i == off) {
619			/* We've found it! */
620			return ofs_write(tdb, last_ptr, &next);
621		}
622		/* Follow chain (next offset is at start of record) */
623		last_ptr = i;
624	}
625	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
626	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
627}
628
/* Add an element into the freelist. Merge adjacent records if
   necessary.

   offset - file offset of the record header being freed
   rec    - its header; updated in place (rec_len grows when a
            neighbour is absorbed, magic becomes TDB_FREE_MAGIC and
            next is set to the previous free list head)

   The alloc lock (list -1) is taken for the duration of the call.
   Failure to examine or unlink a neighbour is logged and that merge
   is skipped; the record itself is still put on the free list.
   Returns 0 on success, -1 if the lock, a tailer update or the final
   free list writes fail. */
static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
{
	tdb_off right, left;

	/* Allocation and tailer lock */
	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
		return -1;

	/* set an initial tailer, so if we fail we don't leave a bogus record */
	if (update_tailer(tdb, offset, rec) != 0) {
		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
		goto fail;
	}

	/* Look right first (I'm an Australian, dammit) */
	/* `right` is the offset just past this record, i.e. where the
	   following record's header would start; only look at it if a
	   whole header fits inside the current map_size */
	right = offset + sizeof(*rec) + rec->rec_len;
	if (right + sizeof(*rec) <= tdb->map_size) {
		struct list_struct r;

		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
			goto left;
		}

		/* If it's free, expand to include it. */
		if (r.magic == TDB_FREE_MAGIC) {
			if (remove_from_freelist(tdb, right, r.next) == -1) {
				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
				goto left;
			}
			/* absorb the neighbour's header and payload */
			rec->rec_len += sizeof(r) + r.rec_len;
		}
	}

left:
	/* Look left */
	/* `left` first points at the tailer of the preceding record (the
	   tdb_off directly before our header); nothing to merge with if
	   that would fall inside the hash table area */
	left = offset - sizeof(tdb_off);
	if (left > TDB_HASH_TOP(tdb->header.hash_size-1)) {
		struct list_struct l;
		tdb_off leftsize;

		/* Read in tailer and jump back to header */
		if (ofs_read(tdb, left, &leftsize) == -1) {
			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
			goto update;
		}
		/* the tailer holds the total size of the left record, so this
		   is the offset of its header */
		left = offset - leftsize;

		/* Now read in record */
		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
			goto update;
		}

		/* If it's free, expand to include it. */
		if (l.magic == TDB_FREE_MAGIC) {
			if (remove_from_freelist(tdb, left, l.next) == -1) {
				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
				goto update;
			} else {
				/* the merged record now starts at the left
				   neighbour's header */
				offset = left;
				rec->rec_len += leftsize;
			}
		}
	}

update:
	/* rewrite the tailer for the (possibly grown and moved) record */
	if (update_tailer(tdb, offset, rec) == -1) {
		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
		goto fail;
	}

	/* Now, prepend to free list */
	rec->magic = TDB_FREE_MAGIC;

	/* old head becomes our next, header goes to disk, then the head
	   is pointed at us */
	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
	    rec_write(tdb, offset, rec) == -1 ||
	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
		goto fail;
	}

	/* And we're done. */
	tdb_unlock(tdb, -1, F_WRLCK);
	return 0;

 fail:
	tdb_unlock(tdb, -1, F_WRLCK);
	return -1;
}
721
722
723/* expand a file.  we prefer to use ftruncate, as that is what posix
724  says to use for mmap expansion */
725static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
726{
727	char buf[1024];
728#if HAVE_FTRUNCATE_EXTEND
729	if (ftruncate(tdb->fd, size+addition) != 0) {
730		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
731			   size+addition, strerror(errno)));
732		return -1;
733	}
734#else
735	char b = 0;
736
737#ifdef HAVE_PWRITE
738	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
739#else
740	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
741	    write(tdb->fd, &b, 1) != 1) {
742#endif
743		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
744			   size+addition, strerror(errno)));
745		return -1;
746	}
747#endif
748
749	/* now fill the file with something. This ensures that the file isn't sparse, which would be
750	   very bad if we ran out of disk. This must be done with write, not via mmap */
751	memset(buf, 0x42, sizeof(buf));
752	while (addition) {
753		int n = addition>sizeof(buf)?sizeof(buf):addition;
754#ifdef HAVE_PWRITE
755		int ret = pwrite(tdb->fd, buf, n, size);
756#else
757		int ret;
758		if (lseek(tdb->fd, size, SEEK_SET) != size)
759			return -1;
760		ret = write(tdb->fd, buf, n);
761#endif
762		if (ret != n) {
763			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
764				   n, strerror(errno)));
765			return -1;
766		}
767		addition -= n;
768		size += n;
769	}
770	return 0;
771}
772
773
774/* expand the database at least size bytes by expanding the underlying
775   file and doing the mmap again if necessary */
776static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
777{
778	struct list_struct rec;
779	tdb_off offset;
780
781	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
782		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
783		return -1;
784	}
785
786	/* must know about any previous expansions by another process */
787	tdb_oob(tdb, tdb->map_size + 1, 1);
788
789	/* always make room for at least 10 more records, and round
790           the database up to a multiple of TDB_PAGE_SIZE */
791	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
792
793	if (!(tdb->flags & TDB_INTERNAL))
794		tdb_munmap(tdb);
795
796	/*
797	 * We must ensure the file is unmapped before doing this
798	 * to ensure consistency with systems like OpenBSD where
799	 * writes and mmaps are not consistent.
800	 */
801
802	/* expand the file itself */
803	if (!(tdb->flags & TDB_INTERNAL)) {
804		if (expand_file(tdb, tdb->map_size, size) != 0)
805			goto fail;
806	}
807
808	tdb->map_size += size;
809
810	if (tdb->flags & TDB_INTERNAL)
811		tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
812	else {
813		/*
814		 * We must ensure the file is remapped before adding the space
815		 * to ensure consistency with systems like OpenBSD where
816		 * writes and mmaps are not consistent.
817		 */
818
819		/* We're ok if the mmap fails as we'll fallback to read/write */
820		tdb_mmap(tdb);
821	}
822
823	/* form a new freelist record */
824	memset(&rec,'\0',sizeof(rec));
825	rec.rec_len = size - sizeof(rec);
826
827	/* link it into the free list */
828	offset = tdb->map_size - size;
829	if (tdb_free(tdb, offset, &rec) == -1)
830		goto fail;
831
832	tdb_unlock(tdb, -1, F_WRLCK);
833	return 0;
834 fail:
835	tdb_unlock(tdb, -1, F_WRLCK);
836	return -1;
837}
838
/* allocate some space from the free list. The offset returned points
   to a unconnected list_struct within the database with room for at
   least length bytes of total data

   0 is returned if the space could not be allocated

   tdb    - database context
   length - payload bytes wanted; room for the tailer is added here
   rec    - receives the header of the allocated record (magic is set
            to TDB_MAGIC, rec_len to the usable length)

   The free list is searched first-fit under the alloc lock.  When
   nothing fits, the database is grown with tdb_expand() and the
   search is restarted.
 */
static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
			    struct list_struct *rec)
{
	tdb_off rec_ptr, last_ptr, newrec_ptr;
	struct list_struct newrec;

	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
		return 0;

	/* Extra bytes required for tailer */
	length += sizeof(tdb_off);

 again:
	/* last_ptr is the location holding the link to rec_ptr: the list
	   head to begin with, later the previous free record */
	last_ptr = FREELIST_TOP;

	/* read in the freelist top */
	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
		goto fail;

	/* keep looking until we find a freelist record big enough */
	while (rec_ptr) {
		if (rec_free_read(tdb, rec_ptr, rec) == -1)
			goto fail;

		if (rec->rec_len >= length) {
			/* found it - now possibly split it up  */
			/* only split when the remainder is bigger than
			   MIN_REC_SIZE; otherwise hand out the whole record */
			if (rec->rec_len > length + MIN_REC_SIZE) {
				/* Length of left piece */
				length = TDB_ALIGN(length, TDB_ALIGNMENT);

				/* Right piece to go on free list */
				newrec.rec_len = rec->rec_len
					- (sizeof(*rec) + length);
				newrec_ptr = rec_ptr + sizeof(*rec) + length;

				/* And left record is shortened */
				rec->rec_len = length;
			} else
				newrec_ptr = 0;

			/* Remove allocated record from the free list */
			if (ofs_write(tdb, last_ptr, &rec->next) == -1)
				goto fail;

			/* Update header: do this before we drop alloc
                           lock, otherwise tdb_free() might try to
                           merge with us, thinking we're free.
                           (Thanks Jeremy Allison). */
			rec->magic = TDB_MAGIC;
			if (rec_write(tdb, rec_ptr, rec) == -1)
				goto fail;

			/* Did we create new block? */
			if (newrec_ptr) {
				/* Update allocated record tailer (we
                                   shortened it). */
				if (update_tailer(tdb, rec_ptr, rec) == -1)
					goto fail;

				/* Free new record */
				if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
					goto fail;
			}

			/* all done - return the new record offset */
			tdb_unlock(tdb, -1, F_WRLCK);
			return rec_ptr;
		}
		/* move to the next record */
		last_ptr = rec_ptr;
		rec_ptr = rec->next;
	}
	/* we didn't find enough space. See if we can expand the
	   database and if we can then try again */
	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
		goto again;
 fail:
	tdb_unlock(tdb, -1, F_WRLCK);
	return 0;
}
925
926/* initialise a new database with a specified hash size */
927static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
928{
929	struct tdb_header *newdb;
930	int size, ret = -1;
931
932	/* We make it up in memory, then write it out if not internal */
933	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
934	if (!(newdb = calloc(size, 1)))
935		return TDB_ERRCODE(TDB_ERR_OOM, -1);
936
937	/* Fill in the header */
938	newdb->version = TDB_VERSION;
939	newdb->hash_size = hash_size;
940#ifdef USE_SPINLOCKS
941	newdb->rwlocks = size;
942#endif
943	if (tdb->flags & TDB_INTERNAL) {
944		tdb->map_size = size;
945		tdb->map_ptr = (char *)newdb;
946		memcpy(&tdb->header, newdb, sizeof(tdb->header));
947		/* Convert the `ondisk' version if asked. */
948		CONVERT(*newdb);
949		return 0;
950	}
951	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
952		goto fail;
953
954	if (ftruncate(tdb->fd, 0) == -1)
955		goto fail;
956
957	/* This creates an endian-converted header, as if read from disk */
958	CONVERT(*newdb);
959	memcpy(&tdb->header, newdb, sizeof(tdb->header));
960	/* Don't endian-convert the magic food! */
961	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
962	if (write(tdb->fd, newdb, size) != size)
963		ret = -1;
964	else
965		ret = tdb_create_rwlocks(tdb->fd, hash_size);
966
967  fail:
968	SAFE_FREE(newdb);
969	return ret;
970}
971
972/* Returns 0 on fail.  On success, return offset of record, and fills
973   in rec */
974static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
975			struct list_struct *r)
976{
977	tdb_off rec_ptr;
978
979	/* read in the hash top */
980	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
981		return 0;
982
983	/* keep looking until we find the right record */
984	while (rec_ptr) {
985		if (rec_read(tdb, rec_ptr, r) == -1)
986			return 0;
987
988		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
989			char *k;
990			/* a very likely hit - read the key */
991			k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
992					   r->key_len);
993			if (!k)
994				return 0;
995
996			if (memcmp(key.dptr, k, key.dsize) == 0) {
997				SAFE_FREE(k);
998				return rec_ptr;
999			}
1000			SAFE_FREE(k);
1001		}
1002		rec_ptr = r->next;
1003	}
1004	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1005}
1006
1007/* If they do lockkeys, check that this hash is one they locked */
1008static int tdb_keylocked(TDB_CONTEXT *tdb, u32 hash)
1009{
1010	u32 i;
1011	if (!tdb->lockedkeys)
1012		return 1;
1013	for (i = 0; i < tdb->lockedkeys[0]; i++)
1014		if (tdb->lockedkeys[i+1] == hash)
1015			return 1;
1016	return TDB_ERRCODE(TDB_ERR_NOLOCK, 0);
1017}
1018
1019/* As tdb_find, but if you succeed, keep the lock */
1020static tdb_off tdb_find_lock(TDB_CONTEXT *tdb, TDB_DATA key, int locktype,
1021			     struct list_struct *rec)
1022{
1023	u32 hash, rec_ptr;
1024
1025	hash = tdb_hash(&key);
1026	if (!tdb_keylocked(tdb, hash))
1027		return 0;
1028	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1029		return 0;
1030	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1031		tdb_unlock(tdb, BUCKET(hash), locktype);
1032	return rec_ptr;
1033}
1034
1035enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1036{
1037	return tdb->ecode;
1038}
1039
1040static struct tdb_errname {
1041	enum TDB_ERROR ecode; const char *estring;
1042} emap[] = { {TDB_SUCCESS, "Success"},
1043	     {TDB_ERR_CORRUPT, "Corrupt database"},
1044	     {TDB_ERR_IO, "IO Error"},
1045	     {TDB_ERR_LOCK, "Locking error"},
1046	     {TDB_ERR_OOM, "Out of memory"},
1047	     {TDB_ERR_EXISTS, "Record exists"},
1048	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1049	     {TDB_ERR_NOEXIST, "Record does not exist"} };
1050
1051/* Error string for the last tdb error */
1052const char *tdb_errorstr(TDB_CONTEXT *tdb)
1053{
1054	u32 i;
1055	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1056		if (tdb->ecode == emap[i].ecode)
1057			return emap[i].estring;
1058	return "Invalid error code";
1059}
1060
1061/* update an entry in place - this only works if the new data size
1062   is <= the old data size and the key exists.
1063   on failure return -1.
1064*/
1065
1066static int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
1067{
1068	struct list_struct rec;
1069	tdb_off rec_ptr;
1070
1071	/* find entry */
1072	if (!(rec_ptr = tdb_find(tdb, key, tdb_hash(&key), &rec)))
1073		return -1;
1074
1075	/* must be long enough key, data and tailer */
1076	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1077		tdb->ecode = TDB_SUCCESS; /* Not really an error */
1078		return -1;
1079	}
1080
1081	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1082		      dbuf.dptr, dbuf.dsize) == -1)
1083		return -1;
1084
1085	if (dbuf.dsize != rec.data_len) {
1086		/* update size */
1087		rec.data_len = dbuf.dsize;
1088		return rec_write(tdb, rec_ptr, &rec);
1089	}
1090
1091	return 0;
1092}
1093
1094/* find an entry in the database given a key */
1095/* If an entry doesn't exist tdb_err will be set to
1096 * TDB_ERR_NOEXIST. If a key has no data attached
1097 * tdb_err will not be set. Both will return a
1098 * zero pptr and zero dsize.
1099 */
1100
1101TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1102{
1103	tdb_off rec_ptr;
1104	struct list_struct rec;
1105	TDB_DATA ret;
1106
1107	/* find which hash bucket it is in */
1108	if (!(rec_ptr = tdb_find_lock(tdb,key,F_RDLCK,&rec)))
1109		return tdb_null;
1110
1111	if (rec.data_len)
1112		ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1113					  rec.data_len);
1114	else
1115		ret.dptr = NULL;
1116	ret.dsize = rec.data_len;
1117	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1118	return ret;
1119}
1120
1121/* check if an entry in the database exists
1122
1123   note that 1 is returned if the key is found and 0 is returned if not found
1124   this doesn't match the conventions in the rest of this module, but is
1125   compatible with gdbm
1126*/
1127int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1128{
1129	struct list_struct rec;
1130
1131	if (tdb_find_lock(tdb, key, F_RDLCK, &rec) == 0)
1132		return 0;
1133	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1134	return 1;
1135}
1136
1137/* record lock stops delete underneath */
1138static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1139{
1140	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1141}
1142/*
1143  Write locks override our own fcntl readlocks, so check it here.
1144  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1145  an error to fail to get the lock here.
1146*/
1147
1148static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1149{
1150	struct tdb_traverse_lock *i;
1151	for (i = &tdb->travlocks; i; i = i->next)
1152		if (i->off == off)
1153			return -1;
1154	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1155}
1156
1157/*
1158  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1159  an error to fail to get the lock here.
1160*/
1161
1162static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1163{
1164	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1165}
1166/* fcntl locks don't stack: avoid unlocking someone else's */
1167static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1168{
1169	struct tdb_traverse_lock *i;
1170	u32 count = 0;
1171
1172	if (off == 0)
1173		return 0;
1174	for (i = &tdb->travlocks; i; i = i->next)
1175		if (i->off == off)
1176			count++;
1177	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1178}
1179
1180/* actually delete an entry in the database given the offset */
1181static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1182{
1183	tdb_off last_ptr, i;
1184	struct list_struct lastrec;
1185
1186	if (tdb->read_only) return -1;
1187
1188	if (write_lock_record(tdb, rec_ptr) == -1) {
1189		/* Someone traversing here: mark it as dead */
1190		rec->magic = TDB_DEAD_MAGIC;
1191		return rec_write(tdb, rec_ptr, rec);
1192	}
1193	if (write_unlock_record(tdb, rec_ptr) != 0)
1194		return -1;
1195
1196	/* find previous record in hash chain */
1197	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1198		return -1;
1199	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1200		if (rec_read(tdb, i, &lastrec) == -1)
1201			return -1;
1202
1203	/* unlink it: next ptr is at start of record. */
1204	if (last_ptr == 0)
1205		last_ptr = TDB_HASH_TOP(rec->full_hash);
1206	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1207		return -1;
1208
1209	/* recover the space */
1210	if (tdb_free(tdb, rec_ptr, rec) == -1)
1211		return -1;
1212	return 0;
1213}
1214
1215/* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1216static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1217			 struct list_struct *rec)
1218{
1219	int want_next = (tlock->off != 0);
1220
1221	/* No traversal allows if you've called tdb_lockkeys() */
1222	if (tdb->lockedkeys)
1223		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1224
1225	/* Lock each chain from the start one. */
1226	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1227		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1228			return -1;
1229
1230		/* No previous record?  Start at top of chain. */
1231		if (!tlock->off) {
1232			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1233				     &tlock->off) == -1)
1234				goto fail;
1235		} else {
1236			/* Otherwise unlock the previous record. */
1237			if (unlock_record(tdb, tlock->off) != 0)
1238				goto fail;
1239		}
1240
1241		if (want_next) {
1242			/* We have offset of old record: grab next */
1243			if (rec_read(tdb, tlock->off, rec) == -1)
1244				goto fail;
1245			tlock->off = rec->next;
1246		}
1247
1248		/* Iterate through chain */
1249		while( tlock->off) {
1250			tdb_off current;
1251			if (rec_read(tdb, tlock->off, rec) == -1)
1252				goto fail;
1253			if (!TDB_DEAD(rec)) {
1254				/* Woohoo: we found one! */
1255				if (lock_record(tdb, tlock->off) != 0)
1256					goto fail;
1257				return tlock->off;
1258			}
1259			/* Try to clean dead ones from old traverses */
1260			current = tlock->off;
1261			tlock->off = rec->next;
1262			if (do_delete(tdb, current, rec) != 0)
1263				goto fail;
1264		}
1265		tdb_unlock(tdb, tlock->hash, F_WRLCK);
1266		want_next = 0;
1267	}
1268	/* We finished iteration without finding anything */
1269	return TDB_ERRCODE(TDB_SUCCESS, 0);
1270
1271 fail:
1272	tlock->off = 0;
1273	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1274		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1275	return -1;
1276}
1277
1278/* traverse the entire database - calling fn(tdb, key, data) on each element.
1279   return -1 on error or the record count traversed
1280   if fn is NULL then it is not called
1281   a non-zero return value from fn() indicates that the traversal should stop
1282  */
1283int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *state)
1284{
1285	TDB_DATA key, dbuf;
1286	struct list_struct rec;
1287	struct tdb_traverse_lock tl = { NULL, 0, 0 };
1288	int ret, count = 0;
1289
1290	/* This was in the initializaton, above, but the IRIX compiler
1291	 * did not like it.  crh
1292	 */
1293	tl.next = tdb->travlocks.next;
1294
1295	/* fcntl locks don't stack: beware traverse inside traverse */
1296	tdb->travlocks.next = &tl;
1297
1298	/* tdb_next_lock places locks on the record returned, and its chain */
1299	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1300		count++;
1301		/* now read the full record */
1302		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1303					  rec.key_len + rec.data_len);
1304		if (!key.dptr) {
1305			ret = -1;
1306			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1307				goto out;
1308			if (unlock_record(tdb, tl.off) != 0)
1309				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1310			goto out;
1311		}
1312		key.dsize = rec.key_len;
1313		dbuf.dptr = key.dptr + rec.key_len;
1314		dbuf.dsize = rec.data_len;
1315
1316		/* Drop chain lock, call out */
1317		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1318			ret = -1;
1319			goto out;
1320		}
1321		if (fn && fn(tdb, key, dbuf, state)) {
1322			/* They want us to terminate traversal */
1323			ret = count;
1324			if (unlock_record(tdb, tl.off) != 0) {
1325				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1326				ret = -1;
1327			}
1328			tdb->travlocks.next = tl.next;
1329			SAFE_FREE(key.dptr);
1330			return count;
1331		}
1332		SAFE_FREE(key.dptr);
1333	}
1334out:
1335	tdb->travlocks.next = tl.next;
1336	if (ret < 0)
1337		return -1;
1338	else
1339		return count;
1340}
1341
1342/* find the first entry in the database and return its key */
1343TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1344{
1345	TDB_DATA key;
1346	struct list_struct rec;
1347
1348	/* release any old lock */
1349	if (unlock_record(tdb, tdb->travlocks.off) != 0)
1350		return tdb_null;
1351	tdb->travlocks.off = tdb->travlocks.hash = 0;
1352
1353	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1354		return tdb_null;
1355	/* now read the key */
1356	key.dsize = rec.key_len;
1357	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1358	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1359		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1360	return key;
1361}
1362
1363/* find the next entry in the database, returning its key */
1364TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1365{
1366	u32 oldhash;
1367	TDB_DATA key = tdb_null;
1368	struct list_struct rec;
1369	char *k = NULL;
1370
1371	/* Is locked key the old key?  If so, traverse will be reliable. */
1372	if (tdb->travlocks.off) {
1373		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1374			return tdb_null;
1375		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1376		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1377					    rec.key_len))
1378		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1379			/* No, it wasn't: unlock it and start from scratch */
1380			if (unlock_record(tdb, tdb->travlocks.off) != 0)
1381				return tdb_null;
1382			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1383				return tdb_null;
1384			tdb->travlocks.off = 0;
1385		}
1386
1387		SAFE_FREE(k);
1388	}
1389
1390	if (!tdb->travlocks.off) {
1391		/* No previous element: do normal find, and lock record */
1392		tdb->travlocks.off = tdb_find_lock(tdb, oldkey, F_WRLCK, &rec);
1393		if (!tdb->travlocks.off)
1394			return tdb_null;
1395		tdb->travlocks.hash = BUCKET(rec.full_hash);
1396		if (lock_record(tdb, tdb->travlocks.off) != 0) {
1397			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1398			return tdb_null;
1399		}
1400	}
1401	oldhash = tdb->travlocks.hash;
1402
1403	/* Grab next record: locks chain and returned record,
1404	   unlocks old record */
1405	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1406		key.dsize = rec.key_len;
1407		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1408					  key.dsize);
1409		/* Unlock the chain of this new record */
1410		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1411			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1412	}
1413	/* Unlock the chain of old record */
1414	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1415		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1416	return key;
1417}
1418
1419/* delete an entry in the database given a key */
1420int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1421{
1422	tdb_off rec_ptr;
1423	struct list_struct rec;
1424	int ret;
1425
1426	if (!(rec_ptr = tdb_find_lock(tdb, key, F_WRLCK, &rec)))
1427		return -1;
1428	ret = do_delete(tdb, rec_ptr, &rec);
1429	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1430		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1431	return ret;
1432}
1433
1434/* store an element in the database, replacing any existing element
1435   with the same key
1436
1437   return 0 on success, -1 on failure
1438*/
1439int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1440{
1441	struct list_struct rec;
1442	u32 hash;
1443	tdb_off rec_ptr;
1444	char *p = NULL;
1445	int ret = 0;
1446
1447	/* find which hash bucket it is in */
1448	hash = tdb_hash(&key);
1449	if (!tdb_keylocked(tdb, hash))
1450		return -1;
1451	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1452		return -1;
1453
1454	/* check for it existing, on insert. */
1455	if (flag == TDB_INSERT) {
1456		if (tdb_exists(tdb, key)) {
1457			tdb->ecode = TDB_ERR_EXISTS;
1458			goto fail;
1459		}
1460	} else {
1461		/* first try in-place update, on modify or replace. */
1462		if (tdb_update(tdb, key, dbuf) == 0)
1463			goto out;
1464		if (flag == TDB_MODIFY && tdb->ecode == TDB_ERR_NOEXIST)
1465			goto fail;
1466	}
1467	/* reset the error code potentially set by the tdb_update() */
1468	tdb->ecode = TDB_SUCCESS;
1469
1470	/* delete any existing record - if it doesn't exist we don't
1471           care.  Doing this first reduces fragmentation, and avoids
1472           coalescing with `allocated' block before it's updated. */
1473	if (flag != TDB_INSERT)
1474		tdb_delete(tdb, key);
1475
1476	/* Copy key+value *before* allocating free space in case malloc
1477	   fails and we are left with a dead spot in the tdb. */
1478
1479	if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1480		tdb->ecode = TDB_ERR_OOM;
1481		goto fail;
1482	}
1483
1484	memcpy(p, key.dptr, key.dsize);
1485	if (dbuf.dsize)
1486		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1487
1488	/* now we're into insert / modify / replace of a record which
1489	 * we know could not be optimised by an in-place store (for
1490	 * various reasons).  */
1491	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1492		goto fail;
1493
1494	/* Read hash top into next ptr */
1495	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1496		goto fail;
1497
1498	rec.key_len = key.dsize;
1499	rec.data_len = dbuf.dsize;
1500	rec.full_hash = hash;
1501	rec.magic = TDB_MAGIC;
1502
1503	/* write out and point the top of the hash chain at it */
1504	if (rec_write(tdb, rec_ptr, &rec) == -1
1505	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1506	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1507		/* Need to tdb_unallocate() here */
1508		goto fail;
1509	}
1510 out:
1511	SAFE_FREE(p);
1512	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1513	return ret;
1514fail:
1515	ret = -1;
1516	goto out;
1517}
1518
1519/* Attempt to append data to an entry in place - this only works if the new data size
1520   is <= the old data size and the key exists.
1521   on failure return -1. Record must be locked before calling.
1522*/
1523static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1524{
1525	struct list_struct rec;
1526	tdb_off rec_ptr;
1527
1528	/* find entry */
1529	if (!(rec_ptr = tdb_find(tdb, key, tdb_hash(&key), &rec)))
1530		return -1;
1531
1532	/* Append of 0 is always ok. */
1533	if (new_dbuf.dsize == 0)
1534		return 0;
1535
1536	/* must be long enough for key, old data + new data and tailer */
1537	if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1538		/* No room. */
1539		tdb->ecode = TDB_SUCCESS; /* Not really an error */
1540		return -1;
1541	}
1542
1543	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1544		      new_dbuf.dptr, new_dbuf.dsize) == -1)
1545		return -1;
1546
1547	/* update size */
1548	rec.data_len += new_dbuf.dsize;
1549	return rec_write(tdb, rec_ptr, &rec);
1550}
1551
1552/* Append to an entry. Create if not exist. */
1553
1554int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1555{
1556	struct list_struct rec;
1557	u32 hash;
1558	tdb_off rec_ptr;
1559	char *p = NULL;
1560	int ret = 0;
1561	size_t new_data_size = 0;
1562
1563	/* find which hash bucket it is in */
1564	hash = tdb_hash(&key);
1565	if (!tdb_keylocked(tdb, hash))
1566		return -1;
1567	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1568		return -1;
1569
1570	/* first try in-place. */
1571	if (tdb_append_inplace(tdb, key, new_dbuf) == 0)
1572		goto out;
1573
1574	/* reset the error code potentially set by the tdb_append_inplace() */
1575	tdb->ecode = TDB_SUCCESS;
1576
1577	/* find entry */
1578	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1579		if (tdb->ecode != TDB_ERR_NOEXIST)
1580			goto fail;
1581
1582		/* Not found - create. */
1583
1584		ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1585		goto out;
1586	}
1587
1588	new_data_size = rec.data_len + new_dbuf.dsize;
1589
1590	/* Copy key+old_value+value *before* allocating free space in case malloc
1591	   fails and we are left with a dead spot in the tdb. */
1592
1593	if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1594		tdb->ecode = TDB_ERR_OOM;
1595		goto fail;
1596	}
1597
1598	/* Copy the key in place. */
1599	memcpy(p, key.dptr, key.dsize);
1600
1601	/* Now read the old data into place. */
1602	if (rec.data_len &&
1603		tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1604			goto fail;
1605
1606	/* Finally append the new data. */
1607	if (new_dbuf.dsize)
1608		memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1609
1610	/* delete any existing record - if it doesn't exist we don't
1611           care.  Doing this first reduces fragmentation, and avoids
1612           coalescing with `allocated' block before it's updated. */
1613
1614	tdb_delete(tdb, key);
1615
1616	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1617		goto fail;
1618
1619	/* Read hash top into next ptr */
1620	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1621		goto fail;
1622
1623	rec.key_len = key.dsize;
1624	rec.data_len = new_data_size;
1625	rec.full_hash = hash;
1626	rec.magic = TDB_MAGIC;
1627
1628	/* write out and point the top of the hash chain at it */
1629	if (rec_write(tdb, rec_ptr, &rec) == -1
1630	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1631	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1632		/* Need to tdb_unallocate() here */
1633		goto fail;
1634	}
1635
1636 out:
1637	SAFE_FREE(p);
1638	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1639	return ret;
1640
1641fail:
1642	ret = -1;
1643	goto out;
1644}
1645
1646static int tdb_already_open(dev_t device,
1647			    ino_t ino)
1648{
1649	TDB_CONTEXT *i;
1650
1651	for (i = tdbs; i; i = i->next) {
1652		if (i->device == device && i->inode == ino) {
1653			return 1;
1654		}
1655	}
1656
1657	return 0;
1658}
1659
1660/* open the database, creating it if necessary
1661
1662   The open_flags and mode are passed straight to the open call on the
1663   database file. A flags value of O_WRONLY is invalid. The hash size
1664   is advisory, use zero for a default value.
1665
1666   Return is NULL on error, in which case errno is also set.  Don't
1667   try to call tdb_error or tdb_errname, just do strerror(errno).
1668
1669   @param name may be NULL for internal databases. */
1670TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1671		      int open_flags, mode_t mode)
1672{
1673	return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL);
1674}
1675
1676
1677TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1678			 int open_flags, mode_t mode,
1679			 tdb_log_func log_fn)
1680{
1681	TDB_CONTEXT *tdb;
1682	struct stat st;
1683	int rev = 0, locked;
1684	unsigned char *vp;
1685	u32 vertest;
1686
1687	if (!(tdb = calloc(1, sizeof *tdb))) {
1688		/* Can't log this */
1689		errno = ENOMEM;
1690		goto fail;
1691	}
1692	tdb->fd = -1;
1693	tdb->name = NULL;
1694	tdb->map_ptr = NULL;
1695	tdb->lockedkeys = NULL;
1696	tdb->flags = tdb_flags;
1697	tdb->open_flags = open_flags;
1698	tdb->log_fn = log_fn;
1699
1700	if ((open_flags & O_ACCMODE) == O_WRONLY) {
1701		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1702			 name));
1703		errno = EINVAL;
1704		goto fail;
1705	}
1706
1707	if (hash_size == 0)
1708		hash_size = DEFAULT_HASH_SIZE;
1709	if ((open_flags & O_ACCMODE) == O_RDONLY) {
1710		tdb->read_only = 1;
1711		/* read only databases don't do locking or clear if first */
1712		tdb->flags |= TDB_NOLOCK;
1713		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1714	}
1715
1716	/* internal databases don't mmap or lock, and start off cleared */
1717	if (tdb->flags & TDB_INTERNAL) {
1718		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1719		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1720		if (tdb_new_database(tdb, hash_size) != 0) {
1721			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1722			goto fail;
1723		}
1724		goto internal;
1725	}
1726
1727	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1728		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1729			 name, strerror(errno)));
1730		goto fail;	/* errno set by open(2) */
1731	}
1732
1733	/* ensure there is only one process initialising at once */
1734	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1735		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1736			 name, strerror(errno)));
1737		goto fail;	/* errno set by tdb_brlock */
1738	}
1739
1740	/* we need to zero database if we are the only one with it open */
1741	if ((locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))
1742	    && (tdb_flags & TDB_CLEAR_IF_FIRST)) {
1743		open_flags |= O_CREAT;
1744		if (ftruncate(tdb->fd, 0) == -1) {
1745			TDB_LOG((tdb, 0, "tdb_open_ex: "
1746				 "failed to truncate %s: %s\n",
1747				 name, strerror(errno)));
1748			goto fail; /* errno set by ftruncate */
1749		}
1750	}
1751
1752	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1753	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1754	    || (tdb->header.version != TDB_VERSION
1755		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1756		/* its not a valid database - possibly initialise it */
1757		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1758			errno = EIO; /* ie bad format or something */
1759			goto fail;
1760		}
1761		rev = (tdb->flags & TDB_CONVERT);
1762	}
1763	vp = (unsigned char *)&tdb->header.version;
1764	vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1765		  (((u32)vp[2]) << 8) | (u32)vp[3];
1766	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1767	if (!rev)
1768		tdb->flags &= ~TDB_CONVERT;
1769	else {
1770		tdb->flags |= TDB_CONVERT;
1771		convert(&tdb->header, sizeof(tdb->header));
1772	}
1773	if (fstat(tdb->fd, &st) == -1)
1774		goto fail;
1775
1776	/* Is it already in the open list?  If so, fail. */
1777	if (tdb_already_open(st.st_dev, st.st_ino)) {
1778		TDB_LOG((tdb, 2, "tdb_open_ex: "
1779			 "%s (%d,%d) is already open in this process\n",
1780			 name, st.st_dev, st.st_ino));
1781		errno = EBUSY;
1782		goto fail;
1783	}
1784
1785	if (!(tdb->name = (char *)strdup(name))) {
1786		errno = ENOMEM;
1787		goto fail;
1788	}
1789
1790	tdb->map_size = st.st_size;
1791	tdb->device = st.st_dev;
1792	tdb->inode = st.st_ino;
1793	tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1794	if (!tdb->locked) {
1795		TDB_LOG((tdb, 2, "tdb_open_ex: "
1796			 "failed to allocate lock structure for %s\n",
1797			 name));
1798		errno = ENOMEM;
1799		goto fail;
1800	}
1801	tdb_mmap(tdb);
1802	if (locked) {
1803		if (!tdb->read_only)
1804			if (tdb_clear_spinlocks(tdb) != 0) {
1805				TDB_LOG((tdb, 0, "tdb_open_ex: "
1806				"failed to clear spinlock\n"));
1807				goto fail;
1808			}
1809		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1810			TDB_LOG((tdb, 0, "tdb_open_ex: "
1811				 "failed to take ACTIVE_LOCK on %s: %s\n",
1812				 name, strerror(errno)));
1813			goto fail;
1814		}
1815	}
1816	/* leave this lock in place to indicate it's in use */
1817	if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1818		goto fail;
1819
1820 internal:
1821	/* Internal (memory-only) databases skip all the code above to
1822	 * do with disk files, and resume here by releasing their
1823	 * global lock and hooking into the active list. */
1824	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1825		goto fail;
1826	tdb->next = tdbs;
1827	tdbs = tdb;
1828	return tdb;
1829
1830 fail:
1831	{ int save_errno = errno;
1832
1833	if (!tdb)
1834		return NULL;
1835
1836	if (tdb->map_ptr) {
1837		if (tdb->flags & TDB_INTERNAL)
1838			SAFE_FREE(tdb->map_ptr);
1839		else
1840			tdb_munmap(tdb);
1841	}
1842	SAFE_FREE(tdb->name);
1843	if (tdb->fd != -1)
1844		if (close(tdb->fd) != 0)
1845			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1846	SAFE_FREE(tdb->locked);
1847	SAFE_FREE(tdb);
1848	errno = save_errno;
1849	return NULL;
1850	}
1851}
1852
1853/**
1854 * Close a database.
1855 *
1856 * @returns -1 for error; 0 for success.
1857 **/
1858int tdb_close(TDB_CONTEXT *tdb)
1859{
1860	TDB_CONTEXT **i;
1861	int ret = 0;
1862
1863	if (tdb->map_ptr) {
1864		if (tdb->flags & TDB_INTERNAL)
1865			SAFE_FREE(tdb->map_ptr);
1866		else
1867			tdb_munmap(tdb);
1868	}
1869	SAFE_FREE(tdb->name);
1870	if (tdb->fd != -1)
1871		ret = close(tdb->fd);
1872	SAFE_FREE(tdb->locked);
1873	SAFE_FREE(tdb->lockedkeys);
1874
1875	/* Remove from contexts list */
1876	for (i = &tdbs; *i; i = &(*i)->next) {
1877		if (*i == tdb) {
1878			*i = tdb->next;
1879			break;
1880		}
1881	}
1882
1883	memset(tdb, 0, sizeof(*tdb));
1884	SAFE_FREE(tdb);
1885
1886	return ret;
1887}
1888
1889/* lock/unlock entire database */
1890int tdb_lockall(TDB_CONTEXT *tdb)
1891{
1892	u32 i;
1893
1894	/* There are no locks on read-only dbs */
1895	if (tdb->read_only)
1896		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1897	if (tdb->lockedkeys)
1898		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1899	for (i = 0; i < tdb->header.hash_size; i++)
1900		if (tdb_lock(tdb, i, F_WRLCK))
1901			break;
1902
1903	/* If error, release locks we have... */
1904	if (i < tdb->header.hash_size) {
1905		u32 j;
1906
1907		for ( j = 0; j < i; j++)
1908			tdb_unlock(tdb, j, F_WRLCK);
1909		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1910	}
1911
1912	return 0;
1913}
1914void tdb_unlockall(TDB_CONTEXT *tdb)
1915{
1916	u32 i;
1917	for (i=0; i < tdb->header.hash_size; i++)
1918		tdb_unlock(tdb, i, F_WRLCK);
1919}
1920
1921int tdb_lockkeys(TDB_CONTEXT *tdb, u32 number, TDB_DATA keys[])
1922{
1923	u32 i, j, hash;
1924
1925	/* Can't lock more keys if already locked */
1926	if (tdb->lockedkeys)
1927		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1928	if (!(tdb->lockedkeys = malloc(sizeof(u32) * (number+1))))
1929		return TDB_ERRCODE(TDB_ERR_OOM, -1);
1930	/* First number in array is # keys */
1931	tdb->lockedkeys[0] = number;
1932
1933	/* Insertion sort by bucket */
1934	for (i = 0; i < number; i++) {
1935		hash = tdb_hash(&keys[i]);
1936		for (j = 0; j < i && BUCKET(tdb->lockedkeys[j+1]) < BUCKET(hash); j++);
1937			memmove(&tdb->lockedkeys[j+2], &tdb->lockedkeys[j+1], sizeof(u32) * (i-j));
1938		tdb->lockedkeys[j+1] = hash;
1939	}
1940	/* Finally, lock in order */
1941	for (i = 0; i < number; i++)
1942		if (tdb_lock(tdb, i, F_WRLCK))
1943			break;
1944
1945	/* If error, release locks we have... */
1946	if (i < number) {
1947		for ( j = 0; j < i; j++)
1948			tdb_unlock(tdb, j, F_WRLCK);
1949		SAFE_FREE(tdb->lockedkeys);
1950		return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1951	}
1952	return 0;
1953}
1954
1955/* Unlock the keys previously locked by tdb_lockkeys() */
1956void tdb_unlockkeys(TDB_CONTEXT *tdb)
1957{
1958	u32 i;
1959	if (!tdb->lockedkeys)
1960		return;
1961	for (i = 0; i < tdb->lockedkeys[0]; i++)
1962		tdb_unlock(tdb, tdb->lockedkeys[i+1], F_WRLCK);
1963	SAFE_FREE(tdb->lockedkeys);
1964}
1965
1966/* lock/unlock one hash chain. This is meant to be used to reduce
1967   contention - it cannot guarantee how many records will be locked */
1968int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1969{
1970	return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1971}
1972
1973int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1974{
1975	return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1976}
1977
#if 0
/* Compiled out: read-lock the hash chain that `key' belongs to. */
static int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
{
	u32 hash = tdb_hash(&key);

	return tdb_lock(tdb, BUCKET(hash), F_RDLCK);
}
#endif
1984
#if 0
/* Compiled out: release the read lock on the chain `key' belongs to. */
static int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
{
	u32 hash = tdb_hash(&key);

	return tdb_unlock(tdb, BUCKET(hash), F_RDLCK);
}
#endif
1991
1992/* register a loging function */
1993void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1994{
1995	tdb->log_fn = fn;
1996}
1997
1998
1999/* reopen a tdb - this is used after a fork to ensure that we have an independent
2000   seek pointer from our parent and to re-establish locks */
2001int tdb_reopen(TDB_CONTEXT *tdb)
2002{
2003	struct stat st;
2004
2005	if (tdb_munmap(tdb) != 0) {
2006		TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
2007		goto fail;
2008	}
2009	if (close(tdb->fd) != 0)
2010		TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
2011	tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
2012	if (tdb->fd == -1) {
2013		TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
2014		goto fail;
2015	}
2016	if (fstat(tdb->fd, &st) != 0) {
2017		TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
2018		goto fail;
2019	}
2020	if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
2021		TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
2022		goto fail;
2023	}
2024	tdb_mmap(tdb);
2025	if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1) {
2026		TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
2027		goto fail;
2028	}
2029
2030	return 0;
2031
2032fail:
2033	tdb_close(tdb);
2034	return -1;
2035}
2036
2037/* reopen all tdb's */
2038int tdb_reopen_all(void)
2039{
2040	TDB_CONTEXT *tdb;
2041
2042	for (tdb=tdbs; tdb; tdb = tdb->next) {
2043		if (tdb_reopen(tdb) != 0) return -1;
2044	}
2045
2046	return 0;
2047}
2048