1/* udb.c - u(micro) data base.
2 * By W.C.A. Wijngaards
3 * Copyright 2010, NLnet Labs.
4 * BSD, see LICENSE.
5 */
6#include "config.h"
7#include "udb.h"
8#include <string.h>
9#include <errno.h>
10#include <stdio.h>
11#include <unistd.h>
12#include <assert.h>
13#include "lookup3.h"
14#include "util.h"
15
16/* mmap and friends */
17#include <sys/types.h>
18#include <sys/stat.h>
19#include <fcntl.h>
20#include <sys/mman.h>
21
/* portable definitions for systems that lack them: MAP_FAILED is -1 and
 * MS_SYNC falls back to a no-op flag */
23#ifndef MAP_FAILED
24#define MAP_FAILED ((void*)-1)
25#endif
26#ifndef MS_SYNC
27#define MS_SYNC 0
28#endif
29
30/** move and fixup xl segment */
31static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
32	udb_void n, uint64_t sz, uint64_t startseg);
33/** attempt to compact the data and move free space to the end */
34static int udb_alloc_compact(void* base, udb_alloc* alloc);
35
36/** convert pointer to the data part to a pointer to the base of the chunk */
37static udb_void
38chunk_from_dataptr(udb_void data)
39{
	/* we use the fact that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d)
	 * and that xl chunks are aligned on UDB_ALLOC_CHUNK_SIZE boundaries. */
42	udb_void xl = data - sizeof(udb_xl_chunk_d);
43	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
44		return xl;
45	return data - sizeof(udb_chunk_d);
46}
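
/* Illustration of the discrimination above: a normal chunk's data pointer is
 * chunk+sizeof(udb_chunk_d), an XL chunk's data pointer is
 * chunk+sizeof(udb_xl_chunk_d).  XL chunks are aligned to
 * UDB_ALLOC_CHUNK_SIZE, so subtracting sizeof(udb_xl_chunk_d) from an XL
 * data pointer lands exactly on that aligned chunk start, while for a
 * normal data pointer it does not; that is how the two cases are told
 * apart. */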
47
48udb_void chunk_from_dataptr_ext(udb_void data) {
49	return chunk_from_dataptr(data);
50}
51
52#ifndef NDEBUG
53/** read last octet from a chunk */
54static uint8_t
55chunk_get_last(void* base, udb_void chunk, int exp)
56{
	return *((uint8_t*)UDB_REL(base, chunk+((uint64_t)1<<exp)-1));
58}
59#endif
60
61/** write last octet of a chunk */
62static void
63chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
64{
65	assert(exp >= 0 && exp <= 63);
66	*((uint8_t*)UDB_REL(base, chunk+((uint64_t)1<<exp)-1)) = value;
67}
68
69/** create udb_base from a file descriptor (must be at start of file) */
70udb_base*
71udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
72	void* arg)
73{
74	uint64_t m, fsz;
75	udb_glob_d g;
76	ssize_t r;
77	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
78	if(!udb) {
79		log_msg(LOG_ERR, "out of memory");
80		close(fd);
81		return NULL;
82	}
83	udb->fname = strdup(fname);
84	if(!udb->fname) {
85		log_msg(LOG_ERR, "out of memory");
86		free(udb);
87		close(fd);
88		return NULL;
89	}
90	udb->walkfunc = walkfunc;
91	udb->walkarg = arg;
92	udb->fd = fd;
93	udb->ram_size = 1024;
94	udb->ram_mask = (int)udb->ram_size - 1;
95	udb->ram_hash = (udb_ptr**)xalloc_array_zero(sizeof(udb_ptr*),
96		udb->ram_size);
97	if(!udb->ram_hash) {
98		free(udb->fname);
99		free(udb);
100		log_msg(LOG_ERR, "out of memory");
101		close(fd);
102		return NULL;
103	}
104
105	/* read magic */
106	if((r=read(fd, &m, sizeof(m))) == -1) {
107		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
108		goto fail;
109	} else if(r != (ssize_t)sizeof(m)) {
110		log_msg(LOG_ERR, "%s: file too short", fname);
111		goto fail;
112	}
	/* TODO: handle a big-endian/little-endian mismatch between the file
	 * and the host, see the magic number */
114	if(m != UDB_MAGIC) {
115		log_msg(LOG_ERR, "%s: wrong type of file", fname);
116		goto fail;
117	}
118	/* read header */
119	if((r=read(fd, &g, sizeof(g))) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
121		goto fail;
122	} else if(r != (ssize_t)sizeof(g)) {
123		log_msg(LOG_ERR, "%s: file too short", fname);
124		goto fail;
125	}
126	if(g.version != 0) {
127		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
128			(int)g.version);
129		goto fail;
130	}
131	if(g.hsize < UDB_HEADER_SIZE) {
132		log_msg(LOG_ERR, "%s: header size too small %d", fname,
133			(int)g.hsize);
134		goto fail;
135	}
136	if(g.hsize > UDB_HEADER_SIZE) {
137		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
138			(int)g.hsize);
139		goto fail;
140	}
141	if(g.clean_close != 1) {
142		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
143			(int)g.clean_close);
144		goto fail;
145	}
146	if(g.dirty_alloc != 0) {
147		log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname,
148			(int)g.dirty_alloc);
149		goto fail;
150	}
151
	/* check that the file size was written correctly, guards against
	 * the 4.0.2 nsd.db failure */
153	fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END);
154	(void)lseek(fd, (off_t)0, SEEK_SET);
155	if(g.fsize != fsz) {
156		log_msg(LOG_WARNING, "%s: file size %llu but mmap header "
157			"has size %llu", fname, (unsigned long long)fsz,
158			(unsigned long long)g.fsize);
159		goto fail;
160	}
161
162	/* mmap it */
163	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
164		log_msg(LOG_ERR, "%s: file too short", fname);
165		goto fail;
166	}
167	if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ {
168		log_msg(LOG_WARNING, "%s: file size too large %llu",
169			fname, (unsigned long long)g.fsize);
170		goto fail;
171	}
172	udb->base_size = (size_t)g.fsize;
173#ifdef HAVE_MMAP
174	/* note the size_t casts must be there for portability, on some
175	 * systems the layout of memory is otherwise broken. */
176	udb->base = mmap(NULL, (size_t)udb->base_size,
177		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
178		(int)udb->fd, (off_t)0);
179#else
180	udb->base = MAP_FAILED; errno = ENOSYS;
181#endif
182	if(udb->base == MAP_FAILED) {
183		udb->base = NULL;
184		log_msg(LOG_ERR, "mmap(size %u) error: %s",
185			(unsigned)udb->base_size, strerror(errno));
186	fail:
187		close(fd);
188		free(udb->fname);
189		free(udb->ram_hash);
190		free(udb);
191		return NULL;
192	}
193
194	/* init completion */
195	udb->glob_data = (udb_glob_d*)((char*)udb->base+sizeof(uint64_t));
196	r = 0;
	/* cannot be dirty here, a dirty header already hit the goto fail
	 * above */
198	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
199		r = 1;
200	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
201		(char*)udb->glob_data+sizeof(*udb->glob_data)));
202	if(!udb->alloc) {
203		log_msg(LOG_ERR, "out of memory");
204		udb_base_free(udb);
205		return NULL;
206	}
207	if(r) {
208		/* and compact now, or resume compacting */
209		udb_alloc_compact(udb, udb->alloc);
210		udb_base_sync(udb, 1);
211	}
212	udb->glob_data->clean_close = 0;
213
214	return udb;
215}
216
217udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
218	void* arg)
219{
220	int fd = open(fname, O_RDWR);
221	if(fd == -1) {
222		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
223		return NULL;
224	}
225	return udb_base_create_fd(fname, fd, walkfunc, arg);
226}
227
228/** init new udb_global structure */
229static void udb_glob_init_new(udb_glob_d* g)
230{
231	memset(g, 0, sizeof(*g));
232	g->hsize = UDB_HEADER_SIZE;
233	g->fsize = UDB_HEADER_SIZE;
234}
235
236/** write data to file and check result */
237static int
238write_fdata(const char* fname, int fd, void* data, size_t len)
239{
240	ssize_t w;
241	if((w=write(fd, data, len)) == -1) {
242		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
243		close(fd);
244		return 0;
245	} else if(w != (ssize_t)len) {
246		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
247		close(fd);
248		return 0;
249	}
250	return 1;
251}
252
253udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
254	void* arg)
255{
256	uint64_t m;
257	udb_glob_d g;
258	udb_alloc_d a;
259	uint64_t endsize = UDB_HEADER_SIZE;
260	uint64_t endexp = 0;
261	int fd = open(fname, O_CREAT|O_RDWR, 0600);
262	if(fd == -1) {
263		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
264		return NULL;
265	}
266	m = UDB_MAGIC;
267	udb_glob_init_new(&g);
268	udb_alloc_init_new(&a);
269	g.clean_close = 1;
270
271	/* write new data to file (closes fd on error) */
272	if(!write_fdata(fname, fd, &m, sizeof(m)))
273		return NULL;
274	if(!write_fdata(fname, fd, &g, sizeof(g)))
275		return NULL;
276	if(!write_fdata(fname, fd, &a, sizeof(a)))
277		return NULL;
278	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
279		return NULL;
280	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
281		return NULL;
282	/* rewind to start */
283	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
284		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
285		close(fd);
286		return NULL;
287	}
288	/* truncate to the right size */
289	if(ftruncate(fd, (off_t)g.fsize) < 0) {
290		log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname,
291			(int)g.fsize, strerror(errno));
292		close(fd);
293		return NULL;
294	}
295	return udb_base_create_fd(fname, fd, walkfunc, arg);
296}
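
/* Illustrative usage sketch (hypothetical caller, error handling omitted):
 *
 *	udb_base* u = udb_base_create_new("example.udb", my_walkfunc, NULL);
 *	udb_void d = udb_alloc_space(u->alloc, 100);
 *	...
 *	udb_base_sync(u, 1);
 *	udb_base_free(u);	(also closes the file cleanly)
 *
 * my_walkfunc is an assumed application callback that visits every
 * udb_rel_ptr inside a stored chunk; fsck and compaction use it to fix
 * up pointers when chunks move. */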
297
298/** shrink the udb base if it has unused space at the end */
299static void
300udb_base_shrink(udb_base* udb, uint64_t nsize)
301{
302	udb->glob_data->dirty_alloc = udb_dirty_fsize;
303	udb->glob_data->fsize = nsize;
304	/* sync, does not *seem* to be required on Linux, but it is
305	   certainly required on OpenBSD.  Otherwise changed data is lost. */
306#ifdef HAVE_MMAP
307	msync(udb->base, udb->base_size, MS_ASYNC);
308#endif
309	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
310		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
311			(unsigned)nsize, strerror(errno));
312	}
313	udb->glob_data->dirty_alloc = udb_dirty_clean;
314}
315
316void udb_base_close(udb_base* udb)
317{
318	if(!udb)
319		return;
320	if(udb->fd != -1 && udb->base && udb->alloc) {
321		uint64_t nsize = udb->alloc->disk->nextgrow;
322		if(nsize < udb->base_size)
323			udb_base_shrink(udb, nsize);
324	}
325	if(udb->fd != -1) {
326		udb->glob_data->clean_close = 1;
327		close(udb->fd);
328		udb->fd = -1;
329	}
330	if(udb->base) {
331#ifdef HAVE_MMAP
332		if(munmap(udb->base, udb->base_size) == -1) {
333			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
334		}
335#endif
336		udb->base = NULL;
337	}
338}
339
340void udb_base_free(udb_base* udb)
341{
342	if(!udb)
343		return;
344	udb_base_close(udb);
345	udb_alloc_delete(udb->alloc);
346	free(udb->ram_hash);
347	free(udb->fname);
348	free(udb);
349}
350
351void udb_base_free_keep_mmap(udb_base* udb)
352{
353	if(!udb) return;
354	if(udb->fd != -1) {
355		close(udb->fd);
356		udb->fd = -1;
357	}
358	udb->base = NULL;
359	udb_alloc_delete(udb->alloc);
360	free(udb->ram_hash);
361	free(udb->fname);
362	free(udb);
363}
364
365void udb_base_sync(udb_base* udb, int wait)
366{
367	if(!udb) return;
368#ifdef HAVE_MMAP
369	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
370		log_msg(LOG_ERR, "msync(%s) error %s",
371			udb->fname, strerror(errno));
372	}
373#else
374	(void)wait;
375#endif
376}
377
378/** hash a chunk pointer */
379static uint32_t
380chunk_hash_ptr(udb_void p)
381{
382	/* put p into an array of uint32 */
383	uint32_t h[sizeof(p)/sizeof(uint32_t)];
384	memcpy(&h, &p, sizeof(h));
385	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
386}
387
388/** check that the given pointer is on the bucket for the given offset */
389int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
390{
391	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
392	udb_ptr* p;
393	assert((size_t)i < udb->ram_size);
394	for(p = udb->ram_hash[i]; p; p=p->next) {
395		if(p == ptr)
396			return 1;
397	}
398	return 0;
399}
400
401/** grow the ram array */
402static void
403grow_ram_hash(udb_base* udb, udb_ptr** newhash)
404{
405	size_t i;
406	size_t osize= udb->ram_size;
407	udb_ptr* p, *np;
408	udb_ptr** oldhash = udb->ram_hash;
409	udb->ram_size *= 2;
410	udb->ram_mask <<= 1;
411	udb->ram_mask |= 1;
412	udb->ram_hash = newhash;
413	/* have to link in every element in the old list into the new list*/
414	for(i=0; i<osize; i++) {
415		p = oldhash[i];
416		while(p) {
417			np = p->next;
418			/* link into newhash */
419			p->prev=NULL;
420			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
421			if(p->next) p->next->prev = p;
422			/* go to next element of oldhash */
423			p = np;
424		}
425	}
426	free(oldhash);
427}
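
/* For example (illustrative sizes): with ram_size 1024 the mask is 0x3ff;
 * after one grow ram_size is 2048 and the mask becomes 0x7ff, so every
 * element must be rehashed into the doubled table, which the loop above
 * does by relinking each element under the new mask. */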
428
429void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
430{
431	uint32_t i;
432#ifdef UDB_CHECK
433	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
434#endif
435	udb->ram_num++;
436	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0x7fffffff) {
437		/* grow the array, if allocation succeeds */
438		udb_ptr** newram = (udb_ptr**)xalloc_array_zero(
439			sizeof(udb_ptr*), udb->ram_size*2);
440		if(newram) {
441			grow_ram_hash(udb, newram);
442		}
443	}
444	i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
445	assert((size_t)i < udb->ram_size);
446
447	ptr->prev = NULL;
448	ptr->next = udb->ram_hash[i];
449	udb->ram_hash[i] = ptr;
450	if(ptr->next)
451		ptr->next->prev = ptr;
452}
453
454void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
455{
456	assert(ptr->data);
457#ifdef UDB_CHECK
458	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
459	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
460#endif
461	udb->ram_num--;
462	if(ptr->next)
463		ptr->next->prev = ptr->prev;
464	if(ptr->prev)
465		ptr->prev->next = ptr->next;
466	else	{
467		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
468		assert((size_t)i < udb->ram_size);
469		udb->ram_hash[i] = ptr->next;
470	}
471}
472
473/** change a set of ram ptrs to a new value */
474static void
475udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
476{
477	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
478	udb_ptr* p, *np;
479	/* edit them and move them into the new position */
480	p = udb->ram_hash[io];
481	while(p) {
482		np = p->next;
483		if(p->data == old) {
484			udb_base_unlink_ptr(udb, p);
485			p->data = newd;
486			udb_base_link_ptr(udb, p);
487		}
488		p = np;
489	}
490}
491
492udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
493{
494	return &udb->glob_data->user_global;
495}
496
497void udb_base_set_userdata(udb_base* udb, udb_void user)
498{
499#ifdef UDB_CHECK
500	if(user) { assert(udb_valid_dataptr(udb, user)); }
501#endif
502	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
503}
504
505void udb_base_set_userflags(udb_base* udb, uint8_t v)
506{
507	udb->glob_data->userflags = v;
508}
509
510uint8_t udb_base_get_userflags(udb_base* udb)
511{
512	return udb->glob_data->userflags;
513}
514
515/** re-mmap the udb to specified size */
516static void*
517udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
518{
519#ifdef HAVE_MMAP
520	void* nb;
	/* when testing with valgrind, do not use mremap but compile with
	 * the munmap/mmap fallback below */
522#ifdef MREMAP_MAYMOVE
523	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
524	if(nb == MAP_FAILED) {
525		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
526			udb->fname, (unsigned)nsize, strerror(errno));
527		return 0;
528	}
#else /* !MREMAP_MAYMOVE */
530	/* use munmap-mmap to simulate mremap */
531	if(munmap(udb->base, udb->base_size) != 0) {
532		log_msg(LOG_ERR, "munmap(%s) error %s",
533			udb->fname, strerror(errno));
534	}
535	/* provide hint for new location */
536	/* note the size_t casts must be there for portability, on some
537	 * systems the layout of memory is otherwise broken. */
538	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
539		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
	/* retry the mmap without the base hint in case of ENOMEM (FreeBSD8),
	 * the kernel can then try to mmap it at a different location
	 * where more memory is available */
543	if(nb == MAP_FAILED && errno == ENOMEM) {
544		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
545			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
546	}
547	if(nb == MAP_FAILED) {
548		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
549			udb->fname, (unsigned)nsize, strerror(errno));
550		udb->base = NULL;
551		return 0;
552	}
#endif /* MREMAP_MAYMOVE */
554	if(nb != udb->base) {
555		/* fix up realpointers in udb and alloc */
556		/* but mremap may have been nice and not move the base */
557		udb->base = nb;
558		udb->glob_data = (udb_glob_d*)((char*)nb+sizeof(uint64_t));
559		/* use passed alloc pointer because the udb->alloc may not
560		 * be initialized yet */
561		alloc->disk = (udb_alloc_d*)((char*)udb->glob_data
562			+sizeof(*udb->glob_data));
563	}
564	udb->base_size = nsize;
565	return nb;
566#else /* HAVE_MMAP */
567	(void)udb; (void)alloc; (void)nsize;
568	return NULL;
569#endif /* HAVE_MMAP */
570}
571
572void
573udb_base_remap_process(udb_base* udb)
574{
575	/* assume that fsize is still accessible */
576	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
577}
578
579/** grow file to specified size and re-mmap, return new base */
580static void*
581udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
582{
583	/* grow file by writing a single zero at that spot, the
584	 * rest is filled in with zeroes. */
585	uint8_t z = 0;
586	ssize_t w;
587
588	assert(nsize > 0);
589	udb->glob_data->dirty_alloc = udb_dirty_fsize;
590#ifdef HAVE_PWRITE
591	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
592#else
593	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
595		return 0;
596	}
597	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
598#endif
599		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
600			udb->fname, (unsigned)nsize, strerror(errno));
601		return 0;
602	} else if(w != (ssize_t)sizeof(z)) {
603		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
604			udb->fname, (unsigned)nsize);
605		return 0;
606	}
607	udb->glob_data->fsize = nsize;
608	udb->glob_data->dirty_alloc = udb_dirty_clean;
609	return udb_base_remap(udb, udb->alloc, nsize);
610}
611
612int udb_exp_size(uint64_t a)
613{
614	/* find enclosing value such that 2**x >= a */
615	int x = 0;
616	uint64_t i = a;
617	assert(a != 0);
618
619	i --;
620	/* could optimise this with uint8* access, depends on endianness */
621	/* first whole bytes */
622	while( (i&(~(uint64_t)0xff)) ) {
623		i >>= 8;
624		x += 8;
625	}
626	/* now details */
627	while(i) {
628		i >>= 1;
629		x ++;
630	}
631	assert( x>=0 && x<=63);
632	assert( ((uint64_t)1<<x) >= a);
633	assert( x==0 || /* <<x-1 without negative number analyzer complaints: */ (((uint64_t)1<<x)>>1) < a);
634	return x;
635}
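
/* Worked examples: udb_exp_size(1) == 0, udb_exp_size(4096) == 12 and
 * udb_exp_size(4097) == 13, i.e. the smallest x with 2**x >= a. */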
636
637int udb_exp_offset(uint64_t o)
638{
	/* this measures the number of zero bits on the right of o */
	/* i.e. with x trailing zero bits, (o&((2**x)-1)) == 0 */
641	int x = 0;
642	uint64_t i = o;
643	assert(o != 0);
644	/* first whole bytes */
645	while( (i&(uint64_t)0xff) == 0) {
646		i >>= 8;
647		x += 8;
648	}
649	/* now details */
650	while( (i&(uint64_t)0x1) == 0) {
651		i >>= 1;
652		x ++;
653	}
654	assert( o % ((uint64_t)1<<x) == 0);
655	assert( o % ((uint64_t)1<<(x+1)) != 0);
656	return x;
657}
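
/* Worked examples: udb_exp_offset(0x3000) == 12 and udb_exp_offset(24) == 3,
 * the number of trailing zero bits, i.e. the largest power-of-two alignment
 * of o. */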
658
659void udb_alloc_init_new(udb_alloc_d* a)
660{
661	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
662	memset(a, 0, sizeof(*a));
663	/* set new allocations after header, as if allocated in a sequence
664	 * of minsize allocations */
665	a->nextgrow = UDB_HEADER_SIZE;
666}
667
668/** fsck the file size, false if failed and file is useless */
669static int
670fsck_fsize(udb_base* udb, udb_alloc* alloc)
671{
672	off_t realsize;
673	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
674	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
675	if(realsize == (off_t)-1) {
676		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
677		return 0;
678	}
679	udb->glob_data->fsize = (uint64_t)realsize;
680	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
681		return 0;
682	udb->glob_data->dirty_alloc = udb_dirty_clean;
683	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
684	udb_base_sync(udb, 1);
685	return 1;
686}
687
/** regenerate freelist: add a new free chunk, return next todo */
689static udb_void
690regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
691{
692	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
693	uint64_t esz = (uint64_t)1<<exp;
694	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
695		return 0;
696	}
697	cp->type = udb_chunk_type_free;
698	cp->flags = 0;
699	chunk_set_last(base, c, exp, (uint8_t)exp);
700	cp->prev = 0;
	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(cp->next)
		UDB_FREE_CHUNK(cp->next)->prev = c;
	regen->free[exp-UDB_ALLOC_CHUNK_MINEXP] = c;
704	regen->stat_free += esz;
705	return c + esz;
706}
707
708/** regenerate xl chunk, return next todo */
709static udb_void
710regen_xl(void* base, udb_void c, udb_alloc_d* regen)
711{
712	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
713	uint64_t xlsz = cp->size;
714	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
715		return 0;
716	}
717	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
718		return 0;
719	}
	/* the end-size and end-expmarker are still in place from allocation */
721	regen->stat_alloc += xlsz;
722	return c + xlsz;
723}
724
725/** regenerate data chunk, return next todo */
726static udb_void
727regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
728{
729	uint64_t esz = (uint64_t)1<<exp;
730	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
731		return 0;
732	}
733	chunk_set_last(base, c, exp, (uint8_t)exp);
734	regen->stat_alloc += esz;
735	return c + esz;
736}
737
738/** regenerate a relptr structure inside a data segment */
739static void
740regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
741{
742	udb_void* a = (udb_void*)arg;
743	/* ignore 0 pointers */
744	if(!rp->data)
745		return;
746
747	/* edit relptrs that point to oldmoved to point to newmoved. */
748	if(rp->data == a[0])
749		rp->data = a[1];
750
751	/* regenerate relptr lists, add this item to the relptr list for
752	 * the data that it points to */
753	udb_rel_ptr_link(base, rp, rp->data);
754}
755
/** regenerate the relptrs stored in this data segment */
757static void
758regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
759	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
760{
761	udb_void arg[2];
762	arg[0] = rb_old; arg[1] = rb_new;
763	/* walk through the structs here and put them on their respective
764	 * relptr lists */
765	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
766		&regen_relptr_func, arg);
767
768}
769
770/** regenerate relptrlists in the file */
771static void
772regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
773	udb_void rb_old, udb_void rb_new)
774{
775	udb_void at = alloc->udb->glob_data->hsize;
776	/* clear all ptrlist start pointers in the file. */
777	while(at < alloc->disk->nextgrow) {
778		int exp = (int)UDB_CHUNK(at)->exp;
779		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
780		if(exp == UDB_EXP_XL) {
781			UDB_XL_CHUNK(at)->ptrlist = 0;
782			at += UDB_XL_CHUNK(at)->size;
783		} else if(tp == udb_chunk_type_free) {
784			at += (uint64_t)1<<exp;
785		} else { /* data chunk */
786			UDB_CHUNK(at)->ptrlist = 0;
787			at += (uint64_t)1<<exp;
788		}
789	}
790	/* walk through all relptr structs and put on the right list. */
791	at = alloc->udb->glob_data->hsize;
792	while(at < alloc->disk->nextgrow) {
793		udb_chunk_d* atp = UDB_CHUNK(at);
794		int exp = (int)atp->exp;
795		udb_chunk_type tp = (udb_chunk_type)atp->type;
796		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
797			(uint64_t)1<<exp);
798		if(exp == UDB_EXP_XL) {
799			assert(at != rb_old); /* should have been freed */
800			regen_its_ptrs(base, udb, atp,
801				((char*)atp)+sizeof(udb_xl_chunk_d),
802				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
803				rb_old, rb_new);
804			at += sz;
805		} else if(tp == udb_chunk_type_free) {
806			at += sz;
807		} else { /* data chunk */
808			assert(at != rb_old); /* should have been freed */
809			regen_its_ptrs(base, udb, atp,
810				((char*)atp)+sizeof(udb_chunk_d),
811				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
812			at += sz;
813		}
814	}
815}
816
817
/** mark free elements from former XL chunk space; later fixups pick them up */
819static void
820rb_mark_free_segs(void* base, udb_void s, uint64_t m)
821{
822	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
823	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
824	assert(s >= UDB_ALLOC_CHUNK_SIZE);
825	while(q >= s) {
826		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
827		UDB_CHUNK(q)->type = udb_chunk_type_free;
828		q -= UDB_ALLOC_CHUNK_SIZE;
829	}
830}
831
832
833/** fsck rollback or rollforward XL move results */
834static int
835fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
836	uint64_t rb_size, uint64_t rb_seg)
837{
838
839	if(rb_old <= rb_new)
840		return 0; /* XL move one way */
841	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
842		return 0; /* not aligned */
843	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
844		return 0; /* not aligned */
845	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
846		return 0; /* not aligned */
847	if(rb_new + rb_size <= rb_old) {
848		/* not overlapping: resume copy */
849		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
850		/* and free up old piece(s) */
851		rb_mark_free_segs(base, rb_old, rb_size);
852	} else {
853		/* overlapping, see what segment we stopped at
854		 * and continue there. */
855		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
856		/* free up old piece(s); from the end of the moved segment,
857		 * until the end of the old segment */
858		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
859			(rb_new+rb_size));
860	}
861	/* do not call fix_ptrs, regenptrs does the job */
862	return 1;
863}
864
865/** fsck rollback or rollforward move results */
866static int
867fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
868	udb_void* make_free)
869{
870	if( (rb_size&(rb_size-1)) != 0)
871		return 0; /* not powerof2 */
872	if( (rb_old&(rb_size-1)) != 0)
873		return 0; /* not aligned */
874	if( (rb_new&(rb_size-1)) != 0)
875		return 0; /* not aligned */
876	/* resume copy */
877	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
878	/* do not call fix_ptrs, regenptrs does the job */
	/* make sure rb_old is freed */
880	*make_free = rb_old;
881	return 1;
882}
883
884/** fsck the file and salvage, false if failed and file is useless */
885static int
886fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
887{
888	void* base = udb->base;
889	udb_alloc_d regen;
890	udb_void at = udb->glob_data->hsize;
891	udb_void rb_old = udb->glob_data->rb_old;
892	udb_void rb_new = udb->glob_data->rb_new;
893	udb_void rb_seg = udb->glob_data->rb_seg;
894	udb_void make_free = 0;
895	uint64_t rb_size = udb->glob_data->rb_size;
896	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
897	/* walk through the file, use the exp values to see what can be
898	 * salvaged */
899	if(moved && rb_old && rb_new && rb_size) {
900		if(rb_old+rb_size <= alloc->disk->nextgrow
901			&& rb_new+rb_size <= alloc->disk->nextgrow) {
902			/* we can use the move information to fix up the
903			 * duplicate element (or partially moved element) */
904			if(rb_size > 1024*1024) {
905				/* XL chunk */
906				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
907					rb_size, rb_seg))
908					return 0;
909			} else {
910				if(!fsck_rb(base, rb_old, rb_new, rb_size,
911					&make_free))
912					return 0;
913			}
914		}
915	}
916
917	/* rebuild freelists */
918	/* recalculate stats in alloc (except 'stat_data') */
919	/* possibly new end 'nextgrow' value */
920	memset(&regen, 0, sizeof(regen));
921	regen.nextgrow = alloc->disk->nextgrow;
922	while(at < regen.nextgrow) {
923		/* figure out this chunk */
924		int exp = (int)UDB_CHUNK(at)->exp;
925		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
926		/* consistency check possible here with end-exp */
927		if(tp == udb_chunk_type_free || at == make_free) {
928			at = regen_free(base, at, exp, &regen);
929			if(!at) return 0;
930		} else if(exp == UDB_EXP_XL) {
931			/* allocated data of XL size */
932			at = regen_xl(base, at, &regen);
933			if(!at) return 0;
934		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
935			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
936			/* allocated data */
937			at = regen_data(base, at, exp, &regen);
938			if(!at) return 0;
939		} else {
940			/* garbage; this must be EOF then */
941			regen.nextgrow = at;
942			break;
943		}
944	}
945	*alloc->disk = regen;
946
947	/* rebuild relptr lists */
948	regen_ptrlist(base, udb, alloc, rb_old, rb_new);
949
950	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
951		udb->fname);
952	udb->glob_data->rb_old = 0;
953	udb->glob_data->rb_new = 0;
954	udb->glob_data->rb_size = 0;
955	udb->glob_data->dirty_alloc = udb_dirty_clean;
956	udb_base_sync(udb, 1);
957	return 1;
958}
959
960
961udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
962{
963	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
964	if(!alloc)
965		return NULL;
966	alloc->udb = udb;
967	alloc->disk = disk;
968	/* see if committed but uncompleted actions need to be done */
969	/* preserves the alloc state */
970	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
971		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
972			if(fsck_fsize(udb, alloc))
973				return alloc;
974		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
975			if(fsck_file(udb, alloc, 0))
976				return alloc;
977		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
978			if(fsck_file(udb, alloc, 1))
979				return alloc;
980		}
981		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
982			(int)udb->glob_data->dirty_alloc);
983		free(alloc);
984		return NULL;
985	}
986	return alloc;
987}
988
989void udb_alloc_delete(udb_alloc* alloc)
990{
991	if(!alloc) return;
992	free(alloc);
993}
994
995/** unlink this element from its freelist */
996static void
997udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
998{
999	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
1000	assert(chunk);
1001	/* chunk is a free chunk */
1002	assert(fp->exp == (uint8_t)exp);
1003	assert(fp->type == udb_chunk_type_free);
1004	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
1005	/* and thus freelist not empty */
1006	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
1007	/* unlink */
1008	if(fp->prev)
1009		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
1010	else	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1011	if(fp->next)
1012		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
1013}
1014
1015/** pop first element off freelist, list may not be empty */
1016static udb_void
1017udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
1018{
1019	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1020	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1021	assert(f);
1022	assert(fp->exp == (uint8_t)exp);
1023	assert(fp->type == udb_chunk_type_free);
1024	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1025	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1026	if(fp->next) {
1027		UDB_FREE_CHUNK(fp->next)->prev = 0;
1028	}
1029	return f;
1030}
1031
1032/** push new element onto freelist */
1033static void
1034udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
1035{
1036	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1037	assert(f);
1038	fp->exp = (uint8_t)exp;
1039	fp->type = udb_chunk_type_free;
1040	fp->flags = 0;
1041	fp->prev = 0;
1042	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1043	if(fp->next)
1044		UDB_FREE_CHUNK(fp->next)->prev = f;
1045	chunk_set_last(base, f, exp, (uint8_t)exp);
1046	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1047}
1048
1049/** push new element onto freelist - do not initialize the elt */
1050static void
1051udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
1052{
1053	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1054	assert(f);
1055	assert(fp->exp == (uint8_t)exp);
1056	assert(fp->type == udb_chunk_type_free);
1057	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1058	fp->prev = 0;
1059	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1060	if(fp->next)
1061		UDB_FREE_CHUNK(fp->next)->prev = f;
1062	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1063}
1064
1065/** add free chunks at end until specified alignment occurs */
1066static void
1067grow_align(void* base, udb_alloc* alloc, uint64_t esz)
1068{
1069	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
1070		/* the nextgrow is not a whole multiple of esz. */
1071		/* grow a free chunk of max allowed size */
1072		int fexp = udb_exp_offset(alloc->disk->nextgrow);
1073		uint64_t fsz = (uint64_t)1<<fexp;
1074		udb_void f = alloc->disk->nextgrow;
1075		udb_void fn = alloc->disk->nextgrow+fsz;
1076		assert(fn <= alloc->udb->base_size);
1077		alloc->disk->stat_free += fsz;
1078		udb_alloc_push_fl(base, alloc, f, fexp);
1079		/* now increase nextgrow to commit that free chunk */
1080		alloc->disk->nextgrow = fn;
1081	}
1082}
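
/* Worked example (illustrative offsets, assuming both exponents lie in the
 * allowed chunk range): with nextgrow at 0x1800 and a target alignment of
 * 0x4000, the loop first pushes a free chunk of 2**11 (0x800) at 0x1800,
 * moving nextgrow to 0x2000, then a free chunk of 2**13 (0x2000) at 0x2000,
 * moving nextgrow to 0x4000, which is aligned, so the loop stops. */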
1083
1084/** append chunks at end of memory space to get size exp, return dataptr */
1085static udb_void
1086grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
1087{
1088	uint64_t esz = (uint64_t)1<<exp;
1089	udb_void ret;
1090	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1091	grow_align(base, alloc, esz);
1092	/* free chunks are grown, grow the one we want to use */
1093	ret = alloc->disk->nextgrow;
1094	/* take a new alloced chunk into use */
1095	UDB_CHUNK(ret)->exp = (uint8_t)exp;
1096	UDB_CHUNK(ret)->flags = 0;
1097	UDB_CHUNK(ret)->ptrlist = 0;
1098	UDB_CHUNK(ret)->type = udb_chunk_type_data;
1099	/* store last octet */
1100	chunk_set_last(base, ret, exp, (uint8_t)exp);
1101	/* update stats */
1102	alloc->disk->stat_alloc += esz;
1103	alloc->disk->stat_data += sz;
1104	/* now increase nextgrow to commit this newly allocated chunk */
1105	alloc->disk->nextgrow += esz;
1106	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
1107	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1108	return ret + sizeof(udb_chunk_d); /* ptr to data */
1109}
1110
1111/** calculate how much space is necessary to grow for this exp */
1112static uint64_t
1113grow_end_calc(udb_alloc* alloc, int exp)
1114{
1115	uint64_t sz = (uint64_t)1<<exp;
1116	uint64_t ng = alloc->disk->nextgrow;
1117	uint64_t res;
1118	/* if nextgrow is 2**expness, no extra growth needed, only size */
1119	if( (ng & (sz-1)) == 0) {
1120		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
1121		return ng+sz; /* must grow exactly 2**exp */
1122	}
1123	/* grow until 2**expness and then we need 2**exp as well */
1124	/* so, round ng down to whole sz (basically  ng-ng%sz, or ng/sz*sz)
1125	 * and then add the sz twice (go up to whole sz, and to allocate) */
1126	res = (ng & ~(sz-1)) + 2*sz;
1127	return res;
1128}
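
/* Worked example (illustrative offsets): nextgrow 0x1800, exp 12
 * (sz 0x1000): ng is not 0x1000-aligned, so res = (0x1800 & ~0xfff)
 * + 2*0x1000 = 0x3000; that covers the free chunks grown up to 0x2000
 * plus the 0x1000 allocation itself. */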
1129
1130/** see if we need to grow more than specified to enable sustained growth */
1131static uint64_t
1132grow_extra_check(udb_alloc* alloc, uint64_t ge)
1133{
1134	const uint64_t mb = 1024*1024;
1135	uint64_t bsz = alloc->udb->base_size;
1136	if(bsz <= mb) {
1137		/* below 1 Mb, double sizes for exponential growth */
1138		/* takes about 15 times to grow to 1Mb */
1139		if(ge < bsz*2)
1140			return bsz*2;
1141	} else {
1142		uint64_t gnow = ge - bsz;
1143		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
1144		 * in whole megabytes rounded up. */
1145		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
1146		if(gnow < want)
1147			return bsz + want;
1148	}
1149	return ge;
1150}
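
/* Worked examples (illustrative sizes): with base_size 512kb, any grow
 * target below 1mb is raised to 1mb (doubling); with base_size 16mb,
 * want = (16mb/8 rounded down to whole mb) + 1mb = 3mb, so a grow of
 * less than 3mb beyond the current size is raised to 19mb. */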
1151
1152/** see if free space is enough to warrant shrink (while file is open) */
1153static int
1154enough_free(udb_alloc* alloc)
1155{
1156	if(alloc->udb->base_size <= 2*1024*1024) {
		/* below 1 Mb, grown by doubling in size (so up to 2 mb),
		 * do not shrink unless we can shrink to 1/3 of the size */
1159		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
1160			return 1;
1161	} else {
1162		/* grown 12.5%, shrink 25% if possible, at least one mb */
1163		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
1164		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
1165		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
1166			|| alloc->udb->base_size < 4*1024*1024))
1167			return 1;
1168	}
1169	return 0;
1170}
1171
1172/** grow space for a chunk of 2**exp and return dataptr */
1173static udb_void
1174udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
1175{
1176	/* commit the grow action
1177	 * - the file grow only changes filesize, but not the nextgrow.
1178	 * - taking space after nextgrow into use (as free space),
1179	 *   is like free-ing a chunk (one at a time).
1180	 * - and the last chunk taken into use is like alloc.
1181	 */
1182	/* predict how much free space is needed for this */
1183	uint64_t grow_end = grow_end_calc(alloc, exp);
1184	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
1185	if(grow_end <= alloc->udb->base_size) {
1186		/* we can do this with the available space */
1187		return grow_chunks(base, alloc, sz, exp);
1188	}
1189	/* we have to grow the file, re-mmap */
1190	/* see if we need to grow a little more, to avoid endless grow
1191	 * efforts on adding data */
1192	grow_end = grow_extra_check(alloc, grow_end);
1193	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1194		return 0; /* mmap or write failed (disk or mem full) */
1195	}
1196	/* we have enough space now */
1197	assert(grow_end <= alloc->udb->base_size);
1198	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1199	return grow_chunks(base, alloc, sz, exp);
1200}
1201
1202/** take XL allocation into use at end of file, return dataptr */
1203static udb_void
1204grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
1205{
1206	udb_void ret;
1207	udb_xl_chunk_d* p;
1208	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1209
1210	/* align growth to whole mbs */
1211	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);
1212
1213	/* grow XL segment */
1214	ret = alloc->disk->nextgrow;
1215	p = UDB_XL_CHUNK(ret);
1216	p->exp = UDB_EXP_XL;
1217	p->size = xlsz;
1218	p->flags = 0;
1219	p->ptrlist = 0;
1220	p->type = udb_chunk_type_data;
1221
1222	/* also put size and marker at end for compaction */
1223	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
1224	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;
1225
1226	/* stats */
1227	alloc->disk->stat_data += sz;
1228	alloc->disk->stat_alloc += xlsz;
1229	/* now increase the nextgrow to commit this xl chunk */
1230	alloc->disk->nextgrow += xlsz;
1231	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1232	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
1233}
1234
1235/** make space for XL allocation */
1236static udb_void
1237udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
1238{
1239	/* allocate whole mbs of space, at end of space */
1240	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
1241	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
1242	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
1243	assert(need >= asz);
1244	if(grow_end <= alloc->udb->base_size) {
1245		/* can do this in available space */
1246		return grow_xl(base, alloc, need, sz);
1247	}
1248	/* have to grow file and re-mmap */
1249	grow_end = grow_extra_check(alloc, grow_end);
1250	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1251		return 0; /* mmap or write failed (disk or mem full) */
1252	}
1253	/* we have enough space now */
1254	assert(grow_end <= alloc->udb->base_size);
1255	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1256	return grow_xl(base, alloc, need, sz);
1257}
1258
1259/** divide big(2**e2) into pieces so 2**exp fits */
1260static udb_void
1261udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
1262	int exp)
1263{
1264	int e = e2;
1265	uint64_t sz = (uint64_t)1<<e2;
1266	assert(big && e2 > exp);
	/* the returned piece to use is the first piece; split off and
	 * offload the later halves until the first piece has the right size */
1269	do {
1270		sz >>= 1; /* divide size of big by two */
1271		e--;      /* that means its exp is one smaller */
1272		udb_alloc_push_fl(base, alloc, big+sz, e);
1273	} while(e != exp);
1274	/* exit loop when last pushed is same size as what we want */
1275	return big;
1276}
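
/* Worked example (illustrative offsets): subdividing big at 0x10000 with
 * e2 == 16 down to exp == 13 pushes free chunks of 2**15 at 0x18000,
 * 2**14 at 0x14000 and 2**13 at 0x12000 onto their freelists, and
 * returns 0x10000 as the 2**13 piece to use. */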
1277
1278/** returns the exponent size of the chunk needed for data sz */
1279static int
1280udb_alloc_exp_needed(size_t sz)
1281{
1282	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
1283	int exp;
1284	if(asz > UDB_ALLOC_CHUNK_SIZE) {
1285		return UDB_EXP_XL;
1286	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
1287		return UDB_ALLOC_CHUNK_MINEXP;
1288	}
1289	exp = udb_exp_size(asz);
1290	assert(exp <= UDB_ALLOC_CHUNKS_MAX);
1291	return exp;
1292}
1293
1294udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
1295{
1296	void* base = alloc->udb->base;
1297	/* calculate actual allocation size */
1298	int e2, exp = udb_alloc_exp_needed(sz);
1299	if(exp == UDB_EXP_XL)
1300		return udb_alloc_xl_space(base, alloc, sz);
1301	/* see if there is a free chunk of that size exactly */
1302	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
1303		/* snip from freelist, udb_chunk_d */
1304		udb_void ret;
1305		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1306		ret = udb_alloc_pop_fl(base, alloc, exp);
1307		/* use it - size octets already OK */
1308		UDB_CHUNK(ret)->flags = 0;
1309		UDB_CHUNK(ret)->ptrlist = 0;
1310		UDB_CHUNK(ret)->type = udb_chunk_type_data;
1311		/* update stats */
1312		alloc->disk->stat_data += sz;
1313		alloc->disk->stat_alloc += (1<<exp);
1314		assert(alloc->disk->stat_free >= (1u<<exp));
1315		alloc->disk->stat_free -= (1<<exp);
1316		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1317		return ret + sizeof(udb_chunk_d); /* ptr to data */
1318	}
1319	/* see if we can subdivide a larger chunk */
1320	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1321		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1322			udb_void big, ret; /* udb_chunk_d */
1323			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1324			big = udb_alloc_pop_fl(base, alloc, e2);
1325			/* push other parts onto freelists (needs inited) */
1326			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
1327			/* use final part (needs inited) */
1328			UDB_CHUNK(ret)->exp = (uint8_t)exp;
			/* if we crash here, the new exp simply makes a
			 * smaller free chunk */
1330			UDB_CHUNK(ret)->flags = 0;
1331			UDB_CHUNK(ret)->ptrlist = 0;
1332			/* set type to commit data chunk */
1333			UDB_CHUNK(ret)->type = udb_chunk_type_data;
1334			/* store last octet */
1335			chunk_set_last(base, ret, exp, (uint8_t)exp);
1336			/* update stats */
1337			alloc->disk->stat_data += sz;
1338			alloc->disk->stat_alloc += (1<<exp);
1339			assert(alloc->disk->stat_free >= (1u<<exp));
1340			alloc->disk->stat_free -= (1<<exp);
1341			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1342			return ret + sizeof(udb_chunk_d); /* ptr to data */
1343		}
1344	/* we need to grow an extra chunk */
1345	return udb_alloc_grow_space(base, alloc, sz, exp);
1346}
1347
1348/** see if there is free space to allocate a chunk into */
1349static int
1350have_free_for(udb_alloc* alloc, int exp)
1351{
1352	int e2;
1353	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
1354		return exp;
1355	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1356		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1357			return e2;
1358		}
1359	return 0;
1360}
1361
1362/** fix relptr prev and next for moved relptr structures */
1363static void
1364chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
1365{
1366	udb_void* data = (udb_void*)arg;
1367	udb_void r;
1368	if(!rp->data)
1369		return;
1370	r = UDB_SYSTOREL(base, rp);
1371	if(rp->next)
1372		UDB_REL_PTR(rp->next)->prev = r;
1373	if(rp->prev)
1374		UDB_REL_PTR(rp->prev)->next = r;
1375	else	{
1376		/* if this is a pointer to its own chunk, fix it up;
1377		 * the data ptr gets set by relptr_edit later. */
1378		if(rp->data == data[0])
1379			UDB_CHUNK(data[1])->ptrlist = r;
1380		else	UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
1381	}
1382}
1383
1384/** fix pointers from and to a moved chunk */
1385static void
1386chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
1387	uint64_t dsz, udb_void olddata)
1388{
1389	udb_void d[2];
1390	d[0] = olddata;
1391	d[1] = data;
1392	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
1393		dsz, &chunk_fix_ptr_each, d);
1394	udb_rel_ptr_edit(base, cp->ptrlist, data);
1395	udb_base_ram_ptr_edit(udb, olddata, data);
1396}
1397
1398/** move an allocated chunk to use a free chunk */
1399static void
1400move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
1401	int e2)
1402{
1403	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
1404	udb_chunk_d* rp;
1405	udb_chunk_d* fp;
1406	if(exp != e2) {
1407		/* it is bigger, subdivide it */
1408		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
1409	}
1410	assert(res != f);
1411	/* setup rollback information */
1412	alloc->udb->glob_data->rb_old = f;
1413	alloc->udb->glob_data->rb_new = res;
1414	alloc->udb->glob_data->rb_size = esz;
1415	/* take the res, exp into use */
1416	rp = UDB_CHUNK(res);
1417	fp = UDB_CHUNK(f);
1418	/* copy over the data */
1419	memcpy(rp, fp, esz);
1420	/* adjust rel ptrs */
1421	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
1422		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));
1423
1424	/* do not freeup the fp; caller does that */
1425}
1426
1427/** unlink several free elements to overwrite with xl chunk */
1428static void
1429free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
1430{
1431	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
1432	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
1433	assert(s >= UDB_ALLOC_CHUNK_SIZE);
1434	while(q >= s) {
1435		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
1436		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
1437		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
1438		q -= UDB_ALLOC_CHUNK_SIZE;
1439	}
1440}
1441
1442/** move an XL chunk, and keep track of segments for rollback */
1443static void
1444move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
1445	uint64_t sz, uint64_t startseg)
1446{
1447	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1448	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
1449	uint64_t amount = xl - n;
1450	assert(n < xl); /* move to compact */
1451
1452	/* setup move rollback */
1453	udb->glob_data->rb_old = xl;
1454	udb->glob_data->rb_new = n;
1455	udb->glob_data->rb_size = sz;
1456
1457	/* is it overlapping? */
1458	if(sz <= amount) {
1459		memcpy(np, xlp, sz);
1460	} else {
1461		/* move and commit per 1M segment to avoid data loss */
1462		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
1463		for(seg = startseg; seg<maxseg; seg++) {
1464			udb->glob_data->rb_seg = seg;
			memcpy((char*)np+seg*UDB_ALLOC_CHUNK_SIZE,
				(char*)xlp+seg*UDB_ALLOC_CHUNK_SIZE,
				UDB_ALLOC_CHUNK_SIZE);
1468		}
1469
1470	}
1471}
1472
1473/** move list of XL chunks to the front by the shift amount */
1474static void
1475move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
1476	uint64_t amount)
1477{
1478	udb_void xl = xl_start;
1479	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1480	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1481	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1482	while(xl < xl_start+xl_sz) {
1483		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1484		udb_void n = xl-amount;
1485		uint64_t sz = xlp->size;
1486		assert(xlp->exp == UDB_EXP_XL);
1487		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
1488		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
1489			n+sizeof(udb_xl_chunk_d),
1490			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
1491			xl+sizeof(udb_xl_chunk_d));
1492	}
1493	alloc->disk->stat_free -= amount;
1494	alloc->disk->nextgrow -= amount;
1495	alloc->udb->glob_data->rb_old = 0;
1496	alloc->udb->glob_data->rb_new = 0;
1497	alloc->udb->glob_data->rb_size = 0;
1498}
1499
1500/** see if free chunk can coagulate with another chunk, return other chunk */
1501static udb_void
1502coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
1503	uint64_t esz)
1504{
1505	udb_void other = f^esz;
1506	if(exp == UDB_ALLOC_CHUNKS_MAX)
1507		return 0; /* no further merges */
1508	if(other >= alloc->udb->base_size)
1509		return 0; /* not allocated */
1510	if(other >= alloc->disk->nextgrow)
1511		return 0; /* not in use */
1512	if(other < alloc->udb->glob_data->hsize)
1513		return 0; /* cannot merge with header */
1514		/* the header is also protected by the special exp marker */
1515	/* see if the other chunk is a free chunk */
1516
1517	/* check closest marker to avoid large memory churn */
1518	/* and also it makes XL allocations and header special markers work */
1519	if(f > other) {
1520		assert(f > 1); /* this is certain because of header */
1521		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
1522			/* can do it if the other part is a free chunk */
1523			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
1524			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1525				return other;
1526		}
1527	} else {
1528		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
1529			/* can do it if the other part is a free chunk */
1530			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
1531			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1532				return other;
1533		}
1534	}
1535	return 0;
1536}
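
/* Buddy example (illustrative offsets): a free chunk f at 0x3000 with
 * esz 0x1000 has its buddy at f^esz == 0x2000; the chunk at 0x2000 has
 * its buddy at 0x3000.  Only if that buddy is a free chunk of exactly
 * the same exp can the two merge into one 2**(exp+1) chunk. */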
1537
1538/** coagulate and then add new free segment, return final free segment */
1539static udb_void
1540coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
1541	uint64_t esz)
1542{
1543	/* new free chunk here, attempt coagulate */
1544	udb_void other;
1545	while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
1546		/* unlink that other chunk */
1547		udb_alloc_unlink_fl(base, alloc, other, exp);
1548		/* merge up */
1549		if(other < last)
1550			last = other;
1551		exp++;
1552		esz <<= 1;
1553	}
1554	/* free the final segment */
1555	udb_alloc_push_fl(base, alloc, last, exp);
1556	return last;
1557}
1558
1559/** attempt to compact the data and move free space to the end */
1560int
1561udb_alloc_compact(void* base, udb_alloc* alloc)
1562{
1563	udb_void last;
1564	int exp, e2;
1565	uint64_t esz;
1566	uint64_t at = alloc->disk->nextgrow;
1567	udb_void xl_start = 0;
1568	uint64_t xl_sz = 0;
1569	if(alloc->udb->inhibit_compact)
1570		return 1;
1571	alloc->udb->useful_compact = 0;
1572	while(at > alloc->udb->glob_data->hsize) {
1573		/* grab last entry */
1574		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
1575		if(exp == UDB_EXP_XL) {
1576			/* for XL chunks:
			 * - inspect the size of the XL chunk list at the end
			 * - attempt to compact in front of the XL chunk list
1579			 */
1580			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
1581				at-sizeof(uint64_t)*2));
1582			udb_void xl = at-xlsz;
1583#ifndef NDEBUG
1584			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1585			assert(xlp->exp == UDB_EXP_XL);
1586			assert(xlp->type != udb_chunk_type_free);
1587#endif
			/* got the segment, add it to the xl chunk list */
1589			if(xl_start != 0 && xl+xlsz != xl_start) {
1590				/* nonadjoining XL part, but they are aligned,
1591				 * so the space in between is whole Mbs,
1592				 * shift the later part(s) and continue */
1593				uint64_t m = xl_start - (xl+xlsz);
1594				assert(xl_start > xl+xlsz);
1595				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1596				free_xl_space(base, alloc, xl+xlsz, m);
1597				move_xl_list(base, alloc, xl_start, xl_sz, m);
1598				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1599			}
1600			xl_start = xl;
1601			xl_sz += xlsz;
1602			at = xl;
1603			continue;
1604			/* end of XL if */
1605		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
1606			|| exp > UDB_ALLOC_CHUNKS_MAX)
1607			break; /* special chunk or garbage */
1608		esz = (uint64_t)1<<exp;
1609		last = at - esz;
1610		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
1611		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
			/* if xl_start is set, continue looking to move stuff,
			 * but do not unlink this free segment */
1614			if(!xl_start) {
1615				/* it is a free chunk, remove it */
1616				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1617				udb_alloc_unlink_fl(base, alloc, last, exp);
1618				alloc->disk->stat_free -= esz;
1619				alloc->disk->nextgrow = last;
1620				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1621				/* and continue at this point */
1622			}
1623			at = last;
1624		} else if( (e2=have_free_for(alloc, exp)) ) {
1625			/* last entry can be allocated in free chunks
1626			 * move it to its new position, adjust rel_ptrs */
1627			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1628			move_chunk(base, alloc, last, exp, esz, e2);
1629			if(xl_start) {
1630				last = coagulate_and_push(base, alloc,
1631					last, exp, esz);
1632			} else {
1633				/* shorten usage */
1634				alloc->disk->stat_free -= esz;
1635				alloc->disk->nextgrow = last;
1636			}
1637			alloc->udb->glob_data->rb_old = 0;
1638			alloc->udb->glob_data->rb_new = 0;
1639			alloc->udb->glob_data->rb_size = 0;
1640			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1641			/* and continue in front of it */
1642			at = last;
1643		} else {
1644			/* cannot compact this block, stop compacting */
1645			break;
1646		}
1647		/* if that worked, repeat it */
1648	}
1649	/* if we passed xl chunks, see if XL-chunklist can move */
1650	if(xl_start) {
1651		/* calculate free space in front of the XLchunklist. */
1652		/* has to be whole mbs of free space */
1653		/* if so, we can move the XL chunks.  Move them all back
1654		 * by the new free space. */
1655		/* this compacts very well, but the XL chunks can be moved
1656		 * multiple times; worst case for every mb freed a huge sized
1657		 * xlchunklist gets moved. */
1658		/* free space must be, since aligned and coagulated, in
1659		 * chunks of a whole MB */
1660		udb_void at = xl_start;
1661		uint64_t m = 0;
1662		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
1663			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
1664			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
1665				break;
1666			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
1667			m += UDB_ALLOC_CHUNK_SIZE;
1668			at = chunk;
1669		}
1670		if(m != 0) {
1671			assert(at+m == xl_start);
1672			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1673			free_xl_space(base, alloc, at, m);
1674			move_xl_list(base, alloc, xl_start, xl_sz, m);
1675			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1676		}
1677	}
1678
1679	/* if enough free, shrink the file; re-mmap */
1680	if(enough_free(alloc)) {
1681		uint64_t nsize = alloc->disk->nextgrow;
1682		udb_base_shrink(alloc->udb, nsize);
1683		if(!udb_base_remap(alloc->udb, alloc, nsize))
1684			return 0;
1685	}
1686	return 1;
1687}
1688
1689int
1690udb_compact(udb_base* udb)
1691{
1692	if(!udb) return 1;
1693	if(!udb->useful_compact) return 1;
1694	DEBUG(DEBUG_DBACCESS, 1, (LOG_INFO, "Compacting database..."));
1695	return udb_alloc_compact(udb->base, udb->alloc);
1696}
1697
1698void udb_compact_inhibited(udb_base* udb, int inhibit)
1699{
1700	if(!udb) return;
1701	udb->inhibit_compact = inhibit;
1702}
1703
1704#ifdef UDB_CHECK
1705/** check that rptrs are really zero before free */
1706void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
1707{
1708	(void)base;
1709	(void)arg;
1710	assert(p->data == 0);
1711}
1712#endif /* UDB_CHECK */
1713
1714/** free XL chunk as multiples of CHUNK_SIZE free segments */
1715static void
1716udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
1717	size_t sz)
1718{
1719	uint64_t xlsz = fp->size;
1720	uint64_t c;
1721	/* lightweight check for buffer overflow in xl data */
1722	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
1723	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
1724	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
1725	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1726#ifdef UDB_CHECK
1727	/* check that relptrs in this chunk have been zeroed */
1728	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1729		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
1730		&udb_check_rptr_zero, NULL);
1731#endif
1732	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1733	/* update stats */
1734	alloc->disk->stat_data -= sz;
1735	alloc->disk->stat_alloc -= xlsz;
1736	alloc->disk->stat_free += xlsz;
1737	/* walk in reverse, so the front blocks go first on the list */
1738	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
1739	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE*/
1740	assert(f >= UDB_ALLOC_CHUNK_SIZE);
1741	while(c >= f) {
1742		/* free a block of CHUNK_SIZE (1 Mb) */
1743		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
1744		c -= UDB_ALLOC_CHUNK_SIZE;
1745	}
1746	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1747}

int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
{
	void* base;
	/* lookup chunk ptr */
	udb_void f;
	udb_chunk_d* fp;
	uint64_t esz;
	int exp;
	udb_void other;
	int coagulated = 0;
	if(!r)
		return 1; /* free(NULL) does nothing */

	/* lookup size of chunk */
	base = alloc->udb->base;
	/* fails for XL blocks */
	f = chunk_from_dataptr(r);
	fp = UDB_CHUNK(f);
	assert(fp->type != udb_chunk_type_free);

	/* if it still has a ptrlist, that is trouble: the list was not
	 * properly cleaned up (although for a wholesale delete this
	 * would not matter) */
	assert(fp->ptrlist == 0);

	/* set ptrlist to 0 to stop relptrs from using it, for robustness */
	fp->ptrlist = 0;

	if(fp->exp == UDB_EXP_XL) {
		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
		/* compact */
		if(alloc->udb->inhibit_compact) {
			alloc->udb->useful_compact = 1;
			return 1;
		}
		return udb_alloc_compact(base, alloc);
	}
	/* it is a regular chunk of 2**exp size */
	exp = (int)fp->exp;
	esz = (uint64_t)1<<exp;
	/* light check for e.g. buffer overflow of the data */
	assert(sz < esz);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
#endif

	/* update the stats */
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_free += esz;
	alloc->disk->stat_alloc -= esz;

	/* if it can be merged with other free chunks, do so */
	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
		coagulated = 1;
		/* unlink that other chunk and expand it (it has same size) */
		udb_alloc_unlink_fl(base, alloc, other, exp);
		/* merge up */
		if(other < f)
			f = other;
		exp++;
		esz <<= 1;
	}
	if(coagulated) {
		/* put big free chunk into freelist, and init it */
		udb_alloc_push_fl(base, alloc, f, exp);
	} else {
		/* we do not need to touch the last-exp-byte, which may save
		 * a reference to that page of memory */
		fp->type = udb_chunk_type_free;
		fp->flags = 0;
		udb_alloc_push_fl_noinit(base, alloc, f, exp);
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	/* compact */
	if(alloc->udb->inhibit_compact) {
		alloc->udb->useful_compact = 1;
		return 1;
	}
	return udb_alloc_compact(base, alloc);
}
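
/*
 * Usage sketch (hypothetical offset d and size sz): the sz handed back
 * to udb_alloc_free must be the same size that was requested from
 * udb_alloc_space, and any relptrs stored in the chunk must have been
 * set to zero (e.g. with udb_rel_ptr_set(base, p, 0)) before the free.
 *
 *	udb_void d = udb_alloc_space(alloc, sz);
 *	if(d) {
 *		fill UDB_REL(alloc->udb->base, d) with sz bytes
 *		udb_alloc_free(alloc, d, sz);
 *	}
 */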

udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
{
	/* could be faster maybe, if grown? */
	udb_void r = udb_alloc_space(alloc, sz);
	if(!r) return r;
	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
	return r;
}

udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
{
	void* base = alloc->udb->base;
	udb_void c, n, newd;
	udb_chunk_d* cp, *np;
	uint64_t avail;
	uint8_t cp_type;
	/* emulate some posix realloc stuff */
	if(r == 0)
		return udb_alloc_space(alloc, sz);
	if(sz == 0) {
		if(!udb_alloc_free(alloc, r, osz))
			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
		return 0;
	}
	c = chunk_from_dataptr(r);
	cp = UDB_CHUNK(c);
	cp_type = cp->type;
	if(cp->exp == UDB_EXP_XL) {
		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
			- sizeof(uint64_t)*2;
	} else {
		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
	}
	if(sz <= avail)
		return r;
	/* reallocate it, and copy */
	newd = udb_alloc_space(alloc, sz);
	if(!newd) return 0;
	/* re-base after alloc, since re-mmap may have happened */
	base = alloc->udb->base;
	cp = NULL; /* may be invalid now, robustness */
	n = chunk_from_dataptr(newd);
	np = UDB_CHUNK(n);
	np->type = cp_type;
	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
	/* fixup ptrs */
	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);

	if(!udb_alloc_free(alloc, r, osz))
		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
	return newd;
}
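
/*
 * Usage sketch (hypothetical d, oldsz, newsz): like realloc(3), the
 * chunk may move, and the allocation may trigger a re-mmap, so system
 * pointers derived from the old offset or old base are stale afterwards.
 * Relptrs linked to the old chunk are rewritten by chunk_fix_ptrs.
 *
 *	udb_void d2 = udb_alloc_realloc(alloc, d, oldsz, newsz);
 *	if(d2) {
 *		d = d2;
 *		base = alloc->udb->base;  re-fetch, the mmap may have moved
 *	}
 */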

int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
{
	const uint64_t mb = 1024*1024;
	int exp = udb_alloc_exp_needed(sz);
	uint64_t esz;
	uint64_t want;
	if(exp == UDB_EXP_XL)
		esz = (sz&(mb-1))+mb;
	else	esz = (uint64_t)1<<exp;
	/* we need grow_end_calc to take into account alignment */
	want = grow_end_calc(alloc, exp) + esz*(num-1);
	assert(want >= alloc->udb->base_size);
	if(!udb_base_grow_and_remap(alloc->udb, want)) {
		log_msg(LOG_ERR, "failed to grow the specified amount");
		return 0;
	}
	return 1;
}

void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
{
	void* base = alloc->udb->base;
	udb_void f = chunk_from_dataptr(r);
	udb_chunk_d* fp = UDB_CHUNK(f);
	/* not the 'free' type, that must be set by allocation routines */
	assert(fp->type != udb_chunk_type_free);
	assert(tp != udb_chunk_type_free);
	fp->type = tp;
}

int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
{
	/* pointers are not valid before the header-size or after the
	 * used-region of the mmap */
	return ( (to+destsize) <= udb->base_size &&
		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
		(to+destsize) <= udb->alloc->disk->nextgrow);
}

int udb_valid_dataptr(udb_base* udb, udb_void to)
{
	void* base = udb->base;
	udb_void ch;
	int exp;
	uint64_t esz;
	/* our data chunks are aligned and at least 8 bytes */
	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
		return 0;
	/* get the chunk pointer */
	ch = chunk_from_dataptr(to);
	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
		return 0;
	/* check its size */
	exp = UDB_CHUNK(ch)->exp;
	if(exp == UDB_EXP_XL) {
		/* check XL chunk */
		uint64_t xlsz;
		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
			return 0;
		xlsz = UDB_XL_CHUNK(ch)->size;
		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
			return 0;
		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
			return 0;
		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
			!= xlsz)
			return 0;
		return 1;
	}
	/* check if regular chunk has matching end byte */
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
		return 0; /* cannot be a valid chunk */
	esz = (uint64_t)1<<exp;
	if(!udb_valid_offset(udb, ch+esz-1, 1))
		return 0;
	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
		return 0;
	return 1;
}
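
/*
 * Usage sketch (hypothetical offset off): udb_valid_dataptr can be used
 * to vet an offset read from (possibly corrupt) file content before it
 * is dereferenced.
 *
 *	if(!udb_valid_dataptr(udb, off)) {
 *		log_msg(LOG_ERR, "offset does not point at a valid chunk");
 *		return 0;
 *	}
 *	safe to use UDB_REL(udb->base, off) for that chunk now
 */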

int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
{
	void* base = udb->base;
	udb_void p;
	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
		return 0;
	if(!udb_valid_dataptr(udb, to))
		return 0;
	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
	while(p) {
		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
			return 0;
		if(p == rptr)
			return 1;
		p = UDB_REL_PTR(p)->next;
	}
	return 0;
}

void udb_rel_ptr_init(udb_rel_ptr* ptr)
{
	memset(ptr, 0, sizeof(*ptr));
}

void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
{
	if(!ptr->data)
		return;
	if(ptr->prev) {
		UDB_REL_PTR(ptr->prev)->next = ptr->next;
	} else {
		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
	}
	if(ptr->next) {
		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
	}
}

void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
{
	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
	ptr->prev = 0;
	ptr->next = chunk->ptrlist;
	if(ptr->next)
		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
	ptr->data = to;
}

void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
{
	assert(to == 0 || to > 64);
	udb_rel_ptr_unlink(base, ptr);
	if(to)
		udb_rel_ptr_link(base, ptr, to);
	else	ptr->data = to;
}
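
/*
 * Usage sketch (hypothetical struct mydata with a udb_rel_ptr member
 * named child): on-disk pointers must be kept linked through these
 * routines, so that compaction and realloc can find and rewrite them
 * when the chunk they point into moves.  Clear the relptr before the
 * target chunk is freed.
 *
 *	struct mydata* m = (struct mydata*)UDB_REL(base, off);
 *	udb_rel_ptr_set(base, &m->child, target);  link to target chunk
 *	...
 *	udb_rel_ptr_set(base, &m->child, 0);  unlink before freeing target
 */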

void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
{
	udb_void p = list;
	while(p) {
		UDB_REL_PTR(p)->data = to;
		p = UDB_REL_PTR(p)->next;
	}
}

#ifdef UDB_CHECK
/** check that all pointers are validly chained */
static void
udb_check_ptrs_valid(udb_base* udb)
{
	size_t i;
	udb_ptr* p, *prev;
	for(i=0; i<udb->ram_size; i++) {
		prev = NULL;
		for(p=udb->ram_hash[i]; p; p=p->next) {
			assert(p->prev == prev);
			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
				== i);
			assert(p->base == &udb->base);
			prev = p;
		}
	}
}
#endif /* UDB_CHECK */

void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
{
#ifdef UDB_CHECK
	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
#endif
	memset(ptr, 0, sizeof(*ptr));
	ptr->base = &udb->base;
}

void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
{
	assert(newval == 0 || newval > 64);
	if(ptr->data)
		udb_base_unlink_ptr(udb, ptr);
	ptr->data = newval;
	if(newval)
		udb_base_link_ptr(udb, ptr);
}

int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
	size_t sz)
{
	udb_void r;
	r = udb_alloc_space(udb->alloc, sz);
	if(!r) return 0;
	udb_alloc_set_type(udb->alloc, r, type);
	udb_ptr_init(ptr, udb);
	udb_ptr_set(ptr, udb, r);
	return 1;
}
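
/*
 * Usage sketch (hypothetical size sz; udb_chunk_type_data is assumed to
 * be one of the application chunk types from udb.h): a udb_ptr is the
 * RAM-side handle; it reads the base through *p.base so a re-mmap is
 * picked up automatically, and it is linked into the ram_hash so its
 * data offset can be adjusted when chunks move.  Unlink it, or free the
 * space, before it goes out of scope.
 *
 *	udb_ptr p;
 *	if(udb_ptr_alloc_space(&p, udb, udb_chunk_type_data, sz)) {
 *		fill UDB_REL(*p.base, p.data) with sz bytes
 *		udb_ptr_set(&p, udb, 0);  drop the handle, keep the chunk
 *		or: udb_ptr_free_space(&p, udb, sz);
 *	}
 */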

void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
{
	if(ptr->data) {
		udb_void d = ptr->data;
		udb_ptr_set(ptr, udb, 0);
		udb_alloc_free(udb->alloc, d, sz);
	}
}

udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
{
	udb_void f;
	if(!ptr || ptr->data == 0)
		return udb_chunk_type_internal; /* invalid ptr, return a harmless type */
	f = chunk_from_dataptr(ptr->data);
	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
}
