1/* udb.c - u(micro) data base. 2 * By W.C.A. Wijngaards 3 * Copyright 2010, NLnet Labs. 4 * BSD, see LICENSE. 5 */ 6#include "config.h" 7#include "udb.h" 8#include <string.h> 9#include <errno.h> 10#include <stdio.h> 11#include <unistd.h> 12#include <assert.h> 13#include "lookup3.h" 14#include "util.h" 15 16/* mmap and friends */ 17#include <sys/types.h> 18#include <sys/stat.h> 19#include <fcntl.h> 20#include <sys/mman.h> 21 22/* for systems without, portable definition, failed-1 and async is a flag */ 23#ifndef MAP_FAILED 24#define MAP_FAILED ((void*)-1) 25#endif 26#ifndef MS_SYNC 27#define MS_SYNC 0 28#endif 29 30/** move and fixup xl segment */ 31static void move_xl_segment(void* base, udb_base* udb, udb_void xl, 32 udb_void n, uint64_t sz, uint64_t startseg); 33/** attempt to compact the data and move free space to the end */ 34static int udb_alloc_compact(void* base, udb_alloc* alloc); 35 36/** convert pointer to the data part to a pointer to the base of the chunk */ 37static udb_void 38chunk_from_dataptr(udb_void data) 39{ 40 /* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and 41 * that xl_chunk_d is aligned on x**1024 boundaries. 
*/ 42 udb_void xl = data - sizeof(udb_xl_chunk_d); 43 if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0) 44 return xl; 45 return data - sizeof(udb_chunk_d); 46} 47 48udb_void chunk_from_dataptr_ext(udb_void data) { 49 return chunk_from_dataptr(data); 50} 51 52#ifndef NDEBUG 53/** read last octet from a chunk */ 54static uint8_t 55chunk_get_last(void* base, udb_void chunk, int exp) 56{ 57 return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)); 58} 59#endif 60 61/** write last octet of a chunk */ 62static void 63chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value) 64{ 65 assert(exp >= 0 && exp <= 63); 66 *((uint8_t*)UDB_REL(base, chunk+((uint64_t)1<<exp)-1)) = value; 67} 68 69/** create udb_base from a file descriptor (must be at start of file) */ 70udb_base* 71udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc, 72 void* arg) 73{ 74 uint64_t m, fsz; 75 udb_glob_d g; 76 ssize_t r; 77 udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb)); 78 if(!udb) { 79 log_msg(LOG_ERR, "out of memory"); 80 close(fd); 81 return NULL; 82 } 83 udb->fname = strdup(fname); 84 if(!udb->fname) { 85 log_msg(LOG_ERR, "out of memory"); 86 free(udb); 87 close(fd); 88 return NULL; 89 } 90 udb->walkfunc = walkfunc; 91 udb->walkarg = arg; 92 udb->fd = fd; 93 udb->ram_size = 1024; 94 udb->ram_mask = (int)udb->ram_size - 1; 95 udb->ram_hash = (udb_ptr**)xalloc_array_zero(sizeof(udb_ptr*), 96 udb->ram_size); 97 if(!udb->ram_hash) { 98 free(udb->fname); 99 free(udb); 100 log_msg(LOG_ERR, "out of memory"); 101 close(fd); 102 return NULL; 103 } 104 105 /* read magic */ 106 if((r=read(fd, &m, sizeof(m))) == -1) { 107 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 108 goto fail; 109 } else if(r != (ssize_t)sizeof(m)) { 110 log_msg(LOG_ERR, "%s: file too short", fname); 111 goto fail; 112 } 113 /* TODO : what if bigendian and littleendian file, see magic */ 114 if(m != UDB_MAGIC) { 115 log_msg(LOG_ERR, "%s: wrong type of file", fname); 116 goto fail; 117 } 118 /* read header 
*/ 119 if((r=read(fd, &g, sizeof(g))) == -1) { 120 log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno)); 121 goto fail; 122 } else if(r != (ssize_t)sizeof(g)) { 123 log_msg(LOG_ERR, "%s: file too short", fname); 124 goto fail; 125 } 126 if(g.version != 0) { 127 log_msg(LOG_ERR, "%s: unknown file version %d", fname, 128 (int)g.version); 129 goto fail; 130 } 131 if(g.hsize < UDB_HEADER_SIZE) { 132 log_msg(LOG_ERR, "%s: header size too small %d", fname, 133 (int)g.hsize); 134 goto fail; 135 } 136 if(g.hsize > UDB_HEADER_SIZE) { 137 log_msg(LOG_WARNING, "%s: header size too large %d", fname, 138 (int)g.hsize); 139 goto fail; 140 } 141 if(g.clean_close != 1) { 142 log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname, 143 (int)g.clean_close); 144 goto fail; 145 } 146 if(g.dirty_alloc != 0) { 147 log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname, 148 (int)g.dirty_alloc); 149 goto fail; 150 } 151 152 /* check file size correctly written, for 4.0.2 nsd.db failure */ 153 fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END); 154 (void)lseek(fd, (off_t)0, SEEK_SET); 155 if(g.fsize != fsz) { 156 log_msg(LOG_WARNING, "%s: file size %llu but mmap header " 157 "has size %llu", fname, (unsigned long long)fsz, 158 (unsigned long long)g.fsize); 159 goto fail; 160 } 161 162 /* mmap it */ 163 if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) { 164 log_msg(LOG_ERR, "%s: file too short", fname); 165 goto fail; 166 } 167 if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ { 168 log_msg(LOG_WARNING, "%s: file size too large %llu", 169 fname, (unsigned long long)g.fsize); 170 goto fail; 171 } 172 udb->base_size = (size_t)g.fsize; 173#ifdef HAVE_MMAP 174 /* note the size_t casts must be there for portability, on some 175 * systems the layout of memory is otherwise broken. 
*/ 176 udb->base = mmap(NULL, (size_t)udb->base_size, 177 (int)PROT_READ|PROT_WRITE, (int)MAP_SHARED, 178 (int)udb->fd, (off_t)0); 179#else 180 udb->base = MAP_FAILED; errno = ENOSYS; 181#endif 182 if(udb->base == MAP_FAILED) { 183 udb->base = NULL; 184 log_msg(LOG_ERR, "mmap(size %u) error: %s", 185 (unsigned)udb->base_size, strerror(errno)); 186 fail: 187 close(fd); 188 free(udb->fname); 189 free(udb->ram_hash); 190 free(udb); 191 return NULL; 192 } 193 194 /* init completion */ 195 udb->glob_data = (udb_glob_d*)((char*)udb->base+sizeof(uint64_t)); 196 r = 0; 197 /* cannot be dirty because that is goto fail above */ 198 if(udb->glob_data->dirty_alloc != udb_dirty_clean) 199 r = 1; 200 udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)( 201 (char*)udb->glob_data+sizeof(*udb->glob_data))); 202 if(!udb->alloc) { 203 log_msg(LOG_ERR, "out of memory"); 204 udb_base_free(udb); 205 return NULL; 206 } 207 if(r) { 208 /* and compact now, or resume compacting */ 209 udb_alloc_compact(udb, udb->alloc); 210 udb_base_sync(udb, 1); 211 } 212 udb->glob_data->clean_close = 0; 213 214 return udb; 215} 216 217udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc, 218 void* arg) 219{ 220 int fd = open(fname, O_RDWR); 221 if(fd == -1) { 222 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 223 return NULL; 224 } 225 return udb_base_create_fd(fname, fd, walkfunc, arg); 226} 227 228/** init new udb_global structure */ 229static void udb_glob_init_new(udb_glob_d* g) 230{ 231 memset(g, 0, sizeof(*g)); 232 g->hsize = UDB_HEADER_SIZE; 233 g->fsize = UDB_HEADER_SIZE; 234} 235 236/** write data to file and check result */ 237static int 238write_fdata(const char* fname, int fd, void* data, size_t len) 239{ 240 ssize_t w; 241 if((w=write(fd, data, len)) == -1) { 242 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 243 close(fd); 244 return 0; 245 } else if(w != (ssize_t)len) { 246 log_msg(LOG_ERR, "%s: short write (disk full?)", fname); 247 close(fd); 248 return 
0; 249 } 250 return 1; 251} 252 253udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc, 254 void* arg) 255{ 256 uint64_t m; 257 udb_glob_d g; 258 udb_alloc_d a; 259 uint64_t endsize = UDB_HEADER_SIZE; 260 uint64_t endexp = 0; 261 int fd = open(fname, O_CREAT|O_RDWR, 0600); 262 if(fd == -1) { 263 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 264 return NULL; 265 } 266 m = UDB_MAGIC; 267 udb_glob_init_new(&g); 268 udb_alloc_init_new(&a); 269 g.clean_close = 1; 270 271 /* write new data to file (closes fd on error) */ 272 if(!write_fdata(fname, fd, &m, sizeof(m))) 273 return NULL; 274 if(!write_fdata(fname, fd, &g, sizeof(g))) 275 return NULL; 276 if(!write_fdata(fname, fd, &a, sizeof(a))) 277 return NULL; 278 if(!write_fdata(fname, fd, &endsize, sizeof(endsize))) 279 return NULL; 280 if(!write_fdata(fname, fd, &endexp, sizeof(endexp))) 281 return NULL; 282 /* rewind to start */ 283 if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) { 284 log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno)); 285 close(fd); 286 return NULL; 287 } 288 /* truncate to the right size */ 289 if(ftruncate(fd, (off_t)g.fsize) < 0) { 290 log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname, 291 (int)g.fsize, strerror(errno)); 292 close(fd); 293 return NULL; 294 } 295 return udb_base_create_fd(fname, fd, walkfunc, arg); 296} 297 298/** shrink the udb base if it has unused space at the end */ 299static void 300udb_base_shrink(udb_base* udb, uint64_t nsize) 301{ 302 udb->glob_data->dirty_alloc = udb_dirty_fsize; 303 udb->glob_data->fsize = nsize; 304 /* sync, does not *seem* to be required on Linux, but it is 305 certainly required on OpenBSD. Otherwise changed data is lost. 
*/ 306#ifdef HAVE_MMAP 307 msync(udb->base, udb->base_size, MS_ASYNC); 308#endif 309 if(ftruncate(udb->fd, (off_t)nsize) != 0) { 310 log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname, 311 (unsigned)nsize, strerror(errno)); 312 } 313 udb->glob_data->dirty_alloc = udb_dirty_clean; 314} 315 316void udb_base_close(udb_base* udb) 317{ 318 if(!udb) 319 return; 320 if(udb->fd != -1 && udb->base && udb->alloc) { 321 uint64_t nsize = udb->alloc->disk->nextgrow; 322 if(nsize < udb->base_size) 323 udb_base_shrink(udb, nsize); 324 } 325 if(udb->fd != -1) { 326 udb->glob_data->clean_close = 1; 327 close(udb->fd); 328 udb->fd = -1; 329 } 330 if(udb->base) { 331#ifdef HAVE_MMAP 332 if(munmap(udb->base, udb->base_size) == -1) { 333 log_msg(LOG_ERR, "munmap: %s", strerror(errno)); 334 } 335#endif 336 udb->base = NULL; 337 } 338} 339 340void udb_base_free(udb_base* udb) 341{ 342 if(!udb) 343 return; 344 udb_base_close(udb); 345 udb_alloc_delete(udb->alloc); 346 free(udb->ram_hash); 347 free(udb->fname); 348 free(udb); 349} 350 351void udb_base_free_keep_mmap(udb_base* udb) 352{ 353 if(!udb) return; 354 if(udb->fd != -1) { 355 close(udb->fd); 356 udb->fd = -1; 357 } 358 udb->base = NULL; 359 udb_alloc_delete(udb->alloc); 360 free(udb->ram_hash); 361 free(udb->fname); 362 free(udb); 363} 364 365void udb_base_sync(udb_base* udb, int wait) 366{ 367 if(!udb) return; 368#ifdef HAVE_MMAP 369 if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) { 370 log_msg(LOG_ERR, "msync(%s) error %s", 371 udb->fname, strerror(errno)); 372 } 373#else 374 (void)wait; 375#endif 376} 377 378/** hash a chunk pointer */ 379static uint32_t 380chunk_hash_ptr(udb_void p) 381{ 382 /* put p into an array of uint32 */ 383 uint32_t h[sizeof(p)/sizeof(uint32_t)]; 384 memcpy(&h, &p, sizeof(h)); 385 return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763); 386} 387 388/** check that the given pointer is on the bucket for the given offset */ 389int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void 
to) 390{ 391 uint32_t i = chunk_hash_ptr(to) & udb->ram_mask; 392 udb_ptr* p; 393 assert((size_t)i < udb->ram_size); 394 for(p = udb->ram_hash[i]; p; p=p->next) { 395 if(p == ptr) 396 return 1; 397 } 398 return 0; 399} 400 401/** grow the ram array */ 402static void 403grow_ram_hash(udb_base* udb, udb_ptr** newhash) 404{ 405 size_t i; 406 size_t osize= udb->ram_size; 407 udb_ptr* p, *np; 408 udb_ptr** oldhash = udb->ram_hash; 409 udb->ram_size *= 2; 410 udb->ram_mask <<= 1; 411 udb->ram_mask |= 1; 412 udb->ram_hash = newhash; 413 /* have to link in every element in the old list into the new list*/ 414 for(i=0; i<osize; i++) { 415 p = oldhash[i]; 416 while(p) { 417 np = p->next; 418 /* link into newhash */ 419 p->prev=NULL; 420 p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask]; 421 if(p->next) p->next->prev = p; 422 /* go to next element of oldhash */ 423 p = np; 424 } 425 } 426 free(oldhash); 427} 428 429void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr) 430{ 431 uint32_t i; 432#ifdef UDB_CHECK 433 assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/ 434#endif 435 udb->ram_num++; 436 if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0x7fffffff) { 437 /* grow the array, if allocation succeeds */ 438 udb_ptr** newram = (udb_ptr**)xalloc_array_zero( 439 sizeof(udb_ptr*), udb->ram_size*2); 440 if(newram) { 441 grow_ram_hash(udb, newram); 442 } 443 } 444 i = chunk_hash_ptr(ptr->data) & udb->ram_mask; 445 assert((size_t)i < udb->ram_size); 446 447 ptr->prev = NULL; 448 ptr->next = udb->ram_hash[i]; 449 udb->ram_hash[i] = ptr; 450 if(ptr->next) 451 ptr->next->prev = ptr; 452} 453 454void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr) 455{ 456 assert(ptr->data); 457#ifdef UDB_CHECK 458 assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */ 459 assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data)); 460#endif 461 udb->ram_num--; 462 if(ptr->next) 463 ptr->next->prev = ptr->prev; 464 if(ptr->prev) 465 ptr->prev->next = ptr->next; 466 
else { 467 uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask; 468 assert((size_t)i < udb->ram_size); 469 udb->ram_hash[i] = ptr->next; 470 } 471} 472 473/** change a set of ram ptrs to a new value */ 474static void 475udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd) 476{ 477 uint32_t io = chunk_hash_ptr(old) & udb->ram_mask; 478 udb_ptr* p, *np; 479 /* edit them and move them into the new position */ 480 p = udb->ram_hash[io]; 481 while(p) { 482 np = p->next; 483 if(p->data == old) { 484 udb_base_unlink_ptr(udb, p); 485 p->data = newd; 486 udb_base_link_ptr(udb, p); 487 } 488 p = np; 489 } 490} 491 492udb_rel_ptr* udb_base_get_userdata(udb_base* udb) 493{ 494 return &udb->glob_data->user_global; 495} 496 497void udb_base_set_userdata(udb_base* udb, udb_void user) 498{ 499#ifdef UDB_CHECK 500 if(user) { assert(udb_valid_dataptr(udb, user)); } 501#endif 502 udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user); 503} 504 505void udb_base_set_userflags(udb_base* udb, uint8_t v) 506{ 507 udb->glob_data->userflags = v; 508} 509 510uint8_t udb_base_get_userflags(udb_base* udb) 511{ 512 return udb->glob_data->userflags; 513} 514 515/** re-mmap the udb to specified size */ 516static void* 517udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize) 518{ 519#ifdef HAVE_MMAP 520 void* nb; 521 /* for use with valgrind, do not use mremap, but the other version */ 522#ifdef MREMAP_MAYMOVE 523 nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE); 524 if(nb == MAP_FAILED) { 525 log_msg(LOG_ERR, "mremap(%s, size %u) error %s", 526 udb->fname, (unsigned)nsize, strerror(errno)); 527 return 0; 528 } 529#else /* !HAVE MREMAP */ 530 /* use munmap-mmap to simulate mremap */ 531 if(munmap(udb->base, udb->base_size) != 0) { 532 log_msg(LOG_ERR, "munmap(%s) error %s", 533 udb->fname, strerror(errno)); 534 } 535 /* provide hint for new location */ 536 /* note the size_t casts must be there for portability, on some 537 * systems the layout of 
memory is otherwise broken. */ 538 nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE, 539 (int)MAP_SHARED, (int)udb->fd, (off_t)0); 540 /* retry the mmap without basept in case of ENOMEM (FreeBSD8), 541 * the kernel can then try to mmap it at a different location 542 * where more memory is available */ 543 if(nb == MAP_FAILED && errno == ENOMEM) { 544 nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE, 545 (int)MAP_SHARED, (int)udb->fd, (off_t)0); 546 } 547 if(nb == MAP_FAILED) { 548 log_msg(LOG_ERR, "mmap(%s, size %u) error %s", 549 udb->fname, (unsigned)nsize, strerror(errno)); 550 udb->base = NULL; 551 return 0; 552 } 553#endif /* HAVE MREMAP */ 554 if(nb != udb->base) { 555 /* fix up realpointers in udb and alloc */ 556 /* but mremap may have been nice and not move the base */ 557 udb->base = nb; 558 udb->glob_data = (udb_glob_d*)((char*)nb+sizeof(uint64_t)); 559 /* use passed alloc pointer because the udb->alloc may not 560 * be initialized yet */ 561 alloc->disk = (udb_alloc_d*)((char*)udb->glob_data 562 +sizeof(*udb->glob_data)); 563 } 564 udb->base_size = nsize; 565 return nb; 566#else /* HAVE_MMAP */ 567 (void)udb; (void)alloc; (void)nsize; 568 return NULL; 569#endif /* HAVE_MMAP */ 570} 571 572void 573udb_base_remap_process(udb_base* udb) 574{ 575 /* assume that fsize is still accessible */ 576 udb_base_remap(udb, udb->alloc, udb->glob_data->fsize); 577} 578 579/** grow file to specified size and re-mmap, return new base */ 580static void* 581udb_base_grow_and_remap(udb_base* udb, uint64_t nsize) 582{ 583 /* grow file by writing a single zero at that spot, the 584 * rest is filled in with zeroes. 
*/ 585 uint8_t z = 0; 586 ssize_t w; 587 588 assert(nsize > 0); 589 udb->glob_data->dirty_alloc = udb_dirty_fsize; 590#ifdef HAVE_PWRITE 591 if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) { 592#else 593 if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) { 594 log_msg(LOG_ERR, "fseek %s: %s", udb->fname, strerror(errno)); 595 return 0; 596 } 597 if((w=write(udb->fd, &z, sizeof(z))) == -1) { 598#endif 599 log_msg(LOG_ERR, "grow(%s, size %u) error %s", 600 udb->fname, (unsigned)nsize, strerror(errno)); 601 return 0; 602 } else if(w != (ssize_t)sizeof(z)) { 603 log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)", 604 udb->fname, (unsigned)nsize); 605 return 0; 606 } 607 udb->glob_data->fsize = nsize; 608 udb->glob_data->dirty_alloc = udb_dirty_clean; 609 return udb_base_remap(udb, udb->alloc, nsize); 610} 611 612int udb_exp_size(uint64_t a) 613{ 614 /* find enclosing value such that 2**x >= a */ 615 int x = 0; 616 uint64_t i = a; 617 assert(a != 0); 618 619 i --; 620 /* could optimise this with uint8* access, depends on endianness */ 621 /* first whole bytes */ 622 while( (i&(~(uint64_t)0xff)) ) { 623 i >>= 8; 624 x += 8; 625 } 626 /* now details */ 627 while(i) { 628 i >>= 1; 629 x ++; 630 } 631 assert( x>=0 && x<=63); 632 assert( ((uint64_t)1<<x) >= a); 633 assert( x==0 || /* <<x-1 without negative number analyzer complaints: */ (((uint64_t)1<<x)>>1) < a); 634 return x; 635} 636 637int udb_exp_offset(uint64_t o) 638{ 639 /* this means measuring the number of 0 bits on the right */ 640 /* so, if exp zero bits then (o&(2**x-1))==0 */ 641 int x = 0; 642 uint64_t i = o; 643 assert(o != 0); 644 /* first whole bytes */ 645 while( (i&(uint64_t)0xff) == 0) { 646 i >>= 8; 647 x += 8; 648 } 649 /* now details */ 650 while( (i&(uint64_t)0x1) == 0) { 651 i >>= 1; 652 x ++; 653 } 654 assert( o % ((uint64_t)1<<x) == 0); 655 assert( o % ((uint64_t)1<<(x+1)) != 0); 656 return x; 657} 658 659void udb_alloc_init_new(udb_alloc_d* a) 660{ 661 
assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0); 662 memset(a, 0, sizeof(*a)); 663 /* set new allocations after header, as if allocated in a sequence 664 * of minsize allocations */ 665 a->nextgrow = UDB_HEADER_SIZE; 666} 667 668/** fsck the file size, false if failed and file is useless */ 669static int 670fsck_fsize(udb_base* udb, udb_alloc* alloc) 671{ 672 off_t realsize; 673 log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname); 674 realsize = lseek(udb->fd, (off_t)0, SEEK_END); 675 if(realsize == (off_t)-1) { 676 log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno)); 677 return 0; 678 } 679 udb->glob_data->fsize = (uint64_t)realsize; 680 if(!udb_base_remap(udb, alloc, (uint64_t)realsize)) 681 return 0; 682 udb->glob_data->dirty_alloc = udb_dirty_clean; 683 log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname); 684 udb_base_sync(udb, 1); 685 return 1; 686} 687 688/** regenerate freelist add a new free chunk, return next todo */ 689static udb_void 690regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen) 691{ 692 udb_free_chunk_d* cp = UDB_FREE_CHUNK(c); 693 uint64_t esz = (uint64_t)1<<exp; 694 if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) { 695 return 0; 696 } 697 cp->type = udb_chunk_type_free; 698 cp->flags = 0; 699 chunk_set_last(base, c, exp, (uint8_t)exp); 700 cp->prev = 0; 701 cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP]; 702 if(cp->next) 703 UDB_FREE_CHUNK(cp->next)->prev = c; 704 regen->stat_free += esz; 705 return c + esz; 706} 707 708/** regenerate xl chunk, return next todo */ 709static udb_void 710regen_xl(void* base, udb_void c, udb_alloc_d* regen) 711{ 712 udb_xl_chunk_d* cp = UDB_XL_CHUNK(c); 713 uint64_t xlsz = cp->size; 714 if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) { 715 return 0; 716 } 717 if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) { 718 return 0; 719 } 720 /* fixup end-size and end-expmarker */ 721 regen->stat_alloc += xlsz; 722 return c + xlsz; 723} 724 725/** 
regenerate data chunk, return next todo */ 726static udb_void 727regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen) 728{ 729 uint64_t esz = (uint64_t)1<<exp; 730 if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) { 731 return 0; 732 } 733 chunk_set_last(base, c, exp, (uint8_t)exp); 734 regen->stat_alloc += esz; 735 return c + esz; 736} 737 738/** regenerate a relptr structure inside a data segment */ 739static void 740regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg) 741{ 742 udb_void* a = (udb_void*)arg; 743 /* ignore 0 pointers */ 744 if(!rp->data) 745 return; 746 747 /* edit relptrs that point to oldmoved to point to newmoved. */ 748 if(rp->data == a[0]) 749 rp->data = a[1]; 750 751 /* regenerate relptr lists, add this item to the relptr list for 752 * the data that it points to */ 753 udb_rel_ptr_link(base, rp, rp->data); 754} 755 756/** regenerate the relptrs store in this data segment */ 757static void 758regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp, 759 void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new) 760{ 761 udb_void arg[2]; 762 arg[0] = rb_old; arg[1] = rb_new; 763 /* walk through the structs here and put them on their respective 764 * relptr lists */ 765 (*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz, 766 ®en_relptr_func, arg); 767 768} 769 770/** regenerate relptrlists in the file */ 771static void 772regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc, 773 udb_void rb_old, udb_void rb_new) 774{ 775 udb_void at = alloc->udb->glob_data->hsize; 776 /* clear all ptrlist start pointers in the file. 
*/ 777 while(at < alloc->disk->nextgrow) { 778 int exp = (int)UDB_CHUNK(at)->exp; 779 udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type; 780 if(exp == UDB_EXP_XL) { 781 UDB_XL_CHUNK(at)->ptrlist = 0; 782 at += UDB_XL_CHUNK(at)->size; 783 } else if(tp == udb_chunk_type_free) { 784 at += (uint64_t)1<<exp; 785 } else { /* data chunk */ 786 UDB_CHUNK(at)->ptrlist = 0; 787 at += (uint64_t)1<<exp; 788 } 789 } 790 /* walk through all relptr structs and put on the right list. */ 791 at = alloc->udb->glob_data->hsize; 792 while(at < alloc->disk->nextgrow) { 793 udb_chunk_d* atp = UDB_CHUNK(at); 794 int exp = (int)atp->exp; 795 udb_chunk_type tp = (udb_chunk_type)atp->type; 796 uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size: 797 (uint64_t)1<<exp); 798 if(exp == UDB_EXP_XL) { 799 assert(at != rb_old); /* should have been freed */ 800 regen_its_ptrs(base, udb, atp, 801 ((char*)atp)+sizeof(udb_xl_chunk_d), 802 sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2, 803 rb_old, rb_new); 804 at += sz; 805 } else if(tp == udb_chunk_type_free) { 806 at += sz; 807 } else { /* data chunk */ 808 assert(at != rb_old); /* should have been freed */ 809 regen_its_ptrs(base, udb, atp, 810 ((char*)atp)+sizeof(udb_chunk_d), 811 sz-sizeof(udb_chunk_d)-1, rb_old, rb_new); 812 at += sz; 813 } 814 } 815} 816 817 818/** mark free elements from ex XL chunk space and later fixups pick that up */ 819static void 820rb_mark_free_segs(void* base, udb_void s, uint64_t m) 821{ 822 udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE; 823 /* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/ 824 assert(s >= UDB_ALLOC_CHUNK_SIZE); 825 while(q >= s) { 826 UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX; 827 UDB_CHUNK(q)->type = udb_chunk_type_free; 828 q -= UDB_ALLOC_CHUNK_SIZE; 829 } 830} 831 832 833/** fsck rollback or rollforward XL move results */ 834static int 835fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new, 836 uint64_t rb_size, uint64_t rb_seg) 837{ 838 839 if(rb_old 
<= rb_new) 840 return 0; /* XL move one way */ 841 if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) 842 return 0; /* not aligned */ 843 if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) 844 return 0; /* not aligned */ 845 if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) 846 return 0; /* not aligned */ 847 if(rb_new + rb_size <= rb_old) { 848 /* not overlapping: resume copy */ 849 memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size); 850 /* and free up old piece(s) */ 851 rb_mark_free_segs(base, rb_old, rb_size); 852 } else { 853 /* overlapping, see what segment we stopped at 854 * and continue there. */ 855 move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg); 856 /* free up old piece(s); from the end of the moved segment, 857 * until the end of the old segment */ 858 rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)- 859 (rb_new+rb_size)); 860 } 861 /* do not call fix_ptrs, regenptrs does the job */ 862 return 1; 863} 864 865/** fsck rollback or rollforward move results */ 866static int 867fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size, 868 udb_void* make_free) 869{ 870 if( (rb_size&(rb_size-1)) != 0) 871 return 0; /* not powerof2 */ 872 if( (rb_old&(rb_size-1)) != 0) 873 return 0; /* not aligned */ 874 if( (rb_new&(rb_size-1)) != 0) 875 return 0; /* not aligned */ 876 /* resume copy */ 877 memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size); 878 /* do not call fix_ptrs, regenptrs does the job */ 879 /* make sure udb_old is freed */ 880 *make_free = rb_old; 881 return 1; 882} 883 884/** fsck the file and salvage, false if failed and file is useless */ 885static int 886fsck_file(udb_base* udb, udb_alloc* alloc, int moved) 887{ 888 void* base = udb->base; 889 udb_alloc_d regen; 890 udb_void at = udb->glob_data->hsize; 891 udb_void rb_old = udb->glob_data->rb_old; 892 udb_void rb_new = udb->glob_data->rb_new; 893 udb_void rb_seg = udb->glob_data->rb_seg; 894 udb_void make_free = 0; 895 uint64_t rb_size = udb->glob_data->rb_size; 896 
log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname); 897 /* walk through the file, use the exp values to see what can be 898 * salvaged */ 899 if(moved && rb_old && rb_new && rb_size) { 900 if(rb_old+rb_size <= alloc->disk->nextgrow 901 && rb_new+rb_size <= alloc->disk->nextgrow) { 902 /* we can use the move information to fix up the 903 * duplicate element (or partially moved element) */ 904 if(rb_size > 1024*1024) { 905 /* XL chunk */ 906 if(!fsck_rb_xl(base, udb, rb_old, rb_new, 907 rb_size, rb_seg)) 908 return 0; 909 } else { 910 if(!fsck_rb(base, rb_old, rb_new, rb_size, 911 &make_free)) 912 return 0; 913 } 914 } 915 } 916 917 /* rebuild freelists */ 918 /* recalculate stats in alloc (except 'stat_data') */ 919 /* possibly new end 'nextgrow' value */ 920 memset(®en, 0, sizeof(regen)); 921 regen.nextgrow = alloc->disk->nextgrow; 922 while(at < regen.nextgrow) { 923 /* figure out this chunk */ 924 int exp = (int)UDB_CHUNK(at)->exp; 925 udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type; 926 /* consistency check possible here with end-exp */ 927 if(tp == udb_chunk_type_free || at == make_free) { 928 at = regen_free(base, at, exp, ®en); 929 if(!at) return 0; 930 } else if(exp == UDB_EXP_XL) { 931 /* allocated data of XL size */ 932 at = regen_xl(base, at, ®en); 933 if(!at) return 0; 934 } else if(exp >= UDB_ALLOC_CHUNK_MINEXP 935 && exp <= UDB_ALLOC_CHUNKS_MAX) { 936 /* allocated data */ 937 at = regen_data(base, at, exp, ®en); 938 if(!at) return 0; 939 } else { 940 /* garbage; this must be EOF then */ 941 regen.nextgrow = at; 942 break; 943 } 944 } 945 *alloc->disk = regen; 946 947 /* rebuild relptr lists */ 948 regen_ptrlist(base, udb, alloc, rb_old, rb_new); 949 950 log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)", 951 udb->fname); 952 udb->glob_data->rb_old = 0; 953 udb->glob_data->rb_new = 0; 954 udb->glob_data->rb_size = 0; 955 udb->glob_data->dirty_alloc = udb_dirty_clean; 956 udb_base_sync(udb, 1); 957 return 1; 958} 959 960 
961udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk) 962{ 963 udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc)); 964 if(!alloc) 965 return NULL; 966 alloc->udb = udb; 967 alloc->disk = disk; 968 /* see if committed but uncompleted actions need to be done */ 969 /* preserves the alloc state */ 970 if(udb->glob_data->dirty_alloc != udb_dirty_clean) { 971 if(udb->glob_data->dirty_alloc == udb_dirty_fsize) { 972 if(fsck_fsize(udb, alloc)) 973 return alloc; 974 } else if(udb->glob_data->dirty_alloc == udb_dirty_fl) { 975 if(fsck_file(udb, alloc, 0)) 976 return alloc; 977 } else if(udb->glob_data->dirty_alloc == udb_dirty_compact) { 978 if(fsck_file(udb, alloc, 1)) 979 return alloc; 980 } 981 log_msg(LOG_ERR, "error: file allocation dirty (%d)", 982 (int)udb->glob_data->dirty_alloc); 983 free(alloc); 984 return NULL; 985 } 986 return alloc; 987} 988 989void udb_alloc_delete(udb_alloc* alloc) 990{ 991 if(!alloc) return; 992 free(alloc); 993} 994 995/** unlink this element from its freelist */ 996static void 997udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp) 998{ 999 udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk); 1000 assert(chunk); 1001 /* chunk is a free chunk */ 1002 assert(fp->exp == (uint8_t)exp); 1003 assert(fp->type == udb_chunk_type_free); 1004 assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp); 1005 /* and thus freelist not empty */ 1006 assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]); 1007 /* unlink */ 1008 if(fp->prev) 1009 UDB_FREE_CHUNK(fp->prev)->next = fp->next; 1010 else alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next; 1011 if(fp->next) 1012 UDB_FREE_CHUNK(fp->next)->prev = fp->prev; 1013} 1014 1015/** pop first element off freelist, list may not be empty */ 1016static udb_void 1017udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp) 1018{ 1019 udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]; 1020 udb_free_chunk_d* fp = UDB_FREE_CHUNK(f); 1021 assert(f); 1022 assert(fp->exp == 
	(uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	/* unhook the first entry from this size class' freelist */
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
	if(fp->next) {
		UDB_FREE_CHUNK(fp->next)->prev = 0;
	}
	return f;
}

/** push new element onto freelist.
 * Fully (re)initializes the chunk header as a free chunk and prepends
 * it to the doubly-linked freelist for its size class.
 * @param base: mmapped base of the udb file.
 * @param alloc: allocator; freelists live in alloc->disk.
 * @param f: offset (relative to base) of the chunk header.
 * @param exp: size class; the chunk is 2**exp octets.
 */
static void
udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	fp->exp = (uint8_t)exp;
	fp->type = udb_chunk_type_free;
	fp->flags = 0;
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	/* last octet mirrors exp so the following chunk can find us
	 * when coagulating downwards */
	chunk_set_last(base, f, exp, (uint8_t)exp);
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** push new element onto freelist - do not initialize the elt.
 * The chunk must already carry the correct exp, free type and last
 * octet (asserted below); only the list links are written.  Skipping
 * the last-octet write can avoid touching (dirtying) the final page
 * of the chunk.
 */
static void
udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** add free chunks at end until specified alignment occurs.
 * Each iteration creates the largest buddy-aligned free chunk that fits
 * at nextgrow, until nextgrow is a whole multiple of esz.
 */
static void
grow_align(void* base, udb_alloc* alloc, uint64_t esz)
{
	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
		/* the nextgrow is not a whole multiple of esz. */
		/* grow a free chunk of max allowed size */
		int fexp = udb_exp_offset(alloc->disk->nextgrow);
		uint64_t fsz = (uint64_t)1<<fexp;
		udb_void f = alloc->disk->nextgrow;
		udb_void fn = alloc->disk->nextgrow+fsz;
		assert(fn <= alloc->udb->base_size);
		alloc->disk->stat_free += fsz;
		udb_alloc_push_fl(base, alloc, f, fexp);
		/* now increase nextgrow to commit that free chunk */
		alloc->disk->nextgrow = fn;
	}
}

/** append chunks at end of memory space to get size exp, return dataptr.
 * Caller must have ensured enough base_size is available.
 * The dirty_alloc flag brackets the update so a crash mid-way can be
 * detected and rolled back on next open.
 */
static udb_void
grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	uint64_t esz = (uint64_t)1<<exp;
	udb_void ret;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	grow_align(base, alloc, esz);
	/* free chunks are grown, grow the one we want to use */
	ret = alloc->disk->nextgrow;
	/* take a new alloced chunk into use */
	UDB_CHUNK(ret)->exp = (uint8_t)exp;
	UDB_CHUNK(ret)->flags = 0;
	UDB_CHUNK(ret)->ptrlist = 0;
	UDB_CHUNK(ret)->type = udb_chunk_type_data;
	/* store last octet */
	chunk_set_last(base, ret, exp, (uint8_t)exp);
	/* update stats */
	alloc->disk->stat_alloc += esz;
	alloc->disk->stat_data += sz;
	/* now increase nextgrow to commit this newly allocated chunk */
	alloc->disk->nextgrow += esz;
	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_chunk_d); /* ptr to data */
}

/** calculate how much space is necessary to grow for this exp.
 * Returns the file offset the allocation would end at, including any
 * alignment padding grow_align would insert.
 */
static uint64_t
grow_end_calc(udb_alloc* alloc, int exp)
{
	uint64_t sz = (uint64_t)1<<exp;
	uint64_t ng = alloc->disk->nextgrow;
	uint64_t res;
	/* if nextgrow is 2**expness, no extra growth needed, only size */
	if( (ng & (sz-1)) == 0) {
		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
		return ng+sz; /* must grow exactly 2**exp */
	}
	/* grow until 2**expness and then we need 2**exp as well */
	/* so, round ng down to whole sz (basically ng-ng%sz, or ng/sz*sz)
	 * and then add the sz twice (go up to whole sz, and to allocate) */
	res = (ng & ~(sz-1)) + 2*sz;
	return res;
}

/** see if we need to grow more than specified to enable sustained growth.
 * @param ge: minimum grow end as computed by grow_end_calc.
 * @return possibly larger target size for the file.
 */
static uint64_t
grow_extra_check(udb_alloc* alloc, uint64_t ge)
{
	const uint64_t mb = 1024*1024;
	uint64_t bsz = alloc->udb->base_size;
	if(bsz <= mb) {
		/* below 1 Mb, double sizes for exponential growth */
		/* takes about 15 times to grow to 1Mb */
		if(ge < bsz*2)
			return bsz*2;
	} else {
		uint64_t gnow = ge - bsz;
		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
		 * in whole megabytes rounded up. */
		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
		if(gnow < want)
			return bsz + want;
	}
	return ge;
}

/** see if free space is enough to warrant shrink (while file is open).
 * Uses hysteresis (grow 12.5%, shrink only at 25% slack) to avoid
 * repeated grow/shrink remaps.
 */
static int
enough_free(udb_alloc* alloc)
{
	if(alloc->udb->base_size <= 2*1024*1024) {
		/* below 1 Mb, grown by double size, (so up to 2 mb),
		 * do not shrink unless we can 1/3 in size */
		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
			return 1;
	} else {
		/* grown 12.5%, shrink 25% if possible, at least one mb */
		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
			|| alloc->udb->base_size < 4*1024*1024))
			return 1;
	}
	return 0;
}

/** grow space for a chunk of 2**exp and return dataptr */
static udb_void
udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	/* commit the grow action
	 * - the file grow only changes filesize, but not the nextgrow.
	 * - taking space after nextgrow into use (as free space),
	 * is like free-ing a chunk (one at a time).
	 * - and the last chunk taken into use is like alloc.
	 */
	/* predict how much free space is needed for this */
	uint64_t grow_end = grow_end_calc(alloc, exp);
	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
	if(grow_end <= alloc->udb->base_size) {
		/* we can do this with the available space */
		return grow_chunks(base, alloc, sz, exp);
	}
	/* we have to grow the file, re-mmap */
	/* see if we need to grow a little more, to avoid endless grow
	 * efforts on adding data */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_chunks(base, alloc, sz, exp);
}

/** take XL allocation into use at end of file, return dataptr.
 * @param xlsz: whole allocation size (multiple of UDB_ALLOC_CHUNK_SIZE,
 *	includes header and trailing size+marker octets).
 * @param sz: user data size, for statistics only.
 */
static udb_void
grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
{
	udb_void ret;
	udb_xl_chunk_d* p;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;

	/* align growth to whole mbs */
	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);

	/* grow XL segment */
	ret = alloc->disk->nextgrow;
	p = UDB_XL_CHUNK(ret);
	p->exp = UDB_EXP_XL;
	p->size = xlsz;
	p->flags = 0;
	p->ptrlist = 0;
	p->type = udb_chunk_type_data;

	/* also put size and marker at end for compaction */
	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;

	/* stats */
	alloc->disk->stat_data += sz;
	alloc->disk->stat_alloc += xlsz;
	/* now increase the nextgrow to commit this xl chunk */
	alloc->disk->nextgrow += xlsz;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
}

/** make space for XL allocation; rounds up to whole-Mb chunks and grows
 * the file if necessary. Returns dataptr, or 0 on failure. */
static udb_void
udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
{
	/* allocate whole mbs of space, at end of space */
	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
	assert(need >= asz);
	if(grow_end <= alloc->udb->base_size) {
		/* can do this in available space */
		return grow_xl(base, alloc, need, sz);
	}
	/* have to grow file and re-mmap */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_xl(base, alloc, need, sz);
}

/** divide big(2**e2) into pieces so 2**exp fits.
 * The upper halves are pushed onto their freelists; the first (lowest)
 * piece of size 2**exp is returned for use by the caller.
 */
static udb_void
udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
	int exp)
{
	int e = e2;
	uint64_t sz = (uint64_t)1<<e2;
	assert(big && e2 > exp);
	/* so the returned piece to use is the first piece,
	 * offload the later half until it fits */
	do {
		sz >>= 1; /* divide size of big by two */
		e--; /* that means its exp is one smaller */
		udb_alloc_push_fl(base, alloc, big+sz, e);
	} while(e != exp);
	/* exit loop when last pushed is same size as what we want */
	return big;
}

/** returns the exponent size of the chunk needed for data sz.
 * Accounts for the chunk header and the trailing exp octet. */
static int
udb_alloc_exp_needed(size_t sz)
{
	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
	int exp;
	if(asz > UDB_ALLOC_CHUNK_SIZE) {
		return UDB_EXP_XL;
	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
		return UDB_ALLOC_CHUNK_MINEXP;
	}
	exp = udb_exp_size(asz);
	assert(exp <= UDB_ALLOC_CHUNKS_MAX);
	return exp;
}

/** allocate sz octets of data space; returns dataptr or 0 on failure.
 * Tries, in order: exact-size freelist, subdividing a larger free
 * chunk, growing the file. XL sizes take a separate path. */
udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
{
	void* base = alloc->udb->base;
	/* calculate actual allocation size */
	int e2, exp = udb_alloc_exp_needed(sz);
	if(exp == UDB_EXP_XL)
		return udb_alloc_xl_space(base, alloc, sz);
	/* see if there is a free chunk of that size exactly */
	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
		/* snip from freelist, udb_chunk_d */
		udb_void ret;
		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
		ret = udb_alloc_pop_fl(base, alloc, exp);
		/* use it - size octets already OK */
		UDB_CHUNK(ret)->flags = 0;
		UDB_CHUNK(ret)->ptrlist = 0;
		UDB_CHUNK(ret)->type = udb_chunk_type_data;
		/* update stats */
		alloc->disk->stat_data += sz;
		alloc->disk->stat_alloc += (1<<exp);
		assert(alloc->disk->stat_free >= (1u<<exp));
		alloc->disk->stat_free -= (1<<exp);
		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		return ret + sizeof(udb_chunk_d); /* ptr to data */
	}
	/* see if we can subdivide a larger chunk */
	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
			udb_void big, ret; /* udb_chunk_d */
			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
			big = udb_alloc_pop_fl(base, alloc, e2);
			/* push other parts onto freelists (needs inited) */
			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
			/* use final part (needs inited) */
			UDB_CHUNK(ret)->exp = (uint8_t)exp;
			/* if stop here; the new exp makes smaller free chunk*/
			UDB_CHUNK(ret)->flags = 0;
			UDB_CHUNK(ret)->ptrlist = 0;
			/* set type to commit data chunk */
			UDB_CHUNK(ret)->type = udb_chunk_type_data;
			/* store last octet */
chunk_set_last(base, ret, exp, (uint8_t)exp); 1336 /* update stats */ 1337 alloc->disk->stat_data += sz; 1338 alloc->disk->stat_alloc += (1<<exp); 1339 assert(alloc->disk->stat_free >= (1u<<exp)); 1340 alloc->disk->stat_free -= (1<<exp); 1341 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1342 return ret + sizeof(udb_chunk_d); /* ptr to data */ 1343 } 1344 /* we need to grow an extra chunk */ 1345 return udb_alloc_grow_space(base, alloc, sz, exp); 1346} 1347 1348/** see if there is free space to allocate a chunk into */ 1349static int 1350have_free_for(udb_alloc* alloc, int exp) 1351{ 1352 int e2; 1353 if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) 1354 return exp; 1355 for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++) 1356 if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) { 1357 return e2; 1358 } 1359 return 0; 1360} 1361 1362/** fix relptr prev and next for moved relptr structures */ 1363static void 1364chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg) 1365{ 1366 udb_void* data = (udb_void*)arg; 1367 udb_void r; 1368 if(!rp->data) 1369 return; 1370 r = UDB_SYSTOREL(base, rp); 1371 if(rp->next) 1372 UDB_REL_PTR(rp->next)->prev = r; 1373 if(rp->prev) 1374 UDB_REL_PTR(rp->prev)->next = r; 1375 else { 1376 /* if this is a pointer to its own chunk, fix it up; 1377 * the data ptr gets set by relptr_edit later. 
*/ 1378 if(rp->data == data[0]) 1379 UDB_CHUNK(data[1])->ptrlist = r; 1380 else UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r; 1381 } 1382} 1383 1384/** fix pointers from and to a moved chunk */ 1385static void 1386chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data, 1387 uint64_t dsz, udb_void olddata) 1388{ 1389 udb_void d[2]; 1390 d[0] = olddata; 1391 d[1] = data; 1392 (*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data), 1393 dsz, &chunk_fix_ptr_each, d); 1394 udb_rel_ptr_edit(base, cp->ptrlist, data); 1395 udb_base_ram_ptr_edit(udb, olddata, data); 1396} 1397 1398/** move an allocated chunk to use a free chunk */ 1399static void 1400move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz, 1401 int e2) 1402{ 1403 udb_void res = udb_alloc_pop_fl(base, alloc, e2); 1404 udb_chunk_d* rp; 1405 udb_chunk_d* fp; 1406 if(exp != e2) { 1407 /* it is bigger, subdivide it */ 1408 res = udb_alloc_subdivide(base, alloc, res, e2, exp); 1409 } 1410 assert(res != f); 1411 /* setup rollback information */ 1412 alloc->udb->glob_data->rb_old = f; 1413 alloc->udb->glob_data->rb_new = res; 1414 alloc->udb->glob_data->rb_size = esz; 1415 /* take the res, exp into use */ 1416 rp = UDB_CHUNK(res); 1417 fp = UDB_CHUNK(f); 1418 /* copy over the data */ 1419 memcpy(rp, fp, esz); 1420 /* adjust rel ptrs */ 1421 chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d), 1422 esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d)); 1423 1424 /* do not freeup the fp; caller does that */ 1425} 1426 1427/** unlink several free elements to overwrite with xl chunk */ 1428static void 1429free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m) 1430{ 1431 udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE; 1432 /* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/ 1433 assert(s >= UDB_ALLOC_CHUNK_SIZE); 1434 while(q >= s) { 1435 assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX); 1436 assert(UDB_CHUNK(q)->type == 
udb_chunk_type_free); 1437 udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX); 1438 q -= UDB_ALLOC_CHUNK_SIZE; 1439 } 1440} 1441 1442/** move an XL chunk, and keep track of segments for rollback */ 1443static void 1444move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n, 1445 uint64_t sz, uint64_t startseg) 1446{ 1447 udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); 1448 udb_xl_chunk_d* np = UDB_XL_CHUNK(n); 1449 uint64_t amount = xl - n; 1450 assert(n < xl); /* move to compact */ 1451 1452 /* setup move rollback */ 1453 udb->glob_data->rb_old = xl; 1454 udb->glob_data->rb_new = n; 1455 udb->glob_data->rb_size = sz; 1456 1457 /* is it overlapping? */ 1458 if(sz <= amount) { 1459 memcpy(np, xlp, sz); 1460 } else { 1461 /* move and commit per 1M segment to avoid data loss */ 1462 uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE; 1463 for(seg = startseg; seg<maxseg; seg++) { 1464 udb->glob_data->rb_seg = seg; 1465 memcpy(np+seg*UDB_ALLOC_CHUNK_SIZE, 1466 xlp+seg*UDB_ALLOC_CHUNK_SIZE, 1467 UDB_ALLOC_CHUNK_SIZE); 1468 } 1469 1470 } 1471} 1472 1473/** move list of XL chunks to the front by the shift amount */ 1474static void 1475move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz, 1476 uint64_t amount) 1477{ 1478 udb_void xl = xl_start; 1479 assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */ 1480 assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */ 1481 assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */ 1482 while(xl < xl_start+xl_sz) { 1483 udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); 1484 udb_void n = xl-amount; 1485 uint64_t sz = xlp->size; 1486 assert(xlp->exp == UDB_EXP_XL); 1487 move_xl_segment(base, alloc->udb, xl, n, sz, 0); 1488 chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n), 1489 n+sizeof(udb_xl_chunk_d), 1490 sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2, 1491 xl+sizeof(udb_xl_chunk_d)); 1492 } 1493 alloc->disk->stat_free -= amount; 1494 alloc->disk->nextgrow -= amount; 1495 
alloc->udb->glob_data->rb_old = 0; 1496 alloc->udb->glob_data->rb_new = 0; 1497 alloc->udb->glob_data->rb_size = 0; 1498} 1499 1500/** see if free chunk can coagulate with another chunk, return other chunk */ 1501static udb_void 1502coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp, 1503 uint64_t esz) 1504{ 1505 udb_void other = f^esz; 1506 if(exp == UDB_ALLOC_CHUNKS_MAX) 1507 return 0; /* no further merges */ 1508 if(other >= alloc->udb->base_size) 1509 return 0; /* not allocated */ 1510 if(other >= alloc->disk->nextgrow) 1511 return 0; /* not in use */ 1512 if(other < alloc->udb->glob_data->hsize) 1513 return 0; /* cannot merge with header */ 1514 /* the header is also protected by the special exp marker */ 1515 /* see if the other chunk is a free chunk */ 1516 1517 /* check closest marker to avoid large memory churn */ 1518 /* and also it makes XL allocations and header special markers work */ 1519 if(f > other) { 1520 assert(f > 1); /* this is certain because of header */ 1521 if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) { 1522 /* can do it if the other part is a free chunk */ 1523 assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp); 1524 if(UDB_CHUNK(other)->type == udb_chunk_type_free) 1525 return other; 1526 } 1527 } else { 1528 if(UDB_CHUNK(other)->exp == (uint8_t)exp) { 1529 /* can do it if the other part is a free chunk */ 1530 assert(chunk_get_last(base, other, exp)==(uint8_t)exp); 1531 if(UDB_CHUNK(other)->type == udb_chunk_type_free) 1532 return other; 1533 } 1534 } 1535 return 0; 1536} 1537 1538/** coagulate and then add new free segment, return final free segment */ 1539static udb_void 1540coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp, 1541 uint64_t esz) 1542{ 1543 /* new free chunk here, attempt coagulate */ 1544 udb_void other; 1545 while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) { 1546 /* unlink that other chunk */ 1547 udb_alloc_unlink_fl(base, alloc, other, exp); 1548 /* merge 
up */
		if(other < last)
			last = other;
		exp++;
		esz <<= 1;
	}
	/* free the final segment */
	udb_alloc_push_fl(base, alloc, last, exp);
	return last;
}

/** attempt to compact the data and move free space to the end.
 * Walks backwards from nextgrow using the per-chunk trailing exp octet,
 * moving allocated chunks into earlier free space and trimming trailing
 * free chunks.  XL chunks at the end are gathered into a list and
 * shifted down as whole megabytes.  The dirty_alloc / rb_* fields in
 * glob_data record rollback state for crash recovery.
 * @return 0 on remap failure, 1 otherwise.
 * (Declared static in the forward declaration at the top of the file;
 * the definition keeps internal linkage.)
 */
int
udb_alloc_compact(void* base, udb_alloc* alloc)
{
	udb_void last;
	int exp, e2;
	uint64_t esz;
	uint64_t at = alloc->disk->nextgrow;
	udb_void xl_start = 0;
	uint64_t xl_sz = 0;
	if(alloc->udb->inhibit_compact)
		return 1;
	alloc->udb->useful_compact = 0;
	while(at > alloc->udb->glob_data->hsize) {
		/* grab last entry */
		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
		if(exp == UDB_EXP_XL) {
			/* for XL chunks:
			 * - inspect the size of the XLchunklist at end
			 * - attempt to compact in front of the XLchunklist
			 */
			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
				at-sizeof(uint64_t)*2));
			udb_void xl = at-xlsz;
#ifndef NDEBUG
			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
			assert(xlp->exp == UDB_EXP_XL);
			assert(xlp->type != udb_chunk_type_free);
#endif
			/* got the segment, add to the xl chunk list */
			if(xl_start != 0 && xl+xlsz != xl_start) {
				/* nonadjoining XL part, but they are aligned,
				 * so the space in between is whole Mbs,
				 * shift the later part(s) and continue */
				uint64_t m = xl_start - (xl+xlsz);
				assert(xl_start > xl+xlsz);
				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
				free_xl_space(base, alloc, xl+xlsz, m);
				move_xl_list(base, alloc, xl_start, xl_sz, m);
				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			}
			xl_start = xl;
			xl_sz += xlsz;
			at = xl;
			continue;
			/* end of XL if */
		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
			|| exp > UDB_ALLOC_CHUNKS_MAX)
			break; /* special chunk or garbage */
		esz = (uint64_t)1<<exp;
		last = at - esz;
		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
			/* if xlstart continue looking to move stuff, but do
			 * not unlink this free segment */
			if(!xl_start) {
				/* it is a free chunk, remove it */
				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
				udb_alloc_unlink_fl(base, alloc, last, exp);
				alloc->disk->stat_free -= esz;
				alloc->disk->nextgrow = last;
				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
				/* and continue at this point */
			}
			at = last;
		} else if( (e2=have_free_for(alloc, exp)) ) {
			/* last entry can be allocated in free chunks
			 * move it to its new position, adjust rel_ptrs */
			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
			move_chunk(base, alloc, last, exp, esz, e2);
			if(xl_start) {
				/* cannot shorten usage while XL chunks sit
				 * behind us; leave a coagulated free chunk */
				last = coagulate_and_push(base, alloc,
					last, exp, esz);
			} else {
				/* shorten usage */
				alloc->disk->stat_free -= esz;
				alloc->disk->nextgrow = last;
			}
			alloc->udb->glob_data->rb_old = 0;
			alloc->udb->glob_data->rb_new = 0;
			alloc->udb->glob_data->rb_size = 0;
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			/* and continue in front of it */
			at = last;
		} else {
			/* cannot compact this block, stop compacting */
			break;
		}
		/* if that worked, repeat it */
	}
	/* if we passed xl chunks, see if XL-chunklist can move */
	if(xl_start) {
		/* calculate free space in front of the XLchunklist. */
		/* has to be whole mbs of free space */
		/* if so, we can move the XL chunks. Move them all back
		 * by the new free space. */
		/* this compacts very well, but the XL chunks can be moved
		 * multiple times; worst case for every mb freed a huge sized
		 * xlchunklist gets moved. */
		/* free space must be, since aligned and coagulated, in
		 * chunks of a whole MB */
		udb_void at = xl_start;
		uint64_t m = 0;
		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
				break;
			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
			m += UDB_ALLOC_CHUNK_SIZE;
			at = chunk;
		}
		if(m != 0) {
			assert(at+m == xl_start);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
			free_xl_space(base, alloc, at, m);
			move_xl_list(base, alloc, xl_start, xl_sz, m);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		}
	}

	/* if enough free, shrink the file; re-mmap */
	if(enough_free(alloc)) {
		uint64_t nsize = alloc->disk->nextgrow;
		udb_base_shrink(alloc->udb, nsize);
		if(!udb_base_remap(alloc->udb, alloc, nsize))
			return 0;
	}
	return 1;
}

/** compact the database if a free marked it useful; 1 on success/no-op */
int
udb_compact(udb_base* udb)
{
	if(!udb) return 1;
	if(!udb->useful_compact) return 1;
	DEBUG(DEBUG_DBACCESS, 1, (LOG_INFO, "Compacting database..."));
	return udb_alloc_compact(udb->base, udb->alloc);
}

/** enable or disable automatic compaction on free */
void udb_compact_inhibited(udb_base* udb, int inhibit)
{
	if(!udb) return;
	udb->inhibit_compact = inhibit;
}

#ifdef UDB_CHECK
/** check that rptrs are really zero before free */
void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
{
	(void)base;
	(void)arg;
	assert(p->data == 0);
}
#endif /* UDB_CHECK */

/** free XL chunk as multiples of CHUNK_SIZE free segments.
 * @param f: offset of the XL chunk header (Mb aligned).
 * @param fp: the XL chunk header.  @param sz: user data size (stats). */
static void
udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
	size_t sz)
{
	uint64_t xlsz = fp->size;
	uint64_t c;
	/* lightweight check for buffer overflow in xl data */
	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
		&udb_check_rptr_zero, NULL);
#endif
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	/* update stats */
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_alloc -= xlsz;
	alloc->disk->stat_free += xlsz;
	/* walk in reverse, so the front blocks go first on the list */
	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE*/
	assert(f >= UDB_ALLOC_CHUNK_SIZE);
	while(c >= f) {
		/* free a block of CHUNK_SIZE (1 Mb) */
		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
		c -= UDB_ALLOC_CHUNK_SIZE;
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
}

/** free the chunk holding dataptr r of size sz.
 * Coagulates with free buddy chunks, then attempts compaction
 * (unless inhibited).  @return 0 only if a compaction remap failed. */
int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
{
	void* base;
	/* lookup chunk ptr */
	udb_void f;
	udb_chunk_d* fp;
	uint64_t esz;
	int exp;
	udb_void other;
	int coagulated = 0;
	if(!r)
		return 1; /* free(NULL) does nothing */

	/* lookup size of chunk */
	base = alloc->udb->base;
	/* fails for XL blocks */
	f = chunk_from_dataptr(r);
	fp = UDB_CHUNK(f);
	assert(fp->type != udb_chunk_type_free);

	/* see if it has a ptrlist, if so: trouble, the list is not properly
	 * cleaned up. (although you can imagine a wholesale delete where
	 * it does not matter) */
	assert(fp->ptrlist == 0);

	/* set ptrlist to 0 to stop relptr from using it, robustness. */
	fp->ptrlist = 0;

	if(fp->exp == UDB_EXP_XL) {
		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
		/* compact */
		if(alloc->udb->inhibit_compact) {
			alloc->udb->useful_compact = 1;
			return 1;
		}
		return udb_alloc_compact(base, alloc);
	}
	/* it is a regular chunk of 2**exp size */
	exp = (int)fp->exp;
	esz = (uint64_t)1<<exp;
	/* light check for e.g. buffer overflow of the data */
	assert(sz < esz);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
#endif

	/* update the stats */
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_free += esz;
	alloc->disk->stat_alloc -= esz;

	/* if it can be merged with other free chunks, do so */
	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
		coagulated = 1;
		/* unlink that other chunk and expand it (it has same size) */
		udb_alloc_unlink_fl(base, alloc, other, exp);
		/* merge up */
		if(other < f)
			f = other;
		exp++;
		esz <<= 1;
	}
	if(coagulated) {
		/* put big free chunk into freelist, and init it */
		udb_alloc_push_fl(base, alloc, f, exp);
	} else {
		/* we do not need to touch the last-exp-byte, which may save
		 * a reference to that page of memory */
		fp->type = udb_chunk_type_free;
		fp->flags = 0;
		udb_alloc_push_fl_noinit(base, alloc, f, exp);
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	/* compact */
	if(alloc->udb->inhibit_compact) {
		alloc->udb->useful_compact = 1;
		return 1;
	}
	return udb_alloc_compact(base, alloc);
}

/** allocate sz octets and initialize them with a copy of d.
 * @return dataptr, or 0 on allocation failure. */
udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
{
	/* could be faster maybe, if grown? */
	udb_void r = udb_alloc_space(alloc, sz);
	if(!r) return r;
	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
	return r;
}

/** realloc dataptr r from osz to sz octets, posix-realloc style:
 * r==0 allocates, sz==0 frees and returns 0.  If the existing chunk
 * already has room, r is returned unchanged.  Otherwise a new chunk is
 * allocated, data copied and relptrs fixed up.
 * @return new dataptr, or 0 on failure (original still allocated). */
udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
{
	void* base = alloc->udb->base;
	udb_void c, n, newd;
	udb_chunk_d* cp, *np;
	uint64_t avail;
	uint8_t cp_type;
	/* emulate some posix realloc stuff */
	if(r == 0)
		return udb_alloc_space(alloc, sz);
	if(sz == 0) {
		if(!udb_alloc_free(alloc, r, osz))
			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
		return 0;
	}
	c = chunk_from_dataptr(r);
	cp = UDB_CHUNK(c);
	cp_type = cp->type;
	if(cp->exp == UDB_EXP_XL) {
		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
			- sizeof(uint64_t)*2;
	} else {
		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
	}
	if(sz <= avail)
		return r;
	/* reallocate it, and copy */
	newd = udb_alloc_space(alloc, sz);
	if(!newd) return 0;
	/* re-base after alloc, since re-mmap may have happened */
	base = alloc->udb->base;
	cp = NULL; /* may be invalid now, robustness */
	n = chunk_from_dataptr(newd);
	np = UDB_CHUNK(n);
	np->type = cp_type;
	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
	/* fixup ptrs */
	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);

	if(!udb_alloc_free(alloc, r, osz))
		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
	return newd;
}

/** pre-grow the file so num allocations of sz octets fit.
 * @return 1 on success, 0 if the file grow/remap failed. */
int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
{
	const uint64_t mb = 1024*1024;
	int exp = udb_alloc_exp_needed(sz);
	uint64_t esz;
	uint64_t want;
	if(exp == UDB_EXP_XL)
		esz = (sz&(mb-1))+mb;
	else esz = (uint64_t)1<<exp;
	/* we need grow_end_calc to take into account alignment */
	want = grow_end_calc(alloc, exp) + esz*(num-1);
	assert(want >= alloc->udb->base_size);
if(!udb_base_grow_and_remap(alloc->udb, want)) { 1900 log_msg(LOG_ERR, "failed to grow the specified amount"); 1901 return 0; 1902 } 1903 return 1; 1904} 1905 1906void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp) 1907{ 1908 void* base = alloc->udb->base; 1909 udb_void f = chunk_from_dataptr(r); 1910 udb_chunk_d* fp = UDB_CHUNK(f); 1911 /* not the 'free' type, that must be set by allocation routines */ 1912 assert(fp->type != udb_chunk_type_free); 1913 assert(tp != udb_chunk_type_free); 1914 fp->type = tp; 1915} 1916 1917int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize) 1918{ 1919 /* pointers are not valid before the header-size or after the 1920 * used-region of the mmap */ 1921 return ( (to+destsize) <= udb->base_size && 1922 to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) && 1923 (to+destsize) <= udb->alloc->disk->nextgrow); 1924} 1925 1926int udb_valid_dataptr(udb_base* udb, udb_void to) 1927{ 1928 void* base = udb->base; 1929 udb_void ch; 1930 int exp; 1931 uint64_t esz; 1932 /* our data chunks are aligned and at least 8 bytes */ 1933 if(!udb_valid_offset(udb, to, sizeof(uint64_t))) 1934 return 0; 1935 /* get the chunk pointer */ 1936 ch = chunk_from_dataptr(to); 1937 if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d))) 1938 return 0; 1939 /* check its size */ 1940 exp = UDB_CHUNK(ch)->exp; 1941 if(exp == UDB_EXP_XL) { 1942 /* check XL chunk */ 1943 uint64_t xlsz; 1944 if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d))) 1945 return 0; 1946 xlsz = UDB_XL_CHUNK(ch)->size; 1947 if(!udb_valid_offset(udb, ch+xlsz-1, 1)) 1948 return 0; 1949 if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL) 1950 return 0; 1951 if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2)) 1952 != xlsz) 1953 return 0; 1954 return 1; 1955 } 1956 /* check if regular chunk has matching end byte */ 1957 if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) 1958 return 0; /* cannot be a valid chunk */ 1959 esz = 1<<exp; 1960 
if(!udb_valid_offset(udb, ch+esz-1, 1)) 1961 return 0; 1962 if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp) 1963 return 0; 1964 return 1; 1965} 1966 1967int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to) 1968{ 1969 void* base = udb->base; 1970 udb_void p; 1971 if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr))) 1972 return 0; 1973 if(!udb_valid_dataptr(udb, to)) 1974 return 0; 1975 p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist; 1976 while(p) { 1977 if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr))) 1978 return 0; 1979 if(p == rptr) 1980 return 1; 1981 p = UDB_REL_PTR(p)->next; 1982 } 1983 return 0; 1984} 1985 1986void udb_rel_ptr_init(udb_rel_ptr* ptr) 1987{ 1988 memset(ptr, 0, sizeof(*ptr)); 1989} 1990 1991void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr) 1992{ 1993 if(!ptr->data) 1994 return; 1995 if(ptr->prev) { 1996 UDB_REL_PTR(ptr->prev)->next = ptr->next; 1997 } else { 1998 UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next; 1999 } 2000 if(ptr->next) { 2001 UDB_REL_PTR(ptr->next)->prev = ptr->prev; 2002 } 2003} 2004 2005void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to) 2006{ 2007 udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to)); 2008 ptr->prev = 0; 2009 ptr->next = chunk->ptrlist; 2010 if(ptr->next) 2011 UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr); 2012 chunk->ptrlist = UDB_SYSTOREL(base, ptr); 2013 ptr->data = to; 2014} 2015 2016void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to) 2017{ 2018 assert(to == 0 || to > 64); 2019 udb_rel_ptr_unlink(base, ptr); 2020 if(to) 2021 udb_rel_ptr_link(base, ptr, to); 2022 else ptr->data = to; 2023} 2024 2025void udb_rel_ptr_edit(void* base, udb_void list, udb_void to) 2026{ 2027 udb_void p = list; 2028 while(p) { 2029 UDB_REL_PTR(p)->data = to; 2030 p = UDB_REL_PTR(p)->next; 2031 } 2032} 2033 2034#ifdef UDB_CHECK 2035/** check that all pointers are validly chained */ 2036static void 2037udb_check_ptrs_valid(udb_base* udb) 2038{ 2039 size_t i; 2040 
udb_ptr* p, *prev; 2041 for(i=0; i<udb->ram_size; i++) { 2042 prev = NULL; 2043 for(p=udb->ram_hash[i]; p; p=p->next) { 2044 assert(p->prev == prev); 2045 assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask) 2046 == i); 2047 assert(p->base == &udb->base); 2048 prev = p; 2049 } 2050 } 2051} 2052#endif /* UDB_CHECK */ 2053 2054void udb_ptr_init(udb_ptr* ptr, udb_base* udb) 2055{ 2056#ifdef UDB_CHECK 2057 udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */ 2058#endif 2059 memset(ptr, 0, sizeof(*ptr)); 2060 ptr->base = &udb->base; 2061} 2062 2063void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval) 2064{ 2065 assert(newval == 0 || newval > 64); 2066 if(ptr->data) 2067 udb_base_unlink_ptr(udb, ptr); 2068 ptr->data = newval; 2069 if(newval) 2070 udb_base_link_ptr(udb, ptr); 2071} 2072 2073int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type, 2074 size_t sz) 2075{ 2076 udb_void r; 2077 r = udb_alloc_space(udb->alloc, sz); 2078 if(!r) return 0; 2079 udb_alloc_set_type(udb->alloc, r, type); 2080 udb_ptr_init(ptr, udb); 2081 udb_ptr_set(ptr, udb, r); 2082 return 1; 2083} 2084 2085void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz) 2086{ 2087 if(ptr->data) { 2088 udb_void d = ptr->data; 2089 udb_ptr_set(ptr, udb, 0); 2090 udb_alloc_free(udb->alloc, d, sz); 2091 } 2092} 2093 2094udb_chunk_type udb_ptr_get_type(udb_ptr* ptr) 2095{ 2096 udb_void f; 2097 if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad*/ 2098 f = chunk_from_dataptr(ptr->data); 2099 return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type; 2100} 2101