1 /* udb.c - u(micro) data base. 2 * By W.C.A. Wijngaards 3 * Copyright 2010, NLnet Labs. 4 * BSD, see LICENSE. 5 */ 6 #include "config.h" 7 #include "udb.h" 8 #include <string.h> 9 #include <errno.h> 10 #include <stdio.h> 11 #include <unistd.h> 12 #include <assert.h> 13 #include "lookup3.h" 14 #include "util.h" 15 16 /* mmap and friends */ 17 #include <sys/types.h> 18 #include <sys/stat.h> 19 #include <fcntl.h> 20 #include <sys/mman.h> 21 22 /* for systems without, portable definition, failed-1 and async is a flag */ 23 #ifndef MAP_FAILED 24 #define MAP_FAILED ((void*)-1) 25 #endif 26 #ifndef MS_SYNC 27 #define MS_SYNC 0 28 #endif 29 30 /** move and fixup xl segment */ 31 static void move_xl_segment(void* base, udb_base* udb, udb_void xl, 32 udb_void n, uint64_t sz, uint64_t startseg); 33 /** attempt to compact the data and move free space to the end */ 34 static int udb_alloc_compact(void* base, udb_alloc* alloc); 35 36 /** convert pointer to the data part to a pointer to the base of the chunk */ 37 static udb_void 38 chunk_from_dataptr(udb_void data) 39 { 40 /* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and 41 * that xl_chunk_d is aligned on x**1024 boundaries. */ 42 udb_void xl = data - sizeof(udb_xl_chunk_d); 43 if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0) 44 return xl; 45 return data - sizeof(udb_chunk_d); 46 } 47 48 udb_void chunk_from_dataptr_ext(udb_void data) { 49 return chunk_from_dataptr(data); 50 } 51 52 #ifndef NDEBUG 53 /** read last octet from a chunk */ 54 static uint8_t 55 chunk_get_last(void* base, udb_void chunk, int exp) 56 { 57 return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)); 58 } 59 #endif 60 61 /** write last octet of a chunk */ 62 static void 63 chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value) 64 { 65 *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)) = value; 66 } 67 68 /** create udb_base from a file descriptor (must be at start of file) */ 69 udb_base* 70 udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc, 71 void* arg) 72 { 73 uint64_t m, fsz; 74 udb_glob_d g; 75 ssize_t r; 76 udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb)); 77 if(!udb) { 78 log_msg(LOG_ERR, "out of memory"); 79 close(fd); 80 return NULL; 81 } 82 udb->fname = strdup(fname); 83 if(!udb->fname) { 84 log_msg(LOG_ERR, "out of memory"); 85 free(udb); 86 close(fd); 87 return NULL; 88 } 89 udb->walkfunc = walkfunc; 90 udb->walkarg = arg; 91 udb->fd = fd; 92 udb->ram_size = 1024; 93 udb->ram_mask = (int)udb->ram_size - 1; 94 udb->ram_hash = (udb_ptr**)xalloc_array_zero(sizeof(udb_ptr*), 95 udb->ram_size); 96 if(!udb->ram_hash) { 97 free(udb->fname); 98 free(udb); 99 log_msg(LOG_ERR, "out of memory"); 100 close(fd); 101 return NULL; 102 } 103 104 /* read magic */ 105 if((r=read(fd, &m, sizeof(m))) == -1) { 106 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 107 goto fail; 108 } else if(r != (ssize_t)sizeof(m)) { 109 log_msg(LOG_ERR, "%s: file too short", fname); 110 goto fail; 111 } 112 /* TODO : what if bigendian and littleendian file, see magic */ 113 if(m != UDB_MAGIC) { 114 log_msg(LOG_ERR, "%s: wrong type of file", fname); 115 goto fail; 116 } 117 /* read header */ 118 if((r=read(fd, &g, sizeof(g))) == -1) { 119 log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno)); 120 goto fail; 121 } else if(r != (ssize_t)sizeof(g)) { 122 log_msg(LOG_ERR, "%s: file too short", fname); 123 goto fail; 124 } 125 if(g.version != 0) { 126 log_msg(LOG_ERR, "%s: unknown file version %d", fname, 127 (int)g.version); 128 goto fail; 129 } 130 if(g.hsize < 
UDB_HEADER_SIZE) { 131 log_msg(LOG_ERR, "%s: header size too small %d", fname, 132 (int)g.hsize); 133 goto fail; 134 } 135 if(g.hsize > UDB_HEADER_SIZE) { 136 log_msg(LOG_WARNING, "%s: header size too large %d", fname, 137 (int)g.hsize); 138 goto fail; 139 } 140 if(g.clean_close != 1) { 141 log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname, 142 (int)g.clean_close); 143 goto fail; 144 } 145 if(g.dirty_alloc != 0) { 146 log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname, 147 (int)g.dirty_alloc); 148 goto fail; 149 } 150 151 /* check file size correctly written, for 4.0.2 nsd.db failure */ 152 fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END); 153 (void)lseek(fd, (off_t)0, SEEK_SET); 154 if(g.fsize != fsz) { 155 log_msg(LOG_WARNING, "%s: file size %llu but mmap header " 156 "has size %llu", fname, (unsigned long long)fsz, 157 (unsigned long long)g.fsize); 158 goto fail; 159 } 160 161 /* mmap it */ 162 if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) { 163 log_msg(LOG_ERR, "%s: file too short", fname); 164 goto fail; 165 } 166 if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ { 167 log_msg(LOG_WARNING, "%s: file size too large %llu", 168 fname, (unsigned long long)g.fsize); 169 goto fail; 170 } 171 udb->base_size = (size_t)g.fsize; 172 #ifdef HAVE_MMAP 173 /* note the size_t casts must be there for portability, on some 174 * systems the layout of memory is otherwise broken. */ 175 udb->base = mmap(NULL, (size_t)udb->base_size, 176 (int)PROT_READ|PROT_WRITE, (int)MAP_SHARED, 177 (int)udb->fd, (off_t)0); 178 #else 179 udb->base = MAP_FAILED; errno = ENOSYS; 180 #endif 181 if(udb->base == MAP_FAILED) { 182 udb->base = NULL; 183 log_msg(LOG_ERR, "mmap(size %u) error: %s", 184 (unsigned)udb->base_size, strerror(errno)); 185 fail: 186 close(fd); 187 free(udb->fname); 188 free(udb->ram_hash); 189 free(udb); 190 return NULL; 191 } 192 193 /* init completion */ 194 udb->glob_data = (udb_glob_d*)(udb->base+sizeof(uint64_t)); 195 r = 0; 196 /* cannot be dirty because that is goto fail above */ 197 if(udb->glob_data->dirty_alloc != udb_dirty_clean) 198 r = 1; 199 udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)( 200 (void*)udb->glob_data+sizeof(*udb->glob_data))); 201 if(!udb->alloc) { 202 log_msg(LOG_ERR, "out of memory"); 203 udb_base_free(udb); 204 return NULL; 205 } 206 if(r) { 207 /* and compact now, or resume compacting */ 208 udb_alloc_compact(udb, udb->alloc); 209 udb_base_sync(udb, 1); 210 } 211 udb->glob_data->clean_close = 0; 212 213 return udb; 214 } 215 216 udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc, 217 void* arg) 218 { 219 int fd = open(fname, O_RDWR); 220 if(fd == -1) { 221 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 222 return NULL; 223 } 224 return udb_base_create_fd(fname, fd, walkfunc, arg); 225 } 226 227 /** init new udb_global structure */ 228 static void udb_glob_init_new(udb_glob_d* g) 229 { 230 memset(g, 0, sizeof(*g)); 231 g->hsize = UDB_HEADER_SIZE; 232 g->fsize = UDB_HEADER_SIZE; 233 } 234 235 /** write data to file and check result */ 236 static int 237 write_fdata(const char* fname, int fd, void* data, size_t len) 238 { 239 ssize_t w; 240 if((w=write(fd, data, len)) == -1) { 241 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 242 close(fd); 243 return 0; 244 } else if(w != (ssize_t)len) { 245 log_msg(LOG_ERR, "%s: short write (disk full?)", fname); 246 close(fd); 247 return 0; 248 } 249 return 1; 250 } 251 252 udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc, 253 void* 
arg) 254 { 255 uint64_t m; 256 udb_glob_d g; 257 udb_alloc_d a; 258 uint64_t endsize = UDB_HEADER_SIZE; 259 uint64_t endexp = 0; 260 int fd = open(fname, O_CREAT|O_RDWR, 0600); 261 if(fd == -1) { 262 log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); 263 return NULL; 264 } 265 m = UDB_MAGIC; 266 udb_glob_init_new(&g); 267 udb_alloc_init_new(&a); 268 g.clean_close = 1; 269 270 /* write new data to file (closes fd on error) */ 271 if(!write_fdata(fname, fd, &m, sizeof(m))) 272 return NULL; 273 if(!write_fdata(fname, fd, &g, sizeof(g))) 274 return NULL; 275 if(!write_fdata(fname, fd, &a, sizeof(a))) 276 return NULL; 277 if(!write_fdata(fname, fd, &endsize, sizeof(endsize))) 278 return NULL; 279 if(!write_fdata(fname, fd, &endexp, sizeof(endexp))) 280 return NULL; 281 /* rewind to start */ 282 if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) { 283 log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno)); 284 close(fd); 285 return NULL; 286 } 287 /* truncate to the right size */ 288 if(ftruncate(fd, (off_t)g.fsize) < 0) { 289 log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname, 290 (int)g.fsize, strerror(errno)); 291 close(fd); 292 return NULL; 293 } 294 return udb_base_create_fd(fname, fd, walkfunc, arg); 295 } 296 297 /** shrink the udb base if it has unused space at the end */ 298 static void 299 udb_base_shrink(udb_base* udb, uint64_t nsize) 300 { 301 udb->glob_data->dirty_alloc = udb_dirty_fsize; 302 udb->glob_data->fsize = nsize; 303 /* sync, does not *seem* to be required on Linux, but it is 304 certainly required on OpenBSD. Otherwise changed data is lost. */ 305 #ifdef HAVE_MMAP 306 msync(udb->base, udb->base_size, MS_ASYNC); 307 #endif 308 if(ftruncate(udb->fd, (off_t)nsize) != 0) { 309 log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname, 310 (unsigned)nsize, strerror(errno)); 311 } 312 udb->glob_data->dirty_alloc = udb_dirty_clean; 313 } 314 315 void udb_base_close(udb_base* udb) 316 { 317 if(!udb) 318 return; 319 if(udb->fd != -1 && udb->base && udb->alloc) { 320 uint64_t nsize = udb->alloc->disk->nextgrow; 321 if(nsize < udb->base_size) 322 udb_base_shrink(udb, nsize); 323 } 324 if(udb->fd != -1) { 325 udb->glob_data->clean_close = 1; 326 close(udb->fd); 327 udb->fd = -1; 328 } 329 if(udb->base) { 330 #ifdef HAVE_MMAP 331 if(munmap(udb->base, udb->base_size) == -1) { 332 log_msg(LOG_ERR, "munmap: %s", strerror(errno)); 333 } 334 #endif 335 udb->base = NULL; 336 } 337 } 338 339 void udb_base_free(udb_base* udb) 340 { 341 if(!udb) 342 return; 343 udb_base_close(udb); 344 udb_alloc_delete(udb->alloc); 345 free(udb->ram_hash); 346 free(udb->fname); 347 free(udb); 348 } 349 350 void udb_base_free_keep_mmap(udb_base* udb) 351 { 352 if(!udb) return; 353 if(udb->fd != -1) { 354 close(udb->fd); 355 udb->fd = -1; 356 } 357 udb->base = NULL; 358 udb_alloc_delete(udb->alloc); 359 free(udb->ram_hash); 360 free(udb->fname); 361 free(udb); 362 } 363 364 void udb_base_sync(udb_base* udb, int wait) 365 { 366 if(!udb) return; 367 #ifdef HAVE_MMAP 368 if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) { 369 log_msg(LOG_ERR, "msync(%s) error %s", 370 udb->fname, strerror(errno)); 371 } 372 #else 373 (void)wait; 374 #endif 375 } 376 377 /** hash a chunk pointer */ 378 static uint32_t 379 chunk_hash_ptr(udb_void p) 380 { 381 /* put p into an array of uint32 */ 382 uint32_t h[sizeof(p)/sizeof(uint32_t)]; 383 memcpy(&h, &p, sizeof(h)); 384 return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763); 385 } 386 387 /** check that the given pointer is on the bucket for the given offset */ 388 int 
udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to) 389 { 390 uint32_t i = chunk_hash_ptr(to) & udb->ram_mask; 391 udb_ptr* p; 392 assert((size_t)i < udb->ram_size); 393 for(p = udb->ram_hash[i]; p; p=p->next) { 394 if(p == ptr) 395 return 1; 396 } 397 return 0; 398 } 399 400 /** grow the ram array */ 401 static void 402 grow_ram_hash(udb_base* udb, udb_ptr** newhash) 403 { 404 size_t i; 405 size_t osize= udb->ram_size; 406 udb_ptr* p, *np; 407 udb_ptr** oldhash = udb->ram_hash; 408 udb->ram_size *= 2; 409 udb->ram_mask <<= 1; 410 udb->ram_mask |= 1; 411 udb->ram_hash = newhash; 412 /* have to link in every element in the old list into the new list*/ 413 for(i=0; i<osize; i++) { 414 p = oldhash[i]; 415 while(p) { 416 np = p->next; 417 /* link into newhash */ 418 p->prev=NULL; 419 p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask]; 420 if(p->next) p->next->prev = p; 421 /* go to next element of oldhash */ 422 p = np; 423 } 424 } 425 free(oldhash); 426 } 427 428 void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr) 429 { 430 uint32_t i; 431 #ifdef UDB_CHECK 432 assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/ 433 #endif 434 udb->ram_num++; 435 if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0x7fffffff) { 436 /* grow the array, if allocation succeeds */ 437 udb_ptr** newram = (udb_ptr**)xalloc_array_zero( 438 sizeof(udb_ptr*), udb->ram_size*2); 439 if(newram) { 440 grow_ram_hash(udb, newram); 441 } 442 } 443 i = chunk_hash_ptr(ptr->data) & udb->ram_mask; 444 assert((size_t)i < udb->ram_size); 445 446 ptr->prev = NULL; 447 ptr->next = udb->ram_hash[i]; 448 udb->ram_hash[i] = ptr; 449 if(ptr->next) 450 ptr->next->prev = ptr; 451 } 452 453 void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr) 454 { 455 assert(ptr->data); 456 #ifdef UDB_CHECK 457 assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */ 458 assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data)); 459 #endif 460 udb->ram_num--; 461 if(ptr->next) 462 ptr->next->prev = ptr->prev; 463 if(ptr->prev) 464 ptr->prev->next = ptr->next; 465 else { 466 uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask; 467 assert((size_t)i < udb->ram_size); 468 udb->ram_hash[i] = ptr->next; 469 } 470 } 471 472 /** change a set of ram ptrs to a new value */ 473 static void 474 udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd) 475 { 476 uint32_t io = chunk_hash_ptr(old) & udb->ram_mask; 477 udb_ptr* p, *np; 478 /* edit them and move them into the new position */ 479 p = udb->ram_hash[io]; 480 while(p) { 481 np = p->next; 482 if(p->data == old) { 483 udb_base_unlink_ptr(udb, p); 484 p->data = newd; 485 udb_base_link_ptr(udb, p); 486 } 487 p = np; 488 } 489 } 490 491 udb_rel_ptr* udb_base_get_userdata(udb_base* udb) 492 { 493 return &udb->glob_data->user_global; 494 } 495 496 void udb_base_set_userdata(udb_base* udb, udb_void user) 497 { 498 #ifdef UDB_CHECK 499 if(user) { assert(udb_valid_dataptr(udb, user)); } 500 #endif 501 udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user); 502 } 503 504 void udb_base_set_userflags(udb_base* udb, uint8_t v) 505 { 506 udb->glob_data->userflags = v; 507 } 508 509 uint8_t udb_base_get_userflags(udb_base* udb) 510 { 511 return udb->glob_data->userflags; 512 } 513 514 /** re-mmap the udb to specified size */ 515 static void* 516 udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize) 517 { 518 #ifdef HAVE_MMAP 519 void* nb; 520 /* for use with valgrind, do not use mremap, but the other version */ 521 #ifdef MREMAP_MAYMOVE 522 nb = 
mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE); 523 if(nb == MAP_FAILED) { 524 log_msg(LOG_ERR, "mremap(%s, size %u) error %s", 525 udb->fname, (unsigned)nsize, strerror(errno)); 526 return 0; 527 } 528 #else /* !HAVE MREMAP */ 529 /* use munmap-mmap to simulate mremap */ 530 if(munmap(udb->base, udb->base_size) != 0) { 531 log_msg(LOG_ERR, "munmap(%s) error %s", 532 udb->fname, strerror(errno)); 533 } 534 /* provide hint for new location */ 535 /* note the size_t casts must be there for portability, on some 536 * systems the layout of memory is otherwise broken. */ 537 nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE, 538 (int)MAP_SHARED, (int)udb->fd, (off_t)0); 539 /* retry the mmap without basept in case of ENOMEM (FreeBSD8), 540 * the kernel can then try to mmap it at a different location 541 * where more memory is available */ 542 if(nb == MAP_FAILED && errno == ENOMEM) { 543 nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE, 544 (int)MAP_SHARED, (int)udb->fd, (off_t)0); 545 } 546 if(nb == MAP_FAILED) { 547 log_msg(LOG_ERR, "mmap(%s, size %u) error %s", 548 udb->fname, (unsigned)nsize, strerror(errno)); 549 udb->base = NULL; 550 return 0; 551 } 552 #endif /* HAVE MREMAP */ 553 if(nb != udb->base) { 554 /* fix up realpointers in udb and alloc */ 555 /* but mremap may have been nice and not move the base */ 556 udb->base = nb; 557 udb->glob_data = (udb_glob_d*)(nb+sizeof(uint64_t)); 558 /* use passed alloc pointer because the udb->alloc may not 559 * be initialized yet */ 560 alloc->disk = (udb_alloc_d*)((void*)udb->glob_data 561 +sizeof(*udb->glob_data)); 562 } 563 udb->base_size = nsize; 564 return nb; 565 #else /* HAVE_MMAP */ 566 (void)udb; (void)alloc; (void)nsize; 567 return NULL; 568 #endif /* HAVE_MMAP */ 569 } 570 571 void 572 udb_base_remap_process(udb_base* udb) 573 { 574 /* assume that fsize is still accessible */ 575 udb_base_remap(udb, udb->alloc, udb->glob_data->fsize); 576 } 577 578 /** grow file to specified size and re-mmap, return new base */ 579 static void* 580 udb_base_grow_and_remap(udb_base* udb, uint64_t nsize) 581 { 582 /* grow file by writing a single zero at that spot, the 583 * rest is filled in with zeroes. 
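* (writing one byte at the new end offset extends the file; the bytes in between read back as zero, typically as a sparse hole on the filesystem)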
*/ 584 uint8_t z = 0; 585 ssize_t w; 586 587 assert(nsize > 0); 588 udb->glob_data->dirty_alloc = udb_dirty_fsize; 589 #ifdef HAVE_PWRITE 590 if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) { 591 #else 592 if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) { 593 log_msg(LOG_ERR, "fseek %s: %s", udb->fname, strerror(errno)); 594 return 0; 595 } 596 if((w=write(udb->fd, &z, sizeof(z))) == -1) { 597 #endif 598 log_msg(LOG_ERR, "grow(%s, size %u) error %s", 599 udb->fname, (unsigned)nsize, strerror(errno)); 600 return 0; 601 } else if(w != (ssize_t)sizeof(z)) { 602 log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)", 603 udb->fname, (unsigned)nsize); 604 return 0; 605 } 606 udb->glob_data->fsize = nsize; 607 udb->glob_data->dirty_alloc = udb_dirty_clean; 608 return udb_base_remap(udb, udb->alloc, nsize); 609 } 610 611 int udb_exp_size(uint64_t a) 612 { 613 /* find enclosing value such that 2**x >= a */ 614 int x = 0; 615 uint64_t i = a; 616 assert(a != 0); 617 618 i --; 619 /* could optimise this with uint8* access, depends on endianness */ 620 /* first whole bytes */ 621 while( (i&(~(uint64_t)0xff)) ) { 622 i >>= 8; 623 x += 8; 624 } 625 /* now details */ 626 while(i) { 627 i >>= 1; 628 x ++; 629 } 630 assert( ((uint64_t)1<<x) >= a); 631 assert( x==0 || ((uint64_t)1<<(x-1)) < a); 632 return x; 633 } 634 635 int udb_exp_offset(uint64_t o) 636 { 637 /* this means measuring the number of 0 bits on the right */ 638 /* so, if exp zero bits then (o&(2**x-1))==0 */ 639 int x = 0; 640 uint64_t i = o; 641 assert(o != 0); 642 /* first whole bytes */ 643 while( (i&(uint64_t)0xff) == 0) { 644 i >>= 8; 645 x += 8; 646 } 647 /* now details */ 648 while( (i&(uint64_t)0x1) == 0) { 649 i >>= 1; 650 x ++; 651 } 652 assert( o % ((uint64_t)1<<x) == 0); 653 assert( o % ((uint64_t)1<<(x+1)) != 0); 654 return x; 655 } 656 657 void udb_alloc_init_new(udb_alloc_d* a) 658 { 659 assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0); 660 memset(a, 0, sizeof(*a)); 661 /* set new allocations after header, as if allocated in a sequence 662 * of minsize allocations */ 663 a->nextgrow = UDB_HEADER_SIZE; 664 } 665 666 /** fsck the file size, false if failed and file is useless */ 667 static int 668 fsck_fsize(udb_base* udb, udb_alloc* alloc) 669 { 670 off_t realsize; 671 log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname); 672 realsize = lseek(udb->fd, (off_t)0, SEEK_END); 673 if(realsize == (off_t)-1) { 674 log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno)); 675 return 0; 676 } 677 udb->glob_data->fsize = (uint64_t)realsize; 678 if(!udb_base_remap(udb, alloc, (uint64_t)realsize)) 679 return 0; 680 udb->glob_data->dirty_alloc = udb_dirty_clean; 681 log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname); 682 udb_base_sync(udb, 1); 683 return 1; 684 } 685 686 /** regenerate freelist add a new free chunk, return next todo */ 687 static udb_void 688 regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen) 689 { 690 udb_free_chunk_d* cp = UDB_FREE_CHUNK(c); 691 uint64_t esz = (uint64_t)1<<exp; 692 if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) { 693 return 0; 694 } 695 cp->type = udb_chunk_type_free; 696 cp->flags = 0; 697 chunk_set_last(base, c, exp, (uint8_t)exp); 698 cp->prev = 0; 699 cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP]; 700 if(cp->next) 701 UDB_FREE_CHUNK(cp->next)->prev = c; 702 regen->stat_free += esz; 703 return c + esz; 704 } 705 706 /** regenerate xl chunk, return next todo */ 707 static udb_void 708 regen_xl(void* base, 
udb_void c, udb_alloc_d* regen) 709 { 710 udb_xl_chunk_d* cp = UDB_XL_CHUNK(c); 711 uint64_t xlsz = cp->size; 712 if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) { 713 return 0; 714 } 715 if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) { 716 return 0; 717 } 718 /* fixup end-size and end-expmarker */ 719 regen->stat_alloc += xlsz; 720 return c + xlsz; 721 } 722 723 /** regenerate data chunk, return next todo */ 724 static udb_void 725 regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen) 726 { 727 uint64_t esz = (uint64_t)1<<exp; 728 if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) { 729 return 0; 730 } 731 chunk_set_last(base, c, exp, (uint8_t)exp); 732 regen->stat_alloc += esz; 733 return c + esz; 734 } 735 736 /** regenerate a relptr structure inside a data segment */ 737 static void 738 regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg) 739 { 740 udb_void* a = (udb_void*)arg; 741 /* ignore 0 pointers */ 742 if(!rp->data) 743 return; 744 745 /* edit relptrs that point to oldmoved to point to newmoved. */ 746 if(rp->data == a[0]) 747 rp->data = a[1]; 748 749 /* regenerate relptr lists, add this item to the relptr list for 750 * the data that it points to */ 751 udb_rel_ptr_link(base, rp, rp->data); 752 } 753 754 /** regenerate the relptrs stored in this data segment */ 755 static void 756 regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp, 757 void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new) 758 { 759 udb_void arg[2]; 760 arg[0] = rb_old; arg[1] = rb_new; 761 /* walk through the structs here and put them on their respective 762 * relptr lists */ 763 (*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz, 764 &regen_relptr_func, arg); 765 766 } 767 768 /** regenerate relptrlists in the file */ 769 static void 770 regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc, 771 udb_void rb_old, udb_void rb_new) 772 { 773 udb_void at = alloc->udb->glob_data->hsize; 774 /* clear all ptrlist start pointers in the file. */ 775 while(at < alloc->disk->nextgrow) { 776 int exp = (int)UDB_CHUNK(at)->exp; 777 udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type; 778 if(exp == UDB_EXP_XL) { 779 UDB_XL_CHUNK(at)->ptrlist = 0; 780 at += UDB_XL_CHUNK(at)->size; 781 } else if(tp == udb_chunk_type_free) { 782 at += (uint64_t)1<<exp; 783 } else { /* data chunk */ 784 UDB_CHUNK(at)->ptrlist = 0; 785 at += (uint64_t)1<<exp; 786 } 787 } 788 /* walk through all relptr structs and put on the right list. 
*/ 789 at = alloc->udb->glob_data->hsize; 790 while(at < alloc->disk->nextgrow) { 791 udb_chunk_d* atp = UDB_CHUNK(at); 792 int exp = (int)atp->exp; 793 udb_chunk_type tp = (udb_chunk_type)atp->type; 794 uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size: 795 (uint64_t)1<<exp); 796 if(exp == UDB_EXP_XL) { 797 assert(at != rb_old); /* should have been freed */ 798 regen_its_ptrs(base, udb, atp, 799 ((void*)atp)+sizeof(udb_xl_chunk_d), 800 sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2, 801 rb_old, rb_new); 802 at += sz; 803 } else if(tp == udb_chunk_type_free) { 804 at += sz; 805 } else { /* data chunk */ 806 assert(at != rb_old); /* should have been freed */ 807 regen_its_ptrs(base, udb, atp, 808 ((void*)atp)+sizeof(udb_chunk_d), 809 sz-sizeof(udb_chunk_d)-1, rb_old, rb_new); 810 at += sz; 811 } 812 } 813 } 814 815 816 /** mark free elements from ex XL chunk space and later fixups pick that up */ 817 static void 818 rb_mark_free_segs(void* base, udb_void s, uint64_t m) 819 { 820 udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE; 821 /* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/ 822 assert(s >= UDB_ALLOC_CHUNK_SIZE); 823 while(q >= s) { 824 UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX; 825 UDB_CHUNK(q)->type = udb_chunk_type_free; 826 q -= UDB_ALLOC_CHUNK_SIZE; 827 } 828 } 829 830 831 /** fsck rollback or rollforward XL move results */ 832 static int 833 fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new, 834 uint64_t rb_size, uint64_t rb_seg) 835 { 836 837 if(rb_old <= rb_new) 838 return 0; /* XL move one way */ 839 if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) 840 return 0; /* not aligned */ 841 if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) 842 return 0; /* not aligned */ 843 if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) 844 return 0; /* not aligned */ 845 if(rb_new + rb_size <= rb_old) { 846 /* not overlapping: resume copy */ 847 memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size); 848 /* and free up old piece(s) */ 849 rb_mark_free_segs(base, rb_old, rb_size); 850 } else { 851 /* overlapping, see what segment we stopped at 852 * and continue there. 
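* rb_seg records the 1 Mb segment that was being copied when the update stopped, so move_xl_segment resumes at that segment.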
*/ 853 move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg); 854 /* free up old piece(s); from the end of the moved segment, 855 * until the end of the old segment */ 856 rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)- 857 (rb_new+rb_size)); 858 } 859 /* do not call fix_ptrs, regenptrs does the job */ 860 return 1; 861 } 862 863 /** fsck rollback or rollforward move results */ 864 static int 865 fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size, 866 udb_void* make_free) 867 { 868 if( (rb_size&(rb_size-1)) != 0) 869 return 0; /* not powerof2 */ 870 if( (rb_old&(rb_size-1)) != 0) 871 return 0; /* not aligned */ 872 if( (rb_new&(rb_size-1)) != 0) 873 return 0; /* not aligned */ 874 /* resume copy */ 875 memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size); 876 /* do not call fix_ptrs, regenptrs does the job */ 877 /* make sure udb_old is freed */ 878 *make_free = rb_old; 879 return 1; 880 } 881 882 /** fsck the file and salvage, false if failed and file is useless */ 883 static int 884 fsck_file(udb_base* udb, udb_alloc* alloc, int moved) 885 { 886 void* base = udb->base; 887 udb_alloc_d regen; 888 udb_void at = udb->glob_data->hsize; 889 udb_void rb_old = udb->glob_data->rb_old; 890 udb_void rb_new = udb->glob_data->rb_new; 891 udb_void rb_seg = udb->glob_data->rb_seg; 892 udb_void make_free = 0; 893 uint64_t rb_size = udb->glob_data->rb_size; 894 log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname); 895 /* walk through the file, use the exp values to see what can be 896 * salvaged */ 897 if(moved && rb_old && rb_new && rb_size) { 898 if(rb_old+rb_size <= alloc->disk->nextgrow 899 && rb_new+rb_size <= alloc->disk->nextgrow) { 900 /* we can use the move information to fix up the 901 * duplicate element (or partially moved element) */ 902 if(rb_size > 1024*1024) { 903 /* XL chunk */ 904 if(!fsck_rb_xl(base, udb, rb_old, rb_new, 905 rb_size, rb_seg)) 906 return 0; 907 } else { 908 if(!fsck_rb(base, rb_old, rb_new, rb_size, 909 &make_free)) 910 return 0; 911 } 912 } 913 } 914 915 /* rebuild freelists */ 916 /* recalculate stats in alloc (except 'stat_data') */ 917 /* possibly new end 'nextgrow' value */ 918 memset(&regen, 0, sizeof(regen)); 919 regen.nextgrow = alloc->disk->nextgrow; 920 while(at < regen.nextgrow) { 921 /* figure out this chunk */ 922 int exp = (int)UDB_CHUNK(at)->exp; 923 udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type; 924 /* consistency check possible here with end-exp */ 925 if(tp == udb_chunk_type_free || at == make_free) { 926 at = regen_free(base, at, exp, &regen); 927 if(!at) return 0; 928 } else if(exp == UDB_EXP_XL) { 929 /* allocated data of XL size */ 930 at = regen_xl(base, at, &regen); 931 if(!at) return 0; 932 } else if(exp >= UDB_ALLOC_CHUNK_MINEXP 933 && exp <= UDB_ALLOC_CHUNKS_MAX) { 934 /* allocated data */ 935 at = regen_data(base, at, exp, &regen); 936 if(!at) return 0; 937 } else { 938 /* garbage; this must be EOF then */ 939 regen.nextgrow = at; 940 break; 941 } 942 } 943 *alloc->disk = regen; 944 945 /* rebuild relptr lists */ 946 regen_ptrlist(base, udb, alloc, rb_old, rb_new); 947 948 log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)", 949 udb->fname); 950 udb->glob_data->rb_old = 0; 951 udb->glob_data->rb_new = 0; 952 udb->glob_data->rb_size = 0; 953 udb->glob_data->dirty_alloc = udb_dirty_clean; 954 udb_base_sync(udb, 1); 955 return 1; 956 } 957 958 959 udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk) 960 { 961 udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc)); 962 if(!alloc) 
963 return NULL; 964 alloc->udb = udb; 965 alloc->disk = disk; 966 /* see if committed but uncompleted actions need to be done */ 967 /* preserves the alloc state */ 968 if(udb->glob_data->dirty_alloc != udb_dirty_clean) { 969 if(udb->glob_data->dirty_alloc == udb_dirty_fsize) { 970 if(fsck_fsize(udb, alloc)) 971 return alloc; 972 } else if(udb->glob_data->dirty_alloc == udb_dirty_fl) { 973 if(fsck_file(udb, alloc, 0)) 974 return alloc; 975 } else if(udb->glob_data->dirty_alloc == udb_dirty_compact) { 976 if(fsck_file(udb, alloc, 1)) 977 return alloc; 978 } 979 log_msg(LOG_ERR, "error: file allocation dirty (%d)", 980 (int)udb->glob_data->dirty_alloc); 981 free(alloc); 982 return NULL; 983 } 984 return alloc; 985 } 986 987 void udb_alloc_delete(udb_alloc* alloc) 988 { 989 if(!alloc) return; 990 free(alloc); 991 } 992 993 /** unlink this element from its freelist */ 994 static void 995 udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp) 996 { 997 udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk); 998 assert(chunk); 999 /* chunk is a free chunk */ 1000 assert(fp->exp == (uint8_t)exp); 1001 assert(fp->type == udb_chunk_type_free); 1002 assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp); 1003 /* and thus freelist not empty */ 1004 assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]); 1005 /* unlink */ 1006 if(fp->prev) 1007 UDB_FREE_CHUNK(fp->prev)->next = fp->next; 1008 else alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next; 1009 if(fp->next) 1010 UDB_FREE_CHUNK(fp->next)->prev = fp->prev; 1011 } 1012 1013 /** pop first element off freelist, list may not be empty */ 1014 static udb_void 1015 udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp) 1016 { 1017 udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]; 1018 udb_free_chunk_d* fp = UDB_FREE_CHUNK(f); 1019 assert(f); 1020 assert(fp->exp == (uint8_t)exp); 1021 assert(fp->type == udb_chunk_type_free); 1022 assert(chunk_get_last(base, f, exp) == (uint8_t)exp); 1023 alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next; 1024 if(fp->next) { 1025 UDB_FREE_CHUNK(fp->next)->prev = 0; 1026 } 1027 return f; 1028 } 1029 1030 /** push new element onto freelist */ 1031 static void 1032 udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp) 1033 { 1034 udb_free_chunk_d* fp = UDB_FREE_CHUNK(f); 1035 assert(f); 1036 fp->exp = (uint8_t)exp; 1037 fp->type = udb_chunk_type_free; 1038 fp->flags = 0; 1039 fp->prev = 0; 1040 fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]; 1041 if(fp->next) 1042 UDB_FREE_CHUNK(fp->next)->prev = f; 1043 chunk_set_last(base, f, exp, (uint8_t)exp); 1044 alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f; 1045 } 1046 1047 /** push new element onto freelist - do not initialize the elt */ 1048 static void 1049 udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp) 1050 { 1051 udb_free_chunk_d* fp = UDB_FREE_CHUNK(f); 1052 assert(f); 1053 assert(fp->exp == (uint8_t)exp); 1054 assert(fp->type == udb_chunk_type_free); 1055 assert(chunk_get_last(base, f, exp) == (uint8_t)exp); 1056 fp->prev = 0; 1057 fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]; 1058 if(fp->next) 1059 UDB_FREE_CHUNK(fp->next)->prev = f; 1060 alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f; 1061 } 1062 1063 /** add free chunks at end until specified alignment occurs */ 1064 static void 1065 grow_align(void* base, udb_alloc* alloc, uint64_t esz) 1066 { 1067 while( (alloc->disk->nextgrow & (esz-1)) != 0) { 1068 /* the nextgrow is not a whole multiple of esz. 
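* pad with a free chunk of size 2**(trailing zero bits of nextgrow); each step at least doubles that alignment until nextgrow is a whole multiple of esz.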
*/ 1069 /* grow a free chunk of max allowed size */ 1070 int fexp = udb_exp_offset(alloc->disk->nextgrow); 1071 uint64_t fsz = (uint64_t)1<<fexp; 1072 udb_void f = alloc->disk->nextgrow; 1073 udb_void fn = alloc->disk->nextgrow+fsz; 1074 assert(fn <= alloc->udb->base_size); 1075 alloc->disk->stat_free += fsz; 1076 udb_alloc_push_fl(base, alloc, f, fexp); 1077 /* now increase nextgrow to commit that free chunk */ 1078 alloc->disk->nextgrow = fn; 1079 } 1080 } 1081 1082 /** append chunks at end of memory space to get size exp, return dataptr */ 1083 static udb_void 1084 grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp) 1085 { 1086 uint64_t esz = (uint64_t)1<<exp; 1087 udb_void ret; 1088 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; 1089 grow_align(base, alloc, esz); 1090 /* free chunks are grown, grow the one we want to use */ 1091 ret = alloc->disk->nextgrow; 1092 /* take a new alloced chunk into use */ 1093 UDB_CHUNK(ret)->exp = (uint8_t)exp; 1094 UDB_CHUNK(ret)->flags = 0; 1095 UDB_CHUNK(ret)->ptrlist = 0; 1096 UDB_CHUNK(ret)->type = udb_chunk_type_data; 1097 /* store last octet */ 1098 chunk_set_last(base, ret, exp, (uint8_t)exp); 1099 /* update stats */ 1100 alloc->disk->stat_alloc += esz; 1101 alloc->disk->stat_data += sz; 1102 /* now increase nextgrow to commit this newly allocated chunk */ 1103 alloc->disk->nextgrow += esz; 1104 assert(alloc->disk->nextgrow <= alloc->udb->base_size); 1105 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1106 return ret + sizeof(udb_chunk_d); /* ptr to data */ 1107 } 1108 1109 /** calculate how much space is necessary to grow for this exp */ 1110 static uint64_t 1111 grow_end_calc(udb_alloc* alloc, int exp) 1112 { 1113 uint64_t sz = (uint64_t)1<<exp; 1114 uint64_t ng = alloc->disk->nextgrow; 1115 uint64_t res; 1116 /* if nextgrow is 2**expness, no extra growth needed, only size */ 1117 if( (ng & (sz-1)) == 0) { 1118 /* sz-1 is like 0xfff, and checks if ng is whole 2**exp */ 1119 return ng+sz; /* must grow exactly 2**exp */ 1120 } 1121 /* grow until 2**expness and then we need 2**exp as well */ 1122 /* so, round ng down to whole sz (basically ng-ng%sz, or ng/sz*sz) 1123 * and then add the sz twice (go up to whole sz, and to allocate) */ 1124 res = (ng & ~(sz-1)) + 2*sz; 1125 return res; 1126 } 1127 1128 /** see if we need to grow more than specified to enable sustained growth */ 1129 static uint64_t 1130 grow_extra_check(udb_alloc* alloc, uint64_t ge) 1131 { 1132 const uint64_t mb = 1024*1024; 1133 uint64_t bsz = alloc->udb->base_size; 1134 if(bsz <= mb) { 1135 /* below 1 Mb, double sizes for exponential growth */ 1136 /* takes about 15 times to grow to 1Mb */ 1137 if(ge < bsz*2) 1138 return bsz*2; 1139 } else { 1140 uint64_t gnow = ge - bsz; 1141 /* above 1Mb, grow at least 1 Mb, or 12.5% of current size, 1142 * in whole megabytes rounded up. 
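* for example, at a base size of 10 Mb: 10/8 = 1.25 Mb, rounded up to whole megabytes gives want = 2 Mb, so the file grows to at least 12 Mb.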
*/ 1143 uint64_t want = ((bsz / 8) & ~(mb-1)) + mb; 1144 if(gnow < want) 1145 return bsz + want; 1146 } 1147 return ge; 1148 } 1149 1150 /** see if free space is enough to warrant shrink (while file is open) */ 1151 static int 1152 enough_free(udb_alloc* alloc) 1153 { 1154 if(alloc->udb->base_size <= 2*1024*1024) { 1155 /* below 1 Mb, grown by double size, (so up to 2 mb), 1156 * do not shrink unless we can go to 1/3 of the size */ 1157 if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size) 1158 return 1; 1159 } else { 1160 /* grown 12.5%, shrink 25% if possible, at least one mb */ 1161 /* between 1mb and 4mb size, it shrinks by 1mb if possible */ 1162 uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow; 1163 if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size 1164 || alloc->udb->base_size < 4*1024*1024)) 1165 return 1; 1166 } 1167 return 0; 1168 } 1169 1170 /** grow space for a chunk of 2**exp and return dataptr */ 1171 static udb_void 1172 udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp) 1173 { 1174 /* commit the grow action 1175 * - the file grow only changes filesize, but not the nextgrow. 1176 * - taking space after nextgrow into use (as free space), 1177 * is like free-ing a chunk (one at a time). 1178 * - and the last chunk taken into use is like alloc. 1179 */ 1180 /* predict how much free space is needed for this */ 1181 uint64_t grow_end = grow_end_calc(alloc, exp); 1182 assert(alloc->udb->base_size >= alloc->disk->nextgrow); 1183 if(grow_end <= alloc->udb->base_size) { 1184 /* we can do this with the available space */ 1185 return grow_chunks(base, alloc, sz, exp); 1186 } 1187 /* we have to grow the file, re-mmap */ 1188 /* see if we need to grow a little more, to avoid endless grow 1189 * efforts on adding data */ 1190 grow_end = grow_extra_check(alloc, grow_end); 1191 if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) { 1192 return 0; /* mmap or write failed (disk or mem full) */ 1193 } 1194 /* we have enough space now */ 1195 assert(grow_end <= alloc->udb->base_size); 1196 assert(alloc->udb->glob_data->fsize == alloc->udb->base_size); 1197 return grow_chunks(base, alloc, sz, exp); 1198 } 1199 1200 /** take XL allocation into use at end of file, return dataptr */ 1201 static udb_void 1202 grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz) 1203 { 1204 udb_void ret; 1205 udb_xl_chunk_d* p; 1206 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; 1207 1208 /* align growth to whole mbs */ 1209 grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE); 1210 1211 /* grow XL segment */ 1212 ret = alloc->disk->nextgrow; 1213 p = UDB_XL_CHUNK(ret); 1214 p->exp = UDB_EXP_XL; 1215 p->size = xlsz; 1216 p->flags = 0; 1217 p->ptrlist = 0; 1218 p->type = udb_chunk_type_data; 1219 1220 /* also put size and marker at end for compaction */ 1221 *((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz; 1222 *((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL; 1223 1224 /* stats */ 1225 alloc->disk->stat_data += sz; 1226 alloc->disk->stat_alloc += xlsz; 1227 /* now increase the nextgrow to commit this xl chunk */ 1228 alloc->disk->nextgrow += xlsz; 1229 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1230 return ret + sizeof(udb_xl_chunk_d); /* data ptr */ 1231 } 1232 1233 /** make space for XL allocation */ 1234 static udb_void 1235 udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz) 1236 { 1237 /* allocate whole mbs of space, at end of space */ 1238 uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2; 1239 
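/* round the needed size up to a whole number of UDB_ALLOC_CHUNK_SIZE (1 Mb) pieces */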
uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1)); 1240 uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need; 1241 assert(need >= asz); 1242 if(grow_end <= alloc->udb->base_size) { 1243 /* can do this in available space */ 1244 return grow_xl(base, alloc, need, sz); 1245 } 1246 /* have to grow file and re-mmap */ 1247 grow_end = grow_extra_check(alloc, grow_end); 1248 if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) { 1249 return 0; /* mmap or write failed (disk or mem full) */ 1250 } 1251 /* we have enough space now */ 1252 assert(grow_end <= alloc->udb->base_size); 1253 assert(alloc->udb->glob_data->fsize == alloc->udb->base_size); 1254 return grow_xl(base, alloc, need, sz); 1255 } 1256 1257 /** divide big(2**e2) into pieces so 2**exp fits */ 1258 static udb_void 1259 udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2, 1260 int exp) 1261 { 1262 int e = e2; 1263 uint64_t sz = (uint64_t)1<<e2; 1264 assert(big && e2 > exp); 1265 /* so the returned piece to use is the first piece, 1266 * offload the later half until it fits */ 1267 do { 1268 sz >>= 1; /* divide size of big by two */ 1269 e--; /* that means its exp is one smaller */ 1270 udb_alloc_push_fl(base, alloc, big+sz, e); 1271 } while(e != exp); 1272 /* exit loop when last pushed is same size as what we want */ 1273 return big; 1274 } 1275 1276 /** returns the exponent size of the chunk needed for data sz */ 1277 static int 1278 udb_alloc_exp_needed(size_t sz) 1279 { 1280 uint64_t asz = sz + sizeof(udb_chunk_d) + 1; 1281 if(asz > UDB_ALLOC_CHUNK_SIZE) { 1282 return UDB_EXP_XL; 1283 } else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) { 1284 return UDB_ALLOC_CHUNK_MINEXP; 1285 } 1286 return udb_exp_size(asz); 1287 } 1288 1289 udb_void udb_alloc_space(udb_alloc* alloc, size_t sz) 1290 { 1291 void* base = alloc->udb->base; 1292 /* calculate actual allocation size */ 1293 int e2, exp = udb_alloc_exp_needed(sz); 1294 if(exp == UDB_EXP_XL) 1295 return udb_alloc_xl_space(base, alloc, sz); 1296 /* see if there is a free chunk of that size exactly */ 1297 if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) { 1298 /* snip from freelist, udb_chunk_d */ 1299 udb_void ret; 1300 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; 1301 ret = udb_alloc_pop_fl(base, alloc, exp); 1302 /* use it - size octets already OK */ 1303 UDB_CHUNK(ret)->flags = 0; 1304 UDB_CHUNK(ret)->ptrlist = 0; 1305 UDB_CHUNK(ret)->type = udb_chunk_type_data; 1306 /* update stats */ 1307 alloc->disk->stat_data += sz; 1308 alloc->disk->stat_alloc += (1<<exp); 1309 assert(alloc->disk->stat_free >= (1u<<exp)); 1310 alloc->disk->stat_free -= (1<<exp); 1311 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1312 return ret + sizeof(udb_chunk_d); /* ptr to data */ 1313 } 1314 /* see if we can subdivide a larger chunk */ 1315 for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++) 1316 if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) { 1317 udb_void big, ret; /* udb_chunk_d */ 1318 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; 1319 big = udb_alloc_pop_fl(base, alloc, e2); 1320 /* push other parts onto freelists (needs inited) */ 1321 ret = udb_alloc_subdivide(base, alloc, big, e2, exp); 1322 /* use final part (needs inited) */ 1323 UDB_CHUNK(ret)->exp = (uint8_t)exp; 1324 /* if stop here; the new exp makes smaller free chunk*/ 1325 UDB_CHUNK(ret)->flags = 0; 1326 UDB_CHUNK(ret)->ptrlist = 0; 1327 /* set type to commit data chunk */ 1328 UDB_CHUNK(ret)->type = udb_chunk_type_data; 1329 /* store last octet */ 1330 
chunk_set_last(base, ret, exp, (uint8_t)exp); 1331 /* update stats */ 1332 alloc->disk->stat_data += sz; 1333 alloc->disk->stat_alloc += (1<<exp); 1334 assert(alloc->disk->stat_free >= (1u<<exp)); 1335 alloc->disk->stat_free -= (1<<exp); 1336 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1337 return ret + sizeof(udb_chunk_d); /* ptr to data */ 1338 } 1339 /* we need to grow an extra chunk */ 1340 return udb_alloc_grow_space(base, alloc, sz, exp); 1341 } 1342 1343 /** see if there is free space to allocate a chunk into */ 1344 static int 1345 have_free_for(udb_alloc* alloc, int exp) 1346 { 1347 int e2; 1348 if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) 1349 return exp; 1350 for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++) 1351 if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) { 1352 return e2; 1353 } 1354 return 0; 1355 } 1356 1357 /** fix relptr prev and next for moved relptr structures */ 1358 static void 1359 chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg) 1360 { 1361 udb_void* data = (udb_void*)arg; 1362 udb_void r; 1363 if(!rp->data) 1364 return; 1365 r = UDB_SYSTOREL(base, rp); 1366 if(rp->next) 1367 UDB_REL_PTR(rp->next)->prev = r; 1368 if(rp->prev) 1369 UDB_REL_PTR(rp->prev)->next = r; 1370 else { 1371 /* if this is a pointer to its own chunk, fix it up; 1372 * the data ptr gets set by relptr_edit later. */ 1373 if(rp->data == data[0]) 1374 UDB_CHUNK(data[1])->ptrlist = r; 1375 else UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r; 1376 } 1377 } 1378 1379 /** fix pointers from and to a moved chunk */ 1380 static void 1381 chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data, 1382 uint64_t dsz, udb_void olddata) 1383 { 1384 udb_void d[2]; 1385 d[0] = olddata; 1386 d[1] = data; 1387 (*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data), 1388 dsz, &chunk_fix_ptr_each, d); 1389 udb_rel_ptr_edit(base, cp->ptrlist, data); 1390 udb_base_ram_ptr_edit(udb, olddata, data); 1391 } 1392 1393 /** move an allocated chunk to use a free chunk */ 1394 static void 1395 move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz, 1396 int e2) 1397 { 1398 udb_void res = udb_alloc_pop_fl(base, alloc, e2); 1399 udb_chunk_d* rp; 1400 udb_chunk_d* fp; 1401 if(exp != e2) { 1402 /* it is bigger, subdivide it */ 1403 res = udb_alloc_subdivide(base, alloc, res, e2, exp); 1404 } 1405 assert(res != f); 1406 /* setup rollback information */ 1407 alloc->udb->glob_data->rb_old = f; 1408 alloc->udb->glob_data->rb_new = res; 1409 alloc->udb->glob_data->rb_size = esz; 1410 /* take the res, exp into use */ 1411 rp = UDB_CHUNK(res); 1412 fp = UDB_CHUNK(f); 1413 /* copy over the data */ 1414 memcpy(rp, fp, esz); 1415 /* adjust rel ptrs */ 1416 chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d), 1417 esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d)); 1418 1419 /* do not freeup the fp; caller does that */ 1420 } 1421 1422 /** unlink several free elements to overwrite with xl chunk */ 1423 static void 1424 free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m) 1425 { 1426 udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE; 1427 /* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/ 1428 assert(s >= UDB_ALLOC_CHUNK_SIZE); 1429 while(q >= s) { 1430 assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX); 1431 assert(UDB_CHUNK(q)->type == udb_chunk_type_free); 1432 udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX); 1433 q -= UDB_ALLOC_CHUNK_SIZE; 1434 } 1435 } 1436 1437 /** move an XL chunk, and keep track of segments for 
rollback */ 1438 static void 1439 move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n, 1440 uint64_t sz, uint64_t startseg) 1441 { 1442 udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); 1443 udb_xl_chunk_d* np = UDB_XL_CHUNK(n); 1444 uint64_t amount = xl - n; 1445 assert(n < xl); /* move to compact */ 1446 1447 /* setup move rollback */ 1448 udb->glob_data->rb_old = xl; 1449 udb->glob_data->rb_new = n; 1450 udb->glob_data->rb_size = sz; 1451 1452 /* is it overlapping? */ 1453 if(sz <= amount) { 1454 memcpy(np, xlp, sz); 1455 } else { 1456 /* move and commit per 1M segment to avoid data loss */ 1457 uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE; 1458 for(seg = startseg; seg<maxseg; seg++) { 1459 udb->glob_data->rb_seg = seg; 1460 memcpy(np+seg*UDB_ALLOC_CHUNK_SIZE, 1461 xlp+seg*UDB_ALLOC_CHUNK_SIZE, 1462 UDB_ALLOC_CHUNK_SIZE); 1463 } 1464 1465 } 1466 } 1467 1468 /** move list of XL chunks to the front by the shift amount */ 1469 static void 1470 move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz, 1471 uint64_t amount) 1472 { 1473 udb_void xl = xl_start; 1474 assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */ 1475 assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */ 1476 assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */ 1477 while(xl < xl_start+xl_sz) { 1478 udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); 1479 udb_void n = xl-amount; 1480 uint64_t sz = xlp->size; 1481 assert(xlp->exp == UDB_EXP_XL); 1482 move_xl_segment(base, alloc->udb, xl, n, sz, 0); 1483 chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n), 1484 n+sizeof(udb_xl_chunk_d), 1485 sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2, 1486 xl+sizeof(udb_xl_chunk_d)); 1487 } 1488 alloc->disk->stat_free -= amount; 1489 alloc->disk->nextgrow -= amount; 1490 alloc->udb->glob_data->rb_old = 0; 1491 alloc->udb->glob_data->rb_new = 0; 1492 alloc->udb->glob_data->rb_size = 0; 1493 } 1494 1495 /** see if free chunk can coagulate with another chunk, return other chunk */ 1496 static udb_void 1497 coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp, 1498 uint64_t esz) 1499 { 1500 udb_void other = f^esz; 1501 if(exp == UDB_ALLOC_CHUNKS_MAX) 1502 return 0; /* no further merges */ 1503 if(other >= alloc->udb->base_size) 1504 return 0; /* not allocated */ 1505 if(other >= alloc->disk->nextgrow) 1506 return 0; /* not in use */ 1507 if(other < alloc->udb->glob_data->hsize) 1508 return 0; /* cannot merge with header */ 1509 /* the header is also protected by the special exp marker */ 1510 /* see if the other chunk is a free chunk */ 1511 1512 /* check closest marker to avoid large memory churn */ 1513 /* and also it makes XL allocations and header special markers work */ 1514 if(f > other) { 1515 assert(f > 1); /* this is certain because of header */ 1516 if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) { 1517 /* can do it if the other part is a free chunk */ 1518 assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp); 1519 if(UDB_CHUNK(other)->type == udb_chunk_type_free) 1520 return other; 1521 } 1522 } else { 1523 if(UDB_CHUNK(other)->exp == (uint8_t)exp) { 1524 /* can do it if the other part is a free chunk */ 1525 assert(chunk_get_last(base, other, exp)==(uint8_t)exp); 1526 if(UDB_CHUNK(other)->type == udb_chunk_type_free) 1527 return other; 1528 } 1529 } 1530 return 0; 1531 } 1532 1533 /** coagulate and then add new free segment, return final free segment */ 1534 static udb_void 1535 coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp, 1536 uint64_t 
esz) 1537 { 1538 /* new free chunk here, attempt coagulate */ 1539 udb_void other; 1540 while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) { 1541 /* unlink that other chunk */ 1542 udb_alloc_unlink_fl(base, alloc, other, exp); 1543 /* merge up */ 1544 if(other < last) 1545 last = other; 1546 exp++; 1547 esz <<= 1; 1548 } 1549 /* free the final segment */ 1550 udb_alloc_push_fl(base, alloc, last, exp); 1551 return last; 1552 } 1553 1554 /** attempt to compact the data and move free space to the end */ 1555 int 1556 udb_alloc_compact(void* base, udb_alloc* alloc) 1557 { 1558 udb_void last; 1559 int exp, e2; 1560 uint64_t esz; 1561 uint64_t at = alloc->disk->nextgrow; 1562 udb_void xl_start = 0; 1563 uint64_t xl_sz = 0; 1564 if(alloc->udb->inhibit_compact) 1565 return 1; 1566 alloc->udb->useful_compact = 0; 1567 while(at > alloc->udb->glob_data->hsize) { 1568 /* grab last entry */ 1569 exp = (int)*((uint8_t*)UDB_REL(base, at-1)); 1570 if(exp == UDB_EXP_XL) { 1571 /* for XL chunks: 1572 * - inspect the size of the XLchunklist at end 1573 * - attempt to compact in front of the XLchunklist 1574 */ 1575 uint64_t xlsz = *((uint64_t*)UDB_REL(base, 1576 at-sizeof(uint64_t)*2)); 1577 udb_void xl = at-xlsz; 1578 #ifndef NDEBUG 1579 udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); 1580 assert(xlp->exp == UDB_EXP_XL); 1581 assert(xlp->type != udb_chunk_type_free); 1582 #endif 1583 /* got the segment, add to the xl chunk list */ 1584 if(xl_start != 0 && xl+xlsz != xl_start) { 1585 /* nonadjoining XL part, but they are aligned, 1586 * so the space in between is whole Mbs, 1587 * shift the later part(s) and continue */ 1588 uint64_t m = xl_start - (xl+xlsz); 1589 assert(xl_start > xl+xlsz); 1590 alloc->udb->glob_data->dirty_alloc = udb_dirty_compact; 1591 free_xl_space(base, alloc, xl+xlsz, m); 1592 move_xl_list(base, alloc, xl_start, xl_sz, m); 1593 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1594 } 1595 xl_start = xl; 1596 xl_sz += xlsz; 1597 at = xl; 1598 continue; 1599 /* end of XL if */ 1600 } else if(exp < UDB_ALLOC_CHUNK_MINEXP 1601 || exp > UDB_ALLOC_CHUNKS_MAX) 1602 break; /* special chunk or garbage */ 1603 esz = (uint64_t)1<<exp; 1604 last = at - esz; 1605 assert(UDB_CHUNK(last)->exp == (uint8_t)exp); 1606 if(UDB_CHUNK(last)->type == udb_chunk_type_free) { 1607 /* if xlstart continue looking to move stuff, but do 1608 * not unlink this free segment */ 1609 if(!xl_start) { 1610 /* it is a free chunk, remove it */ 1611 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; 1612 udb_alloc_unlink_fl(base, alloc, last, exp); 1613 alloc->disk->stat_free -= esz; 1614 alloc->disk->nextgrow = last; 1615 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1616 /* and continue at this point */ 1617 } 1618 at = last; 1619 } else if( (e2=have_free_for(alloc, exp)) ) { 1620 /* last entry can be allocated in free chunks 1621 * move it to its new position, adjust rel_ptrs */ 1622 alloc->udb->glob_data->dirty_alloc = udb_dirty_compact; 1623 move_chunk(base, alloc, last, exp, esz, e2); 1624 if(xl_start) { 1625 last = coagulate_and_push(base, alloc, 1626 last, exp, esz); 1627 } else { 1628 /* shorten usage */ 1629 alloc->disk->stat_free -= esz; 1630 alloc->disk->nextgrow = last; 1631 } 1632 alloc->udb->glob_data->rb_old = 0; 1633 alloc->udb->glob_data->rb_new = 0; 1634 alloc->udb->glob_data->rb_size = 0; 1635 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1636 /* and continue in front of it */ 1637 at = last; 1638 } else { 1639 /* cannot compact this block, stop compacting */ 1640 break; 1641 } 
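/* at now points in front of the chunk that was just freed or moved; the next pass reads the exp marker byte just before it */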
1642 /* if that worked, repeat it */ 1643 } 1644 /* if we passed xl chunks, see if XL-chunklist can move */ 1645 if(xl_start) { 1646 /* calculate free space in front of the XLchunklist. */ 1647 /* has to be whole mbs of free space */ 1648 /* if so, we can move the XL chunks. Move them all back 1649 * by the new free space. */ 1650 /* this compacts very well, but the XL chunks can be moved 1651 * multiple times; worst case for every mb freed a huge sized 1652 * xlchunklist gets moved. */ 1653 /* free space must be, since aligned and coagulated, in 1654 * chunks of a whole MB */ 1655 udb_void at = xl_start; 1656 uint64_t m = 0; 1657 while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){ 1658 udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE; 1659 if(UDB_CHUNK(chunk)->type != udb_chunk_type_free) 1660 break; 1661 assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX); 1662 m += UDB_ALLOC_CHUNK_SIZE; 1663 at = chunk; 1664 } 1665 if(m != 0) { 1666 assert(at+m == xl_start); 1667 alloc->udb->glob_data->dirty_alloc = udb_dirty_compact; 1668 free_xl_space(base, alloc, at, m); 1669 move_xl_list(base, alloc, xl_start, xl_sz, m); 1670 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1671 } 1672 } 1673 1674 /* if enough free, shrink the file; re-mmap */ 1675 if(enough_free(alloc)) { 1676 uint64_t nsize = alloc->disk->nextgrow; 1677 udb_base_shrink(alloc->udb, nsize); 1678 if(!udb_base_remap(alloc->udb, alloc, nsize)) 1679 return 0; 1680 } 1681 return 1; 1682 } 1683 1684 int 1685 udb_compact(udb_base* udb) 1686 { 1687 if(!udb) return 1; 1688 if(!udb->useful_compact) return 1; 1689 DEBUG(DEBUG_DBACCESS, 1, (LOG_INFO, "Compacting database...")); 1690 return udb_alloc_compact(udb->base, udb->alloc); 1691 } 1692 1693 void udb_compact_inhibited(udb_base* udb, int inhibit) 1694 { 1695 if(!udb) return; 1696 udb->inhibit_compact = inhibit; 1697 } 1698 1699 #ifdef UDB_CHECK 1700 /** check that rptrs are really zero before free */ 1701 void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg) 1702 { 1703 (void)base; 1704 (void)arg; 1705 assert(p->data == 0); 1706 } 1707 #endif /* UDB_CHECK */ 1708 1709 /** free XL chunk as multiples of CHUNK_SIZE free segments */ 1710 static void 1711 udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp, 1712 size_t sz) 1713 { 1714 uint64_t xlsz = fp->size; 1715 uint64_t c; 1716 /* lightweight check for buffer overflow in xl data */ 1717 assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz); 1718 assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL); 1719 assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */ 1720 assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */ 1721 #ifdef UDB_CHECK 1722 /* check that relptrs in this chunk have been zeroed */ 1723 (*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type, 1724 UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz, 1725 &udb_check_rptr_zero, NULL); 1726 #endif 1727 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; 1728 /* update stats */ 1729 alloc->disk->stat_data -= sz; 1730 alloc->disk->stat_alloc -= xlsz; 1731 alloc->disk->stat_free += xlsz; 1732 /* walk in reverse, so the front blocks go first on the list */ 1733 c = f + xlsz - UDB_ALLOC_CHUNK_SIZE; 1734 /* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE*/ 1735 assert(f >= UDB_ALLOC_CHUNK_SIZE); 1736 while(c >= f) { 1737 /* free a block of CHUNK_SIZE (1 Mb) */ 1738 udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX); 1739 c -= UDB_ALLOC_CHUNK_SIZE; 1740 } 1741 
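/* all 1 Mb pieces of the former XL chunk are on the freelist; mark the allocation state clean again */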
alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1742 } 1743 1744 int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz) 1745 { 1746 void* base; 1747 /* lookup chunk ptr */ 1748 udb_void f; 1749 udb_chunk_d* fp; 1750 uint64_t esz; 1751 int exp; 1752 udb_void other; 1753 int coagulated = 0; 1754 if(!r) 1755 return 1; /* free(NULL) does nothing */ 1756 1757 /* lookup size of chunk */ 1758 base = alloc->udb->base; 1759 /* fails for XL blocks */ 1760 f = chunk_from_dataptr(r); 1761 fp = UDB_CHUNK(f); 1762 assert(fp->type != udb_chunk_type_free); 1763 1764 /* see if it has a ptrlist, if so: trouble, the list is not properly 1765 * cleaned up. (although you can imagine a wholesale delete where 1766 * it does not matter) */ 1767 assert(fp->ptrlist == 0); 1768 1769 /* set ptrlist to 0 to stop relptr from using it, robustness. */ 1770 fp->ptrlist = 0; 1771 1772 if(fp->exp == UDB_EXP_XL) { 1773 udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz); 1774 /* compact */ 1775 if(alloc->udb->inhibit_compact) { 1776 alloc->udb->useful_compact = 1; 1777 return 1; 1778 } 1779 return udb_alloc_compact(base, alloc); 1780 } 1781 /* it is a regular chunk of 2**exp size */ 1782 exp = (int)fp->exp; 1783 esz = (uint64_t)1<<exp; 1784 /* light check for e.g. buffer overflow of the data */ 1785 assert(sz < esz); 1786 assert(chunk_get_last(base, f, exp) == (uint8_t)exp); 1787 #ifdef UDB_CHECK 1788 /* check that relptrs in this chunk have been zeroed */ 1789 (*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type, 1790 UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL); 1791 #endif 1792 1793 /* update the stats */ 1794 alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; 1795 alloc->disk->stat_data -= sz; 1796 alloc->disk->stat_free += esz; 1797 alloc->disk->stat_alloc -= esz; 1798 1799 /* if it can be merged with other free chunks, do so */ 1800 while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) { 1801 coagulated = 1; 1802 /* unlink that other chunk and expand it (it has same size) */ 1803 udb_alloc_unlink_fl(base, alloc, other, exp); 1804 /* merge up */ 1805 if(other < f) 1806 f = other; 1807 exp++; 1808 esz <<= 1; 1809 } 1810 if(coagulated) { 1811 /* put big free chunk into freelist, and init it */ 1812 udb_alloc_push_fl(base, alloc, f, exp); 1813 } else { 1814 /* we do not need to touch the last-exp-byte, which may save 1815 * a reference to that page of memory */ 1816 fp->type = udb_chunk_type_free; 1817 fp->flags = 0; 1818 udb_alloc_push_fl_noinit(base, alloc, f, exp); 1819 } 1820 alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; 1821 /* compact */ 1822 if(alloc->udb->inhibit_compact) { 1823 alloc->udb->useful_compact = 1; 1824 return 1; 1825 } 1826 return udb_alloc_compact(base, alloc); 1827 } 1828 1829 udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz) 1830 { 1831 /* could be faster maybe, if grown? 
udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
{
	/* could be faster maybe, if grown? */
	udb_void r = udb_alloc_space(alloc, sz);
	if(!r) return r;
	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
	return r;
}

udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
{
	void* base = alloc->udb->base;
	udb_void c, n, newd;
	udb_chunk_d* cp, *np;
	uint64_t avail;
	uint8_t cp_type;
	/* emulate some posix realloc stuff */
	if(r == 0)
		return udb_alloc_space(alloc, sz);
	if(sz == 0) {
		if(!udb_alloc_free(alloc, r, osz))
			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
		return 0;
	}
	c = chunk_from_dataptr(r);
	cp = UDB_CHUNK(c);
	cp_type = cp->type;
	if(cp->exp == UDB_EXP_XL) {
		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
			- sizeof(uint64_t)*2;
	} else {
		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
	}
	if(sz <= avail)
		return r;
	/* reallocate it, and copy */
	newd = udb_alloc_space(alloc, sz);
	if(!newd) return 0;
	/* re-base after alloc, since re-mmap may have happened */
	base = alloc->udb->base;
	cp = NULL; /* may be invalid now, robustness */
	n = chunk_from_dataptr(newd);
	np = UDB_CHUNK(n);
	np->type = cp_type;
	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
	/* fixup ptrs */
	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);

	if(!udb_alloc_free(alloc, r, osz))
		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
	return newd;
}

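/*
 * Usage sketch (editorial; "old", "osz" and "nsz" are hypothetical): the
 * function follows the posix realloc conventions emulated above -- r == 0
 * allocates, sz == 0 frees -- and the caller must supply the old size,
 * because a regular chunk only records its rounded 2**exp size.
 *
 *	udb_void d = udb_alloc_realloc(udb->alloc, old, osz, nsz);
 *	if(!d && nsz != 0) {
 *		... allocation failed; "old" still holds osz valid bytes ...
 *	} else {
 *		... d may differ from old; relptrs that pointed into the
 *		    old chunk have been rewritten by chunk_fix_ptrs ...
 *	}
 */
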
int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
{
	const uint64_t mb = 1024*1024;
	int exp = udb_alloc_exp_needed(sz);
	uint64_t esz;
	uint64_t want;
	if(exp == UDB_EXP_XL)
		esz = (sz&(mb-1))+mb;
	else	esz = (uint64_t)1<<exp;
	/* we need grow_end_calc to take into account alignment */
	want = grow_end_calc(alloc, exp) + esz*(num-1);
	assert(want >= alloc->udb->base_size);
	if(!udb_base_grow_and_remap(alloc->udb, want)) {
		log_msg(LOG_ERR, "failed to grow the specified amount");
		return 0;
	}
	return 1;
}

void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
{
	void* base = alloc->udb->base;
	udb_void f = chunk_from_dataptr(r);
	udb_chunk_d* fp = UDB_CHUNK(f);
	/* not the 'free' type, that must be set by allocation routines */
	assert(fp->type != udb_chunk_type_free);
	assert(tp != udb_chunk_type_free);
	fp->type = tp;
}

int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
{
	/* pointers are not valid before the header-size or after the
	 * used-region of the mmap */
	return ( (to+destsize) <= udb->base_size &&
		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
		(to+destsize) <= udb->alloc->disk->nextgrow);
}

int udb_valid_dataptr(udb_base* udb, udb_void to)
{
	void* base = udb->base;
	udb_void ch;
	int exp;
	uint64_t esz;
	/* our data chunks are aligned and at least 8 bytes */
	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
		return 0;
	/* get the chunk pointer */
	ch = chunk_from_dataptr(to);
	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
		return 0;
	/* check its size */
	exp = UDB_CHUNK(ch)->exp;
	if(exp == UDB_EXP_XL) {
		/* check XL chunk */
		uint64_t xlsz;
		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
			return 0;
		xlsz = UDB_XL_CHUNK(ch)->size;
		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
			return 0;
		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
			return 0;
		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
			!= xlsz)
			return 0;
		return 1;
	}
	/* check if regular chunk has matching end byte */
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
		return 0; /* cannot be a valid chunk */
	esz = 1<<exp;
	if(!udb_valid_offset(udb, ch+esz-1, 1))
		return 0;
	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
		return 0;
	return 1;
}

int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
{
	void* base = udb->base;
	udb_void p;
	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
		return 0;
	if(!udb_valid_dataptr(udb, to))
		return 0;
	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
	while(p) {
		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
			return 0;
		if(p == rptr)
			return 1;
		p = UDB_REL_PTR(p)->next;
	}
	return 0;
}

void udb_rel_ptr_init(udb_rel_ptr* ptr)
{
	memset(ptr, 0, sizeof(*ptr));
}

void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
{
	if(!ptr->data)
		return;
	if(ptr->prev) {
		UDB_REL_PTR(ptr->prev)->next = ptr->next;
	} else {
		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
	}
	if(ptr->next) {
		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
	}
}

void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
{
	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
	ptr->prev = 0;
	ptr->next = chunk->ptrlist;
	if(ptr->next)
		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
	ptr->data = to;
}

void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
{
	assert(to == 0 || to > 64);
	udb_rel_ptr_unlink(base, ptr);
	if(to)
		udb_rel_ptr_link(base, ptr, to);
	else	ptr->data = to;
}

void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
{
	udb_void p = list;
	while(p) {
		UDB_REL_PTR(p)->data = to;
		p = UDB_REL_PTR(p)->next;
	}
}

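/*
 * Editorial sketch of the relptr bookkeeping above ("some_offset" and
 * "dest" are hypothetical): every chunk keeps a doubly linked ptrlist of
 * all udb_rel_ptr structures that point into it, so those pointers can be
 * rewritten when the chunk moves (udb_rel_ptr_edit, chunk_fix_ptrs).
 *
 *	udb_rel_ptr* p = (udb_rel_ptr*)UDB_REL(base, some_offset);
 *	udb_rel_ptr_init(p);
 *	udb_rel_ptr_set(base, p, dest);
 *	...
 *	udb_rel_ptr_set(base, p, 0);
 *
 * udb_rel_ptr_set unlinks the old target and links to the new one; the
 * pointer must be set back to 0 before the chunk that holds it, or the
 * chunk it points to, is freed (see the checks in udb_alloc_free).
 */
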
#ifdef UDB_CHECK
/** check that all pointers are validly chained */
static void
udb_check_ptrs_valid(udb_base* udb)
{
	size_t i;
	udb_ptr* p, *prev;
	for(i=0; i<udb->ram_size; i++) {
		prev = NULL;
		for(p=udb->ram_hash[i]; p; p=p->next) {
			assert(p->prev == prev);
			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
				== i);
			assert(p->base == &udb->base);
			prev = p;
		}
	}
}
#endif /* UDB_CHECK */

void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
{
#ifdef UDB_CHECK
	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
#endif
	memset(ptr, 0, sizeof(*ptr));
	ptr->base = &udb->base;
}

void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
{
	assert(newval == 0 || newval > 64);
	if(ptr->data)
		udb_base_unlink_ptr(udb, ptr);
	ptr->data = newval;
	if(newval)
		udb_base_link_ptr(udb, ptr);
}

int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
	size_t sz)
{
	udb_void r;
	r = udb_alloc_space(udb->alloc, sz);
	if(!r) return 0;
	udb_alloc_set_type(udb->alloc, r, type);
	udb_ptr_init(ptr, udb);
	udb_ptr_set(ptr, udb, r);
	return 1;
}

void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
{
	if(ptr->data) {
		udb_void d = ptr->data;
		udb_ptr_set(ptr, udb, 0);
		udb_alloc_free(udb->alloc, d, sz);
	}
}

udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
{
	udb_void f;
	if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad */
	f = chunk_from_dataptr(ptr->data);
	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
}

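/*
 * Editorial usage sketch for the udb_ptr helpers above (the chunk type
 * constant and "struct mydata" are illustrative assumptions): a udb_ptr is
 * a RAM-side handle linked into udb->ram_hash, hashed on its data offset,
 * so it can be found and updated when the chunk it points at moves.
 *
 *	udb_ptr p;
 *	if(udb_ptr_alloc_space(&p, udb, udb_chunk_type_data,
 *		sizeof(struct mydata))) {
 *		struct mydata* m = (struct mydata*)UDB_REL(*p.base, p.data);
 *		... fill *m ...
 *		udb_ptr_free_space(&p, udb, sizeof(struct mydata));
 *	}
 *
 * A udb_ptr that only references existing data should be released with
 * udb_ptr_set(&p, udb, 0) instead, so the allocation itself stays alive.
 */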