/* udb.c - u(micro) data base.
 * By W.C.A. Wijngaards
 * Copyright 2010, NLnet Labs.
 * BSD, see LICENSE.
 */
#include "config.h"
#include "udb.h"
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include "lookup3.h"
#include "util.h"

/* mmap and friends */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

/* for systems without, portable definition, failed-1 and async is a flag */
#ifndef MAP_FAILED
#define MAP_FAILED ((void*)-1)
#endif
#ifndef MS_SYNC
#define MS_SYNC 0
#endif

/** move and fixup xl segment */
static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
	udb_void n, uint64_t sz, uint64_t startseg);
/** attempt to compact the data and move free space to the end */
static int udb_alloc_compact(void* base, udb_alloc* alloc);

/** convert pointer to the data part to a pointer to the base of the chunk */
static udb_void
chunk_from_dataptr(udb_void data)
{
	/* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and
	 * that xl_chunk_d is aligned on x**1024 boundaries. */
	udb_void xl = data - sizeof(udb_xl_chunk_d);
	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
		return xl;
	return data - sizeof(udb_chunk_d);
}

udb_void chunk_from_dataptr_ext(udb_void data) {
	return chunk_from_dataptr(data);
}

#ifndef NDEBUG
/** read last octet from a chunk */
static uint8_t
chunk_get_last(void* base, udb_void chunk, int exp)
{
	return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1));
}
#endif

/** write last octet of a chunk */
static void
chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
{
	*((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)) = value;
}

/** create udb_base from a file descriptor (must be at start of file) */
udb_base*
udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
	void* arg)
{
	uint64_t m, fsz;
	udb_glob_d g;
	ssize_t r;
	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
	if(!udb) {
		log_msg(LOG_ERR, "out of memory");
		close(fd);
		return NULL;
	}
	udb->fname = strdup(fname);
	if(!udb->fname) {
		log_msg(LOG_ERR, "out of memory");
		free(udb);
		close(fd);
		return NULL;
	}
	udb->walkfunc = walkfunc;
	udb->walkarg = arg;
	udb->fd = fd;
	udb->ram_size = 1024;
	udb->ram_mask = (int)udb->ram_size - 1;
	udb->ram_hash = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)*udb->ram_size);
	if(!udb->ram_hash) {
		free(udb->fname);
		free(udb);
		log_msg(LOG_ERR, "out of memory");
		close(fd);
		return NULL;
	}

	/* read magic */
	if((r=read(fd, &m, sizeof(m))) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		goto fail;
	} else if(r != (ssize_t)sizeof(m)) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	/* TODO : what if bigendian and littleendian file, see magic */
	if(m != UDB_MAGIC) {
		log_msg(LOG_ERR, "%s: wrong type of file", fname);
		goto fail;
	}
	/* read header */
	if((r=read(fd, &g, sizeof(g))) == -1) {
		log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno));
		goto fail;
	} else if(r != (ssize_t)sizeof(g)) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	if(g.version != 0) {
		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
			(int)g.version);
		goto fail;
	}
	if(g.hsize < UDB_HEADER_SIZE) {
		log_msg(LOG_ERR, "%s: header size too small %d", fname,
			(int)g.hsize);
		goto fail;
	}
	if(g.hsize > UDB_HEADER_SIZE) {
		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
			(int)g.hsize);
		goto fail;
	}
	if(g.clean_close != 1) {
		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
			(int)g.clean_close);
		goto fail;
	}
	if(g.dirty_alloc != 0) {
		log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname,
			(int)g.dirty_alloc);
		goto fail;
	}

	/* check file size correctly written, for 4.0.2 nsd.db failure */
	fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END);
	(void)lseek(fd, (off_t)0, SEEK_SET);
	if(g.fsize != fsz) {
		log_msg(LOG_WARNING, "%s: file size %llu but mmap header "
			"has size %llu", fname, (unsigned long long)fsz,
			(unsigned long long)g.fsize);
		goto fail;
	}

	/* mmap it */
	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ {
		log_msg(LOG_WARNING, "%s: file size too large %llu",
			fname, (unsigned long long)g.fsize);
		goto fail;
	}
	udb->base_size = (size_t)g.fsize;
	/* note the size_t casts must be there for portability, on some
	 * systems the layout of memory is otherwise broken. */
	udb->base = mmap(NULL, (size_t)udb->base_size,
		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
		(int)udb->fd, (off_t)0);
	if(udb->base == MAP_FAILED) {
		udb->base = NULL;
		log_msg(LOG_ERR, "mmap(size %u) error: %s",
			(unsigned)udb->base_size, strerror(errno));
	fail:
		close(fd);
		free(udb->fname);
		free(udb->ram_hash);
		free(udb);
		return NULL;
	}

	/* init completion */
	udb->glob_data = (udb_glob_d*)(udb->base+sizeof(uint64_t));
	r = 0;
	/* cannot be dirty because that is goto fail above */
	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
		r = 1;
	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
		(void*)udb->glob_data+sizeof(*udb->glob_data)));
	if(!udb->alloc) {
		log_msg(LOG_ERR, "out of memory");
		udb_base_free(udb);
		return NULL;
	}
	if(r) {
		/* and compact now, or resume compacting */
		udb_alloc_compact(udb, udb->alloc);
		udb_base_sync(udb, 1);
	}
	udb->glob_data->clean_close = 0;

	return udb;
}

udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
	void* arg)
{
	int fd = open(fname, O_RDWR);
	if(fd == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		return NULL;
	}
	return udb_base_create_fd(fname, fd, walkfunc, arg);
}

/** init new udb_global structure */
static void udb_glob_init_new(udb_glob_d* g)
{
	memset(g, 0, sizeof(*g));
	g->hsize = UDB_HEADER_SIZE;
	g->fsize = UDB_HEADER_SIZE;
}

/** write data to file and check result */
static int
write_fdata(const char* fname, int fd, void* data, size_t len)
{
	ssize_t w;
	if((w=write(fd, data, len)) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		close(fd);
		return 0;
	} else if(w != (ssize_t)len) {
		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
		close(fd);
		return 0;
	}
	return 1;
}

udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
	void* arg)
{
	uint64_t m;
	udb_glob_d g;
	udb_alloc_d a;
	uint64_t endsize = UDB_HEADER_SIZE;
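	/* (note: the two words below are written right after the alloc
	 * header; they appear to mirror the size and exp-marker words that
	 * end an XL chunk (see grow_xl), so the header region ends like a
	 * regular allocation; the rest of the header up to UDB_HEADER_SIZE
	 * is zero-filled by the ftruncate below) */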
	uint64_t endexp = 0;
	int fd = open(fname, O_CREAT|O_RDWR, 0600);
	if(fd == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		return NULL;
	}
	m = UDB_MAGIC;
	udb_glob_init_new(&g);
	udb_alloc_init_new(&a);
	g.clean_close = 1;

	/* write new data to file (closes fd on error) */
	if(!write_fdata(fname, fd, &m, sizeof(m)))
		return NULL;
	if(!write_fdata(fname, fd, &g, sizeof(g)))
		return NULL;
	if(!write_fdata(fname, fd, &a, sizeof(a)))
		return NULL;
	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
		return NULL;
	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
		return NULL;
	/* rewind to start */
	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
		close(fd);
		return NULL;
	}
	/* truncate to the right size */
	if(ftruncate(fd, (off_t)g.fsize) < 0) {
		log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname,
			(int)g.fsize, strerror(errno));
		close(fd);
		return NULL;
	}
	return udb_base_create_fd(fname, fd, walkfunc, arg);
}

/** shrink the udb base if it has unused space at the end */
static void
udb_base_shrink(udb_base* udb, uint64_t nsize)
{
	udb->glob_data->dirty_alloc = udb_dirty_fsize;
	udb->glob_data->fsize = nsize;
	/* sync, does not *seem* to be required on Linux, but it is
	   certainly required on OpenBSD.  Otherwise changed data is lost. */
	msync(udb->base, udb->base_size, MS_ASYNC);
	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
			(unsigned)nsize, strerror(errno));
	}
	udb->glob_data->dirty_alloc = udb_dirty_clean;
}

void udb_base_close(udb_base* udb)
{
	if(!udb)
		return;
	if(udb->fd != -1 && udb->base && udb->alloc) {
		uint64_t nsize = udb->alloc->disk->nextgrow;
		if(nsize < udb->base_size)
			udb_base_shrink(udb, nsize);
	}
	if(udb->fd != -1) {
		udb->glob_data->clean_close = 1;
		close(udb->fd);
		udb->fd = -1;
	}
	if(udb->base) {
		if(munmap(udb->base, udb->base_size) == -1) {
			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
		}
		udb->base = NULL;
	}
}

void udb_base_free(udb_base* udb)
{
	if(!udb)
		return;
	udb_base_close(udb);
	udb_alloc_delete(udb->alloc);
	free(udb->ram_hash);
	free(udb->fname);
	free(udb);
}

void udb_base_free_keep_mmap(udb_base* udb)
{
	if(!udb) return;
	if(udb->fd != -1) {
		close(udb->fd);
		udb->fd = -1;
	}
	udb->base = NULL;
	udb_alloc_delete(udb->alloc);
	free(udb->ram_hash);
	free(udb->fname);
	free(udb);
}

void udb_base_sync(udb_base* udb, int wait)
{
	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
		log_msg(LOG_ERR, "msync(%s) error %s",
			udb->fname, strerror(errno));
	}
}

/** hash a chunk pointer */
static uint32_t
chunk_hash_ptr(udb_void p)
{
	/* put p into an array of uint32 */
	uint32_t h[sizeof(p)/sizeof(uint32_t)];
	memcpy(&h, &p, sizeof(h));
	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
}

/** check that the given pointer is on the bucket for the given offset */
int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
{
	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
	udb_ptr* p;
	assert((size_t)i < udb->ram_size);
	for(p = udb->ram_hash[i]; p; p=p->next) {
		if(p == ptr)
			return 1;
	}
	return 0;
}

/** grow the ram array */
static void
grow_ram_hash(udb_base* udb, udb_ptr** newhash)
{
	size_t i;
	size_t osize= udb->ram_size;
	udb_ptr* p, *np;
	udb_ptr** oldhash = udb->ram_hash;
	udb->ram_size *= 2;
	udb->ram_mask <<= 1;
	udb->ram_mask |= 1;
	udb->ram_hash = newhash;
	/* have to link in every element in the old list into the new list*/
	for(i=0; i<osize; i++) {
		p = oldhash[i];
		while(p) {
			np = p->next;
			/* link into newhash */
			p->prev=NULL;
			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
			if(p->next) p->next->prev = p;
			/* go to next element of oldhash */
			p = np;
		}
	}
	free(oldhash);
}

void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
{
	uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
	assert((size_t)i < udb->ram_size);
#ifdef UDB_CHECK
	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
#endif
	udb->ram_num++;
	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0xefffffff) {
		/* grow the array, if allocation succeeds */
		udb_ptr** newram = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)*
			udb->ram_size*2);
		if(newram) {
			grow_ram_hash(udb, newram);
		}
	}
	ptr->prev = NULL;
	ptr->next = udb->ram_hash[i];
	udb->ram_hash[i] = ptr;
	if(ptr->next)
		ptr->next->prev = ptr;
}

void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
{
	assert(ptr->data);
#ifdef UDB_CHECK
	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
#endif
	udb->ram_num--;
	if(ptr->next)
		ptr->next->prev = ptr->prev;
	if(ptr->prev)
		ptr->prev->next = ptr->next;
	else {
		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
		assert((size_t)i < udb->ram_size);
		udb->ram_hash[i] = ptr->next;
	}
}

/** change a set of ram ptrs to a new value */
static void
udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
{
	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
	udb_ptr* p, *np;
	/* edit them and move them into the new position */
	p = udb->ram_hash[io];
	while(p) {
		np = p->next;
		if(p->data == old) {
			udb_base_unlink_ptr(udb, p);
			p->data = newd;
			udb_base_link_ptr(udb, p);
		}
		p = np;
	}
}

udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
{
	return &udb->glob_data->user_global;
}

void udb_base_set_userdata(udb_base* udb, udb_void user)
{
#ifdef UDB_CHECK
	if(user) { assert(udb_valid_dataptr(udb, user)); }
#endif
	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
}

void udb_base_set_userflags(udb_base* udb, uint8_t v)
{
	udb->glob_data->userflags = v;
}

uint8_t udb_base_get_userflags(udb_base* udb)
{
	return udb->glob_data->userflags;
}

/** re-mmap the udb to specified size */
static void*
udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
{
	void* nb;
	/* for use with valgrind, do not use mremap, but the other version */
#ifdef MREMAP_MAYMOVE
	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
	if(nb == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		return 0;
	}
#else /* !HAVE MREMAP */
	/* use munmap-mmap to simulate mremap */
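	/* (note: between the munmap and the mmap below there is no mapping
	 * at all; the data then only lives in the file, and if the mmap
	 * fails udb->base is left at NULL) */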
	if(munmap(udb->base, udb->base_size) != 0) {
		log_msg(LOG_ERR, "munmap(%s) error %s",
			udb->fname, strerror(errno));
	}
	/* provide hint for new location */
	/* note the size_t casts must be there for portability, on some
	 * systems the layout of memory is otherwise broken. */
	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
	/* retry the mmap without basept in case of ENOMEM (FreeBSD8),
	 * the kernel can then try to mmap it at a different location
	 * where more memory is available */
	if(nb == MAP_FAILED && errno == ENOMEM) {
		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
	}
	if(nb == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		udb->base = NULL;
		return 0;
	}
#endif /* HAVE MREMAP */
	if(nb != udb->base) {
		/* fix up realpointers in udb and alloc */
		/* but mremap may have been nice and not move the base */
		udb->base = nb;
		udb->glob_data = (udb_glob_d*)(nb+sizeof(uint64_t));
		/* use passed alloc pointer because the udb->alloc may not
		 * be initialized yet */
		alloc->disk = (udb_alloc_d*)((void*)udb->glob_data
			+sizeof(*udb->glob_data));
	}
	udb->base_size = nsize;
	return nb;
}

void
udb_base_remap_process(udb_base* udb)
{
	/* assume that fsize is still accessible */
	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
}

/** grow file to specified size and re-mmap, return new base */
static void*
udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
{
	/* grow file by writing a single zero at that spot, the
	 * rest is filled in with zeroes.
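	 * (writing one byte just before the new end extends the file; the
	 * gap reads back as zeroes, typically stored sparsely, so no large
	 * write is needed)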
	 */
	uint8_t z = 0;
	ssize_t w;

	assert(nsize > 0);
	udb->glob_data->dirty_alloc = udb_dirty_fsize;
#ifdef HAVE_PWRITE
	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
#else
	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
		return 0;
	}
	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
#endif
		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		return 0;
	} else if(w != (ssize_t)sizeof(z)) {
		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
			udb->fname, (unsigned)nsize);
		return 0;
	}
	udb->glob_data->fsize = nsize;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	return udb_base_remap(udb, udb->alloc, nsize);
}

int udb_exp_size(uint64_t a)
{
	/* find enclosing value such that 2**x >= a */
	int x = 0;
	uint64_t i = a;
	assert(a != 0);

	i --;
	/* could optimise this with uint8* access, depends on endianness */
	/* first whole bytes */
	while( (i&(~(uint64_t)0xff)) ) {
		i >>= 8;
		x += 8;
	}
	/* now details */
	while(i) {
		i >>= 1;
		x ++;
	}
	assert( ((uint64_t)1<<x) >= a);
	assert( x==0 || ((uint64_t)1<<(x-1)) < a);
	return x;
}

int udb_exp_offset(uint64_t o)
{
	/* this means measuring the number of 0 bits on the right */
	/* so, if exp zero bits then (o&(2**x-1))==0 */
	int x = 0;
	uint64_t i = o;
	assert(o != 0);
	/* first whole bytes */
	while( (i&(uint64_t)0xff) == 0) {
		i >>= 8;
		x += 8;
	}
	/* now details */
	while( (i&(uint64_t)0x1) == 0) {
		i >>= 1;
		x ++;
	}
	assert( o % ((uint64_t)1<<x) == 0);
	assert( o % ((uint64_t)1<<(x+1)) != 0);
	return x;
}

void udb_alloc_init_new(udb_alloc_d* a)
{
	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
	memset(a, 0, sizeof(*a));
	/* set new allocations after header, as if allocated in a sequence
	 * of minsize allocations */
	a->nextgrow = UDB_HEADER_SIZE;
}

/** fsck the file size, false if failed and file is useless */
static int
fsck_fsize(udb_base* udb, udb_alloc* alloc)
{
	off_t realsize;
	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
	if(realsize == (off_t)-1) {
		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
		return 0;
	}
	udb->glob_data->fsize = (uint64_t)realsize;
	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
		return 0;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
	udb_base_sync(udb, 1);
	return 1;
}

/** regenerate freelist: add a new free chunk, return next todo */
static udb_void
regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
{
	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
	uint64_t esz = (uint64_t)1<<exp;
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
		return 0;
	}
	cp->type = udb_chunk_type_free;
	cp->flags = 0;
	chunk_set_last(base, c, exp, (uint8_t)exp);
	cp->prev = 0;
	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(cp->next)
		UDB_FREE_CHUNK(cp->next)->prev = c;
	regen->stat_free += esz;
	return c + esz;
}

/** regenerate xl chunk, return next todo */
static udb_void
regen_xl(void* base,
	udb_void c, udb_alloc_d* regen)
{
	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
	uint64_t xlsz = cp->size;
	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
		return 0;
	}
	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
		return 0;
	}
	/* fixup end-size and end-expmarker */
	regen->stat_alloc += xlsz;
	return c + xlsz;
}

/** regenerate data chunk, return next todo */
static udb_void
regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
{
	uint64_t esz = (uint64_t)1<<exp;
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
		return 0;
	}
	chunk_set_last(base, c, exp, (uint8_t)exp);
	regen->stat_alloc += esz;
	return c + esz;
}

/** regenerate a relptr structure inside a data segment */
static void
regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
{
	udb_void* a = (udb_void*)arg;
	/* ignore 0 pointers */
	if(!rp->data)
		return;

	/* edit relptrs that point to oldmoved to point to newmoved. */
	if(rp->data == a[0])
		rp->data = a[1];

	/* regenerate relptr lists, add this item to the relptr list for
	 * the data that it points to */
	udb_rel_ptr_link(base, rp, rp->data);
}

/** regenerate the relptrs stored in this data segment */
static void
regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
{
	udb_void arg[2];
	arg[0] = rb_old; arg[1] = rb_new;
	/* walk through the structs here and put them on their respective
	 * relptr lists */
	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
		&regen_relptr_func, arg);

}

/** regenerate relptrlists in the file */
static void
regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
	udb_void rb_old, udb_void rb_new)
{
	udb_void at = alloc->udb->glob_data->hsize;
	/* clear all ptrlist start pointers in the file. */
	while(at < alloc->disk->nextgrow) {
		int exp = (int)UDB_CHUNK(at)->exp;
		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
		if(exp == UDB_EXP_XL) {
			UDB_XL_CHUNK(at)->ptrlist = 0;
			at += UDB_XL_CHUNK(at)->size;
		} else if(tp == udb_chunk_type_free) {
			at += (uint64_t)1<<exp;
		} else { /* data chunk */
			UDB_CHUNK(at)->ptrlist = 0;
			at += (uint64_t)1<<exp;
		}
	}
	/* walk through all relptr structs and put on the right list.
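	 * (the ptrlist heads were cleared in the pass above; relinking every
	 * relptr rebuilds those lists and, via regen_relptr_func, also
	 * rewrites pointers from rb_old to rb_new)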
	 */
	at = alloc->udb->glob_data->hsize;
	while(at < alloc->disk->nextgrow) {
		udb_chunk_d* atp = UDB_CHUNK(at);
		int exp = (int)atp->exp;
		udb_chunk_type tp = (udb_chunk_type)atp->type;
		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
			(uint64_t)1<<exp);
		if(exp == UDB_EXP_XL) {
			assert(at != rb_old); /* should have been freed */
			regen_its_ptrs(base, udb, atp,
				((void*)atp)+sizeof(udb_xl_chunk_d),
				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
				rb_old, rb_new);
			at += sz;
		} else if(tp == udb_chunk_type_free) {
			at += sz;
		} else { /* data chunk */
			assert(at != rb_old); /* should have been freed */
			regen_its_ptrs(base, udb, atp,
				((void*)atp)+sizeof(udb_chunk_d),
				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
			at += sz;
		}
	}
}


/** mark free elements from ex XL chunk space and later fixups pick that up */
static void
rb_mark_free_segs(void* base, udb_void s, uint64_t m)
{
	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
	assert(s >= UDB_ALLOC_CHUNK_SIZE);
	while(q >= s) {
		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
		UDB_CHUNK(q)->type = udb_chunk_type_free;
		q -= UDB_ALLOC_CHUNK_SIZE;
	}
}


/** fsck rollback or rollforward XL move results */
static int
fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
	uint64_t rb_size, uint64_t rb_seg)
{

	if(rb_old <= rb_new)
		return 0; /* XL move one way */
	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if(rb_new + rb_size <= rb_old) {
		/* not overlapping: resume copy */
		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
		/* and free up old piece(s) */
		rb_mark_free_segs(base, rb_old, rb_size);
	} else {
		/* overlapping, see what segment we stopped at
		 * and continue there.
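		 * (rb_seg holds the 1Mb segment that was being copied, so
		 * the move can resume from that segment instead of
		 * restarting from the front)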
		 */
		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
		/* free up old piece(s); from the end of the moved segment,
		 * until the end of the old segment */
		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
			(rb_new+rb_size));
	}
	/* do not call fix_ptrs, regenptrs does the job */
	return 1;
}

/** fsck rollback or rollforward move results */
static int
fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
	udb_void* make_free)
{
	if( (rb_size&(rb_size-1)) != 0)
		return 0; /* not powerof2 */
	if( (rb_old&(rb_size-1)) != 0)
		return 0; /* not aligned */
	if( (rb_new&(rb_size-1)) != 0)
		return 0; /* not aligned */
	/* resume copy */
	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
	/* do not call fix_ptrs, regenptrs does the job */
	/* make sure udb_old is freed */
	*make_free = rb_old;
	return 1;
}

/** fsck the file and salvage, false if failed and file is useless */
static int
fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
{
	void* base = udb->base;
	udb_alloc_d regen;
	udb_void at = udb->glob_data->hsize;
	udb_void rb_old = udb->glob_data->rb_old;
	udb_void rb_new = udb->glob_data->rb_new;
	udb_void rb_seg = udb->glob_data->rb_seg;
	udb_void make_free = 0;
	uint64_t rb_size = udb->glob_data->rb_size;
	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
	/* walk through the file, use the exp values to see what can be
	 * salvaged */
	if(moved && rb_old && rb_new && rb_size) {
		if(rb_old+rb_size <= alloc->disk->nextgrow
			&& rb_new+rb_size <= alloc->disk->nextgrow) {
			/* we can use the move information to fix up the
			 * duplicate element (or partially moved element) */
			if(rb_size > 1024*1024) {
				/* XL chunk */
				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
					rb_size, rb_seg))
					return 0;
			} else {
				if(!fsck_rb(base, rb_old, rb_new, rb_size,
					&make_free))
					return 0;
			}
		}
	}

	/* rebuild freelists */
	/* recalculate stats in alloc (except 'stat_data') */
	/* possibly new end 'nextgrow' value */
	memset(&regen, 0, sizeof(regen));
	regen.nextgrow = alloc->disk->nextgrow;
	while(at < regen.nextgrow) {
		/* figure out this chunk */
		int exp = (int)UDB_CHUNK(at)->exp;
		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
		/* consistency check possible here with end-exp */
		if(tp == udb_chunk_type_free || at == make_free) {
			at = regen_free(base, at, exp, &regen);
			if(!at) return 0;
		} else if(exp == UDB_EXP_XL) {
			/* allocated data of XL size */
			at = regen_xl(base, at, &regen);
			if(!at) return 0;
		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
			/* allocated data */
			at = regen_data(base, at, exp, &regen);
			if(!at) return 0;
		} else {
			/* garbage; this must be EOF then */
			regen.nextgrow = at;
			break;
		}
	}
	*alloc->disk = regen;

	/* rebuild relptr lists */
	regen_ptrlist(base, udb, alloc, rb_old, rb_new);

	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
		udb->fname);
	udb->glob_data->rb_old = 0;
	udb->glob_data->rb_new = 0;
	udb->glob_data->rb_size = 0;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	udb_base_sync(udb, 1);
	return 1;
}


udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
{
	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
	if(!alloc)
		return NULL;
	alloc->udb = udb;
	alloc->disk = disk;
	/* see if committed but uncompleted actions need to be done */
	/* preserves the alloc state */
	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
			if(fsck_fsize(udb, alloc))
				return alloc;
		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
			if(fsck_file(udb, alloc, 0))
				return alloc;
		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
			if(fsck_file(udb, alloc, 1))
				return alloc;
		}
		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
			(int)udb->glob_data->dirty_alloc);
		free(alloc);
		return NULL;
	}
	return alloc;
}

void udb_alloc_delete(udb_alloc* alloc)
{
	if(!alloc) return;
	free(alloc);
}

/** unlink this element from its freelist */
static void
udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
	assert(chunk);
	/* chunk is a free chunk */
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
	/* and thus freelist not empty */
	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
	/* unlink */
	if(fp->prev)
		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
	else alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
}

/** pop first element off freelist, list may not be empty */
static udb_void
udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
{
	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
	if(fp->next) {
		UDB_FREE_CHUNK(fp->next)->prev = 0;
	}
	return f;
}

/** push new element onto freelist */
static void
udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	fp->exp = (uint8_t)exp;
	fp->type = udb_chunk_type_free;
	fp->flags = 0;
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	chunk_set_last(base, f, exp, (uint8_t)exp);
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** push new element onto freelist - do not initialize the elt */
static void
udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** add free chunks at end until specified alignment occurs */
static void
grow_align(void* base, udb_alloc* alloc, uint64_t esz)
{
	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
		/* the nextgrow is not a whole multiple of esz.
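		 * Push a free chunk sized by the lowest set bit of nextgrow
		 * (udb_exp_offset) each iteration; e.g. nextgrow 0x3000 with
		 * esz 0x4000 gets one 0x1000 free chunk and is then aligned.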
		 */
		/* grow a free chunk of max allowed size */
		int fexp = udb_exp_offset(alloc->disk->nextgrow);
		uint64_t fsz = (uint64_t)1<<fexp;
		udb_void f = alloc->disk->nextgrow;
		udb_void fn = alloc->disk->nextgrow+fsz;
		assert(fn <= alloc->udb->base_size);
		alloc->disk->stat_free += fsz;
		udb_alloc_push_fl(base, alloc, f, fexp);
		/* now increase nextgrow to commit that free chunk */
		alloc->disk->nextgrow = fn;
	}
}

/** append chunks at end of memory space to get size exp, return dataptr */
static udb_void
grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	uint64_t esz = (uint64_t)1<<exp;
	udb_void ret;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	grow_align(base, alloc, esz);
	/* free chunks are grown, grow the one we want to use */
	ret = alloc->disk->nextgrow;
	/* take a new alloced chunk into use */
	UDB_CHUNK(ret)->exp = (uint8_t)exp;
	UDB_CHUNK(ret)->flags = 0;
	UDB_CHUNK(ret)->ptrlist = 0;
	UDB_CHUNK(ret)->type = udb_chunk_type_data;
	/* store last octet */
	chunk_set_last(base, ret, exp, (uint8_t)exp);
	/* update stats */
	alloc->disk->stat_alloc += esz;
	alloc->disk->stat_data += sz;
	/* now increase nextgrow to commit this newly allocated chunk */
	alloc->disk->nextgrow += esz;
	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_chunk_d); /* ptr to data */
}

/** calculate how much space is necessary to grow for this exp */
static uint64_t
grow_end_calc(udb_alloc* alloc, int exp)
{
	uint64_t sz = (uint64_t)1<<exp;
	uint64_t ng = alloc->disk->nextgrow;
	uint64_t res;
	/* if nextgrow is 2**expness, no extra growth needed, only size */
	if( (ng & (sz-1)) == 0) {
		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
		return ng+sz; /* must grow exactly 2**exp */
	}
	/* grow until 2**expness and then we need 2**exp as well */
	/* so, round ng down to whole sz (basically ng-ng%sz, or ng/sz*sz)
	 * and then add the sz twice (go up to whole sz, and to allocate) */
	res = (ng & ~(sz-1)) + 2*sz;
	return res;
}

/** see if we need to grow more than specified to enable sustained growth */
static uint64_t
grow_extra_check(udb_alloc* alloc, uint64_t ge)
{
	const uint64_t mb = 1024*1024;
	uint64_t bsz = alloc->udb->base_size;
	if(bsz <= mb) {
		/* below 1 Mb, double sizes for exponential growth */
		/* takes about 15 times to grow to 1Mb */
		if(ge < bsz*2)
			return bsz*2;
	} else {
		uint64_t gnow = ge - bsz;
		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
		 * in whole megabytes rounded up.
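		 * e.g. a 10 Mb base gives want = (10 Mb / 8, rounded down
		 * to a whole Mb) + 1 Mb = 2 Mb of extra growth.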
		 */
		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
		if(gnow < want)
			return bsz + want;
	}
	return ge;
}

/** see if free space is enough to warrant shrink (while file is open) */
static int
enough_free(udb_alloc* alloc)
{
	if(alloc->udb->base_size <= 2*1024*1024) {
		/* below 1 Mb, grown by double size, (so up to 2 mb),
		 * do not shrink unless we can 1/3 in size */
		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
			return 1;
	} else {
		/* grown 12.5%, shrink 25% if possible, at least one mb */
		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
			|| alloc->udb->base_size < 4*1024*1024))
			return 1;
	}
	return 0;
}

/** grow space for a chunk of 2**exp and return dataptr */
static udb_void
udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	/* commit the grow action
	 * - the file grow only changes filesize, but not the nextgrow.
	 * - taking space after nextgrow into use (as free space),
	 *   is like free-ing a chunk (one at a time).
	 * - and the last chunk taken into use is like alloc.
	 */
	/* predict how much free space is needed for this */
	uint64_t grow_end = grow_end_calc(alloc, exp);
	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
	if(grow_end <= alloc->udb->base_size) {
		/* we can do this with the available space */
		return grow_chunks(base, alloc, sz, exp);
	}
	/* we have to grow the file, re-mmap */
	/* see if we need to grow a little more, to avoid endless grow
	 * efforts on adding data */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_chunks(base, alloc, sz, exp);
}

/** take XL allocation into use at end of file, return dataptr */
static udb_void
grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
{
	udb_void ret;
	udb_xl_chunk_d* p;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;

	/* align growth to whole mbs */
	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);

	/* grow XL segment */
	ret = alloc->disk->nextgrow;
	p = UDB_XL_CHUNK(ret);
	p->exp = UDB_EXP_XL;
	p->size = xlsz;
	p->flags = 0;
	p->ptrlist = 0;
	p->type = udb_chunk_type_data;

	/* also put size and marker at end for compaction */
	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;

	/* stats */
	alloc->disk->stat_data += sz;
	alloc->disk->stat_alloc += xlsz;
	/* now increase the nextgrow to commit this xl chunk */
	alloc->disk->nextgrow += xlsz;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
}

/** make space for XL allocation */
static udb_void
udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
{
	/* allocate whole mbs of space, at end of space */
	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
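	/* round up to whole 1Mb chunks; e.g. an asz of 1.5 Mb becomes a
	 * need of 2 Mb, two UDB_ALLOC_CHUNK_SIZE pieces */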
	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
	assert(need >= asz);
	if(grow_end <= alloc->udb->base_size) {
		/* can do this in available space */
		return grow_xl(base, alloc, need, sz);
	}
	/* have to grow file and re-mmap */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_xl(base, alloc, need, sz);
}

/** divide big(2**e2) into pieces so 2**exp fits */
static udb_void
udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
	int exp)
{
	int e = e2;
	uint64_t sz = (uint64_t)1<<e2;
	assert(big && e2 > exp);
	/* so the returned piece to use is the first piece,
	 * offload the later half until it fits */
	do {
		sz >>= 1; /* divide size of big by two */
		e--; /* that means its exp is one smaller */
		udb_alloc_push_fl(base, alloc, big+sz, e);
	} while(e != exp);
	/* exit loop when last pushed is same size as what we want */
	return big;
}

/** returns the exponent size of the chunk needed for data sz */
static int
udb_alloc_exp_needed(size_t sz)
{
	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
	if(asz > UDB_ALLOC_CHUNK_SIZE) {
		return UDB_EXP_XL;
	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
		return UDB_ALLOC_CHUNK_MINEXP;
	}
	return udb_exp_size(asz);
}

udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
{
	void* base = alloc->udb->base;
	/* calculate actual allocation size */
	int e2, exp = udb_alloc_exp_needed(sz);
	if(exp == UDB_EXP_XL)
		return udb_alloc_xl_space(base, alloc, sz);
	/* see if there is a free chunk of that size exactly */
	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
		/* snip from freelist, udb_chunk_d */
		udb_void ret;
		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
		ret = udb_alloc_pop_fl(base, alloc, exp);
		/* use it - size octets already OK */
		UDB_CHUNK(ret)->flags = 0;
		UDB_CHUNK(ret)->ptrlist = 0;
		UDB_CHUNK(ret)->type = udb_chunk_type_data;
		/* update stats */
		alloc->disk->stat_data += sz;
		alloc->disk->stat_alloc += (1<<exp);
		assert(alloc->disk->stat_free >= (1u<<exp));
		alloc->disk->stat_free -= (1<<exp);
		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		return ret + sizeof(udb_chunk_d); /* ptr to data */
	}
	/* see if we can subdivide a larger chunk */
	for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++)
		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
			udb_void big, ret; /* udb_chunk_d */
			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
			big = udb_alloc_pop_fl(base, alloc, e2);
			/* push other parts onto freelists (needs inited) */
			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
			/* use final part (needs inited) */
			UDB_CHUNK(ret)->exp = (uint8_t)exp;
			/* if stop here; the new exp makes smaller free chunk*/
			UDB_CHUNK(ret)->flags = 0;
			UDB_CHUNK(ret)->ptrlist = 0;
			/* set type to commit data chunk */
			UDB_CHUNK(ret)->type = udb_chunk_type_data;
			/* store last octet */
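			/* (this exp byte at the end of the chunk is what the
			 * backward scan in udb_alloc_compact and the
			 * coagulate checks read) */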
			chunk_set_last(base, ret, exp, (uint8_t)exp);
			/* update stats */
			alloc->disk->stat_data += sz;
			alloc->disk->stat_alloc += (1<<exp);
			assert(alloc->disk->stat_free >= (1u<<exp));
			alloc->disk->stat_free -= (1<<exp);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			return ret + sizeof(udb_chunk_d); /* ptr to data */
		}
	/* we need to grow an extra chunk */
	return udb_alloc_grow_space(base, alloc, sz, exp);
}

/** see if there is free space to allocate a chunk into */
static int
have_free_for(udb_alloc* alloc, int exp)
{
	int e2;
	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
		return exp;
	for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++)
		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
			return e2;
		}
	return 0;
}

/** fix relptr prev and next for moved relptr structures */
static void
chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
{
	udb_void* data = (udb_void*)arg;
	udb_void r;
	if(!rp->data)
		return;
	r = UDB_SYSTOREL(base, rp);
	if(rp->next)
		UDB_REL_PTR(rp->next)->prev = r;
	if(rp->prev)
		UDB_REL_PTR(rp->prev)->next = r;
	else {
		/* if this is a pointer to its own chunk, fix it up;
		 * the data ptr gets set by relptr_edit later. */
		if(rp->data == data[0])
			UDB_CHUNK(data[1])->ptrlist = r;
		else UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
	}
}

/** fix pointers from and to a moved chunk */
static void
chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
	uint64_t dsz, udb_void olddata)
{
	udb_void d[2];
	d[0] = olddata;
	d[1] = data;
	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
		dsz, &chunk_fix_ptr_each, d);
	udb_rel_ptr_edit(base, cp->ptrlist, data);
	udb_base_ram_ptr_edit(udb, olddata, data);
}

/** move an allocated chunk to use a free chunk */
static void
move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
	int e2)
{
	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
	udb_chunk_d* rp;
	udb_chunk_d* fp;
	if(exp != e2) {
		/* it is bigger, subdivide it */
		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
	}
	assert(res != f);
	/* setup rollback information */
	alloc->udb->glob_data->rb_old = f;
	alloc->udb->glob_data->rb_new = res;
	alloc->udb->glob_data->rb_size = esz;
	/* take the res, exp into use */
	rp = UDB_CHUNK(res);
	fp = UDB_CHUNK(f);
	/* copy over the data */
	memcpy(rp, fp, esz);
	/* adjust rel ptrs */
	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));

	/* do not freeup the fp; caller does that */
}

/** unlink several free elements to overwrite with xl chunk */
static void
free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
{
	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
	assert(s >= UDB_ALLOC_CHUNK_SIZE);
	while(q >= s) {
		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
		q -= UDB_ALLOC_CHUNK_SIZE;
	}
}

/** move an XL chunk, and keep track of segments for
 * rollback */
static void
move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
	uint64_t sz, uint64_t startseg)
{
	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
	uint64_t amount = xl - n;
	assert(n < xl); /* move to compact */

	/* setup move rollback */
	udb->glob_data->rb_old = xl;
	udb->glob_data->rb_new = n;
	udb->glob_data->rb_size = sz;

	/* is it overlapping? */
	if(sz <= amount) {
		memcpy(np, xlp, sz);
	} else {
		/* move and commit per 1M segment to avoid data loss */
		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
		for(seg = startseg; seg<maxseg; seg++) {
			udb->glob_data->rb_seg = seg;
			memcpy(np+seg*UDB_ALLOC_CHUNK_SIZE,
				xlp+seg*UDB_ALLOC_CHUNK_SIZE,
				UDB_ALLOC_CHUNK_SIZE);
		}

	}
}

/** move list of XL chunks to the front by the shift amount */
static void
move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
	uint64_t amount)
{
	udb_void xl = xl_start;
	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
	while(xl < xl_start+xl_sz) {
		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
		udb_void n = xl-amount;
		uint64_t sz = xlp->size;
		assert(xlp->exp == UDB_EXP_XL);
		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
			n+sizeof(udb_xl_chunk_d),
			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
			xl+sizeof(udb_xl_chunk_d));
	}
	alloc->disk->stat_free -= amount;
	alloc->disk->nextgrow -= amount;
	alloc->udb->glob_data->rb_old = 0;
	alloc->udb->glob_data->rb_new = 0;
	alloc->udb->glob_data->rb_size = 0;
}

/** see if free chunk can coagulate with another chunk, return other chunk */
static udb_void
coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
	uint64_t esz)
{
	udb_void other = f^esz;
	if(exp == UDB_ALLOC_CHUNKS_MAX)
		return 0; /* no further merges */
	if(other >= alloc->udb->base_size)
		return 0; /* not allocated */
	if(other >= alloc->disk->nextgrow)
		return 0; /* not in use */
	if(other < alloc->udb->glob_data->hsize)
		return 0; /* cannot merge with header */
	/* the header is also protected by the special exp marker */
	/* see if the other chunk is a free chunk */

	/* check closest marker to avoid large memory churn */
	/* and also it makes XL allocations and header special markers work */
	if(f > other) {
		assert(f > 1); /* this is certain because of header */
		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
			/* can do it if the other part is a free chunk */
			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
				return other;
		}
	} else {
		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
			/* can do it if the other part is a free chunk */
			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
				return other;
		}
	}
	return 0;
}

/** coagulate and then add new free segment, return final free segment */
static udb_void
coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
	uint64_t esz)
{
	/* new free chunk here, attempt coagulate */
	udb_void other;
	while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
		/* unlink that other chunk */
		udb_alloc_unlink_fl(base, alloc, other, exp);
		/* merge up */
		if(other < last)
			last = other;
		exp++;
		esz <<= 1;
	}
	/* free the final segment */
	udb_alloc_push_fl(base, alloc, last, exp);
	return last;
}

/** attempt to compact the data and move free space to the end */
static int
udb_alloc_compact(void* base, udb_alloc* alloc)
{
	udb_void last;
	int exp, e2;
	uint64_t esz;
	uint64_t at = alloc->disk->nextgrow;
	udb_void xl_start = 0;
	uint64_t xl_sz = 0;
	while(at > alloc->udb->glob_data->hsize) {
		/* grab last entry */
		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
		if(exp == UDB_EXP_XL) {
			/* for XL chunks:
			 * - inspect the size of the XLchunklist at end
			 * - attempt to compact in front of XLchunklist
			 */
			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
				at-sizeof(uint64_t)*2));
			udb_void xl = at-xlsz;
#ifndef NDEBUG
			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
			assert(xlp->exp == UDB_EXP_XL);
			assert(xlp->type != udb_chunk_type_free);
#endif
			/* got the segment, add to the xl chunk list */
			if(xl_start != 0 && xl+xlsz != xl_start) {
				/* nonadjoining XL part, but they are aligned,
				 * so the space in between is whole Mbs,
				 * shift the later part(s) and continue */
				uint64_t m = xl_start - (xl+xlsz);
				assert(xl_start > xl+xlsz);
				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
				free_xl_space(base, alloc, xl+xlsz, m);
				move_xl_list(base, alloc, xl_start, xl_sz, m);
				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			}
			xl_start = xl;
			xl_sz += xlsz;
			at = xl;
			continue;
			/* end of XL if */
		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
			|| exp > UDB_ALLOC_CHUNKS_MAX)
			break; /* special chunk or garbage */
		esz = (uint64_t)1<<exp;
		last = at - esz;
		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
			/* if xlstart continue looking to move stuff, but do
			 * not unlink this free segment */
			if(!xl_start) {
				/* it is a free chunk, remove it */
				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
				udb_alloc_unlink_fl(base, alloc, last, exp);
				alloc->disk->stat_free -= esz;
				alloc->disk->nextgrow = last;
				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
				/* and continue at this point */
			}
			at = last;
		} else if( (e2=have_free_for(alloc, exp)) ) {
			/* last entry can be allocated in free chunks
			 * move it to its new position, adjust rel_ptrs */
			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
			move_chunk(base, alloc, last, exp, esz, e2);
			if(xl_start) {
				last = coagulate_and_push(base, alloc,
					last, exp, esz);
			} else {
				/* shorten usage */
				alloc->disk->stat_free -= esz;
				alloc->disk->nextgrow = last;
			}
			alloc->udb->glob_data->rb_old = 0;
			alloc->udb->glob_data->rb_new = 0;
			alloc->udb->glob_data->rb_size = 0;
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			/* and continue in front of it */
			at = last;
		} else {
			/* cannot compact this block, stop compacting */
			break;
		}
		/* if that worked, repeat it */
	}
	/* if we passed xl chunks, see if
	 * XL-chunklist can move */
	if(xl_start) {
		/* calculate free space in front of the XLchunklist. */
		/* has to be whole mbs of free space */
		/* if so, we can move the XL chunks.  Move them all back
		 * by the new free space. */
		/* this compacts very well, but the XL chunks can be moved
		 * multiple times; worst case for every mb freed a huge sized
		 * xlchunklist gets moved. */
		/* free space must be, since aligned and coagulated, in
		 * chunks of a whole MB */
		udb_void at = xl_start;
		uint64_t m = 0;
		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
				break;
			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
			m += UDB_ALLOC_CHUNK_SIZE;
			at = chunk;
		}
		if(m != 0) {
			assert(at+m == xl_start);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
			free_xl_space(base, alloc, at, m);
			move_xl_list(base, alloc, xl_start, xl_sz, m);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		}
	}

	/* if enough free, shrink the file; re-mmap */
	if(enough_free(alloc)) {
		uint64_t nsize = alloc->disk->nextgrow;
		udb_base_shrink(alloc->udb, nsize);
		if(!udb_base_remap(alloc->udb, alloc, nsize))
			return 0;
	}
	return 1;
}

#ifdef UDB_CHECK
/** check that rptrs are really zero before free */
void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
{
	(void)base;
	(void)arg;
	assert(p->data == 0);
}
#endif /* UDB_CHECK */

/** free XL chunk as multiples of CHUNK_SIZE free segments */
static void
udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
	size_t sz)
{
	uint64_t xlsz = fp->size;
	uint64_t c;
	/* lightweight check for buffer overflow in xl data */
	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
		&udb_check_rptr_zero, NULL);
#endif
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	/* update stats */
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_alloc -= xlsz;
	alloc->disk->stat_free += xlsz;
	/* walk in reverse, so the front blocks go first on the list */
	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE*/
	assert(f >= UDB_ALLOC_CHUNK_SIZE);
	while(c >= f) {
		/* free a block of CHUNK_SIZE (1 Mb) */
		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
		c -= UDB_ALLOC_CHUNK_SIZE;
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
}

int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
{
	void* base;
	/* lookup chunk ptr */
	udb_void f;
	udb_chunk_d* fp;
	uint64_t esz;
	int exp;
	udb_void other;
	int coagulated = 0;
	if(!r)
		return 1; /* free(NULL) does nothing */

	/* lookup size of chunk */
	base = alloc->udb->base;
	/* fails for XL blocks */
	f = chunk_from_dataptr(r);
	fp = UDB_CHUNK(f);
	assert(fp->type != udb_chunk_type_free);

	/* see if it has a ptrlist, if so: trouble, the list is not properly
	 * cleaned up. (although you can imagine a wholesale delete where
	 * it does not matter) */
	assert(fp->ptrlist == 0);

	/* set ptrlist to 0 to stop relptr from using it, robustness. */
	fp->ptrlist = 0;

	if(fp->exp == UDB_EXP_XL) {
		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
		/* compact */
		return udb_alloc_compact(base, alloc);
	}
	/* it is a regular chunk of 2**exp size */
	exp = (int)fp->exp;
	esz = (uint64_t)1<<exp;
	/* light check for e.g. buffer overflow of the data */
	assert(sz < esz);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
#ifdef UDB_CHECK
	/* check that relptrs in this chunk have been zeroed */
	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
#endif

	/* update the stats */
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	alloc->disk->stat_data -= sz;
	alloc->disk->stat_free += esz;
	alloc->disk->stat_alloc -= esz;

	/* if it can be merged with other free chunks, do so */
	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
		coagulated = 1;
		/* unlink that other chunk and expand it (it has same size) */
		udb_alloc_unlink_fl(base, alloc, other, exp);
		/* merge up */
		if(other < f)
			f = other;
		exp++;
		esz <<= 1;
	}
	if(coagulated) {
		/* put big free chunk into freelist, and init it */
		udb_alloc_push_fl(base, alloc, f, exp);
	} else {
		/* we do not need to touch the last-exp-byte, which may save
		 * a reference to that page of memory */
		fp->type = udb_chunk_type_free;
		fp->flags = 0;
		udb_alloc_push_fl_noinit(base, alloc, f, exp);
	}
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	/* compact */
	return udb_alloc_compact(base, alloc);
}

udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
{
	/* could be faster maybe, if grown?
udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
{
	/* could be faster maybe, if grown? */
	udb_void r = udb_alloc_space(alloc, sz);
	if(!r) return r;
	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
	return r;
}

udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
{
	void* base = alloc->udb->base;
	udb_void c, n, newd;
	udb_chunk_d* cp, *np;
	uint64_t avail;
	uint8_t cp_type;
	/* emulate some posix realloc stuff */
	if(r == 0)
		return udb_alloc_space(alloc, sz);
	if(sz == 0) {
		if(!udb_alloc_free(alloc, r, osz))
			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
		return 0;
	}
	c = chunk_from_dataptr(r);
	cp = UDB_CHUNK(c);
	cp_type = cp->type;
	if(cp->exp == UDB_EXP_XL) {
		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
			- sizeof(uint64_t)*2;
	} else {
		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
	}
	if(sz <= avail)
		return r;
	/* reallocate it, and copy */
	newd = udb_alloc_space(alloc, sz);
	if(!newd) return 0;
	/* re-base after alloc, since re-mmap may have happened */
	base = alloc->udb->base;
	cp = NULL; /* may be invalid now, robustness */
	n = chunk_from_dataptr(newd);
	np = UDB_CHUNK(n);
	np->type = cp_type;
	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
	/* fixup ptrs */
	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);

	if(!udb_alloc_free(alloc, r, osz))
		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
	return newd;
}

int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
{
	const uint64_t mb = 1024*1024;
	int exp = udb_alloc_exp_needed(sz);
	uint64_t esz;
	uint64_t want;
	if(exp == UDB_EXP_XL)
		esz = (sz&(mb-1))+mb;
	else	esz = (uint64_t)1<<exp;
	/* we need grow_end_calc to take into account alignment */
	want = grow_end_calc(alloc, exp) + esz*(num-1);
	assert(want >= alloc->udb->base_size);
	if(!udb_base_grow_and_remap(alloc->udb, want)) {
		log_msg(LOG_ERR, "failed to grow the specified amount");
		return 0;
	}
	return 1;
}

void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
{
	void* base = alloc->udb->base;
	udb_void f = chunk_from_dataptr(r);
	udb_chunk_d* fp = UDB_CHUNK(f);
	/* not the 'free' type, that must be set by allocation routines */
	assert(fp->type != udb_chunk_type_free);
	assert(tp != udb_chunk_type_free);
	fp->type = tp;
}

int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
{
	/* pointers are not valid before the header-size or after the
	 * used-region of the mmap */
	return ( (to+destsize) <= udb->base_size &&
		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
		(to+destsize) <= udb->alloc->disk->nextgrow);
}
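/*
 * Usage note (illustrative sketch): the udb_valid_* routines only check
 * that an offset is structurally plausible; they are meant for consistency
 * checking of a possibly damaged file, not for hot paths.  A hypothetical
 * reader of stored offsets could guard a dereference like this, where
 * process() stands in for the caller's own handling:
 *
 *	if(udb_valid_dataptr(udb, off))
 *		process(UDB_REL(udb->base, off));
 *	else	log_msg(LOG_ERR, "corrupt offset in udb file");
 */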
int udb_valid_dataptr(udb_base* udb, udb_void to)
{
	void* base = udb->base;
	udb_void ch;
	int exp;
	uint64_t esz;
	/* our data chunks are aligned and at least 8 bytes */
	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
		return 0;
	/* get the chunk pointer */
	ch = chunk_from_dataptr(to);
	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
		return 0;
	/* check its size */
	exp = UDB_CHUNK(ch)->exp;
	if(exp == UDB_EXP_XL) {
		/* check XL chunk */
		uint64_t xlsz;
		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
			return 0;
		xlsz = UDB_XL_CHUNK(ch)->size;
		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
			return 0;
		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
			return 0;
		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
			!= xlsz)
			return 0;
		return 1;
	}
	/* check if regular chunk has matching end byte */
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
		return 0; /* cannot be a valid chunk */
	esz = 1<<exp;
	if(!udb_valid_offset(udb, ch+esz-1, 1))
		return 0;
	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
		return 0;
	return 1;
}

int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
{
	void* base = udb->base;
	udb_void p;
	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
		return 0;
	if(!udb_valid_dataptr(udb, to))
		return 0;
	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
	while(p) {
		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
			return 0;
		if(p == rptr)
			return 1;
		p = UDB_REL_PTR(p)->next;
	}
	return 0;
}

void udb_rel_ptr_init(udb_rel_ptr* ptr)
{
	memset(ptr, 0, sizeof(*ptr));
}

void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
{
	if(!ptr->data)
		return;
	if(ptr->prev) {
		UDB_REL_PTR(ptr->prev)->next = ptr->next;
	} else {
		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
	}
	if(ptr->next) {
		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
	}
}

void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
{
	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
	ptr->prev = 0;
	ptr->next = chunk->ptrlist;
	if(ptr->next)
		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
	ptr->data = to;
}

void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
{
	assert(to == 0 || to > 64);
	udb_rel_ptr_unlink(base, ptr);
	if(to)
		udb_rel_ptr_link(base, ptr, to);
	else	ptr->data = to;
}

void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
{
	udb_void p = list;
	while(p) {
		UDB_REL_PTR(p)->data = to;
		p = UDB_REL_PTR(p)->next;
	}
}

#ifdef UDB_CHECK
/** check that all pointers are validly chained */
static void
udb_check_ptrs_valid(udb_base* udb)
{
	size_t i;
	udb_ptr* p, *prev;
	for(i=0; i<udb->ram_size; i++) {
		prev = NULL;
		for(p=udb->ram_hash[i]; p; p=p->next) {
			assert(p->prev == prev);
			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
				== i);
			assert(p->base == &udb->base);
			prev = p;
		}
	}
}
#endif /* UDB_CHECK */

void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
{
#ifdef UDB_CHECK
	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
#endif
	memset(ptr, 0, sizeof(*ptr));
	ptr->base = &udb->base;
}

void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
{
	assert(newval == 0 || newval > 64);
	if(ptr->data)
		udb_base_unlink_ptr(udb, ptr);
	ptr->data = newval;
	if(newval)
		udb_base_link_ptr(udb, ptr);
}
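/*
 * Usage note (illustrative sketch): a udb_ptr is an in-RAM handle kept in
 * the ram_hash so it can be updated when the mmap moves.  Its usual
 * lifetime, using only the routines above, looks like this; some_offset is
 * a hypothetical udb_void the caller obtained elsewhere:
 *
 *	udb_ptr p;
 *	udb_ptr_init(&p, udb);
 *	udb_ptr_set(&p, udb, some_offset);    link to a chunk
 *	... dereference via UDB_REL(*p.base, p.data) ...
 *	udb_ptr_set(&p, udb, 0);              unlink before p goes out of scope
 */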
int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
	size_t sz)
{
	udb_void r;
	r = udb_alloc_space(udb->alloc, sz);
	if(!r) return 0;
	udb_alloc_set_type(udb->alloc, r, type);
	udb_ptr_init(ptr, udb);
	udb_ptr_set(ptr, udb, r);
	return 1;
}

void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
{
	if(ptr->data) {
		udb_void d = ptr->data;
		udb_ptr_set(ptr, udb, 0);
		udb_alloc_free(udb->alloc, d, sz);
	}
}

udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
{
	udb_void f;
	if(!ptr || ptr->data == 0)
		return udb_chunk_type_internal; /* something bad */
	f = chunk_from_dataptr(ptr->data);
	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
}
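/*
 * Usage note (illustrative sketch): udb_ptr_alloc_space and
 * udb_ptr_free_space above pair an allocation with a linked udb_ptr, so
 * the caller only tracks the udb_ptr and the size.  The chunk type and
 * record struct in this sketch are hypothetical examples, not part of
 * this file:
 *
 *	udb_ptr rec;
 *	if(udb_ptr_alloc_space(&rec, udb, my_chunk_type,
 *		sizeof(struct myrecord))) {
 *		... fill UDB_REL(*rec.base, rec.data) ...
 *		udb_ptr_free_space(&rec, udb, sizeof(struct myrecord));
 *	}
 */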