xref: /openbsd-src/usr.sbin/nsd/udb.c (revision d3fecca9f63d975339880ea9da999a59fc9dbfdc)
1 /* udb.c - u(micro) data base.
2  * By W.C.A. Wijngaards
3  * Copyright 2010, NLnet Labs.
4  * BSD, see LICENSE.
5  */
6 #include "config.h"
7 #include "udb.h"
8 #include <string.h>
9 #include <errno.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <assert.h>
13 #include "lookup3.h"
14 #include "util.h"
15 
16 /* mmap and friends */
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 
22 /* portable definitions for systems that lack them: MAP_FAILED is -1 and MS_SYNC falls back to a no-op flag */
23 #ifndef MAP_FAILED
24 #define MAP_FAILED ((void*)-1)
25 #endif
26 #ifndef MS_SYNC
27 #define MS_SYNC 0
28 #endif
29 
30 /** move and fixup xl segment */
31 static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
32 	udb_void n, uint64_t sz, uint64_t startseg);
33 /** attempt to compact the data and move free space to the end */
34 static int udb_alloc_compact(void* base, udb_alloc* alloc);
35 
36 /** convert pointer to the data part to a pointer to the base of the chunk */
37 static udb_void
38 chunk_from_dataptr(udb_void data)
39 {
40 	/* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and
41 	 * that xl_chunk_d is aligned on UDB_ALLOC_CHUNK_SIZE (1 Mb) boundaries. */
42 	udb_void xl = data - sizeof(udb_xl_chunk_d);
43 	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
44 		return xl;
45 	return data - sizeof(udb_chunk_d);
46 }
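
/* Illustrative note (offset chosen only for illustration): XL chunks are
 * aligned on UDB_ALLOC_CHUNK_SIZE (1 Mb) boundaries, so for an XL chunk at,
 * say, offset 0x300000 the data pointer is 0x300000+sizeof(udb_xl_chunk_d),
 * and subtracting sizeof(udb_xl_chunk_d) again lands exactly on the 1 Mb
 * boundary, which is what the alignment test in chunk_from_dataptr detects.
 * Because sizeof(udb_chunk_d) differs from sizeof(udb_xl_chunk_d), a regular
 * chunk's data pointer minus the XL header size never lands on a 1 Mb
 * boundary, so the smaller udb_chunk_d header size is used instead. */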
47 
48 udb_void chunk_from_dataptr_ext(udb_void data) {
49 	return chunk_from_dataptr(data);
50 }
51 
52 #ifndef NDEBUG
53 /** read last octet from a chunk */
54 static uint8_t
55 chunk_get_last(void* base, udb_void chunk, int exp)
56 {
57 	return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1));
58 }
59 #endif
60 
61 /** write last octet of a chunk */
62 static void
63 chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
64 {
65 	*((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)) = value;
66 }
67 
68 /** create udb_base from a file descriptor (must be at start of file) */
69 udb_base*
70 udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
71 	void* arg)
72 {
73 	uint64_t m;
74 	udb_glob_d g;
75 	ssize_t r;
76 	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
77 	if(!udb) {
78 		log_msg(LOG_ERR, "out of memory");
79 		close(fd);
80 		return NULL;
81 	}
82 	udb->fname = strdup(fname);
83 	if(!udb->fname) {
84 		log_msg(LOG_ERR, "out of memory");
85 		free(udb);
86 		close(fd);
87 		return NULL;
88 	}
89 	udb->walkfunc = walkfunc;
90 	udb->walkarg = arg;
91 	udb->fd = fd;
92 	udb->ram_size = 1024;
93 	udb->ram_mask = (int)udb->ram_size - 1;
94 	udb->ram_hash = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)*udb->ram_size);
95 	if(!udb->ram_hash) {
96 		free(udb->fname);
97 		free(udb);
98 		log_msg(LOG_ERR, "out of memory");
99 		close(fd);
100 		return NULL;
101 	}
102 
103 	/* read magic */
104 	if((r=read(fd, &m, sizeof(m))) == -1) {
105 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
106 		goto fail;
107 	} else if(r != (ssize_t)sizeof(m)) {
108 		log_msg(LOG_ERR, "%s: file too short", fname);
109 		goto fail;
110 	}
111 	/* TODO: handle a byte-order mismatch between file and host (bigendian vs littleendian), see magic */
112 	if(m != UDB_MAGIC) {
113 		log_msg(LOG_ERR, "%s: wrong type of file", fname);
114 		goto fail;
115 	}
116 	/* read header */
117 	if((r=read(fd, &g, sizeof(g))) == -1) {
118 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
119 		goto fail;
120 	} else if(r != (ssize_t)sizeof(g)) {
121 		log_msg(LOG_ERR, "%s: file too short", fname);
122 		goto fail;
123 	}
124 	if(g.version != 0) {
125 		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
126 			(int)g.version);
127 		goto fail;
128 	}
129 	if(g.hsize < UDB_HEADER_SIZE) {
130 		log_msg(LOG_ERR, "%s: header size too small %d", fname,
131 			(int)g.hsize);
132 		goto fail;
133 	}
134 	if(g.hsize > UDB_HEADER_SIZE) {
135 		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
136 			(int)g.hsize);
137 		log_msg(LOG_WARNING, "attempting to continue...");
138 	}
139 	if(g.clean_close != 0) {
140 		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
141 			(int)g.clean_close);
142 		log_msg(LOG_WARNING, "attempting to continue...");
143 	}
144 	/* TODO check if too large (>4g on 32bit); mmap-usage would fail */
145 
146 	/* mmap it */
147 	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
148 		log_msg(LOG_ERR, "%s: file too short", fname);
149 		goto fail;
150 	}
151 	udb->base_size = (size_t)g.fsize;
152 	/* note the size_t casts must be there for portability, on some
153 	 * systems the layout of memory is otherwise broken. */
154 	udb->base = mmap(NULL, (size_t)udb->base_size,
155 		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
156 		(int)udb->fd, (off_t)0);
157 	if(udb->base == MAP_FAILED) {
158 		udb->base = NULL;
159 		log_msg(LOG_ERR, "mmap(size %u) error: %s",
160 			(unsigned)udb->base_size, strerror(errno));
161 	fail:
162 		close(fd);
163 		free(udb->fname);
164 		free(udb->ram_hash);
165 		free(udb);
166 		return NULL;
167 	}
168 
169 	/* init completion */
170 	udb->glob_data = (udb_glob_d*)(udb->base+sizeof(uint64_t));
171 	r = 0;
172 	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
173 		r = 1;
174 	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
175 		(void*)udb->glob_data+sizeof(*udb->glob_data)));
176 	if(!udb->alloc) {
177 		log_msg(LOG_ERR, "out of memory");
178 		udb_base_free(udb);
179 		return NULL;
180 	}
181 	if(r) {
182 		/* and compact now, or resume compacting */
183 		udb_alloc_compact(udb, udb->alloc);
184 		udb_base_sync(udb, 1);
185 	}
186 
187 	return udb;
188 }
189 
190 udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
191 	void* arg)
192 {
193 	int fd = open(fname, O_RDWR);
194 	if(fd == -1) {
195 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
196 		return NULL;
197 	}
198 	return udb_base_create_fd(fname, fd, walkfunc, arg);
199 }
200 
201 /** init new udb_global structure */
202 static void udb_glob_init_new(udb_glob_d* g)
203 {
204 	memset(g, 0, sizeof(*g));
205 	g->hsize = UDB_HEADER_SIZE;
206 	g->fsize = UDB_HEADER_SIZE;
207 }
208 
209 /** write data to file and check result */
210 static int
211 write_fdata(const char* fname, int fd, void* data, size_t len)
212 {
213 	ssize_t w;
214 	if((w=write(fd, data, len)) == -1) {
215 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
216 		close(fd);
217 		return 0;
218 	} else if(w != (ssize_t)len) {
219 		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
220 		close(fd);
221 		return 0;
222 	}
223 	return 1;
224 }
225 
226 udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
227 	void* arg)
228 {
229 	uint64_t m;
230 	udb_glob_d g;
231 	udb_alloc_d a;
232 	uint64_t endsize = UDB_HEADER_SIZE;
233 	uint64_t endexp = 0;
234 	int fd = open(fname, O_CREAT|O_RDWR, 0600);
235 	if(fd == -1) {
236 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
237 		return NULL;
238 	}
239 	m = UDB_MAGIC;
240 	udb_glob_init_new(&g);
241 	udb_alloc_init_new(&a);
242 
243 	/* write new data to file (closes fd on error) */
244 	if(!write_fdata(fname, fd, &m, sizeof(m)))
245 		return NULL;
246 	if(!write_fdata(fname, fd, &g, sizeof(g)))
247 		return NULL;
248 	if(!write_fdata(fname, fd, &a, sizeof(a)))
249 		return NULL;
250 	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
251 		return NULL;
252 	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
253 		return NULL;
254 	/* rewind to start */
255 	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
256 		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
257 		close(fd);
258 		return NULL;
259 	}
260 	return udb_base_create_fd(fname, fd, walkfunc, arg);
261 }
262 
263 /** shrink the udb base if it has unused space at the end */
264 static void
265 udb_base_shrink(udb_base* udb, uint64_t nsize)
266 {
267 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
268 	udb->glob_data->fsize = nsize;
269 	/* sync, does not *seem* to be required on Linux, but it is
270 	   certainly required on OpenBSD.  Otherwise changed data is lost. */
271 	msync(udb->base, udb->base_size, MS_ASYNC);
272 	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
273 		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
274 			(unsigned)nsize, strerror(errno));
275 	}
276 	udb->glob_data->dirty_alloc = udb_dirty_clean;
277 }
278 
279 void udb_base_close(udb_base* udb)
280 {
281 	if(!udb)
282 		return;
283 	if(udb->fd != -1 && udb->base && udb->alloc) {
284 		uint64_t nsize = udb->alloc->disk->nextgrow;
285 		if(nsize < udb->base_size)
286 			udb_base_shrink(udb, nsize);
287 	}
288 	if(udb->fd != -1) {
289 		close(udb->fd);
290 		udb->fd = -1;
291 	}
292 	if(udb->base) {
293 		if(munmap(udb->base, udb->base_size) == -1) {
294 			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
295 		}
296 		udb->base = NULL;
297 	}
298 }
299 
300 void udb_base_free(udb_base* udb)
301 {
302 	if(!udb)
303 		return;
304 	udb_base_close(udb);
305 	udb_alloc_delete(udb->alloc);
306 	free(udb->ram_hash);
307 	free(udb->fname);
308 	free(udb);
309 }
310 
311 void udb_base_free_keep_mmap(udb_base* udb)
312 {
313 	if(!udb) return;
314 	if(udb->fd != -1) {
315 		close(udb->fd);
316 		udb->fd = -1;
317 	}
318 	udb->base = NULL;
319 	udb_alloc_delete(udb->alloc);
320 	free(udb->ram_hash);
321 	free(udb->fname);
322 	free(udb);
323 }
324 
325 void udb_base_sync(udb_base* udb, int wait)
326 {
327 	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
328 		log_msg(LOG_ERR, "msync(%s) error %s",
329 			udb->fname, strerror(errno));
330 	}
331 }
332 
333 /** hash a chunk pointer */
334 static uint32_t
335 chunk_hash_ptr(udb_void p)
336 {
337 	/* put p into an array of uint32 */
338 	uint32_t h[sizeof(p)/sizeof(uint32_t)];
339 	memcpy(&h, &p, sizeof(h));
340 	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
341 }
342 
343 /** check that the given pointer is on the bucket for the given offset */
344 int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
345 {
346 	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
347 	udb_ptr* p;
348 	assert((size_t)i < udb->ram_size);
349 	for(p = udb->ram_hash[i]; p; p=p->next) {
350 		if(p == ptr)
351 			return 1;
352 	}
353 	return 0;
354 }
355 
356 /** grow the ram array */
357 static void
358 grow_ram_hash(udb_base* udb, udb_ptr** newhash)
359 {
360 	size_t i;
361 	size_t osize= udb->ram_size;
362 	udb_ptr* p, *np;
363 	udb_ptr** oldhash = udb->ram_hash;
364 	udb->ram_size *= 2;
365 	udb->ram_mask <<= 1;
366 	udb->ram_mask |= 1;
367 	udb->ram_hash = newhash;
368 	/* have to link in every element in the old list into the new list*/
369 	for(i=0; i<osize; i++) {
370 		p = oldhash[i];
371 		while(p) {
372 			np = p->next;
373 			/* link into newhash */
374 			p->prev=NULL;
375 			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
376 			if(p->next) p->next->prev = p;
377 			/* go to next element of oldhash */
378 			p = np;
379 		}
380 	}
381 	free(oldhash);
382 }
383 
384 void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
385 {
386 	uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
387 	assert((size_t)i < udb->ram_size);
388 #ifdef UDB_CHECK
389 	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
390 #endif
391 	udb->ram_num++;
392 	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0xefffffff) {
393 		/* grow the array, if allocation succeeds */
394 		udb_ptr** newram = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)*
395 			udb->ram_size*2);
396 		if(newram) {
397 			grow_ram_hash(udb, newram);
398 		}
399 	}
400 	ptr->prev = NULL;
401 	ptr->next = udb->ram_hash[i];
402 	udb->ram_hash[i] = ptr;
403 	if(ptr->next)
404 		ptr->next->prev = ptr;
405 }
406 
407 void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
408 {
409 	assert(ptr->data);
410 #ifdef UDB_CHECK
411 	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
412 	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
413 #endif
414 	udb->ram_num--;
415 	if(ptr->next)
416 		ptr->next->prev = ptr->prev;
417 	if(ptr->prev)
418 		ptr->prev->next = ptr->next;
419 	else	{
420 		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
421 		assert((size_t)i < udb->ram_size);
422 		udb->ram_hash[i] = ptr->next;
423 	}
424 }
425 
426 /** change a set of ram ptrs to a new value */
427 static void
428 udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
429 {
430 	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
431 	udb_ptr* p, *np;
432 	/* edit them and move them into the new position */
433 	p = udb->ram_hash[io];
434 	while(p) {
435 		np = p->next;
436 		if(p->data == old) {
437 			udb_base_unlink_ptr(udb, p);
438 			p->data = newd;
439 			udb_base_link_ptr(udb, p);
440 		}
441 		p = np;
442 	}
443 }
444 
445 udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
446 {
447 	return &udb->glob_data->user_global;
448 }
449 
450 void udb_base_set_userdata(udb_base* udb, udb_void user)
451 {
452 #ifdef UDB_CHECK
453 	if(user) { assert(udb_valid_dataptr(udb, user)); }
454 #endif
455 	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
456 }
457 
458 void udb_base_set_userflags(udb_base* udb, uint8_t v)
459 {
460 	udb->glob_data->userflags = v;
461 }
462 
463 uint8_t udb_base_get_userflags(udb_base* udb)
464 {
465 	return udb->glob_data->userflags;
466 }
467 
468 /** re-mmap the udb to specified size */
469 static void*
470 udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
471 {
472 	void* nb;
473 	/* for use with valgrind, do not use mremap, but the other version */
474 #ifdef MREMAP_MAYMOVE
475 	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
476 	if(nb == MAP_FAILED) {
477 		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
478 			udb->fname, (unsigned)nsize, strerror(errno));
479 		return 0;
480 	}
481 #else /* !HAVE MREMAP */
482 	/* use munmap-mmap to simulate mremap */
483 	if(munmap(udb->base, udb->base_size) != 0) {
484 		log_msg(LOG_ERR, "munmap(%s) error %s",
485 			udb->fname, strerror(errno));
486 	}
487 	/* provide hint for new location */
488 	/* note the size_t casts must be there for portability, on some
489 	 * systems the layout of memory is otherwise broken. */
490 	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
491 		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
492 	/* retry the mmap without basept in case of ENOMEM (FreeBSD8),
493 	 * the kernel can then try to mmap it at a different location
494 	 * where more memory is available */
495 	if(nb == MAP_FAILED && errno == ENOMEM) {
496 		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
497 			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
498 	}
499 	if(nb == MAP_FAILED) {
500 		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
501 			udb->fname, (unsigned)nsize, strerror(errno));
502 		udb->base = NULL;
503 		return 0;
504 	}
505 #endif /* HAVE MREMAP */
506 	if(nb != udb->base) {
507 		/* fix up realpointers in udb and alloc */
508 		/* but mremap may have been nice and not move the base */
509 		udb->base = nb;
510 		udb->glob_data = (udb_glob_d*)(nb+sizeof(uint64_t));
511 		/* use passed alloc pointer because the udb->alloc may not
512 		 * be initialized yet */
513 		alloc->disk = (udb_alloc_d*)((void*)udb->glob_data
514 			+sizeof(*udb->glob_data));
515 	}
516 	udb->base_size = nsize;
517 	return nb;
518 }
519 
520 void
521 udb_base_remap_process(udb_base* udb)
522 {
523 	/* assume that fsize is still accessible */
524 	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
525 }
526 
527 /** grow file to specified size and re-mmap, return new base */
528 static void*
529 udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
530 {
531 	/* grow file by writing a single zero at that spot, the
532 	 * rest is filled in with zeroes. */
533 	uint8_t z = 0;
534 	ssize_t w;
535 
536 	assert(nsize > 0);
537 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
538 #ifdef HAVE_PWRITE
539 	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
540 #else
541 	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
542 		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
543 		return 0;
544 	}
545 	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
546 #endif
547 		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
548 			udb->fname, (unsigned)nsize, strerror(errno));
549 		return 0;
550 	} else if(w != (ssize_t)sizeof(z)) {
551 		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
552 			udb->fname, (unsigned)nsize);
553 		return 0;
554 	}
555 	udb->glob_data->fsize = nsize;
556 	udb->glob_data->dirty_alloc = udb_dirty_clean;
557 	return udb_base_remap(udb, udb->alloc, nsize);
558 }
559 
560 int udb_exp_size(uint64_t a)
561 {
562 	/* find enclosing value such that 2**x >= a */
563 	int x = 0;
564 	uint64_t i = a;
565 	assert(a != 0);
566 
567 	i --;
568 	/* could optimise this with uint8* access, depends on endianness */
569 	/* first whole bytes */
570 	while( (i&(~(uint64_t)0xff)) ) {
571 		i >>= 8;
572 		x += 8;
573 	}
574 	/* now details */
575 	while(i) {
576 		i >>= 1;
577 		x ++;
578 	}
579 	assert( ((uint64_t)1<<x) >= a);
580 	assert( x==0 || ((uint64_t)1<<(x-1)) < a);
581 	return x;
582 }
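
/* A few illustrative values: udb_exp_size(1) == 0, udb_exp_size(4096) == 12
 * and udb_exp_size(4097) == 13, i.e. the smallest x with 2**x >= a. */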
583 
584 int udb_exp_offset(uint64_t o)
585 {
586 	/* this means measuring the number of 0 bits on the right */
587 	/* so, if there are x zero bits on the right, then (o&(2**x-1))==0 */
588 	int x = 0;
589 	uint64_t i = o;
590 	assert(o != 0);
591 	/* first whole bytes */
592 	while( (i&(uint64_t)0xff) == 0) {
593 		i >>= 8;
594 		x += 8;
595 	}
596 	/* now details */
597 	while( (i&(uint64_t)0x1) == 0) {
598 		i >>= 1;
599 		x ++;
600 	}
601 	assert( o % ((uint64_t)1<<x) == 0);
602 	assert( o % ((uint64_t)1<<(x+1)) != 0);
603 	return x;
604 }
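
/* Illustrative values: udb_exp_offset(0x1800) == 11, since 0x1800 is
 * 3*2**11 and thus has eleven zero bits on the right; udb_exp_offset(1)
 * is 0. */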
605 
606 void udb_alloc_init_new(udb_alloc_d* a)
607 {
608 	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
609 	memset(a, 0, sizeof(*a));
610 	/* set new allocations after header, as if allocated in a sequence
611 	 * of minsize allocations */
612 	a->nextgrow = UDB_HEADER_SIZE;
613 }
614 
615 /** fsck the file size, false if failed and file is useless */
616 static int
617 fsck_fsize(udb_base* udb, udb_alloc* alloc)
618 {
619 	off_t realsize;
620 	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
621 	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
622 	if(realsize == (off_t)-1) {
623 		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
624 		return 0;
625 	}
626 	udb->glob_data->fsize = (uint64_t)realsize;
627 	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
628 		return 0;
629 	udb->glob_data->dirty_alloc = udb_dirty_clean;
630 	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
631 	udb_base_sync(udb, 1);
632 	return 1;
633 }
634 
635 /** regenerate freelist: add a new free chunk, return next todo */
636 static udb_void
637 regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
638 {
639 	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
640 	uint64_t esz = (uint64_t)1<<exp;
641 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
642 		return 0;
643 	}
644 	cp->type = udb_chunk_type_free;
645 	cp->flags = 0;
646 	chunk_set_last(base, c, exp, (uint8_t)exp);
647 	cp->prev = 0;
648 	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
649 	if(cp->next)
650 		UDB_FREE_CHUNK(cp->next)->prev = c;
651 	regen->stat_free += esz;
652 	return c + esz;
653 }
654 
655 /** regenerate xl chunk, return next todo */
656 static udb_void
657 regen_xl(void* base, udb_void c, udb_alloc_d* regen)
658 {
659 	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
660 	uint64_t xlsz = cp->size;
661 	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
662 		return 0;
663 	}
664 	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
665 		return 0;
666 	}
667 	/* fixup end-size and end-expmarker */
668 	regen->stat_alloc += xlsz;
669 	return c + xlsz;
670 }
671 
672 /** regenerate data chunk, return next todo */
673 static udb_void
674 regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
675 {
676 	uint64_t esz = (uint64_t)1<<exp;
677 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
678 		return 0;
679 	}
680 	chunk_set_last(base, c, exp, (uint8_t)exp);
681 	regen->stat_alloc += esz;
682 	return c + esz;
683 }
684 
685 /** regenerate a relptr structure inside a data segment */
686 static void
687 regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
688 {
689 	udb_void* a = (udb_void*)arg;
690 	/* ignore 0 pointers */
691 	if(!rp->data)
692 		return;
693 
694 	/* edit relptrs that point to oldmoved to point to newmoved. */
695 	if(rp->data == a[0])
696 		rp->data = a[1];
697 
698 	/* regenerate relptr lists, add this item to the relptr list for
699 	 * the data that it points to */
700 	udb_rel_ptr_link(base, rp, rp->data);
701 }
702 
703 /** regenerate the relptrs stored in this data segment */
704 static void
705 regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
706 	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
707 {
708 	udb_void arg[2];
709 	arg[0] = rb_old; arg[1] = rb_new;
710 	/* walk through the structs here and put them on their respective
711 	 * relptr lists */
712 	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
713 		&regen_relptr_func, arg);
714 
715 }
716 
717 /** regenerate relptrlists in the file */
718 static void
719 regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
720 	udb_void rb_old, udb_void rb_new)
721 {
722 	udb_void at = alloc->udb->glob_data->hsize;
723 	/* clear all ptrlist start pointers in the file. */
724 	while(at < alloc->disk->nextgrow) {
725 		int exp = (int)UDB_CHUNK(at)->exp;
726 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
727 		if(exp == UDB_EXP_XL) {
728 			UDB_XL_CHUNK(at)->ptrlist = 0;
729 			at += UDB_XL_CHUNK(at)->size;
730 		} else if(tp == udb_chunk_type_free) {
731 			at += (uint64_t)1<<exp;
732 		} else { /* data chunk */
733 			UDB_CHUNK(at)->ptrlist = 0;
734 			at += (uint64_t)1<<exp;
735 		}
736 	}
737 	/* walk through all relptr structs and put on the right list. */
738 	at = alloc->udb->glob_data->hsize;
739 	while(at < alloc->disk->nextgrow) {
740 		udb_chunk_d* atp = UDB_CHUNK(at);
741 		int exp = (int)atp->exp;
742 		udb_chunk_type tp = (udb_chunk_type)atp->type;
743 		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
744 			(uint64_t)1<<exp);
745 		if(exp == UDB_EXP_XL) {
746 			assert(at != rb_old); /* should have been freed */
747 			regen_its_ptrs(base, udb, atp,
748 				((void*)atp)+sizeof(udb_xl_chunk_d),
749 				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
750 				rb_old, rb_new);
751 			at += sz;
752 		} else if(tp == udb_chunk_type_free) {
753 			at += sz;
754 		} else { /* data chunk */
755 			assert(at != rb_old); /* should have been freed */
756 			regen_its_ptrs(base, udb, atp,
757 				((void*)atp)+sizeof(udb_chunk_d),
758 				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
759 			at += sz;
760 		}
761 	}
762 }
763 
764 
765 /** mark free elements from former XL chunk space; later fixups pick that up */
766 static void
767 rb_mark_free_segs(void* base, udb_void s, uint64_t m)
768 {
769 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
770 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
771 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
772 	while(q >= s) {
773 		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
774 		UDB_CHUNK(q)->type = udb_chunk_type_free;
775 		q -= UDB_ALLOC_CHUNK_SIZE;
776 	}
777 }
778 
779 
780 /** fsck rollback or rollforward XL move results */
781 static int
782 fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
783 	uint64_t rb_size, uint64_t rb_seg)
784 {
785 
786 	if(rb_old <= rb_new)
787 		return 0; /* XL move one way */
788 	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
789 		return 0; /* not aligned */
790 	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
791 		return 0; /* not aligned */
792 	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
793 		return 0; /* not aligned */
794 	if(rb_new + rb_size <= rb_old) {
795 		/* not overlapping: resume copy */
796 		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
797 		/* and free up old piece(s) */
798 		rb_mark_free_segs(base, rb_old, rb_size);
799 	} else {
800 		/* overlapping, see what segment we stopped at
801 		 * and continue there. */
802 		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
803 		/* free up old piece(s); from the end of the moved segment,
804 		 * until the end of the old segment */
805 		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
806 			(rb_new+rb_size));
807 	}
808 	/* do not call fix_ptrs, regenptrs does the job */
809 	return 1;
810 }
811 
812 /** fsck rollback or rollforward move results */
813 static int
814 fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
815 	udb_void* make_free)
816 {
817 	if( (rb_size&(rb_size-1)) != 0)
818 		return 0; /* not powerof2 */
819 	if( (rb_old&(rb_size-1)) != 0)
820 		return 0; /* not aligned */
821 	if( (rb_new&(rb_size-1)) != 0)
822 		return 0; /* not aligned */
823 	/* resume copy */
824 	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
825 	/* do not call fix_ptrs, regenptrs does the job */
826 	/* make sure udb_old is freed */
827 	*make_free = rb_old;
828 	return 1;
829 }
830 
831 /** fsck the file and salvage, false if failed and file is useless */
832 static int
833 fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
834 {
835 	void* base = udb->base;
836 	udb_alloc_d regen;
837 	udb_void at = udb->glob_data->hsize;
838 	udb_void rb_old = udb->glob_data->rb_old;
839 	udb_void rb_new = udb->glob_data->rb_new;
840 	udb_void rb_seg = udb->glob_data->rb_seg;
841 	udb_void make_free = 0;
842 	uint64_t rb_size = udb->glob_data->rb_size;
843 	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
844 	/* walk through the file, use the exp values to see what can be
845 	 * salvaged */
846 	if(moved && rb_old && rb_new && rb_size) {
847 		if(rb_old+rb_size <= alloc->disk->nextgrow
848 			&& rb_new+rb_size <= alloc->disk->nextgrow) {
849 			/* we can use the move information to fix up the
850 			 * duplicate element (or partially moved element) */
851 			if(rb_size > 1024*1024) {
852 				/* XL chunk */
853 				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
854 					rb_size, rb_seg))
855 					return 0;
856 			} else {
857 				if(!fsck_rb(base, rb_old, rb_new, rb_size,
858 					&make_free))
859 					return 0;
860 			}
861 		}
862 	}
863 
864 	/* rebuild freelists */
865 	/* recalculate stats in alloc (except 'stat_data') */
866 	/* possibly new end 'nextgrow' value */
867 	memset(&regen, 0, sizeof(regen));
868 	regen.nextgrow = alloc->disk->nextgrow;
869 	while(at < regen.nextgrow) {
870 		/* figure out this chunk */
871 		int exp = (int)UDB_CHUNK(at)->exp;
872 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
873 		/* consistency check possible here with end-exp */
874 		if(tp == udb_chunk_type_free || at == make_free) {
875 			at = regen_free(base, at, exp, &regen);
876 			if(!at) return 0;
877 		} else if(exp == UDB_EXP_XL) {
878 			/* allocated data of XL size */
879 			at = regen_xl(base, at, &regen);
880 			if(!at) return 0;
881 		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
882 			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
883 			/* allocated data */
884 			at = regen_data(base, at, exp, &regen);
885 			if(!at) return 0;
886 		} else {
887 			/* garbage; this must be EOF then */
888 			regen.nextgrow = at;
889 			break;
890 		}
891 	}
892 	*alloc->disk = regen;
893 
894 	/* rebuild relptr lists */
895 	regen_ptrlist(base, udb, alloc, rb_old, rb_new);
896 
897 	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
898 		udb->fname);
899 	udb->glob_data->rb_old = 0;
900 	udb->glob_data->rb_new = 0;
901 	udb->glob_data->rb_size = 0;
902 	udb->glob_data->dirty_alloc = udb_dirty_clean;
903 	udb_base_sync(udb, 1);
904 	return 1;
905 }
906 
907 
908 udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
909 {
910 	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
911 	if(!alloc)
912 		return NULL;
913 	alloc->udb = udb;
914 	alloc->disk = disk;
915 	/* see if committed but uncompleted actions need to be done */
916 	/* preserves the alloc state */
917 	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
918 		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
919 			if(fsck_fsize(udb, alloc))
920 				return alloc;
921 		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
922 			if(fsck_file(udb, alloc, 0))
923 				return alloc;
924 		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
925 			if(fsck_file(udb, alloc, 1))
926 				return alloc;
927 		}
928 		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
929 			(int)udb->glob_data->dirty_alloc);
930 		free(alloc);
931 		return NULL;
932 	}
933 	return alloc;
934 }
935 
936 void udb_alloc_delete(udb_alloc* alloc)
937 {
938 	if(!alloc) return;
939 	free(alloc);
940 }
941 
942 /** unlink this element from its freelist */
943 static void
944 udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
945 {
946 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
947 	assert(chunk);
948 	/* chunk is a free chunk */
949 	assert(fp->exp == (uint8_t)exp);
950 	assert(fp->type == udb_chunk_type_free);
951 	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
952 	/* and thus freelist not empty */
953 	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
954 	/* unlink */
955 	if(fp->prev)
956 		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
957 	else	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
958 	if(fp->next)
959 		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
960 }
961 
962 /** pop first element off freelist, list may not be empty */
963 static udb_void
964 udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
965 {
966 	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
967 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
968 	assert(f);
969 	assert(fp->exp == (uint8_t)exp);
970 	assert(fp->type == udb_chunk_type_free);
971 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
972 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
973 	if(fp->next) {
974 		UDB_FREE_CHUNK(fp->next)->prev = 0;
975 	}
976 	return f;
977 }
978 
979 /** push new element onto freelist */
980 static void
981 udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
982 {
983 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
984 	assert(f);
985 	fp->exp = (uint8_t)exp;
986 	fp->type = udb_chunk_type_free;
987 	fp->flags = 0;
988 	fp->prev = 0;
989 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
990 	if(fp->next)
991 		UDB_FREE_CHUNK(fp->next)->prev = f;
992 	chunk_set_last(base, f, exp, (uint8_t)exp);
993 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
994 }
995 
996 /** push new element onto freelist - do not initialize the elt */
997 static void
998 udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
999 {
1000 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1001 	assert(f);
1002 	assert(fp->exp == (uint8_t)exp);
1003 	assert(fp->type == udb_chunk_type_free);
1004 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1005 	fp->prev = 0;
1006 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1007 	if(fp->next)
1008 		UDB_FREE_CHUNK(fp->next)->prev = f;
1009 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1010 }
1011 
1012 /** add free chunks at end until specified alignment occurs */
1013 static void
1014 grow_align(void* base, udb_alloc* alloc, uint64_t esz)
1015 {
1016 	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
1017 		/* the nextgrow is not a whole multiple of esz. */
1018 		/* grow a free chunk of max allowed size */
1019 		int fexp = udb_exp_offset(alloc->disk->nextgrow);
1020 		uint64_t fsz = (uint64_t)1<<fexp;
1021 		udb_void f = alloc->disk->nextgrow;
1022 		udb_void fn = alloc->disk->nextgrow+fsz;
1023 		assert(fn <= alloc->udb->base_size);
1024 		alloc->disk->stat_free += fsz;
1025 		udb_alloc_push_fl(base, alloc, f, fexp);
1026 		/* now increase nextgrow to commit that free chunk */
1027 		alloc->disk->nextgrow = fn;
1028 	}
1029 }
1030 
1031 /** append chunks at end of memory space to get size exp, return dataptr */
1032 static udb_void
1033 grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
1034 {
1035 	uint64_t esz = (uint64_t)1<<exp;
1036 	udb_void ret;
1037 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1038 	grow_align(base, alloc, esz);
1039 	/* free chunks are grown, grow the one we want to use */
1040 	ret = alloc->disk->nextgrow;
1041 	/* take a new alloced chunk into use */
1042 	UDB_CHUNK(ret)->exp = (uint8_t)exp;
1043 	UDB_CHUNK(ret)->flags = 0;
1044 	UDB_CHUNK(ret)->ptrlist = 0;
1045 	UDB_CHUNK(ret)->type = udb_chunk_type_data;
1046 	/* store last octet */
1047 	chunk_set_last(base, ret, exp, (uint8_t)exp);
1048 	/* update stats */
1049 	alloc->disk->stat_alloc += esz;
1050 	alloc->disk->stat_data += sz;
1051 	/* now increase nextgrow to commit this newly allocated chunk */
1052 	alloc->disk->nextgrow += esz;
1053 	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
1054 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1055 	return ret + sizeof(udb_chunk_d); /* ptr to data */
1056 }
1057 
1058 /** calculate how much space is necessary to grow for this exp */
1059 static uint64_t
1060 grow_end_calc(udb_alloc* alloc, int exp)
1061 {
1062 	uint64_t sz = (uint64_t)1<<exp;
1063 	uint64_t ng = alloc->disk->nextgrow;
1064 	uint64_t res;
1065 	/* if nextgrow is already a whole multiple of 2**exp, no extra growth needed, only the size */
1066 	if( (ng & (sz-1)) == 0) {
1067 		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
1068 		return ng+sz; /* must grow exactly 2**exp */
1069 	}
1070 	/* grow until 2**expness and then we need 2**exp as well */
1071 	/* so, round ng down to whole sz (basically  ng-ng%sz, or ng/sz*sz)
1072 	 * and then add the sz twice (go up to whole sz, and to allocate) */
1073 	res = (ng & ~(sz-1)) + 2*sz;
1074 	return res;
1075 }
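
/* Worked example (numbers for illustration only): with exp=16 (sz=0x10000)
 * and nextgrow=0x12345, nextgrow is rounded down to 0x10000 and 2*sz is
 * added, giving 0x30000; if nextgrow were already 0x20000, the aligned case
 * applies and the result is simply 0x20000+0x10000 = 0x30000. */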
1076 
1077 /** see if we need to grow more than specified to enable sustained growth */
1078 static uint64_t
1079 grow_extra_check(udb_alloc* alloc, uint64_t ge)
1080 {
1081 	const uint64_t mb = 1024*1024;
1082 	uint64_t bsz = alloc->udb->base_size;
1083 	if(bsz <= mb) {
1084 		/* below 1 Mb, double sizes for exponential growth */
1085 		/* takes about 15 times to grow to 1Mb */
1086 		if(ge < bsz*2)
1087 			return bsz*2;
1088 	} else {
1089 		uint64_t gnow = ge - bsz;
1090 		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
1091 		 * in whole megabytes rounded up. */
1092 		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
1093 		if(gnow < want)
1094 			return bsz + want;
1095 	}
1096 	return ge;
1097 }
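
/* Illustrative growth targets under this policy: a file of 512 kb asked to
 * grow a little is grown to at least 1 mb (doubling), while a 10 mb file is
 * grown by at least 2 mb (the 12.5% rule in whole mbs), so to at least 12 mb. */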
1098 
1099 /** see if free space is enough to warrant shrink (while file is open) */
1100 static int
1101 enough_free(udb_alloc* alloc)
1102 {
1103 	if(alloc->udb->base_size <= 2*1024*1024) {
1104 		/* below 1 Mb the file grows by doubling (so up to 2 mb);
1105 		 * do not shrink unless usage drops to 1/3 of the size */
1106 		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
1107 			return 1;
1108 	} else {
1109 		/* grown 12.5%, shrink 25% if possible, at least one mb */
1110 		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
1111 		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
1112 		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
1113 			|| alloc->udb->base_size < 4*1024*1024))
1114 			return 1;
1115 	}
1116 	return 0;
1117 }
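
/* Shrink policy example (illustrative numbers): a 9 mb file whose used part
 * (nextgrow) is 6 mb has 3 mb of free tail; that is at least 1 mb and
 * 3*4 >= 9, so the file qualifies for shrinking. */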
1118 
1119 /** grow space for a chunk of 2**exp and return dataptr */
1120 static udb_void
1121 udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
1122 {
1123 	/* commit the grow action
1124 	 * - the file grow only changes filesize, but not the nextgrow.
1125 	 * - taking space after nextgrow into use (as free space),
1126 	 *   is like free-ing a chunk (one at a time).
1127 	 * - and the last chunk taken into use is like alloc.
1128 	 */
1129 	/* predict how much free space is needed for this */
1130 	uint64_t grow_end = grow_end_calc(alloc, exp);
1131 	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
1132 	if(grow_end <= alloc->udb->base_size) {
1133 		/* we can do this with the available space */
1134 		return grow_chunks(base, alloc, sz, exp);
1135 	}
1136 	/* we have to grow the file, re-mmap */
1137 	/* see if we need to grow a little more, to avoid endless grow
1138 	 * efforts on adding data */
1139 	grow_end = grow_extra_check(alloc, grow_end);
1140 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1141 		return 0; /* mmap or write failed (disk or mem full) */
1142 	}
1143 	/* we have enough space now */
1144 	assert(grow_end <= alloc->udb->base_size);
1145 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1146 	return grow_chunks(base, alloc, sz, exp);
1147 }
1148 
1149 /** take XL allocation into use at end of file, return dataptr */
1150 static udb_void
1151 grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
1152 {
1153 	udb_void ret;
1154 	udb_xl_chunk_d* p;
1155 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1156 
1157 	/* align growth to whole mbs */
1158 	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);
1159 
1160 	/* grow XL segment */
1161 	ret = alloc->disk->nextgrow;
1162 	p = UDB_XL_CHUNK(ret);
1163 	p->exp = UDB_EXP_XL;
1164 	p->size = xlsz;
1165 	p->flags = 0;
1166 	p->ptrlist = 0;
1167 	p->type = udb_chunk_type_data;
1168 
1169 	/* also put size and marker at end for compaction */
1170 	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
1171 	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;
1172 
1173 	/* stats */
1174 	alloc->disk->stat_data += sz;
1175 	alloc->disk->stat_alloc += xlsz;
1176 	/* now increase the nextgrow to commit this xl chunk */
1177 	alloc->disk->nextgrow += xlsz;
1178 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1179 	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
1180 }
1181 
1182 /** make space for XL allocation */
1183 static udb_void
1184 udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
1185 {
1186 	/* allocate whole mbs of space, at end of space */
1187 	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
1188 	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
1189 	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
1190 	assert(need >= asz);
1191 	if(grow_end <= alloc->udb->base_size) {
1192 		/* can do this in available space */
1193 		return grow_xl(base, alloc, need, sz);
1194 	}
1195 	/* have to grow file and re-mmap */
1196 	grow_end = grow_extra_check(alloc, grow_end);
1197 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1198 		return 0; /* mmap or write failed (disk or mem full) */
1199 	}
1200 	/* we have enough space now */
1201 	assert(grow_end <= alloc->udb->base_size);
1202 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1203 	return grow_xl(base, alloc, need, sz);
1204 }
1205 
1206 /** divide big(2**e2) into pieces so 2**exp fits */
1207 static udb_void
1208 udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
1209 	int exp)
1210 {
1211 	int e = e2;
1212 	uint64_t sz = (uint64_t)1<<e2;
1213 	assert(big && e2 > exp);
1214 	/* so the returned piece to use is the first piece,
1215 	 * offload the later half until it fits */
1216 	do {
1217 		sz >>= 1; /* divide size of big by two */
1218 		e--;      /* that means its exp is one smaller */
1219 		udb_alloc_push_fl(base, alloc, big+sz, e);
1220 	} while(e != exp);
1221 	/* exit loop when last pushed is same size as what we want */
1222 	return big;
1223 }
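
/* Illustrative split: a big chunk of 2**14 bytes asked to fit a 2**11
 * request pushes free chunks at big+2**13 (exp 13), big+2**12 (exp 12)
 * and big+2**11 (exp 11) onto the freelists, and returns big itself as
 * the 2**11 piece to use. */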
1224 
1225 /** returns the exponent size of the chunk needed for data sz */
1226 static int
1227 udb_alloc_exp_needed(size_t sz)
1228 {
1229 	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
1230 	if(asz > UDB_ALLOC_CHUNK_SIZE) {
1231 		return UDB_EXP_XL;
1232 	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
1233 		return UDB_ALLOC_CHUNK_MINEXP;
1234 	}
1235 	return udb_exp_size(asz);
1236 }
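
/* For example, if sizeof(udb_chunk_d) were 16 (a value assumed here only
 * for illustration), a 100 byte request would need 117 bytes including the
 * header and last octet, and thus a 2**7 = 128 byte chunk, so exp 7
 * (provided that is at least UDB_ALLOC_CHUNK_MINEXP). */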
1237 
1238 udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
1239 {
1240 	void* base = alloc->udb->base;
1241 	/* calculate actual allocation size */
1242 	int e2, exp = udb_alloc_exp_needed(sz);
1243 	if(exp == UDB_EXP_XL)
1244 		return udb_alloc_xl_space(base, alloc, sz);
1245 	/* see if there is a free chunk of that size exactly */
1246 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
1247 		/* snip from freelist, udb_chunk_d */
1248 		udb_void ret;
1249 		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1250 		ret = udb_alloc_pop_fl(base, alloc, exp);
1251 		/* use it - size octets already OK */
1252 		UDB_CHUNK(ret)->flags = 0;
1253 		UDB_CHUNK(ret)->ptrlist = 0;
1254 		UDB_CHUNK(ret)->type = udb_chunk_type_data;
1255 		/* update stats */
1256 		alloc->disk->stat_data += sz;
1257 		alloc->disk->stat_alloc += (1<<exp);
1258 		assert(alloc->disk->stat_free >= (1u<<exp));
1259 		alloc->disk->stat_free -= (1<<exp);
1260 		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1261 		return ret + sizeof(udb_chunk_d); /* ptr to data */
1262 	}
1263 	/* see if we can subdivide a larger chunk */
1264 	for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++)
1265 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1266 			udb_void big, ret; /* udb_chunk_d */
1267 			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1268 			big = udb_alloc_pop_fl(base, alloc, e2);
1269 			/* push other parts onto freelists (needs inited) */
1270 			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
1271 			/* use final part (needs inited) */
1272 			UDB_CHUNK(ret)->exp = (uint8_t)exp;
1273 			/* if we stop (crash) here, the new exp makes this parse as a smaller free chunk */
1274 			UDB_CHUNK(ret)->flags = 0;
1275 			UDB_CHUNK(ret)->ptrlist = 0;
1276 			/* set type to commit data chunk */
1277 			UDB_CHUNK(ret)->type = udb_chunk_type_data;
1278 			/* store last octet */
1279 			chunk_set_last(base, ret, exp, (uint8_t)exp);
1280 			/* update stats */
1281 			alloc->disk->stat_data += sz;
1282 			alloc->disk->stat_alloc += (1<<exp);
1283 			assert(alloc->disk->stat_free >= (1u<<exp));
1284 			alloc->disk->stat_free -= (1<<exp);
1285 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1286 			return ret + sizeof(udb_chunk_d); /* ptr to data */
1287 		}
1288 	/* we need to grow an extra chunk */
1289 	return udb_alloc_grow_space(base, alloc, sz, exp);
1290 }
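
/* Minimal usage sketch (hedged; 'buf' and 'len' stand for caller data):
 *   udb_void r = udb_alloc_space(alloc, len);
 *   if(r) memcpy(UDB_REL(alloc->udb->base, r), buf, len);
 * which is essentially what udb_alloc_init further below does. */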
1291 
1292 /** see if there is free space to allocate a chunk into */
1293 static int
1294 have_free_for(udb_alloc* alloc, int exp)
1295 {
1296 	int e2;
1297 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
1298 		return exp;
1299 	for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++)
1300 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1301 			return e2;
1302 		}
1303 	return 0;
1304 }
1305 
1306 /** fix relptr prev and next for moved relptr structures */
1307 static void
1308 chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
1309 {
1310 	udb_void* data = (udb_void*)arg;
1311 	udb_void r;
1312 	if(!rp->data)
1313 		return;
1314 	r = UDB_SYSTOREL(base, rp);
1315 	if(rp->next)
1316 		UDB_REL_PTR(rp->next)->prev = r;
1317 	if(rp->prev)
1318 		UDB_REL_PTR(rp->prev)->next = r;
1319 	else	{
1320 		/* if this is a pointer to its own chunk, fix it up;
1321 		 * the data ptr gets set by relptr_edit later. */
1322 		if(rp->data == data[0])
1323 			UDB_CHUNK(data[1])->ptrlist = r;
1324 		else	UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
1325 	}
1326 }
1327 
1328 /** fix pointers from and to a moved chunk */
1329 static void
1330 chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
1331 	uint64_t dsz, udb_void olddata)
1332 {
1333 	udb_void d[2];
1334 	d[0] = olddata;
1335 	d[1] = data;
1336 	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
1337 		dsz, &chunk_fix_ptr_each, d);
1338 	udb_rel_ptr_edit(base, cp->ptrlist, data);
1339 	udb_base_ram_ptr_edit(udb, olddata, data);
1340 }
1341 
1342 /** move an allocated chunk to use a free chunk */
1343 static void
1344 move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
1345 	int e2)
1346 {
1347 	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
1348 	udb_chunk_d* rp;
1349 	udb_chunk_d* fp;
1350 	if(exp != e2) {
1351 		/* it is bigger, subdivide it */
1352 		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
1353 	}
1354 	assert(res != f);
1355 	/* setup rollback information */
1356 	alloc->udb->glob_data->rb_old = f;
1357 	alloc->udb->glob_data->rb_new = res;
1358 	alloc->udb->glob_data->rb_size = esz;
1359 	/* take the res, exp into use */
1360 	rp = UDB_CHUNK(res);
1361 	fp = UDB_CHUNK(f);
1362 	/* copy over the data */
1363 	memcpy(rp, fp, esz);
1364 	/* adjust rel ptrs */
1365 	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
1366 		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));
1367 
1368 	/* do not freeup the fp; caller does that */
1369 }
1370 
1371 /** unlink several free elements to overwrite with xl chunk */
1372 static void
1373 free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
1374 {
1375 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
1376 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
1377 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
1378 	while(q >= s) {
1379 		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
1380 		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
1381 		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
1382 		q -= UDB_ALLOC_CHUNK_SIZE;
1383 	}
1384 }
1385 
1386 /** move an XL chunk, and keep track of segments for rollback */
1387 static void
1388 move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
1389 	uint64_t sz, uint64_t startseg)
1390 {
1391 	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1392 	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
1393 	uint64_t amount = xl - n;
1394 	assert(n < xl); /* move to compact */
1395 
1396 	/* setup move rollback */
1397 	udb->glob_data->rb_old = xl;
1398 	udb->glob_data->rb_new = n;
1399 	udb->glob_data->rb_size = sz;
1400 
1401 	/* is it overlapping? */
1402 	if(sz <= amount) {
1403 		memcpy(np, xlp, sz);
1404 	} else {
1405 		/* move and commit per 1M segment to avoid data loss */
1406 		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
1407 		for(seg = startseg; seg<maxseg; seg++) {
1408 			udb->glob_data->rb_seg = seg;
1409 			memcpy(UDB_REL(base, n+seg*UDB_ALLOC_CHUNK_SIZE),
1410 				UDB_REL(base, xl+seg*UDB_ALLOC_CHUNK_SIZE),
1411 				UDB_ALLOC_CHUNK_SIZE);
1412 		}
1413 
1414 	}
1415 }
1416 
1417 /** move list of XL chunks to the front by the shift amount */
1418 static void
1419 move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
1420 	uint64_t amount)
1421 {
1422 	udb_void xl = xl_start;
1423 	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1424 	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1425 	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1426 	while(xl < xl_start+xl_sz) {
1427 		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1428 		udb_void n = xl-amount;
1429 		uint64_t sz = xlp->size;
1430 		assert(xlp->exp == UDB_EXP_XL);
1431 		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
1432 		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
1433 			n+sizeof(udb_xl_chunk_d),
1434 			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
1435 			xl+sizeof(udb_xl_chunk_d));
1436 	}
1437 	alloc->disk->stat_free -= amount;
1438 	alloc->disk->nextgrow -= amount;
1439 	alloc->udb->glob_data->rb_old = 0;
1440 	alloc->udb->glob_data->rb_new = 0;
1441 	alloc->udb->glob_data->rb_size = 0;
1442 }
1443 
1444 /** see if free chunk can coagulate with another chunk, return other chunk */
1445 static udb_void
1446 coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
1447 	uint64_t esz)
1448 {
1449 	udb_void other = f^esz;
1450 	if(exp == UDB_ALLOC_CHUNKS_MAX)
1451 		return 0; /* no further merges */
1452 	if(other >= alloc->udb->base_size)
1453 		return 0; /* not allocated */
1454 	if(other >= alloc->disk->nextgrow)
1455 		return 0; /* not in use */
1456 	if(other < alloc->udb->glob_data->hsize)
1457 		return 0; /* cannot merge with header */
1458 		/* the header is also protected by the special exp marker */
1459 	/* see if the other chunk is a free chunk */
1460 
1461 	/* check closest marker to avoid large memory churn */
1462 	/* and also it makes XL allocations and header special markers work */
1463 	if(f > other) {
1464 		assert(f > 1); /* this is certain because of header */
1465 		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
1466 			/* can do it if the other part is a free chunk */
1467 			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
1468 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1469 				return other;
1470 		}
1471 	} else {
1472 		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
1473 			/* can do it if the other part is a free chunk */
1474 			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
1475 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1476 				return other;
1477 		}
1478 	}
1479 	return 0;
1480 }
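
/* Buddy lookup example (offsets for illustration only): a free chunk at
 * offset 0x3000 with esz 0x1000 has its buddy at 0x3000^0x1000 = 0x2000;
 * when both are free they merge into one 0x2000-sized chunk at 0x2000. */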
1481 
1482 /** coagulate and then add new free segment, return final free segment */
1483 static udb_void
1484 coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
1485 	uint64_t esz)
1486 {
1487 	/* new free chunk here, attempt coagulate */
1488 	udb_void other;
1489 	while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
1490 		/* unlink that other chunk */
1491 		udb_alloc_unlink_fl(base, alloc, other, exp);
1492 		/* merge up */
1493 		if(other < last)
1494 			last = other;
1495 		exp++;
1496 		esz <<= 1;
1497 	}
1498 	/* free the final segment */
1499 	udb_alloc_push_fl(base, alloc, last, exp);
1500 	return last;
1501 }
1502 
1503 /** attempt to compact the data and move free space to the end */
1504 static int
1505 udb_alloc_compact(void* base, udb_alloc* alloc)
1506 {
1507 	udb_void last;
1508 	int exp, e2;
1509 	uint64_t esz;
1510 	uint64_t at = alloc->disk->nextgrow;
1511 	udb_void xl_start = 0;
1512 	uint64_t xl_sz = 0;
1513 	while(at > alloc->udb->glob_data->hsize) {
1514 		/* grab last entry */
1515 		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
1516 		if(exp == UDB_EXP_XL) {
1517 			/* for XL chunks:
1518 			 * - inspect the size of the XLchunklist at end
1519 			 * - attempt to compact in front of the XLchunklist
1520 			 */
1521 			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
1522 				at-sizeof(uint64_t)*2));
1523 			udb_void xl = at-xlsz;
1524 #ifndef NDEBUG
1525 			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1526 			assert(xlp->exp == UDB_EXP_XL);
1527 			assert(xlp->type != udb_chunk_type_free);
1528 #endif
1529 			/* got the segment, add it to the xl chunk list */
1530 			if(xl_start != 0 && xl+xlsz != xl_start) {
1531 				/* nonadjoining XL part, but they are aligned,
1532 				 * so the space in between is whole Mbs,
1533 				 * shift the later part(s) and continue */
1534 				uint64_t m = xl_start - (xl+xlsz);
1535 				assert(xl_start > xl+xlsz);
1536 				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1537 				free_xl_space(base, alloc, xl+xlsz, m);
1538 				move_xl_list(base, alloc, xl_start, xl_sz, m);
1539 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1540 			}
1541 			xl_start = xl;
1542 			xl_sz += xlsz;
1543 			at = xl;
1544 			continue;
1545 			/* end of XL if */
1546 		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
1547 			|| exp > UDB_ALLOC_CHUNKS_MAX)
1548 			break; /* special chunk or garbage */
1549 		esz = (uint64_t)1<<exp;
1550 		last = at - esz;
1551 		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
1552 		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
1553 			/* if xl_start is set, continue looking to move stuff,
1554 			 * but do not unlink this free segment */
1555 			if(!xl_start) {
1556 				/* it is a free chunk, remove it */
1557 				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1558 				udb_alloc_unlink_fl(base, alloc, last, exp);
1559 				alloc->disk->stat_free -= esz;
1560 				alloc->disk->nextgrow = last;
1561 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1562 				/* and continue at this point */
1563 			}
1564 			at = last;
1565 		} else if( (e2=have_free_for(alloc, exp)) ) {
1566 			/* last entry can be allocated in free chunks
1567 			 * move it to its new position, adjust rel_ptrs */
1568 			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1569 			move_chunk(base, alloc, last, exp, esz, e2);
1570 			if(xl_start) {
1571 				last = coagulate_and_push(base, alloc,
1572 					last, exp, esz);
1573 			} else {
1574 				/* shorten usage */
1575 				alloc->disk->stat_free -= esz;
1576 				alloc->disk->nextgrow = last;
1577 			}
1578 			alloc->udb->glob_data->rb_old = 0;
1579 			alloc->udb->glob_data->rb_new = 0;
1580 			alloc->udb->glob_data->rb_size = 0;
1581 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1582 			/* and continue in front of it */
1583 			at = last;
1584 		} else {
1585 			/* cannot compact this block, stop compacting */
1586 			break;
1587 		}
1588 		/* if that worked, repeat it */
1589 	}
1590 	/* if we passed xl chunks, see if XL-chunklist can move */
1591 	if(xl_start) {
1592 		/* calculate free space in front of the XLchunklist. */
1593 		/* has to be whole mbs of free space */
1594 		/* if so, we can move the XL chunks.  Move them all back
1595 		 * by the new free space. */
1596 		/* this compacts very well, but the XL chunks can be moved
1597 		 * multiple times; worst case for every mb freed a huge sized
1598 		 * xlchunklist gets moved. */
1599 		/* free space must be, since aligned and coagulated, in
1600 		 * chunks of a whole MB */
1601 		udb_void at = xl_start;
1602 		uint64_t m = 0;
1603 		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
1604 			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
1605 			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
1606 				break;
1607 			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
1608 			m += UDB_ALLOC_CHUNK_SIZE;
1609 			at = chunk;
1610 		}
1611 		if(m != 0) {
1612 			assert(at+m == xl_start);
1613 			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1614 			free_xl_space(base, alloc, at, m);
1615 			move_xl_list(base, alloc, xl_start, xl_sz, m);
1616 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1617 		}
1618 	}
1619 
1620 	/* if enough free, shrink the file; re-mmap */
1621 	if(enough_free(alloc)) {
1622 		uint64_t nsize = alloc->disk->nextgrow;
1623 		udb_base_shrink(alloc->udb, nsize);
1624 		if(!udb_base_remap(alloc->udb, alloc, nsize))
1625 			return 0;
1626 	}
1627 	return 1;
1628 }
1629 
1630 #ifdef UDB_CHECK
1631 /** check that rptrs are really zero before free */
1632 void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
1633 {
1634 	(void)base;
1635 	(void)arg;
1636 	assert(p->data == 0);
1637 }
1638 #endif /* UDB_CHECK */
1639 
1640 /** free XL chunk as multiples of CHUNK_SIZE free segments */
1641 static void
1642 udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
1643 	size_t sz)
1644 {
1645 	uint64_t xlsz = fp->size;
1646 	uint64_t c;
1647 	/* lightweight check for buffer overflow in xl data */
1648 	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
1649 	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
1650 	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
1651 	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1652 #ifdef UDB_CHECK
1653 	/* check that relptrs in this chunk have been zeroed */
1654 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1655 		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
1656 		&udb_check_rptr_zero, NULL);
1657 #endif
1658 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1659 	/* update stats */
1660 	alloc->disk->stat_data -= sz;
1661 	alloc->disk->stat_alloc -= xlsz;
1662 	alloc->disk->stat_free += xlsz;
1663 	/* walk in reverse, so the front blocks go first on the list */
1664 	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
1665 	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE*/
1666 	assert(f >= UDB_ALLOC_CHUNK_SIZE);
1667 	while(c >= f) {
1668 		/* free a block of CHUNK_SIZE (1 Mb) */
1669 		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
1670 		c -= UDB_ALLOC_CHUNK_SIZE;
1671 	}
1672 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1673 }
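
/* Example of the effect (illustrative): an XL allocation of 3 mb is given
 * back as three 1 mb free chunks; because the loop walks backwards and
 * udb_alloc_push_fl pushes to the head of the list, the front-most 1 mb
 * block ends up first on the freelist. */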
1674 
1675 int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
1676 {
1677 	void* base;
1678 	/* lookup chunk ptr */
1679 	udb_void f;
1680 	udb_chunk_d* fp;
1681 	uint64_t esz;
1682 	int exp;
1683 	udb_void other;
1684 	int coagulated = 0;
1685 	if(!r)
1686 		return 1; /* free(NULL) does nothing */
1687 
1688 	/* lookup size of chunk */
1689 	base = alloc->udb->base;
1690 	/* fails for XL blocks */
1691 	f = chunk_from_dataptr(r);
1692 	fp = UDB_CHUNK(f);
1693 	assert(fp->type != udb_chunk_type_free);
1694 
1695 	/* if it still has a ptrlist, that is trouble: the relptrs into this
1696 	 * chunk were not properly cleaned up (although for a wholesale
1697 	 * delete one could imagine that it does not matter) */
1698 	assert(fp->ptrlist == 0);
1699 
1700 	/* set ptrlist to 0 so the relptr code cannot use it any more; robustness. */
1701 	fp->ptrlist = 0;
1702 
1703 	if(fp->exp == UDB_EXP_XL) {
1704 		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
1705 		/* compact */
1706 		return udb_alloc_compact(base, alloc);
1707 	}
1708 	/* it is a regular chunk of 2**exp size */
1709 	exp = (int)fp->exp;
1710 	esz = (uint64_t)1<<exp;
1711 	/* light check for e.g. buffer overflow of the data */
1712 	assert(sz < esz);
1713 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1714 #ifdef UDB_CHECK
1715 	/* check that relptrs in this chunk have been zeroed */
1716 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1717 		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
1718 #endif
1719 
1720 	/* update the stats */
1721 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1722 	alloc->disk->stat_data -= sz;
1723 	alloc->disk->stat_free += esz;
1724 	alloc->disk->stat_alloc -= esz;
1725 
1726 	/* if it can be merged with other free chunks, do so */
1727 	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
1728 		coagulated = 1;
1729 		/* unlink that other chunk and expand it (it has same size) */
1730 		udb_alloc_unlink_fl(base, alloc, other, exp);
1731 		/* merge up */
1732 		if(other < f)
1733 			f = other;
1734 		exp++;
1735 		esz <<= 1;
1736 	}
1737 	if(coagulated) {
1738 		/* put big free chunk into freelist, and init it */
1739 		udb_alloc_push_fl(base, alloc, f, exp);
1740 	} else {
1741 		/* we do not need to touch the last-exp-byte, which may save
1742 		 * a reference to that page of memory */
1743 		fp->type = udb_chunk_type_free;
1744 		fp->flags = 0;
1745 		udb_alloc_push_fl_noinit(base, alloc, f, exp);
1746 	}
1747 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1748 	/* compact */
1749 	return udb_alloc_compact(base, alloc);
1750 }
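
#if 0
/* Illustrative usage sketch, not part of the original source: a plain
 * allocate/use/free round trip.  udb_alloc_free() returns 0 only when the
 * compaction step afterwards fails to remap the file. */
static int
example_alloc_free_roundtrip(udb_base* udb)
{
	udb_void d = udb_alloc_space(udb->alloc, 100);
	if(!d)
		return 0;
	/* at most 100 octets may be written at UDB_REL(udb->base, d) */
	memset(UDB_REL(udb->base, d), 0, 100);
	return udb_alloc_free(udb->alloc, d, 100);
}
#endif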
1751 
1752 udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
1753 {
1754 	/* could perhaps be faster, e.g. when the space has just been grown? */
1755 	udb_void r = udb_alloc_space(alloc, sz);
1756 	if(!r) return r;
1757 	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
1758 	return r;
1759 }
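
#if 0
/* Illustrative sketch: copy an existing C string into the database and
 * return the offset of the copy; example_store_string is hypothetical and
 * not part of udb.h. */
static udb_void
example_store_string(udb_alloc* alloc, const char* s)
{
	return udb_alloc_init(alloc, (void*)s, strlen(s)+1);
}
#endif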
1760 
1761 udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
1762 {
1763 	void* base = alloc->udb->base;
1764 	udb_void c, n, newd;
1765 	udb_chunk_d* cp, *np;
1766 	uint64_t avail;
1767 	uint8_t cp_type;
1768 	/* emulate the POSIX realloc semantics for r == 0 and sz == 0 */
1769 	if(r == 0)
1770 		return udb_alloc_space(alloc, sz);
1771 	if(sz == 0) {
1772 		if(!udb_alloc_free(alloc, r, osz))
1773 			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1774 		return 0;
1775 	}
1776 	c = chunk_from_dataptr(r);
1777 	cp = UDB_CHUNK(c);
1778 	cp_type = cp->type;
1779 	if(cp->exp == UDB_EXP_XL) {
1780 		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
1781 			- sizeof(uint64_t)*2;
1782 	} else {
1783 		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
1784 	}
1785 	if(sz <= avail)
1786 		return r;
1787 	/* reallocate it, and copy */
1788 	newd = udb_alloc_space(alloc, sz);
1789 	if(!newd) return 0;
1790 	/* re-base after alloc, since re-mmap may have happened */
1791 	base = alloc->udb->base;
1792 	cp = NULL; /* may be invalid now, robustness */
1793 	n = chunk_from_dataptr(newd);
1794 	np = UDB_CHUNK(n);
1795 	np->type = cp_type;
1796 	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
1797 	/* fixup ptrs */
1798 	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);
1799 
1800 	if(!udb_alloc_free(alloc, r, osz))
1801 		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1802 	return newd;
1803 }
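
#if 0
/* Illustrative sketch of the realloc semantics above: r == 0 behaves like
 * an allocation, sz == 0 behaves like a free, and if the new size still
 * fits in the current chunk the same offset is returned.  The helper name
 * is hypothetical. */
static udb_void
example_grow_region(udb_alloc* alloc, udb_void r, size_t osz, size_t nsz)
{
	udb_void n = udb_alloc_realloc(alloc, r, osz, nsz);
	if(!n && nsz != 0)
		return r; /* grow failed; the old region is still intact */
	return n;
}
#endif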
1804 
1805 int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
1806 {
1807 	const uint64_t mb = 1024*1024;
1808 	int exp = udb_alloc_exp_needed(sz);
1809 	uint64_t esz;
1810 	uint64_t want;
1811 	if(exp == UDB_EXP_XL)
1812 		esz = (sz&(mb-1))+mb;
1813 	else	esz = (uint64_t)1<<exp;
1814 	/* we need grow_end_calc to take into account alignment */
1815 	want = grow_end_calc(alloc, exp) + esz*(num-1);
1816 	assert(want >= alloc->udb->base_size);
1817 	if(!udb_base_grow_and_remap(alloc->udb, want)) {
1818 		log_msg(LOG_ERR, "failed to grow the specified amount");
1819 		return 0;
1820 	}
1821 	return 1;
1822 }
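
/* Example (illustration only): udb_alloc_grow(alloc, 100, 50) grows and
 * re-mmaps the file so that roughly 50 upcoming allocations of 100 octets
 * each (50 chunks of 1<<udb_alloc_exp_needed(100) octets) fit without
 * another grow. */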
1823 
1824 void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
1825 {
1826 	void* base = alloc->udb->base;
1827 	udb_void f = chunk_from_dataptr(r);
1828 	udb_chunk_d* fp = UDB_CHUNK(f);
1829 	/* not the 'free' type; that must be set by the allocation routines */
1830 	assert(fp->type != udb_chunk_type_free);
1831 	assert(tp != udb_chunk_type_free);
1832 	fp->type = tp;
1833 }
1834 
1835 int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
1836 {
1837 	/* pointers are not valid before the header size or after the
1838 	 * used region of the mmap */
1839 	return ( (to+destsize) <= udb->base_size &&
1840 		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
1841 		(to+destsize) <= udb->alloc->disk->nextgrow);
1842 }
1843 
1844 int udb_valid_dataptr(udb_base* udb, udb_void to)
1845 {
1846 	void* base = udb->base;
1847 	udb_void ch;
1848 	int exp;
1849 	uint64_t esz;
1850 	/* our data chunks are aligned and at least 8 bytes */
1851 	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
1852 		return 0;
1853 	/* get the chunk pointer */
1854 	ch = chunk_from_dataptr(to);
1855 	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
1856 		return 0;
1857 	/* check its size */
1858 	exp = UDB_CHUNK(ch)->exp;
1859 	if(exp == UDB_EXP_XL) {
1860 		/* check XL chunk */
1861 		uint64_t xlsz;
1862 		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
1863 			return 0;
1864 		xlsz = UDB_XL_CHUNK(ch)->size;
1865 		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
1866 			return 0;
1867 		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
1868 			return 0;
1869 		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
1870 			!= xlsz)
1871 			return 0;
1872 		return 1;
1873 	}
1874 	/* check if regular chunk has matching end byte */
1875 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
1876 		return 0; /* cannot be a valid chunk */
1877 	esz = 1<<exp;
1878 	if(!udb_valid_offset(udb, ch+esz-1, 1))
1879 		return 0;
1880 	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
1881 		return 0;
1882 	return 1;
1883 }
1884 
1885 int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
1886 {
1887 	void* base = udb->base;
1888 	udb_void p;
1889 	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
1890 		return 0;
1891 	if(!udb_valid_dataptr(udb, to))
1892 		return 0;
1893 	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
1894 	while(p) {
1895 		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
1896 			return 0;
1897 		if(p == rptr)
1898 			return 1;
1899 		p = UDB_REL_PTR(p)->next;
1900 	}
1901 	return 0;
1902 }
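
#if 0
/* Illustrative sketch, not part of the original source: validate an
 * on-disk relative pointer before following it, e.g. while checking a
 * database read from disk.  ref_off is a hypothetical offset of a
 * udb_rel_ptr stored inside the file. */
static int
example_check_ref(udb_base* udb, udb_void ref_off)
{
	udb_rel_ptr* ref;
	if(!udb_valid_offset(udb, ref_off, sizeof(udb_rel_ptr)))
		return 0;
	ref = (udb_rel_ptr*)UDB_REL(udb->base, ref_off);
	if(ref->data == 0)
		return 1; /* a NULL relative pointer is fine */
	return udb_valid_rptr(udb, ref_off, ref->data);
}
#endif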
1903 
1904 void udb_rel_ptr_init(udb_rel_ptr* ptr)
1905 {
1906 	memset(ptr, 0, sizeof(*ptr));
1907 }
1908 
1909 void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
1910 {
1911 	if(!ptr->data)
1912 		return;
1913 	if(ptr->prev) {
1914 		UDB_REL_PTR(ptr->prev)->next = ptr->next;
1915 	} else {
1916 		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
1917 	}
1918 	if(ptr->next) {
1919 		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
1920 	}
1921 }
1922 
1923 void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
1924 {
1925 	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
1926 	ptr->prev = 0;
1927 	ptr->next = chunk->ptrlist;
1928 	if(ptr->next)
1929 		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
1930 	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
1931 	ptr->data = to;
1932 }
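
/* Note (added for clarity): after linking, this relptr sits at the head of
 * the target chunk's ptrlist; the list is doubly linked through the
 * prev/next fields so udb_rel_ptr_unlink above can remove it in O(1). */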
1933 
1934 void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
1935 {
1936 	assert(to == 0 || to > 64);
1937 	udb_rel_ptr_unlink(base, ptr);
1938 	if(to)
1939 		udb_rel_ptr_link(base, ptr, to);
1940 	else	ptr->data = to;
1941 }
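
#if 0
/* Illustrative sketch: every udb_rel_ptr that points into a chunk must be
 * set to 0 (which unlinks it from the chunk's ptrlist) before the chunk is
 * freed, or the ptrlist assertion in udb_alloc_free fires.  'ref' is a
 * hypothetical relptr somewhere in the database. */
static void
example_clear_and_free(udb_base* udb, udb_rel_ptr* ref, size_t sz)
{
	udb_void d = ref->data;
	udb_rel_ptr_set(udb->base, ref, 0);
	if(d)
		udb_alloc_free(udb->alloc, d, sz);
}
#endif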
1942 
1943 void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
1944 {
1945 	udb_void p = list;
1946 	while(p) {
1947 		UDB_REL_PTR(p)->data = to;
1948 		p = UDB_REL_PTR(p)->next;
1949 	}
1950 }
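
/* Note (added for clarity): udb_rel_ptr_edit is presumably what repoints
 * all relptrs into a chunk when its data moves to a new offset, e.g. as
 * part of chunk_fix_ptrs in udb_alloc_realloc above. */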
1951 
1952 #ifdef UDB_CHECK
1953 /** check that all pointers are validly chained */
1954 static void
1955 udb_check_ptrs_valid(udb_base* udb)
1956 {
1957 	size_t i;
1958 	udb_ptr* p, *prev;
1959 	for(i=0; i<udb->ram_size; i++) {
1960 		prev = NULL;
1961 		for(p=udb->ram_hash[i]; p; p=p->next) {
1962 			assert(p->prev == prev);
1963 			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
1964 				== i);
1965 			assert(p->base == &udb->base);
1966 			prev = p;
1967 		}
1968 	}
1969 }
1970 #endif /* UDB_CHECK */
1971 
1972 void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
1973 {
1974 #ifdef UDB_CHECK
1975 	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
1976 #endif
1977 	memset(ptr, 0, sizeof(*ptr));
1978 	ptr->base = &udb->base;
1979 }
1980 
1981 void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
1982 {
1983 	assert(newval == 0 || newval > 64);
1984 	if(ptr->data)
1985 		udb_base_unlink_ptr(udb, ptr);
1986 	ptr->data = newval;
1987 	if(newval)
1988 		udb_base_link_ptr(udb, ptr);
1989 }
1990 
1991 int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
1992 	size_t sz)
1993 {
1994 	udb_void r;
1995 	r = udb_alloc_space(udb->alloc, sz);
1996 	if(!r) return 0;
1997 	udb_alloc_set_type(udb->alloc, r, type);
1998 	udb_ptr_init(ptr, udb);
1999 	udb_ptr_set(ptr, udb, r);
2000 	return 1;
2001 }
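
#if 0
/* Illustrative sketch, not part of the original source: allocate a typed
 * chunk and track it with a RAM-side udb_ptr so the reference survives
 * re-mmaps; 'tp' is whatever non-free udb_chunk_type the caller uses. */
static int
example_ptr_usage(udb_base* udb, udb_chunk_type tp)
{
	udb_ptr p;
	if(!udb_ptr_alloc_space(&p, udb, tp, 64))
		return 0;
	/* at most 64 octets may be used at UDB_REL(udb->base, p.data) */
	memset(UDB_REL(udb->base, p.data), 0, 64);
	udb_ptr_free_space(&p, udb, 64);
	return 1;
}
#endif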
2002 
2003 void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
2004 {
2005 	if(ptr->data) {
2006 		udb_void d = ptr->data;
2007 		udb_ptr_set(ptr, udb, 0);
2008 		udb_alloc_free(udb->alloc, d, sz);
2009 	}
2010 }
2011 
2012 udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
2013 {
2014 	udb_void f;
2015 	if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad */
2016 	f = chunk_from_dataptr(ptr->data);
2017 	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
2018 }
2019