/* udb.c - u(micro) data base.
 * By W.C.A. Wijngaards
 * Copyright 2010, NLnet Labs.
 * BSD, see LICENSE.
 */
#include "config.h"
#include "udb.h"
#include <string.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include "lookup3.h"
#include "util.h"

/* mmap and friends */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>

/* portable definitions for systems that lack them; MAP_FAILED is -1
 * and the sync flag becomes a no-op */
#ifndef MAP_FAILED
#define MAP_FAILED ((void*)-1)
#endif
#ifndef MS_SYNC
#define MS_SYNC 0
#endif

/** move and fixup xl segment */
static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
	udb_void n, uint64_t sz, uint64_t startseg);
/** attempt to compact the data and move free space to the end */
static int udb_alloc_compact(void* base, udb_alloc* alloc);

/** convert pointer to the data part to a pointer to the base of the chunk */
static udb_void
chunk_from_dataptr(udb_void data)
{
	/* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and
	 * that xl chunks are aligned on UDB_ALLOC_CHUNK_SIZE boundaries. */
	udb_void xl = data - sizeof(udb_xl_chunk_d);
	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
		return xl;
	return data - sizeof(udb_chunk_d);
}
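
/* Illustration (added note, not in the original source): with a 1 Mb
 * UDB_ALLOC_CHUNK_SIZE, an XL chunk starts exactly on a 1 Mb boundary,
 * so its dataptr minus sizeof(udb_xl_chunk_d) is 1 Mb aligned; a
 * regular chunk is smaller and aligned to its own 2**exp size, so the
 * same subtraction can never land on a 1 Mb boundary.  That is why the
 * single alignment test above disambiguates the two chunk kinds. */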

udb_void chunk_from_dataptr_ext(udb_void data) {
	return chunk_from_dataptr(data);
}

#ifndef NDEBUG
/** read last octet from a chunk */
static uint8_t
chunk_get_last(void* base, udb_void chunk, int exp)
{
	return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1));
}
#endif

/** write last octet of a chunk */
static void
chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
{
	*((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)) = value;
}

/** create udb_base from a file descriptor (must be at start of file) */
udb_base*
udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
	void* arg)
{
	uint64_t m, fsz;
	udb_glob_d g;
	ssize_t r;
	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
	if(!udb) {
		log_msg(LOG_ERR, "out of memory");
		close(fd);
		return NULL;
	}
	udb->fname = strdup(fname);
	if(!udb->fname) {
		log_msg(LOG_ERR, "out of memory");
		free(udb);
		close(fd);
		return NULL;
	}
	udb->walkfunc = walkfunc;
	udb->walkarg = arg;
	udb->fd = fd;
	udb->ram_size = 1024;
	udb->ram_mask = (int)udb->ram_size - 1;
	udb->ram_hash = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)*udb->ram_size);
	if(!udb->ram_hash) {
		free(udb->fname);
		free(udb);
		log_msg(LOG_ERR, "out of memory");
		close(fd);
		return NULL;
	}

	/* read magic */
	if((r=read(fd, &m, sizeof(m))) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		goto fail;
	} else if(r != (ssize_t)sizeof(m)) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	/* TODO: handle a file whose endianness differs from the host,
	 * it can be detected with the magic number */
	if(m != UDB_MAGIC) {
		log_msg(LOG_ERR, "%s: wrong type of file", fname);
		goto fail;
	}
	/* read header */
	if((r=read(fd, &g, sizeof(g))) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		goto fail;
	} else if(r != (ssize_t)sizeof(g)) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	if(g.version != 0) {
		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
			(int)g.version);
		goto fail;
	}
	if(g.hsize < UDB_HEADER_SIZE) {
		log_msg(LOG_ERR, "%s: header size too small %d", fname,
			(int)g.hsize);
		goto fail;
	}
	if(g.hsize > UDB_HEADER_SIZE) {
		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
			(int)g.hsize);
		goto fail;
	}
	if(g.clean_close != 1) {
		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
			(int)g.clean_close);
		goto fail;
	}
	if(g.dirty_alloc != 0) {
		log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname,
			(int)g.dirty_alloc);
		goto fail;
	}

	/* check that the file size was correctly written; guards against
	 * the 4.0.2 nsd.db failure */
	fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END);
	(void)lseek(fd, (off_t)0, SEEK_SET);
	if(g.fsize != fsz) {
		log_msg(LOG_WARNING, "%s: file size %llu but mmap header "
			"has size %llu", fname, (unsigned long long)fsz,
			(unsigned long long)g.fsize);
		goto fail;
	}

	/* mmap it */
	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
		log_msg(LOG_ERR, "%s: file too short", fname);
		goto fail;
	}
	if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ {
		log_msg(LOG_WARNING, "%s: file size too large %llu",
			fname, (unsigned long long)g.fsize);
		goto fail;
	}
	udb->base_size = (size_t)g.fsize;
	/* note the size_t casts must be there for portability, on some
	 * systems the layout of memory is otherwise broken. */
	udb->base = mmap(NULL, (size_t)udb->base_size,
		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
		(int)udb->fd, (off_t)0);
	if(udb->base == MAP_FAILED) {
		udb->base = NULL;
		log_msg(LOG_ERR, "mmap(size %u) error: %s",
			(unsigned)udb->base_size, strerror(errno));
	fail:
		close(fd);
		free(udb->fname);
		free(udb->ram_hash);
		free(udb);
		return NULL;
	}

	/* init completion */
	udb->glob_data = (udb_glob_d*)(udb->base+sizeof(uint64_t));
	r = 0;
	/* cannot be dirty, that case hit the goto fail above;
	 * but check anyway and compact below if it somehow is */
	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
		r = 1;
	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
		(void*)udb->glob_data+sizeof(*udb->glob_data)));
	if(!udb->alloc) {
		log_msg(LOG_ERR, "out of memory");
		udb_base_free(udb);
		return NULL;
	}
	if(r) {
		/* and compact now, or resume compacting */
		udb_alloc_compact(udb->base, udb->alloc);
		udb_base_sync(udb, 1);
	}
	udb->glob_data->clean_close = 0;

	return udb;
}

udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
	void* arg)
{
	int fd = open(fname, O_RDWR);
	if(fd == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		return NULL;
	}
	return udb_base_create_fd(fname, fd, walkfunc, arg);
}

/** init new udb_global structure */
static void udb_glob_init_new(udb_glob_d* g)
{
	memset(g, 0, sizeof(*g));
	g->hsize = UDB_HEADER_SIZE;
	g->fsize = UDB_HEADER_SIZE;
}

/** write data to file and check result */
static int
write_fdata(const char* fname, int fd, void* data, size_t len)
{
	ssize_t w;
	if((w=write(fd, data, len)) == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		close(fd);
		return 0;
	} else if(w != (ssize_t)len) {
		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
		close(fd);
		return 0;
	}
	return 1;
}

udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
	void* arg)
{
	uint64_t m;
	udb_glob_d g;
	udb_alloc_d a;
	uint64_t endsize = UDB_HEADER_SIZE;
	uint64_t endexp = 0;
	int fd = open(fname, O_CREAT|O_RDWR, 0600);
	if(fd == -1) {
		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
		return NULL;
	}
	m = UDB_MAGIC;
	udb_glob_init_new(&g);
	udb_alloc_init_new(&a);
	g.clean_close = 1;

	/* write new data to file (closes fd on error) */
	if(!write_fdata(fname, fd, &m, sizeof(m)))
		return NULL;
	if(!write_fdata(fname, fd, &g, sizeof(g)))
		return NULL;
	if(!write_fdata(fname, fd, &a, sizeof(a)))
		return NULL;
	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
		return NULL;
	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
		return NULL;
	/* rewind to start */
	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
		close(fd);
		return NULL;
	}
	/* truncate to the right size */
	if(ftruncate(fd, (off_t)g.fsize) < 0) {
		log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname,
			(int)g.fsize, strerror(errno));
		close(fd);
		return NULL;
	}
	return udb_base_create_fd(fname, fd, walkfunc, arg);
}

/** shrink the udb base if it has unused space at the end */
static void
udb_base_shrink(udb_base* udb, uint64_t nsize)
{
	udb->glob_data->dirty_alloc = udb_dirty_fsize;
	udb->glob_data->fsize = nsize;
	/* sync, does not *seem* to be required on Linux, but it is
	   certainly required on OpenBSD.  Otherwise changed data is lost. */
	msync(udb->base, udb->base_size, MS_ASYNC);
	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
			(unsigned)nsize, strerror(errno));
	}
	udb->glob_data->dirty_alloc = udb_dirty_clean;
}

void udb_base_close(udb_base* udb)
{
	if(!udb)
		return;
	if(udb->fd != -1 && udb->base && udb->alloc) {
		uint64_t nsize = udb->alloc->disk->nextgrow;
		if(nsize < udb->base_size)
			udb_base_shrink(udb, nsize);
	}
	if(udb->fd != -1) {
		udb->glob_data->clean_close = 1;
		close(udb->fd);
		udb->fd = -1;
	}
	if(udb->base) {
		if(munmap(udb->base, udb->base_size) == -1) {
			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
		}
		udb->base = NULL;
	}
}

void udb_base_free(udb_base* udb)
{
	if(!udb)
		return;
	udb_base_close(udb);
	udb_alloc_delete(udb->alloc);
	free(udb->ram_hash);
	free(udb->fname);
	free(udb);
}

void udb_base_free_keep_mmap(udb_base* udb)
{
	if(!udb) return;
	if(udb->fd != -1) {
		close(udb->fd);
		udb->fd = -1;
	}
	udb->base = NULL;
	udb_alloc_delete(udb->alloc);
	free(udb->ram_hash);
	free(udb->fname);
	free(udb);
}

void udb_base_sync(udb_base* udb, int wait)
{
	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
		log_msg(LOG_ERR, "msync(%s) error %s",
			udb->fname, strerror(errno));
	}
}

/** hash a chunk pointer */
static uint32_t
chunk_hash_ptr(udb_void p)
{
	/* put p into an array of uint32 */
	uint32_t h[sizeof(p)/sizeof(uint32_t)];
	memcpy(&h, &p, sizeof(h));
	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
}
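
/* Note (added, illustrative): a udb_void is a 64-bit file offset, so
 * h[] above holds two uint32 words and hashword() from lookup3 mixes
 * them into one 32-bit value.  Callers derive a bucket with
 *   chunk_hash_ptr(p) & udb->ram_mask
 * which is valid because ram_size is kept a power of two. */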

/** check that the given pointer is on the bucket for the given offset */
int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
{
	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
	udb_ptr* p;
	assert((size_t)i < udb->ram_size);
	for(p = udb->ram_hash[i]; p; p=p->next) {
		if(p == ptr)
			return 1;
	}
	return 0;
}

/** grow the ram array */
static void
grow_ram_hash(udb_base* udb, udb_ptr** newhash)
{
	size_t i;
	size_t osize= udb->ram_size;
	udb_ptr* p, *np;
	udb_ptr** oldhash = udb->ram_hash;
	udb->ram_size *= 2;
	udb->ram_mask <<= 1;
	udb->ram_mask |= 1;
	udb->ram_hash = newhash;
	/* relink every element from the old table into the new table */
	for(i=0; i<osize; i++) {
		p = oldhash[i];
		while(p) {
			np = p->next;
			/* link into newhash, becoming the new bucket head */
			p->prev=NULL;
			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
			if(p->next) p->next->prev = p;
			newhash[chunk_hash_ptr(p->data)&udb->ram_mask] = p;
			/* go to next element of oldhash */
			p = np;
		}
	}
	free(oldhash);
}

void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
{
	uint32_t i;
#ifdef UDB_CHECK
	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
#endif
	udb->ram_num++;
	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0xefffffff) {
		/* grow the array, if allocation succeeds */
		udb_ptr** newram = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)*
			udb->ram_size*2);
		if(newram) {
			grow_ram_hash(udb, newram);
		}
	}
	/* compute the bucket after any grow, the mask may have changed */
	i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
	assert((size_t)i < udb->ram_size);
	ptr->prev = NULL;
	ptr->next = udb->ram_hash[i];
	udb->ram_hash[i] = ptr;
	if(ptr->next)
		ptr->next->prev = ptr;
}

void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
{
	assert(ptr->data);
#ifdef UDB_CHECK
	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
#endif
	udb->ram_num--;
	if(ptr->next)
		ptr->next->prev = ptr->prev;
	if(ptr->prev)
		ptr->prev->next = ptr->next;
	else	{
		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
		assert((size_t)i < udb->ram_size);
		udb->ram_hash[i] = ptr->next;
	}
}

/** change a set of ram ptrs to a new value */
static void
udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
{
	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
	udb_ptr* p, *np;
	/* edit them and move them into the new position */
	p = udb->ram_hash[io];
	while(p) {
		np = p->next;
		if(p->data == old) {
			udb_base_unlink_ptr(udb, p);
			p->data = newd;
			udb_base_link_ptr(udb, p);
		}
		p = np;
	}
}

udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
{
	return &udb->glob_data->user_global;
}

void udb_base_set_userdata(udb_base* udb, udb_void user)
{
#ifdef UDB_CHECK
	if(user) { assert(udb_valid_dataptr(udb, user)); }
#endif
	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
}

void udb_base_set_userflags(udb_base* udb, uint8_t v)
{
	udb->glob_data->userflags = v;
}

uint8_t udb_base_get_userflags(udb_base* udb)
{
	return udb->glob_data->userflags;
}

/** re-mmap the udb to specified size */
static void*
udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
{
	void* nb;
	/* for use with valgrind, do not use mremap, but the other version */
#ifdef MREMAP_MAYMOVE
	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
	if(nb == MAP_FAILED) {
		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		return 0;
	}
#else /* !HAVE MREMAP */
	/* use munmap-mmap to simulate mremap */
	if(munmap(udb->base, udb->base_size) != 0) {
		log_msg(LOG_ERR, "munmap(%s) error %s",
			udb->fname, strerror(errno));
	}
	/* provide hint for new location */
	/* note the size_t casts must be there for portability, on some
	 * systems the layout of memory is otherwise broken. */
	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
	/* retry the mmap without the base pointer hint in case of ENOMEM
	 * (FreeBSD8), the kernel can then try to mmap it at a different
	 * location where more memory is available */
	if(nb == MAP_FAILED && errno == ENOMEM) {
		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
	}
	if(nb == MAP_FAILED) {
		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		udb->base = NULL;
		return 0;
	}
#endif /* HAVE MREMAP */
	if(nb != udb->base) {
		/* fix up realpointers in udb and alloc */
		/* but mremap may have been nice and not move the base */
		udb->base = nb;
		udb->glob_data = (udb_glob_d*)(nb+sizeof(uint64_t));
		/* use passed alloc pointer because the udb->alloc may not
		 * be initialized yet */
		alloc->disk = (udb_alloc_d*)((void*)udb->glob_data
			+sizeof(*udb->glob_data));
	}
	udb->base_size = nsize;
	return nb;
}

void
udb_base_remap_process(udb_base* udb)
{
	/* assume that fsize is still accessible */
	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
}

/** grow file to specified size and re-mmap, return new base */
static void*
udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
{
	/* grow file by writing a single zero at that spot, the
	 * rest is filled in with zeroes. */
	uint8_t z = 0;
	ssize_t w;

	assert(nsize > 0);
	udb->glob_data->dirty_alloc = udb_dirty_fsize;
#ifdef HAVE_PWRITE
	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
#else
	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
		return 0;
	}
	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
#endif
		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
			udb->fname, (unsigned)nsize, strerror(errno));
		return 0;
	} else if(w != (ssize_t)sizeof(z)) {
		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
			udb->fname, (unsigned)nsize);
		return 0;
	}
	udb->glob_data->fsize = nsize;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	return udb_base_remap(udb, udb->alloc, nsize);
}

int udb_exp_size(uint64_t a)
{
	/* find the smallest x such that 2**x >= a */
	int x = 0;
	uint64_t i = a;
	assert(a != 0);

	i --;
	/* could optimise this with uint8* access, depends on endianness */
	/* first whole bytes */
	while( (i&(~(uint64_t)0xff)) ) {
		i >>= 8;
		x += 8;
	}
	/* now details */
	while(i) {
		i >>= 1;
		x ++;
	}
	assert( ((uint64_t)1<<x) >= a);
	assert( x==0 || ((uint64_t)1<<(x-1)) < a);
	return x;
}
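
/* Worked examples (added for illustration):
 *   udb_exp_size(1)    = 0,  since 2**0  = 1    >= 1.
 *   udb_exp_size(1024) = 10, since 2**10 = 1024 >= 1024, 2**9 < 1024.
 *   udb_exp_size(1025) = 11, since 2**11 = 2048 >= 1025, 2**10 < 1025.
 * The initial i-- makes exact powers of two map to their own exponent
 * instead of the next one up. */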

int udb_exp_offset(uint64_t o)
{
	/* count the number of zero bits on the right of o; that is,
	 * find the largest x such that (o&((2**x)-1))==0 */
	int x = 0;
	uint64_t i = o;
	assert(o != 0);
	/* first whole bytes */
	while( (i&(uint64_t)0xff) == 0) {
		i >>= 8;
		x += 8;
	}
	/* now details */
	while( (i&(uint64_t)0x1) == 0) {
		i >>= 1;
		x ++;
	}
	assert( o % ((uint64_t)1<<x) == 0);
	assert( o % ((uint64_t)1<<(x+1)) != 0);
	return x;
}
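
/* Worked example (added for illustration): udb_exp_offset(0x1800) = 11.
 * The low byte of 0x1800 is zero, so the byte loop yields x=8, i=0x18;
 * the bit loop then strips three more zero bits, yielding x=11.  Thus
 * 0x1800 is aligned to 2**11 but not to 2**12; grow_align() below uses
 * this to pick the largest free chunk that keeps alignment. */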

void udb_alloc_init_new(udb_alloc_d* a)
{
	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
	memset(a, 0, sizeof(*a));
	/* set new allocations after header, as if allocated in a sequence
	 * of minsize allocations */
	a->nextgrow = UDB_HEADER_SIZE;
}

/** fsck the file size, false if failed and file is useless */
static int
fsck_fsize(udb_base* udb, udb_alloc* alloc)
{
	off_t realsize;
	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
	if(realsize == (off_t)-1) {
		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
		return 0;
	}
	udb->glob_data->fsize = (uint64_t)realsize;
	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
		return 0;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
	udb_base_sync(udb, 1);
	return 1;
}

/** regenerate freelist: add a new free chunk, return the next chunk to do */
static udb_void
regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
{
	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
	uint64_t esz = (uint64_t)1<<exp;
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
		return 0;
	}
	cp->type = udb_chunk_type_free;
	cp->flags = 0;
	chunk_set_last(base, c, exp, (uint8_t)exp);
	cp->prev = 0;
	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(cp->next)
		UDB_FREE_CHUNK(cp->next)->prev = c;
	/* this chunk becomes the head of the regenerated freelist */
	regen->free[exp-UDB_ALLOC_CHUNK_MINEXP] = c;
	regen->stat_free += esz;
	return c + esz;
}

/** regenerate xl chunk, return next todo */
static udb_void
regen_xl(void* base, udb_void c, udb_alloc_d* regen)
{
	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
	uint64_t xlsz = cp->size;
	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
		return 0;
	}
	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
		return 0;
	}
	/* fixup end-size and end-expmarker */
	regen->stat_alloc += xlsz;
	return c + xlsz;
}

/** regenerate data chunk, return next todo */
static udb_void
regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
{
	uint64_t esz = (uint64_t)1<<exp;
	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
		return 0;
	}
	chunk_set_last(base, c, exp, (uint8_t)exp);
	regen->stat_alloc += esz;
	return c + esz;
}

/** regenerate a relptr structure inside a data segment */
static void
regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
{
	udb_void* a = (udb_void*)arg;
	/* ignore 0 pointers */
	if(!rp->data)
		return;

	/* edit relptrs that point to oldmoved to point to newmoved. */
	if(rp->data == a[0])
		rp->data = a[1];

	/* regenerate relptr lists, add this item to the relptr list for
	 * the data that it points to */
	udb_rel_ptr_link(base, rp, rp->data);
}

/** regenerate the relptrs stored in this data segment */
static void
regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
{
	udb_void arg[2];
	arg[0] = rb_old; arg[1] = rb_new;
	/* walk through the structs here and put them on their respective
	 * relptr lists */
	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
		&regen_relptr_func, arg);
}

/** regenerate relptrlists in the file */
static void
regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
	udb_void rb_old, udb_void rb_new)
{
	udb_void at = alloc->udb->glob_data->hsize;
	/* clear all ptrlist start pointers in the file. */
	while(at < alloc->disk->nextgrow) {
		int exp = (int)UDB_CHUNK(at)->exp;
		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
		if(exp == UDB_EXP_XL) {
			UDB_XL_CHUNK(at)->ptrlist = 0;
			at += UDB_XL_CHUNK(at)->size;
		} else if(tp == udb_chunk_type_free) {
			at += (uint64_t)1<<exp;
		} else { /* data chunk */
			UDB_CHUNK(at)->ptrlist = 0;
			at += (uint64_t)1<<exp;
		}
	}
	/* walk through all relptr structs and put on the right list. */
	at = alloc->udb->glob_data->hsize;
	while(at < alloc->disk->nextgrow) {
		udb_chunk_d* atp = UDB_CHUNK(at);
		int exp = (int)atp->exp;
		udb_chunk_type tp = (udb_chunk_type)atp->type;
		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
			(uint64_t)1<<exp);
		if(exp == UDB_EXP_XL) {
			assert(at != rb_old); /* should have been freed */
			regen_its_ptrs(base, udb, atp,
				((void*)atp)+sizeof(udb_xl_chunk_d),
				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
				rb_old, rb_new);
			at += sz;
		} else if(tp == udb_chunk_type_free) {
			at += sz;
		} else { /* data chunk */
			assert(at != rb_old); /* should have been freed */
			regen_its_ptrs(base, udb, atp,
				((void*)atp)+sizeof(udb_chunk_d),
				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
			at += sz;
		}
	}
}


/** mark free segments from former XL chunk space; later fixups pick them up */
static void
rb_mark_free_segs(void* base, udb_void s, uint64_t m)
{
	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
	assert(s >= UDB_ALLOC_CHUNK_SIZE);
	while(q >= s) {
		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
		UDB_CHUNK(q)->type = udb_chunk_type_free;
		q -= UDB_ALLOC_CHUNK_SIZE;
	}
}


/** fsck rollback or rollforward XL move results */
static int
fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
	uint64_t rb_size, uint64_t rb_seg)
{
	if(rb_old <= rb_new)
		return 0; /* XL move one way */
	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
		return 0; /* not aligned */
	if(rb_new + rb_size <= rb_old) {
		/* not overlapping: resume copy */
		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
		/* and free up old piece(s) */
		rb_mark_free_segs(base, rb_old, rb_size);
	} else {
		/* overlapping, see what segment we stopped at
		 * and continue there. */
		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
		/* free up old piece(s); from the end of the moved segment,
		 * until the end of the old segment */
		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
			(rb_new+rb_size));
	}
	/* do not call fix_ptrs, regenptrs does the job */
	return 1;
}

/** fsck rollback or rollforward move results */
static int
fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
	udb_void* make_free)
{
	if( (rb_size&(rb_size-1)) != 0)
		return 0; /* not powerof2 */
	if( (rb_old&(rb_size-1)) != 0)
		return 0; /* not aligned */
	if( (rb_new&(rb_size-1)) != 0)
		return 0; /* not aligned */
	/* resume copy */
	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
	/* do not call fix_ptrs, regenptrs does the job */
	/* make sure udb_old is freed */
	*make_free = rb_old;
	return 1;
}

/** fsck the file and salvage, false if failed and file is useless */
static int
fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
{
	void* base = udb->base;
	udb_alloc_d regen;
	udb_void at = udb->glob_data->hsize;
	udb_void rb_old = udb->glob_data->rb_old;
	udb_void rb_new = udb->glob_data->rb_new;
	udb_void rb_seg = udb->glob_data->rb_seg;
	udb_void make_free = 0;
	uint64_t rb_size = udb->glob_data->rb_size;
	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
	/* walk through the file, use the exp values to see what can be
	 * salvaged */
	if(moved && rb_old && rb_new && rb_size) {
		if(rb_old+rb_size <= alloc->disk->nextgrow
			&& rb_new+rb_size <= alloc->disk->nextgrow) {
			/* we can use the move information to fix up the
			 * duplicate element (or partially moved element) */
			if(rb_size > 1024*1024) {
				/* XL chunk */
				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
					rb_size, rb_seg))
					return 0;
			} else {
				if(!fsck_rb(base, rb_old, rb_new, rb_size,
					&make_free))
					return 0;
			}
		}
	}

	/* rebuild freelists */
	/* recalculate stats in alloc (except 'stat_data') */
	/* possibly new end 'nextgrow' value */
	memset(&regen, 0, sizeof(regen));
	regen.nextgrow = alloc->disk->nextgrow;
	while(at < regen.nextgrow) {
		/* figure out this chunk */
		int exp = (int)UDB_CHUNK(at)->exp;
		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
		/* consistency check possible here with end-exp */
		if(tp == udb_chunk_type_free || at == make_free) {
			at = regen_free(base, at, exp, &regen);
			if(!at) return 0;
		} else if(exp == UDB_EXP_XL) {
			/* allocated data of XL size */
			at = regen_xl(base, at, &regen);
			if(!at) return 0;
		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
			/* allocated data */
			at = regen_data(base, at, exp, &regen);
			if(!at) return 0;
		} else {
			/* garbage; this must be EOF then */
			regen.nextgrow = at;
			break;
		}
	}
	*alloc->disk = regen;

	/* rebuild relptr lists */
	regen_ptrlist(base, udb, alloc, rb_old, rb_new);

	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
		udb->fname);
	udb->glob_data->rb_old = 0;
	udb->glob_data->rb_new = 0;
	udb->glob_data->rb_size = 0;
	udb->glob_data->dirty_alloc = udb_dirty_clean;
	udb_base_sync(udb, 1);
	return 1;
}


udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
{
	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
	if(!alloc)
		return NULL;
	alloc->udb = udb;
	alloc->disk = disk;
	/* see if committed but uncompleted actions need to be done */
	/* preserves the alloc state */
	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
			if(fsck_fsize(udb, alloc))
				return alloc;
		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
			if(fsck_file(udb, alloc, 0))
				return alloc;
		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
			if(fsck_file(udb, alloc, 1))
				return alloc;
		}
		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
			(int)udb->glob_data->dirty_alloc);
		free(alloc);
		return NULL;
	}
	return alloc;
}

void udb_alloc_delete(udb_alloc* alloc)
{
	if(!alloc) return;
	free(alloc);
}

/** unlink this element from its freelist */
static void
udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
	assert(chunk);
	/* chunk is a free chunk */
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
	/* and thus freelist not empty */
	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
	/* unlink */
	if(fp->prev)
		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
	else	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
}

/** pop first element off freelist, list may not be empty */
static udb_void
udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
{
	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
	if(fp->next) {
		UDB_FREE_CHUNK(fp->next)->prev = 0;
	}
	return f;
}

/** push new element onto freelist */
static void
udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	fp->exp = (uint8_t)exp;
	fp->type = udb_chunk_type_free;
	fp->flags = 0;
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	chunk_set_last(base, f, exp, (uint8_t)exp);
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** push new element onto freelist - do not initialize the elt */
static void
udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
{
	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
	assert(f);
	assert(fp->exp == (uint8_t)exp);
	assert(fp->type == udb_chunk_type_free);
	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
	fp->prev = 0;
	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
	if(fp->next)
		UDB_FREE_CHUNK(fp->next)->prev = f;
	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
}

/** add free chunks at end until specified alignment occurs */
static void
grow_align(void* base, udb_alloc* alloc, uint64_t esz)
{
	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
		/* the nextgrow is not a whole multiple of esz. */
		/* grow a free chunk of max allowed size */
		int fexp = udb_exp_offset(alloc->disk->nextgrow);
		uint64_t fsz = (uint64_t)1<<fexp;
		udb_void f = alloc->disk->nextgrow;
		udb_void fn = alloc->disk->nextgrow+fsz;
		assert(fn <= alloc->udb->base_size);
		alloc->disk->stat_free += fsz;
		udb_alloc_push_fl(base, alloc, f, fexp);
		/* now increase nextgrow to commit that free chunk */
		alloc->disk->nextgrow = fn;
	}
}

/** append chunks at end of memory space to get size exp, return dataptr */
static udb_void
grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	uint64_t esz = (uint64_t)1<<exp;
	udb_void ret;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
	grow_align(base, alloc, esz);
	/* free chunks are grown, grow the one we want to use */
	ret = alloc->disk->nextgrow;
	/* take a new alloced chunk into use */
	UDB_CHUNK(ret)->exp = (uint8_t)exp;
	UDB_CHUNK(ret)->flags = 0;
	UDB_CHUNK(ret)->ptrlist = 0;
	UDB_CHUNK(ret)->type = udb_chunk_type_data;
	/* store last octet */
	chunk_set_last(base, ret, exp, (uint8_t)exp);
	/* update stats */
	alloc->disk->stat_alloc += esz;
	alloc->disk->stat_data += sz;
	/* now increase nextgrow to commit this newly allocated chunk */
	alloc->disk->nextgrow += esz;
	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_chunk_d); /* ptr to data */
}

/** calculate how much space is necessary to grow for this exp */
static uint64_t
grow_end_calc(udb_alloc* alloc, int exp)
{
	uint64_t sz = (uint64_t)1<<exp;
	uint64_t ng = alloc->disk->nextgrow;
	uint64_t res;
	/* if nextgrow is already a whole multiple of 2**exp, no extra
	 * growth needed, only the size itself */
	if( (ng & (sz-1)) == 0) {
		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
		return ng+sz; /* must grow exactly 2**exp */
	}
	/* grow until a whole multiple of 2**exp, then 2**exp as well */
	/* so, round ng down to whole sz (basically ng-ng%sz, or ng/sz*sz)
	 * and then add the sz twice (go up to whole sz, and to allocate) */
	res = (ng & ~(sz-1)) + 2*sz;
	return res;
}

/** see if we need to grow more than specified to enable sustained growth */
static uint64_t
grow_extra_check(udb_alloc* alloc, uint64_t ge)
{
	const uint64_t mb = 1024*1024;
	uint64_t bsz = alloc->udb->base_size;
	if(bsz <= mb) {
		/* below 1 Mb, double sizes for exponential growth */
		/* takes about 15 times to grow to 1Mb */
		if(ge < bsz*2)
			return bsz*2;
	} else {
		uint64_t gnow = ge - bsz;
		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
		 * in whole megabytes rounded up. */
		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
		if(gnow < want)
			return bsz + want;
	}
	return ge;
}
/** see if free space is enough to warrant shrink (while file is open) */
static int
enough_free(udb_alloc* alloc)
{
	if(alloc->udb->base_size <= 2*1024*1024) {
		/* up to 2 Mb (grown by doubling from below 1 Mb), do not
		 * shrink unless the used part is at most 1/3 of the size */
		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
			return 1;
	} else {
		/* grown 12.5%, shrink 25% if possible, at least one mb */
		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
			|| alloc->udb->base_size < 4*1024*1024))
			return 1;
	}
	return 0;
}

/** grow space for a chunk of 2**exp and return dataptr */
static udb_void
udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
{
	/* commit the grow action
	 * - the file grow only changes filesize, but not the nextgrow.
	 * - taking space after nextgrow into use (as free space),
	 *   is like free-ing a chunk (one at a time).
	 * - and the last chunk taken into use is like alloc.
	 */
	/* predict how much free space is needed for this */
	uint64_t grow_end = grow_end_calc(alloc, exp);
	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
	if(grow_end <= alloc->udb->base_size) {
		/* we can do this with the available space */
		return grow_chunks(base, alloc, sz, exp);
	}
	/* we have to grow the file, re-mmap */
	/* see if we need to grow a little more, to avoid endless grow
	 * efforts on adding data */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_chunks(base, alloc, sz, exp);
}

/** take XL allocation into use at end of file, return dataptr */
static udb_void
grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
{
	udb_void ret;
	udb_xl_chunk_d* p;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;

	/* align growth to whole mbs */
	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);

	/* grow XL segment */
	ret = alloc->disk->nextgrow;
	p = UDB_XL_CHUNK(ret);
	p->exp = UDB_EXP_XL;
	p->size = xlsz;
	p->flags = 0;
	p->ptrlist = 0;
	p->type = udb_chunk_type_data;

	/* also put size and marker at end for compaction */
	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;

	/* stats */
	alloc->disk->stat_data += sz;
	alloc->disk->stat_alloc += xlsz;
	/* now increase the nextgrow to commit this xl chunk */
	alloc->disk->nextgrow += xlsz;
	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
}

/** make space for XL allocation */
static udb_void
udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
{
	/* allocate whole mbs of space, at end of space */
	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
	assert(need >= asz);
	if(grow_end <= alloc->udb->base_size) {
		/* can do this in available space */
		return grow_xl(base, alloc, need, sz);
	}
	/* have to grow file and re-mmap */
	grow_end = grow_extra_check(alloc, grow_end);
	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
		return 0; /* mmap or write failed (disk or mem full) */
	}
	/* we have enough space now */
	assert(grow_end <= alloc->udb->base_size);
	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
	return grow_xl(base, alloc, need, sz);
}

/** divide big (2**e2) into pieces so 2**exp fits */
static udb_void
udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
	int exp)
{
	int e = e2;
	uint64_t sz = (uint64_t)1<<e2;
	assert(big && e2 > exp);
	/* the returned piece to use is the first piece; split off the
	 * upper half repeatedly until the first piece has the wanted size */
	do {
		sz >>= 1; /* divide size of big by two */
		e--;      /* that means its exp is one smaller */
		udb_alloc_push_fl(base, alloc, big+sz, e);
	} while(e != exp);
	/* exit loop when last pushed is same size as what we want */
	return big;
}

/** returns the exponent size of the chunk needed for data sz */
static int
udb_alloc_exp_needed(size_t sz)
{
	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
	if(asz > UDB_ALLOC_CHUNK_SIZE) {
		return UDB_EXP_XL;
	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
		return UDB_ALLOC_CHUNK_MINEXP;
	}
	return udb_exp_size(asz);
}

udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
{
	void* base = alloc->udb->base;
	/* calculate actual allocation size */
	int e2, exp = udb_alloc_exp_needed(sz);
	if(exp == UDB_EXP_XL)
		return udb_alloc_xl_space(base, alloc, sz);
	/* see if there is a free chunk of that size exactly */
	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
		/* snip from freelist, udb_chunk_d */
		udb_void ret;
		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
		ret = udb_alloc_pop_fl(base, alloc, exp);
		/* use it - size octets already OK */
		UDB_CHUNK(ret)->flags = 0;
		UDB_CHUNK(ret)->ptrlist = 0;
		UDB_CHUNK(ret)->type = udb_chunk_type_data;
		/* update stats */
		alloc->disk->stat_data += sz;
		alloc->disk->stat_alloc += (1<<exp);
		assert(alloc->disk->stat_free >= (1u<<exp));
		alloc->disk->stat_free -= (1<<exp);
		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		return ret + sizeof(udb_chunk_d); /* ptr to data */
	}
	/* see if we can subdivide a larger chunk */
	for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++)
		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
			udb_void big, ret; /* udb_chunk_d */
			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
			big = udb_alloc_pop_fl(base, alloc, e2);
			/* push other parts onto freelists (needs inited) */
			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
			/* use final part (needs inited) */
			UDB_CHUNK(ret)->exp = (uint8_t)exp;
			/* if we stop here, the new exp makes it appear as
			 * a smaller free chunk */
			UDB_CHUNK(ret)->flags = 0;
			UDB_CHUNK(ret)->ptrlist = 0;
			/* set type to commit data chunk */
			UDB_CHUNK(ret)->type = udb_chunk_type_data;
			/* store last octet */
			chunk_set_last(base, ret, exp, (uint8_t)exp);
			/* update stats */
			alloc->disk->stat_data += sz;
			alloc->disk->stat_alloc += (1<<exp);
			assert(alloc->disk->stat_free >= (1u<<exp));
			alloc->disk->stat_free -= (1<<exp);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			return ret + sizeof(udb_chunk_d); /* ptr to data */
		}
	/* we need to grow an extra chunk */
	return udb_alloc_grow_space(base, alloc, sz, exp);
}

/** see if there is free space to allocate a chunk into */
static int
have_free_for(udb_alloc* alloc, int exp)
{
	int e2;
	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
		return exp;
	for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++)
		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
			return e2;
		}
	return 0;
}

/** fix relptr prev and next for moved relptr structures */
static void
chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
{
	udb_void* data = (udb_void*)arg;
	udb_void r;
	if(!rp->data)
		return;
	r = UDB_SYSTOREL(base, rp);
	if(rp->next)
		UDB_REL_PTR(rp->next)->prev = r;
	if(rp->prev)
		UDB_REL_PTR(rp->prev)->next = r;
	else	{
		/* if this is a pointer to its own chunk, fix it up;
		 * the data ptr gets set by relptr_edit later. */
		if(rp->data == data[0])
			UDB_CHUNK(data[1])->ptrlist = r;
		else	UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
	}
}

/** fix pointers from and to a moved chunk */
static void
chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
	uint64_t dsz, udb_void olddata)
{
	udb_void d[2];
	d[0] = olddata;
	d[1] = data;
	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
		dsz, &chunk_fix_ptr_each, d);
	udb_rel_ptr_edit(base, cp->ptrlist, data);
	udb_base_ram_ptr_edit(udb, olddata, data);
}

/** move an allocated chunk to use a free chunk */
static void
move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
	int e2)
{
	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
	udb_chunk_d* rp;
	udb_chunk_d* fp;
	if(exp != e2) {
		/* it is bigger, subdivide it */
		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
	}
	assert(res != f);
	/* setup rollback information */
	alloc->udb->glob_data->rb_old = f;
	alloc->udb->glob_data->rb_new = res;
	alloc->udb->glob_data->rb_size = esz;
	/* take the res, exp into use */
	rp = UDB_CHUNK(res);
	fp = UDB_CHUNK(f);
	/* copy over the data */
	memcpy(rp, fp, esz);
	/* adjust rel ptrs */
	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));

	/* do not freeup the fp; caller does that */
}

/** unlink several free elements to overwrite with xl chunk */
static void
free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
{
	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
	assert(s >= UDB_ALLOC_CHUNK_SIZE);
	while(q >= s) {
		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
		q -= UDB_ALLOC_CHUNK_SIZE;
	}
}

/** move an XL chunk, and keep track of segments for rollback */
static void
move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
	uint64_t sz, uint64_t startseg)
{
	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
	uint64_t amount = xl - n;
	assert(n < xl); /* move to compact */

	/* setup move rollback */
	udb->glob_data->rb_old = xl;
	udb->glob_data->rb_new = n;
	udb->glob_data->rb_size = sz;

	/* is it overlapping? */
	if(sz <= amount) {
		memcpy(np, xlp, sz);
	} else {
		/* move and commit per 1M segment to avoid data loss */
		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
		for(seg = startseg; seg<maxseg; seg++) {
			udb->glob_data->rb_seg = seg;
			/* byte offsets; cast so the pointer arithmetic is
			 * not scaled by sizeof(udb_xl_chunk_d) */
			memcpy((char*)np+seg*UDB_ALLOC_CHUNK_SIZE,
				(char*)xlp+seg*UDB_ALLOC_CHUNK_SIZE,
				UDB_ALLOC_CHUNK_SIZE);
		}
	}
}

/** move list of XL chunks to the front by the shift amount */
static void
move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
	uint64_t amount)
{
	udb_void xl = xl_start;
	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
	while(xl < xl_start+xl_sz) {
		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
		udb_void n = xl-amount;
		uint64_t sz = xlp->size;
		assert(xlp->exp == UDB_EXP_XL);
		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
			n+sizeof(udb_xl_chunk_d),
			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
			xl+sizeof(udb_xl_chunk_d));
		/* advance to the next XL chunk in the list */
		xl += sz;
	}
	alloc->disk->stat_free -= amount;
	alloc->disk->nextgrow -= amount;
	alloc->udb->glob_data->rb_old = 0;
	alloc->udb->glob_data->rb_new = 0;
	alloc->udb->glob_data->rb_size = 0;
}

/** see if free chunk can coagulate with another chunk, return other chunk */
static udb_void
coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
	uint64_t esz)
{
	udb_void other = f^esz;
	if(exp == UDB_ALLOC_CHUNKS_MAX)
		return 0; /* no further merges */
	if(other >= alloc->udb->base_size)
		return 0; /* not allocated */
	if(other >= alloc->disk->nextgrow)
		return 0; /* not in use */
	if(other < alloc->udb->glob_data->hsize)
		return 0; /* cannot merge with header */
		/* the header is also protected by the special exp marker */
	/* see if the other chunk is a free chunk */

	/* check closest marker to avoid large memory churn */
	/* and also it makes XL allocations and header special markers work */
	if(f > other) {
		assert(f > 1); /* this is certain because of header */
		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
			/* can do it if the other part is a free chunk */
			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
				return other;
		}
	} else {
		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
			/* can do it if the other part is a free chunk */
			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
				return other;
		}
	}
	return 0;
}
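
/* Worked example (added for illustration): the buddy of a chunk is
 * found by flipping the bit of its own size, other = f^esz.  For
 * exp=12 (esz=0x1000): f=0x3000 gives other=0x2000 and f=0x2000 gives
 * other=0x3000; after a merge the pair forms the exp 13 chunk at
 * 0x2000.  The checks above refuse merges past the header, past
 * nextgrow, or beyond the maximum chunk size. */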

/** coagulate and then add new free segment, return final free segment */
static udb_void
coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
	uint64_t esz)
{
	/* new free chunk here, attempt coagulate */
	udb_void other;
	while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
		/* unlink that other chunk */
		udb_alloc_unlink_fl(base, alloc, other, exp);
		/* merge up */
		if(other < last)
			last = other;
		exp++;
		esz <<= 1;
	}
	/* free the final segment */
	udb_alloc_push_fl(base, alloc, last, exp);
	return last;
}

/** attempt to compact the data and move free space to the end */
static int
udb_alloc_compact(void* base, udb_alloc* alloc)
{
	udb_void last;
	int exp, e2;
	uint64_t esz;
	uint64_t at = alloc->disk->nextgrow;
	udb_void xl_start = 0;
	uint64_t xl_sz = 0;
	while(at > alloc->udb->glob_data->hsize) {
		/* grab last entry */
		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
		if(exp == UDB_EXP_XL) {
			/* for XL chunks:
			 * - inspect the size of the XLchunklist at end
			 * - attempt to compact in front of the XLchunklist
			 */
			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
				at-sizeof(uint64_t)*2));
			udb_void xl = at-xlsz;
#ifndef NDEBUG
			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
			assert(xlp->exp == UDB_EXP_XL);
			assert(xlp->type != udb_chunk_type_free);
#endif
			/* got the segment, add it to the xl chunk list */
			if(xl_start != 0 && xl+xlsz != xl_start) {
				/* nonadjoining XL part, but they are aligned,
				 * so the space in between is whole Mbs,
				 * shift the later part(s) and continue */
				uint64_t m = xl_start - (xl+xlsz);
				assert(xl_start > xl+xlsz);
				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
				free_xl_space(base, alloc, xl+xlsz, m);
				move_xl_list(base, alloc, xl_start, xl_sz, m);
				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			}
			xl_start = xl;
			xl_sz += xlsz;
			at = xl;
			continue;
			/* end of XL if */
		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
			|| exp > UDB_ALLOC_CHUNKS_MAX)
			break; /* special chunk or garbage */
		esz = (uint64_t)1<<exp;
		last = at - esz;
		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
			/* if xlstart continue looking to move stuff, but do
			 * not unlink this free segment */
			if(!xl_start) {
				/* it is a free chunk, remove it */
				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
				udb_alloc_unlink_fl(base, alloc, last, exp);
				alloc->disk->stat_free -= esz;
				alloc->disk->nextgrow = last;
				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
				/* and continue at this point */
			}
			at = last;
		} else if( (e2=have_free_for(alloc, exp)) ) {
			/* last entry can be allocated in free chunks
			 * move it to its new position, adjust rel_ptrs */
			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
			move_chunk(base, alloc, last, exp, esz, e2);
			if(xl_start) {
				last = coagulate_and_push(base, alloc,
					last, exp, esz);
			} else {
				/* shorten usage */
				alloc->disk->stat_free -= esz;
				alloc->disk->nextgrow = last;
			}
			alloc->udb->glob_data->rb_old = 0;
			alloc->udb->glob_data->rb_new = 0;
			alloc->udb->glob_data->rb_size = 0;
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
			/* and continue in front of it */
			at = last;
		} else {
			/* cannot compact this block, stop compacting */
			break;
		}
		/* if that worked, repeat it */
	}
	/* if we passed xl chunks, see if XL-chunklist can move */
	if(xl_start) {
		/* calculate free space in front of the XLchunklist. */
		/* has to be whole mbs of free space */
		/* if so, we can move the XL chunks.  Move them all back
		 * by the new free space. */
		/* this compacts very well, but the XL chunks can be moved
		 * multiple times; worst case for every mb freed a huge sized
		 * xlchunklist gets moved. */
		/* free space must be, since aligned and coagulated, in
		 * chunks of a whole MB */
		udb_void at = xl_start;
		uint64_t m = 0;
		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
				break;
			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
			m += UDB_ALLOC_CHUNK_SIZE;
			at = chunk;
		}
		if(m != 0) {
			assert(at+m == xl_start);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
			free_xl_space(base, alloc, at, m);
			move_xl_list(base, alloc, xl_start, xl_sz, m);
			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
		}
	}

	/* if enough free, shrink the file; re-mmap */
	if(enough_free(alloc)) {
		uint64_t nsize = alloc->disk->nextgrow;
		udb_base_shrink(alloc->udb, nsize);
		if(!udb_base_remap(alloc->udb, alloc, nsize))
			return 0;
	}
	return 1;
}
1659 
1660 #ifdef UDB_CHECK
1661 /** check that rptrs are really zero before free */
1662 void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
1663 {
1664 	(void)base;
1665 	(void)arg;
1666 	assert(p->data == 0);
1667 }
1668 #endif /* UDB_CHECK */
1669 
1670 /** free XL chunk as multiples of CHUNK_SIZE free segments */
1671 static void
1672 udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
1673 	size_t sz)
1674 {
1675 	uint64_t xlsz = fp->size;
1676 	uint64_t c;
1677 	/* lightweight check for buffer overflow in xl data */
1678 	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
1679 	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
1680 	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
1681 	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1682 #ifdef UDB_CHECK
1683 	/* check that relptrs in this chunk have been zeroed */
1684 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1685 		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
1686 		&udb_check_rptr_zero, NULL);
1687 #endif
1688 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1689 	/* update stats */
1690 	alloc->disk->stat_data -= sz;
1691 	alloc->disk->stat_alloc -= xlsz;
1692 	alloc->disk->stat_free += xlsz;
1693 	/* walk in reverse, so the front blocks go first on the list */
1694 	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
1695 	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE */
1696 	assert(f >= UDB_ALLOC_CHUNK_SIZE);
1697 	while(c >= f) {
1698 		/* free a block of CHUNK_SIZE (1 MB) */
1699 		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
1700 		c -= UDB_ALLOC_CHUNK_SIZE;
1701 	}
1702 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1703 }
1704 
1705 int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
1706 {
1707 	void* base;
1708 	/* lookup chunk ptr */
1709 	udb_void f;
1710 	udb_chunk_d* fp;
1711 	uint64_t esz;
1712 	int exp;
1713 	udb_void other;
1714 	int coagulated = 0;
1715 	if(!r)
1716 		return 1; /* free(NULL) does nothing */
1717 
1718 	/* lookup size of chunk */
1719 	base = alloc->udb->base;
1720 	/* XL blocks are handled separately below */
1721 	f = chunk_from_dataptr(r);
1722 	fp = UDB_CHUNK(f);
1723 	assert(fp->type != udb_chunk_type_free);
1724 
1725 	/* If it still has a ptrlist, that is trouble: the list was not
1726 	 * properly cleaned up (although one can imagine a wholesale
1727 	 * delete where that does not matter). */
1728 	assert(fp->ptrlist == 0);
1729 
1730 	/* set ptrlist to 0 so the relptr code cannot use it; robustness. */
1731 	fp->ptrlist = 0;
1732 
1733 	if(fp->exp == UDB_EXP_XL) {
1734 		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
1735 		/* compact */
1736 		return udb_alloc_compact(base, alloc);
1737 	}
1738 	/* it is a regular chunk of 2**exp size */
1739 	exp = (int)fp->exp;
1740 	esz = (uint64_t)1<<exp;
1741 	/* light check for e.g. buffer overflow of the data */
1742 	assert(sz < esz);
1743 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1744 #ifdef UDB_CHECK
1745 	/* check that relptrs in this chunk have been zeroed */
1746 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1747 		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
1748 #endif
1749 
1750 	/* update the stats */
1751 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1752 	alloc->disk->stat_data -= sz;
1753 	alloc->disk->stat_free += esz;
1754 	alloc->disk->stat_alloc -= esz;
1755 
1756 	/* if it can be merged with other free chunks, do so */
1757 	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
1758 		coagulated = 1;
1759 		/* unlink that other chunk and expand it (it has same size) */
1760 		udb_alloc_unlink_fl(base, alloc, other, exp);
1761 		/* merge up */
1762 		if(other < f)
1763 			f = other;
1764 		exp++;
1765 		esz <<= 1;
1766 	}
1767 	if(coagulated) {
1768 		/* put big free chunk into freelist, and init it */
1769 		udb_alloc_push_fl(base, alloc, f, exp);
1770 	} else {
1771 		/* we do not need to touch the last-exp-byte, which may save
1772 		 * a reference to that page of memory */
1773 		fp->type = udb_chunk_type_free;
1774 		fp->flags = 0;
1775 		udb_alloc_push_fl_noinit(base, alloc, f, exp);
1776 	}
1777 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1778 	/* compact */
1779 	return udb_alloc_compact(base, alloc);
1780 }
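
/*
 * Usage sketch (not compiled in): pair udb_alloc_space() with
 * udb_alloc_free(), passing the same size to both, since the exact
 * allocation size is not stored for regular chunks.  Re-derive data
 * pointers from udb->base after every allocation, because a grow may
 * have re-mmapped the file.  example_roundtrip is an illustrative
 * name, not part of this file.
 */
#if 0
static int
example_roundtrip(udb_base* udb)
{
	udb_void d = udb_alloc_space(udb->alloc, 100);
	if(!d)
		return 0; /* allocation (and possibly file grow) failed */
	memset(UDB_REL(udb->base, d), 0, 100);
	/* any relptrs inside the chunk must be zero before the free */
	return udb_alloc_free(udb->alloc, d, 100);
}
#endif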
1781 
1782 udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
1783 {
1784 	/* could be faster maybe, if grown? */
1785 	udb_void r = udb_alloc_space(alloc, sz);
1786 	if(!r) return r;
1787 	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
1788 	return r;
1789 }
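
/*
 * Sketch of use (illustrative only): copy an existing RAM structure
 * into the udb in one call.  struct example_rec is an illustrative
 * type, not part of this file.
 */
#if 0
struct example_rec { uint64_t a, b; };
static udb_void
example_store(udb_alloc* alloc)
{
	struct example_rec rec = { 1, 2 };
	/* allocates sizeof(rec) bytes and copies rec into the mmap;
	 * returns 0 when the allocation fails */
	return udb_alloc_init(alloc, &rec, sizeof(rec));
}
#endif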
1790 
1791 udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
1792 {
1793 	void* base = alloc->udb->base;
1794 	udb_void c, n, newd;
1795 	udb_chunk_d* cp, *np;
1796 	uint64_t avail;
1797 	uint8_t cp_type;
1798 	/* emulate POSIX realloc behaviour for r == 0 and sz == 0 */
1799 	if(r == 0)
1800 		return udb_alloc_space(alloc, sz);
1801 	if(sz == 0) {
1802 		if(!udb_alloc_free(alloc, r, osz))
1803 			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1804 		return 0;
1805 	}
1806 	c = chunk_from_dataptr(r);
1807 	cp = UDB_CHUNK(c);
1808 	cp_type = cp->type;
1809 	if(cp->exp == UDB_EXP_XL) {
1810 		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
1811 			- sizeof(uint64_t)*2;
1812 	} else {
1813 		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
1814 	}
1815 	if(sz <= avail)
1816 		return r;
1817 	/* reallocate it, and copy */
1818 	newd = udb_alloc_space(alloc, sz);
1819 	if(!newd) return 0;
1820 	/* re-base after alloc, since re-mmap may have happened */
1821 	base = alloc->udb->base;
1822 	cp = NULL; /* may be invalid now, robustness */
1823 	n = chunk_from_dataptr(newd);
1824 	np = UDB_CHUNK(n);
1825 	np->type = cp_type;
1826 	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
1827 	/* fixup ptrs */
1828 	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);
1829 
1830 	if(!udb_alloc_free(alloc, r, osz))
1831 		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1832 	return newd;
1833 }
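
/*
 * Sketch (illustrative only): growing an allocation.  If the new size
 * still fits in the chunk's available payload, the same offset comes
 * back; otherwise the data moves to a new chunk and the relptrs into
 * it are fixed up, so only the returned offset may be used afterwards.
 */
#if 0
static udb_void
example_grow_record(udb_alloc* alloc, udb_void r)
{
	udb_void n = udb_alloc_realloc(alloc, r, 100, 200);
	/* n == 0 means the new allocation failed; the old chunk at r
	 * is then still intact */
	return n;
}
#endif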
1834 
1835 int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
1836 {
1837 	const uint64_t mb = 1024*1024;
1838 	int exp = udb_alloc_exp_needed(sz);
1839 	uint64_t esz;
1840 	uint64_t want;
1841 	if(exp == UDB_EXP_XL)
1842 		esz = (sz&(mb-1))+mb;
1843 	else	esz = (uint64_t)1<<exp;
1844 	/* we need grow_end_calc to take into account alignment */
1845 	want = grow_end_calc(alloc, exp) + esz*(num-1);
1846 	assert(want >= alloc->udb->base_size);
1847 	if(!udb_base_grow_and_remap(alloc->udb, want)) {
1848 		log_msg(LOG_ERR, "failed to grow the specified amount");
1849 		return 0;
1850 	}
1851 	return 1;
1852 }
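
/*
 * Sketch (illustrative only): pre-grow the file so that a batch of
 * same-sized allocations can be served without growing and
 * re-mmapping for each one.  example_prealloc is an illustrative
 * name.
 */
#if 0
static int
example_prealloc(udb_base* udb)
{
	/* make room for eight allocations of 100 bytes each */
	return udb_alloc_grow(udb->alloc, 100, 8);
}
#endif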
1853 
1854 void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
1855 {
1856 	void* base = alloc->udb->base;
1857 	udb_void f = chunk_from_dataptr(r);
1858 	udb_chunk_d* fp = UDB_CHUNK(f);
1859 	/* not the 'free' type, that must be set by allocation routines */
1860 	assert(fp->type != udb_chunk_type_free);
1861 	assert(tp != udb_chunk_type_free);
1862 	fp->type = tp;
1863 }
1864 
1865 int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
1866 {
1867 	/* pointers are not valid before the header-size or after the
1868 	 * used-region of the mmap */
1869 	return ( (to+destsize) <= udb->base_size &&
1870 		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
1871 		(to+destsize) <= udb->alloc->disk->nextgrow);
1872 }
1873 
1874 int udb_valid_dataptr(udb_base* udb, udb_void to)
1875 {
1876 	void* base = udb->base;
1877 	udb_void ch;
1878 	int exp;
1879 	uint64_t esz;
1880 	/* our data chunks are aligned and at least 8 bytes */
1881 	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
1882 		return 0;
1883 	/* get the chunk pointer */
1884 	ch = chunk_from_dataptr(to);
1885 	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
1886 		return 0;
1887 	/* check its size */
1888 	exp = UDB_CHUNK(ch)->exp;
1889 	if(exp == UDB_EXP_XL) {
1890 		/* check XL chunk */
1891 		uint64_t xlsz;
1892 		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
1893 			return 0;
1894 		xlsz = UDB_XL_CHUNK(ch)->size;
1895 		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
1896 			return 0;
1897 		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
1898 			return 0;
1899 		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
1900 			!= xlsz)
1901 			return 0;
1902 		return 1;
1903 	}
1904 	/* check if regular chunk has matching end byte */
1905 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
1906 		return 0; /* cannot be a valid chunk */
1907 	esz = (uint64_t)1<<exp;
1908 	if(!udb_valid_offset(udb, ch+esz-1, 1))
1909 		return 0;
1910 	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
1911 		return 0;
1912 	return 1;
1913 }
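
/*
 * Sketch (illustrative only): validate an offset read from a
 * (possibly corrupt) file before dereferencing it, so bad data cannot
 * make the reader access memory outside the used region.
 */
#if 0
static void*
example_deref_checked(udb_base* udb, udb_void off)
{
	if(!udb_valid_dataptr(udb, off))
		return NULL; /* not a live, well-formed chunk */
	return UDB_REL(udb->base, off);
}
#endif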
1914 
1915 int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
1916 {
1917 	void* base = udb->base;
1918 	udb_void p;
1919 	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
1920 		return 0;
1921 	if(!udb_valid_dataptr(udb, to))
1922 		return 0;
1923 	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
1924 	while(p) {
1925 		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
1926 			return 0;
1927 		if(p == rptr)
1928 			return 1;
1929 		p = UDB_REL_PTR(p)->next;
1930 	}
1931 	return 0;
1932 }
1933 
1934 void udb_rel_ptr_init(udb_rel_ptr* ptr)
1935 {
1936 	memset(ptr, 0, sizeof(*ptr));
1937 }
1938 
1939 void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
1940 {
1941 	if(!ptr->data)
1942 		return;
1943 	if(ptr->prev) {
1944 		UDB_REL_PTR(ptr->prev)->next = ptr->next;
1945 	} else {
1946 		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
1947 	}
1948 	if(ptr->next) {
1949 		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
1950 	}
1951 }
1952 
1953 void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
1954 {
1955 	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
1956 	ptr->prev = 0;
1957 	ptr->next = chunk->ptrlist;
1958 	if(ptr->next)
1959 		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
1960 	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
1961 	ptr->data = to;
1962 }
1963 
1964 void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
1965 {
1966 	assert(to == 0 || to > 64);
1967 	udb_rel_ptr_unlink(base, ptr);
1968 	if(to)
1969 		udb_rel_ptr_link(base, ptr, to);
1970 	else	ptr->data = to;
1971 }
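
/*
 * Sketch (illustrative only): an on-disk record that points at
 * another chunk.  The udb_rel_ptr must itself live inside the mmap
 * and must be assigned through udb_rel_ptr_set(), so the destination
 * chunk's ptrlist stays consistent; that list is what allows realloc
 * and compaction to rewrite pointers when chunks move.  struct
 * example_node is an illustrative type, not part of this file.
 */
#if 0
struct example_node {
	udb_rel_ptr next;	/* relative pointer to another chunk */
	uint64_t value;
};
static void
example_link(udb_base* udb, udb_void a, udb_void b)
{
	struct example_node* ap = (struct example_node*)
		UDB_REL(udb->base, a);
	udb_rel_ptr_init(&ap->next);
	udb_rel_ptr_set(udb->base, &ap->next, b);
}
#endif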
1972 
1973 void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
1974 {
1975 	udb_void p = list;
1976 	while(p) {
1977 		UDB_REL_PTR(p)->data = to;
1978 		p = UDB_REL_PTR(p)->next;
1979 	}
1980 }
1981 
1982 #ifdef UDB_CHECK
1983 /** check that all pointers are validly chained */
1984 static void
1985 udb_check_ptrs_valid(udb_base* udb)
1986 {
1987 	size_t i;
1988 	udb_ptr* p, *prev;
1989 	for(i=0; i<udb->ram_size; i++) {
1990 		prev = NULL;
1991 		for(p=udb->ram_hash[i]; p; p=p->next) {
1992 			assert(p->prev == prev);
1993 			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
1994 				== i);
1995 			assert(p->base == &udb->base);
1996 			prev = p;
1997 		}
1998 	}
1999 }
2000 #endif /* UDB_CHECK */
2001 
2002 void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
2003 {
2004 #ifdef UDB_CHECK
2005 	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
2006 #endif
2007 	memset(ptr, 0, sizeof(*ptr));
2008 	ptr->base = &udb->base;
2009 }
2010 
2011 void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
2012 {
2013 	assert(newval == 0 || newval > 64);
2014 	if(ptr->data)
2015 		udb_base_unlink_ptr(udb, ptr);
2016 	ptr->data = newval;
2017 	if(newval)
2018 		udb_base_link_ptr(udb, ptr);
2019 }
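
/*
 * Sketch (illustrative only): a RAM-side cursor.  While linked, the
 * udb_ptr is tracked in the ram_hash and gets rewritten when the mmap
 * moves or its chunk is relocated.  Set it to 0 before it goes out of
 * scope, so it is unlinked again.  example_cursor is an illustrative
 * name.
 */
#if 0
static void
example_cursor(udb_base* udb, udb_void first, udb_void second)
{
	udb_ptr cur;
	udb_ptr_init(&cur, udb);
	udb_ptr_set(&cur, udb, first);
	/* ... access the data via UDB_REL(*cur.base, cur.data) ... */
	udb_ptr_set(&cur, udb, second);
	/* ... */
	udb_ptr_set(&cur, udb, 0); /* unlink before cur disappears */
}
#endif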
2020 
2021 int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
2022 	size_t sz)
2023 {
2024 	udb_void r;
2025 	r = udb_alloc_space(udb->alloc, sz);
2026 	if(!r) return 0;
2027 	udb_alloc_set_type(udb->alloc, r, type);
2028 	udb_ptr_init(ptr, udb);
2029 	udb_ptr_set(ptr, udb, r);
2030 	return 1;
2031 }
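
/*
 * Sketch (illustrative only): allocate a typed chunk and track it
 * with a udb_ptr in one step, then release it again.  Assumes a data
 * chunk type such as udb_chunk_type_data from udb.h.
 */
#if 0
static void
example_typed_alloc(udb_base* udb)
{
	udb_ptr p;
	if(!udb_ptr_alloc_space(&p, udb, udb_chunk_type_data, 128))
		return; /* allocation failed */
	memset(UDB_REL(*p.base, p.data), 0, 128);
	/* unlinks the udb_ptr and frees the chunk */
	udb_ptr_free_space(&p, udb, 128);
}
#endif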
2032 
2033 void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
2034 {
2035 	if(ptr->data) {
2036 		udb_void d = ptr->data;
2037 		udb_ptr_set(ptr, udb, 0);
2038 		udb_alloc_free(udb->alloc, d, sz);
2039 	}
2040 }
2041 
2042 udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
2043 {
2044 	udb_void f;
2045 	if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad */
2046 	f = chunk_from_dataptr(ptr->data);
2047 	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
2048 }
2049