xref: /openbsd-src/usr.sbin/nsd/udb.c (revision 0b7734b3d77bb9b21afec6f4621cae6c805dbd45)
1 /* udb.c - u(micro) data base.
2  * By W.C.A. Wijngaards
3  * Copyright 2010, NLnet Labs.
4  * BSD, see LICENSE.
5  */
6 #include "config.h"
7 #include "udb.h"
8 #include <string.h>
9 #include <errno.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <assert.h>
13 #include "lookup3.h"
14 #include "util.h"
15 
16 /* mmap and friends */
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 
22 /* portable definitions for systems that lack them: MAP_FAILED is -1 and MS_SYNC degrades to a plain flag value */
23 #ifndef MAP_FAILED
24 #define MAP_FAILED ((void*)-1)
25 #endif
26 #ifndef MS_SYNC
27 #define MS_SYNC 0
28 #endif
29 
30 /** move and fixup xl segment */
31 static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
32 	udb_void n, uint64_t sz, uint64_t startseg);
33 /** attempt to compact the data and move free space to the end */
34 static int udb_alloc_compact(void* base, udb_alloc* alloc);
35 
36 /** convert pointer to the data part to a pointer to the base of the chunk */
37 static udb_void
38 chunk_from_dataptr(udb_void data)
39 {
40 	/* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and
41 	 * that xl_chunk_d is aligned on UDB_ALLOC_CHUNK_SIZE (whole Mb) boundaries. */
42 	udb_void xl = data - sizeof(udb_xl_chunk_d);
43 	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
44 		return xl;
45 	return data - sizeof(udb_chunk_d);
46 }
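/* For example: an xl chunk always starts on an UDB_ALLOC_CHUNK_SIZE
 * boundary, so for xl data the subtraction of sizeof(udb_xl_chunk_d)
 * lands exactly on such a boundary and the alignment test matches;
 * for data in a normal chunk the same subtraction misses the chunk
 * start by the difference of the two header sizes, so (given suitable
 * header sizes) it does not land on a whole Mb boundary. */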
47 
48 udb_void chunk_from_dataptr_ext(udb_void data) {
49 	return chunk_from_dataptr(data);
50 }
51 
52 #ifndef NDEBUG
53 /** read last octet from a chunk */
54 static uint8_t
55 chunk_get_last(void* base, udb_void chunk, int exp)
56 {
57 	return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1));
58 }
59 #endif
60 
61 /** write last octet of a chunk */
62 static void
63 chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
64 {
65 	*((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)) = value;
66 }
67 
68 /** create udb_base from a file descriptor (must be at start of file) */
69 udb_base*
70 udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
71 	void* arg)
72 {
73 	uint64_t m, fsz;
74 	udb_glob_d g;
75 	ssize_t r;
76 	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
77 	if(!udb) {
78 		log_msg(LOG_ERR, "out of memory");
79 		close(fd);
80 		return NULL;
81 	}
82 	udb->fname = strdup(fname);
83 	if(!udb->fname) {
84 		log_msg(LOG_ERR, "out of memory");
85 		free(udb);
86 		close(fd);
87 		return NULL;
88 	}
89 	udb->walkfunc = walkfunc;
90 	udb->walkarg = arg;
91 	udb->fd = fd;
92 	udb->ram_size = 1024;
93 	udb->ram_mask = (int)udb->ram_size - 1;
94 	udb->ram_hash = (udb_ptr**)xalloc_array_zero(sizeof(udb_ptr*),
95 		udb->ram_size);
96 	if(!udb->ram_hash) {
97 		free(udb->fname);
98 		free(udb);
99 		log_msg(LOG_ERR, "out of memory");
100 		close(fd);
101 		return NULL;
102 	}
103 
104 	/* read magic */
105 	if((r=read(fd, &m, sizeof(m))) == -1) {
106 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
107 		goto fail;
108 	} else if(r != (ssize_t)sizeof(m)) {
109 		log_msg(LOG_ERR, "%s: file too short", fname);
110 		goto fail;
111 	}
112 	/* TODO: handle an endianness mismatch between file and host; see magic */
113 	if(m != UDB_MAGIC) {
114 		log_msg(LOG_ERR, "%s: wrong type of file", fname);
115 		goto fail;
116 	}
117 	/* read header */
118 	if((r=read(fd, &g, sizeof(g))) == -1) {
119 		log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno));
120 		goto fail;
121 	} else if(r != (ssize_t)sizeof(g)) {
122 		log_msg(LOG_ERR, "%s: file too short", fname);
123 		goto fail;
124 	}
125 	if(g.version != 0) {
126 		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
127 			(int)g.version);
128 		goto fail;
129 	}
130 	if(g.hsize < UDB_HEADER_SIZE) {
131 		log_msg(LOG_ERR, "%s: header size too small %d", fname,
132 			(int)g.hsize);
133 		goto fail;
134 	}
135 	if(g.hsize > UDB_HEADER_SIZE) {
136 		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
137 			(int)g.hsize);
138 		goto fail;
139 	}
140 	if(g.clean_close != 1) {
141 		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
142 			(int)g.clean_close);
143 		goto fail;
144 	}
145 	if(g.dirty_alloc != 0) {
146 		log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname,
147 			(int)g.dirty_alloc);
148 		goto fail;
149 	}
150 
151 	/* check file size correctly written, for 4.0.2 nsd.db failure */
152 	fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END);
153 	(void)lseek(fd, (off_t)0, SEEK_SET);
154 	if(g.fsize != fsz) {
155 		log_msg(LOG_WARNING, "%s: file size %llu but mmap header "
156 			"has size %llu", fname, (unsigned long long)fsz,
157 			(unsigned long long)g.fsize);
158 		goto fail;
159 	}
160 
161 	/* mmap it */
162 	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
163 		log_msg(LOG_ERR, "%s: file too short", fname);
164 		goto fail;
165 	}
166 	if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ {
167 		log_msg(LOG_WARNING, "%s: file size too large %llu",
168 			fname, (unsigned long long)g.fsize);
169 		goto fail;
170 	}
171 	udb->base_size = (size_t)g.fsize;
172 #ifdef HAVE_MMAP
173 	/* note the size_t casts must be there for portability, on some
174 	 * systems the layout of memory is otherwise broken. */
175 	udb->base = mmap(NULL, (size_t)udb->base_size,
176 		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
177 		(int)udb->fd, (off_t)0);
178 #else
179 	udb->base = MAP_FAILED; errno = ENOSYS;
180 #endif
181 	if(udb->base == MAP_FAILED) {
182 		udb->base = NULL;
183 		log_msg(LOG_ERR, "mmap(size %u) error: %s",
184 			(unsigned)udb->base_size, strerror(errno));
185 	fail:
186 		close(fd);
187 		free(udb->fname);
188 		free(udb->ram_hash);
189 		free(udb);
190 		return NULL;
191 	}
192 
193 	/* init completion */
194 	udb->glob_data = (udb_glob_d*)(udb->base+sizeof(uint64_t));
195 	r = 0;
196 	/* cannot normally be dirty here, because that case hits goto fail above */
197 	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
198 		r = 1;
199 	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
200 		(void*)udb->glob_data+sizeof(*udb->glob_data)));
201 	if(!udb->alloc) {
202 		log_msg(LOG_ERR, "out of memory");
203 		udb_base_free(udb);
204 		return NULL;
205 	}
206 	if(r) {
207 		/* and compact now, or resume compacting */
208 		udb_alloc_compact(udb, udb->alloc);
209 		udb_base_sync(udb, 1);
210 	}
211 	udb->glob_data->clean_close = 0;
212 
213 	return udb;
214 }
215 
216 udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
217 	void* arg)
218 {
219 	int fd = open(fname, O_RDWR);
220 	if(fd == -1) {
221 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
222 		return NULL;
223 	}
224 	return udb_base_create_fd(fname, fd, walkfunc, arg);
225 }
226 
227 /** init new udb_global structure */
228 static void udb_glob_init_new(udb_glob_d* g)
229 {
230 	memset(g, 0, sizeof(*g));
231 	g->hsize = UDB_HEADER_SIZE;
232 	g->fsize = UDB_HEADER_SIZE;
233 }
234 
235 /** write data to file and check result */
236 static int
237 write_fdata(const char* fname, int fd, void* data, size_t len)
238 {
239 	ssize_t w;
240 	if((w=write(fd, data, len)) == -1) {
241 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
242 		close(fd);
243 		return 0;
244 	} else if(w != (ssize_t)len) {
245 		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
246 		close(fd);
247 		return 0;
248 	}
249 	return 1;
250 }
251 
252 udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
253 	void* arg)
254 {
255 	uint64_t m;
256 	udb_glob_d g;
257 	udb_alloc_d a;
258 	uint64_t endsize = UDB_HEADER_SIZE;
259 	uint64_t endexp = 0;
260 	int fd = open(fname, O_CREAT|O_RDWR, 0600);
261 	if(fd == -1) {
262 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
263 		return NULL;
264 	}
265 	m = UDB_MAGIC;
266 	udb_glob_init_new(&g);
267 	udb_alloc_init_new(&a);
268 	g.clean_close = 1;
269 
270 	/* write new data to file (closes fd on error) */
271 	if(!write_fdata(fname, fd, &m, sizeof(m)))
272 		return NULL;
273 	if(!write_fdata(fname, fd, &g, sizeof(g)))
274 		return NULL;
275 	if(!write_fdata(fname, fd, &a, sizeof(a)))
276 		return NULL;
277 	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
278 		return NULL;
279 	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
280 		return NULL;
281 	/* rewind to start */
282 	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
283 		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
284 		close(fd);
285 		return NULL;
286 	}
287 	/* truncate to the right size */
288 	if(ftruncate(fd, (off_t)g.fsize) < 0) {
289 		log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname,
290 			(int)g.fsize, strerror(errno));
291 		close(fd);
292 		return NULL;
293 	}
294 	return udb_base_create_fd(fname, fd, walkfunc, arg);
295 }
296 
297 /** shrink the udb base if it has unused space at the end */
298 static void
299 udb_base_shrink(udb_base* udb, uint64_t nsize)
300 {
301 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
302 	udb->glob_data->fsize = nsize;
303 	/* sync, does not *seem* to be required on Linux, but it is
304 	   certainly required on OpenBSD.  Otherwise changed data is lost. */
305 #ifdef HAVE_MMAP
306 	msync(udb->base, udb->base_size, MS_ASYNC);
307 #endif
308 	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
309 		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
310 			(unsigned)nsize, strerror(errno));
311 	}
312 	udb->glob_data->dirty_alloc = udb_dirty_clean;
313 }
314 
315 void udb_base_close(udb_base* udb)
316 {
317 	if(!udb)
318 		return;
319 	if(udb->fd != -1 && udb->base && udb->alloc) {
320 		uint64_t nsize = udb->alloc->disk->nextgrow;
321 		if(nsize < udb->base_size)
322 			udb_base_shrink(udb, nsize);
323 	}
324 	if(udb->fd != -1) {
325 		udb->glob_data->clean_close = 1;
326 		close(udb->fd);
327 		udb->fd = -1;
328 	}
329 	if(udb->base) {
330 #ifdef HAVE_MMAP
331 		if(munmap(udb->base, udb->base_size) == -1) {
332 			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
333 		}
334 #endif
335 		udb->base = NULL;
336 	}
337 }
338 
339 void udb_base_free(udb_base* udb)
340 {
341 	if(!udb)
342 		return;
343 	udb_base_close(udb);
344 	udb_alloc_delete(udb->alloc);
345 	free(udb->ram_hash);
346 	free(udb->fname);
347 	free(udb);
348 }
349 
350 void udb_base_free_keep_mmap(udb_base* udb)
351 {
352 	if(!udb) return;
353 	if(udb->fd != -1) {
354 		close(udb->fd);
355 		udb->fd = -1;
356 	}
357 	udb->base = NULL;
358 	udb_alloc_delete(udb->alloc);
359 	free(udb->ram_hash);
360 	free(udb->fname);
361 	free(udb);
362 }
363 
364 void udb_base_sync(udb_base* udb, int wait)
365 {
366 	if(!udb) return;
367 #ifdef HAVE_MMAP
368 	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
369 		log_msg(LOG_ERR, "msync(%s) error %s",
370 			udb->fname, strerror(errno));
371 	}
372 #else
373 	(void)wait;
374 #endif
375 }
376 
377 /** hash a chunk pointer */
378 static uint32_t
379 chunk_hash_ptr(udb_void p)
380 {
381 	/* put p into an array of uint32 */
382 	uint32_t h[sizeof(p)/sizeof(uint32_t)];
383 	memcpy(&h, &p, sizeof(h));
384 	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
385 }
386 
387 /** check that the given pointer is on the bucket for the given offset */
388 int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
389 {
390 	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
391 	udb_ptr* p;
392 	assert((size_t)i < udb->ram_size);
393 	for(p = udb->ram_hash[i]; p; p=p->next) {
394 		if(p == ptr)
395 			return 1;
396 	}
397 	return 0;
398 }
399 
400 /** grow the ram array */
401 static void
402 grow_ram_hash(udb_base* udb, udb_ptr** newhash)
403 {
404 	size_t i;
405 	size_t osize= udb->ram_size;
406 	udb_ptr* p, *np;
407 	udb_ptr** oldhash = udb->ram_hash;
408 	udb->ram_size *= 2;
409 	udb->ram_mask <<= 1;
410 	udb->ram_mask |= 1;
411 	udb->ram_hash = newhash;
412 	/* have to link in every element in the old list into the new list*/
413 	for(i=0; i<osize; i++) {
414 		p = oldhash[i];
415 		while(p) {
416 			np = p->next;
417 			/* link into newhash */
418 			p->prev=NULL;
419 			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
420 			if(p->next) p->next->prev = p;
421 			/* go to next element of oldhash */
422 			p = np;
423 		}
424 	}
425 	free(oldhash);
426 }
427 
428 void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
429 {
430 	uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
431 	assert((size_t)i < udb->ram_size);
432 #ifdef UDB_CHECK
433 	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
434 #endif
435 	udb->ram_num++;
436 	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0x7fffffff) {
437 		/* grow the array, if allocation succeeds */
438 		udb_ptr** newram = (udb_ptr**)xalloc_array_zero(
439 			sizeof(udb_ptr*), udb->ram_size*2);
440 		if(newram) {
441 			grow_ram_hash(udb, newram);
442 		}
443 	}
444 	ptr->prev = NULL;
445 	ptr->next = udb->ram_hash[i];
446 	udb->ram_hash[i] = ptr;
447 	if(ptr->next)
448 		ptr->next->prev = ptr;
449 }
450 
451 void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
452 {
453 	assert(ptr->data);
454 #ifdef UDB_CHECK
455 	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
456 	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
457 #endif
458 	udb->ram_num--;
459 	if(ptr->next)
460 		ptr->next->prev = ptr->prev;
461 	if(ptr->prev)
462 		ptr->prev->next = ptr->next;
463 	else	{
464 		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
465 		assert((size_t)i < udb->ram_size);
466 		udb->ram_hash[i] = ptr->next;
467 	}
468 }
469 
470 /** change a set of ram ptrs to a new value */
471 static void
472 udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
473 {
474 	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
475 	udb_ptr* p, *np;
476 	/* edit them and move them into the new position */
477 	p = udb->ram_hash[io];
478 	while(p) {
479 		np = p->next;
480 		if(p->data == old) {
481 			udb_base_unlink_ptr(udb, p);
482 			p->data = newd;
483 			udb_base_link_ptr(udb, p);
484 		}
485 		p = np;
486 	}
487 }
488 
489 udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
490 {
491 	return &udb->glob_data->user_global;
492 }
493 
494 void udb_base_set_userdata(udb_base* udb, udb_void user)
495 {
496 #ifdef UDB_CHECK
497 	if(user) { assert(udb_valid_dataptr(udb, user)); }
498 #endif
499 	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
500 }
501 
502 void udb_base_set_userflags(udb_base* udb, uint8_t v)
503 {
504 	udb->glob_data->userflags = v;
505 }
506 
507 uint8_t udb_base_get_userflags(udb_base* udb)
508 {
509 	return udb->glob_data->userflags;
510 }
511 
512 /** re-mmap the udb to specified size */
513 static void*
514 udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
515 {
516 #ifdef HAVE_MMAP
517 	void* nb;
518 	/* for use with valgrind, do not use mremap, but the other version */
519 #ifdef MREMAP_MAYMOVE
520 	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
521 	if(nb == MAP_FAILED) {
522 		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
523 			udb->fname, (unsigned)nsize, strerror(errno));
524 		return 0;
525 	}
526 #else /* !MREMAP_MAYMOVE */
527 	/* use munmap-mmap to simulate mremap */
528 	if(munmap(udb->base, udb->base_size) != 0) {
529 		log_msg(LOG_ERR, "munmap(%s) error %s",
530 			udb->fname, strerror(errno));
531 	}
532 	/* provide hint for new location */
533 	/* note the size_t casts must be there for portability, on some
534 	 * systems the layout of memory is otherwise broken. */
535 	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
536 		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
537 	/* retry the mmap without the base address hint in case of ENOMEM (FreeBSD8),
538 	 * the kernel can then try to mmap it at a different location
539 	 * where more memory is available */
540 	if(nb == MAP_FAILED && errno == ENOMEM) {
541 		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
542 			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
543 	}
544 	if(nb == MAP_FAILED) {
545 		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
546 			udb->fname, (unsigned)nsize, strerror(errno));
547 		udb->base = NULL;
548 		return 0;
549 	}
550 #endif /* MREMAP_MAYMOVE */
551 	if(nb != udb->base) {
552 		/* fix up realpointers in udb and alloc */
553 		/* but mremap may have been nice and not move the base */
554 		udb->base = nb;
555 		udb->glob_data = (udb_glob_d*)(nb+sizeof(uint64_t));
556 		/* use passed alloc pointer because the udb->alloc may not
557 		 * be initialized yet */
558 		alloc->disk = (udb_alloc_d*)((void*)udb->glob_data
559 			+sizeof(*udb->glob_data));
560 	}
561 	udb->base_size = nsize;
562 	return nb;
563 #else /* HAVE_MMAP */
564 	(void)udb; (void)alloc; (void)nsize;
565 	return NULL;
566 #endif /* HAVE_MMAP */
567 }
568 
569 void
570 udb_base_remap_process(udb_base* udb)
571 {
572 	/* assume that fsize is still accessible */
573 	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
574 }
575 
576 /** grow file to specified size and re-mmap, return new base */
577 static void*
578 udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
579 {
580 	/* grow file by writing a single zero at that spot, the
581 	 * rest is filled in with zeroes. */
582 	uint8_t z = 0;
583 	ssize_t w;
584 
585 	assert(nsize > 0);
586 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
587 #ifdef HAVE_PWRITE
588 	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
589 #else
590 	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
591 		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
592 		return 0;
593 	}
594 	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
595 #endif
596 		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
597 			udb->fname, (unsigned)nsize, strerror(errno));
598 		return 0;
599 	} else if(w != (ssize_t)sizeof(z)) {
600 		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
601 			udb->fname, (unsigned)nsize);
602 		return 0;
603 	}
604 	udb->glob_data->fsize = nsize;
605 	udb->glob_data->dirty_alloc = udb_dirty_clean;
606 	return udb_base_remap(udb, udb->alloc, nsize);
607 }
608 
609 int udb_exp_size(uint64_t a)
610 {
611 	/* find enclosing value such that 2**x >= a */
612 	int x = 0;
613 	uint64_t i = a;
614 	assert(a != 0);
615 
616 	i --;
617 	/* could optimise this with uint8* access, depends on endianness */
618 	/* first whole bytes */
619 	while( (i&(~(uint64_t)0xff)) ) {
620 		i >>= 8;
621 		x += 8;
622 	}
623 	/* now details */
624 	while(i) {
625 		i >>= 1;
626 		x ++;
627 	}
628 	assert( ((uint64_t)1<<x) >= a);
629 	assert( x==0 || ((uint64_t)1<<(x-1)) < a);
630 	return x;
631 }
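/* Worked examples of the above (smallest x with 2**x >= a):
 *   udb_exp_size(1)    == 0,  since 2**0  == 1
 *   udb_exp_size(4096) == 12, since 2**12 == 4096 exactly
 *   udb_exp_size(4097) == 13, the next enclosing power of two. */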
632 
633 int udb_exp_offset(uint64_t o)
634 {
635 	/* this means measuring the number of 0 bits on the right */
636 	/* so, if exp zero bits then (o&(2**x-1))==0 */
637 	int x = 0;
638 	uint64_t i = o;
639 	assert(o != 0);
640 	/* first whole bytes */
641 	while( (i&(uint64_t)0xff) == 0) {
642 		i >>= 8;
643 		x += 8;
644 	}
645 	/* now details */
646 	while( (i&(uint64_t)0x1) == 0) {
647 		i >>= 1;
648 		x ++;
649 	}
650 	assert( o % ((uint64_t)1<<x) == 0);
651 	assert( o % ((uint64_t)1<<(x+1)) != 0);
652 	return x;
653 }
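/* Worked example: udb_exp_offset(24) == 3, because 24 = 0b11000 has
 * three trailing zero bits; 24 is a multiple of 2**3 but not of 2**4. */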
654 
655 void udb_alloc_init_new(udb_alloc_d* a)
656 {
657 	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
658 	memset(a, 0, sizeof(*a));
659 	/* set new allocations after header, as if allocated in a sequence
660 	 * of minsize allocations */
661 	a->nextgrow = UDB_HEADER_SIZE;
662 }
663 
664 /** fsck the file size, false if failed and file is useless */
665 static int
666 fsck_fsize(udb_base* udb, udb_alloc* alloc)
667 {
668 	off_t realsize;
669 	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
670 	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
671 	if(realsize == (off_t)-1) {
672 		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
673 		return 0;
674 	}
675 	udb->glob_data->fsize = (uint64_t)realsize;
676 	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
677 		return 0;
678 	udb->glob_data->dirty_alloc = udb_dirty_clean;
679 	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
680 	udb_base_sync(udb, 1);
681 	return 1;
682 }
683 
684 /** regenerate freelist: add a new free chunk, return next todo */
685 static udb_void
686 regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
687 {
688 	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
689 	uint64_t esz = (uint64_t)1<<exp;
690 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
691 		return 0;
692 	}
693 	cp->type = udb_chunk_type_free;
694 	cp->flags = 0;
695 	chunk_set_last(base, c, exp, (uint8_t)exp);
696 	cp->prev = 0;
697 	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
698 	if(cp->next)
699 		UDB_FREE_CHUNK(cp->next)->prev = c;
700 	regen->stat_free += esz;
701 	return c + esz;
702 }
703 
704 /** regenerate xl chunk, return next todo */
705 static udb_void
706 regen_xl(void* base, udb_void c, udb_alloc_d* regen)
707 {
708 	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
709 	uint64_t xlsz = cp->size;
710 	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
711 		return 0;
712 	}
713 	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
714 		return 0;
715 	}
716 	/* fixup end-size and end-expmarker */
717 	regen->stat_alloc += xlsz;
718 	return c + xlsz;
719 }
720 
721 /** regenerate data chunk, return next todo */
722 static udb_void
723 regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
724 {
725 	uint64_t esz = (uint64_t)1<<exp;
726 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
727 		return 0;
728 	}
729 	chunk_set_last(base, c, exp, (uint8_t)exp);
730 	regen->stat_alloc += esz;
731 	return c + esz;
732 }
733 
734 /** regenerate a relptr structure inside a data segment */
735 static void
736 regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
737 {
738 	udb_void* a = (udb_void*)arg;
739 	/* ignore 0 pointers */
740 	if(!rp->data)
741 		return;
742 
743 	/* edit relptrs that point to oldmoved to point to newmoved. */
744 	if(rp->data == a[0])
745 		rp->data = a[1];
746 
747 	/* regenerate relptr lists, add this item to the relptr list for
748 	 * the data that it points to */
749 	udb_rel_ptr_link(base, rp, rp->data);
750 }
751 
752 /** regenerate the relptrs stored in this data segment */
753 static void
754 regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
755 	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
756 {
757 	udb_void arg[2];
758 	arg[0] = rb_old; arg[1] = rb_new;
759 	/* walk through the structs here and put them on their respective
760 	 * relptr lists */
761 	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
762 		&regen_relptr_func, arg);
763 
764 }
765 
766 /** regenerate relptrlists in the file */
767 static void
768 regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
769 	udb_void rb_old, udb_void rb_new)
770 {
771 	udb_void at = alloc->udb->glob_data->hsize;
772 	/* clear all ptrlist start pointers in the file. */
773 	while(at < alloc->disk->nextgrow) {
774 		int exp = (int)UDB_CHUNK(at)->exp;
775 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
776 		if(exp == UDB_EXP_XL) {
777 			UDB_XL_CHUNK(at)->ptrlist = 0;
778 			at += UDB_XL_CHUNK(at)->size;
779 		} else if(tp == udb_chunk_type_free) {
780 			at += (uint64_t)1<<exp;
781 		} else { /* data chunk */
782 			UDB_CHUNK(at)->ptrlist = 0;
783 			at += (uint64_t)1<<exp;
784 		}
785 	}
786 	/* walk through all relptr structs and put on the right list. */
787 	at = alloc->udb->glob_data->hsize;
788 	while(at < alloc->disk->nextgrow) {
789 		udb_chunk_d* atp = UDB_CHUNK(at);
790 		int exp = (int)atp->exp;
791 		udb_chunk_type tp = (udb_chunk_type)atp->type;
792 		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
793 			(uint64_t)1<<exp);
794 		if(exp == UDB_EXP_XL) {
795 			assert(at != rb_old); /* should have been freed */
796 			regen_its_ptrs(base, udb, atp,
797 				((void*)atp)+sizeof(udb_xl_chunk_d),
798 				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
799 				rb_old, rb_new);
800 			at += sz;
801 		} else if(tp == udb_chunk_type_free) {
802 			at += sz;
803 		} else { /* data chunk */
804 			assert(at != rb_old); /* should have been freed */
805 			regen_its_ptrs(base, udb, atp,
806 				((void*)atp)+sizeof(udb_chunk_d),
807 				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
808 			at += sz;
809 		}
810 	}
811 }
812 
813 
814 /** mark free elements from former XL chunk space; later fixups pick that up */
815 static void
816 rb_mark_free_segs(void* base, udb_void s, uint64_t m)
817 {
818 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
819 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
820 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
821 	while(q >= s) {
822 		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
823 		UDB_CHUNK(q)->type = udb_chunk_type_free;
824 		q -= UDB_ALLOC_CHUNK_SIZE;
825 	}
826 }
827 
828 
829 /** fsck rollback or rollforward XL move results */
830 static int
831 fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
832 	uint64_t rb_size, uint64_t rb_seg)
833 {
834 
835 	if(rb_old <= rb_new)
836 		return 0; /* XL move one way */
837 	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
838 		return 0; /* not aligned */
839 	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
840 		return 0; /* not aligned */
841 	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
842 		return 0; /* not aligned */
843 	if(rb_new + rb_size <= rb_old) {
844 		/* not overlapping: resume copy */
845 		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
846 		/* and free up old piece(s) */
847 		rb_mark_free_segs(base, rb_old, rb_size);
848 	} else {
849 		/* overlapping, see what segment we stopped at
850 		 * and continue there. */
851 		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
852 		/* free up old piece(s); from the end of the moved segment,
853 		 * until the end of the old segment */
854 		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
855 			(rb_new+rb_size));
856 	}
857 	/* do not call fix_ptrs, regenptrs does the job */
858 	return 1;
859 }
860 
861 /** fsck rollback or rollforward move results */
862 static int
863 fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
864 	udb_void* make_free)
865 {
866 	if( (rb_size&(rb_size-1)) != 0)
867 		return 0; /* not powerof2 */
868 	if( (rb_old&(rb_size-1)) != 0)
869 		return 0; /* not aligned */
870 	if( (rb_new&(rb_size-1)) != 0)
871 		return 0; /* not aligned */
872 	/* resume copy */
873 	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
874 	/* do not call fix_ptrs, regenptrs does the job */
875 	/* make sure udb_old is freed */
876 	*make_free = rb_old;
877 	return 1;
878 }
879 
880 /** fsck the file and salvage, false if failed and file is useless */
881 static int
882 fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
883 {
884 	void* base = udb->base;
885 	udb_alloc_d regen;
886 	udb_void at = udb->glob_data->hsize;
887 	udb_void rb_old = udb->glob_data->rb_old;
888 	udb_void rb_new = udb->glob_data->rb_new;
889 	udb_void rb_seg = udb->glob_data->rb_seg;
890 	udb_void make_free = 0;
891 	uint64_t rb_size = udb->glob_data->rb_size;
892 	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
893 	/* walk through the file, use the exp values to see what can be
894 	 * salvaged */
895 	if(moved && rb_old && rb_new && rb_size) {
896 		if(rb_old+rb_size <= alloc->disk->nextgrow
897 			&& rb_new+rb_size <= alloc->disk->nextgrow) {
898 			/* we can use the move information to fix up the
899 			 * duplicate element (or partially moved element) */
900 			if(rb_size > 1024*1024) {
901 				/* XL chunk */
902 				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
903 					rb_size, rb_seg))
904 					return 0;
905 			} else {
906 				if(!fsck_rb(base, rb_old, rb_new, rb_size,
907 					&make_free))
908 					return 0;
909 			}
910 		}
911 	}
912 
913 	/* rebuild freelists */
914 	/* recalculate stats in alloc (except 'stat_data') */
915 	/* possibly new end 'nextgrow' value */
916 	memset(&regen, 0, sizeof(regen));
917 	regen.nextgrow = alloc->disk->nextgrow;
918 	while(at < regen.nextgrow) {
919 		/* figure out this chunk */
920 		int exp = (int)UDB_CHUNK(at)->exp;
921 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
922 		/* consistency check possible here with end-exp */
923 		if(tp == udb_chunk_type_free || at == make_free) {
924 			at = regen_free(base, at, exp, &regen);
925 			if(!at) return 0;
926 		} else if(exp == UDB_EXP_XL) {
927 			/* allocated data of XL size */
928 			at = regen_xl(base, at, &regen);
929 			if(!at) return 0;
930 		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
931 			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
932 			/* allocated data */
933 			at = regen_data(base, at, exp, &regen);
934 			if(!at) return 0;
935 		} else {
936 			/* garbage; this must be EOF then */
937 			regen.nextgrow = at;
938 			break;
939 		}
940 	}
941 	*alloc->disk = regen;
942 
943 	/* rebuild relptr lists */
944 	regen_ptrlist(base, udb, alloc, rb_old, rb_new);
945 
946 	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
947 		udb->fname);
948 	udb->glob_data->rb_old = 0;
949 	udb->glob_data->rb_new = 0;
950 	udb->glob_data->rb_size = 0;
951 	udb->glob_data->dirty_alloc = udb_dirty_clean;
952 	udb_base_sync(udb, 1);
953 	return 1;
954 }
955 
956 
957 udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
958 {
959 	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
960 	if(!alloc)
961 		return NULL;
962 	alloc->udb = udb;
963 	alloc->disk = disk;
964 	/* see if committed but uncompleted actions need to be done */
965 	/* preserves the alloc state */
966 	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
967 		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
968 			if(fsck_fsize(udb, alloc))
969 				return alloc;
970 		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
971 			if(fsck_file(udb, alloc, 0))
972 				return alloc;
973 		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
974 			if(fsck_file(udb, alloc, 1))
975 				return alloc;
976 		}
977 		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
978 			(int)udb->glob_data->dirty_alloc);
979 		free(alloc);
980 		return NULL;
981 	}
982 	return alloc;
983 }
984 
985 void udb_alloc_delete(udb_alloc* alloc)
986 {
987 	if(!alloc) return;
988 	free(alloc);
989 }
990 
991 /** unlink this element from its freelist */
992 static void
993 udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
994 {
995 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
996 	assert(chunk);
997 	/* chunk is a free chunk */
998 	assert(fp->exp == (uint8_t)exp);
999 	assert(fp->type == udb_chunk_type_free);
1000 	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
1001 	/* and thus freelist not empty */
1002 	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
1003 	/* unlink */
1004 	if(fp->prev)
1005 		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
1006 	else	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1007 	if(fp->next)
1008 		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
1009 }
1010 
1011 /** pop first element off freelist, list may not be empty */
1012 static udb_void
1013 udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
1014 {
1015 	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1016 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1017 	assert(f);
1018 	assert(fp->exp == (uint8_t)exp);
1019 	assert(fp->type == udb_chunk_type_free);
1020 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1021 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1022 	if(fp->next) {
1023 		UDB_FREE_CHUNK(fp->next)->prev = 0;
1024 	}
1025 	return f;
1026 }
1027 
1028 /** push new element onto freelist */
1029 static void
1030 udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
1031 {
1032 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1033 	assert(f);
1034 	fp->exp = (uint8_t)exp;
1035 	fp->type = udb_chunk_type_free;
1036 	fp->flags = 0;
1037 	fp->prev = 0;
1038 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1039 	if(fp->next)
1040 		UDB_FREE_CHUNK(fp->next)->prev = f;
1041 	chunk_set_last(base, f, exp, (uint8_t)exp);
1042 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1043 }
1044 
1045 /** push new element onto freelist - do not initialize the elt */
1046 static void
1047 udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
1048 {
1049 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1050 	assert(f);
1051 	assert(fp->exp == (uint8_t)exp);
1052 	assert(fp->type == udb_chunk_type_free);
1053 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1054 	fp->prev = 0;
1055 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1056 	if(fp->next)
1057 		UDB_FREE_CHUNK(fp->next)->prev = f;
1058 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1059 }
1060 
1061 /** add free chunks at end until specified alignment occurs */
1062 static void
1063 grow_align(void* base, udb_alloc* alloc, uint64_t esz)
1064 {
1065 	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
1066 		/* the nextgrow is not a whole multiple of esz. */
1067 		/* grow a free chunk of max allowed size */
1068 		int fexp = udb_exp_offset(alloc->disk->nextgrow);
1069 		uint64_t fsz = (uint64_t)1<<fexp;
1070 		udb_void f = alloc->disk->nextgrow;
1071 		udb_void fn = alloc->disk->nextgrow+fsz;
1072 		assert(fn <= alloc->udb->base_size);
1073 		alloc->disk->stat_free += fsz;
1074 		udb_alloc_push_fl(base, alloc, f, fexp);
1075 		/* now increase nextgrow to commit that free chunk */
1076 		alloc->disk->nextgrow = fn;
1077 	}
1078 }
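/* Illustrative run of the alignment loop, assuming nextgrow is 0x3000
 * and the requested alignment esz is 0x4000: udb_exp_offset(0x3000) is
 * 12, so a 4096-byte free chunk is pushed at 0x3000 and nextgrow
 * becomes 0x4000, which is aligned, and the loop stops. */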
1079 
1080 /** append chunks at end of memory space to get size exp, return dataptr */
1081 static udb_void
1082 grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
1083 {
1084 	uint64_t esz = (uint64_t)1<<exp;
1085 	udb_void ret;
1086 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1087 	grow_align(base, alloc, esz);
1088 	/* free chunks are grown, grow the one we want to use */
1089 	ret = alloc->disk->nextgrow;
1090 	/* take a new alloced chunk into use */
1091 	UDB_CHUNK(ret)->exp = (uint8_t)exp;
1092 	UDB_CHUNK(ret)->flags = 0;
1093 	UDB_CHUNK(ret)->ptrlist = 0;
1094 	UDB_CHUNK(ret)->type = udb_chunk_type_data;
1095 	/* store last octet */
1096 	chunk_set_last(base, ret, exp, (uint8_t)exp);
1097 	/* update stats */
1098 	alloc->disk->stat_alloc += esz;
1099 	alloc->disk->stat_data += sz;
1100 	/* now increase nextgrow to commit this newly allocated chunk */
1101 	alloc->disk->nextgrow += esz;
1102 	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
1103 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1104 	return ret + sizeof(udb_chunk_d); /* ptr to data */
1105 }
1106 
1107 /** calculate how much space is necessary to grow for this exp */
1108 static uint64_t
1109 grow_end_calc(udb_alloc* alloc, int exp)
1110 {
1111 	uint64_t sz = (uint64_t)1<<exp;
1112 	uint64_t ng = alloc->disk->nextgrow;
1113 	uint64_t res;
1114 	/* if nextgrow is already a whole multiple of 2**exp, no extra growth needed, only the size */
1115 	if( (ng & (sz-1)) == 0) {
1116 		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
1117 		return ng+sz; /* must grow exactly 2**exp */
1118 	}
1119 	/* grow until the next whole multiple of 2**exp, and then we need 2**exp as well */
1120 	/* so, round ng down to whole sz (basically  ng-ng%sz, or ng/sz*sz)
1121 	 * and then add the sz twice (go up to whole sz, and to allocate) */
1122 	res = (ng & ~(sz-1)) + 2*sz;
1123 	return res;
1124 }
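/* Worked example: with exp == 12 (sz == 4096) and nextgrow == 10000,
 * 10000 is not 4096-aligned, so res = (10000 & ~4095) + 2*4096 = 16384;
 * with nextgrow == 8192 the aligned case applies and the result is
 * 8192 + 4096 = 12288. */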
1125 
1126 /** see if we need to grow more than specified to enable sustained growth */
1127 static uint64_t
1128 grow_extra_check(udb_alloc* alloc, uint64_t ge)
1129 {
1130 	const uint64_t mb = 1024*1024;
1131 	uint64_t bsz = alloc->udb->base_size;
1132 	if(bsz <= mb) {
1133 		/* below 1 Mb, double sizes for exponential growth */
1134 		/* takes about 15 times to grow to 1Mb */
1135 		if(ge < bsz*2)
1136 			return bsz*2;
1137 	} else {
1138 		uint64_t gnow = ge - bsz;
1139 		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
1140 		 * in whole megabytes rounded up. */
1141 		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
1142 		if(gnow < want)
1143 			return bsz + want;
1144 	}
1145 	return ge;
1146 }
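/* Worked example: for a 10 Mb file, want = ((10 Mb / 8) & ~(1 Mb - 1))
 * + 1 Mb = 2 Mb, so any growth request that adds less than 2 Mb is
 * rounded up to a 12 Mb file; below 1 Mb the file simply doubles. */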
1147 
1148 /** see if free space is enough to warrant shrink (while file is open) */
1149 static int
1150 enough_free(udb_alloc* alloc)
1151 {
1152 	if(alloc->udb->base_size <= 2*1024*1024) {
1153 		/* the file was grown by doubling below 1 Mb (so up to 2 Mb here);
1154 		 * do not shrink unless the used size fits in 1/3 of the file */
1155 		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
1156 			return 1;
1157 	} else {
1158 		/* grown 12.5%, shrink 25% if possible, at least one mb */
1159 		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
1160 		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
1161 		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
1162 			|| alloc->udb->base_size < 4*1024*1024))
1163 			return 1;
1164 	}
1165 	return 0;
1166 }
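/* Worked example: an 8 Mb file with nextgrow at 5 Mb has 3 Mb of free
 * tail space; 3 Mb >= 1 Mb and 3 Mb * 4 >= 8 Mb, so the file qualifies
 * for shrinking.  A 2 Mb file only shrinks once nextgrow fits in 1/3
 * of the file size. */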
1167 
1168 /** grow space for a chunk of 2**exp and return dataptr */
1169 static udb_void
1170 udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
1171 {
1172 	/* commit the grow action
1173 	 * - the file grow only changes filesize, but not the nextgrow.
1174 	 * - taking space after nextgrow into use (as free space),
1175 	 *   is like free-ing a chunk (one at a time).
1176 	 * - and the last chunk taken into use is like alloc.
1177 	 */
1178 	/* predict how much free space is needed for this */
1179 	uint64_t grow_end = grow_end_calc(alloc, exp);
1180 	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
1181 	if(grow_end <= alloc->udb->base_size) {
1182 		/* we can do this with the available space */
1183 		return grow_chunks(base, alloc, sz, exp);
1184 	}
1185 	/* we have to grow the file, re-mmap */
1186 	/* see if we need to grow a little more, to avoid endless grow
1187 	 * efforts on adding data */
1188 	grow_end = grow_extra_check(alloc, grow_end);
1189 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1190 		return 0; /* mmap or write failed (disk or mem full) */
1191 	}
1192 	/* we have enough space now */
1193 	assert(grow_end <= alloc->udb->base_size);
1194 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1195 	return grow_chunks(base, alloc, sz, exp);
1196 }
1197 
1198 /** take XL allocation into use at end of file, return dataptr */
1199 static udb_void
1200 grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
1201 {
1202 	udb_void ret;
1203 	udb_xl_chunk_d* p;
1204 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1205 
1206 	/* align growth to whole mbs */
1207 	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);
1208 
1209 	/* grow XL segment */
1210 	ret = alloc->disk->nextgrow;
1211 	p = UDB_XL_CHUNK(ret);
1212 	p->exp = UDB_EXP_XL;
1213 	p->size = xlsz;
1214 	p->flags = 0;
1215 	p->ptrlist = 0;
1216 	p->type = udb_chunk_type_data;
1217 
1218 	/* also put size and marker at end for compaction */
1219 	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
1220 	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;
1221 
1222 	/* stats */
1223 	alloc->disk->stat_data += sz;
1224 	alloc->disk->stat_alloc += xlsz;
1225 	/* now increase the nextgrow to commit this xl chunk */
1226 	alloc->disk->nextgrow += xlsz;
1227 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1228 	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
1229 }
1230 
1231 /** make space for XL allocation */
1232 static udb_void
1233 udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
1234 {
1235 	/* allocate whole mbs of space, at end of space */
1236 	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
1237 	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
1238 	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
1239 	assert(need >= asz);
1240 	if(grow_end <= alloc->udb->base_size) {
1241 		/* can do this in available space */
1242 		return grow_xl(base, alloc, need, sz);
1243 	}
1244 	/* have to grow file and re-mmap */
1245 	grow_end = grow_extra_check(alloc, grow_end);
1246 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1247 		return 0; /* mmap or write failed (disk or mem full) */
1248 	}
1249 	/* we have enough space now */
1250 	assert(grow_end <= alloc->udb->base_size);
1251 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1252 	return grow_xl(base, alloc, need, sz);
1253 }
1254 
1255 /** divide big(2**e2) into pieces so 2**exp fits */
1256 static udb_void
1257 udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
1258 	int exp)
1259 {
1260 	int e = e2;
1261 	uint64_t sz = (uint64_t)1<<e2;
1262 	assert(big && e2 > exp);
1263 	/* so the returned piece to use is the first piece,
1264 	 * offload the later half until it fits */
1265 	do {
1266 		sz >>= 1; /* divide size of big by two */
1267 		e--;      /* that means its exp is one smaller */
1268 		udb_alloc_push_fl(base, alloc, big+sz, e);
1269 	} while(e != exp);
1270 	/* exit loop when last pushed is same size as what we want */
1271 	return big;
1272 }
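/* Example of the subdivision, assuming e2 == 10 (1024 bytes) and
 * exp == 7 (128 bytes): the loop pushes free chunks of 512 at big+512,
 * 256 at big+256 and 128 at big+128, then returns big itself as the
 * 128-byte piece to use; every pushed piece is a buddy-aligned half. */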
1273 
1274 /** returns the exponent size of the chunk needed for data sz */
1275 static int
1276 udb_alloc_exp_needed(size_t sz)
1277 {
1278 	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
1279 	if(asz > UDB_ALLOC_CHUNK_SIZE) {
1280 		return UDB_EXP_XL;
1281 	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
1282 		return UDB_ALLOC_CHUNK_MINEXP;
1283 	}
1284 	return udb_exp_size(asz);
1285 }
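/* Worked example, assuming for illustration that sizeof(udb_chunk_d)
 * is 16: a request of sz == 100 gives asz == 117 (data + header + the
 * last exp octet), and the enclosing power of two is 2**udb_exp_size(117)
 * == 128 bytes (or the minimum chunk size, if that happens to be larger);
 * anything above UDB_ALLOC_CHUNK_SIZE becomes an XL allocation. */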
1286 
1287 udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
1288 {
1289 	void* base = alloc->udb->base;
1290 	/* calculate actual allocation size */
1291 	int e2, exp = udb_alloc_exp_needed(sz);
1292 	if(exp == UDB_EXP_XL)
1293 		return udb_alloc_xl_space(base, alloc, sz);
1294 	/* see if there is a free chunk of that size exactly */
1295 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
1296 		/* snip from freelist, udb_chunk_d */
1297 		udb_void ret;
1298 		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1299 		ret = udb_alloc_pop_fl(base, alloc, exp);
1300 		/* use it - size octets already OK */
1301 		UDB_CHUNK(ret)->flags = 0;
1302 		UDB_CHUNK(ret)->ptrlist = 0;
1303 		UDB_CHUNK(ret)->type = udb_chunk_type_data;
1304 		/* update stats */
1305 		alloc->disk->stat_data += sz;
1306 		alloc->disk->stat_alloc += (1<<exp);
1307 		assert(alloc->disk->stat_free >= (1u<<exp));
1308 		alloc->disk->stat_free -= (1<<exp);
1309 		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1310 		return ret + sizeof(udb_chunk_d); /* ptr to data */
1311 	}
1312 	/* see if we can subdivide a larger chunk */
1313 	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1314 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1315 			udb_void big, ret; /* udb_chunk_d */
1316 			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1317 			big = udb_alloc_pop_fl(base, alloc, e2);
1318 			/* push other parts onto freelists (needs inited) */
1319 			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
1320 			/* use final part (needs inited) */
1321 			UDB_CHUNK(ret)->exp = (uint8_t)exp;
1322 			/* if the process stops here, the new exp just makes this look like a smaller free chunk */
1323 			UDB_CHUNK(ret)->flags = 0;
1324 			UDB_CHUNK(ret)->ptrlist = 0;
1325 			/* set type to commit data chunk */
1326 			UDB_CHUNK(ret)->type = udb_chunk_type_data;
1327 			/* store last octet */
1328 			chunk_set_last(base, ret, exp, (uint8_t)exp);
1329 			/* update stats */
1330 			alloc->disk->stat_data += sz;
1331 			alloc->disk->stat_alloc += (1<<exp);
1332 			assert(alloc->disk->stat_free >= (1u<<exp));
1333 			alloc->disk->stat_free -= (1<<exp);
1334 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1335 			return ret + sizeof(udb_chunk_d); /* ptr to data */
1336 		}
1337 	/* we need to grow an extra chunk */
1338 	return udb_alloc_grow_space(base, alloc, sz, exp);
1339 }
1340 
1341 /** see if there is free space to allocate a chunk into */
1342 static int
1343 have_free_for(udb_alloc* alloc, int exp)
1344 {
1345 	int e2;
1346 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
1347 		return exp;
1348 	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1349 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1350 			return e2;
1351 		}
1352 	return 0;
1353 }
1354 
1355 /** fix relptr prev and next for moved relptr structures */
1356 static void
1357 chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
1358 {
1359 	udb_void* data = (udb_void*)arg;
1360 	udb_void r;
1361 	if(!rp->data)
1362 		return;
1363 	r = UDB_SYSTOREL(base, rp);
1364 	if(rp->next)
1365 		UDB_REL_PTR(rp->next)->prev = r;
1366 	if(rp->prev)
1367 		UDB_REL_PTR(rp->prev)->next = r;
1368 	else	{
1369 		/* if this is a pointer to its own chunk, fix it up;
1370 		 * the data ptr gets set by relptr_edit later. */
1371 		if(rp->data == data[0])
1372 			UDB_CHUNK(data[1])->ptrlist = r;
1373 		else	UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
1374 	}
1375 }
1376 
1377 /** fix pointers from and to a moved chunk */
1378 static void
1379 chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
1380 	uint64_t dsz, udb_void olddata)
1381 {
1382 	udb_void d[2];
1383 	d[0] = olddata;
1384 	d[1] = data;
1385 	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
1386 		dsz, &chunk_fix_ptr_each, d);
1387 	udb_rel_ptr_edit(base, cp->ptrlist, data);
1388 	udb_base_ram_ptr_edit(udb, olddata, data);
1389 }
1390 
1391 /** move an allocated chunk to use a free chunk */
1392 static void
1393 move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
1394 	int e2)
1395 {
1396 	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
1397 	udb_chunk_d* rp;
1398 	udb_chunk_d* fp;
1399 	if(exp != e2) {
1400 		/* it is bigger, subdivide it */
1401 		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
1402 	}
1403 	assert(res != f);
1404 	/* setup rollback information */
1405 	alloc->udb->glob_data->rb_old = f;
1406 	alloc->udb->glob_data->rb_new = res;
1407 	alloc->udb->glob_data->rb_size = esz;
1408 	/* take the res, exp into use */
1409 	rp = UDB_CHUNK(res);
1410 	fp = UDB_CHUNK(f);
1411 	/* copy over the data */
1412 	memcpy(rp, fp, esz);
1413 	/* adjust rel ptrs */
1414 	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
1415 		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));
1416 
1417 	/* do not freeup the fp; caller does that */
1418 }
1419 
1420 /** unlink several free elements to overwrite with xl chunk */
1421 static void
1422 free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
1423 {
1424 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
1425 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
1426 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
1427 	while(q >= s) {
1428 		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
1429 		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
1430 		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
1431 		q -= UDB_ALLOC_CHUNK_SIZE;
1432 	}
1433 }
1434 
1435 /** move an XL chunk, and keep track of segments for rollback */
1436 static void
1437 move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
1438 	uint64_t sz, uint64_t startseg)
1439 {
1440 	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1441 	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
1442 	uint64_t amount = xl - n;
1443 	assert(n < xl); /* move to compact */
1444 
1445 	/* setup move rollback */
1446 	udb->glob_data->rb_old = xl;
1447 	udb->glob_data->rb_new = n;
1448 	udb->glob_data->rb_size = sz;
1449 
1450 	/* is it overlapping? */
1451 	if(sz <= amount) {
1452 		memcpy(np, xlp, sz);
1453 	} else {
1454 		/* move and commit per 1M segment to avoid data loss */
1455 		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
1456 		for(seg = startseg; seg<maxseg; seg++) {
1457 			udb->glob_data->rb_seg = seg;
1458 			memcpy((void*)np+seg*UDB_ALLOC_CHUNK_SIZE,
1459 				(void*)xlp+seg*UDB_ALLOC_CHUNK_SIZE,
1460 				UDB_ALLOC_CHUNK_SIZE);
1461 		}
1462 
1463 	}
1464 }
1465 
1466 /** move list of XL chunks to the front by the shift amount */
1467 static void
1468 move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
1469 	uint64_t amount)
1470 {
1471 	udb_void xl = xl_start;
1472 	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1473 	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1474 	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1475 	while(xl < xl_start+xl_sz) {
1476 		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1477 		udb_void n = xl-amount;
1478 		uint64_t sz = xlp->size;
1479 		assert(xlp->exp == UDB_EXP_XL);
1480 		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
1481 		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
1482 			n+sizeof(udb_xl_chunk_d),
1483 			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
1484 			xl+sizeof(udb_xl_chunk_d));
1485 	}
1486 	alloc->disk->stat_free -= amount;
1487 	alloc->disk->nextgrow -= amount;
1488 	alloc->udb->glob_data->rb_old = 0;
1489 	alloc->udb->glob_data->rb_new = 0;
1490 	alloc->udb->glob_data->rb_size = 0;
1491 }
1492 
1493 /** see if free chunk can coagulate with another chunk, return other chunk */
1494 static udb_void
1495 coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
1496 	uint64_t esz)
1497 {
1498 	udb_void other = f^esz;
1499 	if(exp == UDB_ALLOC_CHUNKS_MAX)
1500 		return 0; /* no further merges */
1501 	if(other >= alloc->udb->base_size)
1502 		return 0; /* not allocated */
1503 	if(other >= alloc->disk->nextgrow)
1504 		return 0; /* not in use */
1505 	if(other < alloc->udb->glob_data->hsize)
1506 		return 0; /* cannot merge with header */
1507 		/* the header is also protected by the special exp marker */
1508 	/* see if the other chunk is a free chunk */
1509 
1510 	/* check closest marker to avoid large memory churn */
1511 	/* and also it makes XL allocations and header special markers work */
1512 	if(f > other) {
1513 		assert(f > 1); /* this is certain because of header */
1514 		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
1515 			/* can do it if the other part is a free chunk */
1516 			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
1517 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1518 				return other;
1519 		}
1520 	} else {
1521 		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
1522 			/* can do it if the other part is a free chunk */
1523 			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
1524 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1525 				return other;
1526 		}
1527 	}
1528 	return 0;
1529 }
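/* Buddy example: with esz == 0x1000, a free chunk at f == 0x3000 has
 * its buddy at other == f^esz == 0x2000 (the lower half of the
 * 0x2000-aligned 8 Kb block), while f == 0x2000 pairs with 0x3000;
 * merging the two yields one free chunk of the next exponent. */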
1530 
1531 /** coagulate and then add new free segment, return final free segment */
1532 static udb_void
1533 coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
1534 	uint64_t esz)
1535 {
1536 	/* new free chunk here, attempt coagulate */
1537 	udb_void other;
1538 	while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
1539 		/* unlink that other chunk */
1540 		udb_alloc_unlink_fl(base, alloc, other, exp);
1541 		/* merge up */
1542 		if(other < last)
1543 			last = other;
1544 		exp++;
1545 		esz <<= 1;
1546 	}
1547 	/* free the final segment */
1548 	udb_alloc_push_fl(base, alloc, last, exp);
1549 	return last;
1550 }
1551 
1552 /** attempt to compact the data and move free space to the end */
1553 int
1554 udb_alloc_compact(void* base, udb_alloc* alloc)
1555 {
1556 	udb_void last;
1557 	int exp, e2;
1558 	uint64_t esz;
1559 	uint64_t at = alloc->disk->nextgrow;
1560 	udb_void xl_start = 0;
1561 	uint64_t xl_sz = 0;
1562 	if(alloc->udb->inhibit_compact)
1563 		return 1;
1564 	alloc->udb->useful_compact = 0;
1565 	while(at > alloc->udb->glob_data->hsize) {
1566 		/* grab last entry */
1567 		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
1568 		if(exp == UDB_EXP_XL) {
1569 			/* for XL chunks:
1570 			 * - inspect the size of the XLchunklist at end
1571 			 * - attempt to compact in front of the XLchunklist
1572 			 */
1573 			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
1574 				at-sizeof(uint64_t)*2));
1575 			udb_void xl = at-xlsz;
1576 #ifndef NDEBUG
1577 			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1578 			assert(xlp->exp == UDB_EXP_XL);
1579 			assert(xlp->type != udb_chunk_type_free);
1580 #endif
1581 			/* got the segment; add it to the xl chunk list */
1582 			if(xl_start != 0 && xl+xlsz != xl_start) {
1583 				/* nonadjoining XL part, but they are aligned,
1584 				 * so the space in between is whole Mbs,
1585 				 * shift the later part(s) and continue */
1586 				uint64_t m = xl_start - (xl+xlsz);
1587 				assert(xl_start > xl+xlsz);
1588 				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1589 				free_xl_space(base, alloc, xl+xlsz, m);
1590 				move_xl_list(base, alloc, xl_start, xl_sz, m);
1591 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1592 			}
1593 			xl_start = xl;
1594 			xl_sz += xlsz;
1595 			at = xl;
1596 			continue;
1597 			/* end of XL if */
1598 		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
1599 			|| exp > UDB_ALLOC_CHUNKS_MAX)
1600 			break; /* special chunk or garbage */
1601 		esz = (uint64_t)1<<exp;
1602 		last = at - esz;
1603 		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
1604 		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
1605 			/* if xl_start is set, continue looking for stuff to
1606 			 * move, but do not unlink this free segment */
1607 			if(!xl_start) {
1608 				/* it is a free chunk, remove it */
1609 				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1610 				udb_alloc_unlink_fl(base, alloc, last, exp);
1611 				alloc->disk->stat_free -= esz;
1612 				alloc->disk->nextgrow = last;
1613 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1614 				/* and continue at this point */
1615 			}
1616 			at = last;
1617 		} else if( (e2=have_free_for(alloc, exp)) ) {
1618 			/* the last entry fits in the free chunks;
1619 			 * move it to its new position and adjust rel_ptrs */
1620 			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1621 			move_chunk(base, alloc, last, exp, esz, e2);
1622 			if(xl_start) {
1623 				last = coagulate_and_push(base, alloc,
1624 					last, exp, esz);
1625 			} else {
1626 				/* shorten usage */
1627 				alloc->disk->stat_free -= esz;
1628 				alloc->disk->nextgrow = last;
1629 			}
1630 			alloc->udb->glob_data->rb_old = 0;
1631 			alloc->udb->glob_data->rb_new = 0;
1632 			alloc->udb->glob_data->rb_size = 0;
1633 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1634 			/* and continue in front of it */
1635 			at = last;
1636 		} else {
1637 			/* cannot compact this block, stop compacting */
1638 			break;
1639 		}
1640 		/* if that worked, repeat it */
1641 	}
1642 	/* if we passed xl chunks, see if XL-chunklist can move */
1643 	if(xl_start) {
1644 		/* calculate free space in front of the XLchunklist. */
1645 		/* has to be whole mbs of free space */
1646 		/* if so, we can move the XL chunks.  Move them all back
1647 		 * by the new free space. */
1648 		/* this compacts very well, but the XL chunks can be moved
1649 		 * multiple times; worst case for every mb freed a huge sized
1650 		 * xlchunklist gets moved. */
1651 		/* free space must be, since aligned and coagulated, in
1652 		 * chunks of a whole MB */
1653 		udb_void at = xl_start;
1654 		uint64_t m = 0;
1655 		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
1656 			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
1657 			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
1658 				break;
1659 			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
1660 			m += UDB_ALLOC_CHUNK_SIZE;
1661 			at = chunk;
1662 		}
1663 		if(m != 0) {
1664 			assert(at+m == xl_start);
1665 			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1666 			free_xl_space(base, alloc, at, m);
1667 			move_xl_list(base, alloc, xl_start, xl_sz, m);
1668 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1669 		}
1670 	}
1671 
1672 	/* if enough free, shrink the file; re-mmap */
1673 	if(enough_free(alloc)) {
1674 		uint64_t nsize = alloc->disk->nextgrow;
1675 		udb_base_shrink(alloc->udb, nsize);
1676 		if(!udb_base_remap(alloc->udb, alloc, nsize))
1677 			return 0;
1678 	}
1679 	return 1;
1680 }
1681 
1682 int
1683 udb_compact(udb_base* udb)
1684 {
1685 	if(!udb) return 1;
1686 	if(!udb->useful_compact) return 1;
1687 	DEBUG(DEBUG_DBACCESS, 1, (LOG_INFO, "Compacting database..."));
1688 	return udb_alloc_compact(udb->base, udb->alloc);
1689 }
1690 
1691 void udb_compact_inhibited(udb_base* udb, int inhibit)
1692 {
1693 	if(!udb) return;
1694 	udb->inhibit_compact = inhibit;
1695 }
1696 
1697 #ifdef UDB_CHECK
1698 /** check that rptrs are really zero before free */
1699 void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
1700 {
1701 	(void)base;
1702 	(void)arg;
1703 	assert(p->data == 0);
1704 }
1705 #endif /* UDB_CHECK */
1706 
1707 /** free XL chunk as multiples of CHUNK_SIZE free segments */
1708 static void
1709 udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
1710 	size_t sz)
1711 {
1712 	uint64_t xlsz = fp->size;
1713 	uint64_t c;
1714 	/* lightweight check for buffer overflow in xl data */
1715 	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
1716 	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
1717 	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
1718 	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1719 #ifdef UDB_CHECK
1720 	/* check that relptrs in this chunk have been zeroed */
1721 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1722 		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
1723 		&udb_check_rptr_zero, NULL);
1724 #endif
1725 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1726 	/* update stats */
1727 	alloc->disk->stat_data -= sz;
1728 	alloc->disk->stat_alloc -= xlsz;
1729 	alloc->disk->stat_free += xlsz;
1730 	/* walk in reverse, so the front blocks go first on the list */
1731 	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
1732 	/* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE */
1733 	assert(f >= UDB_ALLOC_CHUNK_SIZE);
1734 	while(c >= f) {
1735 		/* free a block of CHUNK_SIZE (1 MB) */
1736 		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
1737 		c -= UDB_ALLOC_CHUNK_SIZE;
1738 	}
1739 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1740 }
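
/* Worked example for the loop in udb_free_xl above: an XL allocation of
 * 3*UDB_ALLOC_CHUNK_SIZE at offset f is pushed as three CHUNKS_MAX free
 * chunks, at f+2MB, then f+1MB, then f, so the lowest offset ends up at
 * the head of the free list. */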
1741 
1742 int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
1743 {
1744 	void* base;
1745 	/* lookup chunk ptr */
1746 	udb_void f;
1747 	udb_chunk_d* fp;
1748 	uint64_t esz;
1749 	int exp;
1750 	udb_void other;
1751 	int coagulated = 0;
1752 	if(!r)
1753 		return 1; /* free(NULL) does nothing */
1754 
1755 	/* lookup size of chunk */
1756 	base = alloc->udb->base;
1757 	/* fails for XL blocks */
1758 	f = chunk_from_dataptr(r);
1759 	fp = UDB_CHUNK(f);
1760 	assert(fp->type != udb_chunk_type_free);
1761 
1762 	/* if it still has a ptrlist, that is trouble: the relptrs into
1763 	 * this chunk were not properly cleaned up (although for a
1764 	 * wholesale delete that may not matter) */
1765 	assert(fp->ptrlist == 0);
1766 
1767 	/* set ptrlist to 0 to stop relptr from using it, robustness. */
1768 	fp->ptrlist = 0;
1769 
1770 	if(fp->exp == UDB_EXP_XL) {
1771 		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
1772 		/* compact */
1773 		if(alloc->udb->inhibit_compact) {
1774 			alloc->udb->useful_compact = 1;
1775 			return 1;
1776 		}
1777 		return udb_alloc_compact(base, alloc);
1778 	}
1779 	/* it is a regular chunk of 2**exp size */
1780 	exp = (int)fp->exp;
1781 	esz = (uint64_t)1<<exp;
1782 	/* light check for e.g. buffer overflow of the data */
1783 	assert(sz < esz);
1784 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1785 #ifdef UDB_CHECK
1786 	/* check that relptrs in this chunk have been zeroed */
1787 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1788 		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
1789 #endif
1790 
1791 	/* update the stats */
1792 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1793 	alloc->disk->stat_data -= sz;
1794 	alloc->disk->stat_free += esz;
1795 	alloc->disk->stat_alloc -= esz;
1796 
1797 	/* if it can be merged with other free chunks, do so */
1798 	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
1799 		coagulated = 1;
1800 		/* unlink that other chunk and expand it (it has same size) */
1801 		udb_alloc_unlink_fl(base, alloc, other, exp);
1802 		/* merge up */
1803 		if(other < f)
1804 			f = other;
1805 		exp++;
1806 		esz <<= 1;
1807 	}
1808 	if(coagulated) {
1809 		/* put big free chunk into freelist, and init it */
1810 		udb_alloc_push_fl(base, alloc, f, exp);
1811 	} else {
1812 		/* we do not need to touch the last-exp-byte, which may save
1813 		 * a reference to that page of memory */
1814 		fp->type = udb_chunk_type_free;
1815 		fp->flags = 0;
1816 		udb_alloc_push_fl_noinit(base, alloc, f, exp);
1817 	}
1818 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1819 	/* compact */
1820 	if(alloc->udb->inhibit_compact) {
1821 		alloc->udb->useful_compact = 1;
1822 		return 1;
1823 	}
1824 	return udb_alloc_compact(base, alloc);
1825 }
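
/* Usage sketch (the caller-side names are assumptions): relptrs that
 * point into a chunk must be set to zero before the chunk is freed, and
 * the free is given the size that was originally allocated:
 *
 *	udb_rel_ptr_set(udb->base, &parent->child, 0);
 *	udb_alloc_free(udb->alloc, child_off, child_size);
 */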
1826 
1827 udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
1828 {
1829 	/* this could perhaps be faster when the space has just been grown */
1830 	udb_void r = udb_alloc_space(alloc, sz);
1831 	if(!r) return r;
1832 	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
1833 	return r;
1834 }
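
/* Usage sketch (the record struct is an assumption): copy a fresh
 * in-memory record into newly allocated file space in one call:
 *
 *	struct rec r = { ... };
 *	udb_void off = udb_alloc_init(alloc, &r, sizeof(r));
 *	if(!off) { handle the allocation failure }
 */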
1835 
1836 udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
1837 {
1838 	void* base = alloc->udb->base;
1839 	udb_void c, n, newd;
1840 	udb_chunk_d* cp, *np;
1841 	uint64_t avail;
1842 	uint8_t cp_type;
1843 	/* emulate POSIX realloc semantics for a NULL pointer and zero size */
1844 	if(r == 0)
1845 		return udb_alloc_space(alloc, sz);
1846 	if(sz == 0) {
1847 		if(!udb_alloc_free(alloc, r, osz))
1848 			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1849 		return 0;
1850 	}
1851 	c = chunk_from_dataptr(r);
1852 	cp = UDB_CHUNK(c);
1853 	cp_type = cp->type;
1854 	if(cp->exp == UDB_EXP_XL) {
1855 		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
1856 			- sizeof(uint64_t)*2;
1857 	} else {
1858 		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
1859 	}
1860 	if(sz <= avail)
1861 		return r;
1862 	/* reallocate it, and copy */
1863 	newd = udb_alloc_space(alloc, sz);
1864 	if(!newd) return 0;
1865 	/* re-base after alloc, since re-mmap may have happened */
1866 	base = alloc->udb->base;
1867 	cp = NULL; /* may be invalid now, robustness */
1868 	n = chunk_from_dataptr(newd);
1869 	np = UDB_CHUNK(n);
1870 	np->type = cp_type;
1871 	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
1872 	/* fixup ptrs */
1873 	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);
1874 
1875 	if(!udb_alloc_free(alloc, r, osz))
1876 		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1877 	return newd;
1878 }
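
/* Usage sketch (names are assumptions): grow an object to a new size.
 * On success the returned offset replaces the old one and the relptrs
 * into the old chunk have already been fixed up; on failure (return 0
 * with a nonzero new size) the old chunk is left untouched:
 *
 *	udb_void nw = udb_alloc_realloc(alloc, off, oldsz, newsz);
 *	if(nw) off = nw;
 */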
1879 
1880 int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
1881 {
1882 	const uint64_t mb = 1024*1024;
1883 	int exp = udb_alloc_exp_needed(sz);
1884 	uint64_t esz;
1885 	uint64_t want;
1886 	if(exp == UDB_EXP_XL)
1887 		esz = (sz&(mb-1))+mb;
1888 	else	esz = (uint64_t)1<<exp;
1889 	/* we need grow_end_calc to take into account alignment */
1890 	want = grow_end_calc(alloc, exp) + esz*(num-1);
1891 	assert(want >= alloc->udb->base_size);
1892 	if(!udb_base_grow_and_remap(alloc->udb, want)) {
1893 		log_msg(LOG_ERR, "failed to grow the specified amount");
1894 		return 0;
1895 	}
1896 	return 1;
1897 }
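
/* Usage sketch (the record size and count are assumptions): reserve
 * room for a batch of upcoming allocations with a single grow and
 * remap, instead of letting every allocation grow the file by itself:
 *
 *	if(!udb_alloc_grow(alloc, record_size, 100))
 *		log_msg(LOG_ERR, "preallocation failed");
 */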
1898 
1899 void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
1900 {
1901 	void* base = alloc->udb->base;
1902 	udb_void f = chunk_from_dataptr(r);
1903 	udb_chunk_d* fp = UDB_CHUNK(f);
1904 	/* not the 'free' type, that must be set by allocation routines */
1905 	assert(fp->type != udb_chunk_type_free);
1906 	assert(tp != udb_chunk_type_free);
1907 	fp->type = tp;
1908 }
1909 
1910 int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
1911 {
1912 	/* pointers are not valid before the header-size or after the
1913 	 * used-region of the mmap */
1914 	return ( (to+destsize) <= udb->base_size &&
1915 		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
1916 		(to+destsize) <= udb->alloc->disk->nextgrow);
1917 }
1918 
1919 int udb_valid_dataptr(udb_base* udb, udb_void to)
1920 {
1921 	void* base = udb->base;
1922 	udb_void ch;
1923 	int exp;
1924 	uint64_t esz;
1925 	/* our data chunks are aligned and at least 8 bytes */
1926 	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
1927 		return 0;
1928 	/* get the chunk pointer */
1929 	ch = chunk_from_dataptr(to);
1930 	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
1931 		return 0;
1932 	/* check its size */
1933 	exp = UDB_CHUNK(ch)->exp;
1934 	if(exp == UDB_EXP_XL) {
1935 		/* check XL chunk */
1936 		uint64_t xlsz;
1937 		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
1938 			return 0;
1939 		xlsz = UDB_XL_CHUNK(ch)->size;
1940 		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
1941 			return 0;
1942 		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
1943 			return 0;
1944 		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
1945 			!= xlsz)
1946 			return 0;
1947 		return 1;
1948 	}
1949 	/* check if regular chunk has matching end byte */
1950 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
1951 		return 0; /* cannot be a valid chunk */
1952 	esz = (uint64_t)1<<exp;
1953 	if(!udb_valid_offset(udb, ch+esz-1, 1))
1954 		return 0;
1955 	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
1956 		return 0;
1957 	return 1;
1958 }
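
/* Example of the end-byte check above: a regular chunk with exp 12
 * occupies 4096 bytes and its last byte, at ch+4095, must again hold
 * the value 12; for an XL chunk the last byte holds UDB_EXP_XL and the
 * second-to-last uint64_t repeats the chunk size. */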
1959 
1960 int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
1961 {
1962 	void* base = udb->base;
1963 	udb_void p;
1964 	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
1965 		return 0;
1966 	if(!udb_valid_dataptr(udb, to))
1967 		return 0;
1968 	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
1969 	while(p) {
1970 		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
1971 			return 0;
1972 		if(p == rptr)
1973 			return 1;
1974 		p = UDB_REL_PTR(p)->next;
1975 	}
1976 	return 0;
1977 }
1978 
1979 void udb_rel_ptr_init(udb_rel_ptr* ptr)
1980 {
1981 	memset(ptr, 0, sizeof(*ptr));
1982 }
1983 
1984 void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
1985 {
1986 	if(!ptr->data)
1987 		return;
1988 	if(ptr->prev) {
1989 		UDB_REL_PTR(ptr->prev)->next = ptr->next;
1990 	} else {
1991 		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
1992 	}
1993 	if(ptr->next) {
1994 		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
1995 	}
1996 }
1997 
1998 void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
1999 {
2000 	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
2001 	ptr->prev = 0;
2002 	ptr->next = chunk->ptrlist;
2003 	if(ptr->next)
2004 		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
2005 	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
2006 	ptr->data = to;
2007 }
2008 
2009 void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
2010 {
2011 	assert(to == 0 || to > 64);
2012 	udb_rel_ptr_unlink(base, ptr);
2013 	if(to)
2014 		udb_rel_ptr_link(base, ptr, to);
2015 	else	ptr->data = to;
2016 }
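
/* Usage sketch (the node struct and names are assumptions): on-disk
 * structures keep their file offsets as udb_rel_ptr members and change
 * them through udb_rel_ptr_set(), so the destination chunk's ptrlist
 * stays consistent and the pointer can be rewritten when that chunk is
 * moved during compaction:
 *
 *	struct node { udb_rel_ptr next; };
 *	udb_rel_ptr_set(base, &n->next, other_off);
 *	udb_rel_ptr_set(base, &n->next, 0);	(unlink before freeing)
 */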
2017 
2018 void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
2019 {
2020 	udb_void p = list;
2021 	while(p) {
2022 		UDB_REL_PTR(p)->data = to;
2023 		p = UDB_REL_PTR(p)->next;
2024 	}
2025 }
2026 
2027 #ifdef UDB_CHECK
2028 /** check that all pointers are validly chained */
2029 static void
2030 udb_check_ptrs_valid(udb_base* udb)
2031 {
2032 	size_t i;
2033 	udb_ptr* p, *prev;
2034 	for(i=0; i<udb->ram_size; i++) {
2035 		prev = NULL;
2036 		for(p=udb->ram_hash[i]; p; p=p->next) {
2037 			assert(p->prev == prev);
2038 			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
2039 				== i);
2040 			assert(p->base == &udb->base);
2041 			prev = p;
2042 		}
2043 	}
2044 }
2045 #endif /* UDB_CHECK */
2046 
2047 void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
2048 {
2049 #ifdef UDB_CHECK
2050 	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
2051 #endif
2052 	memset(ptr, 0, sizeof(*ptr));
2053 	ptr->base = &udb->base;
2054 }
2055 
2056 void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
2057 {
2058 	assert(newval == 0 || newval > 64);
2059 	if(ptr->data)
2060 		udb_base_unlink_ptr(udb, ptr);
2061 	ptr->data = newval;
2062 	if(newval)
2063 		udb_base_link_ptr(udb, ptr);
2064 }
2065 
2066 int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
2067 	size_t sz)
2068 {
2069 	udb_void r;
2070 	r = udb_alloc_space(udb->alloc, sz);
2071 	if(!r) return 0;
2072 	udb_alloc_set_type(udb->alloc, r, type);
2073 	udb_ptr_init(ptr, udb);
2074 	udb_ptr_set(ptr, udb, r);
2075 	return 1;
2076 }
2077 
2078 void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
2079 {
2080 	if(ptr->data) {
2081 		udb_void d = ptr->data;
2082 		udb_ptr_set(ptr, udb, 0);
2083 		udb_alloc_free(udb->alloc, d, sz);
2084 	}
2085 }
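
/* Usage sketch (the chunk type 'tp' and size 'sz' are assumptions):
 * typical lifecycle of an in-RAM udb_ptr around one allocation:
 *
 *	udb_ptr p;
 *	if(udb_ptr_alloc_space(&p, udb, tp, sz)) {
 *		memset(UDB_REL(*p.base, p.data), 0, sz);
 *		udb_ptr_free_space(&p, udb, sz);
 *	}
 */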
2086 
2087 udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
2088 {
2089 	udb_void f;
2090 	if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad */
2091 	f = chunk_from_dataptr(ptr->data);
2092 	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
2093 }
2094