1 /* udb.c - u(micro) data base.
2  * By W.C.A. Wijngaards
3  * Copyright 2010, NLnet Labs.
4  * BSD, see LICENSE.
5  */
6 #include "config.h"
7 #include "udb.h"
8 #include <string.h>
9 #include <errno.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <assert.h>
13 #include "lookup3.h"
14 #include "util.h"
15 
16 /* mmap and friends */
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <fcntl.h>
20 #include <sys/mman.h>
21 
22 /* portable definitions for systems that lack them: MAP_FAILED is ((void*)-1) and MS_SYNC becomes a harmless zero flag */
23 #ifndef MAP_FAILED
24 #define MAP_FAILED ((void*)-1)
25 #endif
26 #ifndef MS_SYNC
27 #define MS_SYNC 0
28 #endif
29 
30 /** move and fixup xl segment */
31 static void move_xl_segment(void* base, udb_base* udb, udb_void xl,
32 	udb_void n, uint64_t sz, uint64_t startseg);
33 /** attempt to compact the data and move free space to the end */
34 static int udb_alloc_compact(void* base, udb_alloc* alloc);
35 
36 /** convert pointer to the data part to a pointer to the base of the chunk */
37 static udb_void
38 chunk_from_dataptr(udb_void data)
39 {
40 	/* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and
41 	 * that xl chunks are aligned on UDB_ALLOC_CHUNK_SIZE boundaries. */
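	/* an xl chunk's data pointer sits sizeof(udb_xl_chunk_d) past such a
	 * boundary, so subtracting that lands back on the boundary; a plain
	 * chunk's data pointer, offset by the smaller sizeof(udb_chunk_d),
	 * does not, which is what the test below relies on. */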
42 	udb_void xl = data - sizeof(udb_xl_chunk_d);
43 	if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0)
44 		return xl;
45 	return data - sizeof(udb_chunk_d);
46 }
47 
48 udb_void chunk_from_dataptr_ext(udb_void data) {
49 	return chunk_from_dataptr(data);
50 }
51 
52 #ifndef NDEBUG
53 /** read last octet from a chunk */
54 static uint8_t
55 chunk_get_last(void* base, udb_void chunk, int exp)
56 {
57 	return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1));
58 }
59 #endif
60 
61 /** write last octet of a chunk */
62 static void
63 chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value)
64 {
65 	*((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)) = value;
66 }
67 
68 /** create udb_base from a file descriptor (must be at start of file) */
69 udb_base*
70 udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc,
71 	void* arg)
72 {
73 	uint64_t m, fsz;
74 	udb_glob_d g;
75 	ssize_t r;
76 	udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb));
77 	if(!udb) {
78 		log_msg(LOG_ERR, "out of memory");
79 		close(fd);
80 		return NULL;
81 	}
82 	udb->fname = strdup(fname);
83 	if(!udb->fname) {
84 		log_msg(LOG_ERR, "out of memory");
85 		free(udb);
86 		close(fd);
87 		return NULL;
88 	}
89 	udb->walkfunc = walkfunc;
90 	udb->walkarg = arg;
91 	udb->fd = fd;
92 	udb->ram_size = 1024;
93 	udb->ram_mask = (int)udb->ram_size - 1;
94 	udb->ram_hash = (udb_ptr**)xalloc_array_zero(sizeof(udb_ptr*),
95 		udb->ram_size);
96 	if(!udb->ram_hash) {
97 		free(udb->fname);
98 		free(udb);
99 		log_msg(LOG_ERR, "out of memory");
100 		close(fd);
101 		return NULL;
102 	}
103 
104 	/* read magic */
105 	if((r=read(fd, &m, sizeof(m))) == -1) {
106 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
107 		goto fail;
108 	} else if(r != (ssize_t)sizeof(m)) {
109 		log_msg(LOG_ERR, "%s: file too short", fname);
110 		goto fail;
111 	}
112 	/* TODO: handle a file whose byte order differs from the host; the magic value would reveal it */
113 	if(m != UDB_MAGIC) {
114 		log_msg(LOG_ERR, "%s: wrong type of file", fname);
115 		goto fail;
116 	}
117 	/* read header */
118 	if((r=read(fd, &g, sizeof(g))) == -1) {
119 		log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno));
120 		goto fail;
121 	} else if(r != (ssize_t)sizeof(g)) {
122 		log_msg(LOG_ERR, "%s: file too short", fname);
123 		goto fail;
124 	}
125 	if(g.version != 0) {
126 		log_msg(LOG_ERR, "%s: unknown file version %d", fname,
127 			(int)g.version);
128 		goto fail;
129 	}
130 	if(g.hsize < UDB_HEADER_SIZE) {
131 		log_msg(LOG_ERR, "%s: header size too small %d", fname,
132 			(int)g.hsize);
133 		goto fail;
134 	}
135 	if(g.hsize > UDB_HEADER_SIZE) {
136 		log_msg(LOG_WARNING, "%s: header size too large %d", fname,
137 			(int)g.hsize);
138 		goto fail;
139 	}
140 	if(g.clean_close != 1) {
141 		log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname,
142 			(int)g.clean_close);
143 		goto fail;
144 	}
145 	if(g.dirty_alloc != 0) {
146 		log_msg(LOG_WARNING, "%s: not cleanly closed (alloc:%d)", fname,
147 			(int)g.dirty_alloc);
148 		goto fail;
149 	}
150 
151 	/* check that the file size was written correctly (guards against the 4.0.2 nsd.db failure) */
152 	fsz = (uint64_t)lseek(fd, (off_t)0, SEEK_END);
153 	(void)lseek(fd, (off_t)0, SEEK_SET);
154 	if(g.fsize != fsz) {
155 		log_msg(LOG_WARNING, "%s: file size %llu but mmap header "
156 			"has size %llu", fname, (unsigned long long)fsz,
157 			(unsigned long long)g.fsize);
158 		goto fail;
159 	}
160 
161 	/* mmap it */
162 	if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) {
163 		log_msg(LOG_ERR, "%s: file too short", fname);
164 		goto fail;
165 	}
166 	if(g.fsize > (uint64_t)400*1024*1024*1024*1024) /* 400 Tb */ {
167 		log_msg(LOG_WARNING, "%s: file size too large %llu",
168 			fname, (unsigned long long)g.fsize);
169 		goto fail;
170 	}
171 	udb->base_size = (size_t)g.fsize;
172 #ifdef HAVE_MMAP
173 	/* note the size_t casts must be there for portability, on some
174 	 * systems the layout of memory is otherwise broken. */
175 	udb->base = mmap(NULL, (size_t)udb->base_size,
176 		(int)PROT_READ|PROT_WRITE, (int)MAP_SHARED,
177 		(int)udb->fd, (off_t)0);
178 #else
179 	udb->base = MAP_FAILED; errno = ENOSYS;
180 #endif
181 	if(udb->base == MAP_FAILED) {
182 		udb->base = NULL;
183 		log_msg(LOG_ERR, "mmap(size %u) error: %s",
184 			(unsigned)udb->base_size, strerror(errno));
185 	fail:
186 		close(fd);
187 		free(udb->fname);
188 		free(udb->ram_hash);
189 		free(udb);
190 		return NULL;
191 	}
192 
193 	/* init completion */
194 	udb->glob_data = (udb_glob_d*)(udb->base+sizeof(uint64_t));
195 	r = 0;
196 	/* cannot be dirty here, the dirty-file case went to fail above; check the mmapped value anyway and compact below if it is */
197 	if(udb->glob_data->dirty_alloc != udb_dirty_clean)
198 		r = 1;
199 	udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)(
200 		(void*)udb->glob_data+sizeof(*udb->glob_data)));
201 	if(!udb->alloc) {
202 		log_msg(LOG_ERR, "out of memory");
203 		udb_base_free(udb);
204 		return NULL;
205 	}
206 	if(r) {
207 		/* and compact now, or resume compacting */
208 		udb_alloc_compact(udb, udb->alloc);
209 		udb_base_sync(udb, 1);
210 	}
211 	udb->glob_data->clean_close = 0;
212 
213 	return udb;
214 }
215 
216 udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc,
217 	void* arg)
218 {
219 	int fd = open(fname, O_RDWR);
220 	if(fd == -1) {
221 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
222 		return NULL;
223 	}
224 	return udb_base_create_fd(fname, fd, walkfunc, arg);
225 }
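
/* Typical use of this API, as a rough sketch: open or create the base with
 * udb_base_create_read() or udb_base_create_new(), allocate chunks with
 * udb_alloc_space(), anchor the root object with udb_base_set_userdata(),
 * then udb_base_sync() and udb_base_free() when done. */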
226 
227 /** init new udb_global structure */
228 static void udb_glob_init_new(udb_glob_d* g)
229 {
230 	memset(g, 0, sizeof(*g));
231 	g->hsize = UDB_HEADER_SIZE;
232 	g->fsize = UDB_HEADER_SIZE;
233 }
234 
235 /** write data to file and check result */
236 static int
237 write_fdata(const char* fname, int fd, void* data, size_t len)
238 {
239 	ssize_t w;
240 	if((w=write(fd, data, len)) == -1) {
241 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
242 		close(fd);
243 		return 0;
244 	} else if(w != (ssize_t)len) {
245 		log_msg(LOG_ERR, "%s: short write (disk full?)", fname);
246 		close(fd);
247 		return 0;
248 	}
249 	return 1;
250 }
251 
252 udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc,
253 	void* arg)
254 {
255 	uint64_t m;
256 	udb_glob_d g;
257 	udb_alloc_d a;
258 	uint64_t endsize = UDB_HEADER_SIZE;
259 	uint64_t endexp = 0;
260 	int fd = open(fname, O_CREAT|O_RDWR, 0600);
261 	if(fd == -1) {
262 		log_msg(LOG_ERR, "%s: %s", fname, strerror(errno));
263 		return NULL;
264 	}
265 	m = UDB_MAGIC;
266 	udb_glob_init_new(&g);
267 	udb_alloc_init_new(&a);
268 	g.clean_close = 1;
269 
270 	/* write new data to file (closes fd on error) */
271 	if(!write_fdata(fname, fd, &m, sizeof(m)))
272 		return NULL;
273 	if(!write_fdata(fname, fd, &g, sizeof(g)))
274 		return NULL;
275 	if(!write_fdata(fname, fd, &a, sizeof(a)))
276 		return NULL;
277 	if(!write_fdata(fname, fd, &endsize, sizeof(endsize)))
278 		return NULL;
279 	if(!write_fdata(fname, fd, &endexp, sizeof(endexp)))
280 		return NULL;
281 	/* rewind to start */
282 	if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) {
283 		log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno));
284 		close(fd);
285 		return NULL;
286 	}
287 	/* truncate to the right size */
288 	if(ftruncate(fd, (off_t)g.fsize) < 0) {
289 		log_msg(LOG_ERR, "%s: ftruncate(%d): %s", fname,
290 			(int)g.fsize, strerror(errno));
291 		close(fd);
292 		return NULL;
293 	}
294 	return udb_base_create_fd(fname, fd, walkfunc, arg);
295 }
296 
297 /** shrink the udb base if it has unused space at the end */
298 static void
299 udb_base_shrink(udb_base* udb, uint64_t nsize)
300 {
301 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
302 	udb->glob_data->fsize = nsize;
303 	/* sync, does not *seem* to be required on Linux, but it is
304 	   certainly required on OpenBSD.  Otherwise changed data is lost. */
305 #ifdef HAVE_MMAP
306 	msync(udb->base, udb->base_size, MS_ASYNC);
307 #endif
308 	if(ftruncate(udb->fd, (off_t)nsize) != 0) {
309 		log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname,
310 			(unsigned)nsize, strerror(errno));
311 	}
312 	udb->glob_data->dirty_alloc = udb_dirty_clean;
313 }
314 
315 void udb_base_close(udb_base* udb)
316 {
317 	if(!udb)
318 		return;
319 	if(udb->fd != -1 && udb->base && udb->alloc) {
320 		uint64_t nsize = udb->alloc->disk->nextgrow;
321 		if(nsize < udb->base_size)
322 			udb_base_shrink(udb, nsize);
323 	}
324 	if(udb->fd != -1) {
325 		udb->glob_data->clean_close = 1;
326 		close(udb->fd);
327 		udb->fd = -1;
328 	}
329 	if(udb->base) {
330 #ifdef HAVE_MMAP
331 		if(munmap(udb->base, udb->base_size) == -1) {
332 			log_msg(LOG_ERR, "munmap: %s", strerror(errno));
333 		}
334 #endif
335 		udb->base = NULL;
336 	}
337 }
338 
339 void udb_base_free(udb_base* udb)
340 {
341 	if(!udb)
342 		return;
343 	udb_base_close(udb);
344 	udb_alloc_delete(udb->alloc);
345 	free(udb->ram_hash);
346 	free(udb->fname);
347 	free(udb);
348 }
349 
350 void udb_base_free_keep_mmap(udb_base* udb)
351 {
352 	if(!udb) return;
353 	if(udb->fd != -1) {
354 		close(udb->fd);
355 		udb->fd = -1;
356 	}
357 	udb->base = NULL;
358 	udb_alloc_delete(udb->alloc);
359 	free(udb->ram_hash);
360 	free(udb->fname);
361 	free(udb);
362 }
363 
364 void udb_base_sync(udb_base* udb, int wait)
365 {
366 	if(!udb) return;
367 #ifdef HAVE_MMAP
368 	if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) {
369 		log_msg(LOG_ERR, "msync(%s) error %s",
370 			udb->fname, strerror(errno));
371 	}
372 #else
373 	(void)wait;
374 #endif
375 }
376 
377 /** hash a chunk pointer */
378 static uint32_t
379 chunk_hash_ptr(udb_void p)
380 {
381 	/* put p into an array of uint32 */
382 	uint32_t h[sizeof(p)/sizeof(uint32_t)];
383 	memcpy(&h, &p, sizeof(h));
384 	return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763);
385 }
386 
387 /** check that the given pointer is on the bucket for the given offset */
388 int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to)
389 {
390 	uint32_t i = chunk_hash_ptr(to) & udb->ram_mask;
391 	udb_ptr* p;
392 	assert((size_t)i < udb->ram_size);
393 	for(p = udb->ram_hash[i]; p; p=p->next) {
394 		if(p == ptr)
395 			return 1;
396 	}
397 	return 0;
398 }
399 
400 /** grow the ram array */
401 static void
402 grow_ram_hash(udb_base* udb, udb_ptr** newhash)
403 {
404 	size_t i;
405 	size_t osize= udb->ram_size;
406 	udb_ptr* p, *np;
407 	udb_ptr** oldhash = udb->ram_hash;
408 	udb->ram_size *= 2;
409 	udb->ram_mask <<= 1;
410 	udb->ram_mask |= 1;
411 	udb->ram_hash = newhash;
412 	/* relink every element from the old hash table into the new one */
413 	for(i=0; i<osize; i++) {
414 		p = oldhash[i];
415 		while(p) {
416 			np = p->next;
417 			/* link into newhash */
418 			p->prev=NULL;
419 			p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask];
420 			if(p->next) p->next->prev = p;
421 			/* go to next element of oldhash */
422 			p = np;
423 		}
424 	}
425 	free(oldhash);
426 }
427 
428 void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr)
429 {
430 	uint32_t i;
431 #ifdef UDB_CHECK
432 	assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/
433 #endif
434 	udb->ram_num++;
435 	if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0x7fffffff) {
436 		/* grow the array, if allocation succeeds */
437 		udb_ptr** newram = (udb_ptr**)xalloc_array_zero(
438 			sizeof(udb_ptr*), udb->ram_size*2);
439 		if(newram) {
440 			grow_ram_hash(udb, newram);
441 		}
442 	}
443 	i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
444 	assert((size_t)i < udb->ram_size);
445 
446 	ptr->prev = NULL;
447 	ptr->next = udb->ram_hash[i];
448 	udb->ram_hash[i] = ptr;
449 	if(ptr->next)
450 		ptr->next->prev = ptr;
451 }
452 
453 void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr)
454 {
455 	assert(ptr->data);
456 #ifdef UDB_CHECK
457 	assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */
458 	assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data));
459 #endif
460 	udb->ram_num--;
461 	if(ptr->next)
462 		ptr->next->prev = ptr->prev;
463 	if(ptr->prev)
464 		ptr->prev->next = ptr->next;
465 	else	{
466 		uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask;
467 		assert((size_t)i < udb->ram_size);
468 		udb->ram_hash[i] = ptr->next;
469 	}
470 }
471 
472 /** change a set of ram ptrs to a new value */
473 static void
474 udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd)
475 {
476 	uint32_t io = chunk_hash_ptr(old) & udb->ram_mask;
477 	udb_ptr* p, *np;
478 	/* edit them and move them into the new position */
479 	p = udb->ram_hash[io];
480 	while(p) {
481 		np = p->next;
482 		if(p->data == old) {
483 			udb_base_unlink_ptr(udb, p);
484 			p->data = newd;
485 			udb_base_link_ptr(udb, p);
486 		}
487 		p = np;
488 	}
489 }
490 
491 udb_rel_ptr* udb_base_get_userdata(udb_base* udb)
492 {
493 	return &udb->glob_data->user_global;
494 }
495 
496 void udb_base_set_userdata(udb_base* udb, udb_void user)
497 {
498 #ifdef UDB_CHECK
499 	if(user) { assert(udb_valid_dataptr(udb, user)); }
500 #endif
501 	udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user);
502 }
503 
504 void udb_base_set_userflags(udb_base* udb, uint8_t v)
505 {
506 	udb->glob_data->userflags = v;
507 }
508 
509 uint8_t udb_base_get_userflags(udb_base* udb)
510 {
511 	return udb->glob_data->userflags;
512 }
513 
514 /** re-mmap the udb to specified size */
515 static void*
516 udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize)
517 {
518 #ifdef HAVE_MMAP
519 	void* nb;
520 	/* when testing under valgrind, do not use mremap but the munmap+mmap fallback below */
521 #ifdef MREMAP_MAYMOVE
522 	nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE);
523 	if(nb == MAP_FAILED) {
524 		log_msg(LOG_ERR, "mremap(%s, size %u) error %s",
525 			udb->fname, (unsigned)nsize, strerror(errno));
526 		return 0;
527 	}
528 #else /* !MREMAP_MAYMOVE */
529 	/* use munmap-mmap to simulate mremap */
530 	if(munmap(udb->base, udb->base_size) != 0) {
531 		log_msg(LOG_ERR, "munmap(%s) error %s",
532 			udb->fname, strerror(errno));
533 	}
534 	/* provide hint for new location */
535 	/* note the size_t casts must be there for portability, on some
536 	 * systems the layout of memory is otherwise broken. */
537 	nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
538 		(int)MAP_SHARED, (int)udb->fd, (off_t)0);
539 	/* retry the mmap without basept in case of ENOMEM (FreeBSD8),
540 	 * the kernel can then try to mmap it at a different location
541 	 * where more memory is available */
542 	if(nb == MAP_FAILED && errno == ENOMEM) {
543 		nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE,
544 			(int)MAP_SHARED, (int)udb->fd, (off_t)0);
545 	}
546 	if(nb == MAP_FAILED) {
547 		log_msg(LOG_ERR, "mmap(%s, size %u) error %s",
548 			udb->fname, (unsigned)nsize, strerror(errno));
549 		udb->base = NULL;
550 		return 0;
551 	}
552 #endif /* MREMAP_MAYMOVE */
553 	if(nb != udb->base) {
554 		/* fix up realpointers in udb and alloc */
555 		/* but mremap may have been nice and not move the base */
556 		udb->base = nb;
557 		udb->glob_data = (udb_glob_d*)(nb+sizeof(uint64_t));
558 		/* use passed alloc pointer because the udb->alloc may not
559 		 * be initialized yet */
560 		alloc->disk = (udb_alloc_d*)((void*)udb->glob_data
561 			+sizeof(*udb->glob_data));
562 	}
563 	udb->base_size = nsize;
564 	return nb;
565 #else /* HAVE_MMAP */
566 	(void)udb; (void)alloc; (void)nsize;
567 	return NULL;
568 #endif /* HAVE_MMAP */
569 }
570 
571 void
572 udb_base_remap_process(udb_base* udb)
573 {
574 	/* assume that fsize is still accessible */
575 	udb_base_remap(udb, udb->alloc, udb->glob_data->fsize);
576 }
577 
578 /** grow file to specified size and re-mmap, return new base */
579 static void*
580 udb_base_grow_and_remap(udb_base* udb, uint64_t nsize)
581 {
582 	/* grow file by writing a single zero at that spot, the
583 	 * rest is filled in with zeroes. */
584 	uint8_t z = 0;
585 	ssize_t w;
586 
587 	assert(nsize > 0);
588 	udb->glob_data->dirty_alloc = udb_dirty_fsize;
589 #ifdef HAVE_PWRITE
590 	if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) {
591 #else
592 	if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) {
593 		log_msg(LOG_ERR, "lseek %s: %s", udb->fname, strerror(errno));
594 		return 0;
595 	}
596 	if((w=write(udb->fd, &z, sizeof(z))) == -1) {
597 #endif
598 		log_msg(LOG_ERR, "grow(%s, size %u) error %s",
599 			udb->fname, (unsigned)nsize, strerror(errno));
600 		return 0;
601 	} else if(w != (ssize_t)sizeof(z)) {
602 		log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)",
603 			udb->fname, (unsigned)nsize);
604 		return 0;
605 	}
606 	udb->glob_data->fsize = nsize;
607 	udb->glob_data->dirty_alloc = udb_dirty_clean;
608 	return udb_base_remap(udb, udb->alloc, nsize);
609 }
610 
611 int udb_exp_size(uint64_t a)
612 {
613 	/* find enclosing value such that 2**x >= a */
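	/* e.g. udb_exp_size(1) = 0, udb_exp_size(5) = 3 (2**3 = 8 >= 5), udb_exp_size(8) = 3 */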
614 	int x = 0;
615 	uint64_t i = a;
616 	assert(a != 0);
617 
618 	i --;
619 	/* could optimise this with uint8* access, depends on endianness */
620 	/* first whole bytes */
621 	while( (i&(~(uint64_t)0xff)) ) {
622 		i >>= 8;
623 		x += 8;
624 	}
625 	/* now details */
626 	while(i) {
627 		i >>= 1;
628 		x ++;
629 	}
630 	assert( ((uint64_t)1<<x) >= a);
631 	assert( x==0 || ((uint64_t)1<<(x-1)) < a);
632 	return x;
633 }
634 
635 int udb_exp_offset(uint64_t o)
636 {
637 	/* this means measuring the number of 0 bits on the right */
638 	/* so, if exp zero bits then (o&(2**x-1))==0 */
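	/* e.g. udb_exp_offset(24) = 3 (24 = 0b11000), udb_exp_offset(0x1000) = 12 */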
639 	int x = 0;
640 	uint64_t i = o;
641 	assert(o != 0);
642 	/* first whole bytes */
643 	while( (i&(uint64_t)0xff) == 0) {
644 		i >>= 8;
645 		x += 8;
646 	}
647 	/* now details */
648 	while( (i&(uint64_t)0x1) == 0) {
649 		i >>= 1;
650 		x ++;
651 	}
652 	assert( o % ((uint64_t)1<<x) == 0);
653 	assert( o % ((uint64_t)1<<(x+1)) != 0);
654 	return x;
655 }
656 
657 void udb_alloc_init_new(udb_alloc_d* a)
658 {
659 	assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0);
660 	memset(a, 0, sizeof(*a));
661 	/* set new allocations after header, as if allocated in a sequence
662 	 * of minsize allocations */
663 	a->nextgrow = UDB_HEADER_SIZE;
664 }
665 
666 /** fsck the file size, false if failed and file is useless */
667 static int
668 fsck_fsize(udb_base* udb, udb_alloc* alloc)
669 {
670 	off_t realsize;
671 	log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname);
672 	realsize = lseek(udb->fd, (off_t)0, SEEK_END);
673 	if(realsize == (off_t)-1) {
674 		log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno));
675 		return 0;
676 	}
677 	udb->glob_data->fsize = (uint64_t)realsize;
678 	if(!udb_base_remap(udb, alloc, (uint64_t)realsize))
679 		return 0;
680 	udb->glob_data->dirty_alloc = udb_dirty_clean;
681 	log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname);
682 	udb_base_sync(udb, 1);
683 	return 1;
684 }
685 
686 /** regenerate freelist: add a new free chunk, return next todo */
687 static udb_void
688 regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen)
689 {
690 	udb_free_chunk_d* cp = UDB_FREE_CHUNK(c);
691 	uint64_t esz = (uint64_t)1<<exp;
692 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
693 		return 0;
694 	}
695 	cp->type = udb_chunk_type_free;
696 	cp->flags = 0;
697 	chunk_set_last(base, c, exp, (uint8_t)exp);
698 	cp->prev = 0;
699 	cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP];
700 	if(cp->next)
701 		UDB_FREE_CHUNK(cp->next)->prev = c;
702 	regen->stat_free += esz;
703 	return c + esz;
704 }
705 
706 /** regenerate xl chunk, return next todo */
707 static udb_void
708 regen_xl(void* base, udb_void c, udb_alloc_d* regen)
709 {
710 	udb_xl_chunk_d* cp = UDB_XL_CHUNK(c);
711 	uint64_t xlsz = cp->size;
712 	if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
713 		return 0;
714 	}
715 	if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) {
716 		return 0;
717 	}
718 	/* fixup end-size and end-expmarker */
719 	regen->stat_alloc += xlsz;
720 	return c + xlsz;
721 }
722 
723 /** regenerate data chunk, return next todo */
724 static udb_void
725 regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen)
726 {
727 	uint64_t esz = (uint64_t)1<<exp;
728 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) {
729 		return 0;
730 	}
731 	chunk_set_last(base, c, exp, (uint8_t)exp);
732 	regen->stat_alloc += esz;
733 	return c + esz;
734 }
735 
736 /** regenerate a relptr structure inside a data segment */
737 static void
738 regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg)
739 {
740 	udb_void* a = (udb_void*)arg;
741 	/* ignore 0 pointers */
742 	if(!rp->data)
743 		return;
744 
745 	/* edit relptrs that point to oldmoved to point to newmoved. */
746 	if(rp->data == a[0])
747 		rp->data = a[1];
748 
749 	/* regenerate relptr lists, add this item to the relptr list for
750 	 * the data that it points to */
751 	udb_rel_ptr_link(base, rp, rp->data);
752 }
753 
754 /** regenerate the relptrs store in this data segment */
755 static void
756 regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp,
757 	void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new)
758 {
759 	udb_void arg[2];
760 	arg[0] = rb_old; arg[1] = rb_new;
761 	/* walk through the structs here and put them on their respective
762 	 * relptr lists */
763 	(*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz,
764 		&regen_relptr_func, arg);
765 
766 }
767 
768 /** regenerate relptrlists in the file */
769 static void
770 regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc,
771 	udb_void rb_old, udb_void rb_new)
772 {
773 	udb_void at = alloc->udb->glob_data->hsize;
774 	/* clear all ptrlist start pointers in the file. */
775 	while(at < alloc->disk->nextgrow) {
776 		int exp = (int)UDB_CHUNK(at)->exp;
777 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
778 		if(exp == UDB_EXP_XL) {
779 			UDB_XL_CHUNK(at)->ptrlist = 0;
780 			at += UDB_XL_CHUNK(at)->size;
781 		} else if(tp == udb_chunk_type_free) {
782 			at += (uint64_t)1<<exp;
783 		} else { /* data chunk */
784 			UDB_CHUNK(at)->ptrlist = 0;
785 			at += (uint64_t)1<<exp;
786 		}
787 	}
788 	/* walk through all relptr structs and put on the right list. */
789 	at = alloc->udb->glob_data->hsize;
790 	while(at < alloc->disk->nextgrow) {
791 		udb_chunk_d* atp = UDB_CHUNK(at);
792 		int exp = (int)atp->exp;
793 		udb_chunk_type tp = (udb_chunk_type)atp->type;
794 		uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size:
795 			(uint64_t)1<<exp);
796 		if(exp == UDB_EXP_XL) {
797 			assert(at != rb_old); /* should have been freed */
798 			regen_its_ptrs(base, udb, atp,
799 				((void*)atp)+sizeof(udb_xl_chunk_d),
800 				sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2,
801 				rb_old, rb_new);
802 			at += sz;
803 		} else if(tp == udb_chunk_type_free) {
804 			at += sz;
805 		} else { /* data chunk */
806 			assert(at != rb_old); /* should have been freed */
807 			regen_its_ptrs(base, udb, atp,
808 				((void*)atp)+sizeof(udb_chunk_d),
809 				sz-sizeof(udb_chunk_d)-1, rb_old, rb_new);
810 			at += sz;
811 		}
812 	}
813 }
814 
815 
816 /** mark free elements in former XL chunk space; later fixups pick that up */
817 static void
818 rb_mark_free_segs(void* base, udb_void s, uint64_t m)
819 {
820 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
821 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
822 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
823 	while(q >= s) {
824 		UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX;
825 		UDB_CHUNK(q)->type = udb_chunk_type_free;
826 		q -= UDB_ALLOC_CHUNK_SIZE;
827 	}
828 }
829 
830 
831 /** fsck rollback or rollforward XL move results */
832 static int
833 fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new,
834 	uint64_t rb_size, uint64_t rb_seg)
835 {
836 
837 	if(rb_old <= rb_new)
838 		return 0; /* XL move one way */
839 	if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
840 		return 0; /* not aligned */
841 	if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
842 		return 0; /* not aligned */
843 	if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0)
844 		return 0; /* not aligned */
845 	if(rb_new + rb_size <= rb_old) {
846 		/* not overlapping: resume copy */
847 		memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
848 		/* and free up old piece(s) */
849 		rb_mark_free_segs(base, rb_old, rb_size);
850 	} else {
851 		/* overlapping, see what segment we stopped at
852 		 * and continue there. */
853 		move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg);
854 		/* free up old piece(s); from the end of the moved segment,
855 		 * until the end of the old segment */
856 		rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)-
857 			(rb_new+rb_size));
858 	}
859 	/* do not call fix_ptrs, regenptrs does the job */
860 	return 1;
861 }
862 
863 /** fsck rollback or rollforward move results */
864 static int
865 fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size,
866 	udb_void* make_free)
867 {
868 	if( (rb_size&(rb_size-1)) != 0)
869 		return 0; /* not powerof2 */
870 	if( (rb_old&(rb_size-1)) != 0)
871 		return 0; /* not aligned */
872 	if( (rb_new&(rb_size-1)) != 0)
873 		return 0; /* not aligned */
874 	/* resume copy */
875 	memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size);
876 	/* do not call fix_ptrs, regenptrs does the job */
877 	/* make sure rb_old is freed */
878 	*make_free = rb_old;
879 	return 1;
880 }
881 
882 /** fsck the file and salvage, false if failed and file is useless */
883 static int
884 fsck_file(udb_base* udb, udb_alloc* alloc, int moved)
885 {
886 	void* base = udb->base;
887 	udb_alloc_d regen;
888 	udb_void at = udb->glob_data->hsize;
889 	udb_void rb_old = udb->glob_data->rb_old;
890 	udb_void rb_new = udb->glob_data->rb_new;
891 	udb_void rb_seg = udb->glob_data->rb_seg;
892 	udb_void make_free = 0;
893 	uint64_t rb_size = udb->glob_data->rb_size;
894 	log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname);
895 	/* walk through the file, use the exp values to see what can be
896 	 * salvaged */
897 	if(moved && rb_old && rb_new && rb_size) {
898 		if(rb_old+rb_size <= alloc->disk->nextgrow
899 			&& rb_new+rb_size <= alloc->disk->nextgrow) {
900 			/* we can use the move information to fix up the
901 			 * duplicate element (or partially moved element) */
902 			if(rb_size > 1024*1024) {
903 				/* XL chunk */
904 				if(!fsck_rb_xl(base, udb, rb_old, rb_new,
905 					rb_size, rb_seg))
906 					return 0;
907 			} else {
908 				if(!fsck_rb(base, rb_old, rb_new, rb_size,
909 					&make_free))
910 					return 0;
911 			}
912 		}
913 	}
914 
915 	/* rebuild freelists */
916 	/* recalculate stats in alloc (except 'stat_data') */
917 	/* possibly new end 'nextgrow' value */
918 	memset(&regen, 0, sizeof(regen));
919 	regen.nextgrow = alloc->disk->nextgrow;
920 	while(at < regen.nextgrow) {
921 		/* figure out this chunk */
922 		int exp = (int)UDB_CHUNK(at)->exp;
923 		udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type;
924 		/* consistency check possible here with end-exp */
925 		if(tp == udb_chunk_type_free || at == make_free) {
926 			at = regen_free(base, at, exp, &regen);
927 			if(!at) return 0;
928 		} else if(exp == UDB_EXP_XL) {
929 			/* allocated data of XL size */
930 			at = regen_xl(base, at, &regen);
931 			if(!at) return 0;
932 		} else if(exp >= UDB_ALLOC_CHUNK_MINEXP
933 			&& exp <= UDB_ALLOC_CHUNKS_MAX) {
934 			/* allocated data */
935 			at = regen_data(base, at, exp, &regen);
936 			if(!at) return 0;
937 		} else {
938 			/* garbage; this must be EOF then */
939 			regen.nextgrow = at;
940 			break;
941 		}
942 	}
943 	*alloc->disk = regen;
944 
945 	/* rebuild relptr lists */
946 	regen_ptrlist(base, udb, alloc, rb_old, rb_new);
947 
948 	log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)",
949 		udb->fname);
950 	udb->glob_data->rb_old = 0;
951 	udb->glob_data->rb_new = 0;
952 	udb->glob_data->rb_size = 0;
953 	udb->glob_data->dirty_alloc = udb_dirty_clean;
954 	udb_base_sync(udb, 1);
955 	return 1;
956 }
957 
958 
959 udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk)
960 {
961 	udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc));
962 	if(!alloc)
963 		return NULL;
964 	alloc->udb = udb;
965 	alloc->disk = disk;
966 	/* see if committed but uncompleted actions need to be done */
967 	/* preserves the alloc state */
968 	if(udb->glob_data->dirty_alloc != udb_dirty_clean) {
969 		if(udb->glob_data->dirty_alloc == udb_dirty_fsize) {
970 			if(fsck_fsize(udb, alloc))
971 				return alloc;
972 		} else if(udb->glob_data->dirty_alloc == udb_dirty_fl) {
973 			if(fsck_file(udb, alloc, 0))
974 				return alloc;
975 		} else if(udb->glob_data->dirty_alloc == udb_dirty_compact) {
976 			if(fsck_file(udb, alloc, 1))
977 				return alloc;
978 		}
979 		log_msg(LOG_ERR, "error: file allocation dirty (%d)",
980 			(int)udb->glob_data->dirty_alloc);
981 		free(alloc);
982 		return NULL;
983 	}
984 	return alloc;
985 }
986 
987 void udb_alloc_delete(udb_alloc* alloc)
988 {
989 	if(!alloc) return;
990 	free(alloc);
991 }
992 
993 /** unlink this element from its freelist */
994 static void
995 udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp)
996 {
997 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk);
998 	assert(chunk);
999 	/* chunk is a free chunk */
1000 	assert(fp->exp == (uint8_t)exp);
1001 	assert(fp->type == udb_chunk_type_free);
1002 	assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp);
1003 	/* and thus freelist not empty */
1004 	assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]);
1005 	/* unlink */
1006 	if(fp->prev)
1007 		UDB_FREE_CHUNK(fp->prev)->next = fp->next;
1008 	else	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1009 	if(fp->next)
1010 		UDB_FREE_CHUNK(fp->next)->prev = fp->prev;
1011 }
1012 
1013 /** pop first element off freelist, list may not be empty */
1014 static udb_void
1015 udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp)
1016 {
1017 	udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1018 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1019 	assert(f);
1020 	assert(fp->exp == (uint8_t)exp);
1021 	assert(fp->type == udb_chunk_type_free);
1022 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1023 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next;
1024 	if(fp->next) {
1025 		UDB_FREE_CHUNK(fp->next)->prev = 0;
1026 	}
1027 	return f;
1028 }
1029 
1030 /** push new element onto freelist */
1031 static void
1032 udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp)
1033 {
1034 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1035 	assert(f);
1036 	fp->exp = (uint8_t)exp;
1037 	fp->type = udb_chunk_type_free;
1038 	fp->flags = 0;
1039 	fp->prev = 0;
1040 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1041 	if(fp->next)
1042 		UDB_FREE_CHUNK(fp->next)->prev = f;
1043 	chunk_set_last(base, f, exp, (uint8_t)exp);
1044 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1045 }
1046 
1047 /** push new element onto freelist - do not initialize the elt */
1048 static void
1049 udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp)
1050 {
1051 	udb_free_chunk_d* fp = UDB_FREE_CHUNK(f);
1052 	assert(f);
1053 	assert(fp->exp == (uint8_t)exp);
1054 	assert(fp->type == udb_chunk_type_free);
1055 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1056 	fp->prev = 0;
1057 	fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP];
1058 	if(fp->next)
1059 		UDB_FREE_CHUNK(fp->next)->prev = f;
1060 	alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f;
1061 }
1062 
1063 /** add free chunks at end until specified alignment occurs */
1064 static void
1065 grow_align(void* base, udb_alloc* alloc, uint64_t esz)
1066 {
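	/* e.g. with nextgrow=0x3000 and esz=0x10000, the loop below pushes
	 * free chunks of 0x1000, 0x4000 and 0x8000, leaving nextgrow at 0x10000 */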
1067 	while( (alloc->disk->nextgrow & (esz-1)) != 0) {
1068 		/* the nextgrow is not a whole multiple of esz. */
1069 		/* grow a free chunk of max allowed size */
1070 		int fexp = udb_exp_offset(alloc->disk->nextgrow);
1071 		uint64_t fsz = (uint64_t)1<<fexp;
1072 		udb_void f = alloc->disk->nextgrow;
1073 		udb_void fn = alloc->disk->nextgrow+fsz;
1074 		assert(fn <= alloc->udb->base_size);
1075 		alloc->disk->stat_free += fsz;
1076 		udb_alloc_push_fl(base, alloc, f, fexp);
1077 		/* now increase nextgrow to commit that free chunk */
1078 		alloc->disk->nextgrow = fn;
1079 	}
1080 }
1081 
1082 /** append chunks at end of memory space to get size exp, return dataptr */
1083 static udb_void
1084 grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp)
1085 {
1086 	uint64_t esz = (uint64_t)1<<exp;
1087 	udb_void ret;
1088 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1089 	grow_align(base, alloc, esz);
1090 	/* free chunks are grown, grow the one we want to use */
1091 	ret = alloc->disk->nextgrow;
1092 	/* take a new alloced chunk into use */
1093 	UDB_CHUNK(ret)->exp = (uint8_t)exp;
1094 	UDB_CHUNK(ret)->flags = 0;
1095 	UDB_CHUNK(ret)->ptrlist = 0;
1096 	UDB_CHUNK(ret)->type = udb_chunk_type_data;
1097 	/* store last octet */
1098 	chunk_set_last(base, ret, exp, (uint8_t)exp);
1099 	/* update stats */
1100 	alloc->disk->stat_alloc += esz;
1101 	alloc->disk->stat_data += sz;
1102 	/* now increase nextgrow to commit this newly allocated chunk */
1103 	alloc->disk->nextgrow += esz;
1104 	assert(alloc->disk->nextgrow <= alloc->udb->base_size);
1105 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1106 	return ret + sizeof(udb_chunk_d); /* ptr to data */
1107 }
1108 
1109 /** calculate how much space is necessary to grow for this exp */
1110 static uint64_t
1111 grow_end_calc(udb_alloc* alloc, int exp)
1112 {
1113 	uint64_t sz = (uint64_t)1<<exp;
1114 	uint64_t ng = alloc->disk->nextgrow;
1115 	uint64_t res;
1116 	/* if nextgrow is already aligned to 2**exp, no extra growth is needed, only the size itself */
1117 	if( (ng & (sz-1)) == 0) {
1118 		/* sz-1 is like 0xfff, and checks if ng is whole 2**exp */
1119 		return ng+sz; /* must grow exactly 2**exp */
1120 	}
1121 	/* grow until aligned to 2**exp and then we need 2**exp as well */
1122 	/* so, round ng down to a whole multiple of sz (ng - ng%sz, or (ng/sz)*sz)
1123 	 * and then add sz twice (once to reach the boundary, once to allocate) */
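	/* e.g. ng=0x1800 and exp=12 (sz=0x1000): res = 0x1000 + 2*0x1000 = 0x3000 */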
1124 	res = (ng & ~(sz-1)) + 2*sz;
1125 	return res;
1126 }
1127 
1128 /** see if we need to grow more than specified to enable sustained growth */
1129 static uint64_t
1130 grow_extra_check(udb_alloc* alloc, uint64_t ge)
1131 {
1132 	const uint64_t mb = 1024*1024;
1133 	uint64_t bsz = alloc->udb->base_size;
1134 	if(bsz <= mb) {
1135 		/* below 1 Mb, double sizes for exponential growth */
1136 		/* takes about 15 times to grow to 1Mb */
1137 		if(ge < bsz*2)
1138 			return bsz*2;
1139 	} else {
1140 		uint64_t gnow = ge - bsz;
1141 		/* above 1Mb, grow at least 1 Mb, or 12.5% of current size,
1142 		 * in whole megabytes rounded up. */
1143 		uint64_t want = ((bsz / 8) & ~(mb-1)) + mb;
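		/* e.g. bsz = 10 Mb: want = (10/8 Mb rounded down to 1 Mb) + 1 Mb = 2 Mb */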
1144 		if(gnow < want)
1145 			return bsz + want;
1146 	}
1147 	return ge;
1148 }
1149 
1150 /** see if free space is enough to warrant a shrink (while the file is open) */
1151 static int
1152 enough_free(udb_alloc* alloc)
1153 {
1154 	if(alloc->udb->base_size <= 2*1024*1024) {
1155 		/* small file (it grew by doubling, so at most 2 Mb here),
1156 		 * only shrink if the used part is at most 1/3 of the file */
1157 		if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size)
1158 			return 1;
1159 	} else {
1160 		/* grown 12.5%, shrink 25% if possible, at least one mb */
1161 		/* between 1mb and 4mb size, it shrinks by 1mb if possible */
1162 		uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow;
1163 		if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size
1164 			|| alloc->udb->base_size < 4*1024*1024))
1165 			return 1;
1166 	}
1167 	return 0;
1168 }
1169 
1170 /** grow space for a chunk of 2**exp and return dataptr */
1171 static udb_void
1172 udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp)
1173 {
1174 	/* commit the grow action
1175 	 * - the file grow only changes filesize, but not the nextgrow.
1176 	 * - taking space after nextgrow into use (as free space),
1177 	 *   is like free-ing a chunk (one at a time).
1178 	 * - and the last chunk taken into use is like alloc.
1179 	 */
1180 	/* predict how much free space is needed for this */
1181 	uint64_t grow_end = grow_end_calc(alloc, exp);
1182 	assert(alloc->udb->base_size >= alloc->disk->nextgrow);
1183 	if(grow_end <= alloc->udb->base_size) {
1184 		/* we can do this with the available space */
1185 		return grow_chunks(base, alloc, sz, exp);
1186 	}
1187 	/* we have to grow the file, re-mmap */
1188 	/* see if we need to grow a little more, to avoid endless grow
1189 	 * efforts on adding data */
1190 	grow_end = grow_extra_check(alloc, grow_end);
1191 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1192 		return 0; /* mmap or write failed (disk or mem full) */
1193 	}
1194 	/* we have enough space now */
1195 	assert(grow_end <= alloc->udb->base_size);
1196 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1197 	return grow_chunks(base, alloc, sz, exp);
1198 }
1199 
1200 /** take XL allocation into use at end of file, return dataptr */
1201 static udb_void
1202 grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz)
1203 {
1204 	udb_void ret;
1205 	udb_xl_chunk_d* p;
1206 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1207 
1208 	/* align growth to whole mbs */
1209 	grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE);
1210 
1211 	/* grow XL segment */
1212 	ret = alloc->disk->nextgrow;
1213 	p = UDB_XL_CHUNK(ret);
1214 	p->exp = UDB_EXP_XL;
1215 	p->size = xlsz;
1216 	p->flags = 0;
1217 	p->ptrlist = 0;
1218 	p->type = udb_chunk_type_data;
1219 
1220 	/* also put size and marker at end for compaction */
1221 	*((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz;
1222 	*((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL;
1223 
1224 	/* stats */
1225 	alloc->disk->stat_data += sz;
1226 	alloc->disk->stat_alloc += xlsz;
1227 	/* now increase the nextgrow to commit this xl chunk */
1228 	alloc->disk->nextgrow += xlsz;
1229 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1230 	return ret + sizeof(udb_xl_chunk_d); /* data ptr */
1231 }
1232 
1233 /** make space for XL allocation */
1234 static udb_void
1235 udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz)
1236 {
1237 	/* allocate whole mbs of space, at end of space */
1238 	uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2;
1239 	uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1));
1240 	uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need;
1241 	assert(need >= asz);
1242 	if(grow_end <= alloc->udb->base_size) {
1243 		/* can do this in available space */
1244 		return grow_xl(base, alloc, need, sz);
1245 	}
1246 	/* have to grow file and re-mmap */
1247 	grow_end = grow_extra_check(alloc, grow_end);
1248 	if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) {
1249 		return 0; /* mmap or write failed (disk or mem full) */
1250 	}
1251 	/* we have enough space now */
1252 	assert(grow_end <= alloc->udb->base_size);
1253 	assert(alloc->udb->glob_data->fsize == alloc->udb->base_size);
1254 	return grow_xl(base, alloc, need, sz);
1255 }
1256 
1257 /** divide big(2**e2) into pieces so 2**exp fits */
1258 static udb_void
1259 udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2,
1260 	int exp)
1261 {
1262 	int e = e2;
1263 	uint64_t sz = (uint64_t)1<<e2;
1264 	assert(big && e2 > exp);
1265 	/* the returned piece to use is the first piece; keep splitting off
1266 	 * the upper half onto the freelist until the first piece fits */
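	/* e.g. a 2**16 block subdivided for 2**13: halves of 2**15, 2**14 and
	 * 2**13 go onto their freelists and the first 2**13 piece is returned */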
1267 	do {
1268 		sz >>= 1; /* divide size of big by two */
1269 		e--;      /* that means its exp is one smaller */
1270 		udb_alloc_push_fl(base, alloc, big+sz, e);
1271 	} while(e != exp);
1272 	/* exit loop when last pushed is same size as what we want */
1273 	return big;
1274 }
1275 
1276 /** returns the exponent size of the chunk needed for data sz */
1277 static int
1278 udb_alloc_exp_needed(size_t sz)
1279 {
1280 	uint64_t asz = sz + sizeof(udb_chunk_d) + 1;
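	/* the +1 reserves the last octet of the chunk for the exp marker
	 * written by chunk_set_last() */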
1281 	if(asz > UDB_ALLOC_CHUNK_SIZE) {
1282 		return UDB_EXP_XL;
1283 	} else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) {
1284 		return UDB_ALLOC_CHUNK_MINEXP;
1285 	}
1286 	return udb_exp_size(asz);
1287 }
1288 
1289 udb_void udb_alloc_space(udb_alloc* alloc, size_t sz)
1290 {
1291 	void* base = alloc->udb->base;
1292 	/* calculate actual allocation size */
1293 	int e2, exp = udb_alloc_exp_needed(sz);
1294 	if(exp == UDB_EXP_XL)
1295 		return udb_alloc_xl_space(base, alloc, sz);
1296 	/* see if there is a free chunk of that size exactly */
1297 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) {
1298 		/* snip from freelist, udb_chunk_d */
1299 		udb_void ret;
1300 		alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1301 		ret = udb_alloc_pop_fl(base, alloc, exp);
1302 		/* use it - size octets already OK */
1303 		UDB_CHUNK(ret)->flags = 0;
1304 		UDB_CHUNK(ret)->ptrlist = 0;
1305 		UDB_CHUNK(ret)->type = udb_chunk_type_data;
1306 		/* update stats */
1307 		alloc->disk->stat_data += sz;
1308 		alloc->disk->stat_alloc += (1<<exp);
1309 		assert(alloc->disk->stat_free >= (1u<<exp));
1310 		alloc->disk->stat_free -= (1<<exp);
1311 		alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1312 		return ret + sizeof(udb_chunk_d); /* ptr to data */
1313 	}
1314 	/* see if we can subdivide a larger chunk */
1315 	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1316 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1317 			udb_void big, ret; /* udb_chunk_d */
1318 			alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1319 			big = udb_alloc_pop_fl(base, alloc, e2);
1320 			/* push other parts onto freelists (needs inited) */
1321 			ret = udb_alloc_subdivide(base, alloc, big, e2, exp);
1322 			/* use final part (needs inited) */
1323 			UDB_CHUNK(ret)->exp = (uint8_t)exp;
1324 			/* if we stop here, the new exp just makes this look like a smaller free chunk */
1325 			UDB_CHUNK(ret)->flags = 0;
1326 			UDB_CHUNK(ret)->ptrlist = 0;
1327 			/* set type to commit data chunk */
1328 			UDB_CHUNK(ret)->type = udb_chunk_type_data;
1329 			/* store last octet */
1330 			chunk_set_last(base, ret, exp, (uint8_t)exp);
1331 			/* update stats */
1332 			alloc->disk->stat_data += sz;
1333 			alloc->disk->stat_alloc += (1<<exp);
1334 			assert(alloc->disk->stat_free >= (1u<<exp));
1335 			alloc->disk->stat_free -= (1<<exp);
1336 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1337 			return ret + sizeof(udb_chunk_d); /* ptr to data */
1338 		}
1339 	/* we need to grow an extra chunk */
1340 	return udb_alloc_grow_space(base, alloc, sz, exp);
1341 }
1342 
1343 /** see if there is free space to allocate a chunk into */
1344 static int
1345 have_free_for(udb_alloc* alloc, int exp)
1346 {
1347 	int e2;
1348 	if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP])
1349 		return exp;
1350 	for(e2 = exp+1; e2 <= UDB_ALLOC_CHUNKS_MAX; e2++)
1351 		if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) {
1352 			return e2;
1353 		}
1354 	return 0;
1355 }
1356 
1357 /** fix relptr prev and next for moved relptr structures */
1358 static void
1359 chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg)
1360 {
1361 	udb_void* data = (udb_void*)arg;
1362 	udb_void r;
1363 	if(!rp->data)
1364 		return;
1365 	r = UDB_SYSTOREL(base, rp);
1366 	if(rp->next)
1367 		UDB_REL_PTR(rp->next)->prev = r;
1368 	if(rp->prev)
1369 		UDB_REL_PTR(rp->prev)->next = r;
1370 	else	{
1371 		/* if this is a pointer to its own chunk, fix it up;
1372 		 * the data ptr gets set by relptr_edit later. */
1373 		if(rp->data == data[0])
1374 			UDB_CHUNK(data[1])->ptrlist = r;
1375 		else	UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r;
1376 	}
1377 }
1378 
1379 /** fix pointers from and to a moved chunk */
1380 static void
1381 chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data,
1382 	uint64_t dsz, udb_void olddata)
1383 {
1384 	udb_void d[2];
1385 	d[0] = olddata;
1386 	d[1] = data;
1387 	(*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data),
1388 		dsz, &chunk_fix_ptr_each, d);
1389 	udb_rel_ptr_edit(base, cp->ptrlist, data);
1390 	udb_base_ram_ptr_edit(udb, olddata, data);
1391 }
1392 
1393 /** move an allocated chunk to use a free chunk */
1394 static void
1395 move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz,
1396 	int e2)
1397 {
1398 	udb_void res = udb_alloc_pop_fl(base, alloc, e2);
1399 	udb_chunk_d* rp;
1400 	udb_chunk_d* fp;
1401 	if(exp != e2) {
1402 		/* it is bigger, subdivide it */
1403 		res = udb_alloc_subdivide(base, alloc, res, e2, exp);
1404 	}
1405 	assert(res != f);
1406 	/* setup rollback information */
1407 	alloc->udb->glob_data->rb_old = f;
1408 	alloc->udb->glob_data->rb_new = res;
1409 	alloc->udb->glob_data->rb_size = esz;
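	/* if we get interrupted after this point, fsck_file() can use
	 * rb_old/rb_new/rb_size to redo or roll back the move (see fsck_rb) */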
1410 	/* take the res, exp into use */
1411 	rp = UDB_CHUNK(res);
1412 	fp = UDB_CHUNK(f);
1413 	/* copy over the data */
1414 	memcpy(rp, fp, esz);
1415 	/* adjust rel ptrs */
1416 	chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d),
1417 		esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d));
1418 
1419 	/* do not free up the fp; the caller does that */
1420 }
1421 
1422 /** unlink several free elements to overwrite with xl chunk */
1423 static void
1424 free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m)
1425 {
1426 	udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE;
1427 	/* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/
1428 	assert(s >= UDB_ALLOC_CHUNK_SIZE);
1429 	while(q >= s) {
1430 		assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX);
1431 		assert(UDB_CHUNK(q)->type == udb_chunk_type_free);
1432 		udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX);
1433 		q -= UDB_ALLOC_CHUNK_SIZE;
1434 	}
1435 }
1436 
1437 /** move an XL chunk, and keep track of segments for rollback */
1438 static void
1439 move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n,
1440 	uint64_t sz, uint64_t startseg)
1441 {
1442 	udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1443 	udb_xl_chunk_d* np = UDB_XL_CHUNK(n);
1444 	uint64_t amount = xl - n;
1445 	assert(n < xl); /* move to compact */
1446 
1447 	/* setup move rollback */
1448 	udb->glob_data->rb_old = xl;
1449 	udb->glob_data->rb_new = n;
1450 	udb->glob_data->rb_size = sz;
1451 
1452 	/* is it overlapping? */
1453 	if(sz <= amount) {
1454 		memcpy(np, xlp, sz);
1455 	} else {
1456 		/* move and commit per 1M segment to avoid data loss */
1457 		uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE;
1458 		for(seg = startseg; seg<maxseg; seg++) {
1459 			udb->glob_data->rb_seg = seg;
1460 			memcpy((uint8_t*)np+seg*UDB_ALLOC_CHUNK_SIZE,
1461 				(uint8_t*)xlp+seg*UDB_ALLOC_CHUNK_SIZE,
1462 				UDB_ALLOC_CHUNK_SIZE);
1463 		}
1464 
1465 	}
1466 }
1467 
1468 /** move list of XL chunks to the front by the shift amount */
1469 static void
1470 move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz,
1471 	uint64_t amount)
1472 {
1473 	udb_void xl = xl_start;
1474 	assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1475 	assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1476 	assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */
1477 	while(xl < xl_start+xl_sz) {
1478 		udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1479 		udb_void n = xl-amount;
1480 		uint64_t sz = xlp->size;
1481 		assert(xlp->exp == UDB_EXP_XL);
1482 		move_xl_segment(base, alloc->udb, xl, n, sz, 0);
1483 		chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n),
1484 			n+sizeof(udb_xl_chunk_d),
1485 			sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2,
1486 			xl+sizeof(udb_xl_chunk_d));
1487 	}
1488 	alloc->disk->stat_free -= amount;
1489 	alloc->disk->nextgrow -= amount;
1490 	alloc->udb->glob_data->rb_old = 0;
1491 	alloc->udb->glob_data->rb_new = 0;
1492 	alloc->udb->glob_data->rb_size = 0;
1493 }
1494 
1495 /** see if free chunk can coagulate with another chunk, return other chunk */
1496 static udb_void
1497 coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp,
1498 	uint64_t esz)
1499 {
1500 	udb_void other = f^esz;
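	/* chunks of size 2**exp are aligned to 2**exp, so the two buddies that
	 * make up a 2**(exp+1) block differ only in bit exp; XOR with esz
	 * therefore yields the neighbouring buddy chunk */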
1501 	if(exp == UDB_ALLOC_CHUNKS_MAX)
1502 		return 0; /* no further merges */
1503 	if(other >= alloc->udb->base_size)
1504 		return 0; /* not allocated */
1505 	if(other >= alloc->disk->nextgrow)
1506 		return 0; /* not in use */
1507 	if(other < alloc->udb->glob_data->hsize)
1508 		return 0; /* cannot merge with header */
1509 		/* the header is also protected by the special exp marker */
1510 	/* see if the other chunk is a free chunk */
1511 
1512 	/* check closest marker to avoid large memory churn */
1513 	/* and also it makes XL allocations and header special markers work */
1514 	if(f > other) {
1515 		assert(f > 1); /* this is certain because of header */
1516 		if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) {
1517 			/* can do it if the other part is a free chunk */
1518 			assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp);
1519 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1520 				return other;
1521 		}
1522 	} else {
1523 		if(UDB_CHUNK(other)->exp == (uint8_t)exp) {
1524 			/* can do it if the other part is a free chunk */
1525 			assert(chunk_get_last(base, other, exp)==(uint8_t)exp);
1526 			if(UDB_CHUNK(other)->type == udb_chunk_type_free)
1527 				return other;
1528 		}
1529 	}
1530 	return 0;
1531 }
1532 
1533 /** coagulate and then add new free segment, return final free segment */
1534 static udb_void
1535 coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp,
1536 	uint64_t esz)
1537 {
1538 	/* new free chunk here, attempt coagulate */
1539 	udb_void other;
1540 	while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) {
1541 		/* unlink that other chunk */
1542 		udb_alloc_unlink_fl(base, alloc, other, exp);
1543 		/* merge up */
1544 		if(other < last)
1545 			last = other;
1546 		exp++;
1547 		esz <<= 1;
1548 	}
1549 	/* free the final segment */
1550 	udb_alloc_push_fl(base, alloc, last, exp);
1551 	return last;
1552 }
1553 
1554 /** attempt to compact the data and move free space to the end */
1555 int
1556 udb_alloc_compact(void* base, udb_alloc* alloc)
1557 {
1558 	udb_void last;
1559 	int exp, e2;
1560 	uint64_t esz;
1561 	uint64_t at = alloc->disk->nextgrow;
1562 	udb_void xl_start = 0;
1563 	uint64_t xl_sz = 0;
1564 	if(alloc->udb->inhibit_compact)
1565 		return 1;
1566 	alloc->udb->useful_compact = 0;
1567 	while(at > alloc->udb->glob_data->hsize) {
1568 		/* grab last entry */
1569 		exp = (int)*((uint8_t*)UDB_REL(base, at-1));
1570 		if(exp == UDB_EXP_XL) {
1571 			/* for XL chunks:
1572 			 * - inspect the size of the XL chunk list at the end
1573 			 * - attempt to compact in front of the XL chunk list
1574 			 */
1575 			uint64_t xlsz = *((uint64_t*)UDB_REL(base,
1576 				at-sizeof(uint64_t)*2));
1577 			udb_void xl = at-xlsz;
1578 #ifndef NDEBUG
1579 			udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl);
1580 			assert(xlp->exp == UDB_EXP_XL);
1581 			assert(xlp->type != udb_chunk_type_free);
1582 #endif
1583 			/* got the segment, add it to the xl chunk list */
1584 			if(xl_start != 0 && xl+xlsz != xl_start) {
1585 				/* nonadjoining XL part, but they are aligned,
1586 				 * so the space in between is whole Mbs,
1587 				 * shift the later part(s) and continue */
1588 				uint64_t m = xl_start - (xl+xlsz);
1589 				assert(xl_start > xl+xlsz);
1590 				alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1591 				free_xl_space(base, alloc, xl+xlsz, m);
1592 				move_xl_list(base, alloc, xl_start, xl_sz, m);
1593 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1594 			}
1595 			xl_start = xl;
1596 			xl_sz += xlsz;
1597 			at = xl;
1598 			continue;
1599 			/* end of XL if */
1600 		} else if(exp < UDB_ALLOC_CHUNK_MINEXP
1601 			|| exp > UDB_ALLOC_CHUNKS_MAX)
1602 			break; /* special chunk or garbage */
1603 		esz = (uint64_t)1<<exp;
1604 		last = at - esz;
1605 		assert(UDB_CHUNK(last)->exp == (uint8_t)exp);
1606 		if(UDB_CHUNK(last)->type == udb_chunk_type_free) {
1607 			/* if xlstart continue looking to move stuff, but do
1608 			 * not unlink this free segment */
1609 			if(!xl_start) {
1610 				/* it is a free chunk, remove it */
1611 				alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1612 				udb_alloc_unlink_fl(base, alloc, last, exp);
1613 				alloc->disk->stat_free -= esz;
1614 				alloc->disk->nextgrow = last;
1615 				alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1616 				/* and continue at this point */
1617 			}
1618 			at = last;
1619 		} else if( (e2=have_free_for(alloc, exp)) ) {
1620 			/* last entry can be allocated in free chunks
1621 			 * move it to its new position, adjust rel_ptrs */
1622 			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1623 			move_chunk(base, alloc, last, exp, esz, e2);
1624 			if(xl_start) {
1625 				last = coagulate_and_push(base, alloc,
1626 					last, exp, esz);
1627 			} else {
1628 				/* shorten usage */
1629 				alloc->disk->stat_free -= esz;
1630 				alloc->disk->nextgrow = last;
1631 			}
1632 			alloc->udb->glob_data->rb_old = 0;
1633 			alloc->udb->glob_data->rb_new = 0;
1634 			alloc->udb->glob_data->rb_size = 0;
1635 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1636 			/* and continue in front of it */
1637 			at = last;
1638 		} else {
1639 			/* cannot compact this block, stop compacting */
1640 			break;
1641 		}
1642 		/* if that worked, repeat it */
1643 	}
1644 	/* if we passed xl chunks, see if XL-chunklist can move */
1645 	if(xl_start) {
1646 		/* calculate free space in front of the XLchunklist. */
1647 		/* has to be whole mbs of free space */
1648 		/* if so, we can move the XL chunks.  Move them all back
1649 		 * by the new free space. */
1650 		/* this compacts very well, but the XL chunks can be moved
1651 		 * multiple times; worst case for every mb freed a huge sized
1652 		 * xlchunklist gets moved. */
1653 		/* free space must be, since aligned and coagulated, in
1654 		 * chunks of a whole MB */
1655 		udb_void at = xl_start;
1656 		uint64_t m = 0;
1657 		while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){
1658 			udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE;
1659 			if(UDB_CHUNK(chunk)->type != udb_chunk_type_free)
1660 				break;
1661 			assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX);
1662 			m += UDB_ALLOC_CHUNK_SIZE;
1663 			at = chunk;
1664 		}
1665 		if(m != 0) {
1666 			assert(at+m == xl_start);
1667 			alloc->udb->glob_data->dirty_alloc = udb_dirty_compact;
1668 			free_xl_space(base, alloc, at, m);
1669 			move_xl_list(base, alloc, xl_start, xl_sz, m);
1670 			alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1671 		}
1672 	}
1673 
1674 	/* if enough free, shrink the file; re-mmap */
1675 	if(enough_free(alloc)) {
1676 		uint64_t nsize = alloc->disk->nextgrow;
1677 		udb_base_shrink(alloc->udb, nsize);
1678 		if(!udb_base_remap(alloc->udb, alloc, nsize))
1679 			return 0;
1680 	}
1681 	return 1;
1682 }
1683 
1684 int
1685 udb_compact(udb_base* udb)
1686 {
1687 	if(!udb) return 1;
1688 	if(!udb->useful_compact) return 1;
1689 	DEBUG(DEBUG_DBACCESS, 1, (LOG_INFO, "Compacting database..."));
1690 	return udb_alloc_compact(udb->base, udb->alloc);
1691 }
1692 
1693 void udb_compact_inhibited(udb_base* udb, int inhibit)
1694 {
1695 	if(!udb) return;
1696 	udb->inhibit_compact = inhibit;
1697 }
1698 
1699 #ifdef UDB_CHECK
1700 /** check that rptrs are really zero before free */
1701 void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg)
1702 {
1703 	(void)base;
1704 	(void)arg;
1705 	assert(p->data == 0);
1706 }
1707 #endif /* UDB_CHECK */
1708 
1709 /** free XL chunk as multiples of CHUNK_SIZE free segments */
1710 static void
1711 udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp,
1712 	size_t sz)
1713 {
1714 	uint64_t xlsz = fp->size;
1715 	uint64_t c;
1716 	/* lightweight check for buffer overflow in xl data */
1717 	assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz);
1718 	assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL);
1719 	assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */
1720 	assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */
1721 #ifdef UDB_CHECK
1722 	/* check that relptrs in this chunk have been zeroed */
1723 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1724 		UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz,
1725 		&udb_check_rptr_zero, NULL);
1726 #endif
1727 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1728 	/* update stats */
1729 	alloc->disk->stat_data -= sz;
1730 	alloc->disk->stat_alloc -= xlsz;
1731 	alloc->disk->stat_free += xlsz;
1732 	/* walk in reverse, so the front blocks go first on the list */
1733 	c = f + xlsz - UDB_ALLOC_CHUNK_SIZE;
1734 	/* because of the header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE */
1735 	assert(f >= UDB_ALLOC_CHUNK_SIZE);
1736 	while(c >= f) {
1737 		/* free a block of CHUNK_SIZE (1 MB) */
1738 		udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX);
1739 		c -= UDB_ALLOC_CHUNK_SIZE;
1740 	}
1741 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1742 }
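/*
 * For example, an XL chunk of 3 MB starting at offset f is returned to
 * the allocator as three UDB_ALLOC_CHUNK_SIZE segments: the loop above
 * pushes f+2MB, then f+1MB, then f onto the UDB_ALLOC_CHUNKS_MAX free
 * list, so the frontmost segment ends up at the head of the list.
 */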
1743 
1744 int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz)
1745 {
1746 	void* base;
1747 	/* lookup chunk ptr */
1748 	udb_void f;
1749 	udb_chunk_d* fp;
1750 	uint64_t esz;
1751 	int exp;
1752 	udb_void other;
1753 	int coagulated = 0;
1754 	if(!r)
1755 		return 1; /* free(NULL) does nothing */
1756 
1757 	/* lookup size of chunk */
1758 	base = alloc->udb->base;
1759 	/* fails for XL blocks */
1760 	f = chunk_from_dataptr(r);
1761 	fp = UDB_CHUNK(f);
1762 	assert(fp->type != udb_chunk_type_free);
1763 
1764 	/* see if it has a ptrlist; if so there is trouble, because the list
1765 	 * was not properly cleaned up (although one can imagine a wholesale
1766 	 * delete where that does not matter) */
1767 	assert(fp->ptrlist == 0);
1768 
1769 	/* set ptrlist to 0 to stop relptr from using it, robustness. */
1770 	fp->ptrlist = 0;
1771 
1772 	if(fp->exp == UDB_EXP_XL) {
1773 		udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz);
1774 		/* compact */
1775 		if(alloc->udb->inhibit_compact) {
1776 			alloc->udb->useful_compact = 1;
1777 			return 1;
1778 		}
1779 		return udb_alloc_compact(base, alloc);
1780 	}
1781 	/* it is a regular chunk of 2**exp size */
1782 	exp = (int)fp->exp;
1783 	esz = (uint64_t)1<<exp;
1784 	/* light check for e.g. buffer overflow of the data */
1785 	assert(sz < esz);
1786 	assert(chunk_get_last(base, f, exp) == (uint8_t)exp);
1787 #ifdef UDB_CHECK
1788 	/* check that relptrs in this chunk have been zeroed */
1789 	(*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type,
1790 		UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL);
1791 #endif
1792 
1793 	/* update the stats */
1794 	alloc->udb->glob_data->dirty_alloc = udb_dirty_fl;
1795 	alloc->disk->stat_data -= sz;
1796 	alloc->disk->stat_free += esz;
1797 	alloc->disk->stat_alloc -= esz;
1798 
1799 	/* if it can be merged with other free chunks, do so */
1800 	while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) {
1801 		coagulated = 1;
1802 		/* unlink that other chunk and expand it (it has same size) */
1803 		udb_alloc_unlink_fl(base, alloc, other, exp);
1804 		/* merge up */
1805 		if(other < f)
1806 			f = other;
1807 		exp++;
1808 		esz <<= 1;
1809 	}
1810 	if(coagulated) {
1811 		/* put big free chunk into freelist, and init it */
1812 		udb_alloc_push_fl(base, alloc, f, exp);
1813 	} else {
1814 		/* we do not need to touch the last-exp-byte, which may save
1815 		 * a reference to that page of memory */
1816 		fp->type = udb_chunk_type_free;
1817 		fp->flags = 0;
1818 		udb_alloc_push_fl_noinit(base, alloc, f, exp);
1819 	}
1820 	alloc->udb->glob_data->dirty_alloc = udb_dirty_clean;
1821 	/* compact */
1822 	if(alloc->udb->inhibit_compact) {
1823 		alloc->udb->useful_compact = 1;
1824 		return 1;
1825 	}
1826 	return udb_alloc_compact(base, alloc);
1827 }
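/*
 * Illustrative alloc/free pairing (struct and variable names are
 * hypothetical).  The size handed to udb_alloc_free is the size that
 * was requested from udb_alloc_space; it is used for the statistics and
 * sanity checks, the real chunk size comes from the chunk header.
 *
 *	udb_void off = udb_alloc_space(udb->alloc, sizeof(struct mydata));
 *	if(off) {
 *		struct mydata* d = (struct mydata*)UDB_REL(udb->base, off);
 *		... use d; do not cache d across allocations, because a
 *		    grow can re-mmap the base ...
 *		udb_alloc_free(udb->alloc, off, sizeof(struct mydata));
 *	}
 */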
1828 
1829 udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz)
1830 {
1831 	/* this could maybe be faster if the file was grown for it; for now, allocate and copy */
1832 	udb_void r = udb_alloc_space(alloc, sz);
1833 	if(!r) return r;
1834 	memcpy(UDB_REL(alloc->udb->base, r), d, sz);
1835 	return r;
1836 }
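/*
 * For instance, storing a copy of an existing in-memory structure
 * (hypothetical struct) is simply udb_alloc_space followed by a memcpy:
 *
 *	struct mydata local;
 *	... fill local ...
 *	udb_void off = udb_alloc_init(udb->alloc, &local, sizeof(local));
 *	if(!off) return 0;
 */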
1837 
1838 udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz)
1839 {
1840 	void* base = alloc->udb->base;
1841 	udb_void c, n, newd;
1842 	udb_chunk_d* cp, *np;
1843 	uint64_t avail;
1844 	uint8_t cp_type;
1845 	/* emulate the POSIX realloc conventions for a NULL pointer and zero size */
1846 	if(r == 0)
1847 		return udb_alloc_space(alloc, sz);
1848 	if(sz == 0) {
1849 		if(!udb_alloc_free(alloc, r, osz))
1850 			log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1851 		return 0;
1852 	}
1853 	c = chunk_from_dataptr(r);
1854 	cp = UDB_CHUNK(c);
1855 	cp_type = cp->type;
1856 	if(cp->exp == UDB_EXP_XL) {
1857 		avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d)
1858 			- sizeof(uint64_t)*2;
1859 	} else {
1860 		avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1;
1861 	}
1862 	if(sz <= avail)
1863 		return r;
1864 	/* reallocate it, and copy */
1865 	newd = udb_alloc_space(alloc, sz);
1866 	if(!newd) return 0;
1867 	/* re-base after alloc, since re-mmap may have happened */
1868 	base = alloc->udb->base;
1869 	cp = NULL; /* may be invalid now, robustness */
1870 	n = chunk_from_dataptr(newd);
1871 	np = UDB_CHUNK(n);
1872 	np->type = cp_type;
1873 	memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz);
1874 	/* fixup ptrs */
1875 	chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r);
1876 
1877 	if(!udb_alloc_free(alloc, r, osz))
1878 		log_msg(LOG_ERR, "udb_alloc_realloc: free failed");
1879 	return newd;
1880 }
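/*
 * Illustrative growth of an existing allocation (offset and sizes are
 * hypothetical):
 *
 *	udb_void n = udb_alloc_realloc(udb->alloc, off, oldsz, newsz);
 *
 * If newsz still fits in the current chunk the same offset is returned.
 * Otherwise a new chunk is allocated, oldsz bytes are copied, the
 * rel_ptrs that referenced the old data are fixed up to the new offset,
 * and the old chunk is freed.  A zero return with a nonzero newsz means
 * the allocation failed and the old offset is still valid.
 */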
1881 
1882 int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num)
1883 {
1884 	const uint64_t mb = 1024*1024;
1885 	int exp = udb_alloc_exp_needed(sz);
1886 	uint64_t esz;
1887 	uint64_t want;
1888 	if(exp == UDB_EXP_XL)
1889 		esz = (sz&(mb-1))+mb;
1890 	else	esz = (uint64_t)1<<exp;
1891 	/* we need grow_end_calc to take into account alignment */
1892 	want = grow_end_calc(alloc, exp) + esz*(num-1);
1893 	assert(want >= alloc->udb->base_size);
1894 	if(!udb_base_grow_and_remap(alloc->udb, want)) {
1895 		log_msg(LOG_ERR, "failed to grow the specified amount");
1896 		return 0;
1897 	}
1898 	return 1;
1899 }
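/*
 * Illustrative pre-allocation before a bulk insert (the count and the
 * struct are hypothetical): growing the file once for num objects of
 * size sz avoids a grow-and-remap cycle for every object allocated
 * afterwards.
 *
 *	if(!udb_alloc_grow(udb->alloc, sizeof(struct mydata), count))
 *		return 0;
 *	... then allocate count objects with udb_alloc_space() ...
 */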
1900 
1901 void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp)
1902 {
1903 	void* base = alloc->udb->base;
1904 	udb_void f = chunk_from_dataptr(r);
1905 	udb_chunk_d* fp = UDB_CHUNK(f);
1906 	/* not the 'free' type, that must be set by allocation routines */
1907 	assert(fp->type != udb_chunk_type_free);
1908 	assert(tp != udb_chunk_type_free);
1909 	fp->type = tp;
1910 }
1911 
1912 int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize)
1913 {
1914 	/* pointers are not valid before the header-size or after the
1915 	 * used-region of the mmap */
1916 	return ( (to+destsize) <= udb->base_size &&
1917 		to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) &&
1918 		(to+destsize) <= udb->alloc->disk->nextgrow);
1919 }
1920 
1921 int udb_valid_dataptr(udb_base* udb, udb_void to)
1922 {
1923 	void* base = udb->base;
1924 	udb_void ch;
1925 	int exp;
1926 	uint64_t esz;
1927 	/* our data chunks are aligned and at least 8 bytes */
1928 	if(!udb_valid_offset(udb, to, sizeof(uint64_t)))
1929 		return 0;
1930 	/* get the chunk pointer */
1931 	ch = chunk_from_dataptr(to);
1932 	if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d)))
1933 		return 0;
1934 	/* check its size */
1935 	exp = UDB_CHUNK(ch)->exp;
1936 	if(exp == UDB_EXP_XL) {
1937 		/* check XL chunk */
1938 		uint64_t xlsz;
1939 		if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d)))
1940 			return 0;
1941 		xlsz = UDB_XL_CHUNK(ch)->size;
1942 		if(!udb_valid_offset(udb, ch+xlsz-1, 1))
1943 			return 0;
1944 		if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL)
1945 			return 0;
1946 		if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2))
1947 			!= xlsz)
1948 			return 0;
1949 		return 1;
1950 	}
1951 	/* check if regular chunk has matching end byte */
1952 	if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX)
1953 		return 0; /* cannot be a valid chunk */
1954 	esz = 1<<exp;
1955 	if(!udb_valid_offset(udb, ch+esz-1, 1))
1956 		return 0;
1957 	if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp)
1958 		return 0;
1959 	return 1;
1960 }
1961 
1962 int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to)
1963 {
1964 	void* base = udb->base;
1965 	udb_void p;
1966 	if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr)))
1967 		return 0;
1968 	if(!udb_valid_dataptr(udb, to))
1969 		return 0;
1970 	p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist;
1971 	while(p) {
1972 		if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr)))
1973 			return 0;
1974 		if(p == rptr)
1975 			return 1;
1976 		p = UDB_REL_PTR(p)->next;
1977 	}
1978 	return 0;
1979 }
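/*
 * The three validation routines above can be used to check on-disk
 * values before trusting them, for instance when reading a database of
 * unknown integrity (the offset and struct are hypothetical):
 *
 *	if(!udb_valid_dataptr(udb, off))
 *		return 0;
 *	struct mydata* d = (struct mydata*)UDB_REL(udb->base, off);
 *
 * udb_valid_offset checks only the range, udb_valid_dataptr also checks
 * the chunk header and its trailing exp byte, and udb_valid_rptr
 * additionally verifies that the rel_ptr is on the pointee's ptrlist.
 */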
1980 
1981 void udb_rel_ptr_init(udb_rel_ptr* ptr)
1982 {
1983 	memset(ptr, 0, sizeof(*ptr));
1984 }
1985 
1986 void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr)
1987 {
1988 	if(!ptr->data)
1989 		return;
1990 	if(ptr->prev) {
1991 		UDB_REL_PTR(ptr->prev)->next = ptr->next;
1992 	} else {
1993 		UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next;
1994 	}
1995 	if(ptr->next) {
1996 		UDB_REL_PTR(ptr->next)->prev = ptr->prev;
1997 	}
1998 }
1999 
2000 void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to)
2001 {
2002 	udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to));
2003 	ptr->prev = 0;
2004 	ptr->next = chunk->ptrlist;
2005 	if(ptr->next)
2006 		UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr);
2007 	chunk->ptrlist = UDB_SYSTOREL(base, ptr);
2008 	ptr->data = to;
2009 }
2010 
2011 void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to)
2012 {
2013 	assert(to == 0 || to > 64);
2014 	udb_rel_ptr_unlink(base, ptr);
2015 	if(to)
2016 		udb_rel_ptr_link(base, ptr, to);
2017 	else	ptr->data = to;
2018 }
2019 
2020 void udb_rel_ptr_edit(void* base, udb_void list, udb_void to)
2021 {
2022 	udb_void p = list;
2023 	while(p) {
2024 		UDB_REL_PTR(p)->data = to;
2025 		p = UDB_REL_PTR(p)->next;
2026 	}
2027 }
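/*
 * The rel_ptr routines above keep the per-chunk ptrlist consistent, so
 * that every pointer to a chunk can be found and fixed when the chunk
 * moves.  Illustrative use for a rel_ptr stored inside file data (the
 * parent structure and offset are hypothetical):
 *
 *	udb_rel_ptr_init(&parent->child);
 *	udb_rel_ptr_set(base, &parent->child, off);	link to the chunk
 *	...
 *	udb_rel_ptr_set(base, &parent->child, 0);	unlink before free
 *
 * udb_rel_ptr_edit retargets a whole ptrlist at once, which is the
 * operation needed after a chunk's data has been copied elsewhere.
 */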
2028 
2029 #ifdef UDB_CHECK
2030 /** check that all pointers are validly chained */
2031 static void
2032 udb_check_ptrs_valid(udb_base* udb)
2033 {
2034 	size_t i;
2035 	udb_ptr* p, *prev;
2036 	for(i=0; i<udb->ram_size; i++) {
2037 		prev = NULL;
2038 		for(p=udb->ram_hash[i]; p; p=p->next) {
2039 			assert(p->prev == prev);
2040 			assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask)
2041 				== i);
2042 			assert(p->base == &udb->base);
2043 			prev = p;
2044 		}
2045 	}
2046 }
2047 #endif /* UDB_CHECK */
2048 
2049 void udb_ptr_init(udb_ptr* ptr, udb_base* udb)
2050 {
2051 #ifdef UDB_CHECK
2052 	udb_check_ptrs_valid(udb); /* previous ptrs have been unlinked */
2053 #endif
2054 	memset(ptr, 0, sizeof(*ptr));
2055 	ptr->base = &udb->base;
2056 }
2057 
2058 void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval)
2059 {
2060 	assert(newval == 0 || newval > 64);
2061 	if(ptr->data)
2062 		udb_base_unlink_ptr(udb, ptr);
2063 	ptr->data = newval;
2064 	if(newval)
2065 		udb_base_link_ptr(udb, ptr);
2066 }
2067 
2068 int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type,
2069 	size_t sz)
2070 {
2071 	udb_void r;
2072 	r = udb_alloc_space(udb->alloc, sz);
2073 	if(!r) return 0;
2074 	udb_alloc_set_type(udb->alloc, r, type);
2075 	udb_ptr_init(ptr, udb);
2076 	udb_ptr_set(ptr, udb, r);
2077 	return 1;
2078 }
2079 
2080 void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz)
2081 {
2082 	if(ptr->data) {
2083 		udb_void d = ptr->data;
2084 		udb_ptr_set(ptr, udb, 0);
2085 		udb_alloc_free(udb->alloc, d, sz);
2086 	}
2087 }
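/*
 * Illustrative lifecycle of an in-RAM udb_ptr (assuming a chunk type
 * such as udb_chunk_type_data from udb.h; the size is hypothetical):
 *
 *	udb_ptr p;
 *	if(!udb_ptr_alloc_space(&p, udb, udb_chunk_type_data, sz))
 *		return 0;
 *	... access the data via UDB_REL(udb->base, p.data) ...
 *	udb_ptr_free_space(&p, udb, sz);
 *
 * While nonzero, the pointer is linked into udb->ram_hash so that it
 * can be updated when the mmap or the chunk moves; udb_ptr_set(&p, udb,
 * 0) unlinks it without freeing the space.
 */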
2088 
2089 udb_chunk_type udb_ptr_get_type(udb_ptr* ptr)
2090 {
2091 	udb_void f;
2092 	if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad */
2093 	f = chunk_from_dataptr(ptr->data);
2094 	return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type;
2095 }
2096