/* xref: /plan9/sys/src/cmd/venti/srv/arena.c (revision 7ab27030036b6c877a6f81728daeda263d1ca3cf) */
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 
5 typedef struct ASum ASum;
6 
7 struct ASum
8 {
9 	Arena	*arena;
10 	ASum	*next;
11 };
12 
13 static void	sealarena(Arena *arena);
14 static int	okarena(Arena *arena);
15 static int	loadarena(Arena *arena);
16 static CIBlock	*getcib(Arena *arena, int clump, int writing, CIBlock *rock);
17 static void	putcib(Arena *arena, CIBlock *cib);
18 static void	sumproc(void *);
19 static void loadcig(Arena *arena);
20 
21 static QLock	sumlock;
22 static Rendez	sumwait;
23 static ASum	*sumq;
24 static ASum	*sumqtail;
25 static uchar zero[8192];
26 
27 int	arenasumsleeptime;
28 
29 int
initarenasum(void)30 initarenasum(void)
31 {
32 	needzeroscore();  /* OS X */
33 
34 	sumwait.l = &sumlock;
35 
36 	if(vtproc(sumproc, nil) < 0){
37 		seterr(EOk, "can't start arena checksum slave: %r");
38 		return -1;
39 	}
40 	return 0;
41 }
42 
43 /*
44  * make an Arena, and initialize it based upon the disk header and trailer.
45  */
46 Arena*
initarena(Part * part,u64int base,u64int size,u32int blocksize)47 initarena(Part *part, u64int base, u64int size, u32int blocksize)
48 {
49 	Arena *arena;
50 
51 	arena = MKZ(Arena);
52 	arena->part = part;
53 	arena->blocksize = blocksize;
54 	arena->clumpmax = arena->blocksize / ClumpInfoSize;
55 	arena->base = base + blocksize;
56 	arena->size = size - 2 * blocksize;
57 
58 	if(loadarena(arena) < 0){
59 		seterr(ECorrupt, "arena header or trailer corrupted");
60 		freearena(arena);
61 		return nil;
62 	}
63 	if(okarena(arena) < 0){
64 		freearena(arena);
65 		return nil;
66 	}
67 
68 	if(arena->diskstats.sealed && scorecmp(zeroscore, arena->score)==0)
69 		sealarena(arena);
70 
71 	return arena;
72 }
73 
74 void
freearena(Arena * arena)75 freearena(Arena *arena)
76 {
77 	if(arena == nil)
78 		return;
79 	free(arena);
80 }
81 
82 Arena*
newarena(Part * part,u32int vers,char * name,u64int base,u64int size,u32int blocksize)83 newarena(Part *part, u32int vers, char *name, u64int base, u64int size, u32int blocksize)
84 {
85 	int bsize;
86 	Arena *arena;
87 
88 	if(nameok(name) < 0){
89 		seterr(EOk, "illegal arena name", name);
90 		return nil;
91 	}
92 	arena = MKZ(Arena);
93 	arena->part = part;
94 	arena->version = vers;
95 	if(vers == ArenaVersion4)
96 		arena->clumpmagic = _ClumpMagic;
97 	else{
98 		do
99 			arena->clumpmagic = fastrand();
100 		while(arena->clumpmagic==_ClumpMagic || arena->clumpmagic==0);
101 	}
102 	arena->blocksize = blocksize;
103 	arena->clumpmax = arena->blocksize / ClumpInfoSize;
104 	arena->base = base + blocksize;
105 	arena->size = size - 2 * blocksize;
106 
107 	namecp(arena->name, name);
108 
109 	bsize = sizeof zero;
110 	if(bsize > arena->blocksize)
111 		bsize = arena->blocksize;
112 
113 	if(wbarena(arena)<0 || wbarenahead(arena)<0
114 	|| writepart(arena->part, arena->base, zero, bsize)<0){
115 		freearena(arena);
116 		return nil;
117 	}
118 
119 	return arena;
120 }
121 
122 int
readclumpinfo(Arena * arena,int clump,ClumpInfo * ci)123 readclumpinfo(Arena *arena, int clump, ClumpInfo *ci)
124 {
125 	CIBlock *cib, r;
126 
127 	cib = getcib(arena, clump, 0, &r);
128 	if(cib == nil)
129 		return -1;
130 	unpackclumpinfo(ci, &cib->data->data[cib->offset]);
131 	putcib(arena, cib);
132 	return 0;
133 }
134 
135 int
readclumpinfos(Arena * arena,int clump,ClumpInfo * cis,int n)136 readclumpinfos(Arena *arena, int clump, ClumpInfo *cis, int n)
137 {
138 	CIBlock *cib, r;
139 	int i;
140 
141 	/*
142 	 * because the clump blocks are laid out
143 	 * in reverse order at the end of the arena,
144 	 * it can be a few percent faster to read
145 	 * the clumps backwards, which reads the
146 	 * disk blocks forwards.
147 	 */
148 	for(i = n-1; i >= 0; i--){
149 		cib = getcib(arena, clump + i, 0, &r);
150 		if(cib == nil){
151 			n = i;
152 			continue;
153 		}
154 		unpackclumpinfo(&cis[i], &cib->data->data[cib->offset]);
155 		putcib(arena, cib);
156 	}
157 	return n;
158 }
159 
160 /*
161  * write directory information for one clump
162  * must be called the arena locked
163  */
164 int
writeclumpinfo(Arena * arena,int clump,ClumpInfo * ci)165 writeclumpinfo(Arena *arena, int clump, ClumpInfo *ci)
166 {
167 	CIBlock *cib, r;
168 
169 	cib = getcib(arena, clump, 1, &r);
170 	if(cib == nil)
171 		return -1;
172 	dirtydblock(cib->data, DirtyArenaCib);
173 	packclumpinfo(ci, &cib->data->data[cib->offset]);
174 	putcib(arena, cib);
175 	return 0;
176 }
177 
178 u64int
arenadirsize(Arena * arena,u32int clumps)179 arenadirsize(Arena *arena, u32int clumps)
180 {
181 	return ((clumps / arena->clumpmax) + 1) * arena->blocksize;
182 }
183 
184 /*
185  * read a clump of data
186  * n is a hint of the size of the data, not including the header
187  * make sure it won't run off the end, then return the number of bytes actually read
188  */
189 u32int
readarena(Arena * arena,u64int aa,u8int * buf,long n)190 readarena(Arena *arena, u64int aa, u8int *buf, long n)
191 {
192 	DBlock *b;
193 	u64int a;
194 	u32int blocksize, off, m;
195 	long nn;
196 
197 	if(n == 0)
198 		return -1;
199 
200 	qlock(&arena->lock);
201 	a = arena->size - arenadirsize(arena, arena->memstats.clumps);
202 	qunlock(&arena->lock);
203 	if(aa >= a){
204 		seterr(EOk, "reading beyond arena clump storage: clumps=%d aa=%lld a=%lld -1 clumps=%lld\n", arena->memstats.clumps, aa, a, arena->size - arenadirsize(arena, arena->memstats.clumps - 1));
205 		return -1;
206 	}
207 	if(aa + n > a)
208 		n = a - aa;
209 
210 	blocksize = arena->blocksize;
211 	a = arena->base + aa;
212 	off = a & (blocksize - 1);
213 	a -= off;
214 	nn = 0;
215 	for(;;){
216 		b = getdblock(arena->part, a, OREAD);
217 		if(b == nil)
218 			return -1;
219 		m = blocksize - off;
220 		if(m > n - nn)
221 			m = n - nn;
222 		memmove(&buf[nn], &b->data[off], m);
223 		putdblock(b);
224 		nn += m;
225 		if(nn == n)
226 			break;
227 		off = 0;
228 		a += blocksize;
229 	}
230 	return n;
231 }
232 
233 /*
234  * write some data to the clump section at a given offset
235  * used to fix up corrupted arenas.
236  */
237 u32int
writearena(Arena * arena,u64int aa,u8int * clbuf,u32int n)238 writearena(Arena *arena, u64int aa, u8int *clbuf, u32int n)
239 {
240 	DBlock *b;
241 	u64int a;
242 	u32int blocksize, off, m;
243 	long nn;
244 	int ok;
245 
246 	if(n == 0)
247 		return -1;
248 
249 	qlock(&arena->lock);
250 	a = arena->size - arenadirsize(arena, arena->memstats.clumps);
251 	if(aa >= a || aa + n > a){
252 		qunlock(&arena->lock);
253 		seterr(EOk, "writing beyond arena clump storage");
254 		return -1;
255 	}
256 
257 	blocksize = arena->blocksize;
258 	a = arena->base + aa;
259 	off = a & (blocksize - 1);
260 	a -= off;
261 	nn = 0;
262 	for(;;){
263 		b = getdblock(arena->part, a, off != 0 || off + n < blocksize ? ORDWR : OWRITE);
264 		if(b == nil){
265 			qunlock(&arena->lock);
266 			return -1;
267 		}
268 		dirtydblock(b, DirtyArena);
269 		m = blocksize - off;
270 		if(m > n - nn)
271 			m = n - nn;
272 		memmove(&b->data[off], &clbuf[nn], m);
273 		ok = 0;
274 		putdblock(b);
275 		if(ok < 0){
276 			qunlock(&arena->lock);
277 			return -1;
278 		}
279 		nn += m;
280 		if(nn == n)
281 			break;
282 		off = 0;
283 		a += blocksize;
284 	}
285 	qunlock(&arena->lock);
286 	return n;
287 }
288 
289 /*
290  * allocate space for the clump and write it,
291  * updating the arena directory
292 ZZZ question: should this distinguish between an arena
293 filling up and real errors writing the clump?
294  */
295 u64int
writeaclump(Arena * arena,Clump * c,u8int * clbuf)296 writeaclump(Arena *arena, Clump *c, u8int *clbuf)
297 {
298 	DBlock *b;
299 	u64int a, aa;
300 	u32int clump, n, nn, m, off, blocksize;
301 	int ok;
302 
303 	n = c->info.size + ClumpSize + U32Size;
304 	qlock(&arena->lock);
305 	aa = arena->memstats.used;
306 	if(arena->memstats.sealed
307 	|| aa + n + U32Size + arenadirsize(arena, arena->memstats.clumps + 1) > arena->size){
308 		if(!arena->memstats.sealed){
309 			logerr(EOk, "seal memstats %s", arena->name);
310 			arena->memstats.sealed = 1;
311 			wbarena(arena);
312 		}
313 		qunlock(&arena->lock);
314 		return TWID64;
315 	}
316 	if(packclump(c, &clbuf[0], arena->clumpmagic) < 0){
317 		qunlock(&arena->lock);
318 		return TWID64;
319 	}
320 
321 	/*
322 	 * write the data out one block at a time
323 	 */
324 	blocksize = arena->blocksize;
325 	a = arena->base + aa;
326 	off = a & (blocksize - 1);
327 	a -= off;
328 	nn = 0;
329 	for(;;){
330 		b = getdblock(arena->part, a, off != 0 ? ORDWR : OWRITE);
331 		if(b == nil){
332 			qunlock(&arena->lock);
333 			return TWID64;
334 		}
335 		dirtydblock(b, DirtyArena);
336 		m = blocksize - off;
337 		if(m > n - nn)
338 			m = n - nn;
339 		memmove(&b->data[off], &clbuf[nn], m);
340 		ok = 0;
341 		putdblock(b);
342 		if(ok < 0){
343 			qunlock(&arena->lock);
344 			return TWID64;
345 		}
346 		nn += m;
347 		if(nn == n)
348 			break;
349 		off = 0;
350 		a += blocksize;
351 	}
352 
353 	arena->memstats.used += c->info.size + ClumpSize;
354 	arena->memstats.uncsize += c->info.uncsize;
355 	if(c->info.size < c->info.uncsize)
356 		arena->memstats.cclumps++;
357 
358 	clump = arena->memstats.clumps;
359 	if(clump % ArenaCIGSize == 0){
360 		if(arena->cig == nil){
361 			loadcig(arena);
362 			if(arena->cig == nil)
363 				goto NoCIG;
364 		}
365 		/* add aa as start of next cig */
366 		if(clump/ArenaCIGSize != arena->ncig){
367 			fprint(2, "bad arena cig computation %s: writing clump %d but %d cigs\n",
368 				arena->name, clump, arena->ncig);
369 			arena->ncig = -1;
370 			vtfree(arena->cig);
371 			arena->cig = nil;
372 			goto NoCIG;
373 		}
374 		arena->cig = vtrealloc(arena->cig, (arena->ncig+1)*sizeof arena->cig[0]);
375 		arena->cig[arena->ncig++].offset = aa;
376 	}
377 NoCIG:
378 	arena->memstats.clumps++;
379 
380 	if(arena->memstats.clumps == 0)
381 		sysfatal("clumps wrapped");
382 	arena->wtime = now();
383 	if(arena->ctime == 0)
384 		arena->ctime = arena->wtime;
385 
386 	writeclumpinfo(arena, clump, &c->info);
387 	wbarena(arena);
388 
389 	qunlock(&arena->lock);
390 
391 	return aa;
392 }
393 
394 int
atailcmp(ATailStats * a,ATailStats * b)395 atailcmp(ATailStats *a, ATailStats *b)
396 {
397 	/* good test */
398 	if(a->used < b->used)
399 		return -1;
400 	if(a->used > b->used)
401 		return 1;
402 
403 	/* suspect tests - why order this way? (no one cares) */
404 	if(a->clumps < b->clumps)
405 		return -1;
406 	if(a->clumps > b->clumps)
407 		return 1;
408 	if(a->cclumps < b->cclumps)
409 		return -1;
410 	if(a->cclumps > b->cclumps)
411 		return 1;
412 	if(a->uncsize < b->uncsize)
413 		return -1;
414 	if(a->uncsize > b->uncsize)
415 		return 1;
416 	if(a->sealed < b->sealed)
417 		return -1;
418 	if(a->sealed > b->sealed)
419 		return 1;
420 
421 	/* everything matches */
422 	return 0;
423 }
424 
425 void
setatailstate(AState * as)426 setatailstate(AState *as)
427 {
428 	int i, j, osealed;
429 	Arena *a;
430 	Index *ix;
431 
432 	trace(0, "setatailstate %s 0x%llux clumps %d", as->arena->name, as->aa, as->stats.clumps);
433 
434 	/*
435 	 * Look up as->arena to find index.
436 	 */
437 	needmainindex();	/* OS X linker */
438 	ix = mainindex;
439 	for(i=0; i<ix->narenas; i++)
440 		if(ix->arenas[i] == as->arena)
441 			break;
442 	if(i==ix->narenas || as->aa < ix->amap[i].start || as->aa >= ix->amap[i].stop || as->arena != ix->arenas[i]){
443 		fprint(2, "funny settailstate 0x%llux\n", as->aa);
444 		return;
445 	}
446 
447 	for(j=0; j<=i; j++){
448 		a = ix->arenas[j];
449 		if(atailcmp(&a->diskstats, &a->memstats) == 0)
450 			continue;
451 		qlock(&a->lock);
452 		osealed = a->diskstats.sealed;
453 		if(j == i)
454 			a->diskstats = as->stats;
455 		else
456 			a->diskstats = a->memstats;
457 		wbarena(a);
458 		if(a->diskstats.sealed != osealed && !a->inqueue)
459 			sealarena(a);
460 		qunlock(&a->lock);
461 	}
462 }
463 
464 /*
465  * once sealed, an arena never has any data added to it.
466  * it should only be changed to fix errors.
467  * this also syncs the clump directory.
468  */
469 static void
sealarena(Arena * arena)470 sealarena(Arena *arena)
471 {
472 	arena->inqueue = 1;
473 	backsumarena(arena);
474 }
475 
476 void
backsumarena(Arena * arena)477 backsumarena(Arena *arena)
478 {
479 	ASum *as;
480 
481 	if(sumwait.l == nil)
482 		return;
483 
484 	as = MK(ASum);
485 	if(as == nil)
486 		return;
487 	qlock(&sumlock);
488 	as->arena = arena;
489 	as->next = nil;
490 	if(sumq)
491 		sumqtail->next = as;
492 	else
493 		sumq = as;
494 	sumqtail = as;
495 	rwakeup(&sumwait);
496 	qunlock(&sumlock);
497 }
498 
499 static void
sumproc(void * unused)500 sumproc(void *unused)
501 {
502 	ASum *as;
503 	Arena *arena;
504 
505 	USED(unused);
506 
507 	for(;;){
508 		qlock(&sumlock);
509 		while(sumq == nil)
510 			rsleep(&sumwait);
511 		as = sumq;
512 		sumq = as->next;
513 		qunlock(&sumlock);
514 		arena = as->arena;
515 		free(as);
516 
517 		sumarena(arena);
518 	}
519 }
520 
521 void
sumarena(Arena * arena)522 sumarena(Arena *arena)
523 {
524 	ZBlock *b;
525 	DigestState s;
526 	u64int a, e;
527 	u32int bs;
528 	int t;
529 	u8int score[VtScoreSize];
530 
531 	bs = MaxIoSize;
532 	if(bs < arena->blocksize)
533 		bs = arena->blocksize;
534 
535 	/*
536 	 * read & sum all blocks except the last one
537 	 */
538 	flushdcache();
539 	memset(&s, 0, sizeof s);
540 	b = alloczblock(bs, 0, arena->part->blocksize);
541 	e = arena->base + arena->size;
542 	for(a = arena->base - arena->blocksize; a + arena->blocksize <= e; a += bs){
543 		disksched();
544 		while((t=arenasumsleeptime) == SleepForever){
545 			sleep(1000);
546 			disksched();
547 		}
548 		sleep(t);
549 		if(a + bs > e)
550 			bs = arena->blocksize;
551 		if(readpart(arena->part, a, b->data, bs) < 0)
552 			goto ReadErr;
553 		addstat(StatSumRead, 1);
554 		addstat(StatSumReadBytes, bs);
555 		sha1(b->data, bs, nil, &s);
556 	}
557 
558 	/*
559 	 * the last one is special, since it may already have the checksum included
560 	 */
561 	bs = arena->blocksize;
562 	if(readpart(arena->part, e, b->data, bs) < 0){
563 ReadErr:
564 		logerr(EOk, "sumarena can't sum %s, read at %lld failed: %r", arena->name, a);
565 		freezblock(b);
566 		return;
567 	}
568 	addstat(StatSumRead, 1);
569 	addstat(StatSumReadBytes, bs);
570 
571 	sha1(b->data, bs-VtScoreSize, nil, &s);
572 	sha1(zeroscore, VtScoreSize, nil, &s);
573 	sha1(nil, 0, score, &s);
574 
575 	/*
576 	 * check for no checksum or the same
577 	 */
578 	if(scorecmp(score, &b->data[bs - VtScoreSize]) != 0
579 	&& scorecmp(zeroscore, &b->data[bs - VtScoreSize]) != 0)
580 		logerr(EOk, "overwriting mismatched checksums for arena=%s, found=%V calculated=%V",
581 			arena->name, &b->data[bs - VtScoreSize], score);
582 	freezblock(b);
583 
584 	qlock(&arena->lock);
585 	scorecp(arena->score, score);
586 	wbarena(arena);
587 	qunlock(&arena->lock);
588 }
589 
590 /*
591  * write the arena trailer block to the partition
592  */
593 int
wbarena(Arena * arena)594 wbarena(Arena *arena)
595 {
596 	DBlock *b;
597 	int bad;
598 
599 	if((b = getdblock(arena->part, arena->base + arena->size, OWRITE)) == nil){
600 		logerr(EAdmin, "can't write arena trailer: %r");
601 		return -1;
602 	}
603 	dirtydblock(b, DirtyArenaTrailer);
604 	bad = okarena(arena)<0 || packarena(arena, b->data)<0;
605 	scorecp(b->data + arena->blocksize - VtScoreSize, arena->score);
606 	putdblock(b);
607 	if(bad)
608 		return -1;
609 	return 0;
610 }
611 
612 int
wbarenahead(Arena * arena)613 wbarenahead(Arena *arena)
614 {
615 	ZBlock *b;
616 	ArenaHead head;
617 	int bad;
618 
619 	namecp(head.name, arena->name);
620 	head.version = arena->version;
621 	head.size = arena->size + 2 * arena->blocksize;
622 	head.blocksize = arena->blocksize;
623 	head.clumpmagic = arena->clumpmagic;
624 	b = alloczblock(arena->blocksize, 1, arena->part->blocksize);
625 	if(b == nil){
626 		logerr(EAdmin, "can't write arena header: %r");
627 /* ZZZ add error message? */
628 		return -1;
629 	}
630 	/*
631 	 * this writepart is okay because it only happens
632 	 * during initialization.
633 	 */
634 	bad = packarenahead(&head, b->data)<0 ||
635 	      writepart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize)<0 ||
636 	      flushpart(arena->part)<0;
637 	freezblock(b);
638 	if(bad)
639 		return -1;
640 	return 0;
641 }
642 
643 /*
644  * read the arena header and trailer blocks from disk
645  */
646 static int
loadarena(Arena * arena)647 loadarena(Arena *arena)
648 {
649 	ArenaHead head;
650 	ZBlock *b;
651 
652 	b = alloczblock(arena->blocksize, 0, arena->part->blocksize);
653 	if(b == nil)
654 		return -1;
655 	if(readpart(arena->part, arena->base + arena->size, b->data, arena->blocksize) < 0){
656 		freezblock(b);
657 		return -1;
658 	}
659 	if(unpackarena(arena, b->data) < 0){
660 		freezblock(b);
661 		return -1;
662 	}
663 	if(arena->version != ArenaVersion4 && arena->version != ArenaVersion5){
664 		seterr(EAdmin, "unknown arena version %d", arena->version);
665 		freezblock(b);
666 		return -1;
667 	}
668 	scorecp(arena->score, &b->data[arena->blocksize - VtScoreSize]);
669 
670 	if(readpart(arena->part, arena->base - arena->blocksize, b->data, arena->blocksize) < 0){
671 		logerr(EAdmin, "can't read arena header: %r");
672 		freezblock(b);
673 		return 0;
674 	}
675 	if(unpackarenahead(&head, b->data) < 0)
676 		logerr(ECorrupt, "corrupted arena header: %r");
677 	else if(namecmp(arena->name, head.name)!=0
678 	     || arena->clumpmagic != head.clumpmagic
679 	     || arena->version != head.version
680 	     || arena->blocksize != head.blocksize
681 	     || arena->size + 2 * arena->blocksize != head.size){
682 		if(namecmp(arena->name, head.name)!=0)
683 			logerr(ECorrupt, "arena tail name %s head %s",
684 				arena->name, head.name);
685 		else if(arena->clumpmagic != head.clumpmagic)
686 			logerr(ECorrupt, "arena %d tail clumpmagic 0x%lux head 0x%lux",
687 				debugarena, (ulong)arena->clumpmagic,
688 				(ulong)head.clumpmagic);
689 		else if(arena->version != head.version)
690 			logerr(ECorrupt, "arena tail version %d head version %d",
691 				arena->version, head.version);
692 		else if(arena->blocksize != head.blocksize)
693 			logerr(ECorrupt, "arena tail block size %d head %d",
694 				arena->blocksize, head.blocksize);
695 		else if(arena->size+2*arena->blocksize != head.size)
696 			logerr(ECorrupt, "arena tail size %lud head %lud",
697 				(ulong)arena->size+2*arena->blocksize, head.size);
698 		else
699 			logerr(ECorrupt, "arena header inconsistent with arena data");
700 	}
701 	freezblock(b);
702 
703 	return 0;
704 }
705 
706 static int
okarena(Arena * arena)707 okarena(Arena *arena)
708 {
709 	u64int dsize;
710 	int ok;
711 
712 	ok = 0;
713 	dsize = arenadirsize(arena, arena->diskstats.clumps);
714 	if(arena->diskstats.used + dsize > arena->size){
715 		seterr(ECorrupt, "arena %s used > size", arena->name);
716 		ok = -1;
717 	}
718 
719 	if(arena->diskstats.cclumps > arena->diskstats.clumps)
720 		logerr(ECorrupt, "arena %s has more compressed clumps than total clumps", arena->name);
721 
722 	/*
723 	 * This need not be true if some of the disk is corrupted.
724 	 *
725 	if(arena->diskstats.uncsize + arena->diskstats.clumps * ClumpSize + arena->blocksize < arena->diskstats.used)
726 		logerr(ECorrupt, "arena %s uncompressed size inconsistent with used space %lld %d %lld", arena->name, arena->diskstats.uncsize, arena->diskstats.clumps, arena->diskstats.used);
727 	 */
728 
729 	/*
730 	 * this happens; it's harmless.
731 	 *
732 	if(arena->ctime > arena->wtime)
733 		logerr(ECorrupt, "arena %s creation time after last write time", arena->name);
734 	 */
735 	return ok;
736 }
737 
738 static CIBlock*
getcib(Arena * arena,int clump,int writing,CIBlock * rock)739 getcib(Arena *arena, int clump, int writing, CIBlock *rock)
740 {
741 	int mode;
742 	CIBlock *cib;
743 	u32int block, off;
744 
745 	if(clump >= arena->memstats.clumps){
746 		seterr(EOk, "clump directory access out of range");
747 		return nil;
748 	}
749 	block = clump / arena->clumpmax;
750 	off = (clump - block * arena->clumpmax) * ClumpInfoSize;
751 	cib = rock;
752 	cib->block = block;
753 	cib->offset = off;
754 
755 	if(writing){
756 		if(off == 0 && clump == arena->memstats.clumps-1)
757 			mode = OWRITE;
758 		else
759 			mode = ORDWR;
760 	}else
761 		mode = OREAD;
762 
763 	cib->data = getdblock(arena->part,
764 		arena->base + arena->size - (block + 1) * arena->blocksize, mode);
765 	if(cib->data == nil)
766 		return nil;
767 	return cib;
768 }
769 
770 static void
putcib(Arena * arena,CIBlock * cib)771 putcib(Arena *arena, CIBlock *cib)
772 {
773 	USED(arena);
774 
775 	putdblock(cib->data);
776 	cib->data = nil;
777 }
778 
779 
780 /*
781  * For index entry readahead purposes, the arenas are
782  * broken into smaller subpieces, called clump info groups
783  * or cigs.  Each cig has ArenaCIGSize clumps (ArenaCIGSize
784  * is chosen to make the index entries take up about half
785  * a megabyte).  The index entries do not contain enough
786  * information to determine what the clump index is for
787  * a given address in an arena.  That info is needed both for
788  * figuring out which clump group an address belongs to
789  * and for prefetching a clump group's index entries from
790  * the arena table of contents.  The first time clump groups
791  * are accessed, we scan the entire arena table of contents
792  * (which might be 10s of megabytes), recording the data
793  * offset of each clump group.
794  */
795 
796 /*
797  * load clump info group information by scanning entire toc.
798  */
799 static void
loadcig(Arena * arena)800 loadcig(Arena *arena)
801 {
802 	u32int i, j, ncig, nci;
803 	ArenaCIG *cig;
804 	ClumpInfo *ci;
805 	u64int offset;
806 	int ms;
807 
808 	if(arena->cig || arena->ncig < 0)
809 		return;
810 
811 //	fprint(2, "loadcig %s\n", arena->name);
812 
813 	ncig = (arena->memstats.clumps+ArenaCIGSize-1) / ArenaCIGSize;
814 	if(ncig == 0){
815 		arena->cig = vtmalloc(1);
816 		arena->ncig = 0;
817 		return;
818 	}
819 
820 	ms = msec();
821 	cig = vtmalloc(ncig*sizeof cig[0]);
822 	ci = vtmalloc(ArenaCIGSize*sizeof ci[0]);
823 	offset = 0;
824 	for(i=0; i<ncig; i++){
825 		nci = readclumpinfos(arena, i*ArenaCIGSize, ci, ArenaCIGSize);
826 		cig[i].offset = offset;
827 		for(j=0; j<nci; j++)
828 			offset += ClumpSize + ci[j].size;
829 		if(nci < ArenaCIGSize){
830 			if(i != ncig-1){
831 				vtfree(ci);
832 				vtfree(cig);
833 				arena->ncig = -1;
834 				fprint(2, "loadcig %s: got %ud cigs, expected %ud\n", arena->name, i+1, ncig);
835 				goto out;
836 			}
837 		}
838 	}
839 	vtfree(ci);
840 
841 	arena->ncig = ncig;
842 	arena->cig = cig;
843 
844 out:
845 	ms = msec() - ms;
846 	addstat2(StatCigLoad, 1, StatCigLoadTime, ms);
847 }
848 
849 /*
850  * convert arena address into arena group + data boundaries.
851  */
852 int
arenatog(Arena * arena,u64int addr,u64int * gstart,u64int * glimit,int * g)853 arenatog(Arena *arena, u64int addr, u64int *gstart, u64int *glimit, int *g)
854 {
855 	int r, l, m;
856 
857 	qlock(&arena->lock);
858 	if(arena->cig == nil)
859 		loadcig(arena);
860 	if(arena->cig == nil || arena->ncig == 0){
861 		qunlock(&arena->lock);
862 		return -1;
863 	}
864 
865 	l = 1;
866 	r = arena->ncig - 1;
867 	while(l <= r){
868 		m = (r + l) / 2;
869 		if(arena->cig[m].offset <= addr)
870 			l = m + 1;
871 		else
872 			r = m - 1;
873 	}
874 	l--;
875 
876 	*g = l;
877 	*gstart = arena->cig[l].offset;
878 	if(l+1 < arena->ncig)
879 		*glimit = arena->cig[l+1].offset;
880 	else
881 		*glimit = arena->memstats.used;
882 	qunlock(&arena->lock);
883 	return 0;
884 }
885 
886 /*
887  * load the clump info for group g into the index entries.
888  */
889 int
asumload(Arena * arena,int g,IEntry * entries,int nentries)890 asumload(Arena *arena, int g, IEntry *entries, int nentries)
891 {
892 	int i, base, limit;
893 	u64int addr;
894 	ClumpInfo ci;
895 	IEntry *ie;
896 
897 	if(nentries < ArenaCIGSize){
898 		fprint(2, "asking for too few entries\n");
899 		return -1;
900 	}
901 
902 	qlock(&arena->lock);
903 	if(arena->cig == nil)
904 		loadcig(arena);
905 	if(arena->cig == nil || arena->ncig == 0 || g >= arena->ncig){
906 		qunlock(&arena->lock);
907 		return -1;
908 	}
909 
910 	addr = 0;
911 	base = g*ArenaCIGSize;
912 	limit = base + ArenaCIGSize;
913 	if(base > arena->memstats.clumps)
914 		base = arena->memstats.clumps;
915 	ie = entries;
916 	for(i=base; i<limit; i++){
917 		if(readclumpinfo(arena, i, &ci) < 0)
918 			break;
919 		if(ci.type != VtCorruptType){
920 			scorecp(ie->score, ci.score);
921 			ie->ia.type = ci.type;
922 			ie->ia.size = ci.uncsize;
923 			ie->ia.blocks = (ci.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
924 			ie->ia.addr = addr;
925 			ie++;
926 		}
927 		addr += ClumpSize + ci.size;
928 	}
929 	qunlock(&arena->lock);
930 	return ie - entries;
931 }
932