xref: /plan9/sys/src/cmd/fossil/cache.c (revision a84536681645e23c630ce4ef2e5c3b284d4c590b)
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5 
6 #include "9.h"	/* for cacheFlush */
7 
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10 
11 enum {
12 	BadHeap = ~0,
13 };
14 
15 /*
16  * Data is stored in the memory cache in c->size blocks,
17  * with each block zero-extended to fill it out.  When writing to
18  * Venti, the block will be zero-truncated.  The walker will also check
19  * that the block fits within psize or dsize as the case may be.
20  */
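/*
 * For example, with an 8192-byte c->size, a 1200-byte data block fetched
 * from Venti occupies bytes [0,1200) of its cache block and bytes
 * [1200,8192) are zeroed (see the vtZeroExtend call in cacheGlobal);
 * the trailing zeros are dropped again when the block goes back to Venti.
 */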
21 
22 struct Cache
23 {
24 	VtLock	*lk;
25 	int 	ref;
26 	int	mode;
27 
28 	Disk 	*disk;
29 	int	size;			/* block size */
30 	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
31 	VtSession *z;
32 	u32int	now;			/* ticks for usage timestamps */
33 	Block	**heads;		/* hash table for finding address */
34 	int	nheap;			/* number of available victims */
35 	Block	**heap;			/* heap for locating victims */
36 	long	nblocks;		/* number of blocks allocated */
37 	Block	*blocks;		/* array of block descriptors */
38 	u8int	*mem;			/* memory for all block data & blists */
39 
40 	BList	*blfree;		/* free list of BList structures */
41 	VtRendez *blrend;		/* sleep here when blfree is empty */
42 
43 	int 	ndirty;			/* number of dirty blocks in the cache */
44 	int 	maxdirty;		/* max number of dirty blocks */
45 	u32int	vers;
46 
47 	long hashSize;
48 
49 	FreeList *fl;
50 
51 	VtRendez *die;			/* daemon threads should die when != nil */
52 
53 	VtRendez *flush;		/* wake the flush thread */
54 	VtRendez *flushwait;		/* wait here for a flush pass to finish */
55 	VtRendez *heapwait;		/* wait here for a block to be returned to the heap */
56 	BAddr *baddr;			/* dirty block addresses gathered for the flush thread */
57 	int bw, br, be;			/* baddr write index, read index, and end of the current pass */
58 	int nflush;
59 
60 	Periodic *sync;
61 
62 	/* unlink daemon */
63 	BList *uhead;
64 	BList *utail;
65 	VtRendez *unlink;
66 
67 	/* block counts */
68 	int nused;
69 	int ndisk;
70 };
71 
72 struct BList {
73 	int part;
74 	u32int addr;
75 	uchar type;
76 	u32int tag;
77 	u32int epoch;
78 	u32int vers;
79 
80 	int recurse;	/* for block unlink */
81 
82 	/* for roll back */
83 	int index;			/* -1 indicates not valid */
84 	union {
85 		uchar score[VtScoreSize];
86 		uchar entry[VtEntrySize];
87 	} old;
88 	BList *next;
89 };
90 
91 struct BAddr {
92 	int part;
93 	u32int addr;
94 	u32int vers;
95 };
96 
97 struct FreeList {
98 	VtLock *lk;
99 	u32int last;	/* last block allocated */
100 	u32int end;	/* end of data partition */
101 	u32int nfree;	/* number of free blocks */
102 	u32int nused;	/* number of used blocks */
103 	u32int	epochLow;	/* low epoch at the time nfree and nused were last updated */
104 };
105 
106 static FreeList *flAlloc(u32int end);
107 static void flFree(FreeList *fl);
108 
109 static Block *cacheBumpBlock(Cache *c);
110 static void heapDel(Block*);
111 static void heapIns(Block*);
112 static void cacheCheck(Cache*);
113 static void unlinkThread(void *a);
114 static void flushThread(void *a);
115 static void flushBody(Cache *c);
116 static void unlinkBody(Cache *c);
117 static int cacheFlushBlock(Cache *c);
118 static void cacheSync(void*);
119 static BList *blistAlloc(Block*);
120 static void blistFree(Cache*, BList*);
121 static void doRemoveLink(Cache*, BList*);
122 static void doRemoveLinkList(Cache*, BList*);
123 
124 /*
125  * Mapping from local block type to Venti type
126  */
127 int vtType[BtMax] = {
128 	VtDataType,		/* BtData | 0  */
129 	VtPointerType0,		/* BtData | 1  */
130 	VtPointerType1,		/* BtData | 2  */
131 	VtPointerType2,		/* BtData | 3  */
132 	VtPointerType3,		/* BtData | 4  */
133 	VtPointerType4,		/* BtData | 5  */
134 	VtPointerType5,		/* BtData | 6  */
135 	VtPointerType6,		/* BtData | 7  */
136 	VtDirType,		/* BtDir | 0  */
137 	VtPointerType0,		/* BtDir | 1  */
138 	VtPointerType1,		/* BtDir | 2  */
139 	VtPointerType2,		/* BtDir | 3  */
140 	VtPointerType3,		/* BtDir | 4  */
141 	VtPointerType4,		/* BtDir | 5  */
142 	VtPointerType5,		/* BtDir | 6  */
143 	VtPointerType6,		/* BtDir | 7  */
144 };
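/*
 * For instance, vtType[BtDir|2] is VtPointerType1: a second-level
 * pointer block above a directory is stored on Venti as a generic
 * pointer block.  cacheGlobal and cacheAllocBlock consult this table
 * when calling vtRead and vtZeroExtend.
 */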
145 
146 /*
147  * Allocate the memory cache.
148  */
149 Cache *
150 cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
151 {
152 	int i;
153 	Cache *c;
154 	Block *b;
155 	BList *bl;
156 	u8int *p;
157 	int nbl;
158 
159 	c = vtMemAllocZ(sizeof(Cache));
160 
161 	/* reasonable number of BList elements */
162 	nbl = nblocks * 4;
163 
164 	c->lk = vtLockAlloc();
165 	c->ref = 1;
166 	c->disk = disk;
167 	c->z = z;
168 	c->size = diskBlockSize(disk);
169 bwatchSetBlockSize(c->size);
170 	/* round c->size up to be a nice multiple */
171 	c->size = (c->size + 127) & ~127;
172 	c->ndmap = (c->size/20 + 7) / 8;
173 	c->nblocks = nblocks;
174 	c->hashSize = nblocks;
175 	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
176 	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
177 	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
178 	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
179 	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
180 	c->mode = mode;
181 	c->vers++;
182 	p = c->mem;
183 	for(i = 0; i < nblocks; i++){
184 		b = &c->blocks[i];
185 		b->lk = vtLockAlloc();
186 		b->c = c;
187 		b->data = p;
188 		b->heap = i;
189 		b->ioready = vtRendezAlloc(b->lk);
190 		c->heap[i] = b;
191 		p += c->size;
192 	}
193 	c->nheap = nblocks;
194 	for(i = 0; i < nbl; i++){
195 		bl = (BList*)p;
196 		bl->next = c->blfree;
197 		c->blfree = bl;
198 		p += sizeof(BList);
199 	}
200 	/* separate loop to keep blocks and blists reasonably aligned */
201 	for(i = 0; i < nblocks; i++){
202 		b = &c->blocks[i];
203 		b->dmap = p;
204 		p += c->ndmap;
205 	}
206 
207 	c->blrend = vtRendezAlloc(c->lk);
208 
209 	c->maxdirty = nblocks*(DirtyPercentage*0.01);
210 
211 	c->fl = flAlloc(diskSize(disk, PartData));
212 
213 	c->unlink = vtRendezAlloc(c->lk);
214 	c->flush = vtRendezAlloc(c->lk);
215 	c->flushwait = vtRendezAlloc(c->lk);
216 	c->heapwait = vtRendezAlloc(c->lk);
217 	c->sync = periodicAlloc(cacheSync, c, 30*1000);
218 
219 	if(mode == OReadWrite){
220 		c->ref += 2;
221 		vtThread(unlinkThread, c);
222 		vtThread(flushThread, c);
223 	}
224 	cacheCheck(c);
225 
226 	return c;
227 }
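/*
 * A minimal usage sketch (the surrounding fs code supplies the real
 * disk, Venti session and block count; the names here are hypothetical):
 *
 *	c = cacheAlloc(disk, z, ncache, OReadWrite);
 *	...use cacheLocal, cacheGlobal, cacheAllocBlock...
 *	cacheFlush(c, 1);
 *	cacheFree(c);
 */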
228 
229 /*
230  * Free the whole memory cache, flushing all dirty blocks to the disk.
231  */
232 void
233 cacheFree(Cache *c)
234 {
235 	int i;
236 
237 	/* kill off daemon threads */
238 	vtLock(c->lk);
239 	c->die = vtRendezAlloc(c->lk);
240 	periodicKill(c->sync);
241 	vtWakeup(c->flush);
242 	vtWakeup(c->unlink);
243 	while(c->ref > 1)
244 		vtSleep(c->die);
245 
246 	/* flush everything out */
247 	do {
248 		unlinkBody(c);
249 		vtUnlock(c->lk);
250 		while(cacheFlushBlock(c))
251 			;
252 		diskFlush(c->disk);
253 		vtLock(c->lk);
254 	} while(c->uhead || c->ndirty);
255 	vtUnlock(c->lk);
256 
257 	cacheCheck(c);
258 
259 	for(i = 0; i < c->nblocks; i++){
260 		assert(c->blocks[i].ref == 0);
261 		vtRendezFree(c->blocks[i].ioready);
262 		vtLockFree(c->blocks[i].lk);
263 	}
264 	flFree(c->fl);
265 	vtMemFree(c->baddr);
266 	vtMemFree(c->heads);
267 	vtMemFree(c->blocks);
268 	vtMemFree(c->mem);
269 	vtLockFree(c->lk);
270 	diskFree(c->disk);
271 	vtRendezFree(c->blrend);
272 	/* don't close vtSession */
273 	vtMemFree(c);
274 }
275 
276 static void
277 cacheDump(Cache *c)
278 {
279 	int i;
280 	Block *b;
281 
282 	for(i = 0; i < c->nblocks; i++){
283 		b = &c->blocks[i];
284 		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=0x%lux\n",
285 			i, b->part, b->addr, b->score, b->l.type, b->ref,
286 			bsStr(b->l.state), bioStr(b->iostate), b->pc);
287 	}
288 }
289 
290 static void
291 cacheCheck(Cache *c)
292 {
293 	u32int size, now;
294 	int i, k, refed;
295 	static uchar zero[VtScoreSize];
296 	Block *b;
297 
298 	size = c->size;
299 	now = c->now;
300 
301 	for(i = 0; i < c->nheap; i++){
302 		if(c->heap[i]->heap != i)
303 			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
304 		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
305 			vtFatal("bad heap ordering");
306 		k = (i << 1) + 1;
307 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
308 			vtFatal("bad heap ordering");
309 		k++;
310 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
311 			vtFatal("bad heap ordering");
312 	}
313 
314 	refed = 0;
315 	for(i = 0; i < c->nblocks; i++){
316 		b = &c->blocks[i];
317 		if(b->data != &c->mem[i * size])
318 			vtFatal("mis-blocked at %d", i);
319 		if(b->ref && b->heap == BadHeap){
320 			refed++;
321 		}
322 	}
323 if(c->nheap + refed != c->nblocks){
324 fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
325 cacheDump(c);
326 }
327 	assert(c->nheap + refed == c->nblocks);
328 	refed = 0;
329 	for(i = 0; i < c->nblocks; i++){
330 		b = &c->blocks[i];
331 		if(b->ref){
332 if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
333 			refed++;
334 		}
335 	}
336 if(refed > 0)fprint(2, "cacheCheck: in use %d\n", refed);
337 }
338 
339 
340 /*
341  * locate the block with the oldest second to last use.
342  * remove it from the heap, and fix up the heap.
343  */
344 /* called with c->lk held */
345 static Block *
346 cacheBumpBlock(Cache *c)
347 {
348 	Block *b;
349 
350 	/*
351 	 * locate the block with the oldest second to last use.
352 	 * remove it from the heap, and fix up the heap.
353 	 */
354 	if(c->nheap == 0){
355 		while(c->nheap == 0){
356 			fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
357 			vtWakeup(c->flush);
358 			vtSleep(c->heapwait);
359 		}
360 		fprint(2, "cache is okay again\n");
361 	}
362 
363 	b = c->heap[0];
364 	heapDel(b);
365 
366 	assert(b->heap == BadHeap);
367 	assert(b->ref == 0);
368 	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
369 	assert(b->prior == nil);
370 	assert(b->uhead == nil);
371 
372 	/*
373 	 * unchain the block from hash chain
374 	 */
375 	if(b->prev){
376 		*(b->prev) = b->next;
377 		if(b->next)
378 			b->next->prev = b->prev;
379 		b->prev = nil;
380 	}
381 
382 
383 if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
384 	/* set block to a reasonable state */
385 	b->ref = 1;
386 	b->part = PartError;
387 	memset(&b->l, 0, sizeof(b->l));
388 	b->iostate = BioEmpty;
389 
390 	return b;
391 }
392 
393 /*
394  * look for a particular version of the block in the memory cache.
395  */
396 static Block *
397 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
398 	int waitlock, int *lockfailure)
399 {
400 	Block *b;
401 	ulong h;
402 
403 	h = addr % c->hashSize;
404 
405 	if(lockfailure)
406 		*lockfailure = 0;
407 
408 	/*
409 	 * look for the block in the cache
410 	 */
411 	vtLock(c->lk);
412 	for(b = c->heads[h]; b != nil; b = b->next){
413 		if(b->part == part && b->addr == addr)
414 			break;
415 	}
416 	if(b == nil || b->vers != vers){
417 		vtUnlock(c->lk);
418 		return nil;
419 	}
420 	if(!waitlock && !vtCanLock(b->lk)){
421 		*lockfailure = 1;
422 		vtUnlock(c->lk);
423 		return nil;
424 	}
425 	heapDel(b);
426 	b->ref++;
427 	vtUnlock(c->lk);
428 
429 	bwatchLock(b);
430 	if(waitlock)
431 		vtLock(b->lk);
432 	b->nlock = 1;
433 
434 	for(;;){
435 		switch(b->iostate){
436 		default:
437 			abort();
438 		case BioEmpty:
439 		case BioLabel:
440 		case BioClean:
441 		case BioDirty:
442 			if(b->vers != vers){
443 				blockPut(b);
444 				return nil;
445 			}
446 			return b;
447 		case BioReading:
448 		case BioWriting:
449 			vtSleep(b->ioready);
450 			break;
451 		case BioVentiError:
452 			blockPut(b);
453 			vtSetError("venti i/o error block 0x%.8ux", addr);
454 			return nil;
455 		case BioReadError:
456 			blockPut(b);
457 			vtSetError("i/o error block 0x%.8ux", addr);
458 			return nil;
459 		}
460 	}
461 	/* NOT REACHED */
462 }
463 static Block*
464 cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
465 {
466 	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
467 }
468 
469 
470 /*
471  * fetch a local (on-disk) block from the memory cache.
472  * if it's not there, load it, bumping some other block.
473  */
474 Block *
475 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
476 {
477 	Block *b;
478 	ulong h;
479 
480 	assert(part != PartVenti);
481 
482 	h = addr % c->hashSize;
483 
484 	/*
485 	 * look for the block in the cache
486 	 */
487 	vtLock(c->lk);
488 	for(b = c->heads[h]; b != nil; b = b->next){
489 		if(b->part != part || b->addr != addr)
490 			continue;
491 		if(epoch && b->l.epoch != epoch){
492 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
493 			vtUnlock(c->lk);
494 			vtSetError(ELabelMismatch);
495 			return nil;
496 		}
497 		heapDel(b);
498 		b->ref++;
499 		break;
500 	}
501 
502 	if(b == nil){
503 		b = cacheBumpBlock(c);
504 
505 		b->part = part;
506 		b->addr = addr;
507 		localToGlobal(addr, b->score);
508 
509 		/* chain onto correct hash */
510 		b->next = c->heads[h];
511 		c->heads[h] = b;
512 		if(b->next != nil)
513 			b->next->prev = &b->next;
514 		b->prev = &c->heads[h];
515 	}
516 
517 	vtUnlock(c->lk);
518 
519 	/*
520 	 * BUG: what if the epoch changes right here?
521 	 * In the worst case, we could end up in some weird
522 	 * lock loop, because the block we want no longer exists,
523 	 * and instead we're trying to lock a block we have no
524 	 * business grabbing.
525 	 *
526 	 * For now, I'm not going to worry about it.
527 	 */
528 
529 if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
530 	bwatchLock(b);
531 	vtLock(b->lk);
532 	b->nlock = 1;
533 
534 	if(part == PartData && b->iostate == BioEmpty){
535 		if(!readLabel(c, &b->l, addr)){
536 			blockPut(b);
537 			return nil;
538 		}
539 		blockSetIOState(b, BioLabel);
540 	}
541 	if(epoch && b->l.epoch != epoch){
542 		blockPut(b);
543 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
544 		vtSetError(ELabelMismatch);
545 		return nil;
546 	}
547 
548 	b->pc = getcallerpc(&c);
549 	for(;;){
550 		switch(b->iostate){
551 		default:
552 			abort();
553 		case BioEmpty:
554 		case BioLabel:
555 			if(mode == OOverWrite){
556 				blockSetIOState(b, BioClean);
557 				return b;
558 			}
559 			diskRead(c->disk, b);
560 			vtSleep(b->ioready);
561 			break;
562 		case BioClean:
563 		case BioDirty:
564 			return b;
565 		case BioReading:
566 		case BioWriting:
567 			vtSleep(b->ioready);
568 			break;
569 		case BioReadError:
570 			blockSetIOState(b, BioEmpty);
571 			blockPut(b);
572 			vtSetError("i/o error block 0x%.8ux", addr);
573 			return nil;
574 		}
575 	}
576 	/* NOT REACHED */
577 }
578 
579 Block *
580 cacheLocal(Cache *c, int part, u32int addr, int mode)
581 {
582 	return _cacheLocal(c, part, addr, mode, 0);
583 }
584 
585 /*
586  * fetch a local (on-disk) block from the memory cache.
587  * if it's not there, load it, bumping some other block.
588  * check tag and type.
589  */
590 Block *
591 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
592 {
593 	Block *b;
594 
595 	b = _cacheLocal(c, PartData, addr, mode, epoch);
596 	if(b == nil)
597 		return nil;
598 	if(b->l.type != type || b->l.tag != tag){
599 		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
600 			addr, b->l.type, type, b->l.tag, tag);
601 		vtSetError(ELabelMismatch);
602 		blockPut(b);
603 		return nil;
604 	}
605 	b->pc = getcallerpc(&c);
606 	return b;
607 }
608 
609 /*
610  * fetch a global (Venti) block from the memory cache.
611  * if it's not there, load it, bumping some other block.
612  * check tag and type if it's really a local block in disguise.
613  */
614 Block *
615 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
616 {
617 	int n;
618 	Block *b;
619 	ulong h;
620 	u32int addr;
621 
622 	addr = globalToLocal(score);
623 	if(addr != NilBlock){
624 		b = cacheLocalData(c, addr, type, tag, mode, 0);
625 		if(b)
626 			b->pc = getcallerpc(&c);
627 		return b;
628 	}
629 
630 	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
631 
632 	/*
633 	 * look for the block in the cache
634 	 */
635 	vtLock(c->lk);
636 	for(b = c->heads[h]; b != nil; b = b->next){
637 		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
638 			continue;
639 		heapDel(b);
640 		b->ref++;
641 		break;
642 	}
643 
644 	if(b == nil){
645 if(0)fprint(2, "cacheGlobal %V %d\n", score, type);
646 
647 		b = cacheBumpBlock(c);
648 
649 		b->part = PartVenti;
650 		b->addr = NilBlock;
651 		b->l.type = type;
652 		memmove(b->score, score, VtScoreSize);
653 
654 		/* chain onto correct hash */
655 		b->next = c->heads[h];
656 		c->heads[h] = b;
657 		if(b->next != nil)
658 			b->next->prev = &b->next;
659 		b->prev = &c->heads[h];
660 	}
661 	vtUnlock(c->lk);
662 
663 	bwatchLock(b);
664 	vtLock(b->lk);
665 	b->nlock = 1;
666 	b->pc = getcallerpc(&c);
667 
668 	switch(b->iostate){
669 	default:
670 		abort();
671 	case BioEmpty:
672 		n = vtRead(c->z, score, vtType[type], b->data, c->size);
673 		if(n < 0 || !vtSha1Check(score, b->data, n)){
674 			blockSetIOState(b, BioVentiError);
675 			blockPut(b);
676 			vtSetError("venti i/o error block %V: %r", score);
677 			return nil;
678 		}
679 		vtZeroExtend(vtType[type], b->data, n, c->size);
680 		blockSetIOState(b, BioClean);
681 		return b;
682 	case BioClean:
683 		return b;
684 	case BioVentiError:
685 		blockPut(b);
686 		vtSetError("venti i/o error block %V", score);
687 		return nil;
688 	case BioReadError:
689 		blockPut(b);
690 		vtSetError("i/o error block %V", b->score);
691 		return nil;
692 	}
693 	/* NOT REACHED */
694 }
695 
696 /*
697  * allocate a new on-disk block and load it into the memory cache.
698  * BUG: if the disk is full, should we flush some of it to Venti?
699  */
700 static u32int lastAlloc;
701 
702 Block *
703 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
704 {
705 	FreeList *fl;
706 	u32int addr;
707 	Block *b;
708 	int n, nwrap;
709 	Label lab;
710 
711 	n = c->size / LabelSize;
712 	fl = c->fl;
713 
714 	vtLock(fl->lk);
715 	addr = fl->last;
716 	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
717 	if(b == nil){
718 		fprint(2, "cacheAllocBlock: xxx %R\n");
719 		vtUnlock(fl->lk);
720 		return nil;
721 	}
722 	nwrap = 0;
723 	for(;;){
724 		if(++addr >= fl->end){
725 			addr = 0;
726 			if(++nwrap >= 2){
727 				blockPut(b);
728 				fl->last = 0;
729 				vtSetError("disk is full");
730 				fprint(2, "cacheAllocBlock: xxx1 %R\n");
731 				vtUnlock(fl->lk);
732 				return nil;
733 			}
734 		}
735 		if(addr%n == 0){
736 			blockPut(b);
737 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
738 			if(b == nil){
739 				fl->last = addr;
740 				fprint(2, "cacheAllocBlock: xxx2 %R\n");
741 				vtUnlock(fl->lk);
742 				return nil;
743 			}
744 		}
745 		if(!labelUnpack(&lab, b->data, addr%n))
746 			continue;
747 		if(lab.state == BsFree)
748 			goto Found;
749 		if(lab.state&BsClosed)
750 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
751 			goto Found;
752 	}
753 Found:
754 	blockPut(b);
755 	b = cacheLocal(c, PartData, addr, OOverWrite);
756 	if(b == nil){
757 		fprint(2, "cacheAllocBlock: xxx3 %R\n");
758 		return nil;
759 	}
760 assert(b->iostate == BioLabel || b->iostate == BioClean);
761 	fl->last = addr;
762 	lab.type = type;
763 	lab.tag = tag;
764 	lab.state = BsAlloc;
765 	lab.epoch = epoch;
766 	lab.epochClose = ~(u32int)0;
767 	if(!blockSetLabel(b, &lab, 1)){
768 		fprint(2, "cacheAllocBlock: xxx4 %R\n");
769 		blockPut(b);
770 		return nil;
771 	}
772 	vtZeroExtend(vtType[type], b->data, 0, c->size);
773 if(0)diskWrite(c->disk, b);
774 
775 if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
776 	lastAlloc = addr;
777 	fl->nused++;
778 	vtUnlock(fl->lk);
779 	b->pc = getcallerpc(&c);
780 	return b;
781 }
782 
783 void
784 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
785 {
786 	int n;
787 	u32int addr, nused;
788 	Block *b;
789 	Label lab;
790 	FreeList *fl;
791 
792 	fl = c->fl;
793 	n = c->size / LabelSize;
794 	*bsize = c->size;
795 	vtLock(fl->lk);
796 	if(fl->epochLow == epochLow){
797 		*used = fl->nused;
798 		*total = fl->end;
799 		vtUnlock(fl->lk);
800 		return;
801 	}
802 	b = nil;
803 	nused = 0;
804 	for(addr=0; addr<fl->end; addr++){
805 		if(addr%n == 0){
806 			blockPut(b);
807 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
808 			if(b == nil){
809 				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
810 				break;
811 			}
812 		}
813 		if(!labelUnpack(&lab, b->data, addr%n))
814 			continue;
815 		if(lab.state == BsFree)
816 			continue;
817 		if(lab.state&BsClosed)
818 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
819 			continue;
820 		nused++;
821 	}
822 	blockPut(b);
823 	if(addr == fl->end){
824 		fl->nused = nused;
825 		fl->epochLow = epochLow;
826 	}
827 	*used = nused;
828 	*total = fl->end;
829 	vtUnlock(fl->lk);
830 	return;
831 }
832 
833 static FreeList *
834 flAlloc(u32int end)
835 {
836 	FreeList *fl;
837 
838 	fl = vtMemAllocZ(sizeof(*fl));
839 	fl->lk = vtLockAlloc();
840 	fl->last = 0;
841 	fl->end = end;
842 	return fl;
843 }
844 
845 static void
846 flFree(FreeList *fl)
847 {
848 	vtLockFree(fl->lk);
849 	vtMemFree(fl);
850 }
851 
852 u32int
853 cacheLocalSize(Cache *c, int part)
854 {
855 	return diskSize(c->disk, part);
856 }
857 
858 /*
859  * The thread that has locked b may refer to it by
860  * multiple names.  Nlock counts the number of
861  * references the locking thread holds.  It will call
862  * blockPut once per reference.
863  */
864 void
865 blockDupLock(Block *b)
866 {
867 	assert(b->nlock > 0);
868 	b->nlock++;
869 }
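/*
 * Sketch of the intended pairing (hypothetical caller): the thread
 * holding the block lock hands out a second name for the block, and
 * each name is released with its own blockPut.
 *
 *	b = cacheLocal(c, PartData, addr, OReadWrite);
 *	blockDupLock(b);
 *	f->block = b;		// second reference, put later by the owner of f
 *	...
 *	blockPut(b);		// our own reference
 */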
870 
871 /*
872  * we're done with the block.
873  * unlock it.  can't use it after calling this.
874  */
875 void
876 blockPut(Block* b)
877 {
878 	Cache *c;
879 
880 	if(b == nil)
881 		return;
882 
883 if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));
884 
885 	if(b->iostate == BioDirty)
886 		bwatchDependency(b);
887 
888 	if(--b->nlock > 0)
889 		return;
890 
891 	/*
892 	 * b->nlock should probably stay at zero while
893 	 * the block is unlocked, but diskThread and vtSleep
894 	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
895 	 * so we have to keep b->nlock set to 1 even
896 	 * when the block is unlocked.
897 	 */
898 	assert(b->nlock == 0);
899 	b->nlock = 1;
900 //	b->pc = 0;
901 
902 	bwatchUnlock(b);
903 	vtUnlock(b->lk);
904 	c = b->c;
905 	vtLock(c->lk);
906 
907 	if(--b->ref > 0){
908 		vtUnlock(c->lk);
909 		return;
910 	}
911 
912 	assert(b->ref == 0);
913 	switch(b->iostate){
914 	default:
915 		b->used = c->now++;
916 		heapIns(b);
917 		break;
918 	case BioEmpty:
919 	case BioLabel:
920 		if(c->nheap == 0)
921 			b->used = c->now++;
922 		else
923 			b->used = c->heap[0]->used;
924 		heapIns(b);
925 		break;
926 	case BioDirty:
927 		break;
928 	}
929 	vtUnlock(c->lk);
930 }
931 
932 /*
933  * set the label associated with a block.
934  */
935 Block*
936 _blockSetLabel(Block *b, Label *l)
937 {
938 	int lpb;
939 	Block *bb;
940 	u32int a;
941 	Cache *c;
942 
943 	c = b->c;
944 
945 	assert(b->part == PartData);
946 	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
947 	lpb = c->size / LabelSize;
948 	a = b->addr / lpb;
949 	bb = cacheLocal(c, PartLabel, a, OReadWrite);
950 	if(bb == nil){
951 		blockPut(b);
952 		return nil;
953 	}
954 	b->l = *l;
955 	labelPack(l, bb->data, b->addr%lpb);
956 	blockDirty(bb);
957 	return bb;
958 }
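/*
 * Worked example of the label addressing above: with lpb labels per
 * label block (lpb = c->size/LabelSize), the label for data block addr
 * lives in block addr/lpb of PartLabel, at slot addr%lpb, i.e. byte
 * offset (addr%lpb)*LabelSize.  readLabel and cacheAllocBlock use the
 * same arithmetic.
 */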
959 
960 int
961 blockSetLabel(Block *b, Label *l, int allocating)
962 {
963 	Block *lb;
964 	Label oldl;
965 
966 	oldl = b->l;
967 	lb = _blockSetLabel(b, l);
968 	if(lb == nil)
969 		return 0;
970 
971 	/*
972 	 * If we're allocating the block, make sure the label (bl)
973 	 * goes to disk before the data block (b) itself.  This is to help
974 	 * the blocks that in turn depend on b.
975 	 *
976 	 * Suppose bx depends on (must be written out after) b.
977 	 * Once we write b we'll think it's safe to write bx.
978 	 * Bx can't get at b unless it has a valid label, though.
979 	 *
980 	 * Allocation is the only case in which having a current label
981 	 * is vital because:
982 	 *
983 	 *	- l.type is set at allocation and never changes.
984 	 *	- l.tag is set at allocation and never changes.
985 	 *	- l.state is not checked when we load blocks.
986 	 *	- the archiver cares deeply about l.state being
987 	 *		BaActive vs. BaCopied, but that's handled
988 	 *		by direct calls to _blockSetLabel.
989 	 */
990 
991 	if(allocating)
992 		blockDependency(b, lb, -1, nil, nil);
993 	blockPut(lb);
994 	return 1;
995 }
996 
997 /*
998  * Record that bb must be written out before b.
999  * If index is given, we're about to overwrite the score/entry
1000  * at that index in the block.  Save the old value so we
1001  * can write a safer ``old'' version of the block if pressed.
1002  */
1003 void
1004 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1005 {
1006 	BList *p;
1007 
1008 	if(bb->iostate == BioClean)
1009 		return;
1010 
1011 	/*
1012 	 * Dependencies for blocks containing Entry structures
1013 	 * or scores must always be explained.  The problem with
1014 	 * only explaining some of them is this.  Suppose we have two
1015 	 * dependencies for the same field, the first explained
1016 	 * and the second not.  We try to write the block when the first
1017 	 * dependency is not written but the second is.  We will roll back
1018 	 * the first change even though the second trumps it.
1019 	 */
1020 	if(index == -1 && bb->part == PartData)
1021 		assert(b->l.type == BtData);
1022 
1023 	if(bb->iostate != BioDirty){
1024 		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
1025 			bb->part, bb->addr, bb->l.type, bb->iostate);
1026 		abort();
1027 	}
1028 
1029 	p = blistAlloc(bb);
1030 	if(p == nil)
1031 		return;
1032 
1033 	assert(bb->iostate == BioDirty);
1034 if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1035 
1036 	p->part = bb->part;
1037 	p->addr = bb->addr;
1038 	p->type = bb->l.type;
1039 	p->vers = bb->vers;
1040 	p->index = index;
1041 	if(p->index >= 0){
1042 		/*
1043 		 * This test would just be b->l.type==BtDir except
1044 		 * we need to exclude the super block.
1045 		 */
1046 		if(b->l.type == BtDir && b->part == PartData)
1047 			entryPack(e, p->old.entry, 0);
1048 		else
1049 			memmove(p->old.score, score, VtScoreSize);
1050 	}
1051 	p->next = b->prior;
1052 	b->prior = p;
1053 }
1054 
1055 /*
1056  * Mark an in-memory block as dirty.  If there are too many
1057  * dirty blocks, start writing some out to disk.
1058  *
1059  * If there were way too many dirty blocks, we used to
1060  * try to do some flushing ourselves, but it's just too dangerous --
1061  * it requires that the callers not hold any of our priors locked,
1062  * but this is hard to avoid in some cases.
1063  */
1064 int
1065 blockDirty(Block *b)
1066 {
1067 	Cache *c;
1068 
1069 	c = b->c;
1070 
1071 	assert(b->part != PartVenti);
1072 
1073 	if(b->iostate == BioDirty)
1074 		return 1;
1075 	assert(b->iostate == BioClean);
1076 
1077 	vtLock(c->lk);
1078 	b->iostate = BioDirty;
1079 	c->ndirty++;
1080 	if(c->ndirty > (c->maxdirty>>1))
1081 		vtWakeup(c->flush);
1082 	vtUnlock(c->lk);
1083 
1084 	return 1;
1085 }
1086 
1087 /*
1088  * We've decided to write out b.  Maybe b has some pointers to blocks
1089  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
1090  * copy of b that is safe to write out.  (diskThread will make sure the block
1091  * remains marked as dirty.)
1092  */
1093 uchar *
1094 blockRollback(Block *b, uchar *buf)
1095 {
1096 	u32int addr;
1097 	BList *p;
1098 	Super super;
1099 
1100 	/* easy case */
1101 	if(b->prior == nil)
1102 		return b->data;
1103 
1104 	memmove(buf, b->data, b->c->size);
1105 	for(p=b->prior; p; p=p->next){
1106 		/*
1107 		 * we know p->index >= 0 because blockWrite has vetted this block for us.
1108 		 */
1109 		assert(p->index >= 0);
1110 		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1111 		if(b->part == PartSuper){
1112 			assert(p->index == 0);
1113 			superUnpack(&super, buf);
1114 			addr = globalToLocal(p->old.score);
1115 			if(addr == NilBlock){
1116 				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
1117 				abort();
1118 			}
1119 			super.active = addr;
1120 			superPack(&super, buf);
1121 			continue;
1122 		}
1123 		if(b->l.type == BtDir)
1124 			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1125 		else
1126 			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1127 	}
1128 	return buf;
1129 }
1130 
1131 /*
1132  * Try to write block b.
1133  * If b depends on other blocks:
1134  *
1135  *	If the block has been written out, remove the dependency.
1136  *	If the dependency is replaced by a more recent dependency,
1137  *		throw it out.
1138  *	If we know how to write out an old version of b that doesn't
1139  *		depend on it, do that.
1140  *
1141  *	Otherwise, bail.
1142  */
1143 int
1144 blockWrite(Block *b)
1145 {
1146 	uchar *dmap;
1147 	Cache *c;
1148 	BList *p, **pp;
1149 	Block *bb;
1150 	int lockfail;
1151 
1152 	c = b->c;
1153 
1154 	if(b->iostate != BioDirty)
1155 		return 1;
1156 
1157 	dmap = b->dmap;
1158 	memset(dmap, 0, c->ndmap);
1159 	pp = &b->prior;
1160 	for(p=*pp; p; p=*pp){
1161 		if(p->index >= 0){
1162 			/* more recent dependency has succeeded; this one can go */
1163 			if(dmap[p->index/8] & (1<<(p->index%8)))
1164 				goto ignblock;
1165 		}
1166 
1167 		lockfail = 0;
1168 		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1169 		if(bb == nil){
1170 			if(lockfail)
1171 				return 0;
1172 			/* block not in cache => was written already */
1173 			if(p->index >= 0) dmap[p->index/8] |= 1<<(p->index%8);
1174 			goto ignblock;
1175 		}
1176 
1177 		/*
1178 		 * same version of block is still in cache.
1179 		 *
1180 		 * the assertion is true because the block still has version p->vers,
1181 		 * which means it hasn't been written out since we last saw it.
1182 		 */
1183 		if(bb->iostate != BioDirty){
1184 			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
1185 				bb->part, bb->addr, bb->l.type, bb->iostate);
1186 			/* probably BioWriting if it happens? */
1187 			if(bb->iostate == BioClean)
1188 				goto ignblock;
1189 		}
1190 
1191 		blockPut(bb);
1192 
1193 		if(p->index < 0){
1194 			/*
1195 			 * We don't know how to temporarily undo
1196 			 * b's dependency on bb, so just don't write b yet.
1197 			 */
1198 			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1199 				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1200 			return 0;
1201 		}
1202 		/* keep walking down the list */
1203 		pp = &p->next;
1204 		continue;
1205 
1206 ignblock:
1207 		*pp = p->next;
1208 		blistFree(c, p);
1209 		continue;
1210 	}
1211 
1212 	/*
1213 	 * DiskWrite must never be called with a double-locked block.
1214 	 * This call to diskWrite is okay because blockWrite is only called
1215 	 * from the cache flush thread, which never double-locks a block.
1216 	 */
1217 	diskWrite(c->disk, b);
1218 	return 1;
1219 }
1220 
1221 /*
1222  * Change the I/O state of block b.
1223  * Just an assignment except for magic in
1224  * switch statement (read comments there).
1225  */
1226 void
1227 blockSetIOState(Block *b, int iostate)
1228 {
1229 	int dowakeup;
1230 	Cache *c;
1231 	BList *p, *q;
1232 
1233 if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1234 
1235 	c = b->c;
1236 
1237 	dowakeup = 0;
1238 	switch(iostate){
1239 	default:
1240 		abort();
1241 	case BioEmpty:
1242 		assert(!b->uhead);
1243 		break;
1244 	case BioLabel:
1245 		assert(!b->uhead);
1246 		break;
1247 	case BioClean:
1248 		bwatchDependency(b);
1249 		/*
1250 		 * If b->prior is set, it means a write just finished.
1251 		 * The prior list isn't needed anymore.
1252 		 */
1253 		for(p=b->prior; p; p=q){
1254 			q = p->next;
1255 			blistFree(c, p);
1256 		}
1257 		b->prior = nil;
1258 		/*
1259 		 * Freeing a block or just finished a write.
1260 		 * Move the blocks from the per-block unlink
1261 		 * queue to the cache unlink queue.
1262 		 */
1263 		if(b->iostate == BioDirty || b->iostate == BioWriting){
1264 			vtLock(c->lk);
1265 			c->ndirty--;
1266 			b->iostate = iostate;	/* change here to keep in sync with ndirty */
1267 			b->vers = c->vers++;
1268 			if(b->uhead){
1269 				/* add unlink blocks to unlink queue */
1270 				if(c->uhead == nil){
1271 					c->uhead = b->uhead;
1272 					vtWakeup(c->unlink);
1273 				}else
1274 					c->utail->next = b->uhead;
1275 				c->utail = b->utail;
1276 				b->uhead = nil;
1277 			}
1278 			vtUnlock(c->lk);
1279 		}
1280 		assert(!b->uhead);
1281 		dowakeup = 1;
1282 		break;
1283 	case BioDirty:
1284 		/*
1285 		 * Wrote out an old version of the block (see blockRollback).
1286 		 * Bump a version count, leave it dirty.
1287 		 */
1288 		if(b->iostate == BioWriting){
1289 			vtLock(c->lk);
1290 			b->vers = c->vers++;
1291 			vtUnlock(c->lk);
1292 			dowakeup = 1;
1293 		}
1294 		break;
1295 	case BioReading:
1296 	case BioWriting:
1297 		/*
1298 		 * Adding block to disk queue.  Bump reference count.
1299 		 * diskThread decs the count later by calling blockPut.
1300 		 * This is here because we need to lock c->lk to
1301 		 * manipulate the ref count.
1302 		 */
1303 		vtLock(c->lk);
1304 		b->ref++;
1305 		vtUnlock(c->lk);
1306 		break;
1307 	case BioReadError:
1308 	case BioVentiError:
1309 		/*
1310 		 * Oops.
1311 		 */
1312 		dowakeup = 1;
1313 		break;
1314 	}
1315 	b->iostate = iostate;
1316 	/*
1317 	 * Now that the state has changed, we can wake the waiters.
1318 	 */
1319 	if(dowakeup)
1320 		vtWakeupAll(b->ioready);
1321 }
1322 
1323 /*
1324  * The active file system is a tree of blocks.
1325  * When we add snapshots to the mix, the entire file system
1326  * becomes a dag and thus requires a bit more care.
1327  *
1328  * The life of the file system is divided into epochs.  A snapshot
1329  * ends one epoch and begins the next.  Each file system block
1330  * is marked with the epoch in which it was created (b.epoch).
1331  * When the block is unlinked from the file system (closed), it is marked
1332  * with the epoch in which it was removed (b.epochClose).
1333  * Once we have discarded or archived all snapshots up to
1334  * b.epochClose, we can reclaim the block.
1335  *
1336  * If a block was created in a past epoch but is not yet closed,
1337  * it is treated as copy-on-write.  Of course, in order to insert the
1338  * new pointer into the tree, the parent must be made writable,
1339  * and so on up the tree.  The recursion stops because the root
1340  * block is always writable.
1341  *
1342  * If blocks are never closed, they will never be reused, and
1343  * we will run out of disk space.  But marking a block as closed
1344  * requires some care about dependencies and write orderings.
1345  *
1346  * (1) If a block p points at a copy-on-write block b and we
1347  * copy b to create bb, then p must be written out after bb and
1348  * lbb (bb's label block).
1349  *
1350  * (2) We have to mark b as closed, but only after we switch
1351  * the pointer, so lb must be written out after p.  In fact, we
1352  * can't even update the in-memory copy, or the cache might
1353  * mistakenly give out b for reuse before p gets written.
1354  *
1355  * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
1356  * The caller is expected to record a "p after bb" dependency
1357  * to finish (1), and also expected to call blockRemoveLink
1358  * to arrange for (2) to happen once p is written.
1359  *
1360  * Until (2) happens, some pieces of the code (e.g., the archiver)
1361  * still need to know whether a block has been copied, so we
1362  * set the BsCopied bit in the label and force that to disk *before*
1363  * the copy gets written out.
1364  */
1365 Block*
1366 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
1367 {
1368 	Block *bb, *lb;
1369 	Label l;
1370 
1371 	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
1372 		fprint(2, "blockCopy %#ux %L but fs is [%ud,%ud]\n",
1373 			b->addr, &b->l, elo, ehi);
1374 
1375 	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
1376 	if(bb == nil){
1377 		blockPut(b);
1378 		return nil;
1379 	}
1380 
1381 	/*
1382 	 * Update label so we know the block has been copied.
1383 	 * (It will be marked closed once it has been unlinked from
1384 	 * the tree.)  This must follow cacheAllocBlock since we
1385 	 * can't be holding onto lb when we call cacheAllocBlock.
1386 	 */
1387 	if((b->l.state&BsCopied)==0)
1388 	if(b->part == PartData){	/* not the superblock */
1389 		l = b->l;
1390 		l.state |= BsCopied;
1391 		lb = _blockSetLabel(b, &l);
1392 		if(lb == nil){
1393 			/* can't set label => can't copy block */
1394 			blockPut(b);
1395 			l.type = BtMax;
1396 			l.state = BsFree;
1397 			l.epoch = 0;
1398 			l.epochClose = 0;
1399 			l.tag = 0;
1400 			blockSetLabel(bb, &l, 0);
1401 			blockPut(bb);
1402 			return nil;
1403 		}
1404 		blockDependency(bb, lb, -1, nil, nil);
1405 		blockPut(lb);
1406 	}
1407 
1408 	memmove(bb->data, b->data, b->c->size);
1409 	blockDirty(bb);
1410 	blockPut(b);
1411 	return bb;
1412 }
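/*
 * Hypothetical caller sketch for rules (1) and (2) above, assuming a
 * writable parent block p that points to the local block b at score
 * slot i (the real code of this shape lives in the file/source layer):
 *
 *	oldaddr = b->addr;
 *	oldtype = b->l.type;
 *	oldtag = b->l.tag;
 *	bb = blockCopy(b, tag, ehi, elo);	// puts b even on failure
 *	if(bb == nil)
 *		...fail...
 *	blockDependency(p, bb, i, p->data + i*VtScoreSize, nil);	// (1) p after bb
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 *	blockRemoveLink(p, oldaddr, oldtype, oldtag, 0);		// (2) close b once p hits disk
 */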
1413 
1414 /*
1415  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1416  * If recurse is set, we are unlinking all of bb's children as well.
1417  *
1418  * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
1419  * the relevant information to b's list of unlinked blocks.  Once b is written,
1420  * the list will be queued for processing.
1421  *
1422  * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
1423  */
1424 void
1425 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
1426 {
1427 	BList *p, **pp, bl;
1428 
1429 	/* remove bb from prior list */
1430 	for(pp=&b->prior; (p=*pp)!=nil; ){
1431 		if(p->part == PartData && p->addr == addr){
1432 			*pp = p->next;
1433 			blistFree(b->c, p);
1434 		}else
1435 			pp = &p->next;
1436 	}
1437 
1438 	bl.part = PartData;
1439 	bl.addr = addr;
1440 	bl.type = type;
1441 	bl.tag = tag;
1442 	if(b->l.epoch == 0)
1443 		assert(b->part == PartSuper);
1444 	bl.epoch = b->l.epoch;
1445 	bl.next = nil;
1446 	bl.recurse = recurse;
1447 
1448 	p = blistAlloc(b);
1449 	if(p == nil){
1450 		/*
1451 		 * We were out of blists so blistAlloc wrote b to disk.
1452 		 */
1453 		doRemoveLink(b->c, &bl);
1454 		return;
1455 	}
1456 
1457 	/* Uhead is only processed when the block goes from Dirty -> Clean */
1458 	assert(b->iostate == BioDirty);
1459 
1460 	*p = bl;
1461 	if(b->uhead == nil)
1462 		b->uhead = p;
1463 	else
1464 		b->utail->next = p;
1465 	b->utail = p;
1466 }
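/*
 * The deferred unlink flows from here to doRemoveLink as follows:
 * the BList sits on b->uhead until b reaches BioClean (see
 * blockSetIOState), is then moved to c->uhead, and unlinkThread
 * finally calls doRemoveLink to mark the target block closed.
 */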
1467 
1468 /*
1469  * Process removal of a single block and perhaps its children.
1470  */
1471 static void
1472 doRemoveLink(Cache *c, BList *p)
1473 {
1474 	int i, n;
1475 	u32int a;
1476 	Block *b;
1477 	Label l;
1478 	BList bl;
1479 
1480 	b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
1481 	if(b == nil)
1482 		return;
1483 
1484 	/*
1485 	 * When we're unlinking from the superblock, close with the next epoch.
1486 	 */
1487 	if(p->epoch == 0)
1488 		p->epoch = b->l.epoch+1;
1489 
1490 	/* sanity check */
1491 	if(b->l.epoch > p->epoch){
1492 		fprint(2, "doRemoveLink: strange epoch %ud > %ud\n", b->l.epoch, p->epoch);
1493 		blockPut(b);
1494 		return;
1495 	}
1496 
1497 	if(p->recurse && p->type != BtData && p->type != BtDir){
1498 		n = c->size / VtScoreSize;
1499 		for(i=0; i<n; i++){
1500 			a = globalToLocal(b->data + i*VtScoreSize);
1501 			if(a == NilBlock || !readLabel(c, &l, a))
1502 				continue;
1503 			if(l.state&BsClosed)
1504 				continue;
1505 			/*
1506 			 * If stack space becomes an issue...
1507 			p->addr = a;
1508 			p->type = l.type;
1509 			p->tag = l.tag;
1510 			doRemoveLink(c, p);
1511 			 */
1512 
1513 			bl.part = PartData;
1514 			bl.addr = a;
1515 			bl.type = l.type;
1516 			bl.tag = l.tag;
1517 			bl.epoch = p->epoch;
1518 			bl.next = nil;
1519 			bl.recurse = 1;
1520 			doRemoveLink(c, &bl);
1521 		}
1522 	}
1523 
1524 	l = b->l;
1525 	l.state |= BsClosed;
1526 	l.epochClose = p->epoch;
1527 	blockSetLabel(b, &l, 0);
1528 	blockPut(b);
1529 }
1530 
1531 /*
1532  * Allocate a BList so that we can record a dependency
1533  * or queue a removal related to block b.
1534  * If we can't find a BList, we write out b and return nil.
1535  */
1536 static BList *
1537 blistAlloc(Block *b)
1538 {
1539 	Cache *c;
1540 	BList *p;
1541 
1542 	if(b->iostate != BioDirty){
1543 		/*
1544 		 * should not happen anymore -
1545 	 	 * blockDirty used to flush but no longer does.
1546 		 */
1547 		assert(b->iostate == BioClean);
1548 		fprint(2, "blistAlloc: called on clean block\n");
1549 		return nil;
1550 	}
1551 
1552 	c = b->c;
1553 	vtLock(c->lk);
1554 	if(c->blfree == nil){
1555 		/*
1556 		 * No free BLists.  What are our options?
1557 		 */
1558 
1559 		/* Block has no priors? Just write it. */
1560 		if(b->prior == nil){
1561 			vtUnlock(c->lk);
1562 			diskWriteAndWait(c->disk, b);
1563 			return nil;
1564 		}
1565 
1566 		/*
1567 		 * Wake the flush thread, which will hopefully free up
1568 		 * some BLists for us.  We used to flush a block from
1569 		 * our own prior list and reclaim that BList, but this is
1570 		 * a no-no: some of the blocks on our prior list may
1571 		 * be locked by our caller.  Or maybe their label blocks
1572 		 * are locked by our caller.  In any event, it's too hard
1573 		 * to make sure we can do I/O for ourselves.  Instead,
1574 		 * we assume the flush thread will find something.
1575 		 * (The flush thread never blocks waiting for a block,
1576 		 * so it can't deadlock like we can.)
1577 		 */
1578 		vtLock(c->lk);
1579 		while(c->blfree == nil){
1580 			vtWakeup(c->flush);
1581 			vtSleep(c->blrend);
1582 		}
1583 	}
1584 
1585 	p = c->blfree;
1586 	c->blfree = p->next;
1587 	vtUnlock(c->lk);
1588 	return p;
1589 }
1590 
1591 static void
1592 blistFree(Cache *c, BList *bl)
1593 {
1594 	vtLock(c->lk);
1595 	bl->next = c->blfree;
1596 	c->blfree = bl;
1597 	vtWakeup(c->blrend);
1598 	vtUnlock(c->lk);
1599 }
1600 
1601 char*
1602 bsStr(int state)
1603 {
1604 	static char s[100];
1605 
1606 	if(state == BsFree)
1607 		return "Free";
1608 	if(state == BsBad)
1609 		return "Bad";
1610 
1611 	sprint(s, "%x", state);
1612 	if(!(state&BsAlloc))
1613 		strcat(s, ",Free");	/* should not happen */
1614 	if(state&BsCopied)
1615 		strcat(s, ",Copied");
1616 	if(state&BsVenti)
1617 		strcat(s, ",Venti");
1618 	if(state&BsClosed)
1619 		strcat(s, ",Closed");
1620 	return s;
1621 }
1622 
1623 char *
1624 bioStr(int iostate)
1625 {
1626 	switch(iostate){
1627 	default:
1628 		return "Unknown!!";
1629 	case BioEmpty:
1630 		return "Empty";
1631 	case BioLabel:
1632 		return "Label";
1633 	case BioClean:
1634 		return "Clean";
1635 	case BioDirty:
1636 		return "Dirty";
1637 	case BioReading:
1638 		return "Reading";
1639 	case BioWriting:
1640 		return "Writing";
1641 	case BioReadError:
1642 		return "ReadError";
1643 	case BioVentiError:
1644 		return "VentiError";
1645 	case BioMax:
1646 		return "Max";
1647 	}
1648 }
1649 
1650 static char *bttab[] = {
1651 	"BtData",
1652 	"BtData+1",
1653 	"BtData+2",
1654 	"BtData+3",
1655 	"BtData+4",
1656 	"BtData+5",
1657 	"BtData+6",
1658 	"BtData+7",
1659 	"BtDir",
1660 	"BtDir+1",
1661 	"BtDir+2",
1662 	"BtDir+3",
1663 	"BtDir+4",
1664 	"BtDir+5",
1665 	"BtDir+6",
1666 	"BtDir+7",
1667 };
1668 
1669 char*
1670 btStr(int type)
1671 {
1672 	if(type < nelem(bttab))
1673 		return bttab[type];
1674 	return "unknown";
1675 }
1676 
1677 int
1678 labelFmt(Fmt *f)
1679 {
1680 	Label *l;
1681 
1682 	l = va_arg(f->args, Label*);
1683 	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1684 		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1685 }
1686 
1687 int
1688 scoreFmt(Fmt *f)
1689 {
1690 	uchar *v;
1691 	int i;
1692 	u32int addr;
1693 
1694 	v = va_arg(f->args, uchar*);
1695 	if(v == nil){
1696 		fmtprint(f, "*");
1697 	}else if((addr = globalToLocal(v)) != NilBlock)
1698 		fmtprint(f, "0x%.8ux", addr);
1699 	else{
1700 		for(i = 0; i < VtScoreSize; i++)
1701 			fmtprint(f, "%2.2ux", v[i]);
1702 	}
1703 
1704 	return 0;
1705 }
1706 
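/*
 * upHeap and downHeap maintain c->heap as a binary min-heap keyed on
 * b->used, so heap[0] is the unreferenced block with the oldest use
 * tick and is the next eviction victim (see cacheBumpBlock).  The
 * comparisons are done on (used - now) in unsigned arithmetic so the
 * ordering survives wraparound of c->now: for instance, with now = 0x10
 * just after a wrap, a block last used at 0xffffffff gives used-now =
 * 0xffffffef and still sorts before one last used at 0x5, which gives
 * used-now = 0xfffffff5.
 */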
1707 static int
1708 upHeap(int i, Block *b)
1709 {
1710 	Block *bb;
1711 	u32int now;
1712 	int p;
1713 	Cache *c;
1714 
1715 	c = b->c;
1716 	now = c->now;
1717 	for(; i != 0; i = p){
1718 		p = (i - 1) >> 1;
1719 		bb = c->heap[p];
1720 		if(b->used - now >= bb->used - now)
1721 			break;
1722 		c->heap[i] = bb;
1723 		bb->heap = i;
1724 	}
1725 	c->heap[i] = b;
1726 	b->heap = i;
1727 
1728 	return i;
1729 }
1730 
1731 static int
1732 downHeap(int i, Block *b)
1733 {
1734 	Block *bb;
1735 	u32int now;
1736 	int k;
1737 	Cache *c;
1738 
1739 	c = b->c;
1740 	now = c->now;
1741 	for(; ; i = k){
1742 		k = (i << 1) + 1;
1743 		if(k >= c->nheap)
1744 			break;
1745 		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1746 			k++;
1747 		bb = c->heap[k];
1748 		if(b->used - now <= bb->used - now)
1749 			break;
1750 		c->heap[i] = bb;
1751 		bb->heap = i;
1752 	}
1753 	c->heap[i] = b;
1754 	b->heap = i;
1755 	return i;
1756 }
1757 
1758 /*
1759  * Delete a block from the heap.
1760  * Called with c->lk held.
1761  */
1762 static void
1763 heapDel(Block *b)
1764 {
1765 	int i, si;
1766 	Cache *c;
1767 
1768 	c = b->c;
1769 
1770 	si = b->heap;
1771 	if(si == BadHeap)
1772 		return;
1773 	b->heap = BadHeap;
1774 	c->nheap--;
1775 	if(si == c->nheap)
1776 		return;
1777 	b = c->heap[c->nheap];
1778 	i = upHeap(si, b);
1779 	if(i == si)
1780 		downHeap(i, b);
1781 }
1782 
1783 /*
1784  * Insert a block into the heap.
1785  * Called with c->lk held.
1786  */
1787 static void
1788 heapIns(Block *b)
1789 {
1790 	assert(b->heap == BadHeap);
1791 	upHeap(b->c->nheap++, b);
1792 	vtWakeup(b->c->heapwait);
1793 }
1794 
1795 /*
1796  * Get just the label for a block.
1797  */
1798 int
1799 readLabel(Cache *c, Label *l, u32int addr)
1800 {
1801 	int lpb;
1802 	Block *b;
1803 	u32int a;
1804 
1805 	lpb = c->size / LabelSize;
1806 	a = addr / lpb;
1807 	b = cacheLocal(c, PartLabel, a, OReadOnly);
1808 	if(b == nil){
1809 		blockPut(b);
1810 		return 0;
1811 	}
1812 
1813 	if(!labelUnpack(l, b->data, addr%lpb)){
1814 		blockPut(b);
1815 		return 0;
1816 	}
1817 	blockPut(b);
1818 	return 1;
1819 }
1820 
1821 /*
1822  * Process unlink queue.
1823  * Called with c->lk held.
1824  */
1825 static void
1826 unlinkBody(Cache *c)
1827 {
1828 	BList *p;
1829 
1830 	while(c->uhead != nil){
1831 		p = c->uhead;
1832 		c->uhead = p->next;
1833 		vtUnlock(c->lk);
1834 		doRemoveLink(c, p);
1835 		vtLock(c->lk);
1836 		p->next = c->blfree;
1837 		c->blfree = p;
1838 	}
1839 }
1840 
1841 /*
1842  * Occasionally unlink the blocks on the cache unlink queue.
1843  */
1844 static void
1845 unlinkThread(void *a)
1846 {
1847 	Cache *c = a;
1848 
1849 	vtThreadSetName("unlink");
1850 
1851 	vtLock(c->lk);
1852 	for(;;){
1853 		while(c->uhead == nil && c->die == nil)
1854 			vtSleep(c->unlink);
1855 		if(c->die != nil)
1856 			break;
1857 		unlinkBody(c);
1858 	}
1859 	c->ref--;
1860 	vtWakeup(c->die);
1861 	vtUnlock(c->lk);
1862 }
1863 
1864 static int
1865 baddrCmp(void *a0, void *a1)
1866 {
1867 	BAddr *b0, *b1;
1868 	b0 = a0;
1869 	b1 = a1;
1870 
1871 	if(b0->part < b1->part)
1872 		return -1;
1873 	if(b0->part > b1->part)
1874 		return 1;
1875 	if(b0->addr < b1->addr)
1876 		return -1;
1877 	if(b0->addr > b1->addr)
1878 		return 1;
1879 	return 0;
1880 }
1881 
1882 /*
1883  * Scan the block list for dirty blocks; add them to the list c->baddr.
1884  */
1885 static void
1886 flushFill(Cache *c)
1887 {
1888 	int i, ndirty;
1889 	BAddr *p;
1890 	Block *b;
1891 
1892 	vtLock(c->lk);
1893 	if(c->ndirty == 0){
1894 		vtUnlock(c->lk);
1895 		return;
1896 	}
1897 
1898 	p = c->baddr;
1899 	ndirty = 0;
1900 	for(i=0; i<c->nblocks; i++){
1901 		b = c->blocks + i;
1902 		if(b->part == PartError)
1903 			continue;
1904 		if(b->iostate == BioDirty || b->iostate == BioWriting)
1905 			ndirty++;
1906 		if(b->iostate != BioDirty)
1907 			continue;
1908 		p->part = b->part;
1909 		p->addr = b->addr;
1910 		p->vers = b->vers;
1911 		p++;
1912 	}
1913 	if(ndirty != c->ndirty){
1914 		fprint(2, "ndirty mismatch expected %d found %d\n",
1915 			c->ndirty, ndirty);
1916 		c->ndirty = ndirty;
1917 	}
1918 	vtUnlock(c->lk);
1919 
1920 	c->bw = p - c->baddr;
1921 	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1922 }
1923 
1924 /*
1925  * This is not thread safe, i.e. it can't be called from multiple threads.
1926  *
1927  * That's fine for how we use it: it is only called from
1928  * flushThread, and from cacheFree, but only after cacheFree
1929  * has killed off the flushThread.
1930  */
1931 static int
1932 cacheFlushBlock(Cache *c)
1933 {
1934 	Block *b;
1935 	BAddr *p;
1936 	int lockfail, nfail;
1937 
1938 	nfail = 0;
1939 	for(;;){
1940 		if(c->br == c->be){
1941 			if(c->bw == 0 || c->bw == c->be)
1942 				flushFill(c);
1943 			c->br = 0;
1944 			c->be = c->bw;
1945 			c->bw = 0;
1946 			c->nflush = 0;
1947 		}
1948 
1949 		if(c->br == c->be)
1950 			return 0;
1951 		p = c->baddr + c->br;
1952 		c->br++;
1953 		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1954 
1955 		if(b && blockWrite(b)){
1956 			c->nflush++;
1957 			blockPut(b);
1958 			return 1;
1959 		}
1960 		if(b)
1961 			blockPut(b);
1962 
1963 		/*
1964 		 * Why didn't we write the block?
1965 		 */
1966 
1967 		/* Block already written out */
1968 		if(b == nil && !lockfail)
1969 			continue;
1970 
1971 		/* Failed to acquire lock; sleep if happens a lot. */
1972 		if(lockfail && ++nfail > 100){
1973 			sleep(500);
1974 			nfail = 0;
1975 		}
1976 		/* Requeue block. */
1977 		if(c->bw < c->be)
1978 			c->baddr[c->bw++] = *p;
1979 	}
1980 	return 0;
1981 }
1982 
1983 /*
1984  * Occasionally flush dirty blocks from memory to the disk.
1985  */
1986 static void
1987 flushThread(void *a)
1988 {
1989 	Cache *c = a;
1990 	int i;
1991 
1992 	vtThreadSetName("flush");
1993 	vtLock(c->lk);
1994 	while(c->die == nil){
1995 		vtSleep(c->flush);
1996 		vtUnlock(c->lk);
1997 		for(i=0; i<FlushSize; i++)
1998 			if(!cacheFlushBlock(c)){
1999 				/*
2000 				 * If i==0, could be someone is waking us repeatedly
2001 				 * to flush the cache but there's no work to do.
2002 				 * Pause a little.
2003 				 */
2004 				if(i==0)
2005 					sleep(250);
2006 				break;
2007 			}
2008 		if(i==0 && c->ndirty){
2009 			/*
2010 			 * All the blocks are being written right now -- there's nothing to do.
2011 			 * We might be spinning with cacheFlush though -- he'll just keep
2012 			 * kicking us until c->ndirty goes down.  Probably we should sleep
2013 			 * on something that the diskThread can kick, but for now we'll
2014 			 * just pause for a little while waiting for disks to finish.
2015 			 */
2016 			sleep(100);
2017 		}
2018 		vtLock(c->lk);
2019 		vtWakeupAll(c->flushwait);
2020 	}
2021 	c->ref--;
2022 	vtWakeup(c->die);
2023 	vtUnlock(c->lk);
2024 }
2025 
2026 /*
2027  * Keep flushing until everything is clean.
2028  */
2029 void
2030 cacheFlush(Cache *c, int wait)
2031 {
2032 	vtLock(c->lk);
2033 	if(wait){
2034 		while(c->ndirty){
2035 		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
2036 		//		c->ndirty, c->uhead);
2037 			vtWakeup(c->flush);
2038 			vtSleep(c->flushwait);
2039 		}
2040 	//	consPrint("cacheFlush: done (uhead %p)\n", c->uhead);
2041 	}else
2042 		vtWakeup(c->flush);
2043 	vtUnlock(c->lk);
2044 }
2045 
2046 /*
2047  * Kick the flushThread every 30 seconds.
2048  */
2049 static void
2050 cacheSync(void *v)
2051 {
2052 	Cache *c;
2053 
2054 	c = v;
2055 	cacheFlush(c, 0);
2056 }
2057