xref: /plan9-contrib/sys/src/cmd/fossil/cache.c (revision a6a9e07217f318acf170f99684a55fba5200524f)
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5 
6 #include "9.h"	/* for cacheFlush */
7 
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10 
11 enum {
12 	BadHeap = ~0,
13 };
14 
15 /*
16  * Store data to the memory cache in c->size blocks
17  * with the block zero extended to fill it out.  When writing to
18  * Venti, the block will be zero truncated.  The walker will also check
19  * that the block fits within psize or dsize as the case may be.
20  */
21 
22 struct Cache
23 {
24 	VtLock	*lk;
25 	int 	ref;
26 	int	mode;
27 
28 	Disk 	*disk;
29 	int	size;			/* block size */
30 	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
31 	VtSession *z;
32 	u32int	now;			/* ticks for usage timestamps */
33 	Block	**heads;		/* hash table for finding address */
34 	int	nheap;			/* number of available victims */
35 	Block	**heap;			/* heap for locating victims */
36 	long	nblocks;		/* number of blocks allocated */
37 	Block	*blocks;		/* array of block descriptors */
38 	u8int	*mem;			/* memory for all block data & blists */
39 
40 	BList	*blfree;
41 	VtRendez *blrend;
42 
43 	int 	ndirty;			/* number of dirty blocks in the cache */
44 	int 	maxdirty;		/* max number of dirty blocks */
45 	u32int	vers;
46 
47 	long hashSize;
48 
49 	FreeList *fl;
50 
51 	VtRendez *die;			/* daemon threads should die when != nil */
52 
53 	VtRendez *flush;
54 	VtRendez *flushwait;
55 	BAddr *baddr;
56 	int bw, br, be;
57 	int nflush;
58 
59 	Periodic *sync;
60 
61 	/* unlink daemon */
62 	BList *uhead;
63 	BList *utail;
64 	VtRendez *unlink;
65 };
66 
67 struct BList {
68 	int part;
69 	u32int addr;
70 	uchar type;
71 	u32int tag;
72 	u32int epoch;
73 	u32int vers;
74 
75 	/* for roll back */
76 	int index;			/* -1 indicates not valid */
77 	union {
78 		uchar score[VtScoreSize];
79 		uchar entry[VtEntrySize];
80 	} old;
81 	BList *next;
82 };
83 
84 struct BAddr {
85 	int part;
86 	u32int addr;
87 	u32int vers;
88 };
89 
90 struct FreeList {
91 	VtLock *lk;
92 	u32int last;	/* last block allocated */
93 	u32int end;	/* end of data partition */
94 };
95 
96 static FreeList *flAlloc(u32int end);
97 static void flFree(FreeList *fl);
98 
99 static Block *cacheBumpBlock(Cache *c);
100 static void heapDel(Block*);
101 static void heapIns(Block*);
102 static void cacheCheck(Cache*);
103 static int readLabel(Cache*, Label*, u32int addr);
104 static void unlinkThread(void *a);
105 static void flushThread(void *a);
106 static void flushBody(Cache *c);
107 static void unlinkBody(Cache *c);
108 static int cacheFlushBlock(Cache *c);
109 static void cacheSync(void*);
110 /*
111  * Mapping from local block type to Venti type
112  */
113 int vtType[BtMax] = {
114 	VtDataType,		/* BtData | 0  */
115 	VtPointerType0,		/* BtData | 1  */
116 	VtPointerType1,		/* BtData | 2  */
117 	VtPointerType2,		/* BtData | 3  */
118 	VtPointerType3,		/* BtData | 4  */
119 	VtPointerType4,		/* BtData | 5  */
120 	VtPointerType5,		/* BtData | 6  */
121 	VtPointerType6,		/* BtData | 7  */
122 	VtDirType,		/* BtDir | 0  */
123 	VtPointerType0,		/* BtDir | 1  */
124 	VtPointerType1,		/* BtDir | 2  */
125 	VtPointerType2,		/* BtDir | 3  */
126 	VtPointerType3,		/* BtDir | 4  */
127 	VtPointerType4,		/* BtDir | 5  */
128 	VtPointerType5,		/* BtDir | 6  */
129 	VtPointerType6,		/* BtDir | 7  */
130 };
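
/*
 * Illustrative sketch, not part of the original source: vtType is
 * indexed by the local block type stored in a label, as cacheGlobal
 * and cacheAllocBlock do below; e.g. a BtDir+2 pointer block is
 * written to Venti as VtPointerType1.
 */
static int
exampleVentiType(int localType)
{
	assert(localType >= 0 && localType < BtMax);
	return vtType[localType];
}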
131 
132 /*
133  * Allocate the memory cache.
134  */
135 Cache *
136 cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
137 {
138 	int i;
139 	Cache *c;
140 	Block *b;
141 	BList *bl;
142 	u8int *p;
143 	int nbl;
144 
145 	c = vtMemAllocZ(sizeof(Cache));
146 
147 	/* reasonable number of BList elements */
148 	nbl = nblocks * 4;
149 
150 	c->lk = vtLockAlloc();
151 	c->ref = 1;
152 	c->disk = disk;
153 	c->z = z;
154 	c->size = diskBlockSize(disk);
155 	bwatchSetBlockSize(c->size);
156 	/* round c->size up to be a nice multiple */
157 	c->size = (c->size + 127) & ~127;
158 	c->ndmap = (c->size/20 + 7) / 8;
159 	c->nblocks = nblocks;
160 	c->hashSize = nblocks;
161 	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
162 	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
163 	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
164 	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
165 	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
166 	c->mode = mode;
167 	c->vers++;
168 	p = c->mem;
169 	for(i = 0; i < nblocks; i++){
170 		b = &c->blocks[i];
171 		b->lk = vtLockAlloc();
172 		b->c = c;
173 		b->data = p;
174 		b->heap = i;
175 		b->ioready = vtRendezAlloc(b->lk);
176 		c->heap[i] = b;
177 		p += c->size;
178 	}
179 	c->nheap = nblocks;
180 	for(i = 0; i < nbl; i++){
181 		bl = (BList*)p;
182 		bl->next = c->blfree;
183 		c->blfree = bl;
184 		p += sizeof(BList);
185 	}
186 	/* separate loop to keep blocks and blists reasonably aligned */
187 	for(i = 0; i < nblocks; i++){
188 		b = &c->blocks[i];
189 		b->dmap = p;
190 		p += c->ndmap;
191 	}
192 
193 	c->blrend = vtRendezAlloc(c->lk);
194 
195 	c->maxdirty = nblocks*(DirtyPercentage*0.01);
196 
197 	c->fl = flAlloc(diskSize(disk, PartData));
198 
199 	c->unlink = vtRendezAlloc(c->lk);
200 	c->flush = vtRendezAlloc(c->lk);
201 	c->flushwait = vtRendezAlloc(c->lk);
202 	c->sync = periodicAlloc(cacheSync, c, 30*1000);
203 
204 	if(mode == OReadWrite){
205 		c->ref += 2;
206 		vtThread(unlinkThread, c);
207 		vtThread(flushThread, c);
208 	}
209 	cacheCheck(c);
210 
211 	return c;
212 }
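
/*
 * Illustrative sketch, not part of the original source: typical
 * lifetime of the cache.  The Disk and VtSession are assumed to have
 * been set up elsewhere; 1000 blocks is an arbitrary example size.
 */
static void
exampleCacheLifetime(Disk *disk, VtSession *z)
{
	Cache *c;

	c = cacheAlloc(disk, z, 1000, OReadWrite);
	/* ... use cacheLocal, cacheGlobal, cacheAllocBlock, ... */
	cacheFree(c);	/* flushes all dirty blocks, then frees */
}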
213 
214 /*
215  * Free the whole memory cache, flushing all dirty blocks to the disk.
216  */
217 void
218 cacheFree(Cache *c)
219 {
220 	int i;
221 
222 	/* kill off daemon threads */
223 	vtLock(c->lk);
224 	c->die = vtRendezAlloc(c->lk);
225 	periodicKill(c->sync);
226 	vtWakeup(c->flush);
227 	vtWakeup(c->unlink);
228 	while(c->ref > 1)
229 		vtSleep(c->die);
230 
231 	/* flush everything out */
232 	do {
233 		unlinkBody(c);
234 		vtUnlock(c->lk);
235 		while(cacheFlushBlock(c))
236 			;
237 		diskFlush(c->disk);
238 		vtLock(c->lk);
239 	} while(c->uhead || c->ndirty);
240 	vtUnlock(c->lk);
241 
242 	cacheCheck(c);
243 
244 	for(i = 0; i < c->nblocks; i++){
245 		assert(c->blocks[i].ref == 0);
246 		vtRendezFree(c->blocks[i].ioready);
247 		vtLockFree(c->blocks[i].lk);
248 	}
249 	flFree(c->fl);
250 	vtMemFree(c->baddr);
251 	vtMemFree(c->heads);
252 	vtMemFree(c->blocks);
253 	vtMemFree(c->mem);
254 	vtLockFree(c->lk);
255 	diskFree(c->disk);
256 	vtRendezFree(c->blrend);
257 	/* don't close vtSession */
258 	vtMemFree(c);
259 }
260 
261 static void
262 cacheDump(Cache *c)
263 {
264 	int i;
265 	Block *b;
266 
267 	for(i = 0; i < c->nblocks; i++){
268 		b = &c->blocks[i];
269 		fprint(2, "p=%d a=%ud %V t=%d ref=%d state=%s io=%s\n",
270 			b->part, b->addr, b->score, b->l.type, b->ref,
271 			bsStr(b->l.state), bioStr(b->iostate));
272 	}
273 }
274 
275 static void
276 cacheCheck(Cache *c)
277 {
278 	u32int size, now;
279 	int i, k, refed;
280 	static uchar zero[VtScoreSize];
281 	Block *b;
282 
283 	size = c->size;
284 	now = c->now;
285 
286 	for(i = 0; i < c->nheap; i++){
287 		if(c->heap[i]->heap != i)
288 			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
289 		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
290 			vtFatal("bad heap ordering");
291 		k = (i << 1) + 1;
292 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
293 			vtFatal("bad heap ordering");
294 		k++;
295 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
296 			vtFatal("bad heap ordering");
297 	}
298 
299 	refed = 0;
300 	for(i = 0; i < c->nblocks; i++){
301 		b = &c->blocks[i];
302 		if(b->data != &c->mem[i * size])
303 			vtFatal("mis-blocked at %d", i);
304 		if(b->ref && b->heap == BadHeap){
305 			refed++;
306 		}
307 	}
308 if(c->nheap + refed != c->nblocks){
309 fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
310 cacheDump(c);
311 }
312 	assert(c->nheap + refed == c->nblocks);
313 	refed = 0;
314 	for(i = 0; i < c->nblocks; i++){
315 		b = &c->blocks[i];
316 		if(b->ref){
317 if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
318 			refed++;
319 		}
320 	}
321 if(refed > 0)fprint(2, "cacheCheck: in use %d\n", refed);
322 }
323 
324 
325 /*
326  * locate the block with the oldest second to last use.
327  * remove it from the heap, and fix up the heap.
328  */
329 /* called with c->lk held */
330 static Block *
331 cacheBumpBlock(Cache *c)
332 {
333 	Block *b;
334 
335 	/*
336 	 * locate the block with the oldest second to last use.
337 	 * remove it from the heap, and fix up the heap.
338 	 */
339 	if(c->nheap == 0)
340 		vtFatal("cacheBumpBlock: no free blocks in cache");
341 	b = c->heap[0];
342 	heapDel(b);
343 
344 	assert(b->heap == BadHeap);
345 	assert(b->ref == 0);
346 	assert(b->iostate == BioEmpty || b->iostate == BioLabel || b->iostate == BioClean);
347 	assert(b->prior == nil);
348 	assert(b->uhead == nil);
349 
350 	/*
351 	 * unchain the block from hash chain
352 	 */
353 	if(b->prev){
354 		*(b->prev) = b->next;
355 		if(b->next)
356 			b->next->prev = b->prev;
357 		b->prev = nil;
358 	}
359 
360 
361 if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
362 	/* set block to a reasonable state */
363 	b->ref = 1;
364 	b->part = PartError;
365 	memset(&b->l, 0, sizeof(b->l));
366 	b->iostate = BioEmpty;
367 
368 	return b;
369 }
370 
371 /*
372  * look for a particular version of the block in the memory cache.
373  */
374 static Block *
375 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
376 	int waitlock, int *lockfailure)
377 {
378 	Block *b;
379 	ulong h;
380 
381 	h = addr % c->hashSize;
382 
383 	if(lockfailure)
384 		*lockfailure = 0;
385 
386 	/*
387 	 * look for the block in the cache
388 	 */
389 	vtLock(c->lk);
390 	for(b = c->heads[h]; b != nil; b = b->next){
391 		if(b->part == part && b->addr == addr)
392 			break;
393 	}
394 	if(b == nil || b->vers != vers){
395 		vtUnlock(c->lk);
396 		return nil;
397 	}
398 	if(!waitlock && !vtCanLock(b->lk)){
399 		*lockfailure = 1;
400 		vtUnlock(c->lk);
401 		return nil;
402 	}
403 	heapDel(b);
404 	b->ref++;
405 	vtUnlock(c->lk);
406 
407 	bwatchLock(b);
408 	if(waitlock)
409 		vtLock(b->lk);
410 	b->nlock = 1;
411 
412 	for(;;){
413 		switch(b->iostate){
414 		default:
415 			abort();
416 		case BioEmpty:
417 		case BioLabel:
418 		case BioClean:
419 		case BioDirty:
420 			if(b->vers != vers){
421 				blockPut(b);
422 				return nil;
423 			}
424 			return b;
425 		case BioReading:
426 		case BioWriting:
427 			vtSleep(b->ioready);
428 			break;
429 		case BioVentiError:
430 		case BioReadError:
431 			blockSetIOState(b, BioEmpty);
432 			blockPut(b);
433 			vtSetError(EIO);
434 			return nil;
435 		}
436 	}
437 	/* NOT REACHED */
438 }
439 static Block*
440 cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
441 {
442 	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
443 }
444 
445 
446 /*
447  * fetch a local (on-disk) block from the memory cache.
448  * if it's not there, load it, bumping some other block.
449  */
450 Block *
451 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
452 {
453 	Block *b;
454 	ulong h;
455 
456 	assert(part != PartVenti);
457 
458 	h = addr % c->hashSize;
459 
460 	/*
461 	 * look for the block in the cache
462 	 */
463 	vtLock(c->lk);
464 	for(b = c->heads[h]; b != nil; b = b->next){
465 		if(b->part != part || b->addr != addr)
466 			continue;
467 		if(epoch && b->l.epoch != epoch){
468 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
469 			vtUnlock(c->lk);
470 			vtSetError(ELabelMismatch);
471 			return nil;
472 		}
473 		heapDel(b);
474 		b->ref++;
475 		break;
476 	}
477 
478 	if(b == nil){
479 		b = cacheBumpBlock(c);
480 
481 		b->part = part;
482 		b->addr = addr;
483 		localToGlobal(addr, b->score);
484 
485 		/* chain onto correct hash */
486 		b->next = c->heads[h];
487 		c->heads[h] = b;
488 		if(b->next != nil)
489 			b->next->prev = &b->next;
490 		b->prev = &c->heads[h];
491 	}
492 
493 	vtUnlock(c->lk);
494 
495 	/*
496 	 * BUG: what if the epoch changes right here?
497 	 * In the worst case, we could end up in some weird
498 	 * lock loop, because the block we want no longer exists,
499 	 * and instead we're trying to lock a block we have no
500 	 * business grabbing.
501 	 *
502 	 * For now, I'm not going to worry about it.
503 	 */
504 
505 if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
506 	bwatchLock(b);
507 	vtLock(b->lk);
508 	b->nlock = 1;
509 
510 	if(part == PartData && b->iostate == BioEmpty){
511 		if(!readLabel(c, &b->l, addr)){
512 			blockPut(b);
513 			return nil;
514 		}
515 		blockSetIOState(b, BioLabel);
516 	}
517 	if(epoch && b->l.epoch != epoch){
518 		blockPut(b);
519 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
520 		vtSetError(ELabelMismatch);
521 		return nil;
522 	}
523 
524 	b->pc = getcallerpc(&c);
525 	for(;;){
526 		switch(b->iostate){
527 		default:
528 			abort();
529 		case BioEmpty:
530 		case BioLabel:
531 			if(mode == OOverWrite){
532 				blockSetIOState(b, BioClean);
533 				return b;
534 			}
535 			diskRead(c->disk, b);
536 			vtSleep(b->ioready);
537 			break;
538 		case BioClean:
539 		case BioDirty:
540 			return b;
541 		case BioReading:
542 		case BioWriting:
543 			vtSleep(b->ioready);
544 			break;
545 		case BioReadError:
546 			blockSetIOState(b, BioEmpty);
547 			blockPut(b);
548 			vtSetError(EIO);
549 			return nil;
550 		}
551 	}
552 	/* NOT REACHED */
553 }
554 
555 Block *
556 cacheLocal(Cache *c, int part, u32int addr, int mode)
557 {
558 	return _cacheLocal(c, part, addr, mode, 0);
559 }
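
/*
 * Illustrative sketch, not part of the original source: the usual
 * read-modify-write sequence against a cached disk block.  The block
 * stays in the hash table after blockPut; blockDirty queues it for
 * the flush thread.  The address and the modification made here are
 * hypothetical.
 */
static int
exampleTouchBlock(Cache *c, u32int addr)
{
	Block *b;

	b = cacheLocal(c, PartData, addr, OReadWrite);
	if(b == nil)
		return 0;
	b->data[0] ^= 1;	/* modify the in-memory copy */
	blockDirty(b);		/* mark dirty; may kick the flush thread */
	blockPut(b);		/* unlock and drop our reference */
	return 1;
}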
560 
561 /*
562  * fetch a local (on-disk) block from the memory cache.
563  * if it's not there, load it, bumping some other block.
564  * check tag and type.
565  */
566 Block *
567 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
568 {
569 	Block *b;
570 
571 	b = _cacheLocal(c, PartData, addr, mode, epoch);
572 	if(b == nil)
573 		return nil;
574 	if(b->l.type != type || b->l.tag != tag){
575 		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %x exp %x\n",
576 			addr, b->l.type, type, b->l.tag, tag);
577 abort();
578 		vtSetError(ELabelMismatch);
579 		blockPut(b);
580 		return nil;
581 	}
582 	b->pc = getcallerpc(&c);
583 	return b;
584 }
585 
586 /*
587  * fetch a global (Venti) block from the memory cache.
588  * if it's not there, load it, bumping some other block.
589  * check tag and type if it's really a local block in disguise.
590  */
591 Block *
592 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
593 {
594 	int n;
595 	Block *b;
596 	ulong h;
597 	u32int addr;
598 
599 	addr = globalToLocal(score);
600 	if(addr != NilBlock){
601 		b = cacheLocalData(c, addr, type, tag, mode, 0);
602 		if(b)
603 			b->pc = getcallerpc(&c);
604 		return b;
605 	}
606 
607 	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
608 
609 	/*
610 	 * look for the block in the cache
611 	 */
612 	vtLock(c->lk);
613 	for(b = c->heads[h]; b != nil; b = b->next){
614 		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
615 			continue;
616 		heapDel(b);
617 		b->ref++;
618 		break;
619 	}
620 
621 	if(b == nil){
622 if(0)fprint(2, "cacheGlobal %V %d\n", score, type);
623 
624 		b = cacheBumpBlock(c);
625 
626 		b->part = PartVenti;
627 		b->addr = NilBlock;
628 		b->l.type = type;
629 		memmove(b->score, score, VtScoreSize);
630 
631 		/* chain onto correct hash */
632 		b->next = c->heads[h];
633 		c->heads[h] = b;
634 		if(b->next != nil)
635 			b->next->prev = &b->next;
636 		b->prev = &c->heads[h];
637 	}
638 	vtUnlock(c->lk);
639 
640 	bwatchLock(b);
641 	vtLock(b->lk);
642 	b->nlock = 1;
643 	b->pc = getcallerpc(&c);
644 
645 	switch(b->iostate){
646 	default:
647 		abort();
648 	case BioEmpty:
649 		n = vtRead(c->z, score, vtType[type], b->data, c->size);
650 		if(n < 0 || !vtSha1Check(score, b->data, n)){
651 			blockSetIOState(b, BioVentiError);
652 			blockPut(b);
653 			return nil;
654 		}
655 		vtZeroExtend(vtType[type], b->data, n, c->size);
656 		blockSetIOState(b, BioClean);
657 		return b;
658 	case BioClean:
659 		return b;
660 	case BioVentiError:
661 	case BioReadError:
662 		blockPut(b);
663 		vtSetError(EIO);
664 		blockSetIOState(b, BioEmpty);
665 		return nil;
666 	}
667 	/* NOT REACHED */
668 }
669 
670 /*
671  * allocate a new on-disk block and load it into the memory cache.
672  * BUG: if the disk is full, should we flush some of it to Venti?
673  */
674 static u32int lastAlloc;
675 
676 Block *
677 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
678 {
679 	FreeList *fl;
680 	u32int addr;
681 	Block *b;
682 	int n, nwrap;
683 	Label lab;
684 
685 	n = c->size / LabelSize;
686 	fl = c->fl;
687 
688 	vtLock(fl->lk);
689 fl->last = 0;
690 	addr = fl->last;
691 	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
692 	if(b == nil){
693 		fprint(2, "cacheAllocBlock: xxx %R\n");
694 		vtUnlock(fl->lk);
695 		return nil;
696 	}
697 	nwrap = 0;
698 	for(;;){
699 		if(++addr >= fl->end){
700 			addr = 0;
701 			fprint(2, "cacheAllocBlock wrap %d\n", fl->end);
702 			if(++nwrap >= 2){
703 				blockPut(b);
704 				fl->last = 0;
705 				vtSetError("disk is full");
706 				fprint(2, "cacheAllocBlock: xxx1 %R\n");
707 				vtUnlock(fl->lk);
708 				return nil;
709 			}
710 		}
711 		if(addr%n == 0){
712 			blockPut(b);
713 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
714 			if(b == nil){
715 				fl->last = addr;
716 				fprint(2, "cacheAllocBlock: xxx2 %R\n");
717 				vtUnlock(fl->lk);
718 				return nil;
719 			}
720 		}
721 		if(!labelUnpack(&lab, b->data, addr%n))
722 			continue;
723 		if(lab.state == BsFree)
724 			goto Found;
725 		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
726 			goto Found;
727 	}
728 Found:
729 	blockPut(b);
730 	b = cacheLocal(c, PartData, addr, OOverWrite);
731 	if(b == nil){
732 		fprint(2, "cacheAllocBlock: xxx3 %R\n");
733 		return nil;
734 	}
735 assert(b->iostate == BioLabel || b->iostate == BioClean);
736 	fl->last = addr;
737 	lab.type = type;
738 	lab.tag = tag;
739 	lab.state = BsAlloc;
740 	lab.epoch = epoch;
741 	lab.epochClose = ~(u32int)0;
742 	if(!blockSetLabel(b, &lab)){
743 		fprint(2, "cacheAllocBlock: xxx4 %R\n");
744 		blockPut(b);
745 		return nil;
746 	}
747 	vtZeroExtend(vtType[type], b->data, 0, c->size);
748 if(0)diskWrite(c->disk, b);
749 
750 if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
751 	lastAlloc = addr;
752 	vtUnlock(fl->lk);
753 	b->pc = getcallerpc(&c);
754 	return b;
755 }
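
/*
 * Illustrative sketch, not part of the original source: allocating a
 * fresh data block.  cacheAllocBlock returns the block zeroed and
 * labeled BsAlloc; the caller fills it in, marks it dirty, and later
 * releases it with blockPut.  Tag and epochs are hypothetical.
 */
static Block *
exampleNewBlock(Cache *c, u32int tag, u32int epoch, u32int epochLow)
{
	Block *b;

	b = cacheAllocBlock(c, BtData, tag, epoch, epochLow);
	if(b == nil)
		return nil;	/* disk full or label i/o error */
	/* b->data is already zeroed; fill in the new contents here */
	blockDirty(b);
	return b;
}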
756 
757 static FreeList *
758 flAlloc(u32int end)
759 {
760 	FreeList *fl;
761 
762 	fl = vtMemAllocZ(sizeof(*fl));
763 	fl->lk = vtLockAlloc();
764 	fl->last = end;
765 	fl->end = end;
766 	return fl;
767 }
768 
769 static void
770 flFree(FreeList *fl)
771 {
772 	vtLockFree(fl->lk);
773 	vtMemFree(fl);
774 }
775 
776 u32int
777 cacheLocalSize(Cache *c, int part)
778 {
779 	return diskSize(c->disk, part);
780 }
781 
782 /*
783  * Copy on write.  Copied blocks have to be marked BsCopied.
784  * See the big comment near blockRemoveLink.
785  */
786 Block*
787 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
788 {
789 	Block *bb, *lb;
790 	Label l;
791 
792 	assert((b->l.state&BsClosed)==0 && b->l.epoch < ehi);
793 	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
794 	if(bb == nil){
795 		blockPut(b);
796 		return nil;
797 	}
798 
799 	/*
800 	 * Change label on b to mark that we've copied it.
801 	 * This has to come after cacheAllocBlock, since we
802 	 * can't hold any labels blocks (lb) while we try to
803 	 * fetch others (in cacheAllocBlock).
804 	 */
805 	if(!(b->l.state&BsCopied) && b->part==PartData){
806 		l = b->l;
807 		l.state |= BsCopied;
808 		lb = _blockSetLabel(b, &l);
809 		if(lb == nil){
810 			/* can't set label => can't copy block */
811 			blockPut(b);
812 			l.type = BtMax;
813 			l.state = BsFree;
814 			l.epoch = 0;
815 			l.epochClose = 0;
816 			l.tag = 0;
817 			/* ignore error: block gets lost on error */
818 			blockSetLabel(bb, &l);
819 			blockPut(bb);
820 			return nil;
821 		}
822 		blockDependency(bb, lb, -1, nil, nil);
823 		blockPut(lb);
824 	}
825 
826 	if(0){
827 		if(b->addr != NilBlock)
828 			fprint(2, "blockCopy %#ux/%ud => %#ux/%ud\n",
829 				b->addr, b->l.epoch, bb->addr, bb->l.epoch);
830 		else if(memcmp(b->score, vtZeroScore, VtScoreSize) != 0)
831 			fprint(2, "blockCopy %V => %#ux/%ud\n",
832 				b->score, bb->addr, bb->l.epoch);
833 	}
834 
835 	memmove(bb->data, b->data, b->c->size);
836 	blockDirty(bb);
837 	blockPut(b);
838 	return bb;
839 }
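
/*
 * Illustrative sketch, not part of the original source: the usual
 * copy-on-write decision made by callers of blockCopy.  A block that
 * already belongs to the current epoch ehi can be modified in place;
 * otherwise it must be copied first.  Note that blockCopy consumes
 * the caller's reference to b whether it succeeds or fails.
 */
static Block *
exampleWritable(Block *b, u32int tag, u32int ehi, u32int elo)
{
	if(b->l.epoch == ehi)
		return b;	/* already writable in this epoch */
	return blockCopy(b, tag, ehi, elo);
}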
840 
841 /*
842  * The thread that has locked b may refer to it by
843  * multiple names.  Nlock counts the number of
844  * references the locking thread holds.  It will call
845  * blockPut once per reference.
846  */
847 void
848 blockDupLock(Block *b)
849 {
850 	assert(b->nlock > 0);
851 	b->nlock++;
852 }
853 
854 /*
855  * we're done with the block.
856  * unlock it.  can't use it after calling this.
857  */
858 void
859 blockPut(Block* b)
860 {
861 	Cache *c;
862 
863 	if(b == nil)
864 		return;
865 
866 if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));
867 
868 	if(b->iostate == BioDirty)
869 		bwatchDependency(b);
870 
871 	if(--b->nlock > 0)
872 		return;
873 
874 	/*
875 	 * b->nlock should probably stay at zero while
876 	 * the block is unlocked, but diskThread and vtSleep
877 	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
878 	 * so we have to keep b->nlock set to 1 even
879 	 * when the block is unlocked.
880 	 */
881 	assert(b->nlock == 0);
882 	b->nlock = 1;
883 //	b->pc = 0;
884 
885 	bwatchUnlock(b);
886 	vtUnlock(b->lk);
887 	c = b->c;
888 	vtLock(c->lk);
889 
890 	if(--b->ref > 0){
891 		vtUnlock(c->lk);
892 		return;
893 	}
894 
895 	assert(b->ref == 0);
896 	switch(b->iostate){
897 	default:
898 		b->used = c->now++;
899 		heapIns(b);
900 		break;
901 	case BioEmpty:
902 	case BioLabel:
903 		if(c->nheap == 0)
904 			b->used = c->now++;
905 		else
906 			b->used = c->heap[0]->used;
907 		heapIns(b);
908 		break;
909 	case BioDirty:
910 		break;
911 	}
912 	vtUnlock(c->lk);
913 }
914 
915 /*
916  * we're deleting a block; delete all the blocks it points to
917  * that are still active (i.e., not needed by snapshots).
918  */
919 static void
920 blockCleanup(Block *b, u32int epoch)
921 {
922 	Cache *c;
923 	Block *bb;
924 	int i, n;
925 	Label l;
926 	u32int a;
927 	int type;
928 	int mode;
929 
930 	type = b->l.type;
931 	c = b->c;
932 
933 	bwatchReset(b->score);
934 
935 	blockSetIOState(b, BioClean);
936 
937 	/* do not recursively free directories */
938 	if(type == BtData || type == BtDir)
939 		return;
940 
941 	n = c->size / VtScoreSize;
942 	mode = OReadWrite;
943 	if(type-1 == BtData || type-1 == BtDir)
944 		mode = OOverWrite;
945 	for(i=0; i<n; i++){
946 		a = globalToLocal(b->data + i*VtScoreSize);
947 		if(a == NilBlock || !readLabel(c, &l, a))
948 			continue;
949 		if((l.state&BsClosed) || l.epoch != epoch)
950 			continue;
951 		bb = cacheLocalData(c, a, type-1, b->l.tag, mode, 0);
952 		if(bb == nil)
953 			continue;
954 		if((bb->l.state&BsClosed) || bb->l.epoch != epoch){
955 			fprint(2, "cleanupBlock: block %ud changed underfoot! expected %L got %L\n",
956 				a, &l, &bb->l);
957 			blockPut(bb);
958 			continue;
959 		}
960 		blockCleanup(bb, epoch);
961 		l.type = BtMax;
962 		l.epoch = epoch;
963 		l.epochClose = 0;
964 		l.state = BsFree;
965 		l.tag = 0;
966 		blockSetLabel(bb, &l);
967 		blockPut(bb);
968 	}
969 }
970 
971 /*
972  * We don't need the block at addr anymore for the active file system.
973  * If we don't need it for the snapshots, remove it completely.
974  * Epoch is the epoch during which we got rid of the block.
975  * See blockRemoveLink for more.
976  */
977 static int
978 unlinkBlock(Cache *c, u32int addr, int type, u32int tag, u32int epoch)
979 {
980 	Block *b;
981 	Label l;
982 
983 	if(addr == NilBlock)
984 		return 1;
985 
986 //fprint(2, "unlinkBlock %#ux\n", addr);
987 	b = cacheLocalData(c, addr, type, tag, OReadOnly, 0);
988 	if(b == nil)
989 		return 0;
990 	if(b->l.epoch > epoch){
991 fprint(2, "unlinkBlock: strange epoch: %ud %ud\n", b->l.epoch, epoch);
992 		blockPut(b);
993 		return 0;
994 	}
995 
996 	l = b->l;
997 	if((b->l.state&BsClosed)==0 && b->l.epoch==epoch){
998 		l.state = BsFree;
999 		l.type = BtMax;
1000 		l.tag = 0;
1001 		l.epoch = 0;
1002 		l.epochClose = 0;
1003 		blockCleanup(b, epoch);
1004 	}else{
1005 		l.state |= BsClosed;
1006 		l.epochClose = epoch;
1007 	}
1008 	blockSetLabel(b, &l);
1009 	blockPut(b);
1010 	return 1;
1011 }
1012 
1013 /*
1014  * try to allocate a BList so we can record that b must
1015  * be written out before some other block.
1016  * if can't find a BList, write b out instead and return nil.
1017  */
1018 static BList *
1019 blistAlloc(Block *b)
1020 {
1021 	Cache *c;
1022 	BList *p;
1023 
1024 	/*
1025 	 * It's possible that when we marked b dirty, there were
1026 	 * too many dirty blocks so we just wrote b there and then.
1027 	 * So b might not be dirty.  If it's not, no need to worry
1028 	 * about recording the write constraint.
1029 	 *
1030 	 * BlockRemoveLink depends on the fact that if blistAlloc
1031 	 * returns non-nil, b really is dirty.
1032 	 */
1033 	if(b->iostate != BioDirty){
1034 		assert(b->iostate == BioClean);
1035 		return nil;
1036 	}
1037 
1038 	/*
1039 	 * Easy: maybe there's a free list left.
1040 	 */
1041 	c = b->c;
1042 	vtLock(c->lk);
1043 	if(c->blfree){
1044 	HaveBlFree:
1045 		p = c->blfree;
1046 		c->blfree = p->next;
1047 		vtUnlock(c->lk);
1048 		return p;
1049 	}
1050 	vtUnlock(c->lk);
1051 
1052 	/*
1053 	 * No free BLists.  What are our options?
1054 	 */
1055 
1056 	/* Block has no priors? Just write it. */
1057 	if(b->prior == nil){
1058 		diskWrite(c->disk, b);
1059 		while(b->iostate != BioClean)
1060 			vtSleep(b->ioready);
1061 		return nil;
1062 	}
1063 
1064 	/*
1065 	 * Wake the flush thread, which will hopefully free up
1066 	 * some BLists for us.  We used to flush a block from
1067 	 * our own prior list and reclaim that BList, but this is
1068 	 * a no-no: some of the blocks on our prior list may
1069 	 * be locked by our caller.  Or maybe their label blocks
1070 	 * are locked by our caller.  In any event, it's too hard
1071 	 * to make sure we can do I/O for ourselves.  Instead,
1072 	 * we assume the flush thread will find something.
1073 	 * (The flush thread never blocks waiting for a block,
1074 	 * so it won't deadlock like we will.)
1075 	 */
1076 	vtLock(c->lk);
1077 	while(c->blfree == nil){
1078 		vtWakeup(c->flush);
1079 		vtSleep(c->blrend);
1080 	}
1081 	goto HaveBlFree;
1082 }
1083 
1084 void
1085 blistFree(Cache *c, BList *bl)
1086 {
1087 	vtLock(c->lk);
1088 	bl->next = c->blfree;
1089 	c->blfree = bl;
1090 	vtWakeup(c->blrend);
1091 	vtUnlock(c->lk);
1092 }
1093 
1094 /*
1095  * Flush b or one of the blocks it depends on.
1096  */
1097 void
1098 blockFlush(Block *b)
1099 {
1100 	int first, nlock;
1101 	BList *p, **pp;
1102 	Block *bb;
1103 	Cache *c;
1104 
1105 //fprint(2, "blockFlush %p\n", b);
1106 
1107 	c = b->c;
1108 
1109 	first = 1;
1110 	pp = &b->prior;
1111 	for(p=*pp; p; p=*pp){
1112 		bb = cacheLocalLookup(c, p->part, p->addr, p->vers);
1113 		if(bb == nil){
1114 			*pp = p->next;
1115 			blistFree(c, p);
1116 			continue;
1117 		}
1118 		if(!first)
1119 			blockPut(b);
1120 		first = 0;
1121 		b = bb;
1122 		pp = &b->prior;
1123 	}
1124 
1125 	/*
1126 	 * If b->nlock > 1, the block is aliased within
1127 	 * a single thread.  That thread is us, and it's
1128 	 * the block that was passed in (rather than a prior).
1129 	 * DiskWrite does some funny stuff with VtLock
1130 	 * and blockPut that basically assumes b->nlock==1.
1131 	 * We humor diskWrite by temporarily setting
1132 	 * nlock to 1.  This needs to be revisited.  (TODO)
1133 	 */
1134 	nlock = b->nlock;
1135 	if(nlock > 1){
1136 		assert(first);
1137 		b->nlock = 1;
1138 	}
1139 	diskWrite(c->disk, b);
1140 	while(b->iostate != BioClean)
1141 		vtSleep(b->ioready);
1142 	b->nlock = nlock;
1143 	if(!first)
1144 		blockPut(b);
1145 }
1146 
1147 /*
1148  * Record that bb must be written out before b.
1149  * If index is given, we're about to overwrite the score/e
1150  * at that index in the block.  Save the old value so we
1151  * can write a safer ``old'' version of the block if pressed.
1152  */
1153 void
1154 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1155 {
1156 	BList *p;
1157 
1158 	if(bb->iostate == BioClean)
1159 		return;
1160 
1161 	/*
1162 	 * Dependencies for blocks containing Entry structures
1163 	 * or scores must always be explained.  The problem with
1164 	 * only explaining some of them is this.  Suppose we have two
1165 	 * dependencies for the same field, the first explained
1166 	 * and the second not.  We try to write the block when the first
1167 	 * dependency is not written but the second is.  We will roll back
1168 	 * the first change even though the second trumps it.
1169 	 */
1170 	if(index == -1 && bb->part == PartData)
1171 		assert(b->l.type == BtData);
1172 
1173 	assert(bb->iostate == BioDirty);
1174 
1175 	p = blistAlloc(bb);
1176 	if(p == nil)
1177 		return;
1178 
1179 if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1180 
1181 	p->part = bb->part;
1182 	p->addr = bb->addr;
1183 	p->type = bb->l.type;
1184 	p->vers = bb->vers;
1185 	p->index = index;
1186 	if(p->index >= 0){
1187 		/*
1188 		 * This test would just be b->l.type==BtDir except
1189 		 * we need to exclude the super block.
1190 		 */
1191 		if(b->l.type == BtDir && b->part == PartData)
1192 			entryPack(e, p->old.entry, 0);
1193 		else
1194 			memmove(p->old.score, score, VtScoreSize);
1195 	}
1196 	p->next = b->prior;
1197 	b->prior = p;
1198 }
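
/*
 * Illustrative sketch, not part of the original source: how a caller
 * typically orders writes when replacing the i'th score in a pointer
 * block bp (BtData+1 and up) with the address of a child block bc.
 * The dependency is recorded first, saving the score about to be
 * overwritten so blockRollback can reconstruct the old version; for
 * BtDir blocks the old Entry is passed instead of a score.
 */
static void
exampleSetChildScore(Block *bp, Block *bc, int i)
{
	/* bc must reach disk before the new pointer in bp does */
	blockDependency(bp, bc, i, bp->data + i*VtScoreSize, nil);
	memmove(bp->data + i*VtScoreSize, bc->score, VtScoreSize);
	blockDirty(bp);
}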
1199 
1200 /*
1201  * Mark an in-memory block as dirty.  If there are too many
1202  * dirty blocks, start writing some out to disk.  If there are way
1203  * too many dirty blocks, write this one out too.
1204  *
1205  * Note that since we might call blockFlush, which walks
1206  * the prior list, we can't call blockDirty while holding a lock
1207  * on any of our priors.  This can be tested by recompiling
1208  * with flush always set to 1 below.
1209  */
1210 int
1211 blockDirty(Block *b)
1212 {
1213 	Cache *c;
1214 	int flush;
1215 
1216 	c = b->c;
1217 
1218 	assert(b->part != PartVenti);
1219 
1220 	if(b->iostate == BioDirty)
1221 		return 1;
1222 	assert(b->iostate == BioClean);
1223 
1224 	vtLock(c->lk);
1225 	b->iostate = BioDirty;
1226 	c->ndirty++;
1227 	if(c->ndirty > (c->maxdirty>>1))
1228 		vtWakeup(c->flush);
1229 	flush = c->ndirty > c->maxdirty;
1230 	vtUnlock(c->lk);
1231 
1232 	if(flush)
1233 		blockFlush(b);
1234 
1235 	return 1;
1236 }
1237 
1238 /*
1239  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1240  *
1241  * The file system maintains the following invariants (i-iv checked by flchk):
1242  *
1243  * (i) b.e in [bb.e, bb.eClose)
1244  * (ii) if b.e==bb.e,  then no other b' in e points at bb.
1245  * (iii) if !(b.state&Copied) and b.e==bb.e then no other b' points at bb.
1246  * (iv) if b is active then no other active b' points at bb.
1247  * (v) if b is a past life of b' then only one of b and b' is active (too hard to check)
1248  *
1249  * The file system initially satisfies these invariants, and we can verify that
1250  * the various file system operations maintain them.  See fossil.invariants.
1251  *
1252  * Condition (i) lets us reclaim blocks once the low epoch is greater
1253  * than epochClose.
1254  *
1255  * If the condition in (iii) is satisfied, then this is the only pointer to bb,
1256  * so bb can be reclaimed once b has been written to disk.  blockRemoveLink
1257  * checks !(b.state&Copied) as an optimization.  UnlinkBlock and blockCleanup
1258  * will check the conditions again for each block they consider.
1259  */
1260 int
1261 blockRemoveLink(Block *b, u32int addr, int type, u32int tag)
1262 {
1263 	BList *bl;
1264 	BList *p, **pp;
1265 	Cache *c;
1266 
1267 	c = b->c;
1268 
1269 	/* remove unlinked block from prior list */
1270 	pp = &b->prior;
1271 	for(p=*pp; p; p=*pp){
1272 		if(p->part != PartData || p->addr != addr){
1273 			pp = &p->next;
1274 			continue;
1275 		}
1276 		*pp = p->next;
1277 		blistFree(c, p);
1278 	}
1279 
1280 	/* if b has been copied, can't reclaim blocks it points at. */
1281 	if(b->l.state & BsCopied)
1282 		return 0;
1283 
1284 	bl = blistAlloc(b);
1285 	if(bl == nil)
1286 		return unlinkBlock(b->c, addr, type, tag, b->l.epoch);
1287 
1288 	/*
1289 	 * Because bl != nil, we know b is dirty.
1290 	 * (Linking b->uhead onto a clean block is
1291 	 * counterproductive, since we only look at
1292 	 * b->uhead when a block transitions from
1293 	 * dirty to clean.)
1294 	 */
1295 	assert(b->iostate == BioDirty);
1296 
1297 	bl->part = PartData;
1298 	bl->addr = addr;
1299 	bl->type = type;
1300 	bl->tag = tag;
1301 	bl->epoch = b->l.epoch;
1302 	if(b->uhead == nil)
1303 		b->uhead = bl;
1304 	else
1305 		b->utail->next = bl;
1306 	b->utail = bl;
1307 	bl->next = nil;
1308 	return 1;
1309 }
1310 
1311 /*
1312  * set the label associated with a block.
1313  */
1314 Block*
1315 _blockSetLabel(Block *b, Label *l)
1316 {
1317 	int lpb;
1318 	Block *bb;
1319 	u32int a;
1320 	Cache *c;
1321 
1322 	c = b->c;
1323 
1324 	assert(b->part == PartData);
1325 	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
1326 	lpb = c->size / LabelSize;
1327 	a = b->addr / lpb;
1328 	bb = cacheLocal(c, PartLabel, a, OReadWrite);
1329 	if(bb == nil){
1330 		blockPut(b);
1331 		return nil;
1332 	}
1333 	b->l = *l;
1334 	labelPack(l, bb->data, b->addr%lpb);
1335 	blockDirty(bb);
1336 	return bb;
1337 }
1338 
1339 int
1340 blockSetLabel(Block *b, Label *l)
1341 {
1342 	Block *lb;
1343 	Label oldl;
1344 
1345 	oldl = b->l;
1346 	lb = _blockSetLabel(b, l);
1347 	if(lb == nil)
1348 		return 0;
1349 
1350 	/*
1351 	 * If we're allocating the block, make sure the label (bl)
1352 	 * goes to disk before the data block (b) itself.  This is to help
1353 	 * the blocks that in turn depend on b.
1354 	 *
1355 	 * Suppose bx depends on (must be written out after) b.
1356 	 * Once we write b we'll think it's safe to write bx.
1357 	 * Bx can't get at b unless it has a valid label, though.
1358 	 *
1359 	 * Allocation is the only case in which having a current label
1360 	 * is vital because:
1361 	 *
1362 	 *	- l.type is set at allocation and never changes.
1363 	 *	- l.tag is set at allocation and never changes.
1364 	 *	- l.state is not checked when we load blocks.
1365 	 *	- the archiver cares deeply about l.state being
1366 	 *		BaActive vs. BaCopied, but that's handled
1367 	 *		by direct calls to _blockSetLabel.
1368 	 */
1369 
1370 	if(oldl.state == BsFree)
1371 		blockDependency(b, lb, -1, nil, nil);
1372 	blockPut(lb);
1373 	return 1;
1374 }
1375 
1376 /*
1377  * We've decided to write out b.  Maybe b has some pointers to blocks
1378  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
1379  * copy of b that is safe to write out.  (diskThread will make sure the block
1380  * remains marked as dirty.)
1381  */
1382 uchar *
1383 blockRollback(Block *b, uchar *buf)
1384 {
1385 	u32int addr;
1386 	BList *p;
1387 	Super super;
1388 
1389 	/* easy case */
1390 	if(b->prior == nil)
1391 		return b->data;
1392 
1393 	memmove(buf, b->data, b->c->size);
1394 	for(p=b->prior; p; p=p->next){
1395 		/*
1396 		 * we know p->index >= 0 because blockWrite has vetted this block for us.
1397 		 */
1398 		assert(p->index >= 0);
1399 		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1400 		if(b->part == PartSuper){
1401 			assert(p->index == 0);
1402 			superUnpack(&super, buf);
1403 			addr = globalToLocal(p->old.score);
1404 			if(addr == NilBlock){
1405 				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
1406 				abort();
1407 			}
1408 			super.active = addr;
1409 			superPack(&super, buf);
1410 			continue;
1411 		}
1412 		if(b->l.type == BtDir)
1413 			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1414 		else
1415 			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1416 	}
1417 	return buf;
1418 }
1419 
1420 /*
1421  * Try to write block b.
1422  * If b depends on other blocks:
1423  *
1424  *	If the block has been written out, remove the dependency.
1425  *	If we know how to write out an old version of b that doesn't
1426  *		depend on it, do that.
1427  *
1428  *	Otherwise, bail.
1429  */
1430 int
1431 blockWrite(Block *b)
1432 {
1433 	uchar *dmap;
1434 	Cache *c;
1435 	BList *p, **pp;
1436 	Block *bb;
1437 	int lockfail;
1438 
1439 	c = b->c;
1440 
1441 	if(b->iostate != BioDirty)
1442 		return 1;
1443 
1444 	dmap = b->dmap;
1445 	memset(dmap, 0, c->ndmap);
1446 	pp = &b->prior;
1447 	for(p=*pp; p; p=*pp){
1448 		if(p->index >= 0){
1449 			/* more recent dependency has succeeded; this one can go */
1450 			if(dmap[p->index/8] & (1<<(p->index%8)))
1451 				goto ignblock;
1452 		}
1453 
1454 		lockfail = 0;
1455 		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1456 		if(bb == nil){
1457 			if(lockfail)
1458 				return 0;
1459 			/* block not in cache => was written already */
1460 			dmap[p->index/8] |= 1<<(p->index%8);
1461 			goto ignblock;
1462 		}
1463 
1464 		/*
1465 		 * same version of block is still in cache.
1466 		 *
1467 		 * the assertion is true because the block still has version p->vers,
1468 		 * which means it hasn't been written out since we last saw it.
1469 		 */
1470 		assert(bb->iostate == BioDirty);
1471 		blockPut(bb);
1472 
1473 		if(p->index < 0){
1474 			/*
1475 			 * We don't know how to temporarily undo
1476 			 * b's dependency on bb, so just don't write b yet.
1477 			 */
1478 			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1479 				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1480 			return 0;
1481 		}
1482 		/* keep walking down the list */
1483 		pp = &p->next;
1484 		continue;
1485 
1486 ignblock:
1487 		*pp = p->next;
1488 		blistFree(c, p);
1489 		continue;
1490 	}
1491 
1492 	diskWrite(c->disk, b);
1493 	return 1;
1494 }
1495 
1496 /*
1497  * Change the I/O state of block b.
1498  * Just an assignment except for magic in
1499  * switch statement (read comments there).
1500  */
1501 void
1502 blockSetIOState(Block *b, int iostate)
1503 {
1504 	int dowakeup;
1505 	Cache *c;
1506 	BList *p, *q;
1507 
1508 if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1509 
1510 	c = b->c;
1511 
1512 	dowakeup = 0;
1513 	switch(iostate){
1514 	default:
1515 		abort();
1516 	case BioEmpty:
1517 		assert(!b->uhead);
1518 		break;
1519 	case BioLabel:
1520 		assert(!b->uhead);
1521 		break;
1522 	case BioClean:
1523 		bwatchDependency(b);
1524 		/*
1525 		 * If b->prior is set, it means a write just finished.
1526 		 * The prior list isn't needed anymore.
1527 		 */
1528 		for(p=b->prior; p; p=q){
1529 			q = p->next;
1530 			blistFree(c, p);
1531 		}
1532 		b->prior = nil;
1533 		/*
1534 		 * Freeing a block or just finished a write.
1535 		 * Move the blocks from the per-block unlink
1536 		 * queue to the cache unlink queue.
1537 		 */
1538 		if(b->iostate == BioDirty || b->iostate == BioWriting){
1539 			vtLock(c->lk);
1540 			c->ndirty--;
1541 			b->iostate = iostate;	/* change here to keep in sync with ndirty */
1542 			b->vers = c->vers++;
1543 			if(b->uhead){
1544 				/* add unlink blocks to unlink queue */
1545 				if(c->uhead == nil){
1546 					c->uhead = b->uhead;
1547 					vtWakeup(c->unlink);
1548 				}else
1549 					c->utail->next = b->uhead;
1550 				c->utail = b->utail;
1551 				b->uhead = nil;
1552 			}
1553 			vtUnlock(c->lk);
1554 		}
1555 		assert(!b->uhead);
1556 		dowakeup = 1;
1557 		break;
1558 	case BioDirty:
1559 		/*
1560 		 * Wrote out an old version of the block (see blockRollback).
1561 		 * Bump a version count, leave it dirty.
1562 		 */
1563 		if(b->iostate == BioWriting){
1564 			vtLock(c->lk);
1565 			b->vers = c->vers++;
1566 			vtUnlock(c->lk);
1567 			dowakeup = 1;
1568 		}
1569 		break;
1570 	case BioReading:
1571 	case BioWriting:
1572 		/*
1573 		 * Adding block to disk queue.  Bump reference count.
1574 		 * diskThread decs the count later by calling blockPut.
1575 		 * This is here because we need to lock c->lk to
1576 		 * manipulate the ref count.
1577 		 */
1578 		vtLock(c->lk);
1579 		b->ref++;
1580 		vtUnlock(c->lk);
1581 		break;
1582 	case BioReadError:
1583 	case BioVentiError:
1584 		/*
1585 		 * Oops.
1586 		 */
1587 		dowakeup = 1;
1588 		break;
1589 	}
1590 	b->iostate = iostate;
1591 	/*
1592 	 * Now that the state has changed, we can wake the waiters.
1593 	 */
1594 	if(dowakeup)
1595 		vtWakeupAll(b->ioready);
1596 }
1597 
1598 char*
1599 bsStr(int state)
1600 {
1601 	static char s[100];
1602 
1603 	if(state == BsFree)
1604 		return "Free";
1605 	if(state == BsBad)
1606 		return "Bad";
1607 
1608 	sprint(s, "%x", state);
1609 	if(!(state&BsAlloc))
1610 		strcat(s, ",Free");	/* should not happen */
1611 	if(state&BsCopied)
1612 		strcat(s, ",Copied");
1613 	if(state&BsVenti)
1614 		strcat(s, ",Venti");
1615 	if(state&BsClosed)
1616 		strcat(s, ",Closed");
1617 	return s;
1618 }
1619 
1620 char *
1621 bioStr(int iostate)
1622 {
1623 	switch(iostate){
1624 	default:
1625 		return "Unknown!!";
1626 	case BioEmpty:
1627 		return "Empty";
1628 	case BioLabel:
1629 		return "Label";
1630 	case BioClean:
1631 		return "Clean";
1632 	case BioDirty:
1633 		return "Dirty";
1634 	case BioReading:
1635 		return "Reading";
1636 	case BioWriting:
1637 		return "Writing";
1638 	case BioReadError:
1639 		return "ReadError";
1640 	case BioVentiError:
1641 		return "VentiError";
1642 	case BioMax:
1643 		return "Max";
1644 	}
1645 }
1646 
1647 static char *bttab[] = {
1648 	"BtData",
1649 	"BtData+1",
1650 	"BtData+2",
1651 	"BtData+3",
1652 	"BtData+4",
1653 	"BtData+5",
1654 	"BtData+6",
1655 	"BtData+7",
1656 	"BtDir",
1657 	"BtDir+1",
1658 	"BtDir+2",
1659 	"BtDir+3",
1660 	"BtDir+4",
1661 	"BtDir+5",
1662 	"BtDir+6",
1663 	"BtDir+7",
1664 };
1665 
1666 char*
1667 btStr(int type)
1668 {
1669 	if(type < nelem(bttab))
1670 		return bttab[type];
1671 	return "unknown";
1672 }
1673 
1674 int
1675 labelFmt(Fmt *f)
1676 {
1677 	Label *l;
1678 
1679 	l = va_arg(f->args, Label*);
1680 	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1681 		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1682 }
1683 
1684 int
1685 scoreFmt(Fmt *f)
1686 {
1687 	uchar *v;
1688 	int i;
1689 	u32int addr;
1690 
1691 	v = va_arg(f->args, uchar*);
1692 	if(v == nil){
1693 		fmtprint(f, "*");
1694 	}else if((addr = globalToLocal(v)) != NilBlock)
1695 		fmtprint(f, "0x%.8ux", addr);
1696 	else{
1697 		for(i = 0; i < VtScoreSize; i++)
1698 			fmtprint(f, "%2.2ux", v[i]);
1699 	}
1700 
1701 	return 0;
1702 }
1703 
1704 static int
1705 upHeap(int i, Block *b)
1706 {
1707 	Block *bb;
1708 	u32int now;
1709 	int p;
1710 	Cache *c;
1711 
1712 	c = b->c;
1713 	now = c->now;
1714 	for(; i != 0; i = p){
1715 		p = (i - 1) >> 1;
1716 		bb = c->heap[p];
1717 		if(b->used - now >= bb->used - now)
1718 			break;
1719 		c->heap[i] = bb;
1720 		bb->heap = i;
1721 	}
1722 	c->heap[i] = b;
1723 	b->heap = i;
1724 
1725 	return i;
1726 }
1727 
1728 static int
1729 downHeap(int i, Block *b)
1730 {
1731 	Block *bb;
1732 	u32int now;
1733 	int k;
1734 	Cache *c;
1735 
1736 	c = b->c;
1737 	now = c->now;
1738 	for(; ; i = k){
1739 		k = (i << 1) + 1;
1740 		if(k >= c->nheap)
1741 			break;
1742 		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1743 			k++;
1744 		bb = c->heap[k];
1745 		if(b->used - now <= bb->used - now)
1746 			break;
1747 		c->heap[i] = bb;
1748 		bb->heap = i;
1749 	}
1750 	c->heap[i] = b;
1751 	b->heap = i;
1752 	return i;
1753 }
1754 
1755 /*
1756  * Delete a block from the heap.
1757  * Called with c->lk held.
1758  */
1759 static void
1760 heapDel(Block *b)
1761 {
1762 	int i, si;
1763 	Cache *c;
1764 
1765 	c = b->c;
1766 
1767 	si = b->heap;
1768 	if(si == BadHeap)
1769 		return;
1770 	b->heap = BadHeap;
1771 	c->nheap--;
1772 	if(si == c->nheap)
1773 		return;
1774 	b = c->heap[c->nheap];
1775 	i = upHeap(si, b);
1776 	if(i == si)
1777 		downHeap(i, b);
1778 }
1779 
1780 /*
1781  * Insert a block into the heap.
1782  * Called with c->lk held.
1783  */
1784 static void
1785 heapIns(Block *b)
1786 {
1787 	assert(b->heap == BadHeap);
1788 	upHeap(b->c->nheap++, b);
1789 }
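
/*
 * Illustrative note, not part of the original source: upHeap and
 * downHeap compare ages as b->used - now in unsigned arithmetic, so
 * the ordering stays correct even after the c->now tick counter wraps
 * around.  A minimal sketch of that comparison, with hypothetical
 * timestamps:
 */
static int
exampleIsOlder(u32int used0, u32int used1, u32int now)
{
	/* non-zero if a block last used at used0 is the older of the two */
	return (u32int)(used0 - now) < (u32int)(used1 - now);
}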
1790 
1791 /*
1792  * Get just the label for a block.
1793  */
1794 static int
1795 readLabel(Cache *c, Label *l, u32int addr)
1796 {
1797 	int lpb;
1798 	Block *b;
1799 	u32int a;
1800 
1801 	lpb = c->size / LabelSize;
1802 	a = addr / lpb;
1803 	b = cacheLocal(c, PartLabel, a, OReadOnly);
1804 	if(b == nil){
1805 		blockPut(b);
1806 		return 0;
1807 	}
1808 
1809 	if(!labelUnpack(l, b->data, addr%lpb)){
1810 		blockPut(b);
1811 		return 0;
1812 	}
1813 	blockPut(b);
1814 	return 1;
1815 }
1816 
1817 /*
1818  * Process unlink queue.
1819  * Called with c->lk held.
1820  */
1821 static void
1822 unlinkBody(Cache *c)
1823 {
1824 	BList *p;
1825 
1826 	while(c->uhead != nil){
1827 		p = c->uhead;
1828 		c->uhead = p->next;
1829 		vtUnlock(c->lk);
1830 
1831 		if(!unlinkBlock(c, p->addr, p->type, p->tag, p->epoch))
1832 			fprint(2, "unlinkBlock failed: addr=%x type=%d tag = %ux: %r\n",
1833 				p->addr, p->type, p->tag);
1834 
1835 		vtLock(c->lk);
1836 		p->next = c->blfree;
1837 		c->blfree = p;
1838 	}
1839 }
1840 
1841 /*
1842  * Occasionally unlink the blocks on the cache unlink queue.
1843  */
1844 static void
1845 unlinkThread(void *a)
1846 {
1847 	Cache *c = a;
1848 
1849 	vtThreadSetName("unlink");
1850 
1851 	vtLock(c->lk);
1852 	for(;;){
1853 		while(c->uhead == nil && c->die == nil)
1854 			vtSleep(c->unlink);
1855 		if(c->die != nil)
1856 			break;
1857 		unlinkBody(c);
1858 	}
1859 	c->ref--;
1860 	vtWakeup(c->die);
1861 	vtUnlock(c->lk);
1862 }
1863 
1864 static int
1865 baddrCmp(void *a0, void *a1)
1866 {
1867 	BAddr *b0, *b1;
1868 	b0 = a0;
1869 	b1 = a1;
1870 
1871 	if(b0->part < b1->part)
1872 		return -1;
1873 	if(b0->part > b1->part)
1874 		return 1;
1875 	if(b0->addr < b1->addr)
1876 		return -1;
1877 	if(b0->addr > b1->addr)
1878 		return 1;
1879 	return 0;
1880 }
1881 
1882 /*
1883  * Scan the block list for dirty blocks; add them to the list c->baddr.
1884  */
1885 static void
1886 flushFill(Cache *c)
1887 {
1888 	int i, ndirty;
1889 	BAddr *p;
1890 	Block *b;
1891 
1892 	vtLock(c->lk);
1893 //	if(c->ndirty == 0){
1894 //		vtUnlock(c->lk);
1895 //		return;
1896 //	}
1897 
1898 	p = c->baddr;
1899 	ndirty = 0;
1900 	for(i=0; i<c->nblocks; i++){
1901 		b = c->blocks + i;
1902 		if(b->part == PartError)
1903 			continue;
1904 		if(b->iostate == BioDirty || b->iostate == BioWriting)
1905 			ndirty++;
1906 		if(b->iostate != BioDirty)
1907 			continue;
1908 		p->part = b->part;
1909 		p->addr = b->addr;
1910 		p->vers = b->vers;
1911 		p++;
1912 	}
1913 	if(ndirty != c->ndirty){
1914 		fprint(2, "ndirty mismatch expected %d found %d\n",
1915 			c->ndirty, ndirty);
1916 		c->ndirty = ndirty;
1917 	}
1918 	vtUnlock(c->lk);
1919 
1920 	c->bw = p - c->baddr;
1921 	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1922 }
1923 
1924 /*
1925  * This is not thread safe, i.e. it can't be called from multiple threads.
1926  *
1927  * It's okay how we use it, because it only gets called in
1928  * the flushThread.  And cacheFree, but only after
1929  * cacheFree has killed off the flushThread.
1930  */
1931 static int
1932 cacheFlushBlock(Cache *c)
1933 {
1934 	Block *b;
1935 	BAddr *p;
1936 	int lockfail, nfail;
1937 
1938 	nfail = 0;
1939 	for(;;){
1940 		if(c->br == c->be){
1941 			if(c->bw == 0 || c->bw == c->be)
1942 				flushFill(c);
1943 			c->br = 0;
1944 			c->be = c->bw;
1945 			c->bw = 0;
1946 			c->nflush = 0;
1947 		}
1948 
1949 		if(c->br == c->be)
1950 			return 0;
1951 		p = c->baddr + c->br;
1952 		c->br++;
1953 		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1954 
1955 		if(b && blockWrite(b)){
1956 			c->nflush++;
1957 			blockPut(b);
1958 			return 1;
1959 		}
1960 		if(b)
1961 			blockPut(b);
1962 
1963 		/*
1964 		 * Why didn't we write the block?
1965 		 */
1966 
1967 		/* Block already written out */
1968 		if(b == nil && !lockfail)
1969 			continue;
1970 
1971 		/* Failed to acquire lock; sleep if happens a lot. */
1972 		if(lockfail && ++nfail > 100)
1973 			sleep(500);
1974 
1975 		/* Requeue block. */
1976 		if(c->bw < c->be)
1977 			c->baddr[c->bw++] = *p;
1978 	}
1979 	return 0;
1980 }
1981 
1982 /*
1983  * Occasionally flush dirty blocks from memory to the disk.
1984  */
1985 static void
1986 flushThread(void *a)
1987 {
1988 	Cache *c = a;
1989 	int i;
1990 
1991 	vtThreadSetName("flush");
1992 	vtLock(c->lk);
1993 	while(c->die == nil){
1994 		vtSleep(c->flush);
1995 		vtUnlock(c->lk);
1996 		for(i=0; i<FlushSize; i++)
1997 			if(!cacheFlushBlock(c))
1998 				break;
1999 		vtLock(c->lk);
2000 		vtWakeupAll(c->flushwait);
2001 	}
2002 	c->ref--;
2003 	vtWakeup(c->die);
2004 	vtUnlock(c->lk);
2005 }
2006 
2007 /*
2008  * Keep flushing until everything is clean.
2009  */
2010 void
2011 cacheFlush(Cache *c, int wait)
2012 {
2013 	vtLock(c->lk);
2014 	if(wait){
2015 		while(c->ndirty){
2016 			consPrint("cacheFlush: %d dirty blocks\n", c->ndirty);
2017 			vtWakeup(c->flush);
2018 			vtSleep(c->flushwait);
2019 		}
2020 		consPrint("cacheFlush: done\n");
2021 	}else
2022 		vtWakeup(c->flush);
2023 	vtUnlock(c->lk);
2024 }
2025 
2026 /*
2027  * Kick the flushThread every 30 seconds.
2028  */
2029 static void
2030 cacheSync(void *v)
2031 {
2032 	Cache *c;
2033 
2034 	c = v;
2035 	cacheFlush(c, 0);
2036 }
2037