xref: /plan9/sys/src/cmd/fossil/cache.c (revision c9c4220f970dbfe62eed081bea8c061d239bd5f4)
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5 
6 #include "9.h"	/* for cacheFlush */
7 
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10 
11 enum {
12 	BadHeap = ~0,
13 };
14 
15 /*
16  * Store data to the memory cache in c->size blocks
17  * with the block zero-extended to fill it out.  When writing to
18  * Venti, the block will be zero-truncated.  The walker will also check
19  * that the block fits within psize or dsize as the case may be.
20  */
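/*
 * Illustrative example (not part of the original source; sizes are
 * hypothetical): with an 8192-byte cache block, a directory block
 * holding only 1200 bytes of real data is zero-extended to 8192 bytes
 * in memory; when it is later archived, the trailing zeros are
 * stripped again so only the real data is sent to Venti.
 */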
21 
22 struct Cache
23 {
24 	VtLock	*lk;
25 	VtLock	*dirtylk;
26 	int 	ref;
27 	int	mode;
28 
29 	Disk 	*disk;
30 	int	size;			/* block size */
31 	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
32 	VtSession *z;
33 	u32int	now;			/* ticks for usage timestamps */
34 	Block	**heads;		/* hash table for finding address */
35 	int	nheap;			/* number of available victims */
36 	Block	**heap;			/* heap for locating victims */
37 	long	nblocks;		/* number of blocks allocated */
38 	Block	*blocks;		/* array of block descriptors */
39 	u8int	*mem;			/* memory for all block data & blists */
40 
41 	BList	*blfree;
42 	VtRendez *blrend;
43 
44 	int 	ndirty;			/* number of dirty blocks in the cache */
45 	int 	maxdirty;		/* max number of dirty blocks */
46 	u32int	vers;
47 
48 	long hashSize;
49 
50 	FreeList *fl;
51 
52 	VtRendez *die;			/* daemon threads should die when != nil */
53 
54 	VtRendez *flush;
55 	VtRendez *flushwait;
56 	VtRendez *heapwait;
57 	BAddr *baddr;
58 	int bw, br, be;		/* write, read, and end indices into baddr; see cacheFlushBlock */
59 	int nflush;
60 
61 	Periodic *sync;
62 
63 	/* unlink daemon */
64 	BList *uhead;
65 	BList *utail;
66 	VtRendez *unlink;
67 
68 	/* block counts */
69 	int nused;
70 	int ndisk;
71 };
72 
73 struct BList {
74 	int part;
75 	u32int addr;
76 	uchar type;
77 	u32int tag;
78 	u32int epoch;
79 	u32int vers;
80 
81 	int recurse;	/* for block unlink */
82 
83 	/* for roll back */
84 	int index;			/* -1 indicates not valid */
85 	union {
86 		uchar score[VtScoreSize];
87 		uchar entry[VtEntrySize];
88 	} old;
89 	BList *next;
90 };
91 
92 struct BAddr {
93 	int part;
94 	u32int addr;
95 	u32int vers;
96 };
97 
98 struct FreeList {
99 	VtLock *lk;
100 	u32int last;		/* last block allocated */
101 	u32int end;		/* end of data partition */
102 	u32int nused;		/* number of used blocks */
103 	u32int epochLow;	/* low epoch when last updated nused */
104 };
105 
106 static FreeList *flAlloc(u32int end);
107 static void flFree(FreeList *fl);
108 
109 static Block *cacheBumpBlock(Cache *c);
110 static void heapDel(Block*);
111 static void heapIns(Block*);
112 static void cacheCheck(Cache*);
113 static void unlinkThread(void *a);
114 static void flushThread(void *a);
115 static void flushBody(Cache *c);
116 static void unlinkBody(Cache *c);
117 static int cacheFlushBlock(Cache *c);
118 static void cacheSync(void*);
119 static BList *blistAlloc(Block*);
120 static void blistFree(Cache*, BList*);
121 static void doRemoveLink(Cache*, BList*);
122 static void doRemoveLinkList(Cache*, BList*);
123 
124 /*
125  * Mapping from local block type to Venti type
126  */
127 int vtType[BtMax] = {
128 	VtDataType,		/* BtData | 0  */
129 	VtPointerType0,		/* BtData | 1  */
130 	VtPointerType1,		/* BtData | 2  */
131 	VtPointerType2,		/* BtData | 3  */
132 	VtPointerType3,		/* BtData | 4  */
133 	VtPointerType4,		/* BtData | 5  */
134 	VtPointerType5,		/* BtData | 6  */
135 	VtPointerType6,		/* BtData | 7  */
136 	VtDirType,		/* BtDir | 0  */
137 	VtPointerType0,		/* BtDir | 1  */
138 	VtPointerType1,		/* BtDir | 2  */
139 	VtPointerType2,		/* BtDir | 3  */
140 	VtPointerType3,		/* BtDir | 4  */
141 	VtPointerType4,		/* BtDir | 5  */
142 	VtPointerType5,		/* BtDir | 6  */
143 	VtPointerType6,		/* BtDir | 7  */
144 };
145 
146 /*
147  * Allocate the memory cache.
148  */
149 Cache *
150 cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
151 {
152 	int i;
153 	Cache *c;
154 	Block *b;
155 	BList *bl;
156 	u8int *p;
157 	int nbl;
158 
159 	c = vtMemAllocZ(sizeof(Cache));
160 
161 	/* reasonable number of BList elements */
162 	nbl = nblocks * 4;
163 
164 	c->lk = vtLockAlloc();
165 	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
166 	c->ref = 1;
167 	c->disk = disk;
168 	c->z = z;
169 	c->size = diskBlockSize(disk);
170 bwatchSetBlockSize(c->size);
171 	/* round c->size up to be a nice multiple */
172 	c->size = (c->size + 127) & ~127;
173 	c->ndmap = (c->size/20 + 7) / 8;
174 	c->nblocks = nblocks;
175 	c->hashSize = nblocks;
176 	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
177 	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
178 	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
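	/*
	 * Layout of c->mem, carved up by the loops below: nblocks blocks
	 * of c->size bytes of block data, then nbl BList structures for
	 * the free BList pool, then nblocks dirty maps of c->ndmap bytes.
	 */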
179 	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
180 	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
181 	c->mode = mode;
182 	c->vers++;
183 	p = c->mem;
184 	for(i = 0; i < nblocks; i++){
185 		b = &c->blocks[i];
186 		b->lk = vtLockAlloc();
187 		b->c = c;
188 		b->data = p;
189 		b->heap = i;
190 		b->ioready = vtRendezAlloc(b->lk);
191 		c->heap[i] = b;
192 		p += c->size;
193 	}
194 	c->nheap = nblocks;
195 	for(i = 0; i < nbl; i++){
196 		bl = (BList*)p;
197 		bl->next = c->blfree;
198 		c->blfree = bl;
199 		p += sizeof(BList);
200 	}
201 	/* separate loop to keep blocks and blists reasonably aligned */
202 	for(i = 0; i < nblocks; i++){
203 		b = &c->blocks[i];
204 		b->dmap = p;
205 		p += c->ndmap;
206 	}
207 
208 	c->blrend = vtRendezAlloc(c->lk);
209 
210 	c->maxdirty = nblocks*(DirtyPercentage*0.01);
211 
212 	c->fl = flAlloc(diskSize(disk, PartData));
213 
214 	c->unlink = vtRendezAlloc(c->lk);
215 	c->flush = vtRendezAlloc(c->lk);
216 	c->flushwait = vtRendezAlloc(c->lk);
217 	c->heapwait = vtRendezAlloc(c->lk);
218 	c->sync = periodicAlloc(cacheSync, c, 30*1000);
219 
220 	if(mode == OReadWrite){
221 		c->ref += 2;
222 		vtThread(unlinkThread, c);
223 		vtThread(flushThread, c);
224 	}
225 	cacheCheck(c);
226 
227 	return c;
228 }
229 
230 /*
231  * Free the whole memory cache, flushing all dirty blocks to the disk.
232  */
233 void
234 cacheFree(Cache *c)
235 {
236 	int i;
237 
238 	/* kill off daemon threads */
239 	vtLock(c->lk);
240 	c->die = vtRendezAlloc(c->lk);
241 	periodicKill(c->sync);
242 	vtWakeup(c->flush);
243 	vtWakeup(c->unlink);
244 	while(c->ref > 1)
245 		vtSleep(c->die);
246 
247 	/* flush everything out */
248 	do {
249 		unlinkBody(c);
250 		vtUnlock(c->lk);
251 		while(cacheFlushBlock(c))
252 			;
253 		diskFlush(c->disk);
254 		vtLock(c->lk);
255 	} while(c->uhead || c->ndirty);
256 	vtUnlock(c->lk);
257 
258 	cacheCheck(c);
259 
260 	for(i = 0; i < c->nblocks; i++){
261 		assert(c->blocks[i].ref == 0);
262 		vtRendezFree(c->blocks[i].ioready);
263 		vtLockFree(c->blocks[i].lk);
264 	}
265 	flFree(c->fl);
266 	vtMemFree(c->baddr);
267 	vtMemFree(c->heads);
268 	vtMemFree(c->blocks);
269 	vtMemFree(c->mem);
270 	vtLockFree(c->lk);
271 	diskFree(c->disk);
272 	vtRendezFree(c->blrend);
273 	/* don't close vtSession */
274 	vtMemFree(c);
275 }
276 
277 static void
278 cacheDump(Cache *c)
279 {
280 	int i;
281 	Block *b;
282 
283 	for(i = 0; i < c->nblocks; i++){
284 		b = &c->blocks[i];
285 		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
286 			i, b->part, b->addr, b->score, b->l.type, b->ref,
287 			bsStr(b->l.state), bioStr(b->iostate), b->pc);
288 	}
289 }
290 
291 static void
292 cacheCheck(Cache *c)
293 {
294 	u32int size, now;
295 	int i, k, refed;
296 	static uchar zero[VtScoreSize];
297 	Block *b;
298 
299 	size = c->size;
300 	now = c->now;
301 
302 	for(i = 0; i < c->nheap; i++){
303 		if(c->heap[i]->heap != i)
304 			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
305 		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
306 			vtFatal("bad heap ordering");
307 		k = (i << 1) + 1;
308 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
309 			vtFatal("bad heap ordering");
310 		k++;
311 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
312 			vtFatal("bad heap ordering");
313 	}
314 
315 	refed = 0;
316 	for(i = 0; i < c->nblocks; i++){
317 		b = &c->blocks[i];
318 		if(b->data != &c->mem[i * size])
319 			vtFatal("mis-blocked at %d", i);
320 		if(b->ref && b->heap == BadHeap){
321 			refed++;
322 		}
323 	}
324 if(c->nheap + refed != c->nblocks){
325 fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
326 cacheDump(c);
327 }
328 	assert(c->nheap + refed == c->nblocks);
329 	refed = 0;
330 	for(i = 0; i < c->nblocks; i++){
331 		b = &c->blocks[i];
332 		if(b->ref){
333 if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
334 			refed++;
335 		}
336 	}
337 if(refed > 0)fprint(2, "cacheCheck: in use %d\n", refed);
338 }
339 
340 
341 /*
342  * locate the block with the oldest second to last use.
343  * remove it from the heap, and fix up the heap.
344  */
345 /* called with c->lk held */
346 static Block *
347 cacheBumpBlock(Cache *c)
348 {
349 	int printed;
350 	Block *b;
351 
352 	/*
353 	 * locate the block with the oldest second to last use.
354 	 * remove it from the heap, and fix up the heap.
355 	 */
356 	printed = 0;
357 	if(c->nheap == 0){
358 		while(c->nheap == 0){
359 			vtWakeup(c->flush);
360 			vtSleep(c->heapwait);
361 			if(c->nheap == 0){
362 				printed = 1;
363 				fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
364 			}
365 		}
366 		if(printed)
367 			fprint(2, "cache is okay again, %d dirty\n", c->ndirty);
368 	}
369 
370 	b = c->heap[0];
371 	heapDel(b);
372 
373 	assert(b->heap == BadHeap);
374 	assert(b->ref == 0);
375 	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
376 	assert(b->prior == nil);
377 	assert(b->uhead == nil);
378 
379 	/*
380 	 * unchain the block from hash chain
381 	 */
382 	if(b->prev){
383 		*(b->prev) = b->next;
384 		if(b->next)
385 			b->next->prev = b->prev;
386 		b->prev = nil;
387 	}
388 
389 
390 if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
391 	/* set block to a reasonable state */
392 	b->ref = 1;
393 	b->part = PartError;
394 	memset(&b->l, 0, sizeof(b->l));
395 	b->iostate = BioEmpty;
396 
397 	return b;
398 }
399 
400 /*
401  * look for a particular version of the block in the memory cache.
402  */
403 static Block *
404 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
405 	int waitlock, int *lockfailure)
406 {
407 	Block *b;
408 	ulong h;
409 
410 	h = addr % c->hashSize;
411 
412 	if(lockfailure)
413 		*lockfailure = 0;
414 
415 	/*
416 	 * look for the block in the cache
417 	 */
418 	vtLock(c->lk);
419 	for(b = c->heads[h]; b != nil; b = b->next){
420 		if(b->part == part && b->addr == addr)
421 			break;
422 	}
423 	if(b == nil || b->vers != vers){
424 		vtUnlock(c->lk);
425 		return nil;
426 	}
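	/*
	 * lockfailure may be nil only when waitlock is set; the branch
	 * below stores into it without checking for nil.
	 */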
427 	if(!waitlock && !vtCanLock(b->lk)){
428 		*lockfailure = 1;
429 		vtUnlock(c->lk);
430 		return nil;
431 	}
432 	heapDel(b);
433 	b->ref++;
434 	vtUnlock(c->lk);
435 
436 	bwatchLock(b);
437 	if(waitlock)
438 		vtLock(b->lk);
439 	b->nlock = 1;
440 
441 	for(;;){
442 		switch(b->iostate){
443 		default:
444 			abort();
445 		case BioEmpty:
446 		case BioLabel:
447 		case BioClean:
448 		case BioDirty:
449 			if(b->vers != vers){
450 				blockPut(b);
451 				return nil;
452 			}
453 			return b;
454 		case BioReading:
455 		case BioWriting:
456 			vtSleep(b->ioready);
457 			break;
458 		case BioVentiError:
459 			blockPut(b);
460 			vtSetError("venti i/o error block 0x%.8ux", addr);
461 			return nil;
462 		case BioReadError:
463 			blockPut(b);
464 			vtSetError("error reading block 0x%.8ux", addr);
465 			return nil;
466 		}
467 	}
468 	/* NOT REACHED */
469 }
470 static Block*
471 cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
472 {
473 	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
474 }
475 
476 
477 /*
478  * fetch a local (on-disk) block from the memory cache.
479  * if it's not there, load it, bumping some other block.
480  */
481 Block *
482 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
483 {
484 	Block *b;
485 	ulong h;
486 
487 	assert(part != PartVenti);
488 
489 	h = addr % c->hashSize;
490 
491 	/*
492 	 * look for the block in the cache
493 	 */
494 	vtLock(c->lk);
495 	for(b = c->heads[h]; b != nil; b = b->next){
496 		if(b->part != part || b->addr != addr)
497 			continue;
498 		if(epoch && b->l.epoch != epoch){
499 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
500 			vtUnlock(c->lk);
501 			vtSetError(ELabelMismatch);
502 			return nil;
503 		}
504 		heapDel(b);
505 		b->ref++;
506 		break;
507 	}
508 
509 	if(b == nil){
510 		b = cacheBumpBlock(c);
511 
512 		b->part = part;
513 		b->addr = addr;
514 		localToGlobal(addr, b->score);
515 
516 		/* chain onto correct hash */
517 		b->next = c->heads[h];
518 		c->heads[h] = b;
519 		if(b->next != nil)
520 			b->next->prev = &b->next;
521 		b->prev = &c->heads[h];
522 	}
523 
524 	vtUnlock(c->lk);
525 
526 	/*
527 	 * BUG: what if the epoch changes right here?
528 	 * In the worst case, we could end up in some weird
529 	 * lock loop, because the block we want no longer exists,
530 	 * and instead we're trying to lock a block we have no
531 	 * business grabbing.
532 	 *
533 	 * For now, I'm not going to worry about it.
534 	 */
535 
536 if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
537 	bwatchLock(b);
538 	vtLock(b->lk);
539 	b->nlock = 1;
540 
541 	if(part == PartData && b->iostate == BioEmpty){
542 		if(!readLabel(c, &b->l, addr)){
543 			blockPut(b);
544 			return nil;
545 		}
546 		blockSetIOState(b, BioLabel);
547 	}
548 	if(epoch && b->l.epoch != epoch){
549 		blockPut(b);
550 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
551 		vtSetError(ELabelMismatch);
552 		return nil;
553 	}
554 
555 	b->pc = getcallerpc(&c);
556 	for(;;){
557 		switch(b->iostate){
558 		default:
559 			abort();
560 		case BioEmpty:
561 		case BioLabel:
562 			if(mode == OOverWrite){
563 				blockSetIOState(b, BioClean);
564 				return b;
565 			}
566 			diskRead(c->disk, b);
567 			vtSleep(b->ioready);
568 			break;
569 		case BioClean:
570 		case BioDirty:
571 			return b;
572 		case BioReading:
573 		case BioWriting:
574 			vtSleep(b->ioready);
575 			break;
576 		case BioReadError:
577 			blockSetIOState(b, BioEmpty);
578 			blockPut(b);
579 			vtSetError("error reading block 0x%.8ux", addr);
580 			return nil;
581 		}
582 	}
583 	/* NOT REACHED */
584 }
585 
586 Block *
587 cacheLocal(Cache *c, int part, u32int addr, int mode)
588 {
589 	return _cacheLocal(c, part, addr, mode, 0);
590 }
591 
592 /*
593  * fetch a local (on-disk) block from the memory cache.
594  * if it's not there, load it, bumping some other block.
595  * check tag and type.
596  */
597 Block *
598 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
599 {
600 	Block *b;
601 
602 	b = _cacheLocal(c, PartData, addr, mode, epoch);
603 	if(b == nil)
604 		return nil;
605 	if(b->l.type != type || b->l.tag != tag){
606 		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
607 			addr, b->l.type, type, b->l.tag, tag);
608 		vtSetError(ELabelMismatch);
609 		blockPut(b);
610 		return nil;
611 	}
612 	b->pc = getcallerpc(&c);
613 	return b;
614 }
615 
616 /*
617  * fetch a global (Venti) block from the memory cache.
618  * if it's not there, load it, bumping some other block.
619  * check tag and type if it's really a local block in disguise.
620  */
621 Block *
622 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
623 {
624 	int n;
625 	Block *b;
626 	ulong h;
627 	u32int addr;
628 
629 	addr = globalToLocal(score);
630 	if(addr != NilBlock){
631 		b = cacheLocalData(c, addr, type, tag, mode, 0);
632 		if(b)
633 			b->pc = getcallerpc(&c);
634 		return b;
635 	}
636 
637 	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
638 
639 	/*
640 	 * look for the block in the cache
641 	 */
642 	vtLock(c->lk);
643 	for(b = c->heads[h]; b != nil; b = b->next){
644 		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
645 			continue;
646 		heapDel(b);
647 		b->ref++;
648 		break;
649 	}
650 
651 	if(b == nil){
652 if(0)fprint(2, "cacheGlobal %V %d\n", score, type);
653 
654 		b = cacheBumpBlock(c);
655 
656 		b->part = PartVenti;
657 		b->addr = NilBlock;
658 		b->l.type = type;
659 		memmove(b->score, score, VtScoreSize);
660 
661 		/* chain onto correct hash */
662 		b->next = c->heads[h];
663 		c->heads[h] = b;
664 		if(b->next != nil)
665 			b->next->prev = &b->next;
666 		b->prev = &c->heads[h];
667 	}
668 	vtUnlock(c->lk);
669 
670 	bwatchLock(b);
671 	vtLock(b->lk);
672 	b->nlock = 1;
673 	b->pc = getcallerpc(&c);
674 
675 	switch(b->iostate){
676 	default:
677 		abort();
678 	case BioEmpty:
679 		n = vtRead(c->z, score, vtType[type], b->data, c->size);
680 		if(n < 0 || !vtSha1Check(score, b->data, n)){
681 			blockSetIOState(b, BioVentiError);
682 			blockPut(b);
683 			vtSetError(
684 				"venti error reading block %V or wrong score: %r",
685 				score);
686 			return nil;
687 		}
688 		vtZeroExtend(vtType[type], b->data, n, c->size);
689 		blockSetIOState(b, BioClean);
690 		return b;
691 	case BioClean:
692 		return b;
693 	case BioVentiError:
694 		blockPut(b);
695 		vtSetError("venti i/o error or wrong score, block %V", score);
696 		return nil;
697 	case BioReadError:
698 		blockPut(b);
699 		vtSetError("error reading block %V", b->score);
700 		return nil;
701 	}
702 	/* NOT REACHED */
703 }
704 
705 /*
706  * allocate a new on-disk block and load it into the memory cache.
707  * BUG: if the disk is full, should we flush some of it to Venti?
708  */
709 static u32int lastAlloc;
710 
711 Block *
712 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
713 {
714 	FreeList *fl;
715 	u32int addr;
716 	Block *b;
717 	int n, nwrap;
718 	Label lab;
719 
720 	n = c->size / LabelSize;
721 	fl = c->fl;
722 
723 	vtLock(fl->lk);
724 	addr = fl->last;
725 	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
726 	if(b == nil){
727 		fprint(2, "cacheAllocBlock: xxx %R\n");
728 		vtUnlock(fl->lk);
729 		return nil;
730 	}
731 	nwrap = 0;
732 	for(;;){
733 		if(++addr >= fl->end){
734 			addr = 0;
735 			if(++nwrap >= 2){
736 				blockPut(b);
737 				fl->last = 0;
738 				vtSetError("disk is full");
739 				fprint(2, "cacheAllocBlock: xxx1 %R\n");
740 				vtUnlock(fl->lk);
741 				return nil;
742 			}
743 		}
744 		if(addr%n == 0){
745 			blockPut(b);
746 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
747 			if(b == nil){
748 				fl->last = addr;
749 				fprint(2, "cacheAllocBlock: xxx2 %R\n");
750 				vtUnlock(fl->lk);
751 				return nil;
752 			}
753 		}
754 		if(!labelUnpack(&lab, b->data, addr%n))
755 			continue;
756 		if(lab.state == BsFree)
757 			goto Found;
758 		if(lab.state&BsClosed)
759 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
760 			goto Found;
761 	}
762 Found:
763 	blockPut(b);
764 	b = cacheLocal(c, PartData, addr, OOverWrite);
765 	if(b == nil){
766 		fprint(2, "cacheAllocBlock: xxx3 %R\n");
767 		return nil;
768 	}
769 assert(b->iostate == BioLabel || b->iostate == BioClean);
770 	fl->last = addr;
771 	lab.type = type;
772 	lab.tag = tag;
773 	lab.state = BsAlloc;
774 	lab.epoch = epoch;
775 	lab.epochClose = ~(u32int)0;
776 	if(!blockSetLabel(b, &lab, 1)){
777 		fprint(2, "cacheAllocBlock: xxx4 %R\n");
778 		blockPut(b);
779 		return nil;
780 	}
781 	vtZeroExtend(vtType[type], b->data, 0, c->size);
782 if(0)diskWrite(c->disk, b);
783 
784 if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
785 	lastAlloc = addr;
786 	fl->nused++;
787 	vtUnlock(fl->lk);
788 	b->pc = getcallerpc(&c);
789 	return b;
790 }
791 
792 int
793 cacheDirty(Cache *c)
794 {
795 	return c->ndirty;
796 }
797 
798 void
799 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
800 {
801 	int n;
802 	u32int addr, nused;
803 	Block *b;
804 	Label lab;
805 	FreeList *fl;
806 
807 	fl = c->fl;
808 	n = c->size / LabelSize;
809 	*bsize = c->size;
810 	vtLock(fl->lk);
811 	if(fl->epochLow == epochLow){
812 		*used = fl->nused;
813 		*total = fl->end;
814 		vtUnlock(fl->lk);
815 		return;
816 	}
817 	b = nil;
818 	nused = 0;
819 	for(addr=0; addr<fl->end; addr++){
820 		if(addr%n == 0){
821 			blockPut(b);
822 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
823 			if(b == nil){
824 				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
825 				break;
826 			}
827 		}
828 		if(!labelUnpack(&lab, b->data, addr%n))
829 			continue;
830 		if(lab.state == BsFree)
831 			continue;
832 		if(lab.state&BsClosed)
833 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
834 			continue;
835 		nused++;
836 	}
837 	blockPut(b);
838 	if(addr == fl->end){
839 		fl->nused = nused;
840 		fl->epochLow = epochLow;
841 	}
842 	*used = nused;
843 	*total = fl->end;
844 	vtUnlock(fl->lk);
845 	return;
846 }
847 
848 static FreeList *
849 flAlloc(u32int end)
850 {
851 	FreeList *fl;
852 
853 	fl = vtMemAllocZ(sizeof(*fl));
854 	fl->lk = vtLockAlloc();
855 	fl->last = 0;
856 	fl->end = end;
857 	return fl;
858 }
859 
860 static void
861 flFree(FreeList *fl)
862 {
863 	vtLockFree(fl->lk);
864 	vtMemFree(fl);
865 }
866 
867 u32int
868 cacheLocalSize(Cache *c, int part)
869 {
870 	return diskSize(c->disk, part);
871 }
872 
873 /*
874  * The thread that has locked b may refer to it by
875  * multiple names.  Nlock counts the number of
876  * references the locking thread holds.  It will call
877  * blockPut once per reference.
878  */
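/*
 * Usage sketch (illustrative only; "helper" is a hypothetical routine,
 * not one defined in this file):
 *
 *	blockDupLock(b);	// hand a second reference to a helper
 *	helper(b);		// helper calls blockPut(b) when done
 *	...
 *	blockPut(b);		// release the caller's own reference
 */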
879 void
880 blockDupLock(Block *b)
881 {
882 	assert(b->nlock > 0);
883 	b->nlock++;
884 }
885 
886 /*
887  * we're done with the block.
888  * unlock it.  can't use it after calling this.
889  */
890 void
891 blockPut(Block* b)
892 {
893 	Cache *c;
894 
895 	if(b == nil)
896 		return;
897 
898 if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));
899 
900 	if(b->iostate == BioDirty)
901 		bwatchDependency(b);
902 
903 	if(--b->nlock > 0)
904 		return;
905 
906 	/*
907 	 * b->nlock should probably stay at zero while
908 	 * the block is unlocked, but diskThread and vtSleep
909 	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
910 	 * so we have to keep b->nlock set to 1 even
911 	 * when the block is unlocked.
912 	 */
913 	assert(b->nlock == 0);
914 	b->nlock = 1;
915 //	b->pc = 0;
916 
917 	bwatchUnlock(b);
918 	vtUnlock(b->lk);
919 	c = b->c;
920 	vtLock(c->lk);
921 
922 	if(--b->ref > 0){
923 		vtUnlock(c->lk);
924 		return;
925 	}
926 
927 	assert(b->ref == 0);
928 	switch(b->iostate){
929 	default:
930 		b->used = c->now++;
931 		heapIns(b);
932 		break;
933 	case BioEmpty:
934 	case BioLabel:
935 		if(c->nheap == 0)
936 			b->used = c->now++;
937 		else
938 			b->used = c->heap[0]->used;
939 		heapIns(b);
940 		break;
941 	case BioDirty:
942 		break;
943 	}
944 	vtUnlock(c->lk);
945 }
946 
947 /*
948  * set the label associated with a block.
949  */
950 Block*
951 _blockSetLabel(Block *b, Label *l)
952 {
953 	int lpb;
954 	Block *bb;
955 	u32int a;
956 	Cache *c;
957 
958 	c = b->c;
959 
960 	assert(b->part == PartData);
961 	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
962 	lpb = c->size / LabelSize;
963 	a = b->addr / lpb;
964 	bb = cacheLocal(c, PartLabel, a, OReadWrite);
965 	if(bb == nil){
966 		blockPut(b);
967 		return nil;
968 	}
969 	b->l = *l;
970 	labelPack(l, bb->data, b->addr%lpb);
971 	blockDirty(bb);
972 	return bb;
973 }
974 
975 int
976 blockSetLabel(Block *b, Label *l, int allocating)
977 {
978 	Block *lb;
979 	Label oldl;
980 
981 	oldl = b->l;
982 	lb = _blockSetLabel(b, l);
983 	if(lb == nil)
984 		return 0;
985 
986 	/*
987 	 * If we're allocating the block, make sure the label (bl)
988 	 * goes to disk before the data block (b) itself.  This is to help
989 	 * the blocks that in turn depend on b.
990 	 *
991 	 * Suppose bx depends on (must be written out after) b.
992 	 * Once we write b we'll think it's safe to write bx.
993 	 * Bx can't get at b unless it has a valid label, though.
994 	 *
995 	 * Allocation is the only case in which having a current label
996 	 * is vital because:
997 	 *
998 	 *	- l.type is set at allocation and never changes.
999 	 *	- l.tag is set at allocation and never changes.
1000 	 *	- l.state is not checked when we load blocks.
1001 	 *	- the archiver cares deeply about l.state being
1002 	 *		BaActive vs. BaCopied, but that's handled
1003 	 *		by direct calls to _blockSetLabel.
1004 	 */
1005 
1006 	if(allocating)
1007 		blockDependency(b, lb, -1, nil, nil);
1008 	blockPut(lb);
1009 	return 1;
1010 }
1011 
1012 /*
1013  * Record that bb must be written out before b.
1014  * If index is given, we're about to overwrite the score/e
1015  * at that index in the block.  Save the old value so we
1016  * can write a safer ``old'' version of the block if pressed.
1017  */
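/*
 * Usage sketch (illustrative; actual callers live elsewhere in fossil):
 * before overwriting entry i of a directory data block b with a pointer
 * to the freshly dirtied block bb, a caller records the dependency and
 * saves the old entry e so blockRollback can reconstruct a safe copy:
 *
 *	blockDependency(b, bb, i, nil, &e);
 */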
1018 void
1019 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1020 {
1021 	BList *p;
1022 
1023 	if(bb->iostate == BioClean)
1024 		return;
1025 
1026 	/*
1027 	 * Dependencies for blocks containing Entry structures
1028 	 * or scores must always be explained.  The problem with
1029 	 * only explaining some of them is this.  Suppose we have two
1030 	 * dependencies for the same field, the first explained
1031 	 * and the second not.  We try to write the block when the first
1032 	 * dependency is not written but the second is.  We will roll back
1033 	 * the first change even though the second trumps it.
1034 	 */
1035 	if(index == -1 && bb->part == PartData)
1036 		assert(b->l.type == BtData);
1037 
1038 	if(bb->iostate != BioDirty){
1039 		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
1040 			bb->part, bb->addr, bb->l.type, bb->iostate);
1041 		abort();
1042 	}
1043 
1044 	p = blistAlloc(bb);
1045 	if(p == nil)
1046 		return;
1047 
1048 	assert(bb->iostate == BioDirty);
1049 if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1050 
1051 	p->part = bb->part;
1052 	p->addr = bb->addr;
1053 	p->type = bb->l.type;
1054 	p->vers = bb->vers;
1055 	p->index = index;
1056 	if(p->index >= 0){
1057 		/*
1058 		 * This test would just be b->l.type==BtDir except
1059 		 * we need to exclude the super block.
1060 		 */
1061 		if(b->l.type == BtDir && b->part == PartData)
1062 			entryPack(e, p->old.entry, 0);
1063 		else
1064 			memmove(p->old.score, score, VtScoreSize);
1065 	}
1066 	p->next = b->prior;
1067 	b->prior = p;
1068 }
1069 
1070 /*
1071  * Mark an in-memory block as dirty.  If there are too many
1072  * dirty blocks, start writing some out to disk.
1073  *
1074  * If there were way too many dirty blocks, we used to
1075  * try to do some flushing ourselves, but it's just too dangerous --
1076  * it implies that the callers cannot have any of our priors locked,
1077  * but this is hard to avoid in some cases.
1078  */
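/*
 * Typical caller pattern (illustrative; off, buf, and n are hypothetical):
 *
 *	b = cacheLocal(c, PartData, addr, OReadWrite);
 *	if(b == nil)
 *		return 0;
 *	memmove(b->data+off, buf, n);	// modify the in-memory copy
 *	blockDirty(b);			// mark it for the flush thread
 *	blockPut(b);
 */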
1079 int
1080 blockDirty(Block *b)
1081 {
1082 	Cache *c;
1083 
1084 	c = b->c;
1085 
1086 	assert(b->part != PartVenti);
1087 
1088 	if(b->iostate == BioDirty)
1089 		return 1;
1090 	assert(b->iostate == BioClean);
1091 
1092 	vtLock(c->dirtylk);
1093 	vtLock(c->lk);
1094 	b->iostate = BioDirty;
1095 	c->ndirty++;
1096 	if(c->ndirty > (c->maxdirty>>1))
1097 		vtWakeup(c->flush);
1098 	vtUnlock(c->lk);
1099 	vtUnlock(c->dirtylk);
1100 
1101 	return 1;
1102 }
1103 
1104 /*
1105  * We've decided to write out b.  Maybe b has some pointers to blocks
1106  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
1107  * copy of b that is safe to write out.  (diskThread will make sure the block
1108  * remains marked as dirty.)
1109  */
1110 uchar *
1111 blockRollback(Block *b, uchar *buf)
1112 {
1113 	u32int addr;
1114 	BList *p;
1115 	Super super;
1116 
1117 	/* easy case */
1118 	if(b->prior == nil)
1119 		return b->data;
1120 
1121 	memmove(buf, b->data, b->c->size);
1122 	for(p=b->prior; p; p=p->next){
1123 		/*
1124 		 * we know p->index >= 0 because blockWrite has vetted this block for us.
1125 		 */
1126 		assert(p->index >= 0);
1127 		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1128 		if(b->part == PartSuper){
1129 			assert(p->index == 0);
1130 			superUnpack(&super, buf);
1131 			addr = globalToLocal(p->old.score);
1132 			if(addr == NilBlock){
1133 				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
1134 				abort();
1135 			}
1136 			super.active = addr;
1137 			superPack(&super, buf);
1138 			continue;
1139 		}
1140 		if(b->l.type == BtDir)
1141 			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1142 		else
1143 			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1144 	}
1145 	return buf;
1146 }
1147 
1148 /*
1149  * Try to write block b.
1150  * If b depends on other blocks:
1151  *
1152  *	If the block has been written out, remove the dependency.
1153  *	If the dependency is replaced by a more recent dependency,
1154  *		throw it out.
1155  *	If we know how to write out an old version of b that doesn't
1156  *		depend on it, do that.
1157  *
1158  *	Otherwise, bail.
1159  */
1160 int
1161 blockWrite(Block *b)
1162 {
1163 	uchar *dmap;
1164 	Cache *c;
1165 	BList *p, **pp;
1166 	Block *bb;
1167 	int lockfail;
1168 
1169 	c = b->c;
1170 
1171 	if(b->iostate != BioDirty)
1172 		return 1;
1173 
1174 	dmap = b->dmap;
1175 	memset(dmap, 0, c->ndmap);
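	/*
	 * dmap is a bitmap over prior-list indices: bit p->index is set
	 * once we learn the dependency for that index has already been
	 * written out, so any older dependency on the same index can be
	 * discarded below.
	 */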
1176 	pp = &b->prior;
1177 	for(p=*pp; p; p=*pp){
1178 		if(p->index >= 0){
1179 			/* more recent dependency has succeeded; this one can go */
1180 			if(dmap[p->index/8] & (1<<(p->index%8)))
1181 				goto ignblock;
1182 		}
1183 
1184 		lockfail = 0;
1185 		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1186 		if(bb == nil){
1187 			if(lockfail)
1188 				return 0;
1189 			/* block not in cache => was written already */
1190 			dmap[p->index/8] |= 1<<(p->index%8);
1191 			goto ignblock;
1192 		}
1193 
1194 		/*
1195 		 * same version of block is still in cache.
1196 		 *
1197 		 * the assertion is true because the block still has version p->vers,
1198 		 * which means it hasn't been written out since we last saw it.
1199 		 */
1200 		if(bb->iostate != BioDirty){
1201 			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
1202 				bb->part, bb->addr, bb->l.type, bb->iostate);
1203 			/* probably BioWriting if it happens? */
1204 			if(bb->iostate == BioClean)
1205 				goto ignblock;
1206 		}
1207 
1208 		blockPut(bb);
1209 
1210 		if(p->index < 0){
1211 			/*
1212 			 * We don't know how to temporarily undo
1213 			 * b's dependency on bb, so just don't write b yet.
1214 			 */
1215 			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1216 				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1217 			return 0;
1218 		}
1219 		/* keep walking down the list */
1220 		pp = &p->next;
1221 		continue;
1222 
1223 ignblock:
1224 		*pp = p->next;
1225 		blistFree(c, p);
1226 		continue;
1227 	}
1228 
1229 	/*
1230 	 * DiskWrite must never be called with a double-locked block.
1231 	 * This call to diskWrite is okay because blockWrite is only called
1232 	 * from the cache flush thread, which never double-locks a block.
1233 	 */
1234 	diskWrite(c->disk, b);
1235 	return 1;
1236 }
1237 
1238 /*
1239  * Change the I/O state of block b.
1240  * Just an assignment except for magic in
1241  * switch statement (read comments there).
1242  */
1243 void
1244 blockSetIOState(Block *b, int iostate)
1245 {
1246 	int dowakeup;
1247 	Cache *c;
1248 	BList *p, *q;
1249 
1250 if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1251 
1252 	c = b->c;
1253 
1254 	dowakeup = 0;
1255 	switch(iostate){
1256 	default:
1257 		abort();
1258 	case BioEmpty:
1259 		assert(!b->uhead);
1260 		break;
1261 	case BioLabel:
1262 		assert(!b->uhead);
1263 		break;
1264 	case BioClean:
1265 		bwatchDependency(b);
1266 		/*
1267 		 * If b->prior is set, it means a write just finished.
1268 		 * The prior list isn't needed anymore.
1269 		 */
1270 		for(p=b->prior; p; p=q){
1271 			q = p->next;
1272 			blistFree(c, p);
1273 		}
1274 		b->prior = nil;
1275 		/*
1276 		 * Freeing a block or just finished a write.
1277 		 * Move the blocks from the per-block unlink
1278 		 * queue to the cache unlink queue.
1279 		 */
1280 		if(b->iostate == BioDirty || b->iostate == BioWriting){
1281 			vtLock(c->lk);
1282 			c->ndirty--;
1283 			b->iostate = iostate;	/* change here to keep in sync with ndirty */
1284 			b->vers = c->vers++;
1285 			if(b->uhead){
1286 				/* add unlink blocks to unlink queue */
1287 				if(c->uhead == nil){
1288 					c->uhead = b->uhead;
1289 					vtWakeup(c->unlink);
1290 				}else
1291 					c->utail->next = b->uhead;
1292 				c->utail = b->utail;
1293 				b->uhead = nil;
1294 			}
1295 			vtUnlock(c->lk);
1296 		}
1297 		assert(!b->uhead);
1298 		dowakeup = 1;
1299 		break;
1300 	case BioDirty:
1301 		/*
1302 		 * Wrote out an old version of the block (see blockRollback).
1303 		 * Bump a version count, leave it dirty.
1304 		 */
1305 		if(b->iostate == BioWriting){
1306 			vtLock(c->lk);
1307 			b->vers = c->vers++;
1308 			vtUnlock(c->lk);
1309 			dowakeup = 1;
1310 		}
1311 		break;
1312 	case BioReading:
1313 	case BioWriting:
1314 		/*
1315 		 * Adding block to disk queue.  Bump reference count.
1316 		 * diskThread decs the count later by calling blockPut.
1317 		 * This is here because we need to lock c->lk to
1318 		 * manipulate the ref count.
1319 		 */
1320 		vtLock(c->lk);
1321 		b->ref++;
1322 		vtUnlock(c->lk);
1323 		break;
1324 	case BioReadError:
1325 	case BioVentiError:
1326 		/*
1327 		 * Oops.
1328 		 */
1329 		dowakeup = 1;
1330 		break;
1331 	}
1332 	b->iostate = iostate;
1333 	/*
1334 	 * Now that the state has changed, we can wake the waiters.
1335 	 */
1336 	if(dowakeup)
1337 		vtWakeupAll(b->ioready);
1338 }
1339 
1340 /*
1341  * The active file system is a tree of blocks.
1342  * When we add snapshots to the mix, the entire file system
1343  * becomes a dag and thus requires a bit more care.
1344  *
1345  * The life of the file system is divided into epochs.  A snapshot
1346  * ends one epoch and begins the next.  Each file system block
1347  * is marked with the epoch in which it was created (b.epoch).
1348  * When the block is unlinked from the file system (closed), it is marked
1349  * with the epoch in which it was removed (b.epochClose).
1350  * Once we have discarded or archived all snapshots up to
1351  * b.epochClose, we can reclaim the block.
1352  *
1353  * If a block was created in a past epoch but is not yet closed,
1354  * it is treated as copy-on-write.  Of course, in order to insert the
1355  * new pointer into the tree, the parent must be made writable,
1356  * and so on up the tree.  The recursion stops because the root
1357  * block is always writable.
1358  *
1359  * If blocks are never closed, they will never be reused, and
1360  * we will run out of disk space.  But marking a block as closed
1361  * requires some care about dependencies and write orderings.
1362  *
1363  * (1) If a block p points at a copy-on-write block b and we
1364  * copy b to create bb, then p must be written out after bb and
1365  * lbb (bb's label block).
1366  *
1367  * (2) We have to mark b as closed, but only after we switch
1368  * the pointer, so lb must be written out after p.  In fact, we
1369  * can't even update the in-memory copy, or the cache might
1370  * mistakenly give out b for reuse before p gets written.
1371  *
1372  * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
1373  * The caller is expected to record a "p after bb" dependency
1374  * to finish (1), and also expected to call blockRemoveLink
1375  * to arrange for (2) to happen once p is written.
1376  *
1377  * Until (2) happens, some pieces of the code (e.g., the archiver)
1378  * still need to know whether a block has been copied, so we
1379  * set the BsCopied bit in the label and force that to disk *before*
1380  * the copy gets written out.
1381  */
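/*
 * Illustrative ordering sketch (hypothetical caller; oldscore, tag, ehi,
 * elo are stand-ins, and the real logic lives in fossil's source code):
 * to replace pointer slot i of a pointer block p that points at the
 * copy-on-write block b:
 *
 *	bb = blockCopy(b, tag, ehi, elo);	// records bb-after-lbb and BsCopied-before-bb
 *	memmove(oldscore, p->data+i*VtScoreSize, VtScoreSize);
 *	memmove(p->data+i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 *	blockDependency(p, bb, i, oldscore, nil);	// (1) p is written after bb
 *	blockRemoveLink(p, globalToLocal(oldscore), bb->l.type, tag, 0);	// (2) close b once p hits disk
 *	blockPut(bb);
 */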
1382 Block*
1383 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
1384 {
1385 	Block *bb, *lb;
1386 	Label l;
1387 
1388 	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
1389 		fprint(2, "blockCopy %#ux %L but fs is [%ud,%ud]\n",
1390 			b->addr, &b->l, elo, ehi);
1391 
1392 	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
1393 	if(bb == nil){
1394 		blockPut(b);
1395 		return nil;
1396 	}
1397 
1398 	/*
1399 	 * Update label so we know the block has been copied.
1400 	 * (It will be marked closed once it has been unlinked from
1401 	 * the tree.)  This must follow cacheAllocBlock since we
1402 	 * can't be holding onto lb when we call cacheAllocBlock.
1403 	 */
1404 	if((b->l.state&BsCopied)==0)
1405 	if(b->part == PartData){	/* not the superblock */
1406 		l = b->l;
1407 		l.state |= BsCopied;
1408 		lb = _blockSetLabel(b, &l);
1409 		if(lb == nil){
1410 			/* can't set label => can't copy block */
1411 			blockPut(b);
1412 			l.type = BtMax;
1413 			l.state = BsFree;
1414 			l.epoch = 0;
1415 			l.epochClose = 0;
1416 			l.tag = 0;
1417 			blockSetLabel(bb, &l, 0);
1418 			blockPut(bb);
1419 			return nil;
1420 		}
1421 		blockDependency(bb, lb, -1, nil, nil);
1422 		blockPut(lb);
1423 	}
1424 
1425 	memmove(bb->data, b->data, b->c->size);
1426 	blockDirty(bb);
1427 	blockPut(b);
1428 	return bb;
1429 }
1430 
1431 /*
1432  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1433  * If recurse is set, we are unlinking all of bb's children as well.
1434  *
1435  * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
1436  * the relevant information to b's list of unlinked blocks.  Once b is written,
1437  * the list will be queued for processing.
1438  *
1439  * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
1440  */
1441 void
1442 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
1443 {
1444 	BList *p, **pp, bl;
1445 
1446 	/* remove bb from prior list */
1447 	for(pp=&b->prior; (p=*pp)!=nil; ){
1448 		if(p->part == PartData && p->addr == addr){
1449 			*pp = p->next;
1450 			blistFree(b->c, p);
1451 		}else
1452 			pp = &p->next;
1453 	}
1454 
1455 	bl.part = PartData;
1456 	bl.addr = addr;
1457 	bl.type = type;
1458 	bl.tag = tag;
1459 	if(b->l.epoch == 0)
1460 		assert(b->part == PartSuper);
1461 	bl.epoch = b->l.epoch;
1462 	bl.next = nil;
1463 	bl.recurse = recurse;
1464 
1465 	p = blistAlloc(b);
1466 	if(p == nil){
1467 		/*
1468 		 * We were out of blists so blistAlloc wrote b to disk.
1469 		 */
1470 		doRemoveLink(b->c, &bl);
1471 		return;
1472 	}
1473 
1474 	/* Uhead is only processed when the block goes from Dirty -> Clean */
1475 	assert(b->iostate == BioDirty);
1476 
1477 	*p = bl;
1478 	if(b->uhead == nil)
1479 		b->uhead = p;
1480 	else
1481 		b->utail->next = p;
1482 	b->utail = p;
1483 }
1484 
1485 /*
1486  * Process removal of a single block and perhaps its children.
1487  */
1488 static void
1489 doRemoveLink(Cache *c, BList *p)
1490 {
1491 	int i, n, recurse;
1492 	u32int a;
1493 	Block *b;
1494 	Label l;
1495 	BList bl;
1496 
1497 	recurse = (p->recurse && p->type != BtData && p->type != BtDir);
1498 
1499 	/*
1500 	 * We're not really going to overwrite b, but if we're not
1501 	 * going to look at its contents, there is no point in reading
1502 	 * them from the disk.
1503 	 */
1504 	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
1505 	if(b == nil)
1506 		return;
1507 
1508 	/*
1509 	 * When we're unlinking from the superblock, close with the next epoch.
1510 	 */
1511 	if(p->epoch == 0)
1512 		p->epoch = b->l.epoch+1;
1513 
1514 	/* sanity check */
1515 	if(b->l.epoch > p->epoch){
1516 		fprint(2, "doRemoveLink: strange epoch %ud > %ud\n", b->l.epoch, p->epoch);
1517 		blockPut(b);
1518 		return;
1519 	}
1520 
1521 	if(recurse){
1522 		n = c->size / VtScoreSize;
1523 		for(i=0; i<n; i++){
1524 			a = globalToLocal(b->data + i*VtScoreSize);
1525 			if(a == NilBlock || !readLabel(c, &l, a))
1526 				continue;
1527 			if(l.state&BsClosed)
1528 				continue;
1529 			/*
1530 			 * If stack space becomes an issue...
1531 			p->addr = a;
1532 			p->type = l.type;
1533 			p->tag = l.tag;
1534 			doRemoveLink(c, p);
1535 			 */
1536 
1537 			bl.part = PartData;
1538 			bl.addr = a;
1539 			bl.type = l.type;
1540 			bl.tag = l.tag;
1541 			bl.epoch = p->epoch;
1542 			bl.next = nil;
1543 			bl.recurse = 1;
1544 			/* give up the block lock - share with others */
1545 			blockPut(b);
1546 			doRemoveLink(c, &bl);
1547 			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
1548 			if(b == nil){
1549 				fprint(2, "warning: lost block in doRemoveLink\n");
1550 				return;
1551 			}
1552 		}
1553 	}
1554 
1555 	l = b->l;
1556 	l.state |= BsClosed;
1557 	l.epochClose = p->epoch;
1558 	if(l.epochClose == l.epoch){
1559 		vtLock(c->fl->lk);
1560 		if(l.epoch == c->fl->epochLow)
1561 			c->fl->nused--;
1562 		blockSetLabel(b, &l, 0);
1563 		vtUnlock(c->fl->lk);
1564 	}else
1565 		blockSetLabel(b, &l, 0);
1566 	blockPut(b);
1567 }
1568 
1569 /*
1570  * Allocate a BList so that we can record a dependency
1571  * or queue a removal related to block b.
1572  * If we can't find a BList, we write out b and return nil.
1573  */
1574 static BList *
1575 blistAlloc(Block *b)
1576 {
1577 	Cache *c;
1578 	BList *p;
1579 
1580 	if(b->iostate != BioDirty){
1581 		/*
1582 		 * should not happen anymore -
1583 	 	 * blockDirty used to flush but no longer does.
1584 		 */
1585 		assert(b->iostate == BioClean);
1586 		fprint(2, "blistAlloc: called on clean block\n");
1587 		return nil;
1588 	}
1589 
1590 	c = b->c;
1591 	vtLock(c->lk);
1592 	if(c->blfree == nil){
1593 		/*
1594 		 * No free BLists.  What are our options?
1595 		 */
1596 
1597 		/* Block has no priors? Just write it. */
1598 		if(b->prior == nil){
1599 			vtUnlock(c->lk);
1600 			diskWriteAndWait(c->disk, b);
1601 			return nil;
1602 		}
1603 
1604 		/*
1605 		 * Wake the flush thread, which will hopefully free up
1606 		 * some BLists for us.  We used to flush a block from
1607 		 * our own prior list and reclaim that BList, but this is
1608 		 * a no-no: some of the blocks on our prior list may
1609 		 * be locked by our caller.  Or maybe their label blocks
1610 		 * are locked by our caller.  In any event, it's too hard
1611 		 * to make sure we can do I/O for ourselves.  Instead,
1612 		 * we assume the flush thread will find something.
1613 		 * (The flush thread never blocks waiting for a block,
1614 		 * so it can't deadlock like we can.)
1615 		 */
1616 		while(c->blfree == nil){
1617 			vtWakeup(c->flush);
1618 			vtSleep(c->blrend);
1619 			if(c->blfree == nil)
1620 				fprint(2, "flushing for blists\n");
1621 		}
1622 	}
1623 
1624 	p = c->blfree;
1625 	c->blfree = p->next;
1626 	vtUnlock(c->lk);
1627 	return p;
1628 }
1629 
1630 static void
1631 blistFree(Cache *c, BList *bl)
1632 {
1633 	vtLock(c->lk);
1634 	bl->next = c->blfree;
1635 	c->blfree = bl;
1636 	vtWakeup(c->blrend);
1637 	vtUnlock(c->lk);
1638 }
1639 
1640 char*
1641 bsStr(int state)
1642 {
1643 	static char s[100];
1644 
1645 	if(state == BsFree)
1646 		return "Free";
1647 	if(state == BsBad)
1648 		return "Bad";
1649 
1650 	sprint(s, "%x", state);
1651 	if(!(state&BsAlloc))
1652 		strcat(s, ",Free");	/* should not happen */
1653 	if(state&BsCopied)
1654 		strcat(s, ",Copied");
1655 	if(state&BsVenti)
1656 		strcat(s, ",Venti");
1657 	if(state&BsClosed)
1658 		strcat(s, ",Closed");
1659 	return s;
1660 }
1661 
1662 char *
1663 bioStr(int iostate)
1664 {
1665 	switch(iostate){
1666 	default:
1667 		return "Unknown!!";
1668 	case BioEmpty:
1669 		return "Empty";
1670 	case BioLabel:
1671 		return "Label";
1672 	case BioClean:
1673 		return "Clean";
1674 	case BioDirty:
1675 		return "Dirty";
1676 	case BioReading:
1677 		return "Reading";
1678 	case BioWriting:
1679 		return "Writing";
1680 	case BioReadError:
1681 		return "ReadError";
1682 	case BioVentiError:
1683 		return "VentiError";
1684 	case BioMax:
1685 		return "Max";
1686 	}
1687 }
1688 
1689 static char *bttab[] = {
1690 	"BtData",
1691 	"BtData+1",
1692 	"BtData+2",
1693 	"BtData+3",
1694 	"BtData+4",
1695 	"BtData+5",
1696 	"BtData+6",
1697 	"BtData+7",
1698 	"BtDir",
1699 	"BtDir+1",
1700 	"BtDir+2",
1701 	"BtDir+3",
1702 	"BtDir+4",
1703 	"BtDir+5",
1704 	"BtDir+6",
1705 	"BtDir+7",
1706 };
1707 
1708 char*
1709 btStr(int type)
1710 {
1711 	if(type < nelem(bttab))
1712 		return bttab[type];
1713 	return "unknown";
1714 }
1715 
1716 int
1717 labelFmt(Fmt *f)
1718 {
1719 	Label *l;
1720 
1721 	l = va_arg(f->args, Label*);
1722 	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1723 		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1724 }
1725 
1726 int
1727 scoreFmt(Fmt *f)
1728 {
1729 	uchar *v;
1730 	int i;
1731 	u32int addr;
1732 
1733 	v = va_arg(f->args, uchar*);
1734 	if(v == nil){
1735 		fmtprint(f, "*");
1736 	}else if((addr = globalToLocal(v)) != NilBlock)
1737 		fmtprint(f, "0x%.8ux", addr);
1738 	else{
1739 		for(i = 0; i < VtScoreSize; i++)
1740 			fmtprint(f, "%2.2ux", v[i]);
1741 	}
1742 
1743 	return 0;
1744 }
1745 
1746 static int
1747 upHeap(int i, Block *b)
1748 {
1749 	Block *bb;
1750 	u32int now;
1751 	int p;
1752 	Cache *c;
1753 
1754 	c = b->c;
1755 	now = c->now;
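	/*
	 * Ages are compared as (used - now) in unsigned arithmetic so the
	 * ordering stays correct even after the c->now tick counter wraps.
	 */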
1756 	for(; i != 0; i = p){
1757 		p = (i - 1) >> 1;
1758 		bb = c->heap[p];
1759 		if(b->used - now >= bb->used - now)
1760 			break;
1761 		c->heap[i] = bb;
1762 		bb->heap = i;
1763 	}
1764 	c->heap[i] = b;
1765 	b->heap = i;
1766 
1767 	return i;
1768 }
1769 
1770 static int
1771 downHeap(int i, Block *b)
1772 {
1773 	Block *bb;
1774 	u32int now;
1775 	int k;
1776 	Cache *c;
1777 
1778 	c = b->c;
1779 	now = c->now;
1780 	for(; ; i = k){
1781 		k = (i << 1) + 1;
1782 		if(k >= c->nheap)
1783 			break;
1784 		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1785 			k++;
1786 		bb = c->heap[k];
1787 		if(b->used - now <= bb->used - now)
1788 			break;
1789 		c->heap[i] = bb;
1790 		bb->heap = i;
1791 	}
1792 	c->heap[i] = b;
1793 	b->heap = i;
1794 	return i;
1795 }
1796 
1797 /*
1798  * Delete a block from the heap.
1799  * Called with c->lk held.
1800  */
1801 static void
1802 heapDel(Block *b)
1803 {
1804 	int i, si;
1805 	Cache *c;
1806 
1807 	c = b->c;
1808 
1809 	si = b->heap;
1810 	if(si == BadHeap)
1811 		return;
1812 	b->heap = BadHeap;
1813 	c->nheap--;
1814 	if(si == c->nheap)
1815 		return;
1816 	b = c->heap[c->nheap];
1817 	i = upHeap(si, b);
1818 	if(i == si)
1819 		downHeap(i, b);
1820 }
1821 
1822 /*
1823  * Insert a block into the heap.
1824  * Called with c->lk held.
1825  */
1826 static void
1827 heapIns(Block *b)
1828 {
1829 	assert(b->heap == BadHeap);
1830 	upHeap(b->c->nheap++, b);
1831 	vtWakeup(b->c->heapwait);
1832 }
1833 
1834 /*
1835  * Get just the label for a block.
1836  */
1837 int
1838 readLabel(Cache *c, Label *l, u32int addr)
1839 {
1840 	int lpb;
1841 	Block *b;
1842 	u32int a;
1843 
1844 	lpb = c->size / LabelSize;
1845 	a = addr / lpb;
1846 	b = cacheLocal(c, PartLabel, a, OReadOnly);
1847 	if(b == nil){
1848 		blockPut(b);
1849 		return 0;
1850 	}
1851 
1852 	if(!labelUnpack(l, b->data, addr%lpb)){
1853 		blockPut(b);
1854 		return 0;
1855 	}
1856 	blockPut(b);
1857 	return 1;
1858 }
1859 
1860 /*
1861  * Process unlink queue.
1862  * Called with c->lk held.
1863  */
1864 static void
1865 unlinkBody(Cache *c)
1866 {
1867 	BList *p;
1868 
1869 	while(c->uhead != nil){
1870 		p = c->uhead;
1871 		c->uhead = p->next;
1872 		vtUnlock(c->lk);
1873 		doRemoveLink(c, p);
1874 		vtLock(c->lk);
1875 		p->next = c->blfree;
1876 		c->blfree = p;
1877 	}
1878 }
1879 
1880 /*
1881  * Occasionally unlink the blocks on the cache unlink queue.
1882  */
1883 static void
1884 unlinkThread(void *a)
1885 {
1886 	Cache *c = a;
1887 
1888 	vtThreadSetName("unlink");
1889 
1890 	vtLock(c->lk);
1891 	for(;;){
1892 		while(c->uhead == nil && c->die == nil)
1893 			vtSleep(c->unlink);
1894 		if(c->die != nil)
1895 			break;
1896 		unlinkBody(c);
1897 	}
1898 	c->ref--;
1899 	vtWakeup(c->die);
1900 	vtUnlock(c->lk);
1901 }
1902 
1903 static int
1904 baddrCmp(void *a0, void *a1)
1905 {
1906 	BAddr *b0, *b1;
1907 	b0 = a0;
1908 	b1 = a1;
1909 
1910 	if(b0->part < b1->part)
1911 		return -1;
1912 	if(b0->part > b1->part)
1913 		return 1;
1914 	if(b0->addr < b1->addr)
1915 		return -1;
1916 	if(b0->addr > b1->addr)
1917 		return 1;
1918 	return 0;
1919 }
1920 
1921 /*
1922  * Scan the block list for dirty blocks; add them to the list c->baddr.
1923  */
1924 static void
1925 flushFill(Cache *c)
1926 {
1927 	int i, ndirty;
1928 	BAddr *p;
1929 	Block *b;
1930 
1931 	vtLock(c->lk);
1932 	if(c->ndirty == 0){
1933 		vtUnlock(c->lk);
1934 		return;
1935 	}
1936 
1937 	p = c->baddr;
1938 	ndirty = 0;
1939 	for(i=0; i<c->nblocks; i++){
1940 		b = c->blocks + i;
1941 		if(b->part == PartError)
1942 			continue;
1943 		if(b->iostate == BioDirty || b->iostate == BioWriting)
1944 			ndirty++;
1945 		if(b->iostate != BioDirty)
1946 			continue;
1947 		p->part = b->part;
1948 		p->addr = b->addr;
1949 		p->vers = b->vers;
1950 		p++;
1951 	}
1952 	if(ndirty != c->ndirty){
1953 		fprint(2, "ndirty mismatch expected %d found %d\n",
1954 			c->ndirty, ndirty);
1955 		c->ndirty = ndirty;
1956 	}
1957 	vtUnlock(c->lk);
1958 
1959 	c->bw = p - c->baddr;
1960 	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1961 }
1962 
1963 /*
1964  * This is not thread safe, i.e. it can't be called from multiple threads.
1965  *
1966  * It's okay how we use it, because it only gets called in
1967  * the flushThread.  And cacheFree, but only after
1968  * cacheFree has killed off the flushThread.
1969  */
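/*
 * The flush queue lives in c->baddr: flushFill leaves candidates in
 * baddr[0..bw); cacheFlushBlock then reads them with br as the read
 * index and be as the end of the current batch, requeueing blocks it
 * could not write at bw for the next pass.
 */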
1970 static int
1971 cacheFlushBlock(Cache *c)
1972 {
1973 	Block *b;
1974 	BAddr *p;
1975 	int lockfail, nfail;
1976 
1977 	nfail = 0;
1978 	for(;;){
1979 		if(c->br == c->be){
1980 			if(c->bw == 0 || c->bw == c->be)
1981 				flushFill(c);
1982 			c->br = 0;
1983 			c->be = c->bw;
1984 			c->bw = 0;
1985 			c->nflush = 0;
1986 		}
1987 
1988 		if(c->br == c->be)
1989 			return 0;
1990 		p = c->baddr + c->br;
1991 		c->br++;
1992 		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1993 
1994 		if(b && blockWrite(b)){
1995 			c->nflush++;
1996 			blockPut(b);
1997 			return 1;
1998 		}
1999 		if(b)
2000 			blockPut(b);
2001 
2002 		/*
2003 		 * Why didn't we write the block?
2004 		 */
2005 
2006 		/* Block already written out */
2007 		if(b == nil && !lockfail)
2008 			continue;
2009 
2010 		/* Failed to acquire lock; sleep if happens a lot. */
2011 		if(lockfail && ++nfail > 100){
2012 			sleep(500);
2013 			nfail = 0;
2014 		}
2015 		/* Requeue block. */
2016 		if(c->bw < c->be)
2017 			c->baddr[c->bw++] = *p;
2018 	}
2019 }
2020 
2021 /*
2022  * Occasionally flush dirty blocks from memory to the disk.
2023  */
2024 static void
2025 flushThread(void *a)
2026 {
2027 	Cache *c = a;
2028 	int i;
2029 
2030 	vtThreadSetName("flush");
2031 	vtLock(c->lk);
2032 	while(c->die == nil){
2033 		vtSleep(c->flush);
2034 		vtUnlock(c->lk);
2035 		for(i=0; i<FlushSize; i++)
2036 			if(!cacheFlushBlock(c)){
2037 				/*
2038 				 * If i==0, it could be that someone is waking us repeatedly
2039 				 * to flush the cache but there's no work to do.
2040 				 * Pause a little.
2041 				 */
2042 				if(i==0){
2043 					// fprint(2, "flushthread found nothing to flush - %d dirty\n", c->ndirty);
2044 					sleep(250);
2045 				}
2046 				break;
2047 			}
2048 		if(i==0 && c->ndirty){
2049 			/*
2050 			 * All the blocks are being written right now -- there's nothing to do.
2051 			 * We might be spinning with cacheFlush though -- it'll just keep
2052 			 * kicking us until c->ndirty goes down.  Probably we should sleep
2053 			 * on something that the diskThread can kick, but for now we'll
2054 			 * just pause for a little while waiting for disks to finish.
2055 			 */
2056 			sleep(100);
2057 		}
2058 		vtLock(c->lk);
2059 		vtWakeupAll(c->flushwait);
2060 	}
2061 	c->ref--;
2062 	vtWakeup(c->die);
2063 	vtUnlock(c->lk);
2064 }
2065 
2066 /*
2067  * Flush the cache.
2068  */
2069 void
2070 cacheFlush(Cache *c, int wait)
2071 {
2072 	/*
2073 	 * Lock c->dirtylk so that more blocks aren't being dirtied
2074 	 * while we try to write out what's already here.
2075 	 * Otherwise we might not ever finish!
2076 	 */
2077 	vtLock(c->dirtylk);
2078 	vtLock(c->lk);
2079 	if(wait){
2080 		while(c->ndirty){
2081 		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
2082 		//		c->ndirty, c->uhead);
2083 			vtWakeup(c->flush);
2084 			vtSleep(c->flushwait);
2085 		}
2086 	//	consPrint("cacheFlush: done (%d dirty, uhead %p)\n", c->ndirty, c->uhead);
2087 	}else if(c->ndirty)
2088 		vtWakeup(c->flush);
2089 	vtUnlock(c->lk);
2090 	vtUnlock(c->dirtylk);
2091 }
2092 
2093 /*
2094  * Kick the flushThread every 30 seconds.
2095  */
2096 static void
2097 cacheSync(void *v)
2098 {
2099 	Cache *c;
2100 
2101 	c = v;
2102 	cacheFlush(c, 0);
2103 }
2104