xref: /plan9-contrib/sys/src/cmd/fossil/cache.c (revision 60014d6756a98ad10929607ca84a1b7488a16cfc)
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5 
6 #include "9.h"	/* for cacheFlush */
7 
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10 
11 enum {
12 	BadHeap = ~0,
13 };
14 
15 /*
16  * Store data in the memory cache in c->size blocks, with each block
17  * zero-extended to fill it out.  When a block is written to Venti it
18  * is zero-truncated.  The walker also checks that the block fits
19  * within psize or dsize, as the case may be.
20  */
21 
22 struct Cache
23 {
24 	VtLock	*lk;
25 	VtLock	*dirtylk;
26 	int 	ref;
27 	int	mode;
28 
29 	Disk 	*disk;
30 	int	size;			/* block size */
31 	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
32 	VtSession *z;
33 	u32int	now;			/* ticks for usage timestamps */
34 	Block	**heads;		/* hash table for finding address */
35 	int	nheap;			/* number of available victims */
36 	Block	**heap;			/* heap for locating victims */
37 	long	nblocks;		/* number of blocks allocated */
38 	Block	*blocks;		/* array of block descriptors */
39 	u8int	*mem;			/* memory for all block data & blists */
40 
41 	BList	*blfree;
42 	VtRendez *blrend;
43 
44 	int 	ndirty;			/* number of dirty blocks in the cache */
45 	int 	maxdirty;		/* max number of dirty blocks */
46 	u32int	vers;
47 
48 	long hashSize;
49 
50 	FreeList *fl;
51 
52 	VtRendez *die;			/* daemon threads should die when != nil */
53 
54 	VtRendez *flush;
55 	VtRendez *flushwait;
56 	VtRendez *heapwait;
57 	BAddr *baddr;
58 	int bw, br, be;
59 	int nflush;
60 
61 	Periodic *sync;
62 
63 	/* unlink daemon */
64 	BList *uhead;
65 	BList *utail;
66 	VtRendez *unlink;
67 
68 	/* block counts */
69 	int nused;
70 	int ndisk;
71 };
72 
73 struct BList {
74 	int part;
75 	u32int addr;
76 	uchar type;
77 	u32int tag;
78 	u32int epoch;
79 	u32int vers;
80 
81 	int recurse;	/* for block unlink */
82 
83 	/* for roll back */
84 	int index;			/* -1 indicates not valid */
85 	union {
86 		uchar score[VtScoreSize];
87 		uchar entry[VtEntrySize];
88 	} old;
89 	BList *next;
90 };
91 
92 struct BAddr {
93 	int part;
94 	u32int addr;
95 	u32int vers;
96 };
97 
98 struct FreeList {
99 	VtLock *lk;
100 	u32int last;		/* last block allocated */
101 	u32int end;		/* end of data partition */
102 	u32int nused;		/* number of used blocks */
103 	u32int epochLow;	/* low epoch when last updated nused */
104 };
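/*
 * Despite the name, the free list is not a list: cacheAllocBlock (below)
 * scans the label blocks round-robin starting at fl->last, looking for a
 * label that is BsFree, or that is closed early enough that no live epoch
 * can still reach it.  fl->nused is just a cached count, valid for
 * fl->epochLow, maintained for cacheCountUsed.
 */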
105 
106 static FreeList *flAlloc(u32int end);
107 static void flFree(FreeList *fl);
108 
109 static Block *cacheBumpBlock(Cache *c);
110 static void heapDel(Block*);
111 static void heapIns(Block*);
112 static void cacheCheck(Cache*);
113 static void unlinkThread(void *a);
114 static void flushThread(void *a);
115 static void flushBody(Cache *c);
116 static void unlinkBody(Cache *c);
117 static int cacheFlushBlock(Cache *c);
118 static void cacheSync(void*);
119 static BList *blistAlloc(Block*);
120 static void blistFree(Cache*, BList*);
121 static void doRemoveLink(Cache*, BList*);
122 static void doRemoveLinkList(Cache*, BList*);
123 
124 /*
125  * Mapping from local block type to Venti type
126  */
127 int vtType[BtMax] = {
128 	VtDataType,		/* BtData | 0  */
129 	VtPointerType0,		/* BtData | 1  */
130 	VtPointerType1,		/* BtData | 2  */
131 	VtPointerType2,		/* BtData | 3  */
132 	VtPointerType3,		/* BtData | 4  */
133 	VtPointerType4,		/* BtData | 5  */
134 	VtPointerType5,		/* BtData | 6  */
135 	VtPointerType6,		/* BtData | 7  */
136 	VtDirType,		/* BtDir | 0  */
137 	VtPointerType0,		/* BtDir | 1  */
138 	VtPointerType1,		/* BtDir | 2  */
139 	VtPointerType2,		/* BtDir | 3  */
140 	VtPointerType3,		/* BtDir | 4  */
141 	VtPointerType4,		/* BtDir | 5  */
142 	VtPointerType5,		/* BtDir | 6  */
143 	VtPointerType6,		/* BtDir | 7  */
144 };
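/*
 * For example, both BtData|3 and BtDir|3 (a pointer block three levels
 * above the leaves) map to VtPointerType2: Venti distinguishes pointer
 * blocks only by depth, not by whether the tree holds file data or
 * directory entries.
 */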
145 
146 /*
147  * Allocate the memory cache.
148  */
149 Cache *
150 cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
151 {
152 	int i;
153 	Cache *c;
154 	Block *b;
155 	BList *bl;
156 	u8int *p;
157 	int nbl;
158 
159 	c = vtMemAllocZ(sizeof(Cache));
160 
161 	/* reasonable number of BList elements */
162 	nbl = nblocks * 4;
163 
164 	c->lk = vtLockAlloc();
165 	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
166 	c->ref = 1;
167 	c->disk = disk;
168 	c->z = z;
169 	c->size = diskBlockSize(disk);
170 bwatchSetBlockSize(c->size);
171 	/* round c->size up to be a nice multiple */
172 	c->size = (c->size + 127) & ~127;
173 	c->ndmap = (c->size/20 + 7) / 8;
174 	c->nblocks = nblocks;
175 	c->hashSize = nblocks;
176 	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
177 	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
178 	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
179 	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
180 	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
181 	c->mode = mode;
182 	c->vers++;
183 	p = c->mem;
184 	for(i = 0; i < nblocks; i++){
185 		b = &c->blocks[i];
186 		b->lk = vtLockAlloc();
187 		b->c = c;
188 		b->data = p;
189 		b->heap = i;
190 		b->ioready = vtRendezAlloc(b->lk);
191 		c->heap[i] = b;
192 		p += c->size;
193 	}
194 	c->nheap = nblocks;
195 	for(i = 0; i < nbl; i++){
196 		bl = (BList*)p;
197 		bl->next = c->blfree;
198 		c->blfree = bl;
199 		p += sizeof(BList);
200 	}
201 	/* separate loop to keep blocks and blists reasonably aligned */
202 	for(i = 0; i < nblocks; i++){
203 		b = &c->blocks[i];
204 		b->dmap = p;
205 		p += c->ndmap;
206 	}
207 
208 	c->blrend = vtRendezAlloc(c->lk);
209 
210 	c->maxdirty = nblocks*(DirtyPercentage*0.01);
211 
212 	c->fl = flAlloc(diskSize(disk, PartData));
213 
214 	c->unlink = vtRendezAlloc(c->lk);
215 	c->flush = vtRendezAlloc(c->lk);
216 	c->flushwait = vtRendezAlloc(c->lk);
217 	c->heapwait = vtRendezAlloc(c->lk);
218 	c->sync = periodicAlloc(cacheSync, c, 30*1000);
219 
220 	if(mode == OReadWrite){
221 		c->ref += 2;
222 		vtThread(unlinkThread, c);
223 		vtThread(flushThread, c);
224 	}
225 	cacheCheck(c);
226 
227 	return c;
228 }
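/*
 * Typical use, as an illustrative sketch (the block count here is made up,
 * not taken from fossil's configuration code):
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	...use the cache...
 *	cacheFree(c);
 *
 * Only OReadWrite starts the unlink and flush daemons; a cache opened
 * read-only never dirties blocks, so it has nothing for them to do.
 */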
229 
230 /*
231  * Free the whole memory cache, flushing all dirty blocks to the disk.
232  */
233 void
234 cacheFree(Cache *c)
235 {
236 	int i;
237 
238 	/* kill off daemon threads */
239 	vtLock(c->lk);
240 	c->die = vtRendezAlloc(c->lk);
241 	periodicKill(c->sync);
242 	vtWakeup(c->flush);
243 	vtWakeup(c->unlink);
244 	while(c->ref > 1)
245 		vtSleep(c->die);
246 
247 	/* flush everything out */
248 	do {
249 		unlinkBody(c);
250 		vtUnlock(c->lk);
251 		while(cacheFlushBlock(c))
252 			;
253 		diskFlush(c->disk);
254 		vtLock(c->lk);
255 	} while(c->uhead || c->ndirty);
256 	vtUnlock(c->lk);
257 
258 	cacheCheck(c);
259 
260 	for(i = 0; i < c->nblocks; i++){
261 		assert(c->blocks[i].ref == 0);
262 		vtRendezFree(c->blocks[i].ioready);
263 		vtLockFree(c->blocks[i].lk);
264 	}
265 	flFree(c->fl);
266 	vtMemFree(c->baddr);
267 	vtMemFree(c->heads);
268 	vtMemFree(c->blocks);
269 	vtMemFree(c->mem);
270 	vtLockFree(c->lk);
271 	diskFree(c->disk);
272 	vtRendezFree(c->blrend);
273 	/* don't close vtSession */
274 	vtMemFree(c);
275 }
276 
277 static void
278 cacheDump(Cache *c)
279 {
280 	int i;
281 	Block *b;
282 
283 	for(i = 0; i < c->nblocks; i++){
284 		b = &c->blocks[i];
285 		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
286 			i, b->part, b->addr, b->score, b->l.type, b->ref,
287 			bsStr(b->l.state), bioStr(b->iostate), b->pc);
288 	}
289 }
290 
291 static void
292 cacheCheck(Cache *c)
293 {
294 	u32int size, now;
295 	int i, k, refed;
296 	static uchar zero[VtScoreSize];
297 	Block *b;
298 
299 	size = c->size;
300 	now = c->now;
301 
302 	for(i = 0; i < c->nheap; i++){
303 		if(c->heap[i]->heap != i)
304 			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
305 		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
306 			vtFatal("bad heap ordering");
307 		k = (i << 1) + 1;
308 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
309 			vtFatal("bad heap ordering");
310 		k++;
311 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
312 			vtFatal("bad heap ordering");
313 	}
314 
315 	refed = 0;
316 	for(i = 0; i < c->nblocks; i++){
317 		b = &c->blocks[i];
318 		if(b->data != &c->mem[i * size])
319 			vtFatal("mis-blocked at %d", i);
320 		if(b->ref && b->heap == BadHeap){
321 			refed++;
322 		}
323 	}
324 if(c->nheap + refed != c->nblocks){
325 fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
326 cacheDump(c);
327 }
328 	assert(c->nheap + refed == c->nblocks);
329 	refed = 0;
330 	for(i = 0; i < c->nblocks; i++){
331 		b = &c->blocks[i];
332 		if(b->ref){
333 if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
334 			refed++;
335 		}
336 	}
337 if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
338 }
339 
340 
341 /*
342  * Evict the block whose second-to-last use is oldest: remove it from
343  * the heap and the hash chain, fix up the heap, and return it for reuse.
344  * Called with c->lk held.
345  */
346 static Block *
347 cacheBumpBlock(Cache *c)
348 {
349 	int printed;
350 	Block *b;
351 
352 	/*
353 	 * locate the block with the oldest second to last use.
354 	 * remove it from the heap, and fix up the heap.
355 	 */
356 	printed = 0;
357 	if(c->nheap == 0){
358 		while(c->nheap == 0){
359 			vtWakeup(c->flush);
360 			vtSleep(c->heapwait);
361 			if(c->nheap == 0){
362 				printed = 1;
363 				fprint(2, "%s: entire cache is busy, %d dirty "
364 					"-- waking flush thread\n",
365 					argv0, c->ndirty);
366 			}
367 		}
368 		if(printed)
369 			fprint(2, "%s: cache is okay again, %d dirty\n",
370 				argv0, c->ndirty);
371 	}
372 
373 	b = c->heap[0];
374 	heapDel(b);
375 
376 	assert(b->heap == BadHeap);
377 	assert(b->ref == 0);
378 	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
379 	assert(b->prior == nil);
380 	assert(b->uhead == nil);
381 
382 	/*
383 	 * unchain the block from hash chain
384 	 */
385 	if(b->prev){
386 		*(b->prev) = b->next;
387 		if(b->next)
388 			b->next->prev = b->prev;
389 		b->prev = nil;
390 	}
391 
392 
393 if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
394 	/* set block to a reasonable state */
395 	b->ref = 1;
396 	b->part = PartError;
397 	memset(&b->l, 0, sizeof(b->l));
398 	b->iostate = BioEmpty;
399 
400 	return b;
401 }
402 
403 /*
404  * look for a particular version of the block in the memory cache.
405  */
406 static Block *
407 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
408 	int waitlock, int *lockfailure)
409 {
410 	Block *b;
411 	ulong h;
412 
413 	h = addr % c->hashSize;
414 
415 	if(lockfailure)
416 		*lockfailure = 0;
417 
418 	/*
419 	 * look for the block in the cache
420 	 */
421 	vtLock(c->lk);
422 	for(b = c->heads[h]; b != nil; b = b->next){
423 		if(b->part == part && b->addr == addr)
424 			break;
425 	}
426 	if(b == nil || b->vers != vers){
427 		vtUnlock(c->lk);
428 		return nil;
429 	}
430 	if(!waitlock && !vtCanLock(b->lk)){
431 		*lockfailure = 1;
432 		vtUnlock(c->lk);
433 		return nil;
434 	}
435 	heapDel(b);
436 	b->ref++;
437 	vtUnlock(c->lk);
438 
439 	bwatchLock(b);
440 	if(waitlock)
441 		vtLock(b->lk);
442 	b->nlock = 1;
443 
444 	for(;;){
445 		switch(b->iostate){
446 		default:
447 			abort();
448 		case BioEmpty:
449 		case BioLabel:
450 		case BioClean:
451 		case BioDirty:
452 			if(b->vers != vers){
453 				blockPut(b);
454 				return nil;
455 			}
456 			return b;
457 		case BioReading:
458 		case BioWriting:
459 			vtSleep(b->ioready);
460 			break;
461 		case BioVentiError:
462 			blockPut(b);
463 			vtSetError("venti i/o error block 0x%.8ux", addr);
464 			return nil;
465 		case BioReadError:
466 			blockPut(b);
467 			vtSetError("error reading block 0x%.8ux", addr);
468 			return nil;
469 		}
470 	}
471 	/* NOT REACHED */
472 }
473 static Block*
474 cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
475 {
476 	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
477 }
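/*
 * The no-wait form (waitlock == 0) is what blockWrite and cacheFlushBlock
 * use: if the block is present but its lock is held elsewhere, they get
 * nil back with *lockfailure set and simply retry later.  The flush path
 * never blocks waiting for a block lock, which is what keeps it free of
 * deadlock (see the comment in blistAlloc).
 */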
478 
479 
480 /*
481  * fetch a local (on-disk) block from the memory cache.
482  * if it's not there, load it, bumping some other block.
483  */
484 Block *
485 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
486 {
487 	Block *b;
488 	ulong h;
489 
490 	assert(part != PartVenti);
491 
492 	h = addr % c->hashSize;
493 
494 	/*
495 	 * look for the block in the cache
496 	 */
497 	vtLock(c->lk);
498 	for(b = c->heads[h]; b != nil; b = b->next){
499 		if(b->part != part || b->addr != addr)
500 			continue;
501 		if(epoch && b->l.epoch != epoch){
502 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
503 			vtUnlock(c->lk);
504 			vtSetError(ELabelMismatch);
505 			return nil;
506 		}
507 		heapDel(b);
508 		b->ref++;
509 		break;
510 	}
511 
512 	if(b == nil){
513 		b = cacheBumpBlock(c);
514 
515 		b->part = part;
516 		b->addr = addr;
517 		localToGlobal(addr, b->score);
518 
519 		/* chain onto correct hash */
520 		b->next = c->heads[h];
521 		c->heads[h] = b;
522 		if(b->next != nil)
523 			b->next->prev = &b->next;
524 		b->prev = &c->heads[h];
525 	}
526 
527 	vtUnlock(c->lk);
528 
529 	/*
530 	 * BUG: what if the epoch changes right here?
531 	 * In the worst case, we could end up in some weird
532 	 * lock loop, because the block we want no longer exists,
533 	 * and instead we're trying to lock a block we have no
534 	 * business grabbing.
535 	 *
536 	 * For now, I'm not going to worry about it.
537 	 */
538 
539 if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
540 	bwatchLock(b);
541 	vtLock(b->lk);
542 	b->nlock = 1;
543 
544 	if(part == PartData && b->iostate == BioEmpty){
545 		if(!readLabel(c, &b->l, addr)){
546 			blockPut(b);
547 			return nil;
548 		}
549 		blockSetIOState(b, BioLabel);
550 	}
551 	if(epoch && b->l.epoch != epoch){
552 		blockPut(b);
553 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
554 		vtSetError(ELabelMismatch);
555 		return nil;
556 	}
557 
558 	b->pc = getcallerpc(&c);
559 	for(;;){
560 		switch(b->iostate){
561 		default:
562 			abort();
563 		case BioEmpty:
564 		case BioLabel:
565 			if(mode == OOverWrite){
566 				blockSetIOState(b, BioClean);
567 				return b;
568 			}
569 			diskRead(c->disk, b);
570 			vtSleep(b->ioready);
571 			break;
572 		case BioClean:
573 		case BioDirty:
574 			return b;
575 		case BioReading:
576 		case BioWriting:
577 			vtSleep(b->ioready);
578 			break;
579 		case BioReadError:
580 			blockSetIOState(b, BioEmpty);
581 			blockPut(b);
582 			vtSetError("error reading block 0x%.8ux", addr);
583 			return nil;
584 		}
585 	}
586 	/* NOT REACHED */
587 }
588 
589 Block *
590 cacheLocal(Cache *c, int part, u32int addr, int mode)
591 {
592 	return _cacheLocal(c, part, addr, mode, 0);
593 }
594 
595 /*
596  * fetch a local (on-disk) block from the memory cache.
597  * if it's not there, load it, bumping some other block.
598  * check tag and type.
599  */
600 Block *
601 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
602 {
603 	Block *b;
604 
605 	b = _cacheLocal(c, PartData, addr, mode, epoch);
606 	if(b == nil)
607 		return nil;
608 	if(b->l.type != type || b->l.tag != tag){
609 		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
610 			argv0, addr, b->l.type, type, b->l.tag, tag);
611 		vtSetError(ELabelMismatch);
612 		blockPut(b);
613 		return nil;
614 	}
615 	b->pc = getcallerpc(&c);
616 	return b;
617 }
618 
619 /*
620  * fetch a global (Venti) block from the memory cache.
621  * if it's not there, load it, bumping some other block.
622  * check tag and type if it's really a local block in disguise.
623  */
624 Block *
625 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
626 {
627 	int n;
628 	Block *b;
629 	ulong h;
630 	u32int addr;
631 
632 	addr = globalToLocal(score);
633 	if(addr != NilBlock){
634 		b = cacheLocalData(c, addr, type, tag, mode, 0);
635 		if(b)
636 			b->pc = getcallerpc(&c);
637 		return b;
638 	}
639 
640 	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
641 
642 	/*
643 	 * look for the block in the cache
644 	 */
645 	vtLock(c->lk);
646 	for(b = c->heads[h]; b != nil; b = b->next){
647 		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
648 			continue;
649 		heapDel(b);
650 		b->ref++;
651 		break;
652 	}
653 
654 	if(b == nil){
655 if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);
656 
657 		b = cacheBumpBlock(c);
658 
659 		b->part = PartVenti;
660 		b->addr = NilBlock;
661 		b->l.type = type;
662 		memmove(b->score, score, VtScoreSize);
663 
664 		/* chain onto correct hash */
665 		b->next = c->heads[h];
666 		c->heads[h] = b;
667 		if(b->next != nil)
668 			b->next->prev = &b->next;
669 		b->prev = &c->heads[h];
670 	}
671 	vtUnlock(c->lk);
672 
673 	bwatchLock(b);
674 	vtLock(b->lk);
675 	b->nlock = 1;
676 	b->pc = getcallerpc(&c);
677 
678 	switch(b->iostate){
679 	default:
680 		abort();
681 	case BioEmpty:
682 		n = vtRead(c->z, score, vtType[type], b->data, c->size);
683 		if(n < 0 || !vtSha1Check(score, b->data, n)){
684 			blockSetIOState(b, BioVentiError);
685 			blockPut(b);
686 			vtSetError(
687 			"venti error reading block %V or wrong score: %r",
688 				score);
689 			return nil;
690 		}
691 		vtZeroExtend(vtType[type], b->data, n, c->size);
692 		blockSetIOState(b, BioClean);
693 		return b;
694 	case BioClean:
695 		return b;
696 	case BioVentiError:
697 		blockPut(b);
698 		vtSetError("venti i/o error or wrong score, block %V", score);
699 		return nil;
700 	case BioReadError:
701 		blockPut(b);
702 		vtSetError("error reading block %V", b->score);
703 		return nil;
704 	}
705 	/* NOT REACHED */
706 }
707 
708 /*
709  * allocate a new on-disk block and load it into the memory cache.
710  * BUG: if the disk is full, should we flush some of it to Venti?
711  */
712 static u32int lastAlloc;
713 
714 Block *
715 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
716 {
717 	FreeList *fl;
718 	u32int addr;
719 	Block *b;
720 	int n, nwrap;
721 	Label lab;
722 
723 	n = c->size / LabelSize;
724 	fl = c->fl;
725 
726 	vtLock(fl->lk);
727 	addr = fl->last;
728 	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
729 	if(b == nil){
730 		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
731 		vtUnlock(fl->lk);
732 		return nil;
733 	}
734 	nwrap = 0;
735 	for(;;){
736 		if(++addr >= fl->end){
737 			addr = 0;
738 			if(++nwrap >= 2){
739 				blockPut(b);
740 				fl->last = 0;
741 				vtSetError("disk is full");
742 				fprint(2, "%s: cacheAllocBlock: xxx1 %R\n", argv0);
743 				vtUnlock(fl->lk);
744 				return nil;
745 			}
746 		}
747 		if(addr%n == 0){
748 			blockPut(b);
749 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
750 			if(b == nil){
751 				fl->last = addr;
752 				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
753 				vtUnlock(fl->lk);
754 				return nil;
755 			}
756 		}
757 		if(!labelUnpack(&lab, b->data, addr%n))
758 			continue;
759 		if(lab.state == BsFree)
760 			goto Found;
761 		if(lab.state&BsClosed)
762 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
763 			goto Found;
764 	}
765 Found:
766 	blockPut(b);
767 	b = cacheLocal(c, PartData, addr, OOverWrite);
768 	if(b == nil){
769 		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
770 		return nil;
771 	}
772 assert(b->iostate == BioLabel || b->iostate == BioClean);
773 	fl->last = addr;
774 	lab.type = type;
775 	lab.tag = tag;
776 	lab.state = BsAlloc;
777 	lab.epoch = epoch;
778 	lab.epochClose = ~(u32int)0;
779 	if(!blockSetLabel(b, &lab, 1)){
780 		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
781 		blockPut(b);
782 		return nil;
783 	}
784 	vtZeroExtend(vtType[type], b->data, 0, c->size);
785 if(0)diskWrite(c->disk, b);
786 
787 if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
788 	lastAlloc = addr;
789 	fl->nused++;
790 	vtUnlock(fl->lk);
791 	b->pc = getcallerpc(&c);
792 	return b;
793 }
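/*
 * Sketch of how a caller typically uses a freshly allocated block
 * (the buffer and length are illustrative, not from this file):
 *
 *	b = cacheAllocBlock(c, BtData, tag, epoch, epochLow);
 *	if(b == nil)
 *		return 0;
 *	memmove(b->data, buf, n);
 *	blockDirty(b);
 *	blockPut(b);
 *
 * The "data after label" write ordering has already been recorded above,
 * via blockSetLabel with allocating set, so the caller only has to order
 * its own parent block against b.
 */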
794 
795 int
796 cacheDirty(Cache *c)
797 {
798 	return c->ndirty;
799 }
800 
801 void
802 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
803 {
804 	int n;
805 	u32int addr, nused;
806 	Block *b;
807 	Label lab;
808 	FreeList *fl;
809 
810 	fl = c->fl;
811 	n = c->size / LabelSize;
812 	*bsize = c->size;
813 	vtLock(fl->lk);
814 	if(fl->epochLow == epochLow){
815 		*used = fl->nused;
816 		*total = fl->end;
817 		vtUnlock(fl->lk);
818 		return;
819 	}
820 	b = nil;
821 	nused = 0;
822 	for(addr=0; addr<fl->end; addr++){
823 		if(addr%n == 0){
824 			blockPut(b);
825 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
826 			if(b == nil){
827 				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
828 					argv0, addr/n);
829 				break;
830 			}
831 		}
832 		if(!labelUnpack(&lab, b->data, addr%n))
833 			continue;
834 		if(lab.state == BsFree)
835 			continue;
836 		if(lab.state&BsClosed)
837 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
838 			continue;
839 		nused++;
840 	}
841 	blockPut(b);
842 	if(addr == fl->end){
843 		fl->nused = nused;
844 		fl->epochLow = epochLow;
845 	}
846 	*used = nused;
847 	*total = fl->end;
848 	vtUnlock(fl->lk);
849 	return;
850 }
851 
852 static FreeList *
853 flAlloc(u32int end)
854 {
855 	FreeList *fl;
856 
857 	fl = vtMemAllocZ(sizeof(*fl));
858 	fl->lk = vtLockAlloc();
859 	fl->last = 0;
860 	fl->end = end;
861 	return fl;
862 }
863 
864 static void
865 flFree(FreeList *fl)
866 {
867 	vtLockFree(fl->lk);
868 	vtMemFree(fl);
869 }
870 
871 u32int
872 cacheLocalSize(Cache *c, int part)
873 {
874 	return diskSize(c->disk, part);
875 }
876 
877 /*
878  * The thread that has locked b may refer to it by
879  * multiple names.  Nlock counts the number of
880  * references the locking thread holds.  It will call
881  * blockPut once per reference.
882  */
883 void
884 blockDupLock(Block *b)
885 {
886 	assert(b->nlock > 0);
887 	b->nlock++;
888 }
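/*
 * Illustrative sequence (use() is a placeholder, not a function here):
 *
 *	blockDupLock(b);	// nlock: 1 -> 2
 *	use(b);			// second name for the same locked block
 *	blockPut(b);		// nlock: 2 -> 1, block stays locked
 *	blockPut(b);		// nlock: 1 -> 0, block is unlocked
 */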
889 
890 /*
891  * we're done with the block.
892  * unlock it.  can't use it after calling this.
893  */
894 void
895 blockPut(Block* b)
896 {
897 	Cache *c;
898 
899 	if(b == nil)
900 		return;
901 
902 if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));
903 
904 	if(b->iostate == BioDirty)
905 		bwatchDependency(b);
906 
907 	if(--b->nlock > 0)
908 		return;
909 
910 	/*
911 	 * b->nlock should probably stay at zero while
912 	 * the block is unlocked, but diskThread and vtSleep
913 	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
914 	 * so we have to keep b->nlock set to 1 even
915 	 * when the block is unlocked.
916 	 */
917 	assert(b->nlock == 0);
918 	b->nlock = 1;
919 //	b->pc = 0;
920 
921 	bwatchUnlock(b);
922 	vtUnlock(b->lk);
923 	c = b->c;
924 	vtLock(c->lk);
925 
926 	if(--b->ref > 0){
927 		vtUnlock(c->lk);
928 		return;
929 	}
930 
931 	assert(b->ref == 0);
932 	switch(b->iostate){
933 	default:
934 		b->used = c->now++;
935 		heapIns(b);
936 		break;
937 	case BioEmpty:
938 	case BioLabel:
939 		if(c->nheap == 0)
940 			b->used = c->now++;
941 		else
942 			b->used = c->heap[0]->used;
943 		heapIns(b);
944 		break;
945 	case BioDirty:
946 		break;
947 	}
948 	vtUnlock(c->lk);
949 }
950 
951 /*
952  * set the label associated with a block.
953  */
954 Block*
955 _blockSetLabel(Block *b, Label *l)
956 {
957 	int lpb;
958 	Block *bb;
959 	u32int a;
960 	Cache *c;
961 
962 	c = b->c;
963 
964 	assert(b->part == PartData);
965 	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
966 	lpb = c->size / LabelSize;
967 	a = b->addr / lpb;
968 	bb = cacheLocal(c, PartLabel, a, OReadWrite);
969 	if(bb == nil){
970 		blockPut(b);
971 		return nil;
972 	}
973 	b->l = *l;
974 	labelPack(l, bb->data, b->addr%lpb);
975 	blockDirty(bb);
976 	return bb;
977 }
978 
979 int
980 blockSetLabel(Block *b, Label *l, int allocating)
981 {
982 	Block *lb;
983 	Label oldl;
984 
985 	oldl = b->l;
986 	lb = _blockSetLabel(b, l);
987 	if(lb == nil)
988 		return 0;
989 
990 	/*
991 	 * If we're allocating the block, make sure the label block (lb)
992 	 * goes to disk before the data block (b) itself.  This is to help
993 	 * the blocks that in turn depend on b.
994 	 *
995 	 * Suppose bx depends on (must be written out after) b.
996 	 * Once we write b we'll think it's safe to write bx.
997 	 * Bx can't get at b unless it has a valid label, though.
998 	 *
999 	 * Allocation is the only case in which having a current label
1000 	 * is vital because:
1001 	 *
1002 	 *	- l.type is set at allocation and never changes.
1003 	 *	- l.tag is set at allocation and never changes.
1004 	 *	- l.state is not checked when we load blocks.
1005 	 *	- the archiver cares deeply about l.state being
1006 	 *		BaActive vs. BaCopied, but that's handled
1007 	 *		by direct calls to _blockSetLabel.
1008 	 */
1009 
1010 	if(allocating)
1011 		blockDependency(b, lb, -1, nil, nil);
1012 	blockPut(lb);
1013 	return 1;
1014 }
1015 
1016 /*
1017  * Record that bb must be written out before b.
1018  * If index is given, we're about to overwrite the score/entry
1019  * at that index in the block.  Save the old value so we
1020  * can write a safer ``old'' version of the block if pressed.
1021  */
1022 void
1023 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1024 {
1025 	BList *p;
1026 
1027 	if(bb->iostate == BioClean)
1028 		return;
1029 
1030 	/*
1031 	 * Dependencies for blocks containing Entry structures
1032 	 * or scores must always be explained.  The problem with
1033 	 * only explaining some of them is this.  Suppose we have two
1034 	 * dependencies for the same field, the first explained
1035 	 * and the second not.  We try to write the block when the first
1036 	 * dependency is not written but the second is.  We will roll back
1037 	 * the first change even though the second trumps it.
1038 	 */
1039 	if(index == -1 && bb->part == PartData)
1040 		assert(b->l.type == BtData);
1041 
1042 	if(bb->iostate != BioDirty){
1043 		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
1044 			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1045 		abort();
1046 	}
1047 
1048 	p = blistAlloc(bb);
1049 	if(p == nil)
1050 		return;
1051 
1052 	assert(bb->iostate == BioDirty);
1053 if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1054 
1055 	p->part = bb->part;
1056 	p->addr = bb->addr;
1057 	p->type = bb->l.type;
1058 	p->vers = bb->vers;
1059 	p->index = index;
1060 	if(p->index >= 0){
1061 		/*
1062 		 * This test would just be b->l.type==BtDir except
1063 		 * we need to exclude the super block.
1064 		 */
1065 		if(b->l.type == BtDir && b->part == PartData)
1066 			entryPack(e, p->old.entry, 0);
1067 		else
1068 			memmove(p->old.score, score, VtScoreSize);
1069 	}
1070 	p->next = b->prior;
1071 	b->prior = p;
1072 }
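/*
 * Illustrative sequence for a pointer block (slot index i is hypothetical):
 * a caller replacing pointer slot i of parent b with child bb's score
 * does roughly
 *
 *	blockDependency(b, bb, i, b->data + i*VtScoreSize, nil);
 *	memmove(b->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(b);
 *
 * If the flush thread later wants to write b before bb has reached disk,
 * blockRollback (below) uses the saved old score to write a version of b
 * that still points at the old child.
 */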
1073 
1074 /*
1075  * Mark an in-memory block as dirty.  If there are too many
1076  * dirty blocks, start writing some out to disk.
1077  *
1078  * If there were way too many dirty blocks, we used to
1079  * try to do some flushing ourselves, but it's just too dangerous --
1080  * it implies that the callers cannot have any of our priors locked,
1081  * but this is hard to avoid in some cases.
1082  */
1083 int
1084 blockDirty(Block *b)
1085 {
1086 	Cache *c;
1087 
1088 	c = b->c;
1089 
1090 	assert(b->part != PartVenti);
1091 
1092 	if(b->iostate == BioDirty)
1093 		return 1;
1094 	assert(b->iostate == BioClean);
1095 
1096 	vtLock(c->dirtylk);
1097 	vtLock(c->lk);
1098 	b->iostate = BioDirty;
1099 	c->ndirty++;
1100 	if(c->ndirty > (c->maxdirty>>1))
1101 		vtWakeup(c->flush);
1102 	vtUnlock(c->lk);
1103 	vtUnlock(c->dirtylk);
1104 
1105 	return 1;
1106 }
1107 
1108 /*
1109  * We've decided to write out b.  Maybe b has some pointers to blocks
1110  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
1111  * copy of b that is safe to write out.  (diskThread will make sure the block
1112  * remains marked as dirty.)
1113  */
1114 uchar *
1115 blockRollback(Block *b, uchar *buf)
1116 {
1117 	u32int addr;
1118 	BList *p;
1119 	Super super;
1120 
1121 	/* easy case */
1122 	if(b->prior == nil)
1123 		return b->data;
1124 
1125 	memmove(buf, b->data, b->c->size);
1126 	for(p=b->prior; p; p=p->next){
1127 		/*
1128 		 * we know p->index >= 0 because blockWrite has vetted this block for us.
1129 		 */
1130 		assert(p->index >= 0);
1131 		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1132 		if(b->part == PartSuper){
1133 			assert(p->index == 0);
1134 			superUnpack(&super, buf);
1135 			addr = globalToLocal(p->old.score);
1136 			if(addr == NilBlock){
1137 				fprint(2, "%s: rolling back super block: "
1138 					"bad replacement addr %V\n",
1139 					argv0, p->old.score);
1140 				abort();
1141 			}
1142 			super.active = addr;
1143 			superPack(&super, buf);
1144 			continue;
1145 		}
1146 		if(b->l.type == BtDir)
1147 			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1148 		else
1149 			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1150 	}
1151 	return buf;
1152 }
1153 
1154 /*
1155  * Try to write block b.
1156  * If b depends on other blocks:
1157  *
1158  *	If the block has been written out, remove the dependency.
1159  *	If the dependency is replaced by a more recent dependency,
1160  *		throw it out.
1161  *	If we know how to write out an old version of b that doesn't
1162  *		depend on it, do that.
1163  *
1164  *	Otherwise, bail.
1165  */
1166 int
1167 blockWrite(Block *b)
1168 {
1169 	uchar *dmap;
1170 	Cache *c;
1171 	BList *p, **pp;
1172 	Block *bb;
1173 	int lockfail;
1174 
1175 	c = b->c;
1176 
1177 	if(b->iostate != BioDirty)
1178 		return 1;
1179 
1180 	dmap = b->dmap;
1181 	memset(dmap, 0, c->ndmap);
1182 	pp = &b->prior;
1183 	for(p=*pp; p; p=*pp){
1184 		if(p->index >= 0){
1185 			/* more recent dependency has succeeded; this one can go */
1186 			if(dmap[p->index/8] & (1<<(p->index%8)))
1187 				goto ignblock;
1188 		}
1189 
1190 		lockfail = 0;
1191 		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1192 		if(bb == nil){
1193 			if(lockfail)
1194 				return 0;
1195 			/* block not in cache => was written already */
1196 			if(p->index >= 0)
				dmap[p->index/8] |= 1<<(p->index%8);
1197 			goto ignblock;
1198 		}
1199 
1200 		/*
1201 		 * same version of block is still in cache.
1202 		 *
1203 		 * the assertion is true because the block still has version p->vers,
1204 		 * which means it hasn't been written out since we last saw it.
1205 		 */
1206 		if(bb->iostate != BioDirty){
1207 			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
1208 				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1209 			/* probably BioWriting if it happens? */
1210 			if(bb->iostate == BioClean)
1211 				goto ignblock;
1212 		}
1213 
1214 		blockPut(bb);
1215 
1216 		if(p->index < 0){
1217 			/*
1218 			 * We don't know how to temporarily undo
1219 			 * b's dependency on bb, so just don't write b yet.
1220 			 */
1221 			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1222 				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1223 			return 0;
1224 		}
1225 		/* keep walking down the list */
1226 		pp = &p->next;
1227 		continue;
1228 
1229 ignblock:
1230 		*pp = p->next;
1231 		blistFree(c, p);
1232 		continue;
1233 	}
1234 
1235 	/*
1236 	 * DiskWrite must never be called with a double-locked block.
1237 	 * This call to diskWrite is okay because blockWrite is only called
1238 	 * from the cache flush thread, which never double-locks a block.
1239 	 */
1240 	diskWrite(c->disk, b);
1241 	return 1;
1242 }
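/*
 * dmap is a small bitmap with one bit per possible dependency index;
 * bit i is set once the dependency covering slot i is known to be
 * satisfied, so older priors for the same slot can be dropped.
 * cacheAlloc sizes it as (c->size/20 + 7)/8 bytes, presumably because
 * the smallest per-slot object is a score (VtScoreSize, 20 bytes).
 */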
1243 
1244 /*
1245  * Change the I/O state of block b.
1246  * Just an assignment except for magic in
1247  * switch statement (read comments there).
1248  */
1249 void
1250 blockSetIOState(Block *b, int iostate)
1251 {
1252 	int dowakeup;
1253 	Cache *c;
1254 	BList *p, *q;
1255 
1256 if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1257 
1258 	c = b->c;
1259 
1260 	dowakeup = 0;
1261 	switch(iostate){
1262 	default:
1263 		abort();
1264 	case BioEmpty:
1265 		assert(!b->uhead);
1266 		break;
1267 	case BioLabel:
1268 		assert(!b->uhead);
1269 		break;
1270 	case BioClean:
1271 		bwatchDependency(b);
1272 		/*
1273 		 * If b->prior is set, it means a write just finished.
1274 		 * The prior list isn't needed anymore.
1275 		 */
1276 		for(p=b->prior; p; p=q){
1277 			q = p->next;
1278 			blistFree(c, p);
1279 		}
1280 		b->prior = nil;
1281 		/*
1282 		 * Freeing a block or just finished a write.
1283 		 * Move the blocks from the per-block unlink
1284 		 * queue to the cache unlink queue.
1285 		 */
1286 		if(b->iostate == BioDirty || b->iostate == BioWriting){
1287 			vtLock(c->lk);
1288 			c->ndirty--;
1289 			b->iostate = iostate;	/* change here to keep in sync with ndirty */
1290 			b->vers = c->vers++;
1291 			if(b->uhead){
1292 				/* add unlink blocks to unlink queue */
1293 				if(c->uhead == nil){
1294 					c->uhead = b->uhead;
1295 					vtWakeup(c->unlink);
1296 				}else
1297 					c->utail->next = b->uhead;
1298 				c->utail = b->utail;
1299 				b->uhead = nil;
1300 			}
1301 			vtUnlock(c->lk);
1302 		}
1303 		assert(!b->uhead);
1304 		dowakeup = 1;
1305 		break;
1306 	case BioDirty:
1307 		/*
1308 		 * Wrote out an old version of the block (see blockRollback).
1309 		 * Bump a version count, leave it dirty.
1310 		 */
1311 		if(b->iostate == BioWriting){
1312 			vtLock(c->lk);
1313 			b->vers = c->vers++;
1314 			vtUnlock(c->lk);
1315 			dowakeup = 1;
1316 		}
1317 		break;
1318 	case BioReading:
1319 	case BioWriting:
1320 		/*
1321 		 * Adding block to disk queue.  Bump reference count.
1322 		 * diskThread decs the count later by calling blockPut.
1323 		 * This is here because we need to lock c->lk to
1324 		 * manipulate the ref count.
1325 		 */
1326 		vtLock(c->lk);
1327 		b->ref++;
1328 		vtUnlock(c->lk);
1329 		break;
1330 	case BioReadError:
1331 	case BioVentiError:
1332 		/*
1333 		 * Oops.
1334 		 */
1335 		dowakeup = 1;
1336 		break;
1337 	}
1338 	b->iostate = iostate;
1339 	/*
1340 	 * Now that the state has changed, we can wake the waiters.
1341 	 */
1342 	if(dowakeup)
1343 		vtWakeupAll(b->ioready);
1344 }
1345 
1346 /*
1347  * The active file system is a tree of blocks.
1348  * When we add snapshots to the mix, the entire file system
1349  * becomes a dag and thus requires a bit more care.
1350  *
1351  * The life of the file system is divided into epochs.  A snapshot
1352  * ends one epoch and begins the next.  Each file system block
1353  * is marked with the epoch in which it was created (b.epoch).
1354  * When the block is unlinked from the file system (closed), it is marked
1355  * with the epoch in which it was removed (b.epochClose).
1356  * Once we have discarded or archived all snapshots up to
1357  * b.epochClose, we can reclaim the block.
1358  *
1359  * If a block was created in a past epoch but is not yet closed,
1360  * it is treated as copy-on-write.  Of course, in order to insert the
1361  * new pointer into the tree, the parent must be made writable,
1362  * and so on up the tree.  The recursion stops because the root
1363  * block is always writable.
1364  *
1365  * If blocks are never closed, they will never be reused, and
1366  * we will run out of disk space.  But marking a block as closed
1367  * requires some care about dependencies and write orderings.
1368  *
1369  * (1) If a block p points at a copy-on-write block b and we
1370  * copy b to create bb, then p must be written out after bb and
1371  * lbb (bb's label block).
1372  *
1373  * (2) We have to mark b as closed, but only after we switch
1374  * the pointer, so lb must be written out after p.  In fact, we
1375  * can't even update the in-memory copy, or the cache might
1376  * mistakenly give out b for reuse before p gets written.
1377  *
1378  * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
1379  * The caller is expected to record a "p after bb" dependency
1380  * to finish (1), and also expected to call blockRemoveLink
1381  * to arrange for (2) to happen once p is written.
1382  *
1383  * Until (2) happens, some pieces of the code (e.g., the archiver)
1384  * still need to know whether a block has been copied, so we
1385  * set the BsCopied bit in the label and force that to disk *before*
1386  * the copy gets written out.
1387  */
1388 Block*
1389 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
1390 {
1391 	Block *bb, *lb;
1392 	Label l;
1393 
1394 	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
1395 		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
1396 			argv0, b->addr, &b->l, elo, ehi);
1397 
1398 	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
1399 	if(bb == nil){
1400 		blockPut(b);
1401 		return nil;
1402 	}
1403 
1404 	/*
1405 	 * Update label so we know the block has been copied.
1406 	 * (It will be marked closed once it has been unlinked from
1407 	 * the tree.)  This must follow cacheAllocBlock since we
1408 	 * can't be holding onto lb when we call cacheAllocBlock.
1409 	 */
1410 	if((b->l.state&BsCopied)==0)
1411 	if(b->part == PartData){	/* not the superblock */
1412 		l = b->l;
1413 		l.state |= BsCopied;
1414 		lb = _blockSetLabel(b, &l);
1415 		if(lb == nil){
1416 			/* can't set label => can't copy block */
1417 			blockPut(b);
1418 			l.type = BtMax;
1419 			l.state = BsFree;
1420 			l.epoch = 0;
1421 			l.epochClose = 0;
1422 			l.tag = 0;
1423 			blockSetLabel(bb, &l, 0);
1424 			blockPut(bb);
1425 			return nil;
1426 		}
1427 		blockDependency(bb, lb, -1, nil, nil);
1428 		blockPut(lb);
1429 	}
1430 
1431 	memmove(bb->data, b->data, b->c->size);
1432 	blockDirty(bb);
1433 	blockPut(b);
1434 	return bb;
1435 }
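/*
 * Putting the rules above together, a caller that swings parent p's
 * pointer from b to the copy bb does, in sketch form (slot index i and
 * the saved fields are illustrative):
 *
 *	addr = b->addr; type = b->l.type; tag = b->l.tag;
 *	bb = blockCopy(b, tag, ehi, elo);	// also records "bb after lbb"
 *	blockDependency(p, bb, i, p->data + i*VtScoreSize, nil);	// (1)
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 *	blockRemoveLink(p, addr, type, tag, 0);	// (2), once p reaches disk
 *
 * b itself must not be used after blockCopy returns, since blockCopy has
 * already called blockPut on it; hence the fields saved up front.
 */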
1436 
1437 /*
1438  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1439  * If recurse is set, we are unlinking all of bb's children as well.
1440  *
1441  * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
1442  * the relevant information to b's list of unlinked blocks.  Once b is written,
1443  * the list will be queued for processing.
1444  *
1445  * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
1446  */
1447 void
1448 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
1449 {
1450 	BList *p, **pp, bl;
1451 
1452 	/* remove bb from prior list */
1453 	for(pp=&b->prior; (p=*pp)!=nil; ){
1454 		if(p->part == PartData && p->addr == addr){
1455 			*pp = p->next;
1456 			blistFree(b->c, p);
1457 		}else
1458 			pp = &p->next;
1459 	}
1460 
1461 	bl.part = PartData;
1462 	bl.addr = addr;
1463 	bl.type = type;
1464 	bl.tag = tag;
1465 	if(b->l.epoch == 0)
1466 		assert(b->part == PartSuper);
1467 	bl.epoch = b->l.epoch;
1468 	bl.next = nil;
1469 	bl.recurse = recurse;
1470 
1471 	p = blistAlloc(b);
1472 	if(p == nil){
1473 		/*
1474 		 * We were out of blists so blistAlloc wrote b to disk.
1475 		 */
1476 		doRemoveLink(b->c, &bl);
1477 		return;
1478 	}
1479 
1480 	/* Uhead is only processed when the block goes from Dirty -> Clean */
1481 	assert(b->iostate == BioDirty);
1482 
1483 	*p = bl;
1484 	if(b->uhead == nil)
1485 		b->uhead = p;
1486 	else
1487 		b->utail->next = p;
1488 	b->utail = p;
1489 }
1490 
1491 /*
1492  * Process removal of a single block and perhaps its children.
1493  */
1494 static void
1495 doRemoveLink(Cache *c, BList *p)
1496 {
1497 	int i, n, recurse;
1498 	u32int a;
1499 	Block *b;
1500 	Label l;
1501 	BList bl;
1502 
1503 	recurse = (p->recurse && p->type != BtData && p->type != BtDir);
1504 
1505 	/*
1506 	 * We're not really going to overwrite b, but if we're not
1507 	 * going to look at its contents, there is no point in reading
1508 	 * them from the disk.
1509 	 */
1510 	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
1511 	if(b == nil)
1512 		return;
1513 
1514 	/*
1515 	 * When we're unlinking from the superblock, close with the next epoch.
1516 	 */
1517 	if(p->epoch == 0)
1518 		p->epoch = b->l.epoch+1;
1519 
1520 	/* sanity check */
1521 	if(b->l.epoch > p->epoch){
1522 		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
1523 			argv0, b->l.epoch, p->epoch);
1524 		blockPut(b);
1525 		return;
1526 	}
1527 
1528 	if(recurse){
1529 		n = c->size / VtScoreSize;
1530 		for(i=0; i<n; i++){
1531 			a = globalToLocal(b->data + i*VtScoreSize);
1532 			if(a == NilBlock || !readLabel(c, &l, a))
1533 				continue;
1534 			if(l.state&BsClosed)
1535 				continue;
1536 			/*
1537 			 * If stack space becomes an issue...
1538 			p->addr = a;
1539 			p->type = l.type;
1540 			p->tag = l.tag;
1541 			doRemoveLink(c, p);
1542 			 */
1543 
1544 			bl.part = PartData;
1545 			bl.addr = a;
1546 			bl.type = l.type;
1547 			bl.tag = l.tag;
1548 			bl.epoch = p->epoch;
1549 			bl.next = nil;
1550 			bl.recurse = 1;
1551 			/* give up the block lock - share with others */
1552 			blockPut(b);
1553 			doRemoveLink(c, &bl);
1554 			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
1555 			if(b == nil){
1556 				fprint(2, "%s: warning: lost block in doRemoveLink\n",
1557 					argv0);
1558 				return;
1559 			}
1560 		}
1561 	}
1562 
1563 	l = b->l;
1564 	l.state |= BsClosed;
1565 	l.epochClose = p->epoch;
1566 	if(l.epochClose == l.epoch){
1567 		vtLock(c->fl->lk);
1568 		if(l.epoch == c->fl->epochLow)
1569 			c->fl->nused--;
1570 		blockSetLabel(b, &l, 0);
1571 		vtUnlock(c->fl->lk);
1572 	}else
1573 		blockSetLabel(b, &l, 0);
1574 	blockPut(b);
1575 }
1576 
1577 /*
1578  * Allocate a BList so that we can record a dependency
1579  * or queue a removal related to block b.
1580  * If we can't find a BList, we write out b and return nil.
1581  */
1582 static BList *
1583 blistAlloc(Block *b)
1584 {
1585 	Cache *c;
1586 	BList *p;
1587 
1588 	if(b->iostate != BioDirty){
1589 		/*
1590 		 * should not happen anymore -
1591 		 * blockDirty used to flush but no longer does.
1592 		 */
1593 		assert(b->iostate == BioClean);
1594 		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
1595 		return nil;
1596 	}
1597 
1598 	c = b->c;
1599 	vtLock(c->lk);
1600 	if(c->blfree == nil){
1601 		/*
1602 		 * No free BLists.  What are our options?
1603 		 */
1604 
1605 		/* Block has no priors? Just write it. */
1606 		if(b->prior == nil){
1607 			vtUnlock(c->lk);
1608 			diskWriteAndWait(c->disk, b);
1609 			return nil;
1610 		}
1611 
1612 		/*
1613 		 * Wake the flush thread, which will hopefully free up
1614 		 * some BLists for us.  We used to flush a block from
1615 		 * our own prior list and reclaim that BList, but this is
1616 		 * a no-no: some of the blocks on our prior list may
1617 		 * be locked by our caller.  Or maybe their label blocks
1618 		 * are locked by our caller.  In any event, it's too hard
1619 		 * to make sure we can do I/O for ourselves.  Instead,
1620 		 * we assume the flush thread will find something.
1621 		 * (The flush thread never blocks waiting for a block,
1622 		 * so it can't deadlock like we can.)
1623 		 */
1624 		while(c->blfree == nil){
1625 			vtWakeup(c->flush);
1626 			vtSleep(c->blrend);
1627 			if(c->blfree == nil)
1628 				fprint(2, "%s: flushing for blists\n", argv0);
1629 		}
1630 	}
1631 
1632 	p = c->blfree;
1633 	c->blfree = p->next;
1634 	vtUnlock(c->lk);
1635 	return p;
1636 }
1637 
1638 static void
1639 blistFree(Cache *c, BList *bl)
1640 {
1641 	vtLock(c->lk);
1642 	bl->next = c->blfree;
1643 	c->blfree = bl;
1644 	vtWakeup(c->blrend);
1645 	vtUnlock(c->lk);
1646 }
1647 
1648 char*
1649 bsStr(int state)
1650 {
1651 	static char s[100];
1652 
1653 	if(state == BsFree)
1654 		return "Free";
1655 	if(state == BsBad)
1656 		return "Bad";
1657 
1658 	sprint(s, "%x", state);
1659 	if(!(state&BsAlloc))
1660 		strcat(s, ",Free");	/* should not happen */
1661 	if(state&BsCopied)
1662 		strcat(s, ",Copied");
1663 	if(state&BsVenti)
1664 		strcat(s, ",Venti");
1665 	if(state&BsClosed)
1666 		strcat(s, ",Closed");
1667 	return s;
1668 }
1669 
1670 char *
1671 bioStr(int iostate)
1672 {
1673 	switch(iostate){
1674 	default:
1675 		return "Unknown!!";
1676 	case BioEmpty:
1677 		return "Empty";
1678 	case BioLabel:
1679 		return "Label";
1680 	case BioClean:
1681 		return "Clean";
1682 	case BioDirty:
1683 		return "Dirty";
1684 	case BioReading:
1685 		return "Reading";
1686 	case BioWriting:
1687 		return "Writing";
1688 	case BioReadError:
1689 		return "ReadError";
1690 	case BioVentiError:
1691 		return "VentiError";
1692 	case BioMax:
1693 		return "Max";
1694 	}
1695 }
1696 
1697 static char *bttab[] = {
1698 	"BtData",
1699 	"BtData+1",
1700 	"BtData+2",
1701 	"BtData+3",
1702 	"BtData+4",
1703 	"BtData+5",
1704 	"BtData+6",
1705 	"BtData+7",
1706 	"BtDir",
1707 	"BtDir+1",
1708 	"BtDir+2",
1709 	"BtDir+3",
1710 	"BtDir+4",
1711 	"BtDir+5",
1712 	"BtDir+6",
1713 	"BtDir+7",
1714 };
1715 
1716 char*
1717 btStr(int type)
1718 {
1719 	if(type < nelem(bttab))
1720 		return bttab[type];
1721 	return "unknown";
1722 }
1723 
1724 int
1725 labelFmt(Fmt *f)
1726 {
1727 	Label *l;
1728 
1729 	l = va_arg(f->args, Label*);
1730 	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1731 		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1732 }
1733 
1734 int
1735 scoreFmt(Fmt *f)
1736 {
1737 	uchar *v;
1738 	int i;
1739 	u32int addr;
1740 
1741 	v = va_arg(f->args, uchar*);
1742 	if(v == nil){
1743 		fmtprint(f, "*");
1744 	}else if((addr = globalToLocal(v)) != NilBlock)
1745 		fmtprint(f, "0x%.8ux", addr);
1746 	else{
1747 		for(i = 0; i < VtScoreSize; i++)
1748 			fmtprint(f, "%2.2ux", v[i]);
1749 	}
1750 
1751 	return 0;
1752 }
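/*
 * labelFmt and scoreFmt back the %L and %V print verbs used throughout
 * this file; they are presumably registered with fmtinstall elsewhere in
 * fossil, since no fmtinstall call appears here.
 */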
1753 
1754 static int
1755 upHeap(int i, Block *b)
1756 {
1757 	Block *bb;
1758 	u32int now;
1759 	int p;
1760 	Cache *c;
1761 
1762 	c = b->c;
1763 	now = c->now;
1764 	for(; i != 0; i = p){
1765 		p = (i - 1) >> 1;
1766 		bb = c->heap[p];
1767 		if(b->used - now >= bb->used - now)
1768 			break;
1769 		c->heap[i] = bb;
1770 		bb->heap = i;
1771 	}
1772 	c->heap[i] = b;
1773 	b->heap = i;
1774 
1775 	return i;
1776 }
1777 
1778 static int
1779 downHeap(int i, Block *b)
1780 {
1781 	Block *bb;
1782 	u32int now;
1783 	int k;
1784 	Cache *c;
1785 
1786 	c = b->c;
1787 	now = c->now;
1788 	for(; ; i = k){
1789 		k = (i << 1) + 1;
1790 		if(k >= c->nheap)
1791 			break;
1792 		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1793 			k++;
1794 		bb = c->heap[k];
1795 		if(b->used - now <= bb->used - now)
1796 			break;
1797 		c->heap[i] = bb;
1798 		bb->heap = i;
1799 	}
1800 	c->heap[i] = b;
1801 	b->heap = i;
1802 	return i;
1803 }
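/*
 * Note that upHeap, downHeap and cacheCheck all compare ages as
 * b->used - now rather than comparing b->used directly.  Doing the
 * unsigned subtraction first keeps the ordering correct even after
 * c->now wraps around, provided no block's age spans more than half
 * the counter range.
 */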
1804 
1805 /*
1806  * Delete a block from the heap.
1807  * Called with c->lk held.
1808  */
1809 static void
1810 heapDel(Block *b)
1811 {
1812 	int i, si;
1813 	Cache *c;
1814 
1815 	c = b->c;
1816 
1817 	si = b->heap;
1818 	if(si == BadHeap)
1819 		return;
1820 	b->heap = BadHeap;
1821 	c->nheap--;
1822 	if(si == c->nheap)
1823 		return;
1824 	b = c->heap[c->nheap];
1825 	i = upHeap(si, b);
1826 	if(i == si)
1827 		downHeap(i, b);
1828 }
1829 
1830 /*
1831  * Insert a block into the heap.
1832  * Called with c->lk held.
1833  */
1834 static void
1835 heapIns(Block *b)
1836 {
1837 	assert(b->heap == BadHeap);
1838 	upHeap(b->c->nheap++, b);
1839 	vtWakeup(b->c->heapwait);
1840 }
1841 
1842 /*
1843  * Get just the label for a block.
1844  */
1845 int
1846 readLabel(Cache *c, Label *l, u32int addr)
1847 {
1848 	int lpb;
1849 	Block *b;
1850 	u32int a;
1851 
1852 	lpb = c->size / LabelSize;
1853 	a = addr / lpb;
1854 	b = cacheLocal(c, PartLabel, a, OReadOnly);
1855 	if(b == nil){
1856 		blockPut(b);
1857 		return 0;
1858 	}
1859 
1860 	if(!labelUnpack(l, b->data, addr%lpb)){
1861 		blockPut(b);
1862 		return 0;
1863 	}
1864 	blockPut(b);
1865 	return 1;
1866 }
1867 
1868 /*
1869  * Process unlink queue.
1870  * Called with c->lk held.
1871  */
1872 static void
1873 unlinkBody(Cache *c)
1874 {
1875 	BList *p;
1876 
1877 	while(c->uhead != nil){
1878 		p = c->uhead;
1879 		c->uhead = p->next;
1880 		vtUnlock(c->lk);
1881 		doRemoveLink(c, p);
1882 		vtLock(c->lk);
1883 		p->next = c->blfree;
1884 		c->blfree = p;
1885 	}
1886 }
1887 
1888 /*
1889  * Occasionally unlink the blocks on the cache unlink queue.
1890  */
1891 static void
1892 unlinkThread(void *a)
1893 {
1894 	Cache *c = a;
1895 
1896 	vtThreadSetName("unlink");
1897 
1898 	vtLock(c->lk);
1899 	for(;;){
1900 		while(c->uhead == nil && c->die == nil)
1901 			vtSleep(c->unlink);
1902 		if(c->die != nil)
1903 			break;
1904 		unlinkBody(c);
1905 	}
1906 	c->ref--;
1907 	vtWakeup(c->die);
1908 	vtUnlock(c->lk);
1909 }
1910 
1911 static int
1912 baddrCmp(void *a0, void *a1)
1913 {
1914 	BAddr *b0, *b1;
1915 	b0 = a0;
1916 	b1 = a1;
1917 
1918 	if(b0->part < b1->part)
1919 		return -1;
1920 	if(b0->part > b1->part)
1921 		return 1;
1922 	if(b0->addr < b1->addr)
1923 		return -1;
1924 	if(b0->addr > b1->addr)
1925 		return 1;
1926 	return 0;
1927 }
1928 
1929 /*
1930  * Scan the block list for dirty blocks; add them to the list c->baddr.
1931  */
1932 static void
1933 flushFill(Cache *c)
1934 {
1935 	int i, ndirty;
1936 	BAddr *p;
1937 	Block *b;
1938 
1939 	vtLock(c->lk);
1940 	if(c->ndirty == 0){
1941 		vtUnlock(c->lk);
1942 		return;
1943 	}
1944 
1945 	p = c->baddr;
1946 	ndirty = 0;
1947 	for(i=0; i<c->nblocks; i++){
1948 		b = c->blocks + i;
1949 		if(b->part == PartError)
1950 			continue;
1951 		if(b->iostate == BioDirty || b->iostate == BioWriting)
1952 			ndirty++;
1953 		if(b->iostate != BioDirty)
1954 			continue;
1955 		p->part = b->part;
1956 		p->addr = b->addr;
1957 		p->vers = b->vers;
1958 		p++;
1959 	}
1960 	if(ndirty != c->ndirty){
1961 		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
1962 			argv0, c->ndirty, ndirty);
1963 		c->ndirty = ndirty;
1964 	}
1965 	vtUnlock(c->lk);
1966 
1967 	c->bw = p - c->baddr;
1968 	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1969 }
1970 
1971 /*
1972  * This is not thread safe, i.e. it can't be called from multiple threads.
1973  *
1974  * It's okay how we use it, because it only gets called from
1975  * the flushThread, and from cacheFree, but only after
1976  * cacheFree has killed off the flushThread.
1977  */
1978 static int
1979 cacheFlushBlock(Cache *c)
1980 {
1981 	Block *b;
1982 	BAddr *p;
1983 	int lockfail, nfail;
1984 
1985 	nfail = 0;
1986 	for(;;){
1987 		if(c->br == c->be){
1988 			if(c->bw == 0 || c->bw == c->be)
1989 				flushFill(c);
1990 			c->br = 0;
1991 			c->be = c->bw;
1992 			c->bw = 0;
1993 			c->nflush = 0;
1994 		}
1995 
1996 		if(c->br == c->be)
1997 			return 0;
1998 		p = c->baddr + c->br;
1999 		c->br++;
2000 		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
2001 
2002 		if(b && blockWrite(b)){
2003 			c->nflush++;
2004 			blockPut(b);
2005 			return 1;
2006 		}
2007 		if(b)
2008 			blockPut(b);
2009 
2010 		/*
2011 		 * Why didn't we write the block?
2012 		 */
2013 
2014 		/* Block already written out */
2015 		if(b == nil && !lockfail)
2016 			continue;
2017 
2018 		/* Failed to acquire lock; sleep if happens a lot. */
2019 		if(lockfail && ++nfail > 100){
2020 			sleep(500);
2021 			nfail = 0;
2022 		}
2023 		/* Requeue block. */
2024 		if(c->bw < c->be)
2025 			c->baddr[c->bw++] = *p;
2026 	}
2027 }
2028 
2029 /*
2030  * Occasionally flush dirty blocks from memory to the disk.
2031  */
2032 static void
2033 flushThread(void *a)
2034 {
2035 	Cache *c = a;
2036 	int i;
2037 
2038 	vtThreadSetName("flush");
2039 	vtLock(c->lk);
2040 	while(c->die == nil){
2041 		vtSleep(c->flush);
2042 		vtUnlock(c->lk);
2043 		for(i=0; i<FlushSize; i++)
2044 			if(!cacheFlushBlock(c)){
2045 				/*
2046 				 * If i==0, could be someone is waking us repeatedly
2047 				 * to flush the cache but there's no work to do.
2048 				 * Pause a little.
2049 				 */
2050 				if(i==0){
2051 					// fprint(2, "%s: flushthread found "
2052 					//	"nothing to flush - %d dirty\n",
2053 					//	argv0, c->ndirty);
2054 					sleep(250);
2055 				}
2056 				break;
2057 			}
2058 		if(i==0 && c->ndirty){
2059 			/*
2060 			 * All the blocks are being written right now -- there's nothing to do.
2061 			 * We might be spinning with cacheFlush though -- he'll just keep
2062 			 * kicking us until c->ndirty goes down.  Probably we should sleep
2063 			 * on something that the diskThread can kick, but for now we'll
2064 			 * just pause for a little while waiting for disks to finish.
2065 			 */
2066 			sleep(100);
2067 		}
2068 		vtLock(c->lk);
2069 		vtWakeupAll(c->flushwait);
2070 	}
2071 	c->ref--;
2072 	vtWakeup(c->die);
2073 	vtUnlock(c->lk);
2074 }
2075 
2076 /*
2077  * Flush the cache.
2078  */
2079 void
2080 cacheFlush(Cache *c, int wait)
2081 {
2082 	/*
2083 	 * Lock c->dirtylk so that more blocks aren't being dirtied
2084 	 * while we try to write out what's already here.
2085 	 * Otherwise we might not ever finish!
2086 	 */
2087 	vtLock(c->dirtylk);
2088 	vtLock(c->lk);
2089 	if(wait){
2090 		while(c->ndirty){
2091 		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
2092 		//		c->ndirty, c->uhead);
2093 			vtWakeup(c->flush);
2094 			vtSleep(c->flushwait);
2095 		}
2096 	//	consPrint("cacheFlush: done: %d dirty blocks, uhead %p\n", c->ndirty, c->uhead);
2097 	}else if(c->ndirty)
2098 		vtWakeup(c->flush);
2099 	vtUnlock(c->lk);
2100 	vtUnlock(c->dirtylk);
2101 }
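/*
 * Sketch of the two ways this is used: cacheSync (below) calls
 * cacheFlush(c, 0) every 30 seconds just to nudge the flush thread,
 * while a caller that must know the data has reached disk (a sync
 * request, say) calls cacheFlush(c, 1) and blocks until ndirty
 * reaches zero.
 */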
2102 
2103 /*
2104  * Kick the flushThread every 30 seconds.
2105  */
2106 static void
2107 cacheSync(void *v)
2108 {
2109 	Cache *c;
2110 
2111 	c = v;
2112 	cacheFlush(c, 0);
2113 }
2114