xref: /plan9/sys/src/cmd/fossil/cache.c (revision 2a7824990d644563b93ed8d4abf1407c40b2087a)
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5 
6 #include "9.h"	/* for cacheFlush */
7 
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10 
11 enum {
12 	BadHeap = ~0,
13 };
14 
15 /*
16  * Store data to the memory cache in c->size blocks
17  * with the block zero extended to fill it out.  When writing to
18  * Venti, the block will be zero truncated.  The walker will also check
19  * that the block fits within psize or dsize as the case may be.
20  */
21 
22 struct Cache
23 {
24 	VtLock	*lk;
25 	VtLock	*dirtylk;
26 	int 	ref;
27 	int	mode;
28 
29 	Disk 	*disk;
30 	int	size;			/* block size */
31 	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
32 	VtSession *z;
33 	u32int	now;			/* ticks for usage timestamps */
34 	Block	**heads;		/* hash table for finding address */
35 	int	nheap;			/* number of available victims */
36 	Block	**heap;			/* heap for locating victims */
37 	long	nblocks;		/* number of blocks allocated */
38 	Block	*blocks;		/* array of block descriptors */
39 	u8int	*mem;			/* memory for all block data & blists */
40 
41 	BList	*blfree;
42 	VtRendez *blrend;
43 
44 	int 	ndirty;			/* number of dirty blocks in the cache */
45 	int 	maxdirty;		/* max number of dirty blocks */
46 	u32int	vers;
47 
48 	long hashSize;
49 
50 	FreeList *fl;
51 
52 	VtRendez *die;			/* daemon threads should die when != nil */
53 
54 	VtRendez *flush;
55 	VtRendez *flushwait;
56 	VtRendez *heapwait;
57 	BAddr *baddr;
58 	int bw, br, be;
59 	int nflush;
60 
61 	Periodic *sync;
62 
63 	/* unlink daemon */
64 	BList *uhead;
65 	BList *utail;
66 	VtRendez *unlink;
67 
68 	/* block counts */
69 	int nused;
70 	int ndisk;
71 };
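/*
 * Rough locking picture, inferred from how the locks are used below
 * (a summary, not a separate specification): c->lk guards the hash
 * chains, the heap, reference counts, the BList free list, ndirty and
 * the unlink queue; each Block's b->lk guards that block's contents
 * and iostate; c->dirtylk is held while marking blocks dirty so that
 * cacheFlush can hold it to keep new dirt from arriving; fl->lk guards
 * the free-list scan state.
 */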
72 
73 struct BList {
74 	int part;
75 	u32int addr;
76 	uchar type;
77 	u32int tag;
78 	u32int epoch;
79 	u32int vers;
80 
81 	int recurse;	/* for block unlink */
82 
83 	/* for roll back */
84 	int index;			/* -1 indicates not valid */
85 	union {
86 		uchar score[VtScoreSize];
87 		uchar entry[VtEntrySize];
88 	} old;
89 	BList *next;
90 };
91 
92 struct BAddr {
93 	int part;
94 	u32int addr;
95 	u32int vers;
96 };
97 
98 struct FreeList {
99 	VtLock *lk;
100 	u32int last;		/* last block allocated */
101 	u32int end;		/* end of data partition */
102 	u32int nused;		/* number of used blocks */
103 	u32int epochLow;	/* low epoch when last updated nused */
104 };
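/*
 * Despite the name, FreeList keeps no list of free blocks.  It is a
 * rotating scan pointer: cacheAllocBlock resumes at fl->last and walks
 * the label partition (c->size/LabelSize labels per label block, so
 * block addr's label lives at offset addr%n in label block addr/n)
 * looking for a label that is BsFree, or closed and old enough to be
 * reclaimed.
 */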
105 
106 static FreeList *flAlloc(u32int end);
107 static void flFree(FreeList *fl);
108 
109 static Block *cacheBumpBlock(Cache *c);
110 static void heapDel(Block*);
111 static void heapIns(Block*);
112 static void cacheCheck(Cache*);
113 static void unlinkThread(void *a);
114 static void flushThread(void *a);
115 static void flushBody(Cache *c);
116 static void unlinkBody(Cache *c);
117 static int cacheFlushBlock(Cache *c);
118 static void cacheSync(void*);
119 static BList *blistAlloc(Block*);
120 static void blistFree(Cache*, BList*);
121 static void doRemoveLink(Cache*, BList*);
122 static void doRemoveLinkList(Cache*, BList*);
123 
124 /*
125  * Mapping from local block type to Venti type
126  */
127 int vtType[BtMax] = {
128 	VtDataType,		/* BtData | 0  */
129 	VtPointerType0,		/* BtData | 1  */
130 	VtPointerType1,		/* BtData | 2  */
131 	VtPointerType2,		/* BtData | 3  */
132 	VtPointerType3,		/* BtData | 4  */
133 	VtPointerType4,		/* BtData | 5  */
134 	VtPointerType5,		/* BtData | 6  */
135 	VtPointerType6,		/* BtData | 7  */
136 	VtDirType,		/* BtDir | 0  */
137 	VtPointerType0,		/* BtDir | 1  */
138 	VtPointerType1,		/* BtDir | 2  */
139 	VtPointerType2,		/* BtDir | 3  */
140 	VtPointerType3,		/* BtDir | 4  */
141 	VtPointerType4,		/* BtDir | 5  */
142 	VtPointerType5,		/* BtDir | 6  */
143 	VtPointerType6,		/* BtDir | 7  */
144 };
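/*
 * Example, reading straight off the table: a level-3 pointer block in
 * a directory tree has local type BtDir|3 and is stored on Venti as
 * vtType[BtDir|3] == VtPointerType2; cacheGlobal uses the same table
 * when reading the block back and zero-extending it to c->size.
 */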
145 
146 /*
147  * Allocate the memory cache.
148  */
149 Cache *
150 cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
151 {
152 	int i;
153 	Cache *c;
154 	Block *b;
155 	BList *bl;
156 	u8int *p;
157 	int nbl;
158 
159 	c = vtMemAllocZ(sizeof(Cache));
160 
161 	/* reasonable number of BList elements */
162 	nbl = nblocks * 4;
163 
164 	c->lk = vtLockAlloc();
165 	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
166 	c->ref = 1;
167 	c->disk = disk;
168 	c->z = z;
169 	c->size = diskBlockSize(disk);
170 bwatchSetBlockSize(c->size);
171 	/* round c->size up to be a nice multiple */
172 	c->size = (c->size + 127) & ~127;
173 	c->ndmap = (c->size/20 + 7) / 8;
174 	c->nblocks = nblocks;
175 	c->hashSize = nblocks;
176 	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
177 	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
178 	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
179 	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
180 	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
181 	c->mode = mode;
182 	c->vers++;
183 	p = c->mem;
184 	for(i = 0; i < nblocks; i++){
185 		b = &c->blocks[i];
186 		b->lk = vtLockAlloc();
187 		b->c = c;
188 		b->data = p;
189 		b->heap = i;
190 		b->ioready = vtRendezAlloc(b->lk);
191 		c->heap[i] = b;
192 		p += c->size;
193 	}
194 	c->nheap = nblocks;
195 	for(i = 0; i < nbl; i++){
196 		bl = (BList*)p;
197 		bl->next = c->blfree;
198 		c->blfree = bl;
199 		p += sizeof(BList);
200 	}
201 	/* separate loop to keep blocks and blists reasonably aligned */
202 	for(i = 0; i < nblocks; i++){
203 		b = &c->blocks[i];
204 		b->dmap = p;
205 		p += c->ndmap;
206 	}
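	/*
	 * At this point c->mem has been carved up as: nblocks blocks of
	 * c->size bytes of block data, then nbl BList records, then
	 * nblocks per-block dirty maps of c->ndmap bytes each.
	 */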
207 
208 	c->blrend = vtRendezAlloc(c->lk);
209 
210 	c->maxdirty = nblocks*(DirtyPercentage*0.01);
211 
212 	c->fl = flAlloc(diskSize(disk, PartData));
213 
214 	c->unlink = vtRendezAlloc(c->lk);
215 	c->flush = vtRendezAlloc(c->lk);
216 	c->flushwait = vtRendezAlloc(c->lk);
217 	c->heapwait = vtRendezAlloc(c->lk);
218 	c->sync = periodicAlloc(cacheSync, c, 30*1000);
219 
220 	if(mode == OReadWrite){
221 		c->ref += 2;
222 		vtThread(unlinkThread, c);
223 		vtThread(flushThread, c);
224 	}
225 	cacheCheck(c);
226 
227 	return c;
228 }
229 
230 /*
231  * Free the whole memory cache, flushing all dirty blocks to the disk.
232  */
233 void
234 cacheFree(Cache *c)
235 {
236 	int i;
237 
238 	/* kill off daemon threads */
239 	vtLock(c->lk);
240 	c->die = vtRendezAlloc(c->lk);
241 	periodicKill(c->sync);
242 	vtWakeup(c->flush);
243 	vtWakeup(c->unlink);
244 	while(c->ref > 1)
245 		vtSleep(c->die);
246 
247 	/* flush everything out */
248 	do {
249 		unlinkBody(c);
250 		vtUnlock(c->lk);
251 		while(cacheFlushBlock(c))
252 			;
253 		diskFlush(c->disk);
254 		vtLock(c->lk);
255 	} while(c->uhead || c->ndirty);
256 	vtUnlock(c->lk);
257 
258 	cacheCheck(c);
259 
260 	for(i = 0; i < c->nblocks; i++){
261 		assert(c->blocks[i].ref == 0);
262 		vtRendezFree(c->blocks[i].ioready);
263 		vtLockFree(c->blocks[i].lk);
264 	}
265 	flFree(c->fl);
266 	vtMemFree(c->baddr);
267 	vtMemFree(c->heads);
268 	vtMemFree(c->blocks);
269 	vtMemFree(c->mem);
270 	vtLockFree(c->lk);
271 	diskFree(c->disk);
272 	vtRendezFree(c->blrend);
273 	/* don't close vtSession */
274 	vtMemFree(c);
275 }
276 
277 static void
278 cacheDump(Cache *c)
279 {
280 	int i;
281 	Block *b;
282 
283 	for(i = 0; i < c->nblocks; i++){
284 		b = &c->blocks[i];
285 		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
286 			i, b->part, b->addr, b->score, b->l.type, b->ref,
287 			bsStr(b->l.state), bioStr(b->iostate), b->pc);
288 	}
289 }
290 
291 static void
292 cacheCheck(Cache *c)
293 {
294 	u32int size, now;
295 	int i, k, refed;
296 	static uchar zero[VtScoreSize];
297 	Block *b;
298 
299 	size = c->size;
300 	now = c->now;
301 
302 	for(i = 0; i < c->nheap; i++){
303 		if(c->heap[i]->heap != i)
304 			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
305 		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
306 			vtFatal("bad heap ordering");
307 		k = (i << 1) + 1;
308 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
309 			vtFatal("bad heap ordering");
310 		k++;
311 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
312 			vtFatal("bad heap ordering");
313 	}
314 
315 	refed = 0;
316 	for(i = 0; i < c->nblocks; i++){
317 		b = &c->blocks[i];
318 		if(b->data != &c->mem[i * size])
319 			vtFatal("mis-blocked at %d", i);
320 		if(b->ref && b->heap == BadHeap){
321 			refed++;
322 		}
323 	}
324 if(c->nheap + refed != c->nblocks){
325 fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
326 cacheDump(c);
327 }
328 	assert(c->nheap + refed == c->nblocks);
329 	refed = 0;
330 	for(i = 0; i < c->nblocks; i++){
331 		b = &c->blocks[i];
332 		if(b->ref){
333 if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
334 			refed++;
335 		}
336 	}
337 if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
338 }
339 
340 
341 /*
342  * locate the block with the oldest second to last use.
343  * remove it from the heap, and fix up the heap.
344  */
345 /* called with c->lk held */
346 static Block *
347 cacheBumpBlock(Cache *c)
348 {
349 	int printed;
350 	Block *b;
351 
352 	/*
353 	 * locate the block with the oldest second to last use.
354 	 * remove it from the heap, and fix up the heap.
355 	 */
356 	printed = 0;
357 	if(c->nheap == 0){
358 		while(c->nheap == 0){
359 			vtWakeup(c->flush);
360 			vtSleep(c->heapwait);
361 			if(c->nheap == 0){
362 				printed = 1;
363 				fprint(2, "%s: entire cache is busy, %d dirty "
364 					"-- waking flush thread\n",
365 					argv0, c->ndirty);
366 			}
367 		}
368 		if(printed)
369 			fprint(2, "%s: cache is okay again, %d dirty\n",
370 				argv0, c->ndirty);
371 	}
372 
373 	b = c->heap[0];
374 	heapDel(b);
375 
376 	assert(b->heap == BadHeap);
377 	assert(b->ref == 0);
378 	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
379 	assert(b->prior == nil);
380 	assert(b->uhead == nil);
381 
382 	/*
383 	 * unchain the block from hash chain
384 	 */
385 	if(b->prev){
386 		*(b->prev) = b->next;
387 		if(b->next)
388 			b->next->prev = b->prev;
389 		b->prev = nil;
390 	}
391 
392 
393 if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
394 	/* set block to a reasonable state */
395 	b->ref = 1;
396 	b->part = PartError;
397 	memset(&b->l, 0, sizeof(b->l));
398 	b->iostate = BioEmpty;
399 
400 	return b;
401 }
402 
403 /*
404  * look for a particular version of the block in the memory cache.
405  */
406 static Block *
407 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
408 	int waitlock, int *lockfailure)
409 {
410 	Block *b;
411 	ulong h;
412 
413 	h = addr % c->hashSize;
414 
415 	if(lockfailure)
416 		*lockfailure = 0;
417 
418 	/*
419 	 * look for the block in the cache
420 	 */
421 	vtLock(c->lk);
422 	for(b = c->heads[h]; b != nil; b = b->next){
423 		if(b->part == part && b->addr == addr)
424 			break;
425 	}
426 	if(b == nil || b->vers != vers){
427 		vtUnlock(c->lk);
428 		return nil;
429 	}
430 	if(!waitlock && !vtCanLock(b->lk)){
431 		*lockfailure = 1;
432 		vtUnlock(c->lk);
433 		return nil;
434 	}
435 	heapDel(b);
436 	b->ref++;
437 	vtUnlock(c->lk);
438 
439 	bwatchLock(b);
440 	if(waitlock)
441 		vtLock(b->lk);
442 	b->nlock = 1;
443 
444 	for(;;){
445 		switch(b->iostate){
446 		default:
447 			abort();
448 		case BioEmpty:
449 		case BioLabel:
450 		case BioClean:
451 		case BioDirty:
452 			if(b->vers != vers){
453 				blockPut(b);
454 				return nil;
455 			}
456 			return b;
457 		case BioReading:
458 		case BioWriting:
459 			vtSleep(b->ioready);
460 			break;
461 		case BioVentiError:
462 			blockPut(b);
463 			vtSetError("venti i/o error block 0x%.8ux", addr);
464 			return nil;
465 		case BioReadError:
466 			blockPut(b);
467 			vtSetError("error reading block 0x%.8ux", addr);
468 			return nil;
469 		}
470 	}
471 	/* NOT REACHED */
472 }
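/*
 * Contract shared by _cacheLocalLookup above and the cacheLocal /
 * cacheGlobal fetch routines below: on success the block is returned
 * with its reference count bumped and its lock held (b->nlock == 1);
 * the caller releases both with blockPut.
 */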
473 static Block*
474 cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
475 {
476 	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
477 }
478 
479 
480 /*
481  * fetch a local (on-disk) block from the memory cache.
482  * if it's not there, load it, bumping some other block.
483  */
484 Block *
485 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
486 {
487 	Block *b;
488 	ulong h;
489 
490 	assert(part != PartVenti);
491 
492 	h = addr % c->hashSize;
493 
494 	/*
495 	 * look for the block in the cache
496 	 */
497 	vtLock(c->lk);
498 	for(b = c->heads[h]; b != nil; b = b->next){
499 		if(b->part != part || b->addr != addr)
500 			continue;
501 		if(epoch && b->l.epoch != epoch){
502 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
503 			vtUnlock(c->lk);
504 			vtSetError(ELabelMismatch);
505 			return nil;
506 		}
507 		heapDel(b);
508 		b->ref++;
509 		break;
510 	}
511 
512 	if(b == nil){
513 		b = cacheBumpBlock(c);
514 
515 		b->part = part;
516 		b->addr = addr;
517 		localToGlobal(addr, b->score);
518 
519 		/* chain onto correct hash */
520 		b->next = c->heads[h];
521 		c->heads[h] = b;
522 		if(b->next != nil)
523 			b->next->prev = &b->next;
524 		b->prev = &c->heads[h];
525 	}
526 
527 	vtUnlock(c->lk);
528 
529 	/*
530 	 * BUG: what if the epoch changes right here?
531 	 * In the worst case, we could end up in some weird
532 	 * lock loop, because the block we want no longer exists,
533 	 * and instead we're trying to lock a block we have no
534 	 * business grabbing.
535 	 *
536 	 * For now, I'm not going to worry about it.
537 	 */
538 
539 if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
540 	bwatchLock(b);
541 	vtLock(b->lk);
542 	b->nlock = 1;
543 
544 	if(part == PartData && b->iostate == BioEmpty){
545 		if(!readLabel(c, &b->l, addr)){
546 			blockPut(b);
547 			return nil;
548 		}
549 		blockSetIOState(b, BioLabel);
550 	}
551 	if(epoch && b->l.epoch != epoch){
552 		blockPut(b);
553 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
554 		vtSetError(ELabelMismatch);
555 		return nil;
556 	}
557 
558 	b->pc = getcallerpc(&c);
559 	for(;;){
560 		switch(b->iostate){
561 		default:
562 			abort();
563 		case BioEmpty:
564 		case BioLabel:
565 			if(mode == OOverWrite){
566 				blockSetIOState(b, BioClean);
567 				return b;
568 			}
569 			diskRead(c->disk, b);
570 			vtSleep(b->ioready);
571 			break;
572 		case BioClean:
573 		case BioDirty:
574 			return b;
575 		case BioReading:
576 		case BioWriting:
577 			vtSleep(b->ioready);
578 			break;
579 		case BioReadError:
580 			blockSetIOState(b, BioEmpty);
581 			blockPut(b);
582 			vtSetError("error reading block 0x%.8ux", addr);
583 			return nil;
584 		}
585 	}
586 	/* NOT REACHED */
587 }
588 
589 Block *
590 cacheLocal(Cache *c, int part, u32int addr, int mode)
591 {
592 	return _cacheLocal(c, part, addr, mode, 0);
593 }
594 
595 /*
596  * fetch a local (on-disk) block from the memory cache.
597  * if it's not there, load it, bumping some other block.
598  * check tag and type.
599  */
600 Block *
601 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
602 {
603 	Block *b;
604 
605 	b = _cacheLocal(c, PartData, addr, mode, epoch);
606 	if(b == nil)
607 		return nil;
608 	if(b->l.type != type || b->l.tag != tag){
609 		fprint(2, "%s: cacheLocalData: addr=%ud type got %d exp %d: tag got %ux exp %ux\n",
610 			argv0, addr, b->l.type, type, b->l.tag, tag);
611 		vtSetError(ELabelMismatch);
612 		blockPut(b);
613 		return nil;
614 	}
615 	b->pc = getcallerpc(&c);
616 	return b;
617 }
618 
619 /*
620  * fetch a global (Venti) block from the memory cache.
621  * if it's not there, load it, bumping some other block.
622  * check tag and type if it's really a local block in disguise.
623  */
624 Block *
625 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
626 {
627 	int n;
628 	Block *b;
629 	ulong h;
630 	u32int addr;
631 
632 	addr = globalToLocal(score);
633 	if(addr != NilBlock){
634 		b = cacheLocalData(c, addr, type, tag, mode, 0);
635 		if(b)
636 			b->pc = getcallerpc(&c);
637 		return b;
638 	}
639 
640 	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
641 
642 	/*
643 	 * look for the block in the cache
644 	 */
645 	vtLock(c->lk);
646 	for(b = c->heads[h]; b != nil; b = b->next){
647 		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
648 			continue;
649 		heapDel(b);
650 		b->ref++;
651 		break;
652 	}
653 
654 	if(b == nil){
655 if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);
656 
657 		b = cacheBumpBlock(c);
658 
659 		b->part = PartVenti;
660 		b->addr = NilBlock;
661 		b->l.type = type;
662 		memmove(b->score, score, VtScoreSize);
663 
664 		/* chain onto correct hash */
665 		b->next = c->heads[h];
666 		c->heads[h] = b;
667 		if(b->next != nil)
668 			b->next->prev = &b->next;
669 		b->prev = &c->heads[h];
670 	}
671 	vtUnlock(c->lk);
672 
673 	bwatchLock(b);
674 	vtLock(b->lk);
675 	b->nlock = 1;
676 	b->pc = getcallerpc(&c);
677 
678 	switch(b->iostate){
679 	default:
680 		abort();
681 	case BioEmpty:
682 		n = vtRead(c->z, score, vtType[type], b->data, c->size);
683 		if(n < 0 || !vtSha1Check(score, b->data, n)){
684 			blockSetIOState(b, BioVentiError);
685 			blockPut(b);
686 			vtSetError(
687 			"venti error reading block %V or wrong score: %r",
688 				score);
689 			return nil;
690 		}
691 		vtZeroExtend(vtType[type], b->data, n, c->size);
692 		blockSetIOState(b, BioClean);
693 		return b;
694 	case BioClean:
695 		return b;
696 	case BioVentiError:
697 		blockPut(b);
698 		vtSetError("venti i/o error or wrong score, block %V", score);
699 		return nil;
700 	case BioReadError:
701 		blockPut(b);
702 		vtSetError("error reading block %V", b->score);
703 		return nil;
704 	}
705 	/* NOT REACHED */
706 }
707 
708 /*
709  * allocate a new on-disk block and load it into the memory cache.
710  * BUG: if the disk is full, should we flush some of it to Venti?
711  */
712 static u32int lastAlloc;
713 
714 Block *
715 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
716 {
717 	FreeList *fl;
718 	u32int addr;
719 	Block *b;
720 	int n, nwrap;
721 	Label lab;
722 
723 	n = c->size / LabelSize;
724 	fl = c->fl;
725 
726 	vtLock(fl->lk);
727 	addr = fl->last;
728 	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
729 	if(b == nil){
730 		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
731 		vtUnlock(fl->lk);
732 		return nil;
733 	}
734 	nwrap = 0;
735 	for(;;){
736 		if(++addr >= fl->end){
737 			addr = 0;
738 			if(++nwrap >= 2){
739 				blockPut(b);
740 				vtSetError("disk is full");
741 				/*
742 				 * try to avoid a continuous spew of console
743 				 * messages.
744 				 */
745 				if (fl->last != 0)
746 					fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
747 						argv0);
748 				fl->last = 0;
749 				vtUnlock(fl->lk);
750 				return nil;
751 			}
752 		}
753 		if(addr%n == 0){
754 			blockPut(b);
755 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
756 			if(b == nil){
757 				fl->last = addr;
758 				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
759 				vtUnlock(fl->lk);
760 				return nil;
761 			}
762 		}
763 		if(!labelUnpack(&lab, b->data, addr%n))
764 			continue;
765 		if(lab.state == BsFree)
766 			goto Found;
767 		if(lab.state&BsClosed)
768 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
769 			goto Found;
770 	}
771 Found:
772 	blockPut(b);
773 	b = cacheLocal(c, PartData, addr, OOverWrite);
774 	if(b == nil){
775 		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
776 		return nil;
777 	}
778 assert(b->iostate == BioLabel || b->iostate == BioClean);
779 	fl->last = addr;
780 	lab.type = type;
781 	lab.tag = tag;
782 	lab.state = BsAlloc;
783 	lab.epoch = epoch;
784 	lab.epochClose = ~(u32int)0;
785 	if(!blockSetLabel(b, &lab, 1)){
786 		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
787 		blockPut(b);
788 		return nil;
789 	}
790 	vtZeroExtend(vtType[type], b->data, 0, c->size);
791 if(0)diskWrite(c->disk, b);
792 
793 if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
794 	lastAlloc = addr;
795 	fl->nused++;
796 	vtUnlock(fl->lk);
797 	b->pc = getcallerpc(&c);
798 	return b;
799 }
800 
801 int
802 cacheDirty(Cache *c)
803 {
804 	return c->ndirty;
805 }
806 
807 void
808 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
809 {
810 	int n;
811 	u32int addr, nused;
812 	Block *b;
813 	Label lab;
814 	FreeList *fl;
815 
816 	fl = c->fl;
817 	n = c->size / LabelSize;
818 	*bsize = c->size;
819 	vtLock(fl->lk);
820 	if(fl->epochLow == epochLow){
821 		*used = fl->nused;
822 		*total = fl->end;
823 		vtUnlock(fl->lk);
824 		return;
825 	}
826 	b = nil;
827 	nused = 0;
828 	for(addr=0; addr<fl->end; addr++){
829 		if(addr%n == 0){
830 			blockPut(b);
831 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
832 			if(b == nil){
833 				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
834 					argv0, addr/n);
835 				break;
836 			}
837 		}
838 		if(!labelUnpack(&lab, b->data, addr%n))
839 			continue;
840 		if(lab.state == BsFree)
841 			continue;
842 		if(lab.state&BsClosed)
843 		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
844 			continue;
845 		nused++;
846 	}
847 	blockPut(b);
848 	if(addr == fl->end){
849 		fl->nused = nused;
850 		fl->epochLow = epochLow;
851 	}
852 	*used = nused;
853 	*total = fl->end;
854 	vtUnlock(fl->lk);
855 	return;
856 }
857 
858 static FreeList *
859 flAlloc(u32int end)
860 {
861 	FreeList *fl;
862 
863 	fl = vtMemAllocZ(sizeof(*fl));
864 	fl->lk = vtLockAlloc();
865 	fl->last = 0;
866 	fl->end = end;
867 	return fl;
868 }
869 
870 static void
871 flFree(FreeList *fl)
872 {
873 	vtLockFree(fl->lk);
874 	vtMemFree(fl);
875 }
876 
877 u32int
878 cacheLocalSize(Cache *c, int part)
879 {
880 	return diskSize(c->disk, part);
881 }
882 
883 /*
884  * The thread that has locked b may refer to it by
885  * multiple names.  Nlock counts the number of
886  * references the locking thread holds.  It will call
887  * blockPut once per reference.
888  */
889 void
890 blockDupLock(Block *b)
891 {
892 	assert(b->nlock > 0);
893 	b->nlock++;
894 }
895 
896 /*
897  * we're done with the block.
898  * unlock it.  can't use it after calling this.
899  */
900 void
901 blockPut(Block* b)
902 {
903 	Cache *c;
904 
905 	if(b == nil)
906 		return;
907 
908 if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));	/* b->c: local c is not assigned until below */
909 
910 	if(b->iostate == BioDirty)
911 		bwatchDependency(b);
912 
913 	if(--b->nlock > 0)
914 		return;
915 
916 	/*
917 	 * b->nlock should probably stay at zero while
918 	 * the block is unlocked, but diskThread and vtSleep
919 	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
920 	 * so we have to keep b->nlock set to 1 even
921 	 * when the block is unlocked.
922 	 */
923 	assert(b->nlock == 0);
924 	b->nlock = 1;
925 //	b->pc = 0;
926 
927 	bwatchUnlock(b);
928 	vtUnlock(b->lk);
929 	c = b->c;
930 	vtLock(c->lk);
931 
932 	if(--b->ref > 0){
933 		vtUnlock(c->lk);
934 		return;
935 	}
936 
937 	assert(b->ref == 0);
938 	switch(b->iostate){
939 	default:
940 		b->used = c->now++;
941 		heapIns(b);
942 		break;
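	/*
	 * Empty and Label blocks hold no useful data, so instead of a
	 * fresh timestamp they are given the age of the current oldest
	 * heap entry, making them the preferred victims for reuse.
	 */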
943 	case BioEmpty:
944 	case BioLabel:
945 		if(c->nheap == 0)
946 			b->used = c->now++;
947 		else
948 			b->used = c->heap[0]->used;
949 		heapIns(b);
950 		break;
951 	case BioDirty:
952 		break;
953 	}
954 	vtUnlock(c->lk);
955 }
956 
957 /*
958  * set the label associated with a block.
959  */
960 Block*
961 _blockSetLabel(Block *b, Label *l)
962 {
963 	int lpb;
964 	Block *bb;
965 	u32int a;
966 	Cache *c;
967 
968 	c = b->c;
969 
970 	assert(b->part == PartData);
971 	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
972 	lpb = c->size / LabelSize;
973 	a = b->addr / lpb;
974 	bb = cacheLocal(c, PartLabel, a, OReadWrite);
975 	if(bb == nil){
976 		blockPut(b);
977 		return nil;
978 	}
979 	b->l = *l;
980 	labelPack(l, bb->data, b->addr%lpb);
981 	blockDirty(bb);
982 	return bb;
983 }
984 
985 int
986 blockSetLabel(Block *b, Label *l, int allocating)
987 {
988 	Block *lb;
989 	Label oldl;
990 
991 	oldl = b->l;
992 	lb = _blockSetLabel(b, l);
993 	if(lb == nil)
994 		return 0;
995 
996 	/*
997 	 * If we're allocating the block, make sure the label (bl)
998 	 * goes to disk before the data block (b) itself.  This is to help
999 	 * the blocks that in turn depend on b.
1000 	 *
1001 	 * Suppose bx depends on (must be written out after) b.
1002 	 * Once we write b we'll think it's safe to write bx.
1003 	 * Bx can't get at b unless it has a valid label, though.
1004 	 *
1005 	 * Allocation is the only case in which having a current label
1006 	 * is vital because:
1007 	 *
1008 	 *	- l.type is set at allocation and never changes.
1009 	 *	- l.tag is set at allocation and never changes.
1010 	 *	- l.state is not checked when we load blocks.
1011 	 *	- the archiver cares deeply about l.state being
1012 	 *		BaActive vs. BaCopied, but that's handled
1013 	 *		by direct calls to _blockSetLabel.
1014 	 */
1015 
1016 	if(allocating)
1017 		blockDependency(b, lb, -1, nil, nil);
1018 	blockPut(lb);
1019 	return 1;
1020 }
1021 
1022 /*
1023  * Record that bb must be written out before b.
1024  * If index is given, we're about to overwrite the score/e
1025  * at that index in the block.  Save the old value so we
1026  * can write a safer ``old'' version of the block if pressed.
1027  */
1028 void
1029 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1030 {
1031 	BList *p;
1032 
1033 	if(bb->iostate == BioClean)
1034 		return;
1035 
1036 	/*
1037 	 * Dependencies for blocks containing Entry structures
1038 	 * or scores must always be explained.  The problem with
1039 	 * only explaining some of them is this.  Suppose we have two
1040 	 * dependencies for the same field, the first explained
1041 	 * and the second not.  We try to write the block when the first
1042 	 * dependency is not written but the second is.  We will roll back
1043 	 * the first change even though the second trumps it.
1044 	 */
1045 	if(index == -1 && bb->part == PartData)
1046 		assert(b->l.type == BtData);
1047 
1048 	if(bb->iostate != BioDirty){
1049 		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
1050 			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1051 		abort();
1052 	}
1053 
1054 	p = blistAlloc(bb);
1055 	if(p == nil)
1056 		return;
1057 
1058 	assert(bb->iostate == BioDirty);
1059 if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1060 
1061 	p->part = bb->part;
1062 	p->addr = bb->addr;
1063 	p->type = bb->l.type;
1064 	p->vers = bb->vers;
1065 	p->index = index;
1066 	if(p->index >= 0){
1067 		/*
1068 		 * This test would just be b->l.type==BtDir except
1069 		 * we need to exclude the super block.
1070 		 */
1071 		if(b->l.type == BtDir && b->part == PartData)
1072 			entryPack(e, p->old.entry, 0);
1073 		else
1074 			memmove(p->old.score, score, VtScoreSize);
1075 	}
1076 	p->next = b->prior;
1077 	b->prior = p;
1078 }
1079 
1080 /*
1081  * Mark an in-memory block as dirty.  If there are too many
1082  * dirty blocks, start writing some out to disk.
1083  *
1084  * If there were way too many dirty blocks, we used to
1085  * try to do some flushing ourselves, but it's just too dangerous --
1086  * it implies that the callers cannot have any of our priors locked,
1087  * but this is hard to avoid in some cases.
1088  */
1089 int
1090 blockDirty(Block *b)
1091 {
1092 	Cache *c;
1093 
1094 	c = b->c;
1095 
1096 	assert(b->part != PartVenti);
1097 
1098 	if(b->iostate == BioDirty)
1099 		return 1;
1100 	assert(b->iostate == BioClean);
1101 
1102 	vtLock(c->dirtylk);
1103 	vtLock(c->lk);
1104 	b->iostate = BioDirty;
1105 	c->ndirty++;
1106 	if(c->ndirty > (c->maxdirty>>1))
1107 		vtWakeup(c->flush);
1108 	vtUnlock(c->lk);
1109 	vtUnlock(c->dirtylk);
1110 
1111 	return 1;
1112 }
1113 
1114 /*
1115  * We've decided to write out b.  Maybe b has some pointers to blocks
1116  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
1117  * copy of b that is safe to write out.  (diskThread will make sure the block
1118  * remains marked as dirty.)
1119  */
1120 uchar *
1121 blockRollback(Block *b, uchar *buf)
1122 {
1123 	u32int addr;
1124 	BList *p;
1125 	Super super;
1126 
1127 	/* easy case */
1128 	if(b->prior == nil)
1129 		return b->data;
1130 
1131 	memmove(buf, b->data, b->c->size);
1132 	for(p=b->prior; p; p=p->next){
1133 		/*
1134 		 * we know p->index >= 0 because blockWrite has vetted this block for us.
1135 		 */
1136 		assert(p->index >= 0);
1137 		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1138 		if(b->part == PartSuper){
1139 			assert(p->index == 0);
1140 			superUnpack(&super, buf);
1141 			addr = globalToLocal(p->old.score);
1142 			if(addr == NilBlock){
1143 				fprint(2, "%s: rolling back super block: "
1144 					"bad replacement addr %V\n",
1145 					argv0, p->old.score);
1146 				abort();
1147 			}
1148 			super.active = addr;
1149 			superPack(&super, buf);
1150 			continue;
1151 		}
1152 		if(b->l.type == BtDir)
1153 			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1154 		else
1155 			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1156 	}
1157 	return buf;
1158 }
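/*
 * A worked example of how blockDependency and blockRollback cooperate:
 * suppose b is a BtDir block whose entry 7 was rewritten to point at a
 * child block that is still dirty.  blockDependency saved the old
 * VtEntrySize-byte entry in p->old.entry with p->index == 7; if the
 * child is still dirty when b gets written, blockRollback patches that
 * old entry back in at offset 7*VtEntrySize of the scratch copy, so
 * the version of b that reaches disk never points at an unwritten
 * block.  (B itself stays dirty and is rewritten later.)
 */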
1159 
1160 /*
1161  * Try to write block b.
1162  * If b depends on other blocks:
1163  *
1164  *	If the block has been written out, remove the dependency.
1165  *	If the dependency is replaced by a more recent dependency,
1166  *		throw it out.
1167  *	If we know how to write out an old version of b that doesn't
1168  *		depend on it, do that.
1169  *
1170  *	Otherwise, bail.
1171  */
1172 int
1173 blockWrite(Block *b)
1174 {
1175 	uchar *dmap;
1176 	Cache *c;
1177 	BList *p, **pp;
1178 	Block *bb;
1179 	int lockfail;
1180 
1181 	c = b->c;
1182 
1183 	if(b->iostate != BioDirty)
1184 		return 1;
1185 
1186 	dmap = b->dmap;
1187 	memset(dmap, 0, c->ndmap);
1188 	pp = &b->prior;
1189 	for(p=*pp; p; p=*pp){
1190 		if(p->index >= 0){
1191 			/* more recent dependency has succeeded; this one can go */
1192 			if(dmap[p->index/8] & (1<<(p->index%8)))
1193 				goto ignblock;
1194 		}
1195 
1196 		lockfail = 0;
1197 		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1198 		if(bb == nil){
1199 			if(lockfail)
1200 				return 0;
1201 			/* block not in cache => was written already */
1202 			if(p->index >= 0) dmap[p->index/8] |= 1<<(p->index%8);	/* p->index is -1 for unexplained deps */
1203 			goto ignblock;
1204 		}
1205 
1206 		/*
1207 		 * same version of block is still in cache.
1208 		 *
1209 		 * the assertion is true because the block still has version p->vers,
1210 		 * which means it hasn't been written out since we last saw it.
1211 		 */
1212 		if(bb->iostate != BioDirty){
1213 			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
1214 				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1215 			/* probably BioWriting if it happens? */
1216 			if(bb->iostate == BioClean)
1217 				goto ignblock;
1218 		}
1219 
1220 		blockPut(bb);
1221 
1222 		if(p->index < 0){
1223 			/*
1224 			 * We don't know how to temporarily undo
1225 			 * b's dependency on bb, so just don't write b yet.
1226 			 */
1227 			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1228 				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1229 			return 0;
1230 		}
1231 		/* keep walking down the list */
1232 		pp = &p->next;
1233 		continue;
1234 
1235 ignblock:
1236 		*pp = p->next;
1237 		blistFree(c, p);
1238 		continue;
1239 	}
1240 
1241 	/*
1242 	 * DiskWrite must never be called with a double-locked block.
1243 	 * This call to diskWrite is okay because blockWrite is only called
1244 	 * from the cache flush thread, which never double-locks a block.
1245 	 */
1246 	diskWrite(c->disk, b);
1247 	return 1;
1248 }
1249 
1250 /*
1251  * Change the I/O state of block b.
1252  * Just an assignment except for magic in
1253  * switch statement (read comments there).
1254  */
1255 void
1256 blockSetIOState(Block *b, int iostate)
1257 {
1258 	int dowakeup;
1259 	Cache *c;
1260 	BList *p, *q;
1261 
1262 if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1263 
1264 	c = b->c;
1265 
1266 	dowakeup = 0;
1267 	switch(iostate){
1268 	default:
1269 		abort();
1270 	case BioEmpty:
1271 		assert(!b->uhead);
1272 		break;
1273 	case BioLabel:
1274 		assert(!b->uhead);
1275 		break;
1276 	case BioClean:
1277 		bwatchDependency(b);
1278 		/*
1279 		 * If b->prior is set, it means a write just finished.
1280 		 * The prior list isn't needed anymore.
1281 		 */
1282 		for(p=b->prior; p; p=q){
1283 			q = p->next;
1284 			blistFree(c, p);
1285 		}
1286 		b->prior = nil;
1287 		/*
1288 		 * Freeing a block or just finished a write.
1289 		 * Move the blocks from the per-block unlink
1290 		 * queue to the cache unlink queue.
1291 		 */
1292 		if(b->iostate == BioDirty || b->iostate == BioWriting){
1293 			vtLock(c->lk);
1294 			c->ndirty--;
1295 			b->iostate = iostate;	/* change here to keep in sync with ndirty */
1296 			b->vers = c->vers++;
1297 			if(b->uhead){
1298 				/* add unlink blocks to unlink queue */
1299 				if(c->uhead == nil){
1300 					c->uhead = b->uhead;
1301 					vtWakeup(c->unlink);
1302 				}else
1303 					c->utail->next = b->uhead;
1304 				c->utail = b->utail;
1305 				b->uhead = nil;
1306 			}
1307 			vtUnlock(c->lk);
1308 		}
1309 		assert(!b->uhead);
1310 		dowakeup = 1;
1311 		break;
1312 	case BioDirty:
1313 		/*
1314 		 * Wrote out an old version of the block (see blockRollback).
1315 		 * Bump a version count, leave it dirty.
1316 		 */
1317 		if(b->iostate == BioWriting){
1318 			vtLock(c->lk);
1319 			b->vers = c->vers++;
1320 			vtUnlock(c->lk);
1321 			dowakeup = 1;
1322 		}
1323 		break;
1324 	case BioReading:
1325 	case BioWriting:
1326 		/*
1327 		 * Adding block to disk queue.  Bump reference count.
1328 		 * diskThread decs the count later by calling blockPut.
1329 		 * This is here because we need to lock c->lk to
1330 		 * manipulate the ref count.
1331 		 */
1332 		vtLock(c->lk);
1333 		b->ref++;
1334 		vtUnlock(c->lk);
1335 		break;
1336 	case BioReadError:
1337 	case BioVentiError:
1338 		/*
1339 		 * Oops.
1340 		 */
1341 		dowakeup = 1;
1342 		break;
1343 	}
1344 	b->iostate = iostate;
1345 	/*
1346 	 * Now that the state has changed, we can wake the waiters.
1347 	 */
1348 	if(dowakeup)
1349 		vtWakeupAll(b->ioready);
1350 }
1351 
1352 /*
1353  * The active file system is a tree of blocks.
1354  * When we add snapshots to the mix, the entire file system
1355  * becomes a dag and thus requires a bit more care.
1356  *
1357  * The life of the file system is divided into epochs.  A snapshot
1358  * ends one epoch and begins the next.  Each file system block
1359  * is marked with the epoch in which it was created (b.epoch).
1360  * When the block is unlinked from the file system (closed), it is marked
1361  * with the epoch in which it was removed (b.epochClose).
1362  * Once we have discarded or archived all snapshots up to
1363  * b.epochClose, we can reclaim the block.
1364  *
1365  * If a block was created in a past epoch but is not yet closed,
1366  * it is treated as copy-on-write.  Of course, in order to insert the
1367  * new pointer into the tree, the parent must be made writable,
1368  * and so on up the tree.  The recursion stops because the root
1369  * block is always writable.
1370  *
1371  * If blocks are never closed, they will never be reused, and
1372  * we will run out of disk space.  But marking a block as closed
1373  * requires some care about dependencies and write orderings.
1374  *
1375  * (1) If a block p points at a copy-on-write block b and we
1376  * copy b to create bb, then p must be written out after bb and
1377  * lbb (bb's label block).
1378  *
1379  * (2) We have to mark b as closed, but only after we switch
1380  * the pointer, so lb must be written out after p.  In fact, we
1381  * can't even update the in-memory copy, or the cache might
1382  * mistakenly give out b for reuse before p gets written.
1383  *
1384  * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
1385  * The caller is expected to record a "p after bb" dependency
1386  * to finish (1), and also expected to call blockRemoveLink
1387  * to arrange for (2) to happen once p is written.
1388  *
1389  * Until (2) happens, some pieces of the code (e.g., the archiver)
1390  * still need to know whether a block has been copied, so we
1391  * set the BsCopied bit in the label and force that to disk *before*
1392  * the copy gets written out.
1393  */
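/*
 * Concretely, blockCopy below plays out the rules above in this order:
 * cacheAllocBlock allocates bb and (via blockSetLabel) records the
 * "bb after lbb" dependency; then b's label gets the BsCopied bit via
 * _blockSetLabel, with a "bb after lb" dependency so the copied bit is
 * on disk before bb; the data is copied and bb marked dirty; the
 * caller is expected to record "p after bb" and to call
 * blockRemoveLink, which closes b only once p has been written.
 */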
1394 Block*
1395 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
1396 {
1397 	Block *bb, *lb;
1398 	Label l;
1399 
1400 	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
1401 		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
1402 			argv0, b->addr, &b->l, elo, ehi);
1403 
1404 	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
1405 	if(bb == nil){
1406 		blockPut(b);
1407 		return nil;
1408 	}
1409 
1410 	/*
1411 	 * Update label so we know the block has been copied.
1412 	 * (It will be marked closed once it has been unlinked from
1413 	 * the tree.)  This must follow cacheAllocBlock since we
1414 	 * can't be holding onto lb when we call cacheAllocBlock.
1415 	 */
1416 	if((b->l.state&BsCopied)==0)
1417 	if(b->part == PartData){	/* not the superblock */
1418 		l = b->l;
1419 		l.state |= BsCopied;
1420 		lb = _blockSetLabel(b, &l);
1421 		if(lb == nil){
1422 			/* can't set label => can't copy block */
1423 			blockPut(b);
1424 			l.type = BtMax;
1425 			l.state = BsFree;
1426 			l.epoch = 0;
1427 			l.epochClose = 0;
1428 			l.tag = 0;
1429 			blockSetLabel(bb, &l, 0);
1430 			blockPut(bb);
1431 			return nil;
1432 		}
1433 		blockDependency(bb, lb, -1, nil, nil);
1434 		blockPut(lb);
1435 	}
1436 
1437 	memmove(bb->data, b->data, b->c->size);
1438 	blockDirty(bb);
1439 	blockPut(b);
1440 	return bb;
1441 }
1442 
1443 /*
1444  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1445  * If recurse is set, we are unlinking all of bb's children as well.
1446  *
1447  * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
1448  * the relevant information to b's list of unlinked blocks.  Once b is written,
1449  * the list will be queued for processing.
1450  *
1451  * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
1452  */
1453 void
1454 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
1455 {
1456 	BList *p, **pp, bl;
1457 
1458 	/* remove bb from prior list */
1459 	for(pp=&b->prior; (p=*pp)!=nil; ){
1460 		if(p->part == PartData && p->addr == addr){
1461 			*pp = p->next;
1462 			blistFree(b->c, p);
1463 		}else
1464 			pp = &p->next;
1465 	}
1466 
1467 	bl.part = PartData;
1468 	bl.addr = addr;
1469 	bl.type = type;
1470 	bl.tag = tag;
1471 	if(b->l.epoch == 0)
1472 		assert(b->part == PartSuper);
1473 	bl.epoch = b->l.epoch;
1474 	bl.next = nil;
1475 	bl.recurse = recurse;
1476 
1477 	p = blistAlloc(b);
1478 	if(p == nil){
1479 		/*
1480 		 * We were out of blists so blistAlloc wrote b to disk.
1481 		 */
1482 		doRemoveLink(b->c, &bl);
1483 		return;
1484 	}
1485 
1486 	/* Uhead is only processed when the block goes from Dirty -> Clean */
1487 	assert(b->iostate == BioDirty);
1488 
1489 	*p = bl;
1490 	if(b->uhead == nil)
1491 		b->uhead = p;
1492 	else
1493 		b->utail->next = p;
1494 	b->utail = p;
1495 }
1496 
1497 /*
1498  * Process removal of a single block and perhaps its children.
1499  */
1500 static void
1501 doRemoveLink(Cache *c, BList *p)
1502 {
1503 	int i, n, recurse;
1504 	u32int a;
1505 	Block *b;
1506 	Label l;
1507 	BList bl;
1508 
1509 	recurse = (p->recurse && p->type != BtData && p->type != BtDir);
1510 
1511 	/*
1512 	 * We're not really going to overwrite b, but if we're not
1513 	 * going to look at its contents, there is no point in reading
1514 	 * them from the disk.
1515 	 */
1516 	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
1517 	if(b == nil)
1518 		return;
1519 
1520 	/*
1521 	 * When we're unlinking from the superblock, close with the next epoch.
1522 	 */
1523 	if(p->epoch == 0)
1524 		p->epoch = b->l.epoch+1;
1525 
1526 	/* sanity check */
1527 	if(b->l.epoch > p->epoch){
1528 		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
1529 			argv0, b->l.epoch, p->epoch);
1530 		blockPut(b);
1531 		return;
1532 	}
1533 
1534 	if(recurse){
1535 		n = c->size / VtScoreSize;
1536 		for(i=0; i<n; i++){
1537 			a = globalToLocal(b->data + i*VtScoreSize);
1538 			if(a == NilBlock || !readLabel(c, &l, a))
1539 				continue;
1540 			if(l.state&BsClosed)
1541 				continue;
1542 			/*
1543 			 * If stack space becomes an issue...
1544 			p->addr = a;
1545 			p->type = l.type;
1546 			p->tag = l.tag;
1547 			doRemoveLink(c, p);
1548 			 */
1549 
1550 			bl.part = PartData;
1551 			bl.addr = a;
1552 			bl.type = l.type;
1553 			bl.tag = l.tag;
1554 			bl.epoch = p->epoch;
1555 			bl.next = nil;
1556 			bl.recurse = 1;
1557 			/* give up the block lock - share with others */
1558 			blockPut(b);
1559 			doRemoveLink(c, &bl);
1560 			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
1561 			if(b == nil){
1562 				fprint(2, "%s: warning: lost block in doRemoveLink\n",
1563 					argv0);
1564 				return;
1565 			}
1566 		}
1567 	}
1568 
1569 	l = b->l;
1570 	l.state |= BsClosed;
1571 	l.epochClose = p->epoch;
1572 	if(l.epochClose == l.epoch){
1573 		vtLock(c->fl->lk);
1574 		if(l.epoch == c->fl->epochLow)
1575 			c->fl->nused--;
1576 		blockSetLabel(b, &l, 0);
1577 		vtUnlock(c->fl->lk);
1578 	}else
1579 		blockSetLabel(b, &l, 0);
1580 	blockPut(b);
1581 }
1582 
1583 /*
1584  * Allocate a BList so that we can record a dependency
1585  * or queue a removal related to block b.
1586  * If we can't find a BList, we write out b and return nil.
1587  */
1588 static BList *
1589 blistAlloc(Block *b)
1590 {
1591 	Cache *c;
1592 	BList *p;
1593 
1594 	if(b->iostate != BioDirty){
1595 		/*
1596 		 * should not happen anymore -
1597 	 	 * blockDirty used to flush but no longer does.
1598 		 */
1599 		assert(b->iostate == BioClean);
1600 		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
1601 		return nil;
1602 	}
1603 
1604 	c = b->c;
1605 	vtLock(c->lk);
1606 	if(c->blfree == nil){
1607 		/*
1608 		 * No free BLists.  What are our options?
1609 		 */
1610 
1611 		/* Block has no priors? Just write it. */
1612 		if(b->prior == nil){
1613 			vtUnlock(c->lk);
1614 			diskWriteAndWait(c->disk, b);
1615 			return nil;
1616 		}
1617 
1618 		/*
1619 		 * Wake the flush thread, which will hopefully free up
1620 		 * some BLists for us.  We used to flush a block from
1621 		 * our own prior list and reclaim that BList, but this is
1622 		 * a no-no: some of the blocks on our prior list may
1623 		 * be locked by our caller.  Or maybe their label blocks
1624 		 * are locked by our caller.  In any event, it's too hard
1625 		 * to make sure we can do I/O for ourselves.  Instead,
1626 		 * we assume the flush thread will find something.
1627 		 * (The flush thread never blocks waiting for a block,
1628 		 * so it can't deadlock like we can.)
1629 		 */
1630 		while(c->blfree == nil){
1631 			vtWakeup(c->flush);
1632 			vtSleep(c->blrend);
1633 			if(c->blfree == nil)
1634 				fprint(2, "%s: flushing for blists\n", argv0);
1635 		}
1636 	}
1637 
1638 	p = c->blfree;
1639 	c->blfree = p->next;
1640 	vtUnlock(c->lk);
1641 	return p;
1642 }
1643 
1644 static void
1645 blistFree(Cache *c, BList *bl)
1646 {
1647 	vtLock(c->lk);
1648 	bl->next = c->blfree;
1649 	c->blfree = bl;
1650 	vtWakeup(c->blrend);
1651 	vtUnlock(c->lk);
1652 }
1653 
1654 char*
1655 bsStr(int state)
1656 {
1657 	static char s[100];
1658 
1659 	if(state == BsFree)
1660 		return "Free";
1661 	if(state == BsBad)
1662 		return "Bad";
1663 
1664 	sprint(s, "%x", state);
1665 	if(!(state&BsAlloc))
1666 		strcat(s, ",Free");	/* should not happen */
1667 	if(state&BsCopied)
1668 		strcat(s, ",Copied");
1669 	if(state&BsVenti)
1670 		strcat(s, ",Venti");
1671 	if(state&BsClosed)
1672 		strcat(s, ",Closed");
1673 	return s;
1674 }
1675 
1676 char *
1677 bioStr(int iostate)
1678 {
1679 	switch(iostate){
1680 	default:
1681 		return "Unknown!!";
1682 	case BioEmpty:
1683 		return "Empty";
1684 	case BioLabel:
1685 		return "Label";
1686 	case BioClean:
1687 		return "Clean";
1688 	case BioDirty:
1689 		return "Dirty";
1690 	case BioReading:
1691 		return "Reading";
1692 	case BioWriting:
1693 		return "Writing";
1694 	case BioReadError:
1695 		return "ReadError";
1696 	case BioVentiError:
1697 		return "VentiError";
1698 	case BioMax:
1699 		return "Max";
1700 	}
1701 }
1702 
1703 static char *bttab[] = {
1704 	"BtData",
1705 	"BtData+1",
1706 	"BtData+2",
1707 	"BtData+3",
1708 	"BtData+4",
1709 	"BtData+5",
1710 	"BtData+6",
1711 	"BtData+7",
1712 	"BtDir",
1713 	"BtDir+1",
1714 	"BtDir+2",
1715 	"BtDir+3",
1716 	"BtDir+4",
1717 	"BtDir+5",
1718 	"BtDir+6",
1719 	"BtDir+7",
1720 };
1721 
1722 char*
1723 btStr(int type)
1724 {
1725 	if(type < nelem(bttab))
1726 		return bttab[type];
1727 	return "unknown";
1728 }
1729 
1730 int
1731 labelFmt(Fmt *f)
1732 {
1733 	Label *l;
1734 
1735 	l = va_arg(f->args, Label*);
1736 	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1737 		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1738 }
1739 
1740 int
1741 scoreFmt(Fmt *f)
1742 {
1743 	uchar *v;
1744 	int i;
1745 	u32int addr;
1746 
1747 	v = va_arg(f->args, uchar*);
1748 	if(v == nil){
1749 		fmtprint(f, "*");
1750 	}else if((addr = globalToLocal(v)) != NilBlock)
1751 		fmtprint(f, "0x%.8ux", addr);
1752 	else{
1753 		for(i = 0; i < VtScoreSize; i++)
1754 			fmtprint(f, "%2.2ux", v[i]);
1755 	}
1756 
1757 	return 0;
1758 }
1759 
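/*
 * upHeap and downHeap maintain a binary min-heap keyed on how long ago
 * a block was last put back (b->used).  The comparisons are written as
 * b->used - now so that, in unsigned arithmetic, ages stay correctly
 * ordered even after the 32-bit c->now tick counter wraps around.
 */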
1760 static int
1761 upHeap(int i, Block *b)
1762 {
1763 	Block *bb;
1764 	u32int now;
1765 	int p;
1766 	Cache *c;
1767 
1768 	c = b->c;
1769 	now = c->now;
1770 	for(; i != 0; i = p){
1771 		p = (i - 1) >> 1;
1772 		bb = c->heap[p];
1773 		if(b->used - now >= bb->used - now)
1774 			break;
1775 		c->heap[i] = bb;
1776 		bb->heap = i;
1777 	}
1778 	c->heap[i] = b;
1779 	b->heap = i;
1780 
1781 	return i;
1782 }
1783 
1784 static int
1785 downHeap(int i, Block *b)
1786 {
1787 	Block *bb;
1788 	u32int now;
1789 	int k;
1790 	Cache *c;
1791 
1792 	c = b->c;
1793 	now = c->now;
1794 	for(; ; i = k){
1795 		k = (i << 1) + 1;
1796 		if(k >= c->nheap)
1797 			break;
1798 		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1799 			k++;
1800 		bb = c->heap[k];
1801 		if(b->used - now <= bb->used - now)
1802 			break;
1803 		c->heap[i] = bb;
1804 		bb->heap = i;
1805 	}
1806 	c->heap[i] = b;
1807 	b->heap = i;
1808 	return i;
1809 }
1810 
1811 /*
1812  * Delete a block from the heap.
1813  * Called with c->lk held.
1814  */
1815 static void
1816 heapDel(Block *b)
1817 {
1818 	int i, si;
1819 	Cache *c;
1820 
1821 	c = b->c;
1822 
1823 	si = b->heap;
1824 	if(si == BadHeap)
1825 		return;
1826 	b->heap = BadHeap;
1827 	c->nheap--;
1828 	if(si == c->nheap)
1829 		return;
1830 	b = c->heap[c->nheap];
1831 	i = upHeap(si, b);
1832 	if(i == si)
1833 		downHeap(i, b);
1834 }
1835 
1836 /*
1837  * Insert a block into the heap.
1838  * Called with c->lk held.
1839  */
1840 static void
1841 heapIns(Block *b)
1842 {
1843 	assert(b->heap == BadHeap);
1844 	upHeap(b->c->nheap++, b);
1845 	vtWakeup(b->c->heapwait);
1846 }
1847 
1848 /*
1849  * Get just the label for a block.
1850  */
1851 int
1852 readLabel(Cache *c, Label *l, u32int addr)
1853 {
1854 	int lpb;
1855 	Block *b;
1856 	u32int a;
1857 
1858 	lpb = c->size / LabelSize;
1859 	a = addr / lpb;
1860 	b = cacheLocal(c, PartLabel, a, OReadOnly);
1861 	if(b == nil){
1862 		blockPut(b);
1863 		return 0;
1864 	}
1865 
1866 	if(!labelUnpack(l, b->data, addr%lpb)){
1867 		blockPut(b);
1868 		return 0;
1869 	}
1870 	blockPut(b);
1871 	return 1;
1872 }
1873 
1874 /*
1875  * Process unlink queue.
1876  * Called with c->lk held.
1877  */
1878 static void
1879 unlinkBody(Cache *c)
1880 {
1881 	BList *p;
1882 
1883 	while(c->uhead != nil){
1884 		p = c->uhead;
1885 		c->uhead = p->next;
1886 		vtUnlock(c->lk);
1887 		doRemoveLink(c, p);
1888 		vtLock(c->lk);
1889 		p->next = c->blfree;
1890 		c->blfree = p;
1891 	}
1892 }
1893 
1894 /*
1895  * Occasionally unlink the blocks on the cache unlink queue.
1896  */
1897 static void
1898 unlinkThread(void *a)
1899 {
1900 	Cache *c = a;
1901 
1902 	vtThreadSetName("unlink");
1903 
1904 	vtLock(c->lk);
1905 	for(;;){
1906 		while(c->uhead == nil && c->die == nil)
1907 			vtSleep(c->unlink);
1908 		if(c->die != nil)
1909 			break;
1910 		unlinkBody(c);
1911 	}
1912 	c->ref--;
1913 	vtWakeup(c->die);
1914 	vtUnlock(c->lk);
1915 }
1916 
1917 static int
1918 baddrCmp(void *a0, void *a1)
1919 {
1920 	BAddr *b0, *b1;
1921 	b0 = a0;
1922 	b1 = a1;
1923 
1924 	if(b0->part < b1->part)
1925 		return -1;
1926 	if(b0->part > b1->part)
1927 		return 1;
1928 	if(b0->addr < b1->addr)
1929 		return -1;
1930 	if(b0->addr > b1->addr)
1931 		return 1;
1932 	return 0;
1933 }
1934 
1935 /*
1936  * Scan the block list for dirty blocks; add them to the list c->baddr.
1937  */
1938 static void
1939 flushFill(Cache *c)
1940 {
1941 	int i, ndirty;
1942 	BAddr *p;
1943 	Block *b;
1944 
1945 	vtLock(c->lk);
1946 	if(c->ndirty == 0){
1947 		vtUnlock(c->lk);
1948 		return;
1949 	}
1950 
1951 	p = c->baddr;
1952 	ndirty = 0;
1953 	for(i=0; i<c->nblocks; i++){
1954 		b = c->blocks + i;
1955 		if(b->part == PartError)
1956 			continue;
1957 		if(b->iostate == BioDirty || b->iostate == BioWriting)
1958 			ndirty++;
1959 		if(b->iostate != BioDirty)
1960 			continue;
1961 		p->part = b->part;
1962 		p->addr = b->addr;
1963 		p->vers = b->vers;
1964 		p++;
1965 	}
1966 	if(ndirty != c->ndirty){
1967 		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
1968 			argv0, c->ndirty, ndirty);
1969 		c->ndirty = ndirty;
1970 	}
1971 	vtUnlock(c->lk);
1972 
1973 	c->bw = p - c->baddr;
1974 	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1975 }
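/*
 * c->baddr is consumed as a simple queue: flushFill leaves entries in
 * baddr[0..bw) and cacheFlushBlock walks br..be, re-queueing at bw any
 * block it could not write on this pass.  The sort by (part, addr) in
 * flushFill presumably keeps the writes in roughly ascending disk
 * order, which is kinder to the disk queue.
 */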
1976 
1977 /*
1978  * This is not thread safe, i.e. it can't be called from multiple threads.
1979  *
1980  * It's okay how we use it, because it only gets called in
1981  * the flushThread.  And cacheFree, but only after
1982  * cacheFree has killed off the flushThread.
1983  */
1984 static int
1985 cacheFlushBlock(Cache *c)
1986 {
1987 	Block *b;
1988 	BAddr *p;
1989 	int lockfail, nfail;
1990 
1991 	nfail = 0;
1992 	for(;;){
1993 		if(c->br == c->be){
1994 			if(c->bw == 0 || c->bw == c->be)
1995 				flushFill(c);
1996 			c->br = 0;
1997 			c->be = c->bw;
1998 			c->bw = 0;
1999 			c->nflush = 0;
2000 		}
2001 
2002 		if(c->br == c->be)
2003 			return 0;
2004 		p = c->baddr + c->br;
2005 		c->br++;
2006 		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
2007 
2008 		if(b && blockWrite(b)){
2009 			c->nflush++;
2010 			blockPut(b);
2011 			return 1;
2012 		}
2013 		if(b)
2014 			blockPut(b);
2015 
2016 		/*
2017 		 * Why didn't we write the block?
2018 		 */
2019 
2020 		/* Block already written out */
2021 		if(b == nil && !lockfail)
2022 			continue;
2023 
2024 		/* Failed to acquire lock; sleep if happens a lot. */
2025 		if(lockfail && ++nfail > 100){
2026 			sleep(500);
2027 			nfail = 0;
2028 		}
2029 		/* Requeue block. */
2030 		if(c->bw < c->be)
2031 			c->baddr[c->bw++] = *p;
2032 	}
2033 }
2034 
2035 /*
2036  * Occasionally flush dirty blocks from memory to the disk.
2037  */
2038 static void
2039 flushThread(void *a)
2040 {
2041 	Cache *c = a;
2042 	int i;
2043 
2044 	vtThreadSetName("flush");
2045 	vtLock(c->lk);
2046 	while(c->die == nil){
2047 		vtSleep(c->flush);
2048 		vtUnlock(c->lk);
2049 		for(i=0; i<FlushSize; i++)
2050 			if(!cacheFlushBlock(c)){
2051 				/*
2052 				 * If i==0, could be someone is waking us repeatedly
2053 				 * to flush the cache but there's no work to do.
2054 				 * Pause a little.
2055 				 */
2056 				if(i==0){
2057 					// fprint(2, "%s: flushthread found "
2058 					//	"nothing to flush - %d dirty\n",
2059 					//	argv0, c->ndirty);
2060 					sleep(250);
2061 				}
2062 				break;
2063 			}
2064 		if(i==0 && c->ndirty){
2065 			/*
2066 			 * All the blocks are being written right now -- there's nothing to do.
2067 			 * We might be spinning with cacheFlush though -- he'll just keep
2068 			 * kicking us until c->ndirty goes down.  Probably we should sleep
2069 			 * on something that the diskThread can kick, but for now we'll
2070 			 * just pause for a little while waiting for disks to finish.
2071 			 */
2072 			sleep(100);
2073 		}
2074 		vtLock(c->lk);
2075 		vtWakeupAll(c->flushwait);
2076 	}
2077 	c->ref--;
2078 	vtWakeup(c->die);
2079 	vtUnlock(c->lk);
2080 }
2081 
2082 /*
2083  * Flush the cache.
2084  */
2085 void
2086 cacheFlush(Cache *c, int wait)
2087 {
2088 	/*
2089 	 * Lock c->dirtylk so that more blocks aren't being dirtied
2090 	 * while we try to write out what's already here.
2091 	 * Otherwise we might not ever finish!
2092 	 */
2093 	vtLock(c->dirtylk);
2094 	vtLock(c->lk);
2095 	if(wait){
2096 		while(c->ndirty){
2097 		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
2098 		//		c->ndirty, c->uhead);
2099 			vtWakeup(c->flush);
2100 			vtSleep(c->flushwait);
2101 		}
2102 	//	consPrint("cacheFlush: done (%d dirty, uhead %p)\n", c->ndirty, c->uhead);
2103 	}else if(c->ndirty)
2104 		vtWakeup(c->flush);
2105 	vtUnlock(c->lk);
2106 	vtUnlock(c->dirtylk);
2107 }
2108 
2109 /*
2110  * Kick the flushThread every 30 seconds.
2111  */
2112 static void
2113 cacheSync(void *v)
2114 {
2115 	Cache *c;
2116 
2117 	c = v;
2118 	cacheFlush(c, 0);
2119 }
2120