xref: /plan9-contrib/sys/src/cmd/fossil/cache.c (revision 9b943567965ba040fd275927fbe088656eb8ce4f)
1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5 
6 #include "9.h"	/* for cacheFlush */
7 
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10 
11 enum {
12 	BadHeap = ~0,
13 };
14 
15 /*
16  * Data is stored in the memory cache in c->size blocks,
17  * with each block zero-extended to fill it out.  When writing to
18  * Venti, the block is zero-truncated.  The walker also checks
19  * that the block fits within psize or dsize, as the case may be.
20  */
21 
22 struct Cache
23 {
24 	VtLock	*lk;
25 	int 	ref;
26 	int	mode;
27 
28 	Disk 	*disk;
29 	int	size;			/* block size */
30 	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
31 	VtSession *z;
32 	u32int	now;			/* ticks for usage timestamps */
33 	Block	**heads;		/* hash table for finding address */
34 	int	nheap;			/* number of available victims */
35 	Block	**heap;			/* heap for locating victims */
36 	long	nblocks;		/* number of blocks allocated */
37 	Block	*blocks;		/* array of block descriptors */
38 	u8int	*mem;			/* memory for all block data & blists */
39 
40 	BList	*blfree;
41 	VtRendez *blrend;
42 
43 	int 	ndirty;			/* number of dirty blocks in the cache */
44 	int 	maxdirty;		/* max number of dirty blocks */
45 	u32int	vers;
46 
47 	long hashSize;
48 
49 	FreeList *fl;
50 
51 	VtRendez *die;			/* daemon threads should die when != nil */
52 
53 	VtRendez *flush;
54 	VtRendez *flushwait;
55 	VtRendez *heapwait;
56 	BAddr *baddr;
57 	int bw, br, be;
58 	int nflush;
59 
60 	Periodic *sync;
61 
62 	/* unlink daemon */
63 	BList *uhead;
64 	BList *utail;
65 	VtRendez *unlink;
66 
67 	/* block counts */
68 	int nused;
69 	int ndisk;
70 };
71 
72 struct BList {
73 	int part;
74 	u32int addr;
75 	uchar type;
76 	u32int tag;
77 	u32int epoch;
78 	u32int vers;
79 
80 	/* for roll back */
81 	int index;			/* -1 indicates not valid */
82 	union {
83 		uchar score[VtScoreSize];
84 		uchar entry[VtEntrySize];
85 	} old;
86 	BList *next;
87 };
88 
89 struct BAddr {
90 	int part;
91 	u32int addr;
92 	u32int vers;
93 };
94 
95 struct FreeList {
96 	VtLock *lk;
97 	u32int last;	/* last block allocated */
98 	u32int end;	/* end of data partition */
99 	u32int nfree;	/* number of free blocks */
100 	u32int nused;	/* number of used blocks */
101 	u32int	epochLow;	/* low epoch when nfree and nused were last updated */
102 };
103 
104 static FreeList *flAlloc(u32int end);
105 static void flFree(FreeList *fl);
106 
107 static Block *cacheBumpBlock(Cache *c);
108 static void heapDel(Block*);
109 static void heapIns(Block*);
110 static void cacheCheck(Cache*);
111 static int readLabel(Cache*, Label*, u32int addr);
112 static void unlinkThread(void *a);
113 static void flushThread(void *a);
114 static void flushBody(Cache *c);
115 static void unlinkBody(Cache *c);
116 static int cacheFlushBlock(Cache *c);
117 static void cacheSync(void*);
118 /*
119  * Mapping from local block type to Venti type
120  */
121 int vtType[BtMax] = {
122 	VtDataType,		/* BtData | 0  */
123 	VtPointerType0,		/* BtData | 1  */
124 	VtPointerType1,		/* BtData | 2  */
125 	VtPointerType2,		/* BtData | 3  */
126 	VtPointerType3,		/* BtData | 4  */
127 	VtPointerType4,		/* BtData | 5  */
128 	VtPointerType5,		/* BtData | 6  */
129 	VtPointerType6,		/* BtData | 7  */
130 	VtDirType,		/* BtDir | 0  */
131 	VtPointerType0,		/* BtDir | 1  */
132 	VtPointerType1,		/* BtDir | 2  */
133 	VtPointerType2,		/* BtDir | 3  */
134 	VtPointerType3,		/* BtDir | 4  */
135 	VtPointerType4,		/* BtDir | 5  */
136 	VtPointerType5,		/* BtDir | 6  */
137 	VtPointerType6,		/* BtDir | 7  */
138 };
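
/*
 * Illustrative sketch of how the table is indexed (a hypothetical helper,
 * not used elsewhere in this file): the low bits of a local block type
 * encode the height in the tree, so vtType[BtData|3] and vtType[BtDir|3]
 * are both VtPointerType2.
 */
static int
localToVentiType(int type)
{
	assert(type >= 0 && type < BtMax);
	return vtType[type];
}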
139 
140 /*
141  * Allocate the memory cache.
142  */
143 Cache *
144 cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
145 {
146 	int i;
147 	Cache *c;
148 	Block *b;
149 	BList *bl;
150 	u8int *p;
151 	int nbl;
152 
153 	c = vtMemAllocZ(sizeof(Cache));
154 
155 	/* reasonable number of BList elements */
156 	nbl = nblocks * 4;
157 
158 	c->lk = vtLockAlloc();
159 	c->ref = 1;
160 	c->disk = disk;
161 	c->z = z;
162 	c->size = diskBlockSize(disk);
163 bwatchSetBlockSize(c->size);
164 	/* round c->size up to be a nice multiple */
165 	c->size = (c->size + 127) & ~127;
166 	c->ndmap = (c->size/20 + 7) / 8;
167 	c->nblocks = nblocks;
168 	c->hashSize = nblocks;
169 	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
170 	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
171 	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
172 	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
173 	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
174 	c->mode = mode;
175 	c->vers++;
176 	p = c->mem;
177 	for(i = 0; i < nblocks; i++){
178 		b = &c->blocks[i];
179 		b->lk = vtLockAlloc();
180 		b->c = c;
181 		b->data = p;
182 		b->heap = i;
183 		b->ioready = vtRendezAlloc(b->lk);
184 		c->heap[i] = b;
185 		p += c->size;
186 	}
187 	c->nheap = nblocks;
188 	for(i = 0; i < nbl; i++){
189 		bl = (BList*)p;
190 		bl->next = c->blfree;
191 		c->blfree = bl;
192 		p += sizeof(BList);
193 	}
194 	/* separate loop to keep blocks and blists reasonably aligned */
195 	for(i = 0; i < nblocks; i++){
196 		b = &c->blocks[i];
197 		b->dmap = p;
198 		p += c->ndmap;
199 	}
200 
201 	c->blrend = vtRendezAlloc(c->lk);
202 
203 	c->maxdirty = nblocks*(DirtyPercentage*0.01);
204 
205 	c->fl = flAlloc(diskSize(disk, PartData));
206 
207 	c->unlink = vtRendezAlloc(c->lk);
208 	c->flush = vtRendezAlloc(c->lk);
209 	c->flushwait = vtRendezAlloc(c->lk);
210 	c->heapwait = vtRendezAlloc(c->lk);
211 	c->sync = periodicAlloc(cacheSync, c, 30*1000);
212 
213 	if(mode == OReadWrite){
214 		c->ref += 2;
215 		vtThread(unlinkThread, c);
216 		vtThread(flushThread, c);
217 	}
218 	cacheCheck(c);
219 
220 	return c;
221 }
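
/*
 * A minimal usage sketch (hypothetical caller; fossil proper does this when
 * it opens a file system): wrap an already-open disk and Venti session in a
 * write-back cache, use it, and tear it down again.
 */
static void
cacheUsageSketch(Disk *disk, VtSession *z)
{
	Cache *c;

	c = cacheAlloc(disk, z, 1000, OReadWrite);
	/* ... cacheLocal / cacheGlobal / cacheAllocBlock calls go here ... */
	cacheFree(c);		/* flushes every dirty block before freeing */
}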
222 
223 /*
224  * Free the whole memory cache, flushing all dirty blocks to the disk.
225  */
226 void
227 cacheFree(Cache *c)
228 {
229 	int i;
230 
231 	/* kill off daemon threads */
232 	vtLock(c->lk);
233 	c->die = vtRendezAlloc(c->lk);
234 	periodicKill(c->sync);
235 	vtWakeup(c->flush);
236 	vtWakeup(c->unlink);
237 	while(c->ref > 1)
238 		vtSleep(c->die);
239 
240 	/* flush everything out */
241 	do {
242 		unlinkBody(c);
243 		vtUnlock(c->lk);
244 		while(cacheFlushBlock(c))
245 			;
246 		diskFlush(c->disk);
247 		vtLock(c->lk);
248 	} while(c->uhead || c->ndirty);
249 	vtUnlock(c->lk);
250 
251 	cacheCheck(c);
252 
253 	for(i = 0; i < c->nblocks; i++){
254 		assert(c->blocks[i].ref == 0);
255 		vtRendezFree(c->blocks[i].ioready);
256 		vtLockFree(c->blocks[i].lk);
257 	}
258 	flFree(c->fl);
259 	vtMemFree(c->baddr);
260 	vtMemFree(c->heads);
261 	vtMemFree(c->blocks);
262 	vtMemFree(c->mem);
263 	vtLockFree(c->lk);
264 	diskFree(c->disk);
265 	vtRendezFree(c->blrend);
266 	/* don't close vtSession */
267 	vtMemFree(c);
268 }
269 
270 static void
271 cacheDump(Cache *c)
272 {
273 	int i;
274 	Block *b;
275 
276 	for(i = 0; i < c->nblocks; i++){
277 		b = &c->blocks[i];
278 		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=0x%lux\n",
279 			i, b->part, b->addr, b->score, b->l.type, b->ref,
280 			bsStr(b->l.state), bioStr(b->iostate), b->pc);
281 	}
282 }
283 
284 static void
285 cacheCheck(Cache *c)
286 {
287 	u32int size, now;
288 	int i, k, refed;
289 	static uchar zero[VtScoreSize];
290 	Block *b;
291 
292 	size = c->size;
293 	now = c->now;
294 
295 	for(i = 0; i < c->nheap; i++){
296 		if(c->heap[i]->heap != i)
297 			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
298 		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
299 			vtFatal("bad heap ordering");
300 		k = (i << 1) + 1;
301 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
302 			vtFatal("bad heap ordering");
303 		k++;
304 		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
305 			vtFatal("bad heap ordering");
306 	}
307 
308 	refed = 0;
309 	for(i = 0; i < c->nblocks; i++){
310 		b = &c->blocks[i];
311 		if(b->data != &c->mem[i * size])
312 			vtFatal("mis-blocked at %d", i);
313 		if(b->ref && b->heap == BadHeap){
314 			refed++;
315 		}
316 	}
317 if(c->nheap + refed != c->nblocks){
318 fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
319 cacheDump(c);
320 }
321 	assert(c->nheap + refed == c->nblocks);
322 	refed = 0;
323 	for(i = 0; i < c->nblocks; i++){
324 		b = &c->blocks[i];
325 		if(b->ref){
326 if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
327 			refed++;
328 		}
329 	}
330 if(refed > 0)fprint(2, "cacheCheck: in use %d\n", refed);
331 }
332 
333 
334 /*
335  * locate the block with the oldest second to last use.
336  * remove it from the heap, and fix up the heap.
337  */
338 /* called with c->lk held */
339 static Block *
340 cacheBumpBlock(Cache *c)
341 {
342 	Block *b;
343 
344 	/*
345 	 * locate the block with the oldest second to last use.
346 	 * remove it from the heap, and fix up the heap.
347 	 */
348 	if(c->nheap == 0){
349 		while(c->nheap == 0){
350 			fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
351 			vtWakeup(c->flush);
352 			vtSleep(c->heapwait);
353 		}
354 		fprint(2, "cache is okay again\n");
355 	}
356 
357 	b = c->heap[0];
358 	heapDel(b);
359 
360 	assert(b->heap == BadHeap);
361 	assert(b->ref == 0);
362 	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
363 	assert(b->prior == nil);
364 	assert(b->uhead == nil);
365 
366 	/*
367 	 * unchain the block from hash chain
368 	 */
369 	if(b->prev){
370 		*(b->prev) = b->next;
371 		if(b->next)
372 			b->next->prev = b->prev;
373 		b->prev = nil;
374 	}
375 
376 
377 if(0)fprint(2, "droping %d:%x:%V\n", b->part, b->addr, b->score);
378 	/* set block to a reasonable state */
379 	b->ref = 1;
380 	b->part = PartError;
381 	memset(&b->l, 0, sizeof(b->l));
382 	b->iostate = BioEmpty;
383 
384 	return b;
385 }
386 
387 /*
388  * look for a particular version of the block in the memory cache.
389  */
390 static Block *
391 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
392 	int waitlock, int *lockfailure)
393 {
394 	Block *b;
395 	ulong h;
396 
397 	h = addr % c->hashSize;
398 
399 	if(lockfailure)
400 		*lockfailure = 0;
401 
402 	/*
403 	 * look for the block in the cache
404 	 */
405 	vtLock(c->lk);
406 	for(b = c->heads[h]; b != nil; b = b->next){
407 		if(b->part == part && b->addr == addr)
408 			break;
409 	}
410 	if(b == nil || b->vers != vers){
411 		vtUnlock(c->lk);
412 		return nil;
413 	}
414 	if(!waitlock && !vtCanLock(b->lk)){
415 		*lockfailure = 1;
416 		vtUnlock(c->lk);
417 		return nil;
418 	}
419 	heapDel(b);
420 	b->ref++;
421 	vtUnlock(c->lk);
422 
423 	bwatchLock(b);
424 	if(waitlock)
425 		vtLock(b->lk);
426 	b->nlock = 1;
427 
428 	for(;;){
429 		switch(b->iostate){
430 		default:
431 			abort();
432 		case BioEmpty:
433 		case BioLabel:
434 		case BioClean:
435 		case BioDirty:
436 			if(b->vers != vers){
437 				blockPut(b);
438 				return nil;
439 			}
440 			return b;
441 		case BioReading:
442 		case BioWriting:
443 			vtSleep(b->ioready);
444 			break;
445 		case BioVentiError:
446 			blockPut(b);
447 			vtSetError(EVentiIO);
448 			return nil;
449 		case BioReadError:
450 			blockPut(b);
451 			vtSetError(EIO);
452 			return nil;
453 		}
454 	}
455 	/* NOT REACHED */
456 }
457 static Block*
458 cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
459 {
460 	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
461 }
462 
463 
464 /*
465  * fetch a local (on-disk) block from the memory cache.
466  * if it's not there, load it, bumping some other block.
467  */
468 Block *
469 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
470 {
471 	Block *b;
472 	ulong h;
473 
474 	assert(part != PartVenti);
475 
476 	h = addr % c->hashSize;
477 
478 	/*
479 	 * look for the block in the cache
480 	 */
481 	vtLock(c->lk);
482 	for(b = c->heads[h]; b != nil; b = b->next){
483 		if(b->part != part || b->addr != addr)
484 			continue;
485 		if(epoch && b->l.epoch != epoch){
486 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
487 			vtUnlock(c->lk);
488 			vtSetError(ELabelMismatch);
489 			return nil;
490 		}
491 		heapDel(b);
492 		b->ref++;
493 		break;
494 	}
495 
496 	if(b == nil){
497 		b = cacheBumpBlock(c);
498 
499 		b->part = part;
500 		b->addr = addr;
501 		localToGlobal(addr, b->score);
502 
503 		/* chain onto correct hash */
504 		b->next = c->heads[h];
505 		c->heads[h] = b;
506 		if(b->next != nil)
507 			b->next->prev = &b->next;
508 		b->prev = &c->heads[h];
509 	}
510 
511 	vtUnlock(c->lk);
512 
513 	/*
514 	 * BUG: what if the epoch changes right here?
515 	 * In the worst case, we could end up in some weird
516 	 * lock loop, because the block we want no longer exists,
517 	 * and instead we're trying to lock a block we have no
518 	 * business grabbing.
519 	 *
520 	 * For now, I'm not going to worry about it.
521 	 */
522 
523 if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
524 	bwatchLock(b);
525 	vtLock(b->lk);
526 	b->nlock = 1;
527 
528 	if(part == PartData && b->iostate == BioEmpty){
529 		if(!readLabel(c, &b->l, addr)){
530 			blockPut(b);
531 			return nil;
532 		}
533 		blockSetIOState(b, BioLabel);
534 	}
535 	if(epoch && b->l.epoch != epoch){
536 		blockPut(b);
537 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
538 		vtSetError(ELabelMismatch);
539 		return nil;
540 	}
541 
542 	b->pc = getcallerpc(&c);
543 	for(;;){
544 		switch(b->iostate){
545 		default:
546 			abort();
547 		case BioEmpty:
548 		case BioLabel:
549 			if(mode == OOverWrite){
550 				blockSetIOState(b, BioClean);
551 				return b;
552 			}
553 			diskRead(c->disk, b);
554 			vtSleep(b->ioready);
555 			break;
556 		case BioClean:
557 		case BioDirty:
558 			return b;
559 		case BioReading:
560 		case BioWriting:
561 			vtSleep(b->ioready);
562 			break;
563 		case BioReadError:
564 			blockSetIOState(b, BioEmpty);
565 			blockPut(b);
566 			vtSetError(EIO);
567 			return nil;
568 		}
569 	}
570 	/* NOT REACHED */
571 }
572 
573 Block *
574 cacheLocal(Cache *c, int part, u32int addr, int mode)
575 {
576 	return _cacheLocal(c, part, addr, mode, 0);
577 }
578 
579 /*
580  * fetch a local (on-disk) block from the memory cache.
581  * if it's not there, load it, bumping some other block.
582  * check tag and type.
583  */
584 Block *
585 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
586 {
587 	Block *b;
588 
589 	b = _cacheLocal(c, PartData, addr, mode, epoch);
590 	if(b == nil)
591 		return nil;
592 	if(b->l.type != type || b->l.tag != tag){
593 		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
594 			addr, b->l.type, type, b->l.tag, tag);
595 		vtSetError(ELabelMismatch);
596 		blockPut(b);
597 		return nil;
598 	}
599 	b->pc = getcallerpc(&c);
600 	return b;
601 }
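
/*
 * A minimal sketch of the usual fetch/use/release pattern, assuming a
 * hypothetical caller that wants one label-checked data block.  Every
 * successful fetch must be paired with exactly one blockPut.
 */
static int
readDataBlockSketch(Cache *c, u32int addr, int type, u32int tag, uchar *buf, int n)
{
	Block *b;

	b = cacheLocalData(c, addr, type, tag, OReadOnly, 0);
	if(b == nil)
		return 0;
	if(n > c->size)
		n = c->size;
	memmove(buf, b->data, n);
	blockPut(b);
	return 1;
}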
602 
603 /*
604  * fetch a global (Venti) block from the memory cache.
605  * if it's not there, load it, bumping some other block.
606  * check tag and type if it's really a local block in disguise.
607  */
608 Block *
609 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
610 {
611 	int n;
612 	Block *b;
613 	ulong h;
614 	u32int addr;
615 
616 	addr = globalToLocal(score);
617 	if(addr != NilBlock){
618 		b = cacheLocalData(c, addr, type, tag, mode, 0);
619 		if(b)
620 			b->pc = getcallerpc(&c);
621 		return b;
622 	}
623 
624 	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
625 
626 	/*
627 	 * look for the block in the cache
628 	 */
629 	vtLock(c->lk);
630 	for(b = c->heads[h]; b != nil; b = b->next){
631 		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
632 			continue;
633 		heapDel(b);
634 		b->ref++;
635 		break;
636 	}
637 
638 	if(b == nil){
639 if(0)fprint(2, "cacheGlobal %V %d\n", score, type);
640 
641 		b = cacheBumpBlock(c);
642 
643 		b->part = PartVenti;
644 		b->addr = NilBlock;
645 		b->l.type = type;
646 		memmove(b->score, score, VtScoreSize);
647 
648 		/* chain onto correct hash */
649 		b->next = c->heads[h];
650 		c->heads[h] = b;
651 		if(b->next != nil)
652 			b->next->prev = &b->next;
653 		b->prev = &c->heads[h];
654 	}
655 	vtUnlock(c->lk);
656 
657 	bwatchLock(b);
658 	vtLock(b->lk);
659 	b->nlock = 1;
660 	b->pc = getcallerpc(&c);
661 
662 	switch(b->iostate){
663 	default:
664 		abort();
665 	case BioEmpty:
666 		n = vtRead(c->z, score, vtType[type], b->data, c->size);
667 		if(n < 0 || !vtSha1Check(score, b->data, n)){
668 			blockSetIOState(b, BioVentiError);
669 			blockPut(b);
670 			vtSetError(EIO);
671 			return nil;
672 		}
673 		vtZeroExtend(vtType[type], b->data, n, c->size);
674 		blockSetIOState(b, BioClean);
675 		return b;
676 	case BioClean:
677 		return b;
678 	case BioVentiError:
679 		blockPut(b);
680 		vtSetError(EVentiIO);
681 		return nil;
682 	case BioReadError:
683 		blockPut(b);
684 		vtSetError(EIO);
685 		return nil;
686 	}
687 	/* NOT REACHED */
688 }
689 
690 /*
691  * allocate a new on-disk block and load it into the memory cache.
692  * BUG: if the disk is full, should we flush some of it to Venti?
693  */
694 static u32int lastAlloc;
695 
696 Block *
697 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
698 {
699 	FreeList *fl;
700 	u32int addr;
701 	Block *b;
702 	int n, nwrap;
703 	Label lab;
704 
705 	n = c->size / LabelSize;
706 	fl = c->fl;
707 
708 	vtLock(fl->lk);
709 	addr = fl->last;
710 	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
711 	if(b == nil){
712 		fprint(2, "cacheAllocBlock: xxx %R\n");
713 		vtUnlock(fl->lk);
714 		return nil;
715 	}
716 	nwrap = 0;
717 	for(;;){
718 		if(++addr >= fl->end){
719 			addr = 0;
720 			if(++nwrap >= 2){
721 				blockPut(b);
722 				fl->last = 0;
723 				vtSetError("disk is full");
724 				fprint(2, "cacheAllocBlock: xxx1 %R\n");
725 				vtUnlock(fl->lk);
726 				return nil;
727 			}
728 		}
729 		if(addr%n == 0){
730 			blockPut(b);
731 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
732 			if(b == nil){
733 				fl->last = addr;
734 				fprint(2, "cacheAllocBlock: xxx2 %R\n");
735 				vtUnlock(fl->lk);
736 				return nil;
737 			}
738 		}
739 		if(!labelUnpack(&lab, b->data, addr%n))
740 			continue;
741 		if(lab.state == BsFree)
742 			goto Found;
743 		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
744 			goto Found;
745 	}
746 Found:
747 	blockPut(b);
748 	b = cacheLocal(c, PartData, addr, OOverWrite);
749 	if(b == nil){
750 		fprint(2, "cacheAllocBlock: xxx3 %R\n");
751 		return nil;
752 	}
753 assert(b->iostate == BioLabel || b->iostate == BioClean);
754 	fl->last = addr;
755 	lab.type = type;
756 	lab.tag = tag;
757 	lab.state = BsAlloc;
758 	lab.epoch = epoch;
759 	lab.epochClose = ~(u32int)0;
760 	if(!blockSetLabel(b, &lab)){
761 		fprint(2, "cacheAllocBlock: xxx4 %R\n");
762 		blockPut(b);
763 		return nil;
764 	}
765 	vtZeroExtend(vtType[type], b->data, 0, c->size);
766 if(0)diskWrite(c->disk, b);
767 
768 if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
769 	lastAlloc = addr;
770 	fl->nused++;
771 	vtUnlock(fl->lk);
772 	b->pc = getcallerpc(&c);
773 	return b;
774 }
775 
776 void
777 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
778 {
779 	int n;
780 	u32int addr, nused;
781 	Block *b;
782 	Label lab;
783 	FreeList *fl;
784 
785 	fl = c->fl;
786 	n = c->size / LabelSize;
787 	*bsize = c->size;
788 	vtLock(fl->lk);
789 	if(fl->epochLow == epochLow){
790 		*used = fl->nused;
791 		*total = fl->end;
792 		vtUnlock(fl->lk);
793 		return;
794 	}
795 	b = nil;
796 	nused = 0;
797 	for(addr=0; addr<fl->end; addr++){
798 		if(addr%n == 0){
799 			blockPut(b);
800 			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
801 			if(b == nil){
802 				fprint(2, "cacheCountUsed: loading %ux: %R\n", addr/n);
803 				break;
804 			}
805 		}
806 		if(!labelUnpack(&lab, b->data, addr%n))
807 			continue;
808 		if(lab.state == BsFree)
809 			continue;
810 		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
811 			continue;
812 		nused++;
813 	}
814 	blockPut(b);
815 	if(addr == fl->end){
816 		fl->nused = nused;
817 		fl->epochLow = epochLow;
818 	}
819 	*used = nused;
820 	*total = fl->end;
821 	vtUnlock(fl->lk);
822 	return;
823 }
824 
825 static FreeList *
826 flAlloc(u32int end)
827 {
828 	FreeList *fl;
829 
830 	fl = vtMemAllocZ(sizeof(*fl));
831 	fl->lk = vtLockAlloc();
832 	fl->last = 0;
833 	fl->end = end;
834 	return fl;
835 }
836 
837 static void
838 flFree(FreeList *fl)
839 {
840 	vtLockFree(fl->lk);
841 	vtMemFree(fl);
842 }
843 
844 u32int
845 cacheLocalSize(Cache *c, int part)
846 {
847 	return diskSize(c->disk, part);
848 }
849 
850 /*
851  * Copy on write.  Copied blocks have to be marked BsCopied.
852  * See the big comment near blockRemoveLink.
853  */
854 Block*
855 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
856 {
857 	Block *bb, *lb;
858 	Label l;
859 
860 	assert((b->l.state&BsClosed)==0 && b->l.epoch < ehi);
861 	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
862 	if(bb == nil){
863 		blockPut(b);
864 		return nil;
865 	}
866 
867 //fprint(2, "alloc %lux copy %V\n", bb->addr, b->score);
868 	/*
869 	 * Change label on b to mark that we've copied it.
870 	 * This has to come after cacheAllocBlock, since we
871 	 * can't hold any labels blocks (lb) while we try to
872 	 * fetch others (in cacheAllocBlock).
873 	 */
874 	if(!(b->l.state&BsCopied) && b->part==PartData){
875 		l = b->l;
876 		l.state |= BsCopied;
877 		lb = _blockSetLabel(b, &l);
878 		if(lb == nil){
879 			/* can't set label => can't copy block */
880 			blockPut(b);
881 			l.type = BtMax;
882 			l.state = BsFree;
883 			l.epoch = 0;
884 			l.epochClose = 0;
885 			l.tag = 0;
886 			/* ignore error: block gets lost on error */
887 			blockSetLabel(bb, &l);
888 			blockPut(bb);
889 			return nil;
890 		}
891 		blockDependency(bb, lb, -1, nil, nil);
892 		blockPut(lb);
893 	}
894 
895 	if(0){
896 		if(b->addr != NilBlock)
897 			fprint(2, "blockCopy %#ux/%ud => %#ux/%ud\n",
898 				b->addr, b->l.epoch, bb->addr, bb->l.epoch);
899 		else if(memcmp(b->score, vtZeroScore, VtScoreSize) != 0)
900 			fprint(2, "blockCopy %V => %#ux/%ud\n",
901 				b->score, bb->addr, bb->l.epoch);
902 	}
903 
904 	memmove(bb->data, b->data, b->c->size);
905 	blockDirty(bb);
906 	blockPut(b);
907 	return bb;
908 }
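
/*
 * Sketch of the copy-on-write step a caller takes before modifying a block
 * that belongs to an earlier epoch (names hypothetical; fossil's real
 * callers live in the source/file layer).  blockCopy always consumes b:
 * it calls blockPut(b) whether it succeeds or fails.
 */
static Block*
writableBlockSketch(Block *b, u32int tag, u32int ehi, u32int elo)
{
	if(b->l.epoch == ehi)
		return b;			/* already in the active epoch; write in place */
	return blockCopy(b, tag, ehi, elo);	/* returns the fresh copy, or nil on error */
}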
909 
910 /*
911  * The thread that has locked b may refer to it by
912  * multiple names.  Nlock counts the number of
913  * references the locking thread holds.  It will call
914  * blockPut once per reference.
915  */
916 void
917 blockDupLock(Block *b)
918 {
919 	assert(b->nlock > 0);
920 	b->nlock++;
921 }
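
/*
 * Sketch of the counting rule described above, with a hypothetical caller
 * that refers to the same locked block by two names and releases each one.
 */
static void
nlockSketch(Block *b)
{
	Block *alias;

	blockDupLock(b);	/* nlock: 1 -> 2 */
	alias = b;
	blockPut(alias);	/* nlock: 2 -> 1; block stays locked */
	blockPut(b);		/* nlock: 1 -> 0; block is unlocked and its ref dropped */
}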
922 
923 /*
924  * we're done with the block.
925  * unlock it.  can't use it after calling this.
926  */
927 void
928 blockPut(Block* b)
929 {
930 	Cache *c;
931 
932 	if(b == nil)
933 		return;
934 
935 if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));
936 
937 	if(b->iostate == BioDirty)
938 		bwatchDependency(b);
939 
940 	if(--b->nlock > 0)
941 		return;
942 
943 	/*
944 	 * b->nlock should probably stay at zero while
945 	 * the block is unlocked, but diskThread and vtSleep
946 	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
947 	 * so we have to keep b->nlock set to 1 even
948 	 * when the block is unlocked.
949 	 */
950 	assert(b->nlock == 0);
951 	b->nlock = 1;
952 //	b->pc = 0;
953 
954 	bwatchUnlock(b);
955 	vtUnlock(b->lk);
956 	c = b->c;
957 	vtLock(c->lk);
958 
959 	if(--b->ref > 0){
960 		vtUnlock(c->lk);
961 		return;
962 	}
963 
964 	assert(b->ref == 0);
965 	switch(b->iostate){
966 	default:
967 		b->used = c->now++;
968 		heapIns(b);
969 		break;
970 	case BioEmpty:
971 	case BioLabel:
972 		if(c->nheap == 0)
973 			b->used = c->now++;
974 		else
975 			b->used = c->heap[0]->used;
976 		heapIns(b);
977 		break;
978 	case BioDirty:
979 		break;
980 	}
981 	vtUnlock(c->lk);
982 }
983 
984 /*
985  * we're deleting a block; delete all the blocks it points to
986  * that are still active (i.e., not needed by snapshots).
987  */
988 static void
989 blockCleanup(Block *b, u32int epoch)
990 {
991 	Cache *c;
992 	Block *bb;
993 	int i, n;
994 	Label l;
995 	u32int a;
996 	int type;
997 	int mode;
998 
999 	type = b->l.type;
1000 	c = b->c;
1001 
1002 	bwatchReset(b->score);
1003 
1004 	blockSetIOState(b, BioClean);
1005 
1006 	/* do not recursively free directories */
1007 	if(type == BtData || type == BtDir)
1008 		return;
1009 
1010 	n = c->size / VtScoreSize;
1011 	mode = OReadWrite;
1012 	if(type-1 == BtData || type-1 == BtDir)
1013 		mode = OOverWrite;
1014 	for(i=0; i<n; i++){
1015 		a = globalToLocal(b->data + i*VtScoreSize);
1016 		if(a == NilBlock || !readLabel(c, &l, a))
1017 			continue;
1018 		if((l.state&BsClosed) || l.epoch != epoch)
1019 			continue;
1020 		bb = cacheLocalData(c, a, type-1, b->l.tag, mode, 0);
1021 		if(bb == nil)
1022 			continue;
1023 		if((bb->l.state&BsClosed) || bb->l.epoch != epoch){
1024 			fprint(2, "blockCleanup: block %ud changed underfoot! expected %L got %L\n",
1025 				a, &l, &bb->l);
1026 			blockPut(bb);
1027 			continue;
1028 		}
1029 		blockCleanup(bb, epoch);
1030 		l.type = BtMax;
1031 		l.epoch = epoch;
1032 		l.epochClose = 0;
1033 		l.state = BsFree;
1034 		l.tag = 0;
1035 		blockSetLabel(bb, &l);
1036 		blockPut(bb);
1037 	}
1038 }
1039 
1040 /*
1041  * We don't need the block at addr anymore for the active file system.
1042  * If we don't need it for the snapshots, remove it completely.
1043  * Epoch is the epoch during which we got rid of the block.
1044  * See blockRemoveLink for more.
1045  */
1046 static int
1047 unlinkBlock(Cache *c, u32int addr, int type, u32int tag, u32int epoch)
1048 {
1049 	Block *b;
1050 	Label l;
1051 
1052 	if(addr == NilBlock)
1053 		return 1;
1054 
1055 //fprint(2, "unlinkBlock %#ux\n", addr);
1056 	b = cacheLocalData(c, addr, type, tag, OReadOnly, 0);
1057 	if(b == nil)
1058 		return 0;
1059 	if(b->l.epoch > epoch){
1060 fprint(2, "unlinkBlock: strange epoch :%ud %ud\n", b->l.epoch, epoch);
1061 		blockPut(b);
1062 		return 0;
1063 	}
1064 
1065 	l = b->l;
1066 	if((b->l.state&BsClosed)==0 && b->l.epoch==epoch){
1067 		l.state = BsFree;
1068 		l.type = BtMax;
1069 		l.tag = 0;
1070 		l.epoch = 0;
1071 		l.epochClose = 0;
1072 		blockCleanup(b, epoch);
1073 	}else{
1074 		l.state |= BsClosed;
1075 		l.epochClose = epoch;
1076 	}
1077 	blockSetLabel(b, &l);
1078 	blockPut(b);
1079 	return 1;
1080 }
1081 
1082 /*
1083  * try to allocate a BList so we can record that b must
1084  * be written out before some other block.
1085  * if we can't find a BList, write b out instead and return nil.
1086  */
1087 static BList *
1088 blistAlloc(Block *b)
1089 {
1090 	Cache *c;
1091 	BList *p;
1092 
1093 	/*
1094 	 * It's possible that when we marked b dirty, there were
1095 	 * too many dirty blocks so we just wrote b there and then.
1096 	 * So b might not be dirty.  If it's not, no need to worry
1097 	 * about recording the write constraint.
1098 	 *
1099 	 * BlockRemoveLink depends on the fact that if blistAlloc
1100 	 * returns non-nil, b really is dirty.
1101 	 */
1102 	if(b->iostate != BioDirty){
1103 		assert(b->iostate == BioClean);
1104 		return nil;
1105 	}
1106 
1107 	/*
1108 	 * Easy: maybe there's a free list left.
1109 	 */
1110 	c = b->c;
1111 	vtLock(c->lk);
1112 	if(c->blfree){
1113 	HaveBlFree:
1114 		p = c->blfree;
1115 		c->blfree = p->next;
1116 		vtUnlock(c->lk);
1117 assert(b->iostate == BioDirty);
1118 		return p;
1119 	}
1120 	vtUnlock(c->lk);
1121 
1122 	/*
1123 	 * No free BLists.  What are our options?
1124 	 */
1125 
1126 	/* Block has no priors? Just write it. */
1127 	if(b->prior == nil){
1128 		diskWriteAndWait(c->disk, b);
1129 		return nil;
1130 	}
1131 
1132 	/*
1133 	 * Wake the flush thread, which will hopefully free up
1134 	 * some BLists for us.  We used to flush a block from
1135 	 * our own prior list and reclaim that BList, but this is
1136 	 * a no-no: some of the blocks on our prior list may
1137 	 * be locked by our caller.  Or maybe their label blocks
1138 	 * are locked by our caller.  In any event, it's too hard
1139 	 * to make sure we can do I/O for ourselves.  Instead,
1140 	 * we assume the flush thread will find something.
1141 	 * (The flush thread never blocks waiting for a block,
1142 	 * so it won't deadlock like we will.)
1143 	 */
1144 	vtLock(c->lk);
1145 	while(c->blfree == nil){
1146 		vtWakeup(c->flush);
1147 		vtSleep(c->blrend);
1148 	}
1149 	goto HaveBlFree;
1150 }
1151 
1152 void
1153 blistFree(Cache *c, BList *bl)
1154 {
1155 	vtLock(c->lk);
1156 	bl->next = c->blfree;
1157 	c->blfree = bl;
1158 	vtWakeup(c->blrend);
1159 	vtUnlock(c->lk);
1160 cacheFlush(c, 0);
1161 }
1162 
1163 /*
1164  * Record that bb must be written out before b.
1165  * If index is given, we're about to overwrite the score/entry
1166  * at that index in the block.  Save the old value so we
1167  * can write a safer ``old'' version of the block if pressed.
1168  */
1169 void
1170 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1171 {
1172 	BList *p;
1173 
1174 	if(bb->iostate == BioClean)
1175 		return;
1176 
1177 	/*
1178 	 * Dependencies for blocks containing Entry structures
1179 	 * or scores must always be explained.  The problem with
1180 	 * only explaining some of them is this.  Suppose we have two
1181 	 * dependencies for the same field, the first explained
1182 	 * and the second not.  We try to write the block when the first
1183 	 * dependency is not written but the second is.  We will roll back
1184 	 * the first change even though the second trumps it.
1185 	 */
1186 	if(index == -1 && bb->part == PartData)
1187 		assert(b->l.type == BtData);
1188 
1189 	if(bb->iostate != BioDirty){
1190 		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
1191 			bb->part, bb->addr, bb->l.type, bb->iostate);
1192 		abort();
1193 	}
1194 
1195 	p = blistAlloc(bb);
1196 	if(p == nil)
1197 		return;
1198 
1199 	assert(bb->iostate == BioDirty);
1200 if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1201 
1202 	p->part = bb->part;
1203 	p->addr = bb->addr;
1204 	p->type = bb->l.type;
1205 	p->vers = bb->vers;
1206 	p->index = index;
1207 	if(p->index >= 0){
1208 		/*
1209 		 * This test would just be b->l.type==BtDir except
1210 		 * we need to exclude the super block.
1211 		 */
1212 		if(b->l.type == BtDir && b->part == PartData)
1213 			entryPack(e, p->old.entry, 0);
1214 		else
1215 			memmove(p->old.score, score, VtScoreSize);
1216 	}
1217 	p->next = b->prior;
1218 	b->prior = p;
1219 }
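
/*
 * A minimal sketch of how a caller uses this before overwriting a score in
 * a pointer block (names hypothetical): the old score is captured first, so
 * blockRollback can still produce a safe version of the parent, and only
 * then is the new pointer installed and the parent marked dirty.  A BtDir
 * leaf would pass the old Entry instead of the old score.
 */
static void
setChildScoreSketch(Block *parent, int i, Block *child)
{
	uchar *p;

	p = parent->data + i*VtScoreSize;
	blockDependency(parent, child, i, p, nil);	/* record old score at index i */
	memmove(p, child->score, VtScoreSize);
	blockDirty(parent);
}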
1220 
1221 /*
1222  * Mark an in-memory block as dirty.  If there are too many
1223  * dirty blocks, start writing some out to disk.
1224  *
1225  * If there were way too many dirty blocks, we used to
1226  * try to do some flushing ourselves, but it's just too dangerous --
1227  * it implies that the callers cannot have any of our priors locked,
1228  * but this is hard to avoid in some cases.
1229  */
1230 int
1231 blockDirty(Block *b)
1232 {
1233 	Cache *c;
1234 
1235 	c = b->c;
1236 
1237 	assert(b->part != PartVenti);
1238 
1239 	if(b->iostate == BioDirty)
1240 		return 1;
1241 	assert(b->iostate == BioClean);
1242 
1243 	vtLock(c->lk);
1244 	b->iostate = BioDirty;
1245 	c->ndirty++;
1246 	if(c->ndirty > (c->maxdirty>>1))
1247 		vtWakeup(c->flush);
1248 	vtUnlock(c->lk);
1249 
1250 	return 1;
1251 }
1252 
1253 /*
1254  * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1255  *
1256  * The file system maintains the following invariants (i-iv checked by flchk):
1257  *
1258  * (i) b.e in [bb.e, bb.eClose)
1259  * (ii) if b.e==bb.e,  then no other b' in e points at bb.
1260  * (iii) if !(b.state&Copied) and b.e==bb.e then no other b' points at bb.
1261  * (iv) if b is active then no other active b' points at bb.
1262  * (v) if b is a past life of b' then only one of b and b' is active (too hard to check)
1263  *
1264  * The file system initially satisfies these invariants, and we can verify that
1265  * the various file system operations maintain them.  See fossil.invariants.
1266  *
1267  * Condition (i) lets us reclaim blocks once the low epoch is greater
1268  * than epochClose.
1269  *
1270  * If the condition in (iii) is satisfied, then this is the only pointer to bb,
1271  * so bb can be reclaimed once b has been written to disk.  blockRemoveLink
1272  * checks !(b.state&Copied) as an optimization.  UnlinkBlock and blockCleanup
1273  * will check the conditions again for each block they consider.
1274  */
1275 int
1276 blockRemoveLink(Block *b, u32int addr, int type, u32int tag)
1277 {
1278 	BList *bl;
1279 	BList *p, **pp;
1280 	Cache *c;
1281 
1282 	c = b->c;
1283 
1284 	/* remove unlinked block from prior list */
1285 	pp = &b->prior;
1286 	for(p=*pp; p; p=*pp){
1287 		if(p->part != PartData || p->addr != addr){
1288 			pp = &p->next;
1289 			continue;
1290 		}
1291 		*pp = p->next;
1292 		blistFree(c, p);
1293 	}
1294 
1295 	/* if b has been copied, can't reclaim blocks it points at. */
1296 	if(b->l.state & BsCopied)
1297 		return 0;
1298 
1299 	bl = blistAlloc(b);
1300 	if(bl == nil)
1301 		return unlinkBlock(b->c, addr, type, tag, b->l.epoch);
1302 
1303 	/*
1304 	 * Because bl != nil, we know b is dirty.
1305 	 * (Linking b->uhead onto a clean block is
1306 	 * counterproductive, since we only look at
1307 	 * b->uhead when a block transitions from
1308 	 * dirty to clean.)
1309 	 */
1310 	assert(b->iostate == BioDirty);
1311 
1312 	bl->part = PartData;
1313 	bl->addr = addr;
1314 	bl->type = type;
1315 	bl->tag = tag;
1316 	bl->epoch = b->l.epoch;
1317 	if(b->uhead == nil)
1318 		b->uhead = bl;
1319 	else
1320 		b->utail->next = bl;
1321 	b->utail = bl;
1322 	bl->next = nil;
1323 	return 1;
1324 }
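
/*
 * Sketch of the reclaim test implied by invariant (i): once a block has
 * been closed and the low epoch has advanced past its close epoch, no
 * snapshot can reach it any more, so its address can be reused.  This
 * mirrors the tests in cacheAllocBlock and cacheCountUsed; the helper
 * itself is illustrative only.
 */
static int
labelReclaimableSketch(Label *l, u32int epochLow)
{
	if(l->state == BsFree)
		return 1;
	return (l->state & BsClosed) && l->epochClose <= epochLow;
}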
1325 
1326 /*
1327  * set the label associated with a block.
1328  */
1329 Block*
1330 _blockSetLabel(Block *b, Label *l)
1331 {
1332 	int lpb;
1333 	Block *bb;
1334 	u32int a;
1335 	Cache *c;
1336 
1337 	c = b->c;
1338 
1339 	assert(b->part == PartData);
1340 	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
1341 	lpb = c->size / LabelSize;
1342 	a = b->addr / lpb;
1343 	bb = cacheLocal(c, PartLabel, a, OReadWrite);
1344 	if(bb == nil){
1345 		blockPut(b);
1346 		return nil;
1347 	}
1348 	b->l = *l;
1349 	labelPack(l, bb->data, b->addr%lpb);
1350 	blockDirty(bb);
1351 	return bb;
1352 }
1353 
1354 int
1355 blockSetLabel(Block *b, Label *l)
1356 {
1357 	Block *lb;
1358 	Label oldl;
1359 
1360 	oldl = b->l;
1361 	lb = _blockSetLabel(b, l);
1362 	if(lb == nil)
1363 		return 0;
1364 
1365 	/*
1366 	 * If we're allocating the block, make sure the label (lb)
1367 	 * goes to disk before the data block (b) itself.  This is to help
1368 	 * the blocks that in turn depend on b.
1369 	 *
1370 	 * Suppose bx depends on (must be written out after) b.
1371 	 * Once we write b we'll think it's safe to write bx.
1372 	 * Bx can't get at b unless it has a valid label, though.
1373 	 *
1374 	 * Allocation is the only case in which having a current label
1375 	 * is vital because:
1376 	 *
1377 	 *	- l.type is set at allocation and never changes.
1378 	 *	- l.tag is set at allocation and never changes.
1379 	 *	- l.state is not checked when we load blocks.
1380 	 *	- the archiver cares deeply about whether l.state has
1381 	 *		BsCopied set, but that's handled
1382 	 *		by direct calls to _blockSetLabel.
1383 	 */
1384 
1385 	if(oldl.state == BsFree)
1386 		blockDependency(b, lb, -1, nil, nil);
1387 	blockPut(lb);
1388 	return 1;
1389 }
1390 
1391 /*
1392  * We've decided to write out b.  Maybe b has some pointers to blocks
1393  * that haven't yet been written to disk.  If so, construct a slightly out-of-date
1394  * copy of b that is safe to write out.  (diskThread will make sure the block
1395  * remains marked as dirty.)
1396  */
1397 uchar *
1398 blockRollback(Block *b, uchar *buf)
1399 {
1400 	u32int addr;
1401 	BList *p;
1402 	Super super;
1403 
1404 	/* easy case */
1405 	if(b->prior == nil)
1406 		return b->data;
1407 
1408 	memmove(buf, b->data, b->c->size);
1409 	for(p=b->prior; p; p=p->next){
1410 		/*
1411 		 * we know p->index >= 0 because blockWrite has vetted this block for us.
1412 		 */
1413 		assert(p->index >= 0);
1414 		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1415 		if(b->part == PartSuper){
1416 			assert(p->index == 0);
1417 			superUnpack(&super, buf);
1418 			addr = globalToLocal(p->old.score);
1419 			if(addr == NilBlock){
1420 				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
1421 				abort();
1422 			}
1423 			super.active = addr;
1424 			superPack(&super, buf);
1425 			continue;
1426 		}
1427 		if(b->l.type == BtDir)
1428 			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1429 		else
1430 			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1431 	}
1432 	return buf;
1433 }
1434 
1435 /*
1436  * Try to write block b.
1437  * If b depends on other blocks:
1438  *
1439  *	If the block has been written out, remove the dependency.
1440  *	If the dependency is replaced by a more recent dependency,
1441  *		throw it out.
1442  *	If we know how to write out an old version of b that doesn't
1443  *		depend on it, do that.
1444  *
1445  *	Otherwise, bail.
1446  */
1447 int
1448 blockWrite(Block *b)
1449 {
1450 	uchar *dmap;
1451 	Cache *c;
1452 	BList *p, **pp;
1453 	Block *bb;
1454 	int lockfail;
1455 
1456 	c = b->c;
1457 
1458 	if(b->iostate != BioDirty)
1459 		return 1;
1460 
1461 	dmap = b->dmap;
1462 	memset(dmap, 0, c->ndmap);
1463 	pp = &b->prior;
1464 	for(p=*pp; p; p=*pp){
1465 		if(p->index >= 0){
1466 			/* more recent dependency has succeeded; this one can go */
1467 			if(dmap[p->index/8] & (1<<(p->index%8)))
1468 				goto ignblock;
1469 		}
1470 
1471 		lockfail = 0;
1472 		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1473 		if(bb == nil){
1474 			if(lockfail)
1475 				return 0;
1476 			/* block not in cache => was written already */
1477 			if(p->index >= 0) dmap[p->index/8] |= 1<<(p->index%8);
1478 			goto ignblock;
1479 		}
1480 
1481 		/*
1482 		 * same version of block is still in cache.
1483 		 *
1484 		 * the assertion is true because the block still has version p->vers,
1485 		 * which means it hasn't been written out since we last saw it.
1486 		 */
1487 		if(bb->iostate != BioDirty){
1488 			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
1489 				bb->part, bb->addr, bb->l.type, bb->iostate);
1490 			/* probably BioWriting if it happens? */
1491 			if(bb->iostate == BioClean)
1492 				goto ignblock;
1493 		}
1494 
1495 		blockPut(bb);
1496 
1497 		if(p->index < 0){
1498 			/*
1499 			 * We don't know how to temporarily undo
1500 			 * b's dependency on bb, so just don't write b yet.
1501 			 */
1502 			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1503 				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1504 			return 0;
1505 		}
1506 		/* keep walking down the list */
1507 		pp = &p->next;
1508 		continue;
1509 
1510 ignblock:
1511 		*pp = p->next;
1512 		blistFree(c, p);
1513 		continue;
1514 	}
1515 
1516 	/*
1517 	 * DiskWrite must never be called with a double-locked block.
1518 	 * This call to diskWrite is okay because blockWrite is only called
1519 	 * from the cache flush thread, which never double-locks a block.
1520 	 */
1521 	diskWrite(c->disk, b);
1522 	return 1;
1523 }
1524 
1525 /*
1526  * Change the I/O state of block b.
1527  * Just an assignment except for magic in
1528  * switch statement (read comments there).
1529  */
1530 void
1531 blockSetIOState(Block *b, int iostate)
1532 {
1533 	int dowakeup;
1534 	Cache *c;
1535 	BList *p, *q;
1536 
1537 if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1538 
1539 	c = b->c;
1540 
1541 	dowakeup = 0;
1542 	switch(iostate){
1543 	default:
1544 		abort();
1545 	case BioEmpty:
1546 		assert(!b->uhead);
1547 		break;
1548 	case BioLabel:
1549 		assert(!b->uhead);
1550 		break;
1551 	case BioClean:
1552 		bwatchDependency(b);
1553 		/*
1554 		 * If b->prior is set, it means a write just finished.
1555 		 * The prior list isn't needed anymore.
1556 		 */
1557 		for(p=b->prior; p; p=q){
1558 			q = p->next;
1559 			blistFree(c, p);
1560 		}
1561 		b->prior = nil;
1562 		/*
1563 		 * Freeing a block or just finished a write.
1564 		 * Move the blocks from the per-block unlink
1565 		 * queue to the cache unlink queue.
1566 		 */
1567 		if(b->iostate == BioDirty || b->iostate == BioWriting){
1568 			vtLock(c->lk);
1569 			c->ndirty--;
1570 			b->iostate = iostate;	/* change here to keep in sync with ndirty */
1571 			b->vers = c->vers++;
1572 			if(b->uhead){
1573 				/* add unlink blocks to unlink queue */
1574 				if(c->uhead == nil){
1575 					c->uhead = b->uhead;
1576 					vtWakeup(c->unlink);
1577 				}else
1578 					c->utail->next = b->uhead;
1579 				c->utail = b->utail;
1580 				b->uhead = nil;
1581 			}
1582 			vtUnlock(c->lk);
1583 		}
1584 		assert(!b->uhead);
1585 		dowakeup = 1;
1586 		break;
1587 	case BioDirty:
1588 		/*
1589 		 * Wrote out an old version of the block (see blockRollback).
1590 		 * Bump a version count, leave it dirty.
1591 		 */
1592 		if(b->iostate == BioWriting){
1593 			vtLock(c->lk);
1594 			b->vers = c->vers++;
1595 			vtUnlock(c->lk);
1596 			dowakeup = 1;
1597 		}
1598 		break;
1599 	case BioReading:
1600 	case BioWriting:
1601 		/*
1602 		 * Adding block to disk queue.  Bump reference count.
1603 		 * diskThread decs the count later by calling blockPut.
1604 		 * This is here because we need to lock c->lk to
1605 		 * manipulate the ref count.
1606 		 */
1607 		vtLock(c->lk);
1608 		b->ref++;
1609 		vtUnlock(c->lk);
1610 		break;
1611 	case BioReadError:
1612 	case BioVentiError:
1613 		/*
1614 		 * Oops.
1615 		 */
1616 		dowakeup = 1;
1617 		break;
1618 	}
1619 	b->iostate = iostate;
1620 	/*
1621 	 * Now that the state has changed, we can wake the waiters.
1622 	 */
1623 	if(dowakeup)
1624 		vtWakeupAll(b->ioready);
1625 }
1626 
1627 char*
1628 bsStr(int state)
1629 {
1630 	static char s[100];
1631 
1632 	if(state == BsFree)
1633 		return "Free";
1634 	if(state == BsBad)
1635 		return "Bad";
1636 
1637 	sprint(s, "%x", state);
1638 	if(!(state&BsAlloc))
1639 		strcat(s, ",Free");	/* should not happen */
1640 	if(state&BsCopied)
1641 		strcat(s, ",Copied");
1642 	if(state&BsVenti)
1643 		strcat(s, ",Venti");
1644 	if(state&BsClosed)
1645 		strcat(s, ",Closed");
1646 	return s;
1647 }
1648 
1649 char *
1650 bioStr(int iostate)
1651 {
1652 	switch(iostate){
1653 	default:
1654 		return "Unknown!!";
1655 	case BioEmpty:
1656 		return "Empty";
1657 	case BioLabel:
1658 		return "Label";
1659 	case BioClean:
1660 		return "Clean";
1661 	case BioDirty:
1662 		return "Dirty";
1663 	case BioReading:
1664 		return "Reading";
1665 	case BioWriting:
1666 		return "Writing";
1667 	case BioReadError:
1668 		return "ReadError";
1669 	case BioVentiError:
1670 		return "VentiError";
1671 	case BioMax:
1672 		return "Max";
1673 	}
1674 }
1675 
1676 static char *bttab[] = {
1677 	"BtData",
1678 	"BtData+1",
1679 	"BtData+2",
1680 	"BtData+3",
1681 	"BtData+4",
1682 	"BtData+5",
1683 	"BtData+6",
1684 	"BtData+7",
1685 	"BtDir",
1686 	"BtDir+1",
1687 	"BtDir+2",
1688 	"BtDir+3",
1689 	"BtDir+4",
1690 	"BtDir+5",
1691 	"BtDir+6",
1692 	"BtDir+7",
1693 };
1694 
1695 char*
1696 btStr(int type)
1697 {
1698 	if(type < nelem(bttab))
1699 		return bttab[type];
1700 	return "unknown";
1701 }
1702 
1703 int
1704 labelFmt(Fmt *f)
1705 {
1706 	Label *l;
1707 
1708 	l = va_arg(f->args, Label*);
1709 	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1710 		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1711 }
1712 
1713 int
1714 scoreFmt(Fmt *f)
1715 {
1716 	uchar *v;
1717 	int i;
1718 	u32int addr;
1719 
1720 	v = va_arg(f->args, uchar*);
1721 	if(v == nil){
1722 		fmtprint(f, "*");
1723 	}else if((addr = globalToLocal(v)) != NilBlock)
1724 		fmtprint(f, "0x%.8ux", addr);
1725 	else{
1726 		for(i = 0; i < VtScoreSize; i++)
1727 			fmtprint(f, "%2.2ux", v[i]);
1728 	}
1729 
1730 	return 0;
1731 }
1732 
1733 static int
1734 upHeap(int i, Block *b)
1735 {
1736 	Block *bb;
1737 	u32int now;
1738 	int p;
1739 	Cache *c;
1740 
1741 	c = b->c;
1742 	now = c->now;
1743 	for(; i != 0; i = p){
1744 		p = (i - 1) >> 1;
1745 		bb = c->heap[p];
1746 		if(b->used - now >= bb->used - now)
1747 			break;
1748 		c->heap[i] = bb;
1749 		bb->heap = i;
1750 	}
1751 	c->heap[i] = b;
1752 	b->heap = i;
1753 
1754 	return i;
1755 }
1756 
1757 static int
1758 downHeap(int i, Block *b)
1759 {
1760 	Block *bb;
1761 	u32int now;
1762 	int k;
1763 	Cache *c;
1764 
1765 	c = b->c;
1766 	now = c->now;
1767 	for(; ; i = k){
1768 		k = (i << 1) + 1;
1769 		if(k >= c->nheap)
1770 			break;
1771 		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1772 			k++;
1773 		bb = c->heap[k];
1774 		if(b->used - now <= bb->used - now)
1775 			break;
1776 		c->heap[i] = bb;
1777 		bb->heap = i;
1778 	}
1779 	c->heap[i] = b;
1780 	b->heap = i;
1781 	return i;
1782 }
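
/*
 * Both heap walks compare ages as (b->used - now) in unsigned arithmetic
 * rather than comparing b->used directly, so the ordering survives c->now
 * wrapping around 2^32.  A worked example with hypothetical values:
 *
 *	now = 0x00000005
 *	a->used = 0xfffffffe	(touched 7 ticks ago, just before the wrap)
 *	b->used = 0x00000003	(touched 2 ticks ago)
 *
 *	a->used - now = 0xfffffff9 < b->used - now = 0xfffffffe
 *
 * so a sorts toward the root and is evicted first, as it should be; a naive
 * comparison of the raw timestamps would have picked b instead.
 */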
1783 
1784 /*
1785  * Delete a block from the heap.
1786  * Called with c->lk held.
1787  */
1788 static void
1789 heapDel(Block *b)
1790 {
1791 	int i, si;
1792 	Cache *c;
1793 
1794 	c = b->c;
1795 
1796 	si = b->heap;
1797 	if(si == BadHeap)
1798 		return;
1799 	b->heap = BadHeap;
1800 	c->nheap--;
1801 	if(si == c->nheap)
1802 		return;
1803 	b = c->heap[c->nheap];
1804 	i = upHeap(si, b);
1805 	if(i == si)
1806 		downHeap(i, b);
1807 }
1808 
1809 /*
1810  * Insert a block into the heap.
1811  * Called with c->lk held.
1812  */
1813 static void
1814 heapIns(Block *b)
1815 {
1816 	assert(b->heap == BadHeap);
1817 	upHeap(b->c->nheap++, b);
1818 	vtWakeup(b->c->heapwait);
1819 }
1820 
1821 /*
1822  * Get just the label for a block.
1823  */
1824 static int
1825 readLabel(Cache *c, Label *l, u32int addr)
1826 {
1827 	int lpb;
1828 	Block *b;
1829 	u32int a;
1830 
1831 	lpb = c->size / LabelSize;
1832 	a = addr / lpb;
1833 	b = cacheLocal(c, PartLabel, a, OReadOnly);
1834 	if(b == nil){
1835 		blockPut(b);
1836 		return 0;
1837 	}
1838 
1839 	if(!labelUnpack(l, b->data, addr%lpb)){
1840 		blockPut(b);
1841 		return 0;
1842 	}
1843 	blockPut(b);
1844 	return 1;
1845 }
1846 
1847 /*
1848  * Process unlink queue.
1849  * Called with c->lk held.
1850  */
1851 static void
1852 unlinkBody(Cache *c)
1853 {
1854 	BList *p;
1855 
1856 	while(c->uhead != nil){
1857 		p = c->uhead;
1858 		c->uhead = p->next;
1859 		vtUnlock(c->lk);
1860 
1861 		if(!unlinkBlock(c, p->addr, p->type, p->tag, p->epoch))
1862 			fprint(2, "unlinkBlock failed: addr=%x type=%d tag = %ux: %r\n",
1863 				p->addr, p->type, p->tag);
1864 
1865 		vtLock(c->lk);
1866 		p->next = c->blfree;
1867 		c->blfree = p;
1868 	}
1869 }
1870 
1871 /*
1872  * Occasionally unlink the blocks on the cache unlink queue.
1873  */
1874 static void
1875 unlinkThread(void *a)
1876 {
1877 	Cache *c = a;
1878 
1879 	vtThreadSetName("unlink");
1880 
1881 	vtLock(c->lk);
1882 	for(;;){
1883 		while(c->uhead == nil && c->die == nil)
1884 			vtSleep(c->unlink);
1885 		if(c->die != nil)
1886 			break;
1887 		unlinkBody(c);
1888 	}
1889 	c->ref--;
1890 	vtWakeup(c->die);
1891 	vtUnlock(c->lk);
1892 }
1893 
1894 static int
1895 baddrCmp(void *a0, void *a1)
1896 {
1897 	BAddr *b0, *b1;
1898 	b0 = a0;
1899 	b1 = a1;
1900 
1901 	if(b0->part < b1->part)
1902 		return -1;
1903 	if(b0->part > b1->part)
1904 		return 1;
1905 	if(b0->addr < b1->addr)
1906 		return -1;
1907 	if(b0->addr > b1->addr)
1908 		return 1;
1909 	return 0;
1910 }
1911 
1912 /*
1913  * Scan the block list for dirty blocks; add them to the list c->baddr.
1914  */
1915 static void
1916 flushFill(Cache *c)
1917 {
1918 	int i, ndirty;
1919 	BAddr *p;
1920 	Block *b;
1921 
1922 	vtLock(c->lk);
1923 	if(c->ndirty == 0){
1924 		vtUnlock(c->lk);
1925 		return;
1926 	}
1927 
1928 	p = c->baddr;
1929 	ndirty = 0;
1930 	for(i=0; i<c->nblocks; i++){
1931 		b = c->blocks + i;
1932 		if(b->part == PartError)
1933 			continue;
1934 		if(b->iostate == BioDirty || b->iostate == BioWriting)
1935 			ndirty++;
1936 		if(b->iostate != BioDirty)
1937 			continue;
1938 		p->part = b->part;
1939 		p->addr = b->addr;
1940 		p->vers = b->vers;
1941 		p++;
1942 	}
1943 	if(ndirty != c->ndirty){
1944 		fprint(2, "ndirty mismatch expected %d found %d\n",
1945 			c->ndirty, ndirty);
1946 		c->ndirty = ndirty;
1947 	}
1948 	vtUnlock(c->lk);
1949 
1950 	c->bw = p - c->baddr;
1951 	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1952 }
1953 
1954 /*
1955  * This is not thread safe, i.e. it can't be called from multiple threads.
1956  *
1957  * That's okay for how we use it: it is only called from flushThread,
1958  * and from cacheFree, which calls it only after it has killed off
1959  * the flushThread.
1960  */
1961 static int
1962 cacheFlushBlock(Cache *c)
1963 {
1964 	Block *b;
1965 	BAddr *p;
1966 	int lockfail, nfail;
1967 
1968 	nfail = 0;
1969 	for(;;){
1970 		if(c->br == c->be){
1971 			if(c->bw == 0 || c->bw == c->be)
1972 				flushFill(c);
1973 			c->br = 0;
1974 			c->be = c->bw;
1975 			c->bw = 0;
1976 			c->nflush = 0;
1977 		}
1978 
1979 		if(c->br == c->be)
1980 			return 0;
1981 		p = c->baddr + c->br;
1982 		c->br++;
1983 		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
1984 
1985 		if(b && blockWrite(b)){
1986 			c->nflush++;
1987 			blockPut(b);
1988 			return 1;
1989 		}
1990 		if(b)
1991 			blockPut(b);
1992 
1993 		/*
1994 		 * Why didn't we write the block?
1995 		 */
1996 
1997 		/* Block already written out */
1998 		if(b == nil && !lockfail)
1999 			continue;
2000 
2001 		/* Failed to acquire lock; sleep if happens a lot. */
2002 		if(lockfail && ++nfail > 100){
2003 			sleep(500);
2004 			nfail = 0;
2005 		}
2006 		/* Requeue block. */
2007 		if(c->bw < c->be)
2008 			c->baddr[c->bw++] = *p;
2009 	}
2010 	return 0;
2011 }
2012 
2013 /*
2014  * Occasionally flush dirty blocks from memory to the disk.
2015  */
2016 static void
2017 flushThread(void *a)
2018 {
2019 	Cache *c = a;
2020 	int i;
2021 
2022 	vtThreadSetName("flush");
2023 	vtLock(c->lk);
2024 	while(c->die == nil){
2025 		vtSleep(c->flush);
2026 		vtUnlock(c->lk);
2027 		for(i=0; i<FlushSize; i++)
2028 			if(!cacheFlushBlock(c)){
2029 				/*
2030 				 * If i==0, could be someone is waking us repeatedly
2031 				 * to flush the cache but there's no work to do.
2032 				 * Pause a little.
2033 				 */
2034 				if(i==0)
2035 					sleep(250);
2036 				break;
2037 			}
2038 		if(i==0 && c->ndirty){
2039 			/*
2040 			 * All the blocks are being written right now -- there's nothing to do.
2041 			 * We might be spinning with cacheFlush though -- he'll just keep
2042 			 * kicking us until c->ndirty goes down.  Probably we should sleep
2043 			 * on something that the diskThread can kick, but for now we'll
2044 			 * just pause for a little while waiting for disks to finish.
2045 			 */
2046 			sleep(100);
2047 		}
2048 		vtLock(c->lk);
2049 		vtWakeupAll(c->flushwait);
2050 	}
2051 	c->ref--;
2052 	vtWakeup(c->die);
2053 	vtUnlock(c->lk);
2054 }
2055 
2056 /*
2057  * Keep flushing until everything is clean.
2058  */
2059 void
2060 cacheFlush(Cache *c, int wait)
2061 {
2062 	vtLock(c->lk);
2063 	if(wait){
2064 		while(c->ndirty){
2065 		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
2066 		//		c->ndirty, c->uhead);
2067 			vtWakeup(c->flush);
2068 			vtSleep(c->flushwait);
2069 		}
2070 	//	consPrint("cacheFlush: done (%d dirty, uhead %p)\n", c->ndirty, c->uhead);
2071 	}else
2072 		vtWakeup(c->flush);
2073 	vtUnlock(c->lk);
2074 }
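
/*
 * A minimal sketch of the two ways callers use cacheFlush, assuming a cache
 * c that is already open: the synchronous form blocks until every dirty
 * block has been written; the asynchronous form just kicks the flush thread,
 * which is all cacheSync below does every 30 seconds.
 */
static void
flushSketch(Cache *c)
{
	cacheFlush(c, 1);	/* wait until c->ndirty reaches zero */
	cacheFlush(c, 0);	/* wake flushThread and return immediately */
}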
2075 
2076 /*
2077  * Kick the flushThread every 30 seconds.
2078  */
2079 static void
2080 cacheSync(void *v)
2081 {
2082 	Cache *c;
2083 
2084 	c = v;
2085 	cacheFlush(c, 0);
2086 }
2087