xref: /plan9-contrib/sys/src/cmd/fossil/cache.c (revision f078af31cd4d3450bd40403b94cec9a5fcdabac3)
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */
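
/*
 * Illustrative sketch (not part of the original source): the round trip
 * this convention implies for a block of c->size bytes holding n valid
 * bytes.  vtzerotruncate is assumed here from libventi; this file itself
 * only calls vtzeroextend.
 *
 *	vtzeroextend(vtType[type], data, n, c->size);	// after a read: pad the tail with zeros
 *	n = vtzerotruncate(vtType[type], data, c->size);	// before a Venti write: drop the zero tail
 */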

struct Cache
{
	QLock	lk;
	int 	ref;
	int	mode;

	Disk 	*disk;
	int	size;			/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtConn *z;
	u32int	now;			/* ticks for usage timestamps */
	Block	**heads;		/* hash table for finding address */
	int	nheap;			/* number of available victims */
	Block	**heap;			/* heap for locating victims */
	long	nblocks;		/* number of blocks allocated */
	Block	*blocks;		/* array of block descriptors */
	u8int	*mem;			/* memory for all block data & blists */

	BList	*blfree;
	Rendez	blrend;

	int 	ndirty;			/* number of dirty blocks in the cache */
	int 	maxdirty;		/* max number of dirty blocks */
	u32int	vers;

	long hashSize;

	FreeList *fl;

	Rendez die;			/* daemon threads should die when die.l != nil */

	Rendez flush;
	Rendez flushwait;
	Rendez heapwait;
	BAddr *baddr;
	int bw, br, be;
	int nflush;

	Periodic *sync;

	/* unlink daemon */
	BList *uhead;
	BList *utail;
	Rendez unlink;

	/* block counts */
	int nused;
	int ndisk;
};

struct BList {
	int part;
	u32int addr;
	uchar type;
	u32int tag;
	u32int epoch;
	u32int vers;

	int recurse;	/* for block unlink */

	/* for roll back */
	int index;			/* -1 indicates not valid */
	union {
		uchar score[VtScoreSize];
		uchar entry[VtEntrySize];
	} old;
	BList *next;
};

struct BAddr {
	int part;
	u32int addr;
	u32int vers;
};

struct FreeList {
	QLock lk;
	u32int last;		/* last block allocated */
	u32int end;		/* end of data partition */
	u32int nused;		/* number of used blocks */
	u32int epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtDataType+1,		/* BtData | 1  */
	VtDataType+2,		/* BtData | 2  */
	VtDataType+3,		/* BtData | 3  */
	VtDataType+4,		/* BtData | 4  */
	VtDataType+5,		/* BtData | 5  */
	VtDataType+6,		/* BtData | 6  */
	VtDataType+7,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtDirType+1,		/* BtDir | 1  */
	VtDirType+2,		/* BtDir | 2  */
	VtDirType+3,		/* BtDir | 3  */
	VtDirType+4,		/* BtDir | 4  */
	VtDirType+5,		/* BtDir | 5  */
	VtDirType+6,		/* BtDir | 6  */
	VtDirType+7,		/* BtDir | 7  */
};
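
/*
 * Example (illustrative, not in the original): the low bits of a local
 * type are the pointer depth and carry over unchanged, so a depth-2
 * directory pointer block maps as
 *
 *	vtType[BtDir|2] == VtDirType+2
 *
 * which is how cacheGlobal below picks the type for vtread.
 */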

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtConn *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtmallocz(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtmallocz(c->hashSize*sizeof(Block*));
	c->heap = vtmallocz(nblocks*sizeof(Block*));
	c->blocks = vtmallocz(nblocks*sizeof(Block));
	c->mem = vtmallocz(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtmallocz(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready.l = &b->lk;
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend.l = &c->lk;

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink.l = &c->lk;
	c->flush.l = &c->lk;
	c->flushwait.l = &c->lk;
	c->heapwait.l = &c->lk;
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		proccreate(unlinkThread, c, STACK);
		proccreate(flushThread, c, STACK);
	}
	cacheCheck(c);

	return c;
}
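
/*
 * Usage sketch (illustrative; diskAlloc and vtdial are assumptions
 * drawn from the rest of fossil, not defined in this file):
 *
 *	Disk *disk = diskAlloc(fd);			// fd open on the fossil disk
 *	VtConn *z = vtdial(nil);			// Venti connection, may be nil
 *	Cache *c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	...
 *	cacheFree(c);					// flushes dirty blocks, then frees
 *
 * OReadWrite is what starts the unlink and flush daemons; a read-only
 * cache runs without them.
 */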

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	qlock(&c->lk);
	c->die.l = &c->lk;
	periodicKill(c->sync);
	rwakeup(&c->flush);
	rwakeup(&c->unlink);
	while(c->ref > 1)
		rsleep(&c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		qunlock(&c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		qlock(&c->lk);
	} while(c->uhead || c->ndirty);
	qunlock(&c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
	}
	flFree(c->fl);
	vtfree(c->baddr);
	vtfree(c->heads);
	vtfree(c->blocks);
	vtfree(c->mem);
	diskFree(c->disk);
	/* don't close vtSession */
	vtfree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			sysfatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			sysfatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			sysfatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			sysfatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			sysfatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
if(c->nheap + refed != c->nblocks){
fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
cacheDump(c);
}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			rwakeup(&c->flush);
			rsleep(&c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		qunlock(&c->lk);
		return nil;
	}
	if(!waitlock && !canqlock(&b->lk)){
		*lockfailure = 1;
		qunlock(&c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	qunlock(&c->lk);

	bwatchLock(b);
	if(waitlock)
		qlock(&b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			rsleep(&b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			werrstr("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			werrstr("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, Waitlock, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			qunlock(&c->lk);
			werrstr(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	qunlock(&c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	qlock(&b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		werrstr(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioLabel:
			if(mode == OOverWrite)
				/*
				 * leave iostate as BioLabel because data
				 * hasn't been read.
				 */
				return b;
			/* fall through */
		case BioEmpty:
			diskRead(c->disk, b);
			rsleep(&b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			rsleep(&b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			werrstr("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		werrstr(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}
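
/*
 * Usage sketch (illustrative, not from the original): fetching a known
 * pointer block and failing cleanly on a label mismatch.  addr, tag and
 * epoch are whatever the caller recorded when the block was allocated.
 *
 *	b = cacheLocalData(c, addr, BtDir, tag, OReadOnly, epoch);
 *	if(b == nil){
 *		fprint(2, "%s: read failed: %r\n", argv0);
 *		return 0;
 *	}
 *	...use b->data...
 *	blockPut(b);
 *
 * Passing epoch 0 skips the epoch check, as cacheLocal does.
 */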

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	qunlock(&c->lk);

	bwatchLock(b);
	qlock(&b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtread(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || vtsha1check(score, b->data, n) < 0){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			werrstr(
			"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtzeroextend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		werrstr("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		werrstr("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	qlock(&fl->lk);
	addr = fl->last;
	nwrap = 0;
NotFound:
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %r\n", argv0);
		qunlock(&fl->lk);
		return nil;
	}
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				werrstr("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %r\n",
						argv0);
				fl->last = 0;
				qunlock(&fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %r\n", argv0);
				qunlock(&fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %r\n", argv0);
		return nil;
	}
	if(!(b->iostate == BioLabel || b->iostate == BioClean)){
		if(0)fprint(2, "%s: cacheAllocBlock addr %ud iostate %s label %L\n",
		       argv0, addr, bioStr(b->iostate), &lab);
		blockPut(b);
		goto NotFound;
	}
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %r\n", argv0);
		blockPut(b);
		return nil;
	}
	vtzeroextend(vtType[type], b->data, 0, c->size);
if(0)diskWrite(c->disk, b);

if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	qunlock(&fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	qlock(&fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		qunlock(&fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %r\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	qunlock(&fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtmallocz(sizeof(*fl));
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtfree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}
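
/*
 * Example (illustrative): a thread that holds the same locked block
 * under two names balances each name with its own blockPut.
 *
 *	blockDupLock(b);	// nlock: 1 -> 2
 *	bb = b;			// second name for the same block
 *	...
 *	blockPut(bb);		// nlock back to 1
 *	blockPut(b);		// unlocks and drops the reference
 */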

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	c = b->c;	/* set before the debug print below, which was reading c uninitialized */
if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and rsleep
	 * conspire to assume that they can just qlock(&b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	qunlock(&b->lk);
	qlock(&c->lk);

	if(--b->ref > 0){
		qunlock(&c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	qunlock(&c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/entry
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}
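
/*
 * Typical call (illustrative, not from the original): before overwriting
 * the i'th score in a locked pointer block b with the address of its
 * dirty child bb, record that bb must reach disk first and save the old
 * score for rollback.
 *
 *	blockDependency(b, bb, i, b->data + i*VtScoreSize, nil);
 *	memmove(b->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(b);
 *
 * For a BtDir block the old contents travel as the Entry* argument
 * instead of a score.
 */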

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean || b->iostate == BioLabel);

	qlock(&c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		rwakeup(&c->flush);
	qunlock(&c->lk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b, int waitlock)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
			&lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)	/* p->index can be -1; don't index dmap with it */
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			qlock(&c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					rwakeup(&c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			qunlock(&c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			qlock(&c->lk);
			b->vers = c->vers++;
			qunlock(&c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		qlock(&c->lk);
		b->ref++;
		qunlock(&c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		rwakeupall(&b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}
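
/*
 * Worked example of orderings (1) and (2) above (illustrative): a caller
 * replacing child b of parent block p at pointer index i typically does
 *
 *	addr = b->addr;					// blockCopy puts b, so save this first
 *	bb = blockCopy(b, tag, ehi, elo);		// "bb after lbb" recorded inside
 *	blockDependency(p, bb, i, p->data + i*VtScoreSize, nil);	// "p after bb"
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockRemoveLink(p, addr, bb->l.type, bb->l.tag, 0);	// close b once p hits disk
 *	blockDirty(p);
 *
 * This is roughly the pattern fossil's callers follow; error handling
 * is elided here.
 */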

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	if(b->part == PartSuper && b->iostate == BioClean)
		p = nil;
	else
		p = blistAlloc(b);
	if(p == nil){
		/*
		 * b has already been written to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		/* lock ordering: trying for c->fl->lk while holding b->lk can deadlock */
		if(!canqlock(&c->fl->lk)){
			blockPut(b);
			qlock(&c->fl->lk);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OOverWrite, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block at end of doRemoveLink\n",
					argv0);
				qunlock(&c->fl->lk);
				return;
			}
		}
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		qunlock(&c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	qlock(&c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			qunlock(&c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			rwakeup(&c->flush);
			rsleep(&c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	qunlock(&c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	qlock(&c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	rwakeup(&c->blrend);
	qunlock(&c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}
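
/*
 * Note on the comparisons above (not in the original): using
 * b->used - now instead of b->used keeps the heap ordering correct
 * across wraparound of the 32-bit tick counter.  Worked example: with
 * now = 5, used1 = 0xfffffffe (stamped just before the wrap) and
 * used2 = 3 (just after), used1 - now = 0xfffffff9 and
 * used2 - now = 0xfffffffe, so used1 still sorts as older, as intended.
 */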

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	rwakeup(&b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}
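
/*
 * Worked example of the address arithmetic (illustrative, assuming
 * 8192-byte blocks and fossil's 14-byte LabelSize): lpb = 8192/14 = 585,
 * so the label for data block 100000 lives in label block
 * 100000/585 = 170 at offset 100000%585 = 550 within that block.
 */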

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		qunlock(&c->lk);
		doRemoveLink(c, p);
		qlock(&c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	threadsetname("unlink");

	qlock(&c->lk);
	for(;;){
		while(c->uhead == nil && c->die.l == nil)
			rsleep(&c->unlink);
		if(c->die.l != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	qlock(&c->lk);
	if(c->ndirty == 0){
		qunlock(&c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	qunlock(&c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
			&lockfail);

		if(b && blockWrite(b, Nowaitlock)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	threadsetname("flush");
	qlock(&c->lk);
	while(c->die.l == nil){
		rsleep(&c->flush);
		qunlock(&c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		qlock(&c->lk);
		rwakeupall(&c->flushwait);
	}
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	qlock(&c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			rwakeup(&c->flush);
			rsleep(&c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		rwakeup(&c->flush);
	qunlock(&c->lk);
}
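
/*
 * Usage sketch (illustrative): a caller that must know the data is on
 * disk waits; background housekeeping just gives the flusher a kick.
 *
 *	cacheFlush(c, 1);	// returns only once c->ndirty reaches 0
 *	cacheFlush(c, 0);	// wakes flushThread and returns at once
 */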

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}