xref: /plan9/sys/src/cmd/fossil/cache.c (revision e12a987081f10894b49298514b3a97b41db862b0)
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

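/*
 * Editorial sketch, not part of the original source: the convention
 * above means a short block read from Venti is padded with zeros out
 * to c->size while in memory, and trailing zeros are dropped again on
 * the way back out.  Assuming the venti library's vtZeroExtend and
 * vtZeroTruncate, the round trip looks roughly like:
 *
 *	n = vtRead(z, score, vtType[type], buf, size);	// may return n < size
 *	vtZeroExtend(vtType[type], buf, n, size);	// pad to full cache block
 *	...
 *	n = vtZeroTruncate(vtType[type], buf, size);	// drop trailing zeros
 *	vtWrite(z, score, vtType[type], buf, n);	// store the short block
 */
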
struct Cache
{
	VtLock	*lk;
	int 	ref;
	int	mode;

	Disk 	*disk;
	int	size;			/* block size */
	int	ndmap;			/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;			/* ticks for usage timestamps */
	Block	**heads;		/* hash table for finding address */
	int	nheap;			/* number of available victims */
	Block	**heap;			/* heap for locating victims */
	long	nblocks;		/* number of blocks allocated */
	Block	*blocks;		/* array of block descriptors */
	u8int	*mem;			/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int 	ndirty;			/* number of dirty blocks in the cache */
	int 	maxdirty;		/* max number of dirty blocks */
	u32int	vers;

	long hashSize;

	FreeList *fl;

	VtRendez *die;			/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr *baddr;
	int bw, br, be;
	int nflush;

	Periodic *sync;

	/* unlink daemon */
	BList *uhead;
	BList *utail;
	VtRendez *unlink;

	/* block counts */
	int nused;
	int ndisk;
};

struct BList {
	int part;
	u32int addr;
	uchar type;
	u32int tag;
	u32int epoch;
	u32int vers;

	int recurse;	/* for block unlink */

	/* for roll back */
	int index;			/* -1 indicates not valid */
	union {
		uchar score[VtScoreSize];
		uchar entry[VtEntrySize];
	} old;
	BList *next;
};

struct BAddr {
	int part;
	u32int addr;
	u32int vers;
};

struct FreeList {
	VtLock *lk;
	u32int last;		/* last block allocated */
	u32int end;		/* end of data partition */
	u32int nused;		/* number of used blocks */
	u32int epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtPointerType0,		/* BtData | 1  */
	VtPointerType1,		/* BtData | 2  */
	VtPointerType2,		/* BtData | 3  */
	VtPointerType3,		/* BtData | 4  */
	VtPointerType4,		/* BtData | 5  */
	VtPointerType5,		/* BtData | 6  */
	VtPointerType6,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtPointerType0,		/* BtDir | 1  */
	VtPointerType1,		/* BtDir | 2  */
	VtPointerType2,		/* BtDir | 3  */
	VtPointerType3,		/* BtDir | 4  */
	VtPointerType4,		/* BtDir | 5  */
	VtPointerType5,		/* BtDir | 6  */
	VtPointerType6,		/* BtDir | 7  */
};
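
/*
 * Reading the table: a BtDir|3 block is a pointer block three levels
 * above directory data, and vtType[BtDir|3] is VtPointerType2; Venti
 * pointer types count levels above the leaf starting at 0, so local
 * level n maps to VtPointerType(n-1) for both the BtData and BtDir
 * halves.  (Explanatory note, not in the original source.)
 */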

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}
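
/*
 * Minimal usage sketch (not in the original source; fossil's real
 * callers set up disk and z elsewhere): create a read-write cache of
 * roughly a thousand blocks and tear it down again.
 *
 *	Cache *c;
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	...use c via cacheLocal, cacheGlobal, cacheAllocBlock...
 *	cacheFree(c);	// flushes every dirty block before freeing
 */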

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
if(c->nheap + refed != c->nblocks){
fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
cacheDump(c);
}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, Waitlock, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioLabel:
			if(mode == OOverWrite)
				/*
				 * leave iostate as BioLabel because data
				 * hasn't been read.
				 */
				return b;
			/* fall through */
		case BioEmpty:
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
			"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				vtSetError("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
						argv0);
				fl->last = 0;
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
		return nil;
	}
assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
if(0)diskWrite(c->disk, b);

if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}
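
/*
 * Hedged sketch of a typical caller (the real call sites live in
 * fs.c/file.c and differ in detail): allocate a fresh data block in
 * the current epoch, fill it, and mark it dirty so the flush thread
 * writes it back.
 *
 *	b = cacheAllocBlock(c, BtData, tag, ehi, elo);
 *	if(b == nil)
 *		return 0;	// disk full or label i/o error
 *	memmove(b->data, buf, n);
 *	blockDirty(b);
 *	blockPut(b);
 */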

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}
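
/*
 * Illustrative sketch (not in the original source; f is a
 * hypothetical holder struct): a thread that stores a second
 * reference to a block it has locked bumps nlock so the matching
 * blockPut calls balance.
 *
 *	b = cacheLocal(c, PartData, addr, OReadWrite);	// b->nlock == 1
 *	blockDupLock(b);				// b->nlock == 2
 *	f->block = b;					// b now known by two names
 *	...
 *	blockPut(f->block);				// back to nlock == 1
 *	blockPut(b);					// unlocks and releases b
 */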

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;
	c = b->c;	/* set before the debug print below, which reads c->nheap */

if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}
int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (lb)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}
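
/*
 * Concrete reading of the comment above (editorial note): when
 * cacheAllocBlock labels a fresh block it calls blockSetLabel(b, &lab, 1),
 * which boils down to
 *
 *	lb = _blockSetLabel(b, l);		// lb is the now-dirty label block
 *	blockDependency(b, lb, -1, nil, nil);	// b may reach disk only after lb
 *
 * so the flush thread refuses to write b's data while lb is still
 * dirty, and any block that points at b can trust b's on-disk label.
 */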

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}
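
/*
 * Hedged usage sketch (the real call sites are in source.c/file.c and
 * differ in detail): before overwriting score slot i of a pointer
 * block p with the address of a still-dirty child bb, record the
 * dependency together with the old score so p can be rolled back if
 * it has to be written first.
 *
 *	blockDependency(p, bb, i, p->data + i*VtScoreSize, nil);
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 */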

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean || b->iostate == BioLabel);

	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}
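
/*
 * Worked example (editorial): suppose pointer block b had score slot 3
 * redirected to a child that is still dirty.  blockWrite can't drop
 * that dependency, but the prior entry holds the old score for index 3,
 * so the copy produced here has slot 3 restored -- a consistent older
 * version that is safe to put on disk.  The write completes as
 * BioWriting -> BioDirty (see blockSetIOState), so b stays dirty and
 * is written again once the child lands.
 */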

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b, int waitlock)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
			&lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)	/* unexplained priors have no dmap slot */
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}
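
/*
 * Worked example for the epoch machinery above (editorial): say the
 * active epoch is 5 and the lowest live snapshot epoch is 2 (ehi=5,
 * elo=2), and b carries l.epoch 3.  Then b belongs to a snapshot and
 * must be copied, not modified in place:
 *
 *	bb = blockCopy(b, tag, 5, 2);	// bb gets l.epoch = 5
 *
 * BsCopied goes to b's label first (so the archiver can tell), then
 * the data is copied.  The caller still has to splice bb into the
 * parent, record the "p after bb" dependency, and call blockRemoveLink
 * for b once the parent is written, per the comment above.
 */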

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	if(b->part == PartSuper && b->iostate == BioClean)
		p = nil;
	else
		p = blistAlloc(b);
	if(p == nil){
		/*
		 * b has already been written to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		vtLock(c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		vtUnlock(c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}
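
/*
 * Note on the comparisons in upHeap/downHeap (editorial): ages are
 * compared as b->used - now in unsigned arithmetic rather than as raw
 * b->used values, so the ordering survives wraparound of the c->now
 * tick counter.  E.g. with now = 10 just after a wrap, a block stamped
 * used = 0xfffffffe (12 ticks ago) gives used - now = 0xfffffff4,
 * while one stamped used = 5 (5 ticks ago) gives 0xfffffffb; the
 * smaller value is the older block, which correctly rises to the root
 * for eviction.
 */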

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}
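
/*
 * Arithmetic sketch (editorial; assumes fossil's 14-byte labels): with
 * 8192-byte cache blocks, lpb = 8192/14 = 585 labels per block, so the
 * label for data block addr 100000 lives in label block 100000/585 = 170
 * at offset 100000%585 = 550.  cacheAllocBlock and _blockSetLabel use
 * the same addr/lpb, addr%lpb mapping.
 */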

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * That's okay here, because it is only called from the flushThread,
 * and from cacheFree, but only after cacheFree has killed off the
 * flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
			&lockfail);

		if(b && blockWrite(b, Nowaitlock)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
}
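
/*
 * Usage sketch (editorial): callers that need everything on disk,
 * e.g. before taking a snapshot, do a blocking flush; the periodic
 * sync below is the fire-and-forget form.
 *
 *	cacheFlush(c, 1);	// wait here until c->ndirty reaches 0
 *	cacheFlush(c, 0);	// just kick the flush thread
 */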

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}
2115