1 #include "stdinc.h"
2 #include "dat.h"
3 #include "fns.h"
4 #include "error.h"
5
6 #include "9.h" /* for cacheFlush */
7
8 typedef struct FreeList FreeList;
9 typedef struct BAddr BAddr;
10
11 enum {
12 BadHeap = ~0,
13 };
14
15 /*
16 * Store data to the memory cache in c->size blocks
17 * with the block zero extended to fill it out. When writing to
18 * Venti, the block will be zero truncated. The walker will also check
19 * that the block fits within psize or dsize as the case may be.
20 */
21
22 struct Cache
23 {
24 VtLock *lk;
25 int ref;
26 int mode;
27
28 Disk *disk;
29 int size; /* block size */
30 int ndmap; /* size of per-block dirty pointer map used in blockWrite */
31 VtSession *z;
32 u32int now; /* ticks for usage timestamps */
33 Block **heads; /* hash table for finding address */
34 int nheap; /* number of available victims */
35 Block **heap; /* heap for locating victims */
36 long nblocks; /* number of blocks allocated */
37 Block *blocks; /* array of block descriptors */
38 u8int *mem; /* memory for all block data & blists */
39
40 BList *blfree;
41 VtRendez *blrend;
42
43 int ndirty; /* number of dirty blocks in the cache */
44 int maxdirty; /* max number of dirty blocks */
45 u32int vers;
46
47 long hashSize;
48
49 FreeList *fl;
50
51 VtRendez *die; /* daemon threads should die when != nil */
52
53 VtRendez *flush;
54 VtRendez *flushwait;
55 VtRendez *heapwait;
56 BAddr *baddr;
57 int bw, br, be;
58 int nflush;
59
60 Periodic *sync;
61
62 /* unlink daemon */
63 BList *uhead;
64 BList *utail;
65 VtRendez *unlink;
66
67 /* block counts */
68 int nused;
69 int ndisk;
70 };
71
72 struct BList {
73 int part;
74 u32int addr;
75 uchar type;
76 u32int tag;
77 u32int epoch;
78 u32int vers;
79
80 int recurse; /* for block unlink */
81
82 /* for roll back */
83 int index; /* -1 indicates not valid */
84 union {
85 uchar score[VtScoreSize];
86 uchar entry[VtEntrySize];
87 } old;
88 BList *next;
89 };
90
91 struct BAddr {
92 int part;
93 u32int addr;
94 u32int vers;
95 };
96
97 struct FreeList {
98 VtLock *lk;
99 u32int last; /* last block allocated */
100 u32int end; /* end of data partition */
101 u32int nused; /* number of used blocks */
102 u32int epochLow; /* low epoch when last updated nused */
103 };
104
105 static FreeList *flAlloc(u32int end);
106 static void flFree(FreeList *fl);
107
108 static Block *cacheBumpBlock(Cache *c);
109 static void heapDel(Block*);
110 static void heapIns(Block*);
111 static void cacheCheck(Cache*);
112 static void unlinkThread(void *a);
113 static void flushThread(void *a);
114 static void unlinkBody(Cache *c);
115 static int cacheFlushBlock(Cache *c);
116 static void cacheSync(void*);
117 static BList *blistAlloc(Block*);
118 static void blistFree(Cache*, BList*);
119 static void doRemoveLink(Cache*, BList*);
120
121 /*
122 * Mapping from local block type to Venti type
123 */
124 int vtType[BtMax] = {
125 VtDataType, /* BtData | 0 */
126 VtPointerType0, /* BtData | 1 */
127 VtPointerType1, /* BtData | 2 */
128 VtPointerType2, /* BtData | 3 */
129 VtPointerType3, /* BtData | 4 */
130 VtPointerType4, /* BtData | 5 */
131 VtPointerType5, /* BtData | 6 */
132 VtPointerType6, /* BtData | 7 */
133 VtDirType, /* BtDir | 0 */
134 VtPointerType0, /* BtDir | 1 */
135 VtPointerType1, /* BtDir | 2 */
136 VtPointerType2, /* BtDir | 3 */
137 VtPointerType3, /* BtDir | 4 */
138 VtPointerType4, /* BtDir | 5 */
139 VtPointerType5, /* BtDir | 6 */
140 VtPointerType6, /* BtDir | 7 */
141 };
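/*
 * Reading the table above: the low bits of the local type give the
 * height above the data, so, for example, a pointer block two levels
 * above file data (BtData+2) is stored on Venti as VtPointerType1,
 * and the root block of a directory tree (BtDir) goes out as VtDirType.
 */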
142
143 /*
144 * Allocate the memory cache.
145 */
146 Cache *
147 cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
148 {
149 int i;
150 Cache *c;
151 Block *b;
152 BList *bl;
153 u8int *p;
154 int nbl;
155
156 c = vtMemAllocZ(sizeof(Cache));
157
158 /* reasonable number of BList elements */
159 nbl = nblocks * 4;
160
161 c->lk = vtLockAlloc();
162 c->ref = 1;
163 c->disk = disk;
164 c->z = z;
165 c->size = diskBlockSize(disk);
166 bwatchSetBlockSize(c->size);
167 /* round c->size up to be a nice multiple */
168 c->size = (c->size + 127) & ~127;
169 c->ndmap = (c->size/20 + 7) / 8;
170 c->nblocks = nblocks;
171 c->hashSize = nblocks;
172 c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
173 c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
174 c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
175 c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
176 c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
177 c->mode = mode;
178 c->vers++;
179 p = c->mem;
180 for(i = 0; i < nblocks; i++){
181 b = &c->blocks[i];
182 b->lk = vtLockAlloc();
183 b->c = c;
184 b->data = p;
185 b->heap = i;
186 b->ioready = vtRendezAlloc(b->lk);
187 c->heap[i] = b;
188 p += c->size;
189 }
190 c->nheap = nblocks;
191 for(i = 0; i < nbl; i++){
192 bl = (BList*)p;
193 bl->next = c->blfree;
194 c->blfree = bl;
195 p += sizeof(BList);
196 }
197 /* separate loop to keep blocks and blists reasonably aligned */
198 for(i = 0; i < nblocks; i++){
199 b = &c->blocks[i];
200 b->dmap = p;
201 p += c->ndmap;
202 }
203
204 c->blrend = vtRendezAlloc(c->lk);
205
206 c->maxdirty = nblocks*(DirtyPercentage*0.01);
207
208 c->fl = flAlloc(diskSize(disk, PartData));
209
210 c->unlink = vtRendezAlloc(c->lk);
211 c->flush = vtRendezAlloc(c->lk);
212 c->flushwait = vtRendezAlloc(c->lk);
213 c->heapwait = vtRendezAlloc(c->lk);
214 c->sync = periodicAlloc(cacheSync, c, 30*1000);
215
216 if(mode == OReadWrite){
217 c->ref += 2;
218 vtThread(unlinkThread, c);
219 vtThread(flushThread, c);
220 }
221 cacheCheck(c);
222
223 return c;
224 }
225
226 /*
227 * Free the whole memory cache, flushing all dirty blocks to the disk.
228 */
229 void
230 cacheFree(Cache *c)
231 {
232 int i;
233
234 /* kill off daemon threads */
235 vtLock(c->lk);
236 c->die = vtRendezAlloc(c->lk);
237 periodicKill(c->sync);
238 vtWakeup(c->flush);
239 vtWakeup(c->unlink);
240 while(c->ref > 1)
241 vtSleep(c->die);
242
243 /* flush everything out */
244 do {
245 unlinkBody(c);
246 vtUnlock(c->lk);
247 while(cacheFlushBlock(c))
248 ;
249 diskFlush(c->disk);
250 vtLock(c->lk);
251 } while(c->uhead || c->ndirty);
252 vtUnlock(c->lk);
253
254 cacheCheck(c);
255
256 for(i = 0; i < c->nblocks; i++){
257 assert(c->blocks[i].ref == 0);
258 vtRendezFree(c->blocks[i].ioready);
259 vtLockFree(c->blocks[i].lk);
260 }
261 flFree(c->fl);
262 vtMemFree(c->baddr);
263 vtMemFree(c->heads);
264 vtMemFree(c->blocks);
265 vtMemFree(c->mem);
266 vtLockFree(c->lk);
267 diskFree(c->disk);
268 vtRendezFree(c->blrend);
269 /* don't close vtSession */
270 vtMemFree(c);
271 }
272
273 static void
274 cacheDump(Cache *c)
275 {
276 int i;
277 Block *b;
278
279 for(i = 0; i < c->nblocks; i++){
280 b = &c->blocks[i];
281 fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
282 i, b->part, b->addr, b->score, b->l.type, b->ref,
283 bsStr(b->l.state), bioStr(b->iostate), b->pc);
284 }
285 }
286
287 static void
288 cacheCheck(Cache *c)
289 {
290 u32int size, now;
291 int i, k, refed;
292 static uchar zero[VtScoreSize];
293 Block *b;
294
295 size = c->size;
296 now = c->now;
297
298 for(i = 0; i < c->nheap; i++){
299 if(c->heap[i]->heap != i)
300 vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
301 if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
302 vtFatal("bad heap ordering");
303 k = (i << 1) + 1;
304 if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
305 vtFatal("bad heap ordering");
306 k++;
307 if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
308 vtFatal("bad heap ordering");
309 }
310
311 refed = 0;
312 for(i = 0; i < c->nblocks; i++){
313 b = &c->blocks[i];
314 if(b->data != &c->mem[i * size])
315 vtFatal("mis-blocked at %d", i);
316 if(b->ref && b->heap == BadHeap){
317 refed++;
318 }
319 }
320 if(c->nheap + refed != c->nblocks){
321 fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
322 cacheDump(c);
323 }
324 assert(c->nheap + refed == c->nblocks);
325 refed = 0;
326 for(i = 0; i < c->nblocks; i++){
327 b = &c->blocks[i];
328 if(b->ref){
329 if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
330 refed++;
331 }
332 }
333 if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
334 }
335
336
337 /*
338 * locate the block with the oldest second to last use.
339 * remove it from the heap, and fix up the heap.
340 */
341 /* called with c->lk held */
342 static Block *
343 cacheBumpBlock(Cache *c)
344 {
345 int printed;
346 Block *b;
347
348 /*
349 * locate the block with the oldest second to last use.
350 * remove it from the heap, and fix up the heap.
351 */
352 printed = 0;
353 if(c->nheap == 0){
354 while(c->nheap == 0){
355 vtWakeup(c->flush);
356 vtSleep(c->heapwait);
357 if(c->nheap == 0){
358 printed = 1;
359 fprint(2, "%s: entire cache is busy, %d dirty "
360 "-- waking flush thread\n",
361 argv0, c->ndirty);
362 }
363 }
364 if(printed)
365 fprint(2, "%s: cache is okay again, %d dirty\n",
366 argv0, c->ndirty);
367 }
368
369 b = c->heap[0];
370 heapDel(b);
371
372 assert(b->heap == BadHeap);
373 assert(b->ref == 0);
374 assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
375 assert(b->prior == nil);
376 assert(b->uhead == nil);
377
378 /*
379 * unchain the block from hash chain
380 */
381 if(b->prev){
382 *(b->prev) = b->next;
383 if(b->next)
384 b->next->prev = b->prev;
385 b->prev = nil;
386 }
387
388
389 if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
390 /* set block to a reasonable state */
391 b->ref = 1;
392 b->part = PartError;
393 memset(&b->l, 0, sizeof(b->l));
394 b->iostate = BioEmpty;
395
396 return b;
397 }
398
399 /*
400 * look for a particular version of the block in the memory cache.
401 */
402 static Block *
403 _cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
404 int waitlock, int *lockfailure)
405 {
406 Block *b;
407 ulong h;
408
409 h = addr % c->hashSize;
410
411 if(lockfailure)
412 *lockfailure = 0;
413
414 /*
415 * look for the block in the cache
416 */
417 vtLock(c->lk);
418 for(b = c->heads[h]; b != nil; b = b->next){
419 if(b->part == part && b->addr == addr)
420 break;
421 }
422 if(b == nil || b->vers != vers){
423 vtUnlock(c->lk);
424 return nil;
425 }
426 if(!waitlock && !vtCanLock(b->lk)){
427 *lockfailure = 1;
428 vtUnlock(c->lk);
429 return nil;
430 }
431 heapDel(b);
432 b->ref++;
433 vtUnlock(c->lk);
434
435 bwatchLock(b);
436 if(waitlock)
437 vtLock(b->lk);
438 b->nlock = 1;
439
440 for(;;){
441 switch(b->iostate){
442 default:
443 abort();
444 case BioEmpty:
445 case BioLabel:
446 case BioClean:
447 case BioDirty:
448 if(b->vers != vers){
449 blockPut(b);
450 return nil;
451 }
452 return b;
453 case BioReading:
454 case BioWriting:
455 vtSleep(b->ioready);
456 break;
457 case BioVentiError:
458 blockPut(b);
459 vtSetError("venti i/o error block 0x%.8ux", addr);
460 return nil;
461 case BioReadError:
462 blockPut(b);
463 vtSetError("error reading block 0x%.8ux", addr);
464 return nil;
465 }
466 }
467 /* NOT REACHED */
468 }
469 static Block*
470 cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
471 {
472 return _cacheLocalLookup(c, part, addr, vers, Waitlock, 0);
473 }
474
475
476 /*
477 * fetch a local (on-disk) block from the memory cache.
478 * if it's not there, load it, bumping some other block.
479 */
480 Block *
481 _cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
482 {
483 Block *b;
484 ulong h;
485
486 assert(part != PartVenti);
487
488 h = addr % c->hashSize;
489
490 /*
491 * look for the block in the cache
492 */
493 vtLock(c->lk);
494 for(b = c->heads[h]; b != nil; b = b->next){
495 if(b->part != part || b->addr != addr)
496 continue;
497 if(epoch && b->l.epoch != epoch){
498 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
499 vtUnlock(c->lk);
500 vtSetError(ELabelMismatch);
501 return nil;
502 }
503 heapDel(b);
504 b->ref++;
505 break;
506 }
507
508 if(b == nil){
509 b = cacheBumpBlock(c);
510
511 b->part = part;
512 b->addr = addr;
513 localToGlobal(addr, b->score);
514
515 /* chain onto correct hash */
516 b->next = c->heads[h];
517 c->heads[h] = b;
518 if(b->next != nil)
519 b->next->prev = &b->next;
520 b->prev = &c->heads[h];
521 }
522
523 vtUnlock(c->lk);
524
525 /*
526 * BUG: what if the epoch changes right here?
527 * In the worst case, we could end up in some weird
528 * lock loop, because the block we want no longer exists,
529 * and instead we're trying to lock a block we have no
530 * business grabbing.
531 *
532 * For now, I'm not going to worry about it.
533 */
534
535 if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
536 bwatchLock(b);
537 vtLock(b->lk);
538 b->nlock = 1;
539
540 if(part == PartData && b->iostate == BioEmpty){
541 if(!readLabel(c, &b->l, addr)){
542 blockPut(b);
543 return nil;
544 }
545 blockSetIOState(b, BioLabel);
546 }
547 if(epoch && b->l.epoch != epoch){
548 blockPut(b);
549 fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
550 vtSetError(ELabelMismatch);
551 return nil;
552 }
553
554 b->pc = getcallerpc(&c);
555 for(;;){
556 switch(b->iostate){
557 default:
558 abort();
559 case BioLabel:
560 if(mode == OOverWrite)
561 /*
562 * leave iostate as BioLabel because data
563 * hasn't been read.
564 */
565 return b;
566 /* fall through */
567 case BioEmpty:
568 diskRead(c->disk, b);
569 vtSleep(b->ioready);
570 break;
571 case BioClean:
572 case BioDirty:
573 return b;
574 case BioReading:
575 case BioWriting:
576 vtSleep(b->ioready);
577 break;
578 case BioReadError:
579 blockSetIOState(b, BioEmpty);
580 blockPut(b);
581 vtSetError("error reading block 0x%.8ux", addr);
582 return nil;
583 }
584 }
585 /* NOT REACHED */
586 }
587
588 Block *
589 cacheLocal(Cache *c, int part, u32int addr, int mode)
590 {
591 return _cacheLocal(c, part, addr, mode, 0);
592 }
593
594 /*
595 * fetch a local (on-disk) block from the memory cache.
596 * if it's not there, load it, bumping some other block.
597 * check tag and type.
598 */
599 Block *
600 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
601 {
602 Block *b;
603
604 b = _cacheLocal(c, PartData, addr, mode, epoch);
605 if(b == nil)
606 return nil;
607 if(b->l.type != type || b->l.tag != tag){
608 fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
609 argv0, addr, b->l.type, type, b->l.tag, tag);
610 vtSetError(ELabelMismatch);
611 blockPut(b);
612 return nil;
613 }
614 b->pc = getcallerpc(&c);
615 return b;
616 }
617
618 /*
619 * fetch a global (Venti) block from the memory cache.
620 * if it's not there, load it, bumping some other block.
621 * check tag and type if it's really a local block in disguise.
622 */
623 Block *
624 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
625 {
626 int n;
627 Block *b;
628 ulong h;
629 u32int addr;
630
631 addr = globalToLocal(score);
632 if(addr != NilBlock){
633 b = cacheLocalData(c, addr, type, tag, mode, 0);
634 if(b)
635 b->pc = getcallerpc(&c);
636 return b;
637 }
638
639 h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
640
641 /*
642 * look for the block in the cache
643 */
644 vtLock(c->lk);
645 for(b = c->heads[h]; b != nil; b = b->next){
646 if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
647 continue;
648 heapDel(b);
649 b->ref++;
650 break;
651 }
652
653 if(b == nil){
654 if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);
655
656 b = cacheBumpBlock(c);
657
658 b->part = PartVenti;
659 b->addr = NilBlock;
660 b->l.type = type;
661 memmove(b->score, score, VtScoreSize);
662
663 /* chain onto correct hash */
664 b->next = c->heads[h];
665 c->heads[h] = b;
666 if(b->next != nil)
667 b->next->prev = &b->next;
668 b->prev = &c->heads[h];
669 }
670 vtUnlock(c->lk);
671
672 bwatchLock(b);
673 vtLock(b->lk);
674 b->nlock = 1;
675 b->pc = getcallerpc(&c);
676
677 switch(b->iostate){
678 default:
679 abort();
680 case BioEmpty:
681 n = vtRead(c->z, score, vtType[type], b->data, c->size);
682 if(n < 0 || !vtSha1Check(score, b->data, n)){
683 blockSetIOState(b, BioVentiError);
684 blockPut(b);
685 vtSetError(
686 "venti error reading block %V or wrong score: %r",
687 score);
688 return nil;
689 }
690 vtZeroExtend(vtType[type], b->data, n, c->size);
691 blockSetIOState(b, BioClean);
692 return b;
693 case BioClean:
694 return b;
695 case BioVentiError:
696 blockPut(b);
697 vtSetError("venti i/o error or wrong score, block %V", score);
698 return nil;
699 case BioReadError:
700 blockPut(b);
701 vtSetError("error reading block %V", b->score);
702 return nil;
703 }
704 /* NOT REACHED */
705 }
706
707 /*
708 * allocate a new on-disk block and load it into the memory cache.
709 * BUG: if the disk is full, should we flush some of it to Venti?
710 */
711 static u32int lastAlloc;
712
713 Block *
714 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
715 {
716 FreeList *fl;
717 u32int addr;
718 Block *b;
719 int n, nwrap;
720 Label lab;
721
722 n = c->size / LabelSize;
723 fl = c->fl;
724
725 vtLock(fl->lk);
726 addr = fl->last;
727 b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
728 if(b == nil){
729 fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
730 vtUnlock(fl->lk);
731 return nil;
732 }
733 nwrap = 0;
734 for(;;){
735 if(++addr >= fl->end){
736 addr = 0;
737 if(++nwrap >= 2){
738 blockPut(b);
739 vtSetError("disk is full");
740 /*
741 * try to avoid a continuous spew of console
742 * messages.
743 */
744 if (fl->last != 0)
745 fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
746 argv0);
747 fl->last = 0;
748 vtUnlock(fl->lk);
749 return nil;
750 }
751 }
752 if(addr%n == 0){
753 blockPut(b);
754 b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
755 if(b == nil){
756 fl->last = addr;
757 fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
758 vtUnlock(fl->lk);
759 return nil;
760 }
761 }
762 if(!labelUnpack(&lab, b->data, addr%n))
763 continue;
764 if(lab.state == BsFree)
765 goto Found;
766 if(lab.state&BsClosed)
767 if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
768 goto Found;
769 }
770 Found:
771 blockPut(b);
772 b = cacheLocal(c, PartData, addr, OOverWrite);
773 if(b == nil){
774 fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
775 return nil;
776 }
777 assert(b->iostate == BioLabel || b->iostate == BioClean);
778 fl->last = addr;
779 lab.type = type;
780 lab.tag = tag;
781 lab.state = BsAlloc;
782 lab.epoch = epoch;
783 lab.epochClose = ~(u32int)0;
784 if(!blockSetLabel(b, &lab, 1)){
785 fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
786 blockPut(b);
787 return nil;
788 }
789 vtZeroExtend(vtType[type], b->data, 0, c->size);
790 if(0)diskWrite(c->disk, b);
791
792 if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
793 lastAlloc = addr;
794 fl->nused++;
795 vtUnlock(fl->lk);
796 b->pc = getcallerpc(&c);
797 return b;
798 }
799
800 int
801 cacheDirty(Cache *c)
802 {
803 return c->ndirty;
804 }
805
806 void
807 cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
808 {
809 int n;
810 u32int addr, nused;
811 Block *b;
812 Label lab;
813 FreeList *fl;
814
815 fl = c->fl;
816 n = c->size / LabelSize;
817 *bsize = c->size;
818 vtLock(fl->lk);
819 if(fl->epochLow == epochLow){
820 *used = fl->nused;
821 *total = fl->end;
822 vtUnlock(fl->lk);
823 return;
824 }
825 b = nil;
826 nused = 0;
827 for(addr=0; addr<fl->end; addr++){
828 if(addr%n == 0){
829 blockPut(b);
830 b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
831 if(b == nil){
832 fprint(2, "%s: flCountUsed: loading %ux: %R\n",
833 argv0, addr/n);
834 break;
835 }
836 }
837 if(!labelUnpack(&lab, b->data, addr%n))
838 continue;
839 if(lab.state == BsFree)
840 continue;
841 if(lab.state&BsClosed)
842 if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
843 continue;
844 nused++;
845 }
846 blockPut(b);
847 if(addr == fl->end){
848 fl->nused = nused;
849 fl->epochLow = epochLow;
850 }
851 *used = nused;
852 *total = fl->end;
853 vtUnlock(fl->lk);
854 return;
855 }
856
857 static FreeList *
858 flAlloc(u32int end)
859 {
860 FreeList *fl;
861
862 fl = vtMemAllocZ(sizeof(*fl));
863 fl->lk = vtLockAlloc();
864 fl->last = 0;
865 fl->end = end;
866 return fl;
867 }
868
869 static void
870 flFree(FreeList *fl)
871 {
872 vtLockFree(fl->lk);
873 vtMemFree(fl);
874 }
875
876 u32int
877 cacheLocalSize(Cache *c, int part)
878 {
879 return diskSize(c->disk, part);
880 }
881
882 /*
883 * The thread that has locked b may refer to it by
884 * multiple names. Nlock counts the number of
885 * references the locking thread holds. It will call
886 * blockPut once per reference.
887 */
888 void
889 blockDupLock(Block *b)
890 {
891 assert(b->nlock > 0);
892 b->nlock++;
893 }
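/*
 * Illustrative sketch (not a call site in this file): a thread that
 * stores a second name for a block it already holds locked balances
 * each name with its own blockPut. The "second" pointer here is
 * hypothetical.
 *
 *	blockDupLock(b);	// nlock: 1 -> 2
 *	second = b;		// another reference held by the same thread
 *	...use b and second...
 *	blockPut(second);	// nlock: 2 -> 1
 *	blockPut(b);		// nlock: 1 -> 0; block unlocked and unreferenced
 */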
894
895 /*
896 * we're done with the block.
897 * unlock it. can't use it after calling this.
898 */
899 void
900 blockPut(Block* b)
901 {
902 Cache *c;
903
904 if(b == nil)
905 return;
906
907 if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));
908
909 if(b->iostate == BioDirty)
910 bwatchDependency(b);
911
912 if(--b->nlock > 0)
913 return;
914
915 /*
916 * b->nlock should probably stay at zero while
917 * the block is unlocked, but diskThread and vtSleep
918 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
919 * so we have to keep b->nlock set to 1 even
920 * when the block is unlocked.
921 */
922 assert(b->nlock == 0);
923 b->nlock = 1;
924 // b->pc = 0;
925
926 bwatchUnlock(b);
927 vtUnlock(b->lk);
928 c = b->c;
929 vtLock(c->lk);
930
931 if(--b->ref > 0){
932 vtUnlock(c->lk);
933 return;
934 }
935
936 assert(b->ref == 0);
937 switch(b->iostate){
938 default:
939 b->used = c->now++;
940 heapIns(b);
941 break;
942 case BioEmpty:
943 case BioLabel:
944 if(c->nheap == 0)
945 b->used = c->now++;
946 else
947 b->used = c->heap[0]->used;
948 heapIns(b);
949 break;
950 case BioDirty:
951 break;
952 }
953 vtUnlock(c->lk);
954 }
955
956 /*
957 * set the label associated with a block.
958 */
959 Block*
960 _blockSetLabel(Block *b, Label *l)
961 {
962 int lpb;
963 Block *bb;
964 u32int a;
965 Cache *c;
966
967 c = b->c;
968
969 assert(b->part == PartData);
970 assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
971 lpb = c->size / LabelSize;
972 a = b->addr / lpb;
973 bb = cacheLocal(c, PartLabel, a, OReadWrite);
974 if(bb == nil){
975 blockPut(b);
976 return nil;
977 }
978 b->l = *l;
979 labelPack(l, bb->data, b->addr%lpb);
980 blockDirty(bb);
981 return bb;
982 }
983
984 int
985 blockSetLabel(Block *b, Label *l, int allocating)
986 {
987 Block *lb;
988 Label oldl;
989
990 oldl = b->l;
991 lb = _blockSetLabel(b, l);
992 if(lb == nil)
993 return 0;
994
995 /*
996 * If we're allocating the block, make sure the label (lb)
997 * goes to disk before the data block (b) itself. This is to help
998 * the blocks that in turn depend on b.
999 *
1000 * Suppose bx depends on (must be written out after) b.
1001 * Once we write b we'll think it's safe to write bx.
1002 * Bx can't get at b unless it has a valid label, though.
1003 *
1004 * Allocation is the only case in which having a current label
1005 * is vital because:
1006 *
1007 * - l.type is set at allocation and never changes.
1008 * - l.tag is set at allocation and never changes.
1009 * - l.state is not checked when we load blocks.
1010 * - the archiver cares deeply about l.state being
1011 * BaActive vs. BaCopied, but that's handled
1012 * by direct calls to _blockSetLabel.
1013 */
1014
1015 if(allocating)
1016 blockDependency(b, lb, -1, nil, nil);
1017 blockPut(lb);
1018 return 1;
1019 }
1020
1021 /*
1022 * Record that bb must be written out before b.
1023 * If index is given, we're about to overwrite the score/e
1024 * at that index in the block. Save the old value so we
1025 * can write a safer ``old'' version of the block if pressed.
1026 */
1027 void
1028 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
1029 {
1030 BList *p;
1031
1032 if(bb->iostate == BioClean)
1033 return;
1034
1035 /*
1036 * Dependencies for blocks containing Entry structures
1037 * or scores must always be explained. The problem with
1038 * only explaining some of them is this. Suppose we have two
1039 * dependencies for the same field, the first explained
1040 * and the second not. We try to write the block when the first
1041 * dependency is not written but the second is. We will roll back
1042 * the first change even though the second trumps it.
1043 */
1044 if(index == -1 && bb->part == PartData)
1045 assert(b->l.type == BtData);
1046
1047 if(bb->iostate != BioDirty){
1048 fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
1049 argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1050 abort();
1051 }
1052
1053 p = blistAlloc(bb);
1054 if(p == nil)
1055 return;
1056
1057 assert(bb->iostate == BioDirty);
1058 if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);
1059
1060 p->part = bb->part;
1061 p->addr = bb->addr;
1062 p->type = bb->l.type;
1063 p->vers = bb->vers;
1064 p->index = index;
1065 if(p->index >= 0){
1066 /*
1067 * This test would just be b->l.type==BtDir except
1068 * we need to exclude the super block.
1069 */
1070 if(b->l.type == BtDir && b->part == PartData)
1071 entryPack(e, p->old.entry, 0);
1072 else
1073 memmove(p->old.score, score, VtScoreSize);
1074 }
1075 p->next = b->prior;
1076 b->prior = p;
1077 }
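/*
 * Hypothetical caller sketch (not taken from this file): before
 * overwriting the i'th score in a locked pointer block b with the
 * score of its freshly modified (dirty) child bb, record that b must
 * reach disk after bb, handing blockDependency the score that is
 * about to be overwritten so blockRollback can restore it if needed.
 *
 *	blockDependency(b, bb, i, b->data + i*VtScoreSize, nil);
 *	memmove(b->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(b);
 */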
1078
1079 /*
1080 * Mark an in-memory block as dirty. If there are too many
1081 * dirty blocks, start writing some out to disk.
1082 *
1083 * If there were way too many dirty blocks, we used to
1084 * try to do some flushing ourselves, but it's just too dangerous --
1085 * it implies that the callers cannot have any of our priors locked,
1086 * but this is hard to avoid in some cases.
1087 */
1088 int
1089 blockDirty(Block *b)
1090 {
1091 Cache *c;
1092
1093 c = b->c;
1094
1095 assert(b->part != PartVenti);
1096
1097 if(b->iostate == BioDirty)
1098 return 1;
1099 assert(b->iostate == BioClean || b->iostate == BioLabel);
1100
1101 vtLock(c->lk);
1102 b->iostate = BioDirty;
1103 c->ndirty++;
1104 if(c->ndirty > (c->maxdirty>>1))
1105 vtWakeup(c->flush);
1106 vtUnlock(c->lk);
1107
1108 return 1;
1109 }
1110
1111 /*
1112 * We've decided to write out b. Maybe b has some pointers to blocks
1113 * that haven't yet been written to disk. If so, construct a slightly out-of-date
1114 * copy of b that is safe to write out. (diskThread will make sure the block
1115 * remains marked as dirty.)
1116 */
1117 uchar *
1118 blockRollback(Block *b, uchar *buf)
1119 {
1120 u32int addr;
1121 BList *p;
1122 Super super;
1123
1124 /* easy case */
1125 if(b->prior == nil)
1126 return b->data;
1127
1128 memmove(buf, b->data, b->c->size);
1129 for(p=b->prior; p; p=p->next){
1130 /*
1131 * we know p->index >= 0 because blockWrite has vetted this block for us.
1132 */
1133 assert(p->index >= 0);
1134 assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
1135 if(b->part == PartSuper){
1136 assert(p->index == 0);
1137 superUnpack(&super, buf);
1138 addr = globalToLocal(p->old.score);
1139 if(addr == NilBlock){
1140 fprint(2, "%s: rolling back super block: "
1141 "bad replacement addr %V\n",
1142 argv0, p->old.score);
1143 abort();
1144 }
1145 super.active = addr;
1146 superPack(&super, buf);
1147 continue;
1148 }
1149 if(b->l.type == BtDir)
1150 memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
1151 else
1152 memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
1153 }
1154 return buf;
1155 }
1156
1157 /*
1158 * Try to write block b.
1159 * If b depends on other blocks:
1160 *
1161 * If the block has been written out, remove the dependency.
1162 * If the dependency is replaced by a more recent dependency,
1163 * throw it out.
1164 * If we know how to write out an old version of b that doesn't
1165 * depend on it, do that.
1166 *
1167 * Otherwise, bail.
1168 */
1169 int
1170 blockWrite(Block *b, int waitlock)
1171 {
1172 uchar *dmap;
1173 Cache *c;
1174 BList *p, **pp;
1175 Block *bb;
1176 int lockfail;
1177
1178 c = b->c;
1179
1180 if(b->iostate != BioDirty)
1181 return 1;
1182
1183 dmap = b->dmap;
1184 memset(dmap, 0, c->ndmap);
1185 pp = &b->prior;
1186 for(p=*pp; p; p=*pp){
1187 if(p->index >= 0){
1188 /* more recent dependency has succeeded; this one can go */
1189 if(dmap[p->index/8] & (1<<(p->index%8)))
1190 goto ignblock;
1191 }
1192
1193 lockfail = 0;
1194 bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
1195 &lockfail);
1196 if(bb == nil){
1197 if(lockfail)
1198 return 0;
1199 /* block not in cache => was written already */
1200 if(p->index >= 0) dmap[p->index/8] |= 1<<(p->index%8);
1201 goto ignblock;
1202 }
1203
1204 /*
1205 * same version of block is still in cache.
1206 *
1207 * the assertion is true because the block still has version p->vers,
1208 * which means it hasn't been written out since we last saw it.
1209 */
1210 if(bb->iostate != BioDirty){
1211 fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
1212 argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
1213 /* probably BioWriting if it happens? */
1214 if(bb->iostate == BioClean)
1215 goto ignblock;
1216 }
1217
1218 blockPut(bb);
1219
1220 if(p->index < 0){
1221 /*
1222 * We don't know how to temporarily undo
1223 * b's dependency on bb, so just don't write b yet.
1224 */
1225 if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
1226 argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
1227 return 0;
1228 }
1229 /* keep walking down the list */
1230 pp = &p->next;
1231 continue;
1232
1233 ignblock:
1234 *pp = p->next;
1235 blistFree(c, p);
1236 continue;
1237 }
1238
1239 /*
1240 * DiskWrite must never be called with a double-locked block.
1241 * This call to diskWrite is okay because blockWrite is only called
1242 * from the cache flush thread, which never double-locks a block.
1243 */
1244 diskWrite(c->disk, b);
1245 return 1;
1246 }
1247
1248 /*
1249 * Change the I/O state of block b.
1250 * Just an assignment except for magic in
1251 * switch statement (read comments there).
1252 */
1253 void
1254 blockSetIOState(Block *b, int iostate)
1255 {
1256 int dowakeup;
1257 Cache *c;
1258 BList *p, *q;
1259
1260 if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));
1261
1262 c = b->c;
1263
1264 dowakeup = 0;
1265 switch(iostate){
1266 default:
1267 abort();
1268 case BioEmpty:
1269 assert(!b->uhead);
1270 break;
1271 case BioLabel:
1272 assert(!b->uhead);
1273 break;
1274 case BioClean:
1275 bwatchDependency(b);
1276 /*
1277 * If b->prior is set, it means a write just finished.
1278 * The prior list isn't needed anymore.
1279 */
1280 for(p=b->prior; p; p=q){
1281 q = p->next;
1282 blistFree(c, p);
1283 }
1284 b->prior = nil;
1285 /*
1286 * Freeing a block or just finished a write.
1287 * Move the blocks from the per-block unlink
1288 * queue to the cache unlink queue.
1289 */
1290 if(b->iostate == BioDirty || b->iostate == BioWriting){
1291 vtLock(c->lk);
1292 c->ndirty--;
1293 b->iostate = iostate; /* change here to keep in sync with ndirty */
1294 b->vers = c->vers++;
1295 if(b->uhead){
1296 /* add unlink blocks to unlink queue */
1297 if(c->uhead == nil){
1298 c->uhead = b->uhead;
1299 vtWakeup(c->unlink);
1300 }else
1301 c->utail->next = b->uhead;
1302 c->utail = b->utail;
1303 b->uhead = nil;
1304 }
1305 vtUnlock(c->lk);
1306 }
1307 assert(!b->uhead);
1308 dowakeup = 1;
1309 break;
1310 case BioDirty:
1311 /*
1312 * Wrote out an old version of the block (see blockRollback).
1313 * Bump a version count, leave it dirty.
1314 */
1315 if(b->iostate == BioWriting){
1316 vtLock(c->lk);
1317 b->vers = c->vers++;
1318 vtUnlock(c->lk);
1319 dowakeup = 1;
1320 }
1321 break;
1322 case BioReading:
1323 case BioWriting:
1324 /*
1325 * Adding block to disk queue. Bump reference count.
1326 * diskThread decs the count later by calling blockPut.
1327 * This is here because we need to lock c->lk to
1328 * manipulate the ref count.
1329 */
1330 vtLock(c->lk);
1331 b->ref++;
1332 vtUnlock(c->lk);
1333 break;
1334 case BioReadError:
1335 case BioVentiError:
1336 /*
1337 * Oops.
1338 */
1339 dowakeup = 1;
1340 break;
1341 }
1342 b->iostate = iostate;
1343 /*
1344 * Now that the state has changed, we can wake the waiters.
1345 */
1346 if(dowakeup)
1347 vtWakeupAll(b->ioready);
1348 }
1349
1350 /*
1351 * The active file system is a tree of blocks.
1352 * When we add snapshots to the mix, the entire file system
1353 * becomes a dag and thus requires a bit more care.
1354 *
1355 * The life of the file system is divided into epochs. A snapshot
1356 * ends one epoch and begins the next. Each file system block
1357 * is marked with the epoch in which it was created (b.epoch).
1358 * When the block is unlinked from the file system (closed), it is marked
1359 * with the epoch in which it was removed (b.epochClose).
1360 * Once we have discarded or archived all snapshots up to
1361 * b.epochClose, we can reclaim the block.
1362 *
1363 * If a block was created in a past epoch but is not yet closed,
1364 * it is treated as copy-on-write. Of course, in order to insert the
1365 * new pointer into the tree, the parent must be made writable,
1366 * and so on up the tree. The recursion stops because the root
1367 * block is always writable.
1368 *
1369 * If blocks are never closed, they will never be reused, and
1370 * we will run out of disk space. But marking a block as closed
1371 * requires some care about dependencies and write orderings.
1372 *
1373 * (1) If a block p points at a copy-on-write block b and we
1374 * copy b to create bb, then p must be written out after bb and
1375 * lbb (bb's label block).
1376 *
1377 * (2) We have to mark b as closed, but only after we switch
1378 * the pointer, so lb must be written out after p. In fact, we
1379 * can't even update the in-memory copy, or the cache might
1380 * mistakenly give out b for reuse before p gets written.
1381 *
1382 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
1383 * The caller is expected to record a "p after bb" dependency
1384 * to finish (1), and also expected to call blockRemoveLink
1385 * to arrange for (2) to happen once p is written.
1386 *
1387 * Until (2) happens, some pieces of the code (e.g., the archiver)
1388 * still need to know whether a block has been copied, so we
1389 * set the BsCopied bit in the label and force that to disk *before*
1390 * the copy gets written out.
1391 */
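/*
 * Putting (1) and (2) together, a rough caller-side sketch (purely
 * illustrative, not a call site in this file): p points at the
 * copy-on-write block b through the score at index i.
 *
 *	oldaddr = b->addr;
 *	oldtype = b->l.type;
 *	oldtag = b->l.tag;
 *	bb = blockCopy(b, tag, ehi, elo);	// marks b BsCopied, copies data, puts b
 *	if(bb == nil)
 *		...handle error...;
 *	blockDependency(p, bb, i, p->data + i*VtScoreSize, nil);	// (1): p after bb
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 *	blockRemoveLink(p, oldaddr, oldtype, oldtag, 0);	// (2): close b once p is written
 */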
1392 Block*
1393 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
1394 {
1395 Block *bb, *lb;
1396 Label l;
1397
1398 if((b->l.state&BsClosed) || b->l.epoch >= ehi)
1399 fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
1400 argv0, b->addr, &b->l, elo, ehi);
1401
1402 bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
1403 if(bb == nil){
1404 blockPut(b);
1405 return nil;
1406 }
1407
1408 /*
1409 * Update label so we know the block has been copied.
1410 * (It will be marked closed once it has been unlinked from
1411 * the tree.) This must follow cacheAllocBlock since we
1412 * can't be holding onto lb when we call cacheAllocBlock.
1413 */
1414 if((b->l.state&BsCopied)==0)
1415 if(b->part == PartData){ /* not the superblock */
1416 l = b->l;
1417 l.state |= BsCopied;
1418 lb = _blockSetLabel(b, &l);
1419 if(lb == nil){
1420 /* can't set label => can't copy block */
1421 blockPut(b);
1422 l.type = BtMax;
1423 l.state = BsFree;
1424 l.epoch = 0;
1425 l.epochClose = 0;
1426 l.tag = 0;
1427 blockSetLabel(bb, &l, 0);
1428 blockPut(bb);
1429 return nil;
1430 }
1431 blockDependency(bb, lb, -1, nil, nil);
1432 blockPut(lb);
1433 }
1434
1435 memmove(bb->data, b->data, b->c->size);
1436 blockDirty(bb);
1437 blockPut(b);
1438 return bb;
1439 }
1440
1441 /*
1442 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
1443 * If recurse is set, we are unlinking all of bb's children as well.
1444 *
1445 * We can't reclaim bb (or its kids) until the block b gets written to disk. We add
1446 * the relevant information to b's list of unlinked blocks. Once b is written,
1447 * the list will be queued for processing.
1448 *
1449 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
1450 */
1451 void
1452 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
1453 {
1454 BList *p, **pp, bl;
1455
1456 /* remove bb from prior list */
1457 for(pp=&b->prior; (p=*pp)!=nil; ){
1458 if(p->part == PartData && p->addr == addr){
1459 *pp = p->next;
1460 blistFree(b->c, p);
1461 }else
1462 pp = &p->next;
1463 }
1464
1465 bl.part = PartData;
1466 bl.addr = addr;
1467 bl.type = type;
1468 bl.tag = tag;
1469 if(b->l.epoch == 0)
1470 assert(b->part == PartSuper);
1471 bl.epoch = b->l.epoch;
1472 bl.next = nil;
1473 bl.recurse = recurse;
1474
1475 if(b->part == PartSuper && b->iostate == BioClean)
1476 p = nil;
1477 else
1478 p = blistAlloc(b);
1479 if(p == nil){
1480 /*
1481 * b has already been written to disk.
1482 */
1483 doRemoveLink(b->c, &bl);
1484 return;
1485 }
1486
1487 /* Uhead is only processed when the block goes from Dirty -> Clean */
1488 assert(b->iostate == BioDirty);
1489
1490 *p = bl;
1491 if(b->uhead == nil)
1492 b->uhead = p;
1493 else
1494 b->utail->next = p;
1495 b->utail = p;
1496 }
1497
1498 /*
1499 * Process removal of a single block and perhaps its children.
1500 */
1501 static void
1502 doRemoveLink(Cache *c, BList *p)
1503 {
1504 int i, n, recurse;
1505 u32int a;
1506 Block *b;
1507 Label l;
1508 BList bl;
1509
1510 recurse = (p->recurse && p->type != BtData && p->type != BtDir);
1511
1512 /*
1513 * We're not really going to overwrite b, but if we're not
1514 * going to look at its contents, there is no point in reading
1515 * them from the disk.
1516 */
1517 b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
1518 if(b == nil)
1519 return;
1520
1521 /*
1522 * When we're unlinking from the superblock, close with the next epoch.
1523 */
1524 if(p->epoch == 0)
1525 p->epoch = b->l.epoch+1;
1526
1527 /* sanity check */
1528 if(b->l.epoch > p->epoch){
1529 fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
1530 argv0, b->l.epoch, p->epoch);
1531 blockPut(b);
1532 return;
1533 }
1534
1535 if(recurse){
1536 n = c->size / VtScoreSize;
1537 for(i=0; i<n; i++){
1538 a = globalToLocal(b->data + i*VtScoreSize);
1539 if(a == NilBlock || !readLabel(c, &l, a))
1540 continue;
1541 if(l.state&BsClosed)
1542 continue;
1543 /*
1544 * If stack space becomes an issue...
1545 p->addr = a;
1546 p->type = l.type;
1547 p->tag = l.tag;
1548 doRemoveLink(c, p);
1549 */
1550
1551 bl.part = PartData;
1552 bl.addr = a;
1553 bl.type = l.type;
1554 bl.tag = l.tag;
1555 bl.epoch = p->epoch;
1556 bl.next = nil;
1557 bl.recurse = 1;
1558 /* give up the block lock - share with others */
1559 blockPut(b);
1560 doRemoveLink(c, &bl);
1561 b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
1562 if(b == nil){
1563 fprint(2, "%s: warning: lost block in doRemoveLink\n",
1564 argv0);
1565 return;
1566 }
1567 }
1568 }
1569
1570 l = b->l;
1571 l.state |= BsClosed;
1572 l.epochClose = p->epoch;
1573 if(l.epochClose == l.epoch){
1574 vtLock(c->fl->lk);
1575 if(l.epoch == c->fl->epochLow)
1576 c->fl->nused--;
1577 blockSetLabel(b, &l, 0);
1578 vtUnlock(c->fl->lk);
1579 }else
1580 blockSetLabel(b, &l, 0);
1581 blockPut(b);
1582 }
1583
1584 /*
1585 * Allocate a BList so that we can record a dependency
1586 * or queue a removal related to block b.
1587 * If we can't find a BList, we write out b and return nil.
1588 */
1589 static BList *
1590 blistAlloc(Block *b)
1591 {
1592 Cache *c;
1593 BList *p;
1594
1595 if(b->iostate != BioDirty){
1596 /*
1597 * should not happen anymore -
1598 * blockDirty used to flush but no longer does.
1599 */
1600 assert(b->iostate == BioClean);
1601 fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
1602 return nil;
1603 }
1604
1605 c = b->c;
1606 vtLock(c->lk);
1607 if(c->blfree == nil){
1608 /*
1609 * No free BLists. What are our options?
1610 */
1611
1612 /* Block has no priors? Just write it. */
1613 if(b->prior == nil){
1614 vtUnlock(c->lk);
1615 diskWriteAndWait(c->disk, b);
1616 return nil;
1617 }
1618
1619 /*
1620 * Wake the flush thread, which will hopefully free up
1621 * some BLists for us. We used to flush a block from
1622 * our own prior list and reclaim that BList, but this is
1623 * a no-no: some of the blocks on our prior list may
1624 * be locked by our caller. Or maybe their label blocks
1625 * are locked by our caller. In any event, it's too hard
1626 * to make sure we can do I/O for ourselves. Instead,
1627 * we assume the flush thread will find something.
1628 * (The flush thread never blocks waiting for a block,
1629 * so it can't deadlock like we can.)
1630 */
1631 while(c->blfree == nil){
1632 vtWakeup(c->flush);
1633 vtSleep(c->blrend);
1634 if(c->blfree == nil)
1635 fprint(2, "%s: flushing for blists\n", argv0);
1636 }
1637 }
1638
1639 p = c->blfree;
1640 c->blfree = p->next;
1641 vtUnlock(c->lk);
1642 return p;
1643 }
1644
1645 static void
1646 blistFree(Cache *c, BList *bl)
1647 {
1648 vtLock(c->lk);
1649 bl->next = c->blfree;
1650 c->blfree = bl;
1651 vtWakeup(c->blrend);
1652 vtUnlock(c->lk);
1653 }
1654
1655 char*
1656 bsStr(int state)
1657 {
1658 static char s[100];
1659
1660 if(state == BsFree)
1661 return "Free";
1662 if(state == BsBad)
1663 return "Bad";
1664
1665 sprint(s, "%x", state);
1666 if(!(state&BsAlloc))
1667 strcat(s, ",Free"); /* should not happen */
1668 if(state&BsCopied)
1669 strcat(s, ",Copied");
1670 if(state&BsVenti)
1671 strcat(s, ",Venti");
1672 if(state&BsClosed)
1673 strcat(s, ",Closed");
1674 return s;
1675 }
1676
1677 char *
1678 bioStr(int iostate)
1679 {
1680 switch(iostate){
1681 default:
1682 return "Unknown!!";
1683 case BioEmpty:
1684 return "Empty";
1685 case BioLabel:
1686 return "Label";
1687 case BioClean:
1688 return "Clean";
1689 case BioDirty:
1690 return "Dirty";
1691 case BioReading:
1692 return "Reading";
1693 case BioWriting:
1694 return "Writing";
1695 case BioReadError:
1696 return "ReadError";
1697 case BioVentiError:
1698 return "VentiError";
1699 case BioMax:
1700 return "Max";
1701 }
1702 }
1703
1704 static char *bttab[] = {
1705 "BtData",
1706 "BtData+1",
1707 "BtData+2",
1708 "BtData+3",
1709 "BtData+4",
1710 "BtData+5",
1711 "BtData+6",
1712 "BtData+7",
1713 "BtDir",
1714 "BtDir+1",
1715 "BtDir+2",
1716 "BtDir+3",
1717 "BtDir+4",
1718 "BtDir+5",
1719 "BtDir+6",
1720 "BtDir+7",
1721 };
1722
1723 char*
1724 btStr(int type)
1725 {
1726 if(type < nelem(bttab))
1727 return bttab[type];
1728 return "unknown";
1729 }
1730
1731 int
1732 labelFmt(Fmt *f)
1733 {
1734 Label *l;
1735
1736 l = va_arg(f->args, Label*);
1737 return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
1738 btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
1739 }
1740
1741 int
1742 scoreFmt(Fmt *f)
1743 {
1744 uchar *v;
1745 int i;
1746 u32int addr;
1747
1748 v = va_arg(f->args, uchar*);
1749 if(v == nil){
1750 fmtprint(f, "*");
1751 }else if((addr = globalToLocal(v)) != NilBlock)
1752 fmtprint(f, "0x%.8ux", addr);
1753 else{
1754 for(i = 0; i < VtScoreSize; i++)
1755 fmtprint(f, "%2.2ux", v[i]);
1756 }
1757
1758 return 0;
1759 }
1760
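/*
 * A note on the heap key used below: blocks are kept in a min-heap
 * ordered by the unsigned value b->used - c->now, so the root is the
 * least recently used block. Doing the comparison on the u32int
 * difference keeps the ordering correct even when c->now wraps
 * around; for example, with now == 3, a block last used at
 * 0xfffffffe (5 ticks earlier) keys as 0xfffffffb and is treated as
 * younger than one last used at 0xffffff00, which keys as 0xfffffefd.
 */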
1761 static int
1762 upHeap(int i, Block *b)
1763 {
1764 Block *bb;
1765 u32int now;
1766 int p;
1767 Cache *c;
1768
1769 c = b->c;
1770 now = c->now;
1771 for(; i != 0; i = p){
1772 p = (i - 1) >> 1;
1773 bb = c->heap[p];
1774 if(b->used - now >= bb->used - now)
1775 break;
1776 c->heap[i] = bb;
1777 bb->heap = i;
1778 }
1779 c->heap[i] = b;
1780 b->heap = i;
1781
1782 return i;
1783 }
1784
1785 static int
1786 downHeap(int i, Block *b)
1787 {
1788 Block *bb;
1789 u32int now;
1790 int k;
1791 Cache *c;
1792
1793 c = b->c;
1794 now = c->now;
1795 for(; ; i = k){
1796 k = (i << 1) + 1;
1797 if(k >= c->nheap)
1798 break;
1799 if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
1800 k++;
1801 bb = c->heap[k];
1802 if(b->used - now <= bb->used - now)
1803 break;
1804 c->heap[i] = bb;
1805 bb->heap = i;
1806 }
1807 c->heap[i] = b;
1808 b->heap = i;
1809 return i;
1810 }
1811
1812 /*
1813 * Delete a block from the heap.
1814 * Called with c->lk held.
1815 */
1816 static void
1817 heapDel(Block *b)
1818 {
1819 int i, si;
1820 Cache *c;
1821
1822 c = b->c;
1823
1824 si = b->heap;
1825 if(si == BadHeap)
1826 return;
1827 b->heap = BadHeap;
1828 c->nheap--;
1829 if(si == c->nheap)
1830 return;
1831 b = c->heap[c->nheap];
1832 i = upHeap(si, b);
1833 if(i == si)
1834 downHeap(i, b);
1835 }
1836
1837 /*
1838 * Insert a block into the heap.
1839 * Called with c->lk held.
1840 */
1841 static void
1842 heapIns(Block *b)
1843 {
1844 assert(b->heap == BadHeap);
1845 upHeap(b->c->nheap++, b);
1846 vtWakeup(b->c->heapwait);
1847 }
1848
1849 /*
1850 * Get just the label for a block.
1851 */
1852 int
1853 readLabel(Cache *c, Label *l, u32int addr)
1854 {
1855 int lpb;
1856 Block *b;
1857 u32int a;
1858
1859 lpb = c->size / LabelSize;
1860 a = addr / lpb;
1861 b = cacheLocal(c, PartLabel, a, OReadOnly);
1862 if(b == nil){
1863 blockPut(b);
1864 return 0;
1865 }
1866
1867 if(!labelUnpack(l, b->data, addr%lpb)){
1868 blockPut(b);
1869 return 0;
1870 }
1871 blockPut(b);
1872 return 1;
1873 }
1874
1875 /*
1876 * Process unlink queue.
1877 * Called with c->lk held.
1878 */
1879 static void
1880 unlinkBody(Cache *c)
1881 {
1882 BList *p;
1883
1884 while(c->uhead != nil){
1885 p = c->uhead;
1886 c->uhead = p->next;
1887 vtUnlock(c->lk);
1888 doRemoveLink(c, p);
1889 vtLock(c->lk);
1890 p->next = c->blfree;
1891 c->blfree = p;
1892 }
1893 }
1894
1895 /*
1896 * Occasionally unlink the blocks on the cache unlink queue.
1897 */
1898 static void
1899 unlinkThread(void *a)
1900 {
1901 Cache *c = a;
1902
1903 vtThreadSetName("unlink");
1904
1905 vtLock(c->lk);
1906 for(;;){
1907 while(c->uhead == nil && c->die == nil)
1908 vtSleep(c->unlink);
1909 if(c->die != nil)
1910 break;
1911 unlinkBody(c);
1912 }
1913 c->ref--;
1914 vtWakeup(c->die);
1915 vtUnlock(c->lk);
1916 }
1917
1918 static int
1919 baddrCmp(void *a0, void *a1)
1920 {
1921 BAddr *b0, *b1;
1922 b0 = a0;
1923 b1 = a1;
1924
1925 if(b0->part < b1->part)
1926 return -1;
1927 if(b0->part > b1->part)
1928 return 1;
1929 if(b0->addr < b1->addr)
1930 return -1;
1931 if(b0->addr > b1->addr)
1932 return 1;
1933 return 0;
1934 }
1935
1936 /*
1937 * Scan the block list for dirty blocks; add them to the list c->baddr.
1938 */
1939 static void
1940 flushFill(Cache *c)
1941 {
1942 int i, ndirty;
1943 BAddr *p;
1944 Block *b;
1945
1946 vtLock(c->lk);
1947 if(c->ndirty == 0){
1948 vtUnlock(c->lk);
1949 return;
1950 }
1951
1952 p = c->baddr;
1953 ndirty = 0;
1954 for(i=0; i<c->nblocks; i++){
1955 b = c->blocks + i;
1956 if(b->part == PartError)
1957 continue;
1958 if(b->iostate == BioDirty || b->iostate == BioWriting)
1959 ndirty++;
1960 if(b->iostate != BioDirty)
1961 continue;
1962 p->part = b->part;
1963 p->addr = b->addr;
1964 p->vers = b->vers;
1965 p++;
1966 }
1967 if(ndirty != c->ndirty){
1968 fprint(2, "%s: ndirty mismatch expected %d found %d\n",
1969 argv0, c->ndirty, ndirty);
1970 c->ndirty = ndirty;
1971 }
1972 vtUnlock(c->lk);
1973
1974 c->bw = p - c->baddr;
1975 qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
1976 }
1977
1978 /*
1979 * This is not thread safe, i.e. it can't be called from multiple threads.
1980 *
1981 * It's okay how we use it, because it only gets called in
1982 * the flushThread. And cacheFree, but only after
1983 * cacheFree has killed off the flushThread.
1984 */
1985 static int
1986 cacheFlushBlock(Cache *c)
1987 {
1988 Block *b;
1989 BAddr *p;
1990 int lockfail, nfail;
1991
1992 nfail = 0;
1993 for(;;){
1994 if(c->br == c->be){
1995 if(c->bw == 0 || c->bw == c->be)
1996 flushFill(c);
1997 c->br = 0;
1998 c->be = c->bw;
1999 c->bw = 0;
2000 c->nflush = 0;
2001 }
2002
2003 if(c->br == c->be)
2004 return 0;
2005 p = c->baddr + c->br;
2006 c->br++;
2007 b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
2008 &lockfail);
2009
2010 if(b && blockWrite(b, Nowaitlock)){
2011 c->nflush++;
2012 blockPut(b);
2013 return 1;
2014 }
2015 if(b)
2016 blockPut(b);
2017
2018 /*
2019 * Why didn't we write the block?
2020 */
2021
2022 /* Block already written out */
2023 if(b == nil && !lockfail)
2024 continue;
2025
2026 /* Failed to acquire lock; sleep if it happens a lot. */
2027 if(lockfail && ++nfail > 100){
2028 sleep(500);
2029 nfail = 0;
2030 }
2031 /* Requeue block. */
2032 if(c->bw < c->be)
2033 c->baddr[c->bw++] = *p;
2034 }
2035 }
2036
2037 /*
2038 * Occasionally flush dirty blocks from memory to the disk.
2039 */
2040 static void
2041 flushThread(void *a)
2042 {
2043 Cache *c = a;
2044 int i;
2045
2046 vtThreadSetName("flush");
2047 vtLock(c->lk);
2048 while(c->die == nil){
2049 vtSleep(c->flush);
2050 vtUnlock(c->lk);
2051 for(i=0; i<FlushSize; i++)
2052 if(!cacheFlushBlock(c)){
2053 /*
2054 * If i==0, someone may be waking us repeatedly
2055 * to flush the cache even though there's no work to do.
2056 * Pause a little.
2057 */
2058 if(i==0){
2059 // fprint(2, "%s: flushthread found "
2060 // "nothing to flush - %d dirty\n",
2061 // argv0, c->ndirty);
2062 sleep(250);
2063 }
2064 break;
2065 }
2066 if(i==0 && c->ndirty){
2067 /*
2068 * All the blocks are being written right now -- there's nothing to do.
2069 * We might be spinning with cacheFlush though -- he'll just keep
2070 * kicking us until c->ndirty goes down. Probably we should sleep
2071 * on something that the diskThread can kick, but for now we'll
2072 * just pause for a little while waiting for disks to finish.
2073 */
2074 sleep(100);
2075 }
2076 vtLock(c->lk);
2077 vtWakeupAll(c->flushwait);
2078 }
2079 c->ref--;
2080 vtWakeup(c->die);
2081 vtUnlock(c->lk);
2082 }
2083
2084 /*
2085 * Flush the cache.
2086 */
2087 void
2088 cacheFlush(Cache *c, int wait)
2089 {
2090 vtLock(c->lk);
2091 if(wait){
2092 while(c->ndirty){
2093 // consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
2094 // c->ndirty, c->uhead);
2095 vtWakeup(c->flush);
2096 vtSleep(c->flushwait);
2097 }
2098 // consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
2099 }else if(c->ndirty)
2100 vtWakeup(c->flush);
2101 vtUnlock(c->lk);
2102 }
2103
2104 /*
2105 * Kick the flushThread every 30 seconds.
2106 */
2107 static void
2108 cacheSync(void *v)
2109 {
2110 Cache *c;
2111
2112 c = v;
2113 cacheFlush(c, 0);
2114 }
2115