#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	QLock	lk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtConn	*z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	Rendez	blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	Rendez	die;		/* daemon threads should die when die.l != nil */

	Rendez	flush;
	Rendez	flushwait;
	Rendez	heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	Rendez	unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	QLock	lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,	/* BtData | 0 */
	VtDataType+1,	/* BtData | 1 */
	VtDataType+2,	/* BtData | 2 */
	VtDataType+3,	/* BtData | 3 */
	VtDataType+4,	/* BtData | 4 */
	VtDataType+5,	/* BtData | 5 */
	VtDataType+6,	/* BtData | 6 */
	VtDataType+7,	/* BtData | 7 */
	VtDirType,	/* BtDir | 0 */
	VtDirType+1,	/* BtDir | 1 */
	VtDirType+2,	/* BtDir | 2 */
	VtDirType+3,	/* BtDir | 3 */
	VtDirType+4,	/* BtDir | 4 */
	VtDirType+5,	/* BtDir | 5 */
	VtDirType+6,	/* BtDir | 6 */
	VtDirType+7,	/* BtDir | 7 */
};

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtConn *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtmallocz(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
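	/*
	 * ndmap is the size in bytes of a bitmap with one bit per
	 * 20-byte (VtScoreSize) slot in a block, which is also enough
	 * for entry indices; blockWrite uses it to track which
	 * dependencies at each index have already been satisfied.
	 */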
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtmallocz(c->hashSize*sizeof(Block*));
	c->heap = vtmallocz(nblocks*sizeof(Block*));
	c->blocks = vtmallocz(nblocks*sizeof(Block));
	c->mem = vtmallocz(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtmallocz(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready.l = &b->lk;
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend.l = &c->lk;

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink.l = &c->lk;
	c->flush.l = &c->lk;
	c->flushwait.l = &c->lk;
	c->heapwait.l = &c->lk;
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		proccreate(unlinkThread, c, STACK);
		proccreate(flushThread, c, STACK);
	}
	cacheCheck(c);

	return c;
}
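
/*
 * A sketch of a typical call site (cf. fsOpen in fs.c; the 1000-block
 * cache size is only illustrative), assuming fd is an open disk file
 * and z an established Venti connection:
 *
 *	disk = diskAlloc(fd);
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	...use the cache...
 *	cacheFree(c);
 */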

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	qlock(&c->lk);
	c->die.l = &c->lk;
	periodicKill(c->sync);
	rwakeup(&c->flush);
	rwakeup(&c->unlink);
	while(c->ref > 1)
		rsleep(&c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		qunlock(&c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		qlock(&c->lk);
	} while(c->uhead || c->ndirty);
	qunlock(&c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
	}
	flFree(c->fl);
	vtfree(c->baddr);
	vtfree(c->heads);
	vtfree(c->blocks);
	vtfree(c->mem);
	diskFree(c->disk);
	/* don't close vtSession */
	vtfree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

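	/* compare b->used - now, not b->used alone: the tick counter wraps */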
	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			sysfatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			sysfatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			sysfatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			sysfatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			sysfatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			rwakeup(&c->flush);
			rsleep(&c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		qunlock(&c->lk);
		return nil;
	}
	if(!waitlock && !canqlock(&b->lk)){
		*lockfailure = 1;
		qunlock(&c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	qunlock(&c->lk);

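	/*
	 * If waitlock was zero, canqlock above already acquired b->lk;
	 * otherwise acquire it now that c->lk has been released.
	 */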
	bwatchLock(b);
	if(waitlock)
		qlock(&b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			rsleep(&b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			werrstr("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			werrstr("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, Waitlock, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			qunlock(&c->lk);
			werrstr(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	qunlock(&c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	qlock(&b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		werrstr(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioLabel:
			if(mode == OOverWrite)
				/*
				 * leave iostate as BioLabel because data
				 * hasn't been read.
				 */
				return b;
			/* fall through */
		case BioEmpty:
			diskRead(c->disk, b);
			rsleep(&b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			rsleep(&b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			werrstr("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		werrstr(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;
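	/* the hash key is the first four bytes of the score, read little-endian */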

	/*
	 * look for the block in the cache
	 */
	qlock(&c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	qunlock(&c->lk);

	bwatchLock(b);
	qlock(&b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtread(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || vtsha1check(score, b->data, n) < 0){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			werrstr(
				"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtzeroextend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		werrstr("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		werrstr("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

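	/*
	 * Scan the label partition circularly, starting just past the
	 * last allocation point, for a block that is free or was closed
	 * in an epoch no live snapshot can still reference.
	 */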
	qlock(&fl->lk);
	addr = fl->last;
	nwrap = 0;
NotFound:
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %r\n", argv0);
		qunlock(&fl->lk);
		return nil;
	}
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				werrstr("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %r\n",
						argv0);
				fl->last = 0;
				qunlock(&fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %r\n", argv0);
				qunlock(&fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %r\n", argv0);
		return nil;
	}
	if(!(b->iostate == BioLabel || b->iostate == BioClean)){
		if(0)fprint(2, "%s: cacheAllocBlock addr %ud iostate %s label %L\n",
			argv0, addr, bioStr(b->iostate), &lab);
		blockPut(b);
		goto NotFound;
	}
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %r\n", argv0);
		blockPut(b);
		return nil;
	}
	vtzeroextend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	qunlock(&fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	qlock(&fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		qunlock(&fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %r\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	qunlock(&fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtmallocz(sizeof(*fl));
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtfree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and rsleep
	 * conspire to assume that they can just qlock(&b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	qunlock(&b->lk);
	c = b->c;
	qlock(&c->lk);

	if(--b->ref > 0){
		qunlock(&c->lk);
		return;
	}

	assert(b->ref == 0);
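	/*
	 * Unreferenced block: make it a reclaim candidate.  Empty and
	 * label-only blocks are cheap to reload, so they inherit the
	 * oldest timestamp and get reused first; dirty blocks stay off
	 * the heap until the flush thread writes them out.
	 */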
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	qunlock(&c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
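	/*
	 * A nil BList means blistAlloc wrote bb to disk instead
	 * (no free entries and bb had no priors), so there is no
	 * longer anything to depend on.
	 */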
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean || b->iostate == BioLabel);

	qlock(&c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		rwakeup(&c->flush);
	qunlock(&c->lk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
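	/* undo, in buf only, each change whose dependency is still unwritten */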
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b, int waitlock)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
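	/*
	 * dmap has one bit per entry/score index.  The prior list is
	 * newest-first: once the newest dependency for an index is known
	 * to be written, any older dependency on the same index has been
	 * superseded and can be discarded.
	 */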
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
			&lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			qlock(&c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					rwakeup(&c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			qunlock(&c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			qlock(&c->lk);
			b->vers = c->vers++;
			qunlock(&c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		qlock(&c->lk);
		b->ref++;
		qunlock(&c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		rwakeupall(&b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
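/*
 * A sketch of the expected call sequence (the actual call sites live
 * in blockCopy's callers, not in this file):
 *
 *	bb = blockCopy(b, tag, ehi, elo);	// sets BsCopied, records bb-after-lbb
 *	...store bb's score into the parent p...
 *	blockDependency(p, bb, index, score, e);	// (1) p after bb
 *	blockRemoveLink(p, b->addr, b->l.type, b->l.tag, 0);	// (2) close b once p is written
 */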
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	if(b->part == PartSuper && b->iostate == BioClean)
		p = nil;
	else
		p = blistAlloc(b);
	if(p == nil){
		/*
		 * b has already been written to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		/* lock ordering: trying for c->fl->lk while holding b->lk can deadlock */
		if(!canqlock(&c->fl->lk)){
			blockPut(b);
			qlock(&c->fl->lk);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OOverWrite, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block at end of doRemoveLink\n",
					argv0);
				qunlock(&c->fl->lk);
				return;
			}
		}
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		qunlock(&c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	qlock(&c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			qunlock(&c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			rwakeup(&c->flush);
			rsleep(&c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	qunlock(&c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	qlock(&c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	rwakeup(&c->blrend);
	qunlock(&c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

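/*
 * labelFmt and scoreFmt back the %L (label) and %V (score) print verbs
 * used throughout this file; the startup code is expected to register
 * them, e.g. fmtinstall('L', labelFmt) and fmtinstall('V', scoreFmt).
 */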
int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

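/*
 * The heap is ordered by b->used - c->now rather than b->used alone,
 * so the ordering stays correct when the u32int tick counter wraps.
 */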
static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	rwakeup(&b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		qunlock(&c->lk);
		doRemoveLink(c, p);
		qlock(&c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	threadsetname("unlink");

	qlock(&c->lk);
	for(;;){
		while(c->uhead == nil && c->die.l == nil)
			rsleep(&c->unlink);
		if(c->die.l != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	qlock(&c->lk);
	if(c->ndirty == 0){
		qunlock(&c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	qunlock(&c->lk);

	c->bw = p - c->baddr;
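	/* sort by partition and address so the flush issues writes in disk order */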
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
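	/*
	 * c->baddr[0..be) is the current pass over the dirty blocks:
	 * br is the read index, and blocks that cannot be written yet
	 * are requeued at the front through bw.  When a pass is
	 * exhausted, flushFill rebuilds the list.
	 */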
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
			&lockfail);

		if(b && blockWrite(b, Nowaitlock)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	threadsetname("flush");
	qlock(&c->lk);
	while(c->die.l == nil){
		rsleep(&c->flush);
		qunlock(&c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		qlock(&c->lk);
		rwakeupall(&c->flushwait);
	}
	c->ref--;
	rwakeup(&c->die);
	qunlock(&c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	qlock(&c->lk);
	if(wait){
		while(c->ndirty){
			// consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
			//	c->ndirty, c->uhead);
			rwakeup(&c->flush);
			rsleep(&c->flushwait);
		}
		// consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		rwakeup(&c->flush);
	qunlock(&c->lk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}