#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	VtLock	*dirtylk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);
static void doRemoveLinkList(Cache*, BList*);
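/*
 * Illustrative note (not part of the original source): a local block
 * type is BtData or BtDir plus a pointer depth in its low bits, and
 * the vtType table below maps it onto the Venti block type used when
 * the block is written to or read from Venti.  For example, per the
 * table, vtType[BtData] is VtDataType and vtType[BtDir|1] is
 * VtPointerType0.
 */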
/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtPointerType0,		/* BtData | 1  */
	VtPointerType1,		/* BtData | 2  */
	VtPointerType2,		/* BtData | 3  */
	VtPointerType3,		/* BtData | 4  */
	VtPointerType4,		/* BtData | 5  */
	VtPointerType5,		/* BtData | 6  */
	VtPointerType6,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtPointerType0,		/* BtDir | 1  */
	VtPointerType1,		/* BtDir | 2  */
	VtPointerType2,		/* BtDir | 3  */
	VtPointerType3,		/* BtDir | 4  */
	VtPointerType4,		/* BtDir | 5  */
	VtPointerType5,		/* BtDir | 6  */
	VtPointerType6,		/* BtDir | 7  */
};

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to a multiple of 128 */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}
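/*
 * Illustrative sketch (not part of the original source): the expected
 * lifetime of a Cache as suggested by the routines above -- allocate
 * it against an open Disk and Venti session, use it, then free it,
 * which flushes whatever is still dirty.  The disk, z and block-count
 * values here are stand-ins, not values taken from this file.
 *
 *	Cache *c;
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	... cacheLocal, cacheGlobal, blockDirty, blockPut ...
 *	cacheFlush(c, 1);	/* optionally wait for ndirty to reach 0 *\/
 *	cacheFree(c);
 */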
static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}
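/*
 * Illustrative note (not part of the original source): how the hash
 * chains used by the lookup routines below are organized.  The bucket
 * is chosen by address, and b->prev points at the link field that
 * points at b (either a bucket head or another block's next), which is
 * what lets cacheBumpBlock unchain a victim without rescanning.
 *
 *	h = addr % c->hashSize;
 *	for(b = c->heads[h]; b != nil; b = b->next)
 *		if(b->part == part && b->addr == addr)
 *			break;
 */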
/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}
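/*
 * Illustrative sketch (not part of the original source): a typical
 * caller of cacheLocalData first learns a block's address, type and
 * tag from a pointer in its parent, then asks for exactly that
 * combination; a mismatch comes back as ELabelMismatch rather than a
 * wrong block.  The addr and tag values here are stand-ins.
 *
 *	b = cacheLocalData(c, addr, BtData, tag, OReadOnly, 0);
 *	if(b == nil)
 *		fprint(2, "read failed: %R\n");
 *	else
 *		blockPut(b);
 */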
/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
				"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}
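/*
 * Illustrative note (not part of the original source): cacheGlobal
 * first checks whether the score is really a disguised local address
 * -- globalToLocal and localToGlobal are the decode/encode pair used
 * throughout this file -- and only goes to Venti when it is not.
 *
 *	addr = globalToLocal(score);	/* NilBlock means a real Venti score *\/
 *	if(addr != NilBlock)
 *		... treat it as a PartData block at addr ...
 */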
/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				fl->last = 0;
				vtSetError("disk is full");
				fprint(2, "%s: cacheAllocBlock: xxx1 %R\n", argv0);
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}
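/*
 * Illustrative sketch (not part of the original source): the label
 * addressing arithmetic shared by cacheAllocBlock, _blockSetLabel and
 * readLabel.  Each PartLabel block packs c->size/LabelSize labels, so
 * the label for data block addr lives at:
 *
 *	lpb = c->size / LabelSize;		/* labels per label block *\/
 *	b = cacheLocal(c, PartLabel, addr/lpb, mode);
 *	labelUnpack(&l, b->data, addr%lpb);	/* or labelPack to update *\/
 */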
/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (lb)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->dirtylk);
	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);

	return 1;
}
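/*
 * Illustrative sketch (not part of the original source): what a prior
 * entry recorded by blockDependency means to blockRollback below.  If
 * a dirty BtDir block b has a prior p with p->index == 7, the rolled
 * back copy is b->data with bytes [7*VtEntrySize, 8*VtEntrySize)
 * replaced by p->old.entry; for pointer blocks the same slot-sized
 * patch uses VtScoreSize and p->old.score instead.
 */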
/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

	ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}
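/*
 * Illustrative note (not part of the original source): the dmap bitmap
 * consulted in blockWrite above holds one bit per pointer slot in a
 * block; cacheAlloc sizes it as (c->size/20 + 7)/8 bytes, i.e. the
 * score-slot count rounded up to whole bytes (20 being VtScoreSize).
 * Because a prior list is newest-first, once the bit for slot i is set
 * (that dependency already reached disk), any older prior for the same
 * slot can be dropped:
 *
 *	dmap[i/8] |= 1<<(i%8);
 *	if(dmap[i/8] & (1<<(i%8)))
 *		goto ignblock;
 */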
/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	p = blistAlloc(b);
	if(p == nil){
		/*
		 * We were out of blists so blistAlloc wrote b to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		vtLock(c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		vtUnlock(c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}
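/*
 * Illustrative note (not part of the original source): how a removal
 * flows through the code above.  blockRemoveLink queues a BList on
 * b->uhead; when b itself reaches BioClean, blockSetIOState moves that
 * queue onto c->uhead and wakes the unlink daemon; unlinkThread and
 * unlinkBody then call doRemoveLink, which marks each block BsClosed
 * (recursing through pointer blocks) so the allocator can eventually
 * reuse it.
 */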
/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}
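/*
 * Illustrative note (not part of the original source): upHeap above and
 * downHeap below maintain a binary min-heap laid out in c->heap[], with
 * the parent of slot i at (i-1)>>1 and its children at 2*i+1 and 2*i+2.
 * Ordering compares b->used - c->now rather than b->used directly so
 * that the comparison stays correct when the u32int tick counter wraps.
 */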
static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}
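/*
 * Illustrative note (not part of the original source): how the baddr
 * list filled in above is consumed by cacheFlushBlock below.  Entries
 * [c->br, c->be) are still to be tried; blocks that cannot be written
 * yet are compacted back to the front of the array at c->bw, and once
 * the read pointer catches up the pass restarts from the requeued
 * entries, calling flushFill for a fresh scan when that would make no
 * progress.
 */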
/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	/*
	 * Lock c->dirtylk so that more blocks aren't being dirtied
	 * while we try to write out what's already here.
	 * Otherwise we might not ever finish!
	 */
	vtLock(c->dirtylk);
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
			// consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
			//	c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
		// consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}