#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	VtLock	*dirtylk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);
static void doRemoveLinkList(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0 */
	VtPointerType0,		/* BtData | 1 */
	VtPointerType1,		/* BtData | 2 */
	VtPointerType2,		/* BtData | 3 */
	VtPointerType3,		/* BtData | 4 */
	VtPointerType4,		/* BtData | 5 */
	VtPointerType5,		/* BtData | 6 */
	VtPointerType6,		/* BtData | 7 */
	VtDirType,		/* BtDir | 0 */
	VtPointerType0,		/* BtDir | 1 */
	VtPointerType1,		/* BtDir | 2 */
	VtPointerType2,		/* BtDir | 3 */
	VtPointerType3,		/* BtDir | 4 */
	VtPointerType4,		/* BtDir | 5 */
	VtPointerType5,		/* BtDir | 6 */
	VtPointerType6,		/* BtDir | 7 */
};
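/*
 * In the table above, BtData|k and BtDir|k (k > 0) are pointer blocks
 * k levels above their respective leaves, so each height shares a
 * single Venti pointer type; only the leaf types (BtData, BtDir)
 * map differently.
 */
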
/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}
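/*
 * Illustrative lifecycle of the cache (a sketch, not code taken from
 * the callers; error checks omitted, and disk, z and addr are assumed
 * to come from elsewhere):
 *
 *	c = cacheAlloc(disk, z, nblocks, OReadWrite);
 *	b = cacheLocal(c, PartData, addr, OReadWrite);
 *	...modify b->data...
 *	blockDirty(b);
 *	blockPut(b);
 *	cacheFlush(c, 1);	-- wait until everything dirty is on disk
 *	cacheFree(c);
 */
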
static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "%s: cacheCheck: in use %d\n", argv0, refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}
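/*
 * A note on the mode argument of the fetch routines above: OReadOnly
 * and OReadWrite both fill a block from disk if it is not already in
 * memory, while OOverWrite hands the block back without reading it,
 * since the caller promises to overwrite the entire contents (see the
 * BioEmpty/BioLabel case in _cacheLocal).
 */
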
/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
				"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				vtSetError("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
						argv0);
				fl->last = 0;
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (lb)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->dirtylk);
	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}
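/*
 * Sketch of the caller-side pattern for blockDependency and blockDirty
 * above (the names p, child, i and oldscore are illustrative, not taken
 * from this file).  To redirect entry i of a pointer block p from an
 * old child to a new, still-dirty one:
 *
 *	memmove(oldscore, p->data+i*VtScoreSize, VtScoreSize);
 *	blockDependency(p, child, i, oldscore, nil);
 *	memmove(p->data+i*VtScoreSize, child->score, VtScoreSize);
 *	blockDirty(p);
 *
 * (For BtDir blocks the old Entry is passed via e instead of a score.)
 * If p later has to be written while child is still dirty, blockRollback
 * above uses the saved old value to emit a safe ``old'' version of p.
 */
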
/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
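/*
 * A concrete, purely illustrative instance of the epoch rules above:
 * a block allocated in epoch 3 gets l.epoch = 3 and l.epochClose = ~0.
 * If it is unlinked while the file system is in epoch 5, it gets
 * l.epochClose = 5 and the BsClosed bit.  It stays unavailable until
 * every snapshot with epoch < 5 has been discarded or archived, i.e.
 * until the file system's low epoch reaches 5; cacheAllocBlock then
 * treats it as free (lab.epochClose <= epochLow).  A block closed in
 * the same epoch it was created (l.epoch == l.epochClose) was never
 * visible in any snapshot and can be reused immediately.
 */
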
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	p = blistAlloc(b);
	if(p == nil){
		/*
		 * We were out of blists so blistAlloc wrote b to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ?
		OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		vtLock(c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		vtUnlock(c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 *	blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}
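/*
 * upHeap and downHeap above order blocks by b->used - now rather than
 * by b->used alone.  The subtraction is done in u32int arithmetic, so
 * older blocks get smaller keys and the victim chosen by cacheBumpBlock
 * (c->heap[0]) is the least recently used block; the ordering also
 * survives wraparound of c->now, as long as no timestamp is more than
 * 2^31 ticks old.
 */
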
/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}
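/*
 * flushFill above snapshots the addresses of all dirty blocks into
 * c->baddr and sorts them by (part, addr), so cacheFlushBlock below
 * issues writes in roughly ascending disk order.  Within the array,
 * c->br is the next entry to try, c->be marks the end of the current
 * batch, and c->bw is the tail where entries that could not be written
 * yet are requeued for the next pass.
 */
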
/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	/*
	 * Lock c->dirtylk so that more blocks aren't being dirtied
	 * while we try to write out what's already here.
	 * Otherwise we might not ever finish!
	 */
	vtLock(c->dirtylk);
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
			// consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
			//	c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
		// consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}