#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nfree;		/* number of free blocks */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nfree and nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);
static void doRemoveLinkList(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0 */
	VtPointerType0,		/* BtData | 1 */
	VtPointerType1,		/* BtData | 2 */
	VtPointerType2,		/* BtData | 3 */
	VtPointerType3,		/* BtData | 4 */
	VtPointerType4,		/* BtData | 5 */
	VtPointerType5,		/* BtData | 6 */
	VtPointerType6,		/* BtData | 7 */
	VtDirType,		/* BtDir | 0 */
	VtPointerType0,		/* BtDir | 1 */
	VtPointerType1,		/* BtDir | 2 */
	VtPointerType2,		/* BtDir | 3 */
	VtPointerType3,		/* BtDir | 4 */
	VtPointerType4,		/* BtDir | 5 */
	VtPointerType5,		/* BtDir | 6 */
	VtPointerType6,		/* BtDir | 7 */
};
/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}
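/*
 * Illustrative sketch (added, not in the original source): a typical
 * caller pairs cacheAlloc with cacheFree, e.g.
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	... use cacheLocal/cacheGlobal/blockPut on c ...
 *	cacheFree(c);
 *
 * The 1000-block figure is only an example; the real block count and
 * mode come from the caller's configuration.
 */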
static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=0x%lux\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "cacheCheck: in used %d\n", refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	if(c->nheap == 0){
		while(c->nheap == 0){
			fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
		}
		fprint(2, "cache is okay again\n");
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}
/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("i/o error block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}
/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("i/o error block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}
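/*
 * Illustrative sketch (added, not in the original source): a caller that
 * wants a data block with its label checked typically follows the
 * lookup / dirty / put pattern implied by the routines above:
 *
 *	b = cacheLocalData(c, addr, BtDir, tag, OReadWrite, epoch);
 *	if(b == nil)
 *		return 0;
 *	... modify b->data ...
 *	blockDirty(b);
 *	blockPut(b);
 *
 * The particular type/tag/epoch values belong to the caller; only the
 * shape of the sequence is meant here.
 */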
/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "cacheGlobal %V %d\n", score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError("venti i/o error block %V: %r", score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("i/o error block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}
/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx %R\n");
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				fl->last = 0;
				vtSetError("disk is full");
				fprint(2, "cacheAllocBlock: xxx1 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "cacheAllocBlock: xxx2 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx3 %R\n");
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "cacheAllocBlock: xxx4 %R\n");
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}
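/*
 * Worked example (added, not in the original source): with labels packed
 * n = c->size/LabelSize to a label block, the label for data block addr
 * lives in label block addr/n at offset addr%n.  So if n were 512, data
 * block 1000 would be described by entry 1000%512 = 488 of label block
 * 1000/512 = 1.  cacheAllocBlock and cacheCountUsed above walk the label
 * partition using exactly this arithmetic.
 */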
static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	c = b->c;

	if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}
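/*
 * Illustrative sketch (added, not in the original source): the nlock
 * discipline above means a thread that refers to the same locked block
 * under a second name does
 *
 *	blockDupLock(b);	(nlock becomes 2)
 *	...
 *	blockPut(b);		(drops one reference; block stays locked)
 *	blockPut(b);		(last reference; block is unlocked)
 *
 * i.e. exactly one blockPut per name, as the comment before blockDupLock
 * requires.
 */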
/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *	  BaActive vs. BaCopied, but that's handled
	 *	  by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
			bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}
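/*
 * Worked example (added, not in the original source): suppose a pointer
 * block b has a prior entry with index == 5 whose old.score was saved by
 * blockDependency.  If that dependency is still unwritten when b must go
 * to disk, blockRollback copies b->data into buf and restores the
 * VtScoreSize bytes at buf + 5*VtScoreSize from old.score, so the disk
 * sees the pointer b held before the unsafe change.  For a BtDir block
 * the slot is VtEntrySize bytes wide and comes from old.entry instead.
 */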
/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
				bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}
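/*
 * Note (added, not in the original source): dmap is a small bitmap with
 * one bit per pointer slot in b.  blockDependency prepends to b->prior,
 * so the list is newest-first; when a prior whose block has already
 * reached disk is discarded, its slot bit is set, and any older prior
 * for the same slot found later can be dropped immediately, since the
 * newer dependency already supersedes it.
 */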
/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}
/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "blockCopy %#ux %L but fs is [%ud,%ud]\n",
			b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}
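/*
 * Illustrative sketch (added, not in the original source): a caller that
 * copies a child block b referenced from slot i of a parent p typically
 * finishes steps (1) and (2) of the comment above along these lines:
 *
 *	bb = blockCopy(b, tag, ehi, elo);          (b is put by blockCopy)
 *	... update slot i of p->data to point at bb ...
 *	blockDependency(p, bb, i, oldscore, nil);  (p after bb)
 *	blockRemoveLink(p, oldaddr, type, tag, 0); (close b once p is written)
 *	blockDirty(p);
 *
 * The exact arguments belong to the callers elsewhere in fossil; only the
 * ordering described by the comment is meant here.
 */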
/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	p = blistAlloc(b);
	if(p == nil){
		/*
		 * We were out of blists so blistAlloc wrote b to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "doRemoveLink: strange epoch %ud > %ud\n", b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(p->recurse && p->type != BtData && p->type != BtDir){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			doRemoveLink(c, &bl);
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	blockSetLabel(b, &l, 0);
	blockPut(b);
}
/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "blistAlloc: called on clean block\n");
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}
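/*
 * Note (added, not in the original source): the heap helpers below order
 * blocks by age using unsigned differences, b->used - now, rather than by
 * comparing b->used directly.  The apparent intent is that the ordering
 * stays sensible even if the u32int usage clock c->now wraps around; that
 * reading is an assumption, not something the original comments state.
 */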
static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "ndirty mismatch expected %d found %d\n",
			c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}
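/*
 * Note (added, not in the original source): flushFill leaves the sorted
 * addresses of dirty blocks in c->baddr[0..c->bw).  cacheFlushBlock below
 * then treats c->baddr as a work queue: c->br is the next entry to try,
 * c->be is the end of the current batch, and entries that could not be
 * written yet are requeued at c->bw for the next pass.
 */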
/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
	return 0;
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0)
					sleep(250);
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Keep flushing until everything is clean.
 */
void
cacheFlush(Cache *c, int wait)
{
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else
		vtWakeup(c->flush);
	vtUnlock(c->lk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}