#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	VtLock	*dirtylk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nfree;		/* number of free blocks */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nfree and nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);
static void doRemoveLinkList(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0 */
	VtPointerType0,		/* BtData | 1 */
	VtPointerType1,		/* BtData | 2 */
	VtPointerType2,		/* BtData | 3 */
	VtPointerType3,		/* BtData | 4 */
	VtPointerType4,		/* BtData | 5 */
	VtPointerType5,		/* BtData | 6 */
	VtPointerType6,		/* BtData | 7 */
	VtDirType,		/* BtDir | 0 */
	VtPointerType0,		/* BtDir | 1 */
	VtPointerType1,		/* BtDir | 2 */
	VtPointerType2,		/* BtDir | 3 */
	VtPointerType3,		/* BtDir | 4 */
	VtPointerType4,		/* BtDir | 5 */
	VtPointerType5,		/* BtDir | 6 */
	VtPointerType6,		/* BtDir | 7 */
};

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "cacheCheck: in used %d\n", refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
			}
		}
		if(printed)
			fprint(2, "cache is okay again, %d dirty\n", c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "cacheGlobal %V %d\n", score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
				"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx %R\n");
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				fl->last = 0;
				vtSetError("disk is full");
				fprint(2, "cacheAllocBlock: xxx1 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "cacheAllocBlock: xxx2 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx3 %R\n");
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "cacheAllocBlock: xxx4 %R\n");
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
			bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->dirtylk);
	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
				bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

	ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "blockCopy %#ux %L but fs is [%ud,%ud]\n",
			b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	p = blistAlloc(b);
	if(p == nil){
		/*
		 * We were out of blists so blistAlloc wrote b to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "doRemoveLink: strange epoch %ud > %ud\n", b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "warning: lost block in doRemoveLink\n");
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "blistAlloc: called on clean block\n");
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "flushing for blists\n");
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "ndirty mismatch expected %d found %d\n",
			c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "flushthread found nothing to flush - %d dirty\n", c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	/*
	 * Lock c->dirtylk so that more blocks aren't being dirtied
	 * while we try to write out what's already here.
	 * Otherwise we might not ever finish!
	 */
	vtLock(c->dirtylk);
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}
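
/*
 * Illustrative sketch, not part of the original source: a minimal
 * read-modify-write cycle against this cache, assuming a Disk and a
 * VtSession have already been set up elsewhere by the surrounding
 * file server code.  The block address 17 and the count of 1000
 * in-memory blocks are arbitrary example values.  Guarded out of the
 * build; shown only to make the intended calling sequence concrete.
 */
#ifdef CACHE_EXAMPLE
static void
cacheExample(Disk *disk, VtSession *z)
{
	Cache *c;
	Block *b;

	/* allocate a cache of 1000 in-memory blocks over the disk */
	c = cacheAlloc(disk, z, 1000, OReadWrite);

	/* fetch data block 17, scribble on it, mark it dirty, release it */
	b = cacheLocal(c, PartData, 17, OReadWrite);
	if(b != nil){
		memset(b->data, 0, c->size);
		blockDirty(b);
		blockPut(b);
	}

	cacheFlush(c, 1);	/* wait until every dirty block reaches disk */
	cacheFree(c);
}
#endif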