#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	VtLock	*dirtylk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);
static void doRemoveLinkList(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0 */
	VtPointerType0,		/* BtData | 1 */
	VtPointerType1,		/* BtData | 2 */
	VtPointerType2,		/* BtData | 3 */
	VtPointerType3,		/* BtData | 4 */
	VtPointerType4,		/* BtData | 5 */
	VtPointerType5,		/* BtData | 6 */
	VtPointerType6,		/* BtData | 7 */
	VtDirType,		/* BtDir | 0 */
	VtPointerType0,		/* BtDir | 1 */
	VtPointerType1,		/* BtDir | 2 */
	VtPointerType2,		/* BtDir | 3 */
	VtPointerType3,		/* BtDir | 4 */
	VtPointerType4,		/* BtDir | 5 */
	VtPointerType5,		/* BtDir | 6 */
	VtPointerType6,		/* BtDir | 7 */
};

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}
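
/*
 * Illustrative sketch: typical cache lifetime for a read-write file server.
 * The disk and z handles and the block count are hypothetical; cacheAlloc
 * sizes everything else (hash table, heap, BLists) from nblocks.
 */
static void
cacheLifetimeSketch(Disk *disk, VtSession *z)
{
	Cache *c;

	c = cacheAlloc(disk, z, 1000, OReadWrite);	/* starts the unlink and flush threads */

	/* ... cacheLocal / cacheGlobal / cacheAllocBlock traffic goes here ... */

	cacheFree(c);	/* flushes every dirty block and frees the disk handle */
}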

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "cacheCheck: in use %d\n", refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
			}
		}
		if(printed)
			fprint(2, "cache is okay again, %d dirty\n", c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}
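
/*
 * Illustrative sketch: reading one data block by local address.  The addr
 * and tag values are hypothetical; a real caller gets them from the entry
 * that points at the block.  Every successful fetch is paired with blockPut.
 */
static int
readDataBlockSketch(Cache *c, u32int addr, u32int tag)
{
	Block *b;

	b = cacheLocalData(c, addr, BtData, tag, OReadOnly, 0);
	if(b == nil)
		return 0;	/* error string already set via vtSetError */
	/* ... look at b->data[0 .. c->size) ... */
	blockPut(b);
	return 1;
}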

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "cacheGlobal %V %d\n", score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
				"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}
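
/*
 * Illustrative sketch: fetching a block by Venti score, e.g. while walking
 * an archived snapshot.  The score, type, and tag values are hypothetical;
 * cacheGlobal itself falls back to cacheLocalData when the score is really
 * a local address in disguise.
 */
static Block*
readVentiBlockSketch(Cache *c, uchar score[VtScoreSize], int type, u32int tag)
{
	return cacheGlobal(c, score, type, tag, OReadOnly);
}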

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx %R\n");
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				fl->last = 0;
				vtSetError("disk is full");
				fprint(2, "cacheAllocBlock: xxx1 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "cacheAllocBlock: xxx2 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx3 %R\n");
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "cacheAllocBlock: xxx4 %R\n");
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}
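
/*
 * Illustrative sketch: allocating a fresh data block and handing it back
 * dirty.  The tag, epoch, and epochLow values are hypothetical; callers
 * normally take them from the file and file system being grown.
 */
static Block*
allocDataBlockSketch(Cache *c, u32int tag, u32int epoch, u32int epochLow)
{
	Block *b;

	b = cacheAllocBlock(c, BtData, tag, epoch, epochLow);
	if(b == nil)
		return nil;
	/* b->data comes back zero-filled; fill it in and mark the block dirty */
	blockDirty(b);
	return b;
}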

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *	  BaActive vs. BaCopied, but that's handled
	 *	  by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
			bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->dirtylk);
	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);

	return 1;
}
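
/*
 * Illustrative sketch: the ordering dance when a score slot in a pointer
 * block is redirected at a freshly dirtied child.  The names parent, child,
 * index, and oldscore are hypothetical stand-ins; oldscore is the score the
 * slot held before, saved so blockRollback can still write a safe old
 * version of parent while the child has not reached disk.
 */
static void
redirectPointerSketch(Block *parent, Block *child, int index, uchar oldscore[VtScoreSize])
{
	/* parent must reach disk after child */
	blockDependency(parent, child, index, oldscore, nil);
	/* point the slot at child and mark parent dirty */
	memmove(parent->data + index*VtScoreSize, child->score, VtScoreSize);
	blockDirty(parent);
}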

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
				bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
1362 * 1363 * (1) If a block p points at a copy-on-write block b and we 1364 * copy b to create bb, then p must be written out after bb and 1365 * lbb (bb's label block). 1366 * 1367 * (2) We have to mark b as closed, but only after we switch 1368 * the pointer, so lb must be written out after p. In fact, we 1369 * can't even update the in-memory copy, or the cache might 1370 * mistakenly give out b for reuse before p gets written. 1371 * 1372 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency. 1373 * The caller is expected to record a "p after bb" dependency 1374 * to finish (1), and also expected to call blockRemoveLink 1375 * to arrange for (2) to happen once p is written. 1376 * 1377 * Until (2) happens, some pieces of the code (e.g., the archiver) 1378 * still need to know whether a block has been copied, so we 1379 * set the BsCopied bit in the label and force that to disk *before* 1380 * the copy gets written out. 1381 */ 1382 Block* 1383 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo) 1384 { 1385 Block *bb, *lb; 1386 Label l; 1387 1388 if((b->l.state&BsClosed) || b->l.epoch >= ehi) 1389 fprint(2, "blockCopy %#ux %L but fs is [%ud,%ud]\n", 1390 b->addr, &b->l, elo, ehi); 1391 1392 bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo); 1393 if(bb == nil){ 1394 blockPut(b); 1395 return nil; 1396 } 1397 1398 /* 1399 * Update label so we know the block has been copied. 1400 * (It will be marked closed once it has been unlinked from 1401 * the tree.) This must follow cacheAllocBlock since we 1402 * can't be holding onto lb when we call cacheAllocBlock. 1403 */ 1404 if((b->l.state&BsCopied)==0) 1405 if(b->part == PartData){ /* not the superblock */ 1406 l = b->l; 1407 l.state |= BsCopied; 1408 lb = _blockSetLabel(b, &l); 1409 if(lb == nil){ 1410 /* can't set label => can't copy block */ 1411 blockPut(b); 1412 l.type = BtMax; 1413 l.state = BsFree; 1414 l.epoch = 0; 1415 l.epochClose = 0; 1416 l.tag = 0; 1417 blockSetLabel(bb, &l, 0); 1418 blockPut(bb); 1419 return nil; 1420 } 1421 blockDependency(bb, lb, -1, nil, nil); 1422 blockPut(lb); 1423 } 1424 1425 memmove(bb->data, b->data, b->c->size); 1426 blockDirty(bb); 1427 blockPut(b); 1428 return bb; 1429 } 1430 1431 /* 1432 * Block b once pointed at the block bb at addr/type/tag, but no longer does. 1433 * If recurse is set, we are unlinking all of bb's children as well. 1434 * 1435 * We can't reclaim bb (or its kids) until the block b gets written to disk. We add 1436 * the relevant information to b's list of unlinked blocks. Once b is written, 1437 * the list will be queued for processing. 1438 * 1439 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list. 1440 */ 1441 void 1442 blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse) 1443 { 1444 BList *p, **pp, bl; 1445 1446 /* remove bb from prior list */ 1447 for(pp=&b->prior; (p=*pp)!=nil; ){ 1448 if(p->part == PartData && p->addr == addr){ 1449 *pp = p->next; 1450 blistFree(b->c, p); 1451 }else 1452 pp = &p->next; 1453 } 1454 1455 bl.part = PartData; 1456 bl.addr = addr; 1457 bl.type = type; 1458 bl.tag = tag; 1459 if(b->l.epoch == 0) 1460 assert(b->part == PartSuper); 1461 bl.epoch = b->l.epoch; 1462 bl.next = nil; 1463 bl.recurse = recurse; 1464 1465 p = blistAlloc(b); 1466 if(p == nil){ 1467 /* 1468 * We were out of blists so blistAlloc wrote b to disk. 
1469 */ 1470 doRemoveLink(b->c, &bl); 1471 return; 1472 } 1473 1474 /* Uhead is only processed when the block goes from Dirty -> Clean */ 1475 assert(b->iostate == BioDirty); 1476 1477 *p = bl; 1478 if(b->uhead == nil) 1479 b->uhead = p; 1480 else 1481 b->utail->next = p; 1482 b->utail = p; 1483 } 1484 1485 /* 1486 * Process removal of a single block and perhaps its children. 1487 */ 1488 static void 1489 doRemoveLink(Cache *c, BList *p) 1490 { 1491 int i, n, recurse; 1492 u32int a; 1493 Block *b; 1494 Label l; 1495 BList bl; 1496 1497 recurse = (p->recurse && p->type != BtData && p->type != BtDir); 1498 1499 /* 1500 * We're not really going to overwrite b, but if we're not 1501 * going to look at its contents, there is no point in reading 1502 * them from the disk. 1503 */ 1504 b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0); 1505 if(b == nil) 1506 return; 1507 1508 /* 1509 * When we're unlinking from the superblock, close with the next epoch. 1510 */ 1511 if(p->epoch == 0) 1512 p->epoch = b->l.epoch+1; 1513 1514 /* sanity check */ 1515 if(b->l.epoch > p->epoch){ 1516 fprint(2, "doRemoveLink: strange epoch %ud > %ud\n", b->l.epoch, p->epoch); 1517 blockPut(b); 1518 return; 1519 } 1520 1521 if(recurse){ 1522 n = c->size / VtScoreSize; 1523 for(i=0; i<n; i++){ 1524 a = globalToLocal(b->data + i*VtScoreSize); 1525 if(a == NilBlock || !readLabel(c, &l, a)) 1526 continue; 1527 if(l.state&BsClosed) 1528 continue; 1529 /* 1530 * If stack space becomes an issue... 1531 p->addr = a; 1532 p->type = l.type; 1533 p->tag = l.tag; 1534 doRemoveLink(c, p); 1535 */ 1536 1537 bl.part = PartData; 1538 bl.addr = a; 1539 bl.type = l.type; 1540 bl.tag = l.tag; 1541 bl.epoch = p->epoch; 1542 bl.next = nil; 1543 bl.recurse = 1; 1544 /* give up the block lock - share with others */ 1545 blockPut(b); 1546 doRemoveLink(c, &bl); 1547 b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0); 1548 if(b == nil){ 1549 fprint(2, "warning: lost block in doRemoveLink\n"); 1550 return; 1551 } 1552 } 1553 } 1554 1555 l = b->l; 1556 l.state |= BsClosed; 1557 l.epochClose = p->epoch; 1558 if(l.epochClose == l.epoch){ 1559 vtLock(c->fl->lk); 1560 if(l.epoch == c->fl->epochLow) 1561 c->fl->nused--; 1562 blockSetLabel(b, &l, 0); 1563 vtUnlock(c->fl->lk); 1564 }else 1565 blockSetLabel(b, &l, 0); 1566 blockPut(b); 1567 } 1568 1569 /* 1570 * Allocate a BList so that we can record a dependency 1571 * or queue a removal related to block b. 1572 * If we can't find a BList, we write out b and return nil. 1573 */ 1574 static BList * 1575 blistAlloc(Block *b) 1576 { 1577 Cache *c; 1578 BList *p; 1579 1580 if(b->iostate != BioDirty){ 1581 /* 1582 * should not happen anymore - 1583 * blockDirty used to flush but no longer does. 1584 */ 1585 assert(b->iostate == BioClean); 1586 fprint(2, "blistAlloc: called on clean block\n"); 1587 return nil; 1588 } 1589 1590 c = b->c; 1591 vtLock(c->lk); 1592 if(c->blfree == nil){ 1593 /* 1594 * No free BLists. What are our options? 1595 */ 1596 1597 /* Block has no priors? Just write it. */ 1598 if(b->prior == nil){ 1599 vtUnlock(c->lk); 1600 diskWriteAndWait(c->disk, b); 1601 return nil; 1602 } 1603 1604 /* 1605 * Wake the flush thread, which will hopefully free up 1606 * some BLists for us. We used to flush a block from 1607 * our own prior list and reclaim that BList, but this is 1608 * a no-no: some of the blocks on our prior list may 1609 * be locked by our caller. Or maybe their label blocks 1610 * are locked by our caller. 
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "flushing for blists\n");
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "ndirty mismatch expected %d found %d\n",
			c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
1969 */ 1970 static int 1971 cacheFlushBlock(Cache *c) 1972 { 1973 Block *b; 1974 BAddr *p; 1975 int lockfail, nfail; 1976 1977 nfail = 0; 1978 for(;;){ 1979 if(c->br == c->be){ 1980 if(c->bw == 0 || c->bw == c->be) 1981 flushFill(c); 1982 c->br = 0; 1983 c->be = c->bw; 1984 c->bw = 0; 1985 c->nflush = 0; 1986 } 1987 1988 if(c->br == c->be) 1989 return 0; 1990 p = c->baddr + c->br; 1991 c->br++; 1992 b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail); 1993 1994 if(b && blockWrite(b)){ 1995 c->nflush++; 1996 blockPut(b); 1997 return 1; 1998 } 1999 if(b) 2000 blockPut(b); 2001 2002 /* 2003 * Why didn't we write the block? 2004 */ 2005 2006 /* Block already written out */ 2007 if(b == nil && !lockfail) 2008 continue; 2009 2010 /* Failed to acquire lock; sleep if happens a lot. */ 2011 if(lockfail && ++nfail > 100){ 2012 sleep(500); 2013 nfail = 0; 2014 } 2015 /* Requeue block. */ 2016 if(c->bw < c->be) 2017 c->baddr[c->bw++] = *p; 2018 } 2019 } 2020 2021 /* 2022 * Occasionally flush dirty blocks from memory to the disk. 2023 */ 2024 static void 2025 flushThread(void *a) 2026 { 2027 Cache *c = a; 2028 int i; 2029 2030 vtThreadSetName("flush"); 2031 vtLock(c->lk); 2032 while(c->die == nil){ 2033 vtSleep(c->flush); 2034 vtUnlock(c->lk); 2035 for(i=0; i<FlushSize; i++) 2036 if(!cacheFlushBlock(c)){ 2037 /* 2038 * If i==0, could be someone is waking us repeatedly 2039 * to flush the cache but there's no work to do. 2040 * Pause a little. 2041 */ 2042 if(i==0){ 2043 // fprint(2, "flushthread found nothing to flush - %d dirty\n", c->ndirty); 2044 sleep(250); 2045 } 2046 break; 2047 } 2048 if(i==0 && c->ndirty){ 2049 /* 2050 * All the blocks are being written right now -- there's nothing to do. 2051 * We might be spinning with cacheFlush though -- he'll just keep 2052 * kicking us until c->ndirty goes down. Probably we should sleep 2053 * on something that the diskThread can kick, but for now we'll 2054 * just pause for a little while waiting for disks to finish. 2055 */ 2056 sleep(100); 2057 } 2058 vtLock(c->lk); 2059 vtWakeupAll(c->flushwait); 2060 } 2061 c->ref--; 2062 vtWakeup(c->die); 2063 vtUnlock(c->lk); 2064 } 2065 2066 /* 2067 * Flush the cache. 2068 */ 2069 void 2070 cacheFlush(Cache *c, int wait) 2071 { 2072 /* 2073 * Lock c->dirtylk so that more blocks aren't being dirtied 2074 * while we try to write out what's already here. 2075 * Otherwise we might not ever finish! 2076 */ 2077 vtLock(c->dirtylk); 2078 vtLock(c->lk); 2079 if(wait){ 2080 while(c->ndirty){ 2081 // consPrint("cacheFlush: %d dirty blocks, uhead %p\n", 2082 // c->ndirty, c->uhead); 2083 vtWakeup(c->flush); 2084 vtSleep(c->flushwait); 2085 } 2086 // consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead); 2087 }else if(c->ndirty) 2088 vtWakeup(c->flush); 2089 vtUnlock(c->lk); 2090 vtUnlock(c->dirtylk); 2091 } 2092 2093 /* 2094 * Kick the flushThread every 30 seconds. 2095 */ 2096 static void 2097 cacheSync(void *v) 2098 { 2099 Cache *c; 2100 2101 c = v; 2102 cacheFlush(c, 0); 2103 } 2104