#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	VtLock	*dirtylk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	int	recurse;	/* for block unlink */

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nfree;		/* number of free blocks */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nfree and nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);
static void doRemoveLinkList(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtPointerType0,		/* BtData | 1  */
	VtPointerType1,		/* BtData | 2  */
	VtPointerType2,		/* BtData | 3  */
	VtPointerType3,		/* BtData | 4  */
	VtPointerType4,		/* BtData | 5  */
	VtPointerType5,		/* BtData | 6  */
	VtPointerType6,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtPointerType0,		/* BtDir | 1  */
	VtPointerType1,		/* BtDir | 2  */
	VtPointerType2,		/* BtDir | 3  */
	VtPointerType3,		/* BtDir | 4  */
	VtPointerType4,		/* BtDir | 5  */
	VtPointerType5,		/* BtDir | 6  */
	VtPointerType6,		/* BtDir | 7  */
};
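
/*
 * For illustration (hypothetical caller, not used in this file): the table
 * above is indexed by the local block type, where BtData|k and BtDir|k
 * (k > 0) are the k-th level of pointer blocks above data and directory
 * blocks, so
 *
 *	n = vtRead(z, score, vtType[l.type], buf, size);
 *
 * reads a BtData|2 block as VtPointerType1, and so on.
 */
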
/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->dirtylk = vtLockAlloc();	/* allowed to dirty blocks */
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}
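
/*
 * Typical cache lifetime, sketched for illustration only (the Disk and
 * VtSession are assumed to have been set up elsewhere, as fsOpen does):
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	... cacheLocal / cacheGlobal / cacheAllocBlock ...
 *	cacheFlush(c, 1);	// wait until everything is on disk
 *	cacheFree(c);		// flushes again, then tears the cache down
 */
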
static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "cacheCheck: in use %d\n", refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
			}
		}
		if(printed)
			fprint(2, "cache is okay again, %d dirty\n", c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}
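
/*
 * Note, for illustration: a block sits on the heap only while it is
 * unreferenced and not dirty, so an empty heap means every block is either
 * locked by some thread or waiting to be written; that is why this routine
 * wakes the flush thread before sleeping on heapwait.  cacheLocal and
 * cacheGlobal below call cacheBumpBlock when a lookup misses and then
 * repurpose the victim for the new address.
 */
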
/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("i/o error block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}
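
/*
 * For illustration: the version argument makes a lookup fail once the block
 * has been rewritten, which is how blockWrite and cacheFlushBlock below
 * decide that a dependency has already reached disk:
 *
 *	bb = cacheLocalLookup(c, p->part, p->addr, p->vers);
 *	if(bb == nil)
 *		... either not cached at all, or cached with a newer vers;
 *		    the version we depended on is already on disk ...
 */
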
/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("i/o error block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "cacheGlobal %V %d\n", score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError("venti i/o error block %V: %r", score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("i/o error block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}
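
/*
 * For illustration (hypothetical caller): the file tree code fetches blocks
 * by score, and cacheGlobal quietly turns a "fake" score that encodes a
 * local address back into a cacheLocalData call:
 *
 *	b = cacheGlobal(c, e.score, BtDir, e.tag, mode);
 *	if(b == nil)
 *		return nil;	// vtSetError already describes the failure
 *	... use b->data ...
 *	blockPut(b);
 */
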
/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx %R\n");
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				fl->last = 0;
				vtSetError("disk is full");
				fprint(2, "cacheAllocBlock: xxx1 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "cacheAllocBlock: xxx2 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx3 %R\n");
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "cacheAllocBlock: xxx4 %R\n");
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}
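
/*
 * For illustration (hypothetical caller; the epoch values come from the Fs):
 * a block is reusable once its label says it is free, or closed in an epoch
 * that no surviving snapshot can still reach.  Allocating a fresh pointer
 * block looks roughly like
 *
 *	b = cacheAllocBlock(c, BtDir|1, tag, ehi, elo);
 *	if(b == nil)
 *		return nil;	// e.g. vtSetError("disk is full")
 *	... fill b->data, blockDirty(b), record dependencies ...
 *	blockPut(b);
 */
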
/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (lb)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}
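
/*
 * For illustration: nlock counts the locking thread's own references, so a
 * routine that hands the same locked block to a helper (hypothetical) under
 * a second name balances it like this:
 *
 *	blockDupLock(b);	// nlock is now 2
 *	helper(b);		// helper calls blockPut when it is done
 *	...
 *	blockPut(b);		// drops the caller's own reference
 *
 * Only the final blockPut releases b->lk and returns b to the heap.
 */
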
/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
			bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->dirtylk);
	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);

	return 1;
}
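
/*
 * For illustration (hypothetical caller): the usual pattern when a pointer
 * block's entry i is redirected to a freshly dirtied child; oldscore is
 * whatever the caller is about to overwrite (directory blocks pass the old
 * Entry instead of a score):
 *
 *	blockDirty(child);
 *	blockDependency(parent, child, i, oldscore, nil);	// parent after child
 *	memmove(parent->data + i*VtScoreSize, child->score, VtScoreSize);
 *	blockDirty(parent);
 *
 * If the child reaches disk first, the dependency is simply dropped; if the
 * parent must be written first, blockRollback below writes it with oldscore
 * put back.
 */
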
/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
				bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}
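
/*
 * Rough summary of the common transitions, for illustration (the Writing
 * transitions are driven by diskThread):
 *
 *	BioEmpty -> BioLabel		only the label was read in
 *	BioEmpty/BioLabel -> BioClean	data read, or OOverWrite skipped the read
 *	BioClean -> BioDirty		blockDirty
 *	BioDirty -> BioWriting		queued for writing by diskWrite
 *	BioWriting -> BioClean		the current version reached disk
 *	BioWriting -> BioDirty		only a rolled-back copy was written
 */
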
/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "blockCopy %#ux %L but fs is [%ud,%ud]\n",
			b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}
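
/*
 * For illustration (hypothetical caller): the full copy-on-write sequence
 * described above, as performed by a caller holding the parent p and the old
 * child b, with i and oldscore being the caller's own bookkeeping:
 *
 *	addr = b->addr; type = b->l.type; tag = b->l.tag;
 *	bb = blockCopy(b, tag, ehi, elo);		// consumes b; records "bb after lbb"
 *	blockDependency(p, bb, i, oldscore, nil);	// p after bb
 *	memmove(p->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(p);
 *	blockRemoveLink(p, addr, type, tag, 0);		// close b once p reaches disk
 *	blockPut(bb);
 */
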
/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	p = blistAlloc(b);
	if(p == nil){
		/*
		 * We were out of blists so blistAlloc wrote b to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "doRemoveLink: strange epoch %ud > %ud\n", b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "warning: lost block in doRemoveLink\n");
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "blistAlloc: called on clean block\n");
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "flushing for blists\n");
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}
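
/*
 * Note, for illustration: the heap is ordered by the unsigned difference
 * b->used - c->now rather than by b->used alone, so the ordering survives
 * wraparound of the 32-bit tick counter.  Just after a wrap, with
 * now = 0x00000005, a block stamped before the wrap (used = 0xfffffff0)
 * gives 0xffffffeb while a recently used block (used = 0x00000003) gives
 * 0xfffffffe; the pre-wrap block is still the smaller value and so is still
 * chosen as the older victim.
 */
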
/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "ndirty mismatch expected %d found %d\n",
			c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
	return 0;
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					fprint(2, "flushthread found nothing to flush - %d dirty\n", c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	/*
	 * Lock c->dirtylk so that more blocks aren't being dirtied
	 * while we try to write out what's already here.
	 * Otherwise we might not ever finish!
	 */
	vtLock(c->dirtylk);
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
	vtUnlock(c->dirtylk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}
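
/*
 * For illustration: callers that need a durability point (e.g. fsSync)
 * use the blocking form, while the periodic timer above only nudges the
 * flush thread:
 *
 *	cacheFlush(c, 1);	// returns once ndirty reaches zero
 *	cacheFlush(c, 0);	// what cacheSync does every 30 seconds
 */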