#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock *lk;
	int ref;
	int mode;

	Disk *disk;
	int size;			/* block size */
	int ndmap;			/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int now;			/* ticks for usage timestamps */
	Block **heads;			/* hash table for finding address */
	int nheap;			/* number of available victims */
	Block **heap;			/* heap for locating victims */
	long nblocks;			/* number of blocks allocated */
	Block *blocks;			/* array of block descriptors */
	u8int *mem;			/* memory for all block data & blists */

	BList *blfree;
	VtRendez *blrend;

	int ndirty;			/* number of dirty blocks in the cache */
	int maxdirty;			/* max number of dirty blocks */
	u32int vers;

	long hashSize;

	FreeList *fl;

	VtRendez *die;			/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr *baddr;
	int bw, br, be;
	int nflush;

	Periodic *sync;

	/* unlink daemon */
	BList *uhead;
	BList *utail;
	VtRendez *unlink;

	/* block counts */
	int nused;
	int ndisk;
};

struct BList {
	int part;
	u32int addr;
	uchar type;
	u32int tag;
	u32int epoch;
	u32int vers;

	int recurse;			/* for block unlink */

	/* for roll back */
	int index;			/* -1 indicates not valid */
	union {
		uchar score[VtScoreSize];
		uchar entry[VtEntrySize];
	} old;
	BList *next;
};

struct BAddr {
	int part;
	u32int addr;
	u32int vers;
};

struct FreeList {
	VtLock *lk;
	u32int last;			/* last block allocated */
	u32int end;			/* end of data partition */
	u32int nused;			/* number of used blocks */
	u32int epochLow;		/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtPointerType0,		/* BtData | 1  */
	VtPointerType1,		/* BtData | 2  */
	VtPointerType2,		/* BtData | 3  */
	VtPointerType3,		/* BtData | 4  */
	VtPointerType4,		/* BtData | 5  */
	VtPointerType5,		/* BtData | 6  */
	VtPointerType6,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtPointerType0,		/* BtDir | 1  */
	VtPointerType1,		/* BtDir | 2  */
	VtPointerType2,		/* BtDir | 3  */
	VtPointerType3,		/* BtDir | 4  */
	VtPointerType4,		/* BtDir | 5  */
	VtPointerType5,		/* BtDir | 6  */
	VtPointerType6,		/* BtDir | 7  */
};

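/*
 * Note, not in the original source: the local block type encodes the
 * height of a block in its tree.  BtData|0 holds file data and BtDir|0
 * holds packed directory entries; BtData|n and BtDir|n for n > 0 hold
 * Venti scores pointing at blocks one level closer to the leaves, which
 * is why both map to VtPointerType(n-1) in the table above.
 */
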
/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}

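/*
 * Illustrative sketch, not part of the original source: a caller (for
 * example the file system open path) is expected to pair these calls
 * roughly as follows, where disk and z are assumptions for illustration
 * (a Disk wrapping an open partition and an open VtSession):
 *
 *	c = cacheAlloc(disk, z, nblocks, OReadWrite);
 *	...cacheLocal, cacheGlobal, blockPut traffic on c...
 *	cacheFree(c);	flushes all dirty blocks, then frees everything
 */
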
static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

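/*
 * Note, not in the original source: b->prev is not a pointer to the
 * previous block but a pointer to whatever pointer points at b, either a
 * hash head in c->heads[] or the next field of the preceding block.  That
 * is why cacheBumpBlock can unchain b with *(b->prev) = b->next without
 * caring which of the two cases applies.
 */
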
/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, Waitlock, 0);
}

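/*
 * Note, not in the original source: a nil return means either that the
 * requested (part, addr) is not cached at all or that it is cached with a
 * different version, which blockWrite takes to mean the dependency has
 * already reached disk.  Nowaitlock callers such as cacheFlushBlock use
 * *lockfailure to tell "somebody else holds the block" apart from "the
 * block is gone".
 */
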
/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioLabel:
			if(mode == OOverWrite)
				/*
				 * leave iostate as BioLabel because data
				 * hasn't been read.
				 */
				return b;
			/* fall through */
		case BioEmpty:
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

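/*
 * Illustrative sketch, not from the original source: a caller walking the
 * tree typically loads a child through cacheLocalData so the label's type
 * and tag are checked against what the parent pointer expects, roughly
 *
 *	b = cacheLocalData(c, addr, BtDir, tag, OReadOnly, epoch);
 *	if(b == nil)
 *		return 0;
 *	...examine b->data...
 *	blockPut(b);
 *
 * The exact arguments here are assumptions for illustration only.
 */
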
/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
				"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

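/*
 * Note, not in the original source: local disk addresses can travel
 * disguised as Venti scores (see localToGlobal in _cacheLocal and the
 * globalToLocal check above), so cacheGlobal first asks whether the score
 * is really a local address and, if so, hands the request to
 * cacheLocalData.  Only genuine Venti scores are hashed on their first
 * four bytes and fetched with vtRead.
 */
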
/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				vtSetError("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
						argv0);
				fl->last = 0;
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

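/*
 * Note, not in the original source: there is no separate free list on
 * disk; cacheAllocBlock scans the label partition itself.  With
 * n = c->size/LabelSize labels per label block, the label for data block
 * addr lives in label block addr/n at index addr%n, the same arithmetic
 * used by _blockSetLabel and readLabel below.  A block is allocatable if
 * its label is BsFree, or if it is closed and no snapshot can still reach
 * it (epochClose <= epochLow, or epoch == epochClose).
 */
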
int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

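/*
 * Note, not in the original source: when the last reference is dropped, a
 * Clean block is reinserted with the newest timestamp (least eligible for
 * eviction), while Empty and Label blocks inherit the oldest timestamp in
 * the heap so they are reclaimed first; Dirty blocks stay off the heap
 * entirely and cannot be chosen by cacheBumpBlock until they have been
 * written out.
 */
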
/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean || b->iostate == BioLabel);

	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

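/*
 * Note, not in the original source: the prior list only records rollbacks
 * for fixed-size slots, so blockRollback can patch buf in place: the
 * superblock's active-root address at index 0, a VtEntrySize entry at
 * p->index in a BtDir block, or a VtScoreSize score at p->index in a
 * pointer block.  Anything else (index == -1) cannot be rolled back, and
 * blockWrite refuses to write such a block until the dependency clears.
 */
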
/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b, int waitlock)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
			&lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)	/* index < 0 entries have no slot in dmap */
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean){
				blockPut(bb);	/* drop the reference taken by the lookup */
				goto ignblock;
			}
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

	ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

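/*
 * Note, not from the original source: dmap is scratch space sized in
 * cacheAlloc as (c->size/20 + 7)/8 bytes, one bit per possible pointer
 * slot (a 20-byte score is the smallest slot).  Because blockDependency
 * prepends to b->prior, the list runs newest first; once the newest
 * dependency for a slot is known to be on disk, its bit is set and any
 * older dependency on the same slot is discarded rather than rolled back
 * over the newer value.
 */
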
/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

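/*
 * Informal summary, not from the original source, of the transitions the
 * switch above implements: Empty -> Label when only the label has been
 * read (_cacheLocal); Empty or Label -> Reading -> Clean or ReadError for
 * disk reads; Clean or Label -> Dirty in blockDirty; Dirty -> Writing ->
 * Clean when current data was written, or back to Dirty (version bumped)
 * when a rolled-back copy was written; Empty -> Clean or VentiError in
 * cacheGlobal.  ndirty rises when a block enters Dirty and falls only when
 * it reaches Clean, so Writing blocks still count as dirty (see flushFill).
 */
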
/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	if(b->part == PartSuper && b->iostate == BioClean)
		p = nil;
	else
		p = blistAlloc(b);
	if(p == nil){
		/*
		 * b has already been written to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

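/*
 * Note, not from the original source: the unlink record is built in bl on
 * the stack and only copied into a BList once blistAlloc succeeds.  If no
 * BList can be had (blistAlloc wrote b out and returned nil), or if the
 * superblock is already clean, the unlink is processed immediately by
 * doRemoveLink, since b is already on disk.  Otherwise the record rides on
 * b->uhead and is moved to the cache-wide queue only when b reaches
 * BioClean in blockSetIOState, which is exactly the "after p is written"
 * ordering that step (2) above requires.
 */
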
/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		vtLock(c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		vtUnlock(c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

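/*
 * Note, not from the original source: a block whose epochClose equals its
 * epoch was created and closed within the same epoch, so no snapshot can
 * ever reference it; cacheAllocBlock treats such labels as free, and the
 * in-memory used count is decremented here under fl->lk (when the epoch is
 * the current low epoch) to keep cacheCountUsed's cached total honest.
 */
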
/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

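/*
 * Note, not from the original source: the heap is keyed on b->used - now
 * in unsigned arithmetic.  Since c->now only moves forward, that value is
 * 2^32 minus the block's age, so older blocks get smaller keys and sit
 * nearer the root even if the 32-bit tick counter has wrapped.  For
 * example, with now = 5 a block stamped at 0xfffffffe (seven ticks ago,
 * across the wrap) correctly sorts as older than one stamped at 3 (two
 * ticks ago).  cacheCheck verifies the same ordering.
 */
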
/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

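/*
 * Note, not from the original source: flushFill snapshots (part, addr,
 * vers) triples rather than Block pointers, so the flush thread can work
 * without holding c->lk and simply skips entries whose block has since
 * been rewritten or evicted (the version lookup in cacheFlushBlock fails).
 * Sorting with baddrCmp groups the writes by partition and ascending
 * address, which should keep the disk queue roughly sequential.
 */
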
/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
			&lockfail);

		if(b && blockWrite(b, Nowaitlock)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
			// consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
			//	c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
		// consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}