#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;		/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;		/* ticks for usage timestamps */
	Block	**heads;	/* hash table for finding address */
	int	nheap;		/* number of available victims */
	Block	**heap;		/* heap for locating victims */
	long	nblocks;	/* number of blocks allocated */
	Block	*blocks;	/* array of block descriptors */
	u8int	*mem;		/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;		/* number of dirty blocks in the cache */
	int	maxdirty;	/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;		/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;

	/* block counts */
	int	nused;
	int	ndisk;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	/* for roll back */
	int	index;		/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;		/* last block allocated */
	u32int	end;		/* end of data partition */
	u32int	nfree;		/* number of free blocks */
	u32int	nused;		/* number of used blocks */
	u32int	epochLow;	/* low epoch when last updated nfree and nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static int readLabel(Cache*, Label*, u32int addr);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtPointerType0,		/* BtData | 1  */
	VtPointerType1,		/* BtData | 2  */
	VtPointerType2,		/* BtData | 3  */
	VtPointerType3,		/* BtData | 4  */
	VtPointerType4,		/* BtData | 5  */
	VtPointerType5,		/* BtData | 6  */
	VtPointerType6,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtPointerType0,		/* BtDir | 1  */
	VtPointerType1,		/* BtDir | 2  */
	VtPointerType2,		/* BtDir | 3  */
	VtPointerType3,		/* BtDir | 4  */
	VtPointerType4,		/* BtDir | 5  */
	VtPointerType5,		/* BtDir | 6  */
	VtPointerType6,		/* BtDir | 7  */
};
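
/*
 * For example, vtType[BtDir] is VtDirType and vtType[BtData+1] is
 * VtPointerType0: the first level of pointer blocks above either a
 * data or a directory block is written to Venti as VtPointerType0,
 * the second level as VtPointerType1, and so on.
 */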

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}
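
/*
 * Illustrative sketch only (disk, z and the block count here are
 * hypothetical, not taken from the rest of fossil): the usual
 * lifetime of a cache is
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);
 *	...use the cache...
 *	cacheFlush(c, 1);	wait for every dirty block to reach disk
 *	cacheFree(c);		flushes again, then tears everything down
 */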

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=0x%lux\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "cacheCheck: in used %d\n", refed);
}

/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	if(c->nheap == 0){
		while(c->nheap == 0){
			fprint(2, "entire cache is busy, %d dirty -- waking flush thread\n", c->ndirty);
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
		}
		fprint(2, "cache is okay again\n");
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}

	if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}
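
/*
 * The victim heap used by cacheCheck and cacheBumpBlock is a plain
 * binary heap stored in the c->heap array: the parent of slot i is
 * (i-1)>>1 and its children are 2*i+1 and 2*i+2.  cacheBumpBlock
 * always evicts the root, c->heap[0], the least recently used
 * unreferenced block; upHeap and downHeap, later in this file, keep
 * that shape intact.
 */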

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError(EVentiIO);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError(EIO);
			return nil;
		}
	}
	/* NOT REACHED */
}

static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}
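
/*
 * Illustrative sketch (hypothetical caller): revisiting a block that
 * was recorded earlier as a part/addr/vers triple, e.g. from a BAddr:
 *
 *	b = cacheLocalLookup(c, p->part, p->addr, p->vers);
 *	if(b == nil)
 *		the block was evicted or rewritten since vers was noted
 *
 * cacheFlushBlock, later in this file, uses the non-blocking form,
 * passing waitlock=0 so a block locked by someone else is skipped
 * instead of waited for.
 */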

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError(EIO);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}
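
/*
 * Illustrative sketch (addr, tag and epoch here are hypothetical):
 * fetching a pointer block whose label must agree with what its
 * parent claims about it:
 *
 *	b = cacheLocalData(c, addr, BtDir+1, tag, OReadWrite, epoch);
 *	if(b == nil)
 *		the label's type, tag or epoch did not match
 */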

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "cacheGlobal %V %d\n", score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(EIO);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError(EVentiIO);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError(EIO);
		return nil;
	}
	/* NOT REACHED */
}
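
/*
 * Illustrative sketch (hypothetical caller; score and tag come from a
 * parent pointer block or entry): loading a block named by its Venti
 * score:
 *
 *	b = cacheGlobal(c, score, BtDir, tag, OReadOnly);
 *
 * If the score is really a local address in disguise (globalToLocal
 * succeeds), the call is redirected to cacheLocalData above.
 */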

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx %R\n");
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				fl->last = 0;
				vtSetError("disk is full");
				fprint(2, "cacheAllocBlock: xxx1 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "cacheAllocBlock: xxx2 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx3 %R\n");
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab)){
		fprint(2, "cacheAllocBlock: xxx4 %R\n");
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}
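
/*
 * Illustrative sketch (hypothetical values): allocating a fresh data
 * block and reporting disk usage:
 *
 *	b = cacheAllocBlock(c, BtData, tag, epoch, epochLow);
 *	if(b == nil)
 *		the data partition is full, or a label read failed
 *
 *	cacheCountUsed(c, epochLow, &used, &total, &bsize);
 *
 * used and total are counted in blocks of bsize bytes; blocks whose
 * labels are BsClosed with epochClose <= epochLow count as free, the
 * same test cacheAllocBlock uses when it hunts for a reusable block.
 */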

/*
 * Copy on write.  Copied blocks have to be marked BaCopy.
 * See the big comment near blockRemoveLink.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	assert((b->l.state&BsClosed)==0 && b->l.epoch < ehi);
	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

//fprint(2, "alloc %lux copy %V\n", bb->addr, b->score);
	/*
	 * Change label on b to mark that we've copied it.
	 * This has to come after cacheAllocBlock, since we
	 * can't hold any label blocks (lb) while we try to
	 * fetch others (in cacheAllocBlock).
	 */
	if(!(b->l.state&BsCopied) && b->part==PartData){
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			/* ignore error: block gets lost on error */
			blockSetLabel(bb, &l);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	if(0){
		if(b->addr != NilBlock)
			fprint(2, "blockCopy %#ux/%ud => %#ux/%ud\n",
				b->addr, b->l.epoch, bb->addr, bb->l.epoch);
		else if(memcmp(b->score, vtZeroScore, VtScoreSize) != 0)
			fprint(2, "blockCopy %V => %#ux/%ud\n",
				b->score, bb->addr, bb->l.epoch);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}
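
/*
 * Illustrative sketch: a thread that holds b under two names takes an
 * extra reference and then releases each name separately:
 *
 *	blockDupLock(b);	nlock goes 1 -> 2
 *	...
 *	blockPut(b);		nlock goes 2 -> 1
 *	blockPut(b);		nlock reaches 0; the block is unlocked
 *				and, if no one else references it,
 *				returned to the victim heap
 */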

/*
 * we're deleting a block; delete all the blocks it points to
 * that are still active (i.e., not needed by snapshots).
 */
static void
blockCleanup(Block *b, u32int epoch)
{
	Cache *c;
	Block *bb;
	int i, n;
	Label l;
	u32int a;
	int type;
	int mode;

	type = b->l.type;
	c = b->c;

	bwatchReset(b->score);

	blockSetIOState(b, BioClean);

	/* do not recursively free directories */
	if(type == BtData || type == BtDir)
		return;

	n = c->size / VtScoreSize;
	mode = OReadWrite;
	if(type-1 == BtData || type-1 == BtDir)
		mode = OOverWrite;
	for(i=0; i<n; i++){
		a = globalToLocal(b->data + i*VtScoreSize);
		if(a == NilBlock || !readLabel(c, &l, a))
			continue;
		if((l.state&BsClosed) || l.epoch != epoch)
			continue;
		bb = cacheLocalData(c, a, type-1, b->l.tag, mode, 0);
		if(bb == nil)
			continue;
		if((bb->l.state&BsClosed) || bb->l.epoch != epoch){
			fprint(2, "cleanupBlock: block %ud changed underfoot! expected %L got %L\n",
				a, &l, &bb->l);
			blockPut(bb);
			continue;
		}
		blockCleanup(bb, epoch);
		l.type = BtMax;
		l.epoch = epoch;
		l.epochClose = 0;
		l.state = BsFree;
		l.tag = 0;
		blockSetLabel(bb, &l);
		blockPut(bb);
	}
}

/*
 * We don't need the block at addr anymore for the active file system.
 * If we don't need it for the snapshots, remove it completely.
 * Epoch is the epoch during which we got rid of the block.
 * See blockRemoveLink for more.
 */
static int
unlinkBlock(Cache *c, u32int addr, int type, u32int tag, u32int epoch)
{
	Block *b;
	Label l;

	if(addr == NilBlock)
		return 1;

//fprint(2, "unlinkBlock %#ux\n", addr);
	b = cacheLocalData(c, addr, type, tag, OReadOnly, 0);
	if(b == nil)
		return 0;
	if(b->l.epoch > epoch){
		fprint(2, "unlinkBlock: strange epoch :%ud %ud\n", b->l.epoch, epoch);
		blockPut(b);
		return 0;
	}

	l = b->l;
	if((b->l.state&BsClosed)==0 && b->l.epoch==epoch){
		l.state = BsFree;
		l.type = BtMax;
		l.tag = 0;
		l.epoch = 0;
		l.epochClose = 0;
		blockCleanup(b, epoch);
	}else{
		l.state |= BsClosed;
		l.epochClose = epoch;
	}
	blockSetLabel(b, &l);
	blockPut(b);
	return 1;
}
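
/*
 * For example: if the unlinked block was allocated in the current
 * epoch and has not been closed (b.l.epoch == epoch, BsClosed clear),
 * no snapshot can see it, so unlinkBlock relabels it BsFree at once
 * (and, for pointer blocks, blockCleanup walks its children too).
 * If it was allocated in an earlier epoch, a snapshot may still point
 * at it, so it is only marked BsClosed with epochClose = epoch;
 * cacheAllocBlock will reuse it once the low epoch passes epochClose.
 */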

/*
 * try to allocate a BList so we can record that b must
 * be written out before some other block.
 * if can't find a BList, write b out instead and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	/*
	 * It's possible that when we marked b dirty, there were
	 * too many dirty blocks so we just wrote b there and then.
	 * So b might not be dirty.  If it's not, no need to worry
	 * about recording the write constraint.
	 *
	 * BlockRemoveLink depends on the fact that if blistAlloc
	 * returns non-nil, b really is dirty.
	 */
	if(b->iostate != BioDirty){
		assert(b->iostate == BioClean);
		return nil;
	}

	/*
	 * Easy: maybe there's a free list left.
	 */
	c = b->c;
	vtLock(c->lk);
	if(c->blfree){
	HaveBlFree:
		p = c->blfree;
		c->blfree = p->next;
		vtUnlock(c->lk);
		assert(b->iostate == BioDirty);
		return p;
	}
	vtUnlock(c->lk);

	/*
	 * No free BLists.  What are our options?
	 */

	/* Block has no priors? Just write it. */
	if(b->prior == nil){
		diskWriteAndWait(c->disk, b);
		return nil;
	}

	/*
	 * Wake the flush thread, which will hopefully free up
	 * some BLists for us.  We used to flush a block from
	 * our own prior list and reclaim that BList, but this is
	 * a no-no: some of the blocks on our prior list may
	 * be locked by our caller.  Or maybe their label blocks
	 * are locked by our caller.  In any event, it's too hard
	 * to make sure we can do I/O for ourselves.  Instead,
	 * we assume the flush thread will find something.
	 * (The flush thread never blocks waiting for a block,
	 * so it won't deadlock like we will.)
	 */
	vtLock(c->lk);
	while(c->blfree == nil){
		vtWakeup(c->flush);
		vtSleep(c->blrend);
	}
	goto HaveBlFree;
}

void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
	cacheFlush(c, 0);
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
			bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
	if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}
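
/*
 * Illustrative sketch (hypothetical caller): after storing the score
 * of child block bb into slot i of a pointer block b, record that bb
 * must reach disk first, passing the slot's previous score so the
 * block can be rolled back if pressed:
 *
 *	memmove(b->data + i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDependency(b, bb, i, oldscore, nil);
 *	blockDirty(b);
 *
 * For a BtDir block on PartData, the old Entry is passed as e instead
 * of a score.
 */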

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);

	return 1;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 *
 * The file system maintains the following invariants (i-iv checked by flchk):
 *
 * (i) b.e in [bb.e, bb.eClose)
 * (ii) if b.e==bb.e, then no other b' in e points at bb.
 * (iii) if !(b.state&Copied) and b.e==bb.e then no other b' points at bb.
 * (iv) if b is active then no other active b' points at bb.
 * (v) if b is a past life of b' then only one of b and b' is active
 *	(too hard to check)
 *
 * The file system initially satisfies these invariants, and we can verify that
 * the various file system operations maintain them.  See fossil.invariants.
 *
 * Condition (i) lets us reclaim blocks once the low epoch is greater
 * than epochClose.
 *
 * If the condition in (iii) is satisfied, then this is the only pointer to bb,
 * so bb can be reclaimed once b has been written to disk.  blockRemoveLink
 * checks !(b.state&Copied) as an optimization.  UnlinkBlock and blockCleanup
 * will check the conditions again for each block they consider.
 */
int
blockRemoveLink(Block *b, u32int addr, int type, u32int tag)
{
	BList *bl;
	BList *p, **pp;
	Cache *c;

	c = b->c;

	/* remove unlinked block from prior list */
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->part != PartData || p->addr != addr){
			pp = &p->next;
			continue;
		}
		*pp = p->next;
		blistFree(c, p);
	}

	/* if b has been copied, can't reclaim blocks it points at. */
	if(b->l.state & BsCopied)
		return 0;

	bl = blistAlloc(b);
	if(bl == nil)
		return unlinkBlock(b->c, addr, type, tag, b->l.epoch);

	/*
	 * Because bl != nil, we know b is dirty.
	 * (Linking b->uhead onto a clean block is
	 * counterproductive, since we only look at
	 * b->uhead when a block transitions from
	 * dirty to clean.)
	 */
	assert(b->iostate == BioDirty);

	bl->part = PartData;
	bl->addr = addr;
	bl->type = type;
	bl->tag = tag;
	bl->epoch = b->l.epoch;
	if(b->uhead == nil)
		b->uhead = bl;
	else
		b->utail->next = bl;
	b->utail = bl;
	bl->next = nil;
	return 1;
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(oldl.state == BsFree)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}
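
/*
 * For example: if a BtDir block has a single prior entry with
 * index 3, blockRollback copies the block into buf and then restores
 * bytes [3*VtEntrySize, 4*VtEntrySize) from the saved old entry, so
 * the copy that goes to disk never points at a child that has not
 * itself been written yet.
 */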

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
				bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

	ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}
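
/*
 * The dmap bitmap above is indexed by prior-list index: dependency
 * index i lives at bit i%8 of byte i/8, so for example index 11 is
 * bit 3 of dmap[1].  c->ndmap, computed in cacheAlloc as
 * (c->size/20 + 7)/8, provides one bit per possible score-sized slot
 * in a block.
 */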

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}
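
/*
 * upHeap and downHeap below order blocks by the unsigned difference
 * b->used - now.  Blocks touched recently (used just below c->now)
 * yield values near the top of the unsigned range, while long-idle
 * blocks yield smaller values, so the heap root is always the least
 * recently used block and the ordering survives c->now wrapping:
 * for example, with now = 2, a block stamped 0xffffffff just before
 * the wrap gives 0xfffffffd and still counts as newer than one
 * stamped thousands of ticks earlier.
 */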

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
static int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);

		if(!unlinkBlock(c, p->addr, p->type, p->tag, p->epoch))
			fprint(2, "unlinkBlock failed: addr=%x type=%d tag = %ux: %r\n",
				p->addr, p->type, p->tag);

		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;

	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "ndirty mismatch expected %d found %d\n",
			c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
	return 0;
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0)
					sleep(250);
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Keep flushing until everything is clean.
 */
void
cacheFlush(Cache *c, int wait)
{
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else
		vtWakeup(c->flush);
	vtUnlock(c->lk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}
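
/*
 * Illustrative sketch (hypothetical caller): forcing everything out
 * synchronously, e.g. before shutting down:
 *
 *	cacheFlush(c, 1);
 *
 * blocks until c->ndirty reaches zero.  With wait == 0 the call
 * merely kicks the flush thread, which is all cacheSync above needs
 * every 30 seconds.
 */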