#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	int	ref;
	int	mode;

	Disk	*disk;
	int	size;			/* block size */
	int	ndmap;			/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;			/* ticks for usage timestamps */
	Block	**heads;		/* hash table for finding address */
	int	nheap;			/* number of available victims */
	Block	**heap;			/* heap for locating victims */
	long	nblocks;		/* number of blocks allocated */
	Block	*blocks;		/* array of block descriptors */
	u8int	*mem;			/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int	ndirty;			/* number of dirty blocks in the cache */
	int	maxdirty;		/* max number of dirty blocks */
	u32int	vers;

	long	hashSize;

	FreeList *fl;

	VtRendez *die;			/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	BAddr	*baddr;
	int	bw, br, be;
	int	nflush;

	Periodic *sync;

	/* unlink daemon */
	BList	*uhead;
	BList	*utail;
	VtRendez *unlink;
};

struct BList {
	int	part;
	u32int	addr;
	uchar	type;
	u32int	tag;
	u32int	epoch;
	u32int	vers;

	/* for roll back */
	int	index;			/* -1 indicates not valid */
	union {
		uchar	score[VtScoreSize];
		uchar	entry[VtEntrySize];
	} old;
	BList	*next;
};

struct BAddr {
	int	part;
	u32int	addr;
	u32int	vers;
};

struct FreeList {
	VtLock	*lk;
	u32int	last;			/* last block allocated */
	u32int	end;			/* end of data partition */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static int readLabel(Cache*, Label*, u32int addr);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void flushBody(Cache *c);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtPointerType0,		/* BtData | 1  */
	VtPointerType1,		/* BtData | 2  */
	VtPointerType2,		/* BtData | 3  */
	VtPointerType3,		/* BtData | 4  */
	VtPointerType4,		/* BtData | 5  */
	VtPointerType5,		/* BtData | 6  */
	VtPointerType6,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtPointerType0,		/* BtDir | 1  */
	VtPointerType1,		/* BtDir | 2  */
	VtPointerType2,		/* BtDir | 3  */
	VtPointerType3,		/* BtDir | 4  */
	VtPointerType4,		/* BtDir | 5  */
	VtPointerType5,		/* BtDir | 6  */
	VtPointerType6,		/* BtDir | 7  */
};

/*
 * Allocate the memory cache.
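 *
 * The single arena c->mem provides the block data buffers, the per-block
 * dirty maps and the BList pool; the hash table (c->heads), the heap and
 * the block descriptors are allocated separately.  A caller that already
 * has a Disk and a VtSession would use it roughly like this (the argument
 * values are illustrative only):
 *
 *	c = cacheAlloc(disk, z, 1000, OReadWrite);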
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
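 *
 * The shutdown order below matters: first wake the unlink and flush
 * daemons and wait for them to exit (c->ref drops back to 1), then keep
 * draining the unlink queue and writing dirty blocks until both c->uhead
 * and c->ndirty are empty, and only then free the structures.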
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "p=%d a=%ud %V t=%d ref=%d state=%s io=%s\n",
			b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate));
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "cacheCheck: in used %d\n", refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
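	 *
	 * A block is on the heap only when its reference count is zero and
	 * it is not dirty, so the victim found here can be reused at once.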
	 */
	if(c->nheap == 0)
		vtFatal("cacheBumpBlock: no free blocks in cache");
	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate == BioEmpty || b->iostate == BioLabel || b->iostate == BioClean);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}

	if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError(EIO);
			return nil;
		}
	}
	/* NOT REACHED */
}

static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
501 * 502 * For now, I'm not going to worry about it. 503 */ 504 505 if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr); 506 bwatchLock(b); 507 vtLock(b->lk); 508 b->nlock = 1; 509 510 if(part == PartData && b->iostate == BioEmpty){ 511 if(!readLabel(c, &b->l, addr)){ 512 blockPut(b); 513 return nil; 514 } 515 blockSetIOState(b, BioLabel); 516 } 517 if(epoch && b->l.epoch != epoch){ 518 blockPut(b); 519 fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch); 520 vtSetError(ELabelMismatch); 521 return nil; 522 } 523 524 b->pc = getcallerpc(&c); 525 for(;;){ 526 switch(b->iostate){ 527 default: 528 abort(); 529 case BioEmpty: 530 case BioLabel: 531 if(mode == OOverWrite){ 532 blockSetIOState(b, BioClean); 533 return b; 534 } 535 diskRead(c->disk, b); 536 vtSleep(b->ioready); 537 break; 538 case BioClean: 539 case BioDirty: 540 return b; 541 case BioReading: 542 case BioWriting: 543 vtSleep(b->ioready); 544 break; 545 case BioReadError: 546 blockSetIOState(b, BioEmpty); 547 blockPut(b); 548 vtSetError(EIO); 549 return nil; 550 } 551 } 552 /* NOT REACHED */ 553 } 554 555 Block * 556 cacheLocal(Cache *c, int part, u32int addr, int mode) 557 { 558 return _cacheLocal(c, part, addr, mode, 0); 559 } 560 561 /* 562 * fetch a local (on-disk) block from the memory cache. 563 * if it's not there, load it, bumping some other block. 564 * check tag and type. 565 */ 566 Block * 567 cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch) 568 { 569 Block *b; 570 571 b = _cacheLocal(c, PartData, addr, mode, epoch); 572 if(b == nil) 573 return nil; 574 if(b->l.type != type || b->l.tag != tag){ 575 fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %x exp %x\n", 576 addr, b->l.type, type, b->l.tag, tag); 577 abort(); 578 vtSetError(ELabelMismatch); 579 blockPut(b); 580 return nil; 581 } 582 b->pc = getcallerpc(&c); 583 return b; 584 } 585 586 /* 587 * fetch a global (Venti) block from the memory cache. 588 * if it's not there, load it, bumping some other block. 589 * check tag and type if it's really a local block in disguise. 
590 */ 591 Block * 592 cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode) 593 { 594 int n; 595 Block *b; 596 ulong h; 597 u32int addr; 598 599 addr = globalToLocal(score); 600 if(addr != NilBlock){ 601 b = cacheLocalData(c, addr, type, tag, mode, 0); 602 if(b) 603 b->pc = getcallerpc(&c); 604 return b; 605 } 606 607 h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize; 608 609 /* 610 * look for the block in the cache 611 */ 612 vtLock(c->lk); 613 for(b = c->heads[h]; b != nil; b = b->next){ 614 if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type) 615 continue; 616 heapDel(b); 617 b->ref++; 618 break; 619 } 620 621 if(b == nil){ 622 if(0)fprint(2, "cacheGlobal %V %d\n", score, type); 623 624 b = cacheBumpBlock(c); 625 626 b->part = PartVenti; 627 b->addr = NilBlock; 628 b->l.type = type; 629 memmove(b->score, score, VtScoreSize); 630 631 /* chain onto correct hash */ 632 b->next = c->heads[h]; 633 c->heads[h] = b; 634 if(b->next != nil) 635 b->next->prev = &b->next; 636 b->prev = &c->heads[h]; 637 } 638 vtUnlock(c->lk); 639 640 bwatchLock(b); 641 vtLock(b->lk); 642 b->nlock = 1; 643 b->pc = getcallerpc(&c); 644 645 switch(b->iostate){ 646 default: 647 abort(); 648 case BioEmpty: 649 n = vtRead(c->z, score, vtType[type], b->data, c->size); 650 if(n < 0 || !vtSha1Check(score, b->data, n)){ 651 blockSetIOState(b, BioVentiError); 652 blockPut(b); 653 return nil; 654 } 655 vtZeroExtend(vtType[type], b->data, n, c->size); 656 blockSetIOState(b, BioClean); 657 return b; 658 case BioClean: 659 return b; 660 case BioVentiError: 661 case BioReadError: 662 blockPut(b); 663 vtSetError(EIO); 664 blockSetIOState(b, BioEmpty); 665 return nil; 666 } 667 /* NOT REACHED */ 668 } 669 670 /* 671 * allocate a new on-disk block and load it into the memory cache. 672 * BUG: if the disk is full, should we flush some of it to Venti? 
673 */ 674 static u32int lastAlloc; 675 676 Block * 677 cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow) 678 { 679 FreeList *fl; 680 u32int addr; 681 Block *b; 682 int n, nwrap; 683 Label lab; 684 685 n = c->size / LabelSize; 686 fl = c->fl; 687 688 vtLock(fl->lk); 689 fl->last = 0; 690 addr = fl->last; 691 b = cacheLocal(c, PartLabel, addr/n, OReadOnly); 692 if(b == nil){ 693 fprint(2, "cacheAllocBlock: xxx %R\n"); 694 vtUnlock(fl->lk); 695 return nil; 696 } 697 nwrap = 0; 698 for(;;){ 699 if(++addr >= fl->end){ 700 addr = 0; 701 fprint(2, "cacheAllocBlock wrap %d\n", fl->end); 702 if(++nwrap >= 2){ 703 blockPut(b); 704 fl->last = 0; 705 vtSetError("disk is full"); 706 fprint(2, "cacheAllocBlock: xxx1 %R\n"); 707 vtUnlock(fl->lk); 708 return nil; 709 } 710 } 711 if(addr%n == 0){ 712 blockPut(b); 713 b = cacheLocal(c, PartLabel, addr/n, OReadOnly); 714 if(b == nil){ 715 fl->last = addr; 716 fprint(2, "cacheAllocBlock: xxx2 %R\n"); 717 vtUnlock(fl->lk); 718 return nil; 719 } 720 } 721 if(!labelUnpack(&lab, b->data, addr%n)) 722 continue; 723 if(lab.state == BsFree) 724 goto Found; 725 if((lab.state&BsClosed) && lab.epochClose <= epochLow) 726 goto Found; 727 } 728 Found: 729 blockPut(b); 730 b = cacheLocal(c, PartData, addr, OOverWrite); 731 if(b == nil){ 732 fprint(2, "cacheAllocBlock: xxx3 %R\n"); 733 return nil; 734 } 735 assert(b->iostate == BioLabel || b->iostate == BioClean); 736 fl->last = addr; 737 lab.type = type; 738 lab.tag = tag; 739 lab.state = BsAlloc; 740 lab.epoch = epoch; 741 lab.epochClose = ~(u32int)0; 742 if(!blockSetLabel(b, &lab)){ 743 fprint(2, "cacheAllocBlock: xxx4 %R\n"); 744 blockPut(b); 745 return nil; 746 } 747 vtZeroExtend(vtType[type], b->data, 0, c->size); 748 if(0)diskWrite(c->disk, b); 749 750 if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag); 751 lastAlloc = addr; 752 vtUnlock(fl->lk); 753 b->pc = getcallerpc(&c); 754 return b; 755 } 756 757 static FreeList * 758 flAlloc(u32int end) 759 { 760 FreeList *fl; 761 762 fl = vtMemAllocZ(sizeof(*fl)); 763 fl->lk = vtLockAlloc(); 764 fl->last = end; 765 fl->end = end; 766 return fl; 767 } 768 769 static void 770 flFree(FreeList *fl) 771 { 772 vtLockFree(fl->lk); 773 vtMemFree(fl); 774 } 775 776 u32int 777 cacheLocalSize(Cache *c, int part) 778 { 779 return diskSize(c->disk, part); 780 } 781 782 /* 783 * Copy on write. Copied blocks have to be marked BaCopy. 784 * See the big comment near blockRemoveLink. 785 */ 786 Block* 787 blockCopy(Block *b, u32int tag, u32int ehi, u32int elo) 788 { 789 Block *bb, *lb; 790 Label l; 791 792 assert((b->l.state&BsClosed)==0 && b->l.epoch < ehi); 793 bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo); 794 if(bb == nil){ 795 blockPut(b); 796 return nil; 797 } 798 799 /* 800 * Change label on b to mark that we've copied it. 801 * This has to come after cacheAllocBlock, since we 802 * can't hold any labels blocks (lb) while we try to 803 * fetch others (in cacheAllocBlock). 
804 */ 805 if(!(b->l.state&BsCopied) && b->part==PartData){ 806 l = b->l; 807 l.state |= BsCopied; 808 lb = _blockSetLabel(b, &l); 809 if(lb == nil){ 810 /* can't set label => can't copy block */ 811 blockPut(b); 812 l.type = BtMax; 813 l.state = BsFree; 814 l.epoch = 0; 815 l.epochClose = 0; 816 l.tag = 0; 817 /* ignore error: block gets lost on error */ 818 blockSetLabel(bb, &l); 819 blockPut(bb); 820 return nil; 821 } 822 blockDependency(bb, lb, -1, nil, nil); 823 blockPut(lb); 824 } 825 826 if(0){ 827 if(b->addr != NilBlock) 828 fprint(2, "blockCopy %#ux/%ud => %#ux/%ud\n", 829 b->addr, b->l.epoch, bb->addr, bb->l.epoch); 830 else if(memcmp(b->score, vtZeroScore, VtScoreSize) != 0) 831 fprint(2, "blockCopy %V => %#ux/%ud\n", 832 b->score, bb->addr, bb->l.epoch); 833 } 834 835 memmove(bb->data, b->data, b->c->size); 836 blockDirty(bb); 837 blockPut(b); 838 return bb; 839 } 840 841 /* 842 * The thread that has locked b may refer to it by 843 * multiple names. Nlock counts the number of 844 * references the locking thread holds. It will call 845 * blockPut once per reference. 846 */ 847 void 848 blockDupLock(Block *b) 849 { 850 assert(b->nlock > 0); 851 b->nlock++; 852 } 853 854 /* 855 * we're done with the block. 856 * unlock it. can't use it after calling this. 857 */ 858 void 859 blockPut(Block* b) 860 { 861 Cache *c; 862 863 if(b == nil) 864 return; 865 866 if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, c->nheap, bioStr(b->iostate)); 867 868 if(b->iostate == BioDirty) 869 bwatchDependency(b); 870 871 if(--b->nlock > 0) 872 return; 873 874 /* 875 * b->nlock should probably stay at zero while 876 * the block is unlocked, but diskThread and vtSleep 877 * conspire to assume that they can just vtLock(b->lk); blockPut(b), 878 * so we have to keep b->nlock set to 1 even 879 * when the block is unlocked. 880 */ 881 assert(b->nlock == 0); 882 b->nlock = 1; 883 // b->pc = 0; 884 885 bwatchUnlock(b); 886 vtUnlock(b->lk); 887 c = b->c; 888 vtLock(c->lk); 889 890 if(--b->ref > 0){ 891 vtUnlock(c->lk); 892 return; 893 } 894 895 assert(b->ref == 0); 896 switch(b->iostate){ 897 default: 898 b->used = c->now++; 899 heapIns(b); 900 break; 901 case BioEmpty: 902 case BioLabel: 903 if(c->nheap == 0) 904 b->used = c->now++; 905 else 906 b->used = c->heap[0]->used; 907 heapIns(b); 908 break; 909 case BioDirty: 910 break; 911 } 912 vtUnlock(c->lk); 913 } 914 915 /* 916 * we're deleting a block; delete all the blocks it points to 917 * that are still active (i.e., not needed by snapshots). 918 */ 919 static void 920 blockCleanup(Block *b, u32int epoch) 921 { 922 Cache *c; 923 Block *bb; 924 int i, n; 925 Label l; 926 u32int a; 927 int type; 928 int mode; 929 930 type = b->l.type; 931 c = b->c; 932 933 bwatchReset(b->score); 934 935 blockSetIOState(b, BioClean); 936 937 /* do not recursively free directories */ 938 if(type == BtData || type == BtDir) 939 return; 940 941 n = c->size / VtScoreSize; 942 mode = OReadWrite; 943 if(type-1 == BtData || type-1 == BtDir) 944 mode = OOverWrite; 945 for(i=0; i<n; i++){ 946 a = globalToLocal(b->data + i*VtScoreSize); 947 if(a == NilBlock || !readLabel(c, &l, a)) 948 continue; 949 if((l.state&BsClosed) || l.epoch != epoch) 950 continue; 951 bb = cacheLocalData(c, a, type-1, b->l.tag, mode, 0); 952 if(bb == nil) 953 continue; 954 if((bb->l.state&BsClosed) || bb->l.epoch != epoch){ 955 fprint(2, "cleanupBlock: block %ud changed underfoot! 
				a, &l, &bb->l);
			blockPut(bb);
			continue;
		}
		blockCleanup(bb, epoch);
		l.type = BtMax;
		l.epoch = epoch;
		l.epochClose = 0;
		l.state = BsFree;
		l.tag = 0;
		blockSetLabel(bb, &l);
		blockPut(bb);
	}
}

/*
 * We don't need the block at addr anymore for the active file system.
 * If we don't need it for the snapshots, remove it completely.
 * Epoch is the epoch during which we got rid of the block.
 * See blockRemoveLink for more.
 */
static int
unlinkBlock(Cache *c, u32int addr, int type, u32int tag, u32int epoch)
{
	Block *b;
	Label l;

	if(addr == NilBlock)
		return 1;

//fprint(2, "unlinkBlock %#ux\n", addr);
	b = cacheLocalData(c, addr, type, tag, OReadOnly, 0);
	if(b == nil)
		return 0;
	if(b->l.epoch > epoch){
		fprint(2, "unlinkBlock: strange epoch :%ud %ud\n", b->l.epoch, epoch);
		blockPut(b);
		return 0;
	}

	l = b->l;
	if((b->l.state&BsClosed)==0 && b->l.epoch==epoch){
		l.state = BsFree;
		l.type = BtMax;
		l.tag = 0;
		l.epoch = 0;
		l.epochClose = 0;
		blockCleanup(b, epoch);
	}else{
		l.state |= BsClosed;
		l.epochClose = epoch;
	}
	blockSetLabel(b, &l);
	blockPut(b);
	return 1;
}

/*
 * try to allocate a BList so we can record that b must
 * be written out before some other block.
 * if can't find a BList, write b out instead and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	/*
	 * It's possible that when we marked b dirty, there were
	 * too many dirty blocks so we just wrote b there and then.
	 * So b might not be dirty.  If it's not, no need to worry
	 * about recording the write constraint.
	 *
	 * BlockRemoveLink depends on the fact that if blistAlloc
	 * returns non-nil, b really is dirty.
	 */
	if(b->iostate != BioDirty){
		assert(b->iostate == BioClean);
		return nil;
	}

	/*
	 * Easy: maybe there's a free BList left.
	 */
	c = b->c;
	vtLock(c->lk);
	if(c->blfree){
	HaveBlFree:
		p = c->blfree;
		c->blfree = p->next;
		vtUnlock(c->lk);
		return p;
	}
	vtUnlock(c->lk);

	/*
	 * No free BLists.  What are our options?
	 */

	/* Block has no priors? Just write it. */
	if(b->prior == nil){
		diskWrite(c->disk, b);
		while(b->iostate != BioClean)
			vtSleep(b->ioready);
		return nil;
	}

	/*
	 * Wake the flush thread, which will hopefully free up
	 * some BLists for us.  We used to flush a block from
	 * our own prior list and reclaim that BList, but this is
	 * a no-no: some of the blocks on our prior list may
	 * be locked by our caller.  Or maybe their label blocks
	 * are locked by our caller.  In any event, it's too hard
	 * to make sure we can do I/O for ourselves.  Instead,
	 * we assume the flush thread will find something.
	 * (The flush thread never blocks waiting for a block,
	 * so it won't deadlock like we will.)
	 */
	vtLock(c->lk);
	while(c->blfree == nil){
		vtWakeup(c->flush);
		vtSleep(c->blrend);
	}
	goto HaveBlFree;
}

void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

/*
 * Flush b or one of the blocks it depends on.
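 *
 * Rather than always writing b itself, walk b's prior list: if a block
 * that b depends on is still dirty in the cache, write that block
 * instead, so the ordering constraints recorded by blockDependency are
 * respected.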
1096 */ 1097 void 1098 blockFlush(Block *b) 1099 { 1100 int first, nlock; 1101 BList *p, **pp; 1102 Block *bb; 1103 Cache *c; 1104 1105 //fprint(2, "blockFlush %p\n", b); 1106 1107 c = b->c; 1108 1109 first = 1; 1110 pp = &b->prior; 1111 for(p=*pp; p; p=*pp){ 1112 bb = cacheLocalLookup(c, p->part, p->addr, p->vers); 1113 if(bb == nil){ 1114 *pp = p->next; 1115 blistFree(c, p); 1116 continue; 1117 } 1118 if(!first) 1119 blockPut(b); 1120 first = 0; 1121 b = bb; 1122 pp = &b->prior; 1123 } 1124 1125 /* 1126 * If b->nlock > 1, the block is aliased within 1127 * a single thread. That thread is us, and it's 1128 * the block that was passed in (rather than a prior). 1129 * DiskWrite does some funny stuff with VtLock 1130 * and blockPut that basically assumes b->nlock==1. 1131 * We humor diskWrite by temporarily setting 1132 * nlock to 1. This needs to be revisited. (TODO) 1133 */ 1134 nlock = b->nlock; 1135 if(nlock > 1){ 1136 assert(first); 1137 b->nlock = 1; 1138 } 1139 diskWrite(c->disk, b); 1140 while(b->iostate != BioClean) 1141 vtSleep(b->ioready); 1142 b->nlock = nlock; 1143 if(!first) 1144 blockPut(b); 1145 } 1146 1147 /* 1148 * Record that bb must be written out before b. 1149 * If index is given, we're about to overwrite the score/e 1150 * at that index in the block. Save the old value so we 1151 * can write a safer ``old'' version of the block if pressed. 1152 */ 1153 void 1154 blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e) 1155 { 1156 BList *p; 1157 1158 if(bb->iostate == BioClean) 1159 return; 1160 1161 /* 1162 * Dependencies for blocks containing Entry structures 1163 * or scores must always be explained. The problem with 1164 * only explaining some of them is this. Suppose we have two 1165 * dependencies for the same field, the first explained 1166 * and the second not. We try to write the block when the first 1167 * dependency is not written but the second is. We will roll back 1168 * the first change even though the second trumps it. 1169 */ 1170 if(index == -1 && bb->part == PartData) 1171 assert(b->l.type == BtData); 1172 1173 assert(bb->iostate == BioDirty); 1174 1175 p = blistAlloc(bb); 1176 if(p == nil) 1177 return; 1178 1179 if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type); 1180 1181 p->part = bb->part; 1182 p->addr = bb->addr; 1183 p->type = bb->l.type; 1184 p->vers = bb->vers; 1185 p->index = index; 1186 if(p->index >= 0){ 1187 /* 1188 * This test would just be b->l.type==BtDir except 1189 * we need to exclude the super block. 1190 */ 1191 if(b->l.type == BtDir && b->part == PartData) 1192 entryPack(e, p->old.entry, 0); 1193 else 1194 memmove(p->old.score, score, VtScoreSize); 1195 } 1196 p->next = b->prior; 1197 b->prior = p; 1198 } 1199 1200 /* 1201 * Mark an in-memory block as dirty. If there are too many 1202 * dirty blocks, start writing some out to disk. If there are way 1203 * too many dirty blocks, write this one out too. 1204 * 1205 * Note that since we might call blockFlush, which walks 1206 * the prior list, we can't call blockDirty while holding a lock 1207 * on any of our priors. This can be tested by recompiling 1208 * with flush always set to 1 below. 
1209 */ 1210 int 1211 blockDirty(Block *b) 1212 { 1213 Cache *c; 1214 int flush; 1215 1216 c = b->c; 1217 1218 assert(b->part != PartVenti); 1219 1220 if(b->iostate == BioDirty) 1221 return 1; 1222 assert(b->iostate == BioClean); 1223 1224 vtLock(c->lk); 1225 b->iostate = BioDirty; 1226 c->ndirty++; 1227 if(c->ndirty > (c->maxdirty>>1)) 1228 vtWakeup(c->flush); 1229 flush = c->ndirty > c->maxdirty; 1230 vtUnlock(c->lk); 1231 1232 if(flush) 1233 blockFlush(b); 1234 1235 return 1; 1236 } 1237 1238 /* 1239 * Block b once pointed at the block bb at addr/type/tag, but no longer does. 1240 * 1241 * The file system maintains the following invariants (i-iv checked by flchk): 1242 * 1243 * (i) b.e in [bb.e, bb.eClose) 1244 * (ii) if b.e==bb.e, then no other b' in e points at bb. 1245 * (iii) if !(b.state&Copied) and b.e==bb.e then no other b' points at bb. 1246 * (iv) if b is active then no other active b' points at bb. 1247 * (v) if b is a past life of b' then only one of b and b' is active (too hard to check) 1248 * 1249 * The file system initially satisfies these invariants, and we can verify that 1250 * the various file system operations maintain them. See fossil.invariants. 1251 * 1252 * Condition (i) lets us reclaim blocks once the low epoch is greater 1253 * than epochClose. 1254 * 1255 * If the condition in (iii) is satisfied, then this is the only pointer to bb, 1256 * so bb can be reclaimed once b has been written to disk. blockRemoveLink 1257 * checks !(b.state&Copied) as an optimization. UnlinkBlock and blockCleanup 1258 * will check the conditions again for each block they consider. 1259 */ 1260 int 1261 blockRemoveLink(Block *b, u32int addr, int type, u32int tag) 1262 { 1263 BList *bl; 1264 BList *p, **pp; 1265 Cache *c; 1266 1267 c = b->c; 1268 1269 /* remove unlinked block from prior list */ 1270 pp = &b->prior; 1271 for(p=*pp; p; p=*pp){ 1272 if(p->part != PartData || p->addr != addr){ 1273 pp = &p->next; 1274 continue; 1275 } 1276 *pp = p->next; 1277 blistFree(c, p); 1278 } 1279 1280 /* if b has been copied, can't reclaim blocks it points at. */ 1281 if(b->l.state & BsCopied) 1282 return 0; 1283 1284 bl = blistAlloc(b); 1285 if(bl == nil) 1286 return unlinkBlock(b->c, addr, type, tag, b->l.epoch); 1287 1288 /* 1289 * Because bl != nil, we know b is dirty. 1290 * (Linking b->uhead onto a clean block is 1291 * counterproductive, since we only look at 1292 * b->uhead when a block transitions from 1293 * dirty to clean.) 1294 */ 1295 assert(b->iostate == BioDirty); 1296 1297 bl->part = PartData; 1298 bl->addr = addr; 1299 bl->type = type; 1300 bl->tag = tag; 1301 bl->epoch = b->l.epoch; 1302 if(b->uhead == nil) 1303 b->uhead = bl; 1304 else 1305 b->utail->next = bl; 1306 b->utail = bl; 1307 bl->next = nil; 1308 return 1; 1309 } 1310 1311 /* 1312 * set the label associated with a block. 
1313 */ 1314 Block* 1315 _blockSetLabel(Block *b, Label *l) 1316 { 1317 int lpb; 1318 Block *bb; 1319 u32int a; 1320 Cache *c; 1321 1322 c = b->c; 1323 1324 assert(b->part == PartData); 1325 assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty); 1326 lpb = c->size / LabelSize; 1327 a = b->addr / lpb; 1328 bb = cacheLocal(c, PartLabel, a, OReadWrite); 1329 if(bb == nil){ 1330 blockPut(b); 1331 return nil; 1332 } 1333 b->l = *l; 1334 labelPack(l, bb->data, b->addr%lpb); 1335 blockDirty(bb); 1336 return bb; 1337 } 1338 1339 int 1340 blockSetLabel(Block *b, Label *l) 1341 { 1342 Block *lb; 1343 Label oldl; 1344 1345 oldl = b->l; 1346 lb = _blockSetLabel(b, l); 1347 if(lb == nil) 1348 return 0; 1349 1350 /* 1351 * If we're allocating the block, make sure the label (bl) 1352 * goes to disk before the data block (b) itself. This is to help 1353 * the blocks that in turn depend on b. 1354 * 1355 * Suppose bx depends on (must be written out after) b. 1356 * Once we write b we'll think it's safe to write bx. 1357 * Bx can't get at b unless it has a valid label, though. 1358 * 1359 * Allocation is the only case in which having a current label 1360 * is vital because: 1361 * 1362 * - l.type is set at allocation and never changes. 1363 * - l.tag is set at allocation and never changes. 1364 * - l.state is not checked when we load blocks. 1365 * - the archiver cares deeply about l.state being 1366 * BaActive vs. BaCopied, but that's handled 1367 * by direct calls to _blockSetLabel. 1368 */ 1369 1370 if(oldl.state == BsFree) 1371 blockDependency(b, lb, -1, nil, nil); 1372 blockPut(lb); 1373 return 1; 1374 } 1375 1376 /* 1377 * We've decided to write out b. Maybe b has some pointers to blocks 1378 * that haven't yet been written to disk. If so, construct a slightly out-of-date 1379 * copy of b that is safe to write out. (diskThread will make sure the block 1380 * remains marked as dirty.) 1381 */ 1382 uchar * 1383 blockRollback(Block *b, uchar *buf) 1384 { 1385 u32int addr; 1386 BList *p; 1387 Super super; 1388 1389 /* easy case */ 1390 if(b->prior == nil) 1391 return b->data; 1392 1393 memmove(buf, b->data, b->c->size); 1394 for(p=b->prior; p; p=p->next){ 1395 /* 1396 * we know p->index >= 0 because blockWrite has vetted this block for us. 1397 */ 1398 assert(p->index >= 0); 1399 assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData)); 1400 if(b->part == PartSuper){ 1401 assert(p->index == 0); 1402 superUnpack(&super, buf); 1403 addr = globalToLocal(p->old.score); 1404 if(addr == NilBlock){ 1405 fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score); 1406 abort(); 1407 } 1408 super.active = addr; 1409 superPack(&super, buf); 1410 continue; 1411 } 1412 if(b->l.type == BtDir) 1413 memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize); 1414 else 1415 memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize); 1416 } 1417 return buf; 1418 } 1419 1420 /* 1421 * Try to write block b. 1422 * If b depends on other blocks: 1423 * 1424 * If the block has been written out, remove the dependency. 1425 * If we know how to write out an old version of b that doesn't 1426 * depend on it, do that. 1427 * 1428 * Otherwise, bail. 
1429 */ 1430 int 1431 blockWrite(Block *b) 1432 { 1433 uchar *dmap; 1434 Cache *c; 1435 BList *p, **pp; 1436 Block *bb; 1437 int lockfail; 1438 1439 c = b->c; 1440 1441 if(b->iostate != BioDirty) 1442 return 1; 1443 1444 dmap = b->dmap; 1445 memset(dmap, 0, c->ndmap); 1446 pp = &b->prior; 1447 for(p=*pp; p; p=*pp){ 1448 if(p->index >= 0){ 1449 /* more recent dependency has succeeded; this one can go */ 1450 if(dmap[p->index/8] & (1<<(p->index%8))) 1451 goto ignblock; 1452 } 1453 1454 lockfail = 0; 1455 bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail); 1456 if(bb == nil){ 1457 if(lockfail) 1458 return 0; 1459 /* block not in cache => was written already */ 1460 dmap[p->index/8] |= 1<<(p->index%8); 1461 goto ignblock; 1462 } 1463 1464 /* 1465 * same version of block is still in cache. 1466 * 1467 * the assertion is true because the block still has version p->vers, 1468 * which means it hasn't been written out since we last saw it. 1469 */ 1470 assert(bb->iostate == BioDirty); 1471 blockPut(bb); 1472 1473 if(p->index < 0){ 1474 /* 1475 * We don't know how to temporarily undo 1476 * b's dependency on bb, so just don't write b yet. 1477 */ 1478 if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n", 1479 b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers); 1480 return 0; 1481 } 1482 /* keep walking down the list */ 1483 pp = &p->next; 1484 continue; 1485 1486 ignblock: 1487 *pp = p->next; 1488 blistFree(c, p); 1489 continue; 1490 } 1491 1492 diskWrite(c->disk, b); 1493 return 1; 1494 } 1495 1496 /* 1497 * Change the I/O state of block b. 1498 * Just an assignment except for magic in 1499 * switch statement (read comments there). 1500 */ 1501 void 1502 blockSetIOState(Block *b, int iostate) 1503 { 1504 int dowakeup; 1505 Cache *c; 1506 BList *p, *q; 1507 1508 if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate)); 1509 1510 c = b->c; 1511 1512 dowakeup = 0; 1513 switch(iostate){ 1514 default: 1515 abort(); 1516 case BioEmpty: 1517 assert(!b->uhead); 1518 break; 1519 case BioLabel: 1520 assert(!b->uhead); 1521 break; 1522 case BioClean: 1523 bwatchDependency(b); 1524 /* 1525 * If b->prior is set, it means a write just finished. 1526 * The prior list isn't needed anymore. 1527 */ 1528 for(p=b->prior; p; p=q){ 1529 q = p->next; 1530 blistFree(c, p); 1531 } 1532 b->prior = nil; 1533 /* 1534 * Freeing a block or just finished a write. 1535 * Move the blocks from the per-block unlink 1536 * queue to the cache unlink queue. 1537 */ 1538 if(b->iostate == BioDirty || b->iostate == BioWriting){ 1539 vtLock(c->lk); 1540 c->ndirty--; 1541 b->iostate = iostate; /* change here to keep in sync with ndirty */ 1542 b->vers = c->vers++; 1543 if(b->uhead){ 1544 /* add unlink blocks to unlink queue */ 1545 if(c->uhead == nil){ 1546 c->uhead = b->uhead; 1547 vtWakeup(c->unlink); 1548 }else 1549 c->utail->next = b->uhead; 1550 c->utail = b->utail; 1551 b->uhead = nil; 1552 } 1553 vtUnlock(c->lk); 1554 } 1555 assert(!b->uhead); 1556 dowakeup = 1; 1557 break; 1558 case BioDirty: 1559 /* 1560 * Wrote out an old version of the block (see blockRollback). 1561 * Bump a version count, leave it dirty. 1562 */ 1563 if(b->iostate == BioWriting){ 1564 vtLock(c->lk); 1565 b->vers = c->vers++; 1566 vtUnlock(c->lk); 1567 dowakeup = 1; 1568 } 1569 break; 1570 case BioReading: 1571 case BioWriting: 1572 /* 1573 * Adding block to disk queue. Bump reference count. 1574 * diskThread decs the count later by calling blockPut. 
1575 * This is here because we need to lock c->lk to 1576 * manipulate the ref count. 1577 */ 1578 vtLock(c->lk); 1579 b->ref++; 1580 vtUnlock(c->lk); 1581 break; 1582 case BioReadError: 1583 case BioVentiError: 1584 /* 1585 * Oops. 1586 */ 1587 dowakeup = 1; 1588 break; 1589 } 1590 b->iostate = iostate; 1591 /* 1592 * Now that the state has changed, we can wake the waiters. 1593 */ 1594 if(dowakeup) 1595 vtWakeupAll(b->ioready); 1596 } 1597 1598 char* 1599 bsStr(int state) 1600 { 1601 static char s[100]; 1602 1603 if(state == BsFree) 1604 return "Free"; 1605 if(state == BsBad) 1606 return "Bad"; 1607 1608 sprint(s, "%x", state); 1609 if(!(state&BsAlloc)) 1610 strcat(s, ",Free"); /* should not happen */ 1611 if(state&BsCopied) 1612 strcat(s, ",Copied"); 1613 if(state&BsVenti) 1614 strcat(s, ",Venti"); 1615 if(state&BsClosed) 1616 strcat(s, ",Closed"); 1617 return s; 1618 } 1619 1620 char * 1621 bioStr(int iostate) 1622 { 1623 switch(iostate){ 1624 default: 1625 return "Unknown!!"; 1626 case BioEmpty: 1627 return "Empty"; 1628 case BioLabel: 1629 return "Label"; 1630 case BioClean: 1631 return "Clean"; 1632 case BioDirty: 1633 return "Dirty"; 1634 case BioReading: 1635 return "Reading"; 1636 case BioWriting: 1637 return "Writing"; 1638 case BioReadError: 1639 return "ReadError"; 1640 case BioVentiError: 1641 return "VentiError"; 1642 case BioMax: 1643 return "Max"; 1644 } 1645 } 1646 1647 static char *bttab[] = { 1648 "BtData", 1649 "BtData+1", 1650 "BtData+2", 1651 "BtData+3", 1652 "BtData+4", 1653 "BtData+5", 1654 "BtData+6", 1655 "BtData+7", 1656 "BtDir", 1657 "BtDir+1", 1658 "BtDir+2", 1659 "BtDir+3", 1660 "BtDir+4", 1661 "BtDir+5", 1662 "BtDir+6", 1663 "BtDir+7", 1664 }; 1665 1666 char* 1667 btStr(int type) 1668 { 1669 if(type < nelem(bttab)) 1670 return bttab[type]; 1671 return "unknown"; 1672 } 1673 1674 int 1675 labelFmt(Fmt *f) 1676 { 1677 Label *l; 1678 1679 l = va_arg(f->args, Label*); 1680 return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux", 1681 btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag); 1682 } 1683 1684 int 1685 scoreFmt(Fmt *f) 1686 { 1687 uchar *v; 1688 int i; 1689 u32int addr; 1690 1691 v = va_arg(f->args, uchar*); 1692 if(v == nil){ 1693 fmtprint(f, "*"); 1694 }else if((addr = globalToLocal(v)) != NilBlock) 1695 fmtprint(f, "0x%.8ux", addr); 1696 else{ 1697 for(i = 0; i < VtScoreSize; i++) 1698 fmtprint(f, "%2.2ux", v[i]); 1699 } 1700 1701 return 0; 1702 } 1703 1704 static int 1705 upHeap(int i, Block *b) 1706 { 1707 Block *bb; 1708 u32int now; 1709 int p; 1710 Cache *c; 1711 1712 c = b->c; 1713 now = c->now; 1714 for(; i != 0; i = p){ 1715 p = (i - 1) >> 1; 1716 bb = c->heap[p]; 1717 if(b->used - now >= bb->used - now) 1718 break; 1719 c->heap[i] = bb; 1720 bb->heap = i; 1721 } 1722 c->heap[i] = b; 1723 b->heap = i; 1724 1725 return i; 1726 } 1727 1728 static int 1729 downHeap(int i, Block *b) 1730 { 1731 Block *bb; 1732 u32int now; 1733 int k; 1734 Cache *c; 1735 1736 c = b->c; 1737 now = c->now; 1738 for(; ; i = k){ 1739 k = (i << 1) + 1; 1740 if(k >= c->nheap) 1741 break; 1742 if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now) 1743 k++; 1744 bb = c->heap[k]; 1745 if(b->used - now <= bb->used - now) 1746 break; 1747 c->heap[i] = bb; 1748 bb->heap = i; 1749 } 1750 c->heap[i] = b; 1751 b->heap = i; 1752 return i; 1753 } 1754 1755 /* 1756 * Delete a block from the heap. 1757 * Called with c->lk held. 
1758 */ 1759 static void 1760 heapDel(Block *b) 1761 { 1762 int i, si; 1763 Cache *c; 1764 1765 c = b->c; 1766 1767 si = b->heap; 1768 if(si == BadHeap) 1769 return; 1770 b->heap = BadHeap; 1771 c->nheap--; 1772 if(si == c->nheap) 1773 return; 1774 b = c->heap[c->nheap]; 1775 i = upHeap(si, b); 1776 if(i == si) 1777 downHeap(i, b); 1778 } 1779 1780 /* 1781 * Insert a block into the heap. 1782 * Called with c->lk held. 1783 */ 1784 static void 1785 heapIns(Block *b) 1786 { 1787 assert(b->heap == BadHeap); 1788 upHeap(b->c->nheap++, b); 1789 } 1790 1791 /* 1792 * Get just the label for a block. 1793 */ 1794 static int 1795 readLabel(Cache *c, Label *l, u32int addr) 1796 { 1797 int lpb; 1798 Block *b; 1799 u32int a; 1800 1801 lpb = c->size / LabelSize; 1802 a = addr / lpb; 1803 b = cacheLocal(c, PartLabel, a, OReadOnly); 1804 if(b == nil){ 1805 blockPut(b); 1806 return 0; 1807 } 1808 1809 if(!labelUnpack(l, b->data, addr%lpb)){ 1810 blockPut(b); 1811 return 0; 1812 } 1813 blockPut(b); 1814 return 1; 1815 } 1816 1817 /* 1818 * Process unlink queue. 1819 * Called with c->lk held. 1820 */ 1821 static void 1822 unlinkBody(Cache *c) 1823 { 1824 BList *p; 1825 1826 while(c->uhead != nil){ 1827 p = c->uhead; 1828 c->uhead = p->next; 1829 vtUnlock(c->lk); 1830 1831 if(!unlinkBlock(c, p->addr, p->type, p->tag, p->epoch)) 1832 fprint(2, "unlinkBlock failed: addr=%x type=%d tag = %ux: %r\n", 1833 p->addr, p->type, p->tag); 1834 1835 vtLock(c->lk); 1836 p->next = c->blfree; 1837 c->blfree = p; 1838 } 1839 } 1840 1841 /* 1842 * Occasionally unlink the blocks on the cache unlink queue. 1843 */ 1844 static void 1845 unlinkThread(void *a) 1846 { 1847 Cache *c = a; 1848 1849 vtThreadSetName("unlink"); 1850 1851 vtLock(c->lk); 1852 for(;;){ 1853 while(c->uhead == nil && c->die == nil) 1854 vtSleep(c->unlink); 1855 if(c->die != nil) 1856 break; 1857 unlinkBody(c); 1858 } 1859 c->ref--; 1860 vtWakeup(c->die); 1861 vtUnlock(c->lk); 1862 } 1863 1864 static int 1865 baddrCmp(void *a0, void *a1) 1866 { 1867 BAddr *b0, *b1; 1868 b0 = a0; 1869 b1 = a1; 1870 1871 if(b0->part < b1->part) 1872 return -1; 1873 if(b0->part > b1->part) 1874 return 1; 1875 if(b0->addr < b1->addr) 1876 return -1; 1877 if(b0->addr > b1->addr) 1878 return 1; 1879 return 0; 1880 } 1881 1882 /* 1883 * Scan the block list for dirty blocks; add them to the list c->baddr. 1884 */ 1885 static void 1886 flushFill(Cache *c) 1887 { 1888 int i, ndirty; 1889 BAddr *p; 1890 Block *b; 1891 1892 vtLock(c->lk); 1893 // if(c->ndirty == 0){ 1894 // vtUnlock(c->lk); 1895 // return; 1896 // } 1897 1898 p = c->baddr; 1899 ndirty = 0; 1900 for(i=0; i<c->nblocks; i++){ 1901 b = c->blocks + i; 1902 if(b->part == PartError) 1903 continue; 1904 if(b->iostate == BioDirty || b->iostate == BioWriting) 1905 ndirty++; 1906 if(b->iostate != BioDirty) 1907 continue; 1908 p->part = b->part; 1909 p->addr = b->addr; 1910 p->vers = b->vers; 1911 p++; 1912 } 1913 if(ndirty != c->ndirty){ 1914 fprint(2, "ndirty mismatch expected %d found %d\n", 1915 c->ndirty, ndirty); 1916 c->ndirty = ndirty; 1917 } 1918 vtUnlock(c->lk); 1919 1920 c->bw = p - c->baddr; 1921 qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp); 1922 } 1923 1924 /* 1925 * This is not thread safe, i.e. it can't be called from multiple threads. 1926 * 1927 * It's okay how we use it, because it only gets called in 1928 * the flushThread. And cacheFree, but only after 1929 * cacheFree has killed off the flushThread. 
1930 */ 1931 static int 1932 cacheFlushBlock(Cache *c) 1933 { 1934 Block *b; 1935 BAddr *p; 1936 int lockfail, nfail; 1937 1938 nfail = 0; 1939 for(;;){ 1940 if(c->br == c->be){ 1941 if(c->bw == 0 || c->bw == c->be) 1942 flushFill(c); 1943 c->br = 0; 1944 c->be = c->bw; 1945 c->bw = 0; 1946 c->nflush = 0; 1947 } 1948 1949 if(c->br == c->be) 1950 return 0; 1951 p = c->baddr + c->br; 1952 c->br++; 1953 b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail); 1954 1955 if(b && blockWrite(b)){ 1956 c->nflush++; 1957 blockPut(b); 1958 return 1; 1959 } 1960 if(b) 1961 blockPut(b); 1962 1963 /* 1964 * Why didn't we write the block? 1965 */ 1966 1967 /* Block already written out */ 1968 if(b == nil && !lockfail) 1969 continue; 1970 1971 /* Failed to acquire lock; sleep if happens a lot. */ 1972 if(lockfail && ++nfail > 100) 1973 sleep(500); 1974 1975 /* Requeue block. */ 1976 if(c->bw < c->be) 1977 c->baddr[c->bw++] = *p; 1978 } 1979 return 0; 1980 } 1981 1982 /* 1983 * Occasionally flush dirty blocks from memory to the disk. 1984 */ 1985 static void 1986 flushThread(void *a) 1987 { 1988 Cache *c = a; 1989 int i; 1990 1991 vtThreadSetName("flush"); 1992 vtLock(c->lk); 1993 while(c->die == nil){ 1994 vtSleep(c->flush); 1995 vtUnlock(c->lk); 1996 for(i=0; i<FlushSize; i++) 1997 if(!cacheFlushBlock(c)) 1998 break; 1999 vtLock(c->lk); 2000 vtWakeupAll(c->flushwait); 2001 } 2002 c->ref--; 2003 vtWakeup(c->die); 2004 vtUnlock(c->lk); 2005 } 2006 2007 /* 2008 * Keep flushing until everything is clean. 2009 */ 2010 void 2011 cacheFlush(Cache *c, int wait) 2012 { 2013 vtLock(c->lk); 2014 if(wait){ 2015 while(c->ndirty){ 2016 consPrint("cacheFlush: %d dirty blocks\n", c->ndirty); 2017 vtWakeup(c->flush); 2018 vtSleep(c->flushwait); 2019 } 2020 consPrint("cacheFlush: done\n", c->ndirty); 2021 }else 2022 vtWakeup(c->flush); 2023 vtUnlock(c->lk); 2024 } 2025 2026 /* 2027 * Kick the flushThread every 30 seconds. 2028 */ 2029 static void 2030 cacheSync(void *v) 2031 { 2032 Cache *c; 2033 2034 c = v; 2035 cacheFlush(c, 0); 2036 } 2037