1 #include "stdinc.h" 2 #include "dat.h" 3 #include "fns.h" 4 #include "error.h" 5 6 #include "9.h" /* for cacheFlush */ 7 8 typedef struct FreeList FreeList; 9 typedef struct BAddr BAddr; 10 11 enum { 12 BadHeap = ~0, 13 }; 14 15 /* 16 * Store data to the memory cache in c->size blocks 17 * with the block zero extended to fill it out. When writing to 18 * Venti, the block will be zero truncated. The walker will also check 19 * that the block fits within psize or dsize as the case may be. 20 */ 21 22 struct Cache 23 { 24 VtLock *lk; 25 int ref; 26 int mode; 27 28 Disk *disk; 29 int size; /* block size */ 30 int ndmap; /* size of per-block dirty pointer map used in blockWrite */ 31 VtSession *z; 32 u32int now; /* ticks for usage timestamps */ 33 Block **heads; /* hash table for finding address */ 34 int nheap; /* number of available victims */ 35 Block **heap; /* heap for locating victims */ 36 long nblocks; /* number of blocks allocated */ 37 Block *blocks; /* array of block descriptors */ 38 u8int *mem; /* memory for all block data & blists */ 39 40 BList *blfree; 41 VtRendez *blrend; 42 43 int ndirty; /* number of dirty blocks in the cache */ 44 int maxdirty; /* max number of dirty blocks */ 45 u32int vers; 46 47 long hashSize; 48 49 FreeList *fl; 50 51 VtRendez *die; /* daemon threads should die when != nil */ 52 53 VtRendez *flush; 54 VtRendez *flushwait; 55 BAddr *baddr; 56 int bw, br, be; 57 int nflush; 58 59 Periodic *sync; 60 61 /* unlink daemon */ 62 BList *uhead; 63 BList *utail; 64 VtRendez *unlink; 65 66 /* block counts */ 67 int nused; 68 int ndisk; 69 }; 70 71 struct BList { 72 int part; 73 u32int addr; 74 uchar type; 75 u32int tag; 76 u32int epoch; 77 u32int vers; 78 79 /* for roll back */ 80 int index; /* -1 indicates not valid */ 81 union { 82 uchar score[VtScoreSize]; 83 uchar entry[VtEntrySize]; 84 } old; 85 BList *next; 86 }; 87 88 struct BAddr { 89 int part; 90 u32int addr; 91 u32int vers; 92 }; 93 94 struct FreeList { 95 VtLock *lk; 96 u32int last; /* last block allocated */ 97 u32int end; /* end of data partition */ 98 u32int nfree; /* number of free blocks */ 99 u32int nused; /* number of used blocks */ 100 u32int epochLow; /* low epoch when last updated nfree and nused */ 101 }; 102 103 static FreeList *flAlloc(u32int end); 104 static void flFree(FreeList *fl); 105 106 static Block *cacheBumpBlock(Cache *c); 107 static void heapDel(Block*); 108 static void heapIns(Block*); 109 static void cacheCheck(Cache*); 110 static int readLabel(Cache*, Label*, u32int addr); 111 static void unlinkThread(void *a); 112 static void flushThread(void *a); 113 static void flushBody(Cache *c); 114 static void unlinkBody(Cache *c); 115 static int cacheFlushBlock(Cache *c); 116 static void cacheSync(void*); 117 /* 118 * Mapping from local block type to Venti type 119 */ 120 int vtType[BtMax] = { 121 VtDataType, /* BtData | 0 */ 122 VtPointerType0, /* BtData | 1 */ 123 VtPointerType1, /* BtData | 2 */ 124 VtPointerType2, /* BtData | 3 */ 125 VtPointerType3, /* BtData | 4 */ 126 VtPointerType4, /* BtData | 5 */ 127 VtPointerType5, /* BtData | 6 */ 128 VtPointerType6, /* BtData | 7 */ 129 VtDirType, /* BtDir | 0 */ 130 VtPointerType0, /* BtDir | 1 */ 131 VtPointerType1, /* BtDir | 2 */ 132 VtPointerType2, /* BtDir | 3 */ 133 VtPointerType3, /* BtDir | 4 */ 134 VtPointerType4, /* BtDir | 5 */ 135 VtPointerType5, /* BtDir | 6 */ 136 VtPointerType6, /* BtDir | 7 */ 137 }; 138 139 /* 140 * Allocate the memory cache. 
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
	bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
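 * The daemon threads are shut down first, then the unlink queue is
 * drained and dirty blocks are flushed until nothing is left.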
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "p=%d a=%ud %V t=%d ref=%d state=%s io=%s\n",
			b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate));
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
	if(c->nheap + refed != c->nblocks){
		fprint(2, "cacheCheck: nheap %d refed %d nblocks %ld\n", c->nheap, refed, c->nblocks);
		cacheDump(c);
	}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
			if(1)fprint(2, "p=%d a=%ud %V ref=%d %L\n", b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
	if(refed > 0)fprint(2, "cacheCheck: in use %d\n", refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
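	 * c->heap[0] has the smallest b->used - c->now, i.e. it is the
	 * least recently used of the unreferenced blocks.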
	 */
	if(c->nheap == 0)
		vtFatal("cacheBumpBlock: no free blocks in cache");
	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate == BioEmpty || b->iostate == BioLabel || b->iostate == BioClean);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


	if(0)fprint(2, "dropping %d:%x:%V\n", b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError(EIO);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, 1, 0);
}


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
			fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

	if(0)fprint(2, "cacheLocal: %d: %d %x\n", getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
		fprint(2, "_cacheLocal want epoch %ud got %ud\n", epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
			if(mode == OOverWrite){
				blockSetIOState(b, BioClean);
				return b;
			}
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError(EIO);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "cacheLocalData: addr=%d type got %d exp %d: tag got %x exp %x\n",
			addr, b->l.type, type, b->l.tag, tag);
		abort();
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
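 * Venti blocks hash on the first four bytes of the score and are
 * never dirtied (blockDirty asserts the part is not PartVenti), so
 * no version check is needed.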
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		if(0)fprint(2, "cacheGlobal %V %d\n", score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
	case BioReadError:
		blockSetIOState(b, BioEmpty);
		blockPut(b);
		vtSetError(EIO);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
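 * The free list records only a scan position (fl->last); the search
 * walks the label blocks from there looking for a BsFree label or a
 * closed block whose epochClose is at or below epochLow.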
 */
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx %R\n");
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			fprint(2, "cacheAllocBlock wrap %d\n", fl->end);
			if(++nwrap >= 2){
				blockPut(b);
				fl->last = 0;
				vtSetError("disk is full");
				fprint(2, "cacheAllocBlock: xxx1 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "cacheAllocBlock: xxx2 %R\n");
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "cacheAllocBlock: xxx3 %R\n");
		vtUnlock(fl->lk);
		return nil;
	}
	assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab)){
		fprint(2, "cacheAllocBlock: xxx4 %R\n");
		blockPut(b);
		vtUnlock(fl->lk);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
	if(0)diskWrite(c->disk, b);

	if(0)fprint(2, "fsAlloc %ud type=%d tag = %ux\n", addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "flCountUsed: loading %ux: %R\n", addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if((lab.state&BsClosed) && lab.epochClose <= epochLow)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * Copy on write.  Copied blocks have to be marked BsCopied.
 * See the big comment near blockRemoveLink.
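 * On success the caller's reference to b is released and the new,
 * dirty copy is returned; on failure b is released and nil returned.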
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	assert((b->l.state&BsClosed)==0 && b->l.epoch < ehi);
	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

//fprint(2, "alloc %lux copy %V\n", bb->addr, b->score);
	/*
	 * Change label on b to mark that we've copied it.
	 * This has to come after cacheAllocBlock, since we
	 * can't hold any label blocks (lb) while we try to
	 * fetch others (in cacheAllocBlock).
	 */
	if(!(b->l.state&BsCopied) && b->part==PartData){
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			/* ignore error: block gets lost on error */
			blockSetLabel(bb, &l);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	if(0){
		if(b->addr != NilBlock)
			fprint(2, "blockCopy %#ux/%ud => %#ux/%ud\n",
				b->addr, b->l.epoch, bb->addr, bb->l.epoch);
		else if(memcmp(b->score, vtZeroScore, VtScoreSize) != 0)
			fprint(2, "blockCopy %V => %#ux/%ud\n",
				b->score, bb->addr, bb->l.epoch);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

	if(0)fprint(2, "blockPut: %d: %d %x %d %s\n", getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * we're deleting a block; delete all the blocks it points to
 * that are still active (i.e., not needed by snapshots).
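 * The recursion bottoms out at BtData and BtDir blocks; pointer
 * blocks walk their scores and free each child whose label still
 * carries this epoch and is not closed.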
 */
static void
blockCleanup(Block *b, u32int epoch)
{
	Cache *c;
	Block *bb;
	int i, n;
	Label l;
	u32int a;
	int type;
	int mode;

	type = b->l.type;
	c = b->c;

	bwatchReset(b->score);

	blockSetIOState(b, BioClean);

	/* do not recursively free directories */
	if(type == BtData || type == BtDir)
		return;

	n = c->size / VtScoreSize;
	mode = OReadWrite;
	if(type-1 == BtData || type-1 == BtDir)
		mode = OOverWrite;
	for(i=0; i<n; i++){
		a = globalToLocal(b->data + i*VtScoreSize);
		if(a == NilBlock || !readLabel(c, &l, a))
			continue;
		if((l.state&BsClosed) || l.epoch != epoch)
			continue;
		bb = cacheLocalData(c, a, type-1, b->l.tag, mode, 0);
		if(bb == nil)
			continue;
		if((bb->l.state&BsClosed) || bb->l.epoch != epoch){
			fprint(2, "cleanupBlock: block %ud changed underfoot! expected %L got %L\n",
				a, &l, &bb->l);
			blockPut(bb);
			continue;
		}
		blockCleanup(bb, epoch);
		l.type = BtMax;
		l.epoch = epoch;
		l.epochClose = 0;
		l.state = BsFree;
		l.tag = 0;
		blockSetLabel(bb, &l);
		blockPut(bb);
	}
}

/*
 * We don't need the block at addr anymore for the active file system.
 * If we don't need it for the snapshots, remove it completely.
 * Epoch is the epoch during which we got rid of the block.
 * See blockRemoveLink for more.
 */
static int
unlinkBlock(Cache *c, u32int addr, int type, u32int tag, u32int epoch)
{
	Block *b;
	Label l;

	if(addr == NilBlock)
		return 1;

//fprint(2, "unlinkBlock %#ux\n", addr);
	b = cacheLocalData(c, addr, type, tag, OReadOnly, 0);
	if(b == nil)
		return 0;
	if(b->l.epoch > epoch){
		fprint(2, "unlinkBlock: strange epoch :%ud %ud\n", b->l.epoch, epoch);
		blockPut(b);
		return 0;
	}

	l = b->l;
	if((b->l.state&BsClosed)==0 && b->l.epoch==epoch){
		l.state = BsFree;
		l.type = BtMax;
		l.tag = 0;
		l.epoch = 0;
		l.epochClose = 0;
		blockCleanup(b, epoch);
	}else{
		l.state |= BsClosed;
		l.epochClose = epoch;
	}
	blockSetLabel(b, &l);
	blockPut(b);
	return 1;
}

/*
 * try to allocate a BList so we can record that b must
 * be written out before some other block.
 * if can't find a BList, write b out instead and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	/*
	 * It's possible that when we marked b dirty, there were
	 * too many dirty blocks so we just wrote b there and then.
	 * So b might not be dirty.  If it's not, no need to worry
	 * about recording the write constraint.
	 *
	 * BlockRemoveLink depends on the fact that if blistAlloc
	 * returns non-nil, b really is dirty.
	 */
	if(b->iostate != BioDirty){
		assert(b->iostate == BioClean);
		return nil;
	}

	/*
	 * Easy: maybe there's a free BList left.
	 */
	c = b->c;
	vtLock(c->lk);
	if(c->blfree){
	HaveBlFree:
		p = c->blfree;
		c->blfree = p->next;
		vtUnlock(c->lk);
		return p;
	}
	vtUnlock(c->lk);

	/*
	 * No free BLists.  What are our options?
	 */

	/* Block has no priors? Just write it. */
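	/*
	 * The write below is synchronous: we sleep until the block goes
	 * BioClean, so the caller can safely skip recording the dependency.
	 */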
	if(b->prior == nil){
		diskWrite(c->disk, b);
		while(b->iostate != BioClean)
			vtSleep(b->ioready);
		return nil;
	}

	/*
	 * Wake the flush thread, which will hopefully free up
	 * some BLists for us.  We used to flush a block from
	 * our own prior list and reclaim that BList, but this is
	 * a no-no: some of the blocks on our prior list may
	 * be locked by our caller.  Or maybe their label blocks
	 * are locked by our caller.  In any event, it's too hard
	 * to make sure we can do I/O for ourselves.  Instead,
	 * we assume the flush thread will find something.
	 * (The flush thread never blocks waiting for a block,
	 * so it won't deadlock like we will.)
	 */
	vtLock(c->lk);
	while(c->blfree == nil){
		vtWakeup(c->flush);
		vtSleep(c->blrend);
	}
	goto HaveBlFree;
}

void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

/*
 * Flush b or one of the blocks it depends on.
 */
void
blockFlush(Block *b)
{
	int first, nlock;
	BList *p, **pp;
	Block *bb;
	Cache *c;

//fprint(2, "blockFlush %p\n", b);

	c = b->c;

	first = 1;
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		bb = cacheLocalLookup(c, p->part, p->addr, p->vers);
		if(bb == nil){
			*pp = p->next;
			blistFree(c, p);
			continue;
		}
		if(!first)
			blockPut(b);
		first = 0;
		b = bb;
		pp = &b->prior;
	}

	/*
	 * If b->nlock > 1, the block is aliased within
	 * a single thread.  That thread is us, and it's
	 * the block that was passed in (rather than a prior).
	 * DiskWrite does some funny stuff with VtLock
	 * and blockPut that basically assumes b->nlock==1.
	 * We humor diskWrite by temporarily setting
	 * nlock to 1.  This needs to be revisited.  (TODO)
	 */
	nlock = b->nlock;
	if(nlock > 1){
		assert(first);
		b->nlock = 1;
	}
	diskWrite(c->disk, b);
	while(b->iostate != BioClean)
		vtSleep(b->ioready);
	b->nlock = nlock;
	if(!first)
		blockPut(b);
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/entry
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
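	 * (The rollback would write the old value saved with the first
	 * dependency, wiping out the newer change in the on-disk copy.)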
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%d:%x:%d iostate is %d in blockDependency\n",
			bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	if(0)fprint(2, "%d:%x:%d depends on %d:%x:%d\n", b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.  If there are way
 * too many dirty blocks, write this one out too.
 *
 * Note that since we might call blockFlush, which walks
 * the prior list, we can't call blockDirty while holding a lock
 * on any of our priors.  This can be tested by recompiling
 * with flush always set to 1 below.
 */
int
blockDirty(Block *b)
{
	Cache *c;
	int flush;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean);

	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	flush = c->ndirty > c->maxdirty;
	vtUnlock(c->lk);

	if(flush)
		blockFlush(b);

	return 1;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 *
 * The file system maintains the following invariants (i-iv checked by flchk):
 *
 * (i) b.e in [bb.e, bb.eClose)
 * (ii) if b.e==bb.e, then no other b' in e points at bb.
 * (iii) if !(b.state&Copied) and b.e==bb.e then no other b' points at bb.
 * (iv) if b is active then no other active b' points at bb.
 * (v) if b is a past life of b' then only one of b and b' is active (too hard to check)
 *
 * The file system initially satisfies these invariants, and we can verify that
 * the various file system operations maintain them.  See fossil.invariants.
 *
 * Condition (i) lets us reclaim blocks once the low epoch is greater
 * than epochClose.
 *
 * If the condition in (iii) is satisfied, then this is the only pointer to bb,
 * so bb can be reclaimed once b has been written to disk.  blockRemoveLink
 * checks !(b.state&Copied) as an optimization.  UnlinkBlock and blockCleanup
 * will check the conditions again for each block they consider.
 */
int
blockRemoveLink(Block *b, u32int addr, int type, u32int tag)
{
	BList *bl;
	BList *p, **pp;
	Cache *c;

	c = b->c;

	/* remove unlinked block from prior list */
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->part != PartData || p->addr != addr){
			pp = &p->next;
			continue;
		}
		*pp = p->next;
		blistFree(c, p);
	}

	/* if b has been copied, can't reclaim blocks it points at. */
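	/* (another copy may still point at them; see invariant (iii) above) */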
	if(b->l.state & BsCopied)
		return 0;

	bl = blistAlloc(b);
	if(bl == nil)
		return unlinkBlock(b->c, addr, type, tag, b->l.epoch);

	/*
	 * Because bl != nil, we know b is dirty.
	 * (Linking b->uhead onto a clean block is
	 * counterproductive, since we only look at
	 * b->uhead when a block transitions from
	 * dirty to clean.)
	 */
	assert(b->iostate == BioDirty);

	bl->part = PartData;
	bl->addr = addr;
	bl->type = type;
	bl->tag = tag;
	bl->epoch = b->l.epoch;
	if(b->uhead == nil)
		b->uhead = bl;
	else
		b->utail->next = bl;
	b->utail = bl;
	bl->next = nil;
	return 1;
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (lb)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *	  BaActive vs. BaCopied, but that's handled
	 *	  by direct calls to _blockSetLabel.
	 */

	if(oldl.state == BsFree)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "rolling back super block: bad replacement addr %V\n", p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)	/* whole-block dependencies have no slot in dmap */
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%d:%x:%d iostate is %d in blockWrite\n",
				bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

	ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

	if(0) fprint(2, "iostate part=%d addr=%x %s->%s\n", b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
}

/*
 * Get just the label for a block.
 */
static int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);

		if(!unlinkBlock(c, p->addr, p->type, p->tag, p->epoch))
			fprint(2, "unlinkBlock failed: addr=%x type=%d tag = %ux: %r\n",
				p->addr, p->type, p->tag);

		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
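 * The list is then sorted by partition and address (baddrCmp) so that
 * cacheFlushBlock issues writes in roughly disk order.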
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "ndirty mismatch expected %d found %d\n",
			c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, 0, &lockfail);

		if(b && blockWrite(b)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100)
			sleep(500);

		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
	return 0;
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0)
					sleep(250);
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Keep flushing until everything is clean.
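 * If wait is set, block until c->ndirty drops to zero; otherwise just
 * kick the flush thread once.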
 */
void
cacheFlush(Cache *c, int wait)
{
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
//			consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
//				c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
//		consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else
		vtWakeup(c->flush);
	vtUnlock(c->lk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}
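
/*
 * Illustrative usage sketch (an assumption, not part of the original
 * interface): a typical caller pairs cacheLocal or cacheGlobal with
 * blockPut and marks its changes with blockDirty, e.g.
 *
 *	b = cacheLocal(c, PartData, addr, OReadWrite);
 *	if(b == nil)
 *		return 0;
 *	memmove(b->data, buf, c->size);		(buf: hypothetical caller buffer)
 *	blockDirty(b);
 *	blockPut(b);
 *	cacheFlush(c, 1);
 */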