1 #include "stdinc.h" 2 #include "dat.h" 3 #include "fns.h" 4 #include "error.h" 5 6 static void fsMetaFlush(void *a); 7 static Snap *snapInit(Fs*); 8 static void snapClose(Snap*); 9 10 Fs * 11 fsOpen(char *file, VtSession *z, long ncache, int mode) 12 { 13 Fs *fs; 14 Disk *disk; 15 int fd; 16 Block *b, *bs; 17 Super super; 18 int m; 19 uchar oscore[VtScoreSize]; 20 21 switch(mode){ 22 default: 23 vtSetError(EBadMode); 24 return nil; 25 case OReadOnly: 26 m = OREAD; 27 break; 28 case OReadWrite: 29 m = ORDWR; 30 break; 31 } 32 fd = open(file, m); 33 if(fd < 0){ 34 vtOSError(); 35 return nil; 36 } 37 38 bwatchInit(); 39 disk = diskAlloc(fd); 40 if(disk == nil){ 41 close(fd); 42 return nil; 43 } 44 45 fs = vtMemAllocZ(sizeof(Fs)); 46 fs->mode = mode; 47 fs->blockSize = diskBlockSize(disk); 48 fs->elk = vtLockAlloc(); 49 fs->cache = cacheAlloc(disk, z, ncache, mode); 50 if(mode == OReadWrite) 51 fs->arch = archInit(fs->cache, disk, fs, z); 52 fs->z = z; 53 54 b = cacheLocal(fs->cache, PartSuper, 0, mode); 55 if(b == nil) 56 goto Err; 57 if(!superUnpack(&super, b->data)){ 58 blockPut(b); 59 goto Err; 60 } 61 blockPut(b); 62 63 fs->ehi = super.epochHigh; 64 fs->elo = super.epochLow; 65 66 fprint(2, "fs->ehi %d fs->elo %d active=%d\n", fs->ehi, fs->elo, super.active); 67 68 fs->source = sourceRoot(fs, super.active, mode); 69 if(fs->source == nil){ 70 /* 71 * Perhaps it failed because the block is copy-on-write. 72 * Do the copy and try again. 73 */ 74 if(mode == OReadOnly || strcmp(vtGetError(), EBadRoot) != 0) 75 goto Err; 76 b = cacheLocalData(fs->cache, super.active, BtDir, RootTag, OReadWrite, 0); 77 if(b == nil) 78 goto Err; 79 if(!(b->l.state&BsClosed) && b->l.epoch == fs->ehi){ 80 blockPut(b); 81 goto Err; 82 } 83 b = blockCopy(b, RootTag, fs->ehi, fs->elo); 84 if(b == nil) 85 goto Err; 86 localToGlobal(super.active, oscore); 87 super.active = b->addr; 88 bs = cacheLocal(fs->cache, PartSuper, 0, OReadWrite); 89 if(bs == nil){ 90 blockPut(b); 91 goto Err; 92 } 93 superPack(&super, bs->data); 94 blockDependency(bs, b, 0, oscore, nil); 95 blockDirty(bs); 96 blockPut(bs); 97 blockPut(b); 98 fs->source = sourceRoot(fs, super.active, mode); 99 if(fs->source == nil) 100 goto Err; 101 } 102 103 fprint(2, "got fs source\n"); 104 105 vtRLock(fs->elk); 106 fs->file = fileRoot(fs->source); 107 vtRUnlock(fs->elk); 108 if(fs->file == nil) 109 goto Err; 110 111 fprint(2, "got file root\n"); 112 113 if(mode == OReadWrite){ 114 fs->metaFlush = periodicAlloc(fsMetaFlush, fs, 1000); 115 fs->snap = snapInit(fs); 116 } 117 return fs; 118 119 Err: 120 fsClose(fs); 121 return nil; 122 } 123 124 void 125 fsClose(Fs *fs) 126 { 127 vtRLock(fs->elk); 128 periodicKill(fs->metaFlush); 129 snapClose(fs->snap); 130 if(fs->file){ 131 fileMetaFlush(fs->file, 0); 132 if(!fileDecRef(fs->file)) 133 vtFatal("fsClose: files still in use: %r\n"); 134 } 135 fs->file = nil; 136 sourceClose(fs->source); 137 cacheFree(fs->cache); 138 if(fs->arch) 139 archFree(fs->arch); 140 vtRUnlock(fs->elk); 141 vtLockFree(fs->elk); 142 memset(fs, ~0, sizeof(Fs)); 143 vtMemFree(fs); 144 } 145 146 int 147 fsRedial(Fs *fs, char *host) 148 { 149 if(!vtRedial(fs->z, host)) 150 return 0; 151 if(!vtConnect(fs->z, 0)) 152 return 0; 153 return 1; 154 } 155 156 File * 157 fsGetRoot(Fs *fs) 158 { 159 return fileIncRef(fs->file); 160 } 161 162 int 163 fsGetBlockSize(Fs *fs) 164 { 165 return fs->blockSize; 166 } 167 168 Block* 169 superGet(Cache *c, Super* super) 170 { 171 Block *b; 172 173 if((b = cacheLocal(c, PartSuper, 0, OReadWrite)) == nil){ 174 fprint(2, "superGet: cacheLocal failed: %R"); 175 return nil; 176 } 177 if(!superUnpack(super, b->data)){ 178 fprint(2, "superGet: superUnpack failed: %R"); 179 blockPut(b); 180 return nil; 181 } 182 183 return b; 184 } 185 186 void 187 superPut(Block* b, Super* super, int forceWrite) 188 { 189 superPack(super, b->data); 190 blockDirty(b); 191 if(forceWrite){ 192 while(!blockWrite(b)){ 193 /* BUG: what should really happen here? */ 194 fprint(2, "could not write super block; waiting 10 seconds\n"); 195 sleep(10*000); 196 } 197 while(b->iostate != BioClean && b->iostate != BioDirty){ 198 assert(b->iostate == BioWriting); 199 vtSleep(b->ioready); 200 } 201 /* 202 * it's okay that b might still be dirty. 203 * that means it got written out but with an old root pointer, 204 * but the other fields went out, and those are the ones 205 * we really care about. (specifically, epochHigh; see fsSnapshot). 206 */ 207 } 208 blockPut(b); 209 } 210 211 /* 212 * Prepare the directory to store a snapshot. 213 * Temporary snapshots go into /snapshot/yyyy/mmdd/hhmm[.#] 214 * Archival snapshots go into /archive/yyyy/mmdd[.#]. 215 * 216 * TODO This should be rewritten to eliminate most of the duplication. 217 */ 218 static File* 219 fileOpenSnapshot(Fs *fs, int doarchive) 220 { 221 int n; 222 char buf[30], *s; 223 File *dir, *f; 224 Tm now; 225 226 if(doarchive){ 227 /* 228 * a snapshot intended to be archived to venti. 229 */ 230 dir = fileOpen(fs, "/archive"); 231 if(dir == nil) 232 return nil; 233 now = *localtime(time(0)); 234 235 /* yyyy */ 236 snprint(buf, sizeof(buf), "%d", now.year+1900); 237 f = fileWalk(dir, buf); 238 if(f == nil) 239 f = fileCreate(dir, buf, ModeDir|0555, "adm"); 240 fileDecRef(dir); 241 if(f == nil) 242 return nil; 243 dir = f; 244 245 /* mmdd[#] */ 246 snprint(buf, sizeof(buf), "%02d%02d", now.mon+1, now.mday); 247 s = buf+strlen(buf); 248 for(n=0;; n++){ 249 if(n) 250 seprint(s, buf+sizeof(buf), ".%d", n); 251 f = fileWalk(dir, buf); 252 if(f != nil){ 253 fileDecRef(f); 254 continue; 255 } 256 f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm"); 257 break; 258 } 259 fileDecRef(dir); 260 return f; 261 }else{ 262 /* 263 * Just a temporary snapshot 264 * We'll use /snapshot/yyyy/mmdd/hhmm. 265 * There may well be a better naming scheme. 266 * (I'd have used hh:mm but ':' is reserved in Microsoft file systems.) 267 */ 268 dir = fileOpen(fs, "/snapshot"); 269 if(dir == nil) 270 return nil; 271 272 /* 273 * used to do /snapshot/# 274 * 275 for(n=0;; n++){ 276 if(n) 277 seprint(s, buf+sizeof(buf), ".%d", n); 278 f = fileWalk(dir, buf); 279 if(f != nil){ 280 fileDecRef(f); 281 continue; 282 } 283 f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm"); 284 break; 285 } 286 dir = fileOpen(fs, "/snapshot"); 287 if(dir == nil) 288 return nil; 289 snprint(buf, sizeof(buf), "%d", fs->ehi); 290 f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm"); 291 fileDecRef(dir); 292 return f; 293 */ 294 295 now = *localtime(time(0)); 296 297 /* yyyy */ 298 snprint(buf, sizeof(buf), "%d", now.year+1900); 299 f = fileWalk(dir, buf); 300 if(f == nil) 301 f = fileCreate(dir, buf, ModeDir|0555, "adm"); 302 fileDecRef(dir); 303 if(f == nil) 304 return nil; 305 dir = f; 306 307 /* mmdd */ 308 snprint(buf, sizeof(buf), "%02d%02d", now.mon+1, now.mday); 309 f = fileWalk(dir, buf); 310 if(f == nil) 311 f = fileCreate(dir, buf, ModeDir|0555, "adm"); 312 fileDecRef(dir); 313 if(f == nil) 314 return nil; 315 dir = f; 316 317 /* hhmm[.#] */ 318 snprint(buf, sizeof buf, "%02d%02d", now.hour, now.min); 319 s = buf+strlen(buf); 320 for(n=0;; n++){ 321 if(n) 322 seprint(s, buf+sizeof(buf), ".%d", n); 323 f = fileWalk(dir, buf); 324 if(f != nil){ 325 fileDecRef(f); 326 continue; 327 } 328 f = fileCreate(dir, buf, ModeDir|ModeSnapshot|0555, "adm"); 329 break; 330 } 331 fileDecRef(dir); 332 return f; 333 } 334 } 335 336 int 337 fsEpochLow(Fs *fs, u32int low) 338 { 339 Block *bs; 340 Super super; 341 342 vtLock(fs->elk); 343 if(low > fs->ehi){ 344 vtSetError("bad low epoch (must be <= %ud)", fs->ehi); 345 vtUnlock(fs->elk); 346 return 0; 347 } 348 349 if((bs = superGet(fs->cache, &super)) == nil){ 350 vtUnlock(fs->elk); 351 return 0; 352 } 353 354 super.epochLow = low; 355 fs->elo = low; 356 superPut(bs, &super, 1); 357 vtUnlock(fs->elk); 358 359 return 1; 360 } 361 362 static int 363 bumpEpoch(Fs *fs, int doarchive) 364 { 365 uchar oscore[VtScoreSize]; 366 u32int oldaddr; 367 Block *b, *bs; 368 Entry e; 369 Source *r; 370 Super super; 371 372 /* 373 * Duplicate the root block. 374 * 375 * As a hint to flchk, the garbage collector, 376 * and any (human) debuggers, store a pointer 377 * to the old root block in entry 1 of the new root block. 378 */ 379 r = fs->source; 380 b = cacheGlobal(fs->cache, r->score, BtDir, RootTag, OReadOnly); 381 if(b == nil) 382 return 0; 383 384 memset(&e, 0, sizeof e); 385 e.flags = VtEntryActive | VtEntryLocal | VtEntryDir; 386 memmove(e.score, b->score, VtScoreSize); 387 e.tag = RootTag; 388 e.snap = b->l.epoch; 389 390 b = blockCopy(b, RootTag, fs->ehi+1, fs->elo); 391 if(b == nil){ 392 fprint(2, "bumpEpoch: blockCopy: %R\n"); 393 return 0; 394 } 395 396 if(0) fprint(2, "snapshot root from %d to %d\n", oldaddr, b->addr); 397 entryPack(&e, b->data, 1); 398 blockDirty(b); 399 400 /* 401 * Update the superblock with the new root and epoch. 402 */ 403 if((bs = superGet(fs->cache, &super)) == nil) 404 return 0; 405 406 fs->ehi++; 407 memmove(r->score, b->score, VtScoreSize); 408 r->epoch = fs->ehi; 409 410 super.epochHigh = fs->ehi; 411 oldaddr = super.active; 412 super.active = b->addr; 413 if(doarchive) 414 super.next = oldaddr; 415 416 /* 417 * Record that the new super.active can't get written out until 418 * the new b gets written out. Until then, use the old value. 419 */ 420 localToGlobal(oldaddr, oscore); 421 blockDependency(bs, b, 0, oscore, nil); 422 blockPut(b); 423 424 /* 425 * We force the super block to disk so that super.epochHigh gets updated. 426 * Otherwise, if we crash and come back, we might incorrectly treat as active 427 * some of the blocks that making up the snapshot we just created. 428 * Basically every block in the active file system and all the blocks in 429 * the recently-created snapshot depend on the super block now. 430 * Rather than record all those dependencies, we just force the block to disk. 431 * 432 * Note that blockWrite might actually (will probably) send a slightly outdated 433 * super.active to disk. It will be the address of the most recent root that has 434 * gone to disk. 435 */ 436 superPut(bs, &super, 1); 437 438 return 1; 439 } 440 441 int 442 saveQid(Fs *fs) 443 { 444 Block *b; 445 Super super; 446 u64int qidMax; 447 448 if((b = superGet(fs->cache, &super)) == nil) 449 return 0; 450 qidMax = super.qid; 451 blockPut(b); 452 453 if(!fileSetQidSpace(fs->file, 0, qidMax)) 454 return 0; 455 456 return 1; 457 } 458 459 int 460 fsSnapshot(Fs *fs, int doarchive) 461 { 462 File *src, *dst; 463 464 assert(fs->mode == OReadWrite); 465 466 dst = nil; 467 468 /* 469 * Freeze file system activity. 470 */ 471 vtLock(fs->elk); 472 473 /* 474 * Get the root of the directory we're going to save. 475 */ 476 src = fileOpen(fs, "/active"); 477 if(src == nil) 478 goto Err; 479 480 /* 481 * It is important that we maintain the invariant that: 482 * if both b and bb are marked as Active with epoch e 483 * and b points at bb, then no other pointers to bb exist. 484 * 485 * The archiver uses this property to aggressively reclaim 486 * such blocks once they have been stored on Venti, and 487 * blockCleanup knows about this property as well. 488 * 489 * Let's say src->source is block sb, and src->msource is block 490 * mb. Let's also say that block b holds the Entry structures for 491 * both src->source and src->msource (their Entry structures might 492 * be in different blocks, but the argument is the same). 493 * That is, right now we have: 494 * 495 * b Active w/ epoch e, holds ptrs to sb and mb. 496 * sb Active w/ epoch e. 497 * mb Active w/ epoch e. 498 * 499 * With things as they are now, the invariant requires that 500 * b holds the only pointers to sb and mb. We want to record 501 * pointers to sb and mb in new Entries corresponding to dst, 502 * which breaks the invariant. Thus we need to do something 503 * about b. Specifically, we bump the file system's epoch and 504 * then rewalk the path from the root down to and including b. 505 * This will copy-on-write as we walk, so now the state will be: 506 * 507 * b Snap w/ epoch e, holds ptrs to sb and mb. 508 * new-b Active w/ epoch e+1, holds ptrs to sb and mb. 509 * sb Active w/ epoch e. 510 * mb Active w/ epoch e. 511 * 512 * In this state, it's perfectly okay to add pointers to dst, which 513 * will live in a block marked Active with epoch e+1. 514 * 515 * Of course, we need to make sure that the copied path makes 516 * it out to disk before the new dst block; if the dst block goes out 517 * first and then we crash, the invariant is violated. Rather than 518 * deal with the dependencies, we just sync the file system to disk 519 * right now. 520 */ 521 if(!bumpEpoch(fs, 0) || !fileWalkSources(src)) 522 goto Err; 523 524 /* 525 * Sync to disk. 526 */ 527 cacheFlush(fs->cache, 1); 528 529 /* 530 * Create the directory where we will store the copy of src. 531 */ 532 dst = fileOpenSnapshot(fs, doarchive); 533 if(dst == nil) 534 goto Err; 535 536 /* 537 * Actually make the copy by setting dst's source and msource 538 * to be src's. 539 */ 540 if(!fileSnapshot(dst, src, fs->ehi-1, doarchive)) 541 goto Err; 542 543 fileDecRef(src); 544 fileDecRef(dst); 545 /* 546 * Make another copy of the file system. This one is for the 547 * archiver, so that the file system we archive has the recently 548 * added snapshot both in /active and in /archive/yyyy/mmdd[.#]. 549 */ 550 if(doarchive){ 551 if(!saveQid(fs)) 552 goto Err; 553 if(!bumpEpoch(fs, 1)) 554 goto Err; 555 } 556 557 vtUnlock(fs->elk); 558 559 /* BUG? can fs->arch fall out from under us here? */ 560 if(doarchive && fs->arch) 561 archKick(fs->arch); 562 563 return 1; 564 565 Err: 566 fprint(2, "fsSnapshot: %R\n"); 567 if(src) 568 fileDecRef(src); 569 if(dst) 570 fileDecRef(dst); 571 vtUnlock(fs->elk); 572 return 0; 573 } 574 575 int 576 fsVac(Fs *fs, char *name, uchar score[VtScoreSize]) 577 { 578 int r; 579 DirEntry de; 580 Entry e, ee; 581 File *f; 582 583 vtRLock(fs->elk); 584 f = fileOpen(fs, name); 585 if(f == nil){ 586 vtRUnlock(fs->elk); 587 return 0; 588 } 589 590 if(!fileGetSources(f, &e, &ee, 0) || !fileGetDir(f, &de)){ 591 fileDecRef(f); 592 vtRUnlock(fs->elk); 593 return 0; 594 } 595 fileDecRef(f); 596 597 r = mkVac(fs->z, fs->blockSize, &e, &ee, &de, score); 598 vtRUnlock(fs->elk); 599 return r; 600 } 601 602 static int 603 vtWriteBlock(VtSession *z, uchar *buf, uint n, uint type, uchar score[VtScoreSize]) 604 { 605 if(!vtWrite(z, score, type, buf, n)) 606 return 0; 607 if(!vtSha1Check(score, buf, n)) 608 return 0; 609 return 1; 610 } 611 612 int 613 mkVac(VtSession *z, uint blockSize, Entry *pe, Entry *pee, DirEntry *pde, uchar score[VtScoreSize]) 614 { 615 uchar buf[8192]; 616 int i; 617 uchar *p; 618 uint n; 619 DirEntry de; 620 Entry e, ee, eee; 621 MetaBlock mb; 622 MetaEntry me; 623 VtRoot root; 624 625 e = *pe; 626 ee = *pee; 627 de = *pde; 628 629 if(globalToLocal(e.score) != NilBlock 630 || (ee.flags&VtEntryActive && globalToLocal(ee.score) != NilBlock)){ 631 vtSetError("can only vac paths already stored on venti"); 632 return 0; 633 } 634 635 /* 636 * Build metadata source for root. 637 */ 638 n = deSize(&de); 639 if(n+MetaHeaderSize+MetaIndexSize > sizeof buf){ 640 vtSetError("DirEntry too big"); 641 return 0; 642 } 643 memset(buf, 0, sizeof buf); 644 mbInit(&mb, buf, n+MetaHeaderSize+MetaIndexSize, 1); 645 p = mbAlloc(&mb, n); 646 if(p == nil) 647 abort(); 648 mbSearch(&mb, de.elem, &i, &me); 649 assert(me.p == nil); 650 me.p = p; 651 me.size = n; 652 dePack(&de, &me); 653 mbInsert(&mb, i, &me); 654 mbPack(&mb); 655 656 eee.size = n+MetaHeaderSize+MetaIndexSize; 657 if(!vtWriteBlock(z, buf, eee.size, VtDataType, eee.score)) 658 return 0; 659 eee.psize = 8192; 660 eee.dsize = 8192; 661 eee.depth = 0; 662 eee.flags = VtEntryActive; 663 664 /* 665 * Build root source with three entries in it. 666 */ 667 entryPack(&e, buf, 0); 668 entryPack(&ee, buf, 1); 669 entryPack(&eee, buf, 2); 670 671 n = VtEntrySize*3; 672 memset(&root, 0, sizeof root); 673 if(!vtWriteBlock(z, buf, n, VtDirType, root.score)) 674 return 0; 675 676 /* 677 * Save root. 678 */ 679 root.version = VtRootVersion; 680 strecpy(root.type, root.type+sizeof root.type, "vac"); 681 strecpy(root.name, root.name+sizeof root.name, de.elem); 682 root.blockSize = blockSize; 683 vtRootPack(&root, buf); 684 if(!vtWriteBlock(z, buf, VtRootSize, VtRootType, score)) 685 return 0; 686 687 return 1; 688 } 689 690 int 691 fsSync(Fs *fs) 692 { 693 vtLock(fs->elk); 694 cacheFlush(fs->cache, 1); 695 vtUnlock(fs->elk); 696 return 1; 697 } 698 699 int 700 fsNextQid(Fs *fs, u64int *qid) 701 { 702 Block *b; 703 Super super; 704 705 if((b = superGet(fs->cache, &super)) == nil) 706 return 0; 707 708 *qid = super.qid++; 709 710 /* 711 * It's okay if the super block doesn't go to disk immediately, 712 * since fileMetaAlloc will record a dependency between the 713 * block holding this qid and the super block. See file.c:/^fileMetaAlloc. 714 */ 715 superPut(b, &super, 0); 716 return 1; 717 } 718 719 static void 720 fsMetaFlush(void *a) 721 { 722 Fs *fs = a; 723 724 vtRLock(fs->elk); 725 fileMetaFlush(fs->file, 1); 726 vtRUnlock(fs->elk); 727 cacheFlush(fs->cache, 0); 728 } 729 730 struct Snap 731 { 732 Fs *fs; 733 Periodic *tick; 734 VtLock *lk; 735 uint snapMinutes; 736 uint archMinute; 737 u32int lastSnap; 738 u32int lastArch; 739 uint ignore; 740 }; 741 742 static void 743 snapEvent(void *v) 744 { 745 Snap *s; 746 u32int now, min; 747 Tm tm; 748 749 s = v; 750 751 now = time(0)/60; 752 vtLock(s->lk); 753 754 /* 755 * Snapshots happen every snapMinutes minutes. 756 * If we miss a snapshot (for example, because we 757 * were down), we wait for the next one. 758 */ 759 if(s->snapMinutes != ~0 && s->snapMinutes != 0 760 && now%s->snapMinutes==0 && now != s->lastSnap){ 761 if(0)fprint(2, "snapshot %02d%02d\n", now/60, now%60); 762 if(!fsSnapshot(s->fs, 0)) 763 fprint(2, "fsSnapshot snap: %R\n"); 764 s->lastSnap = now; 765 } 766 767 /* 768 * Archival snapshots happen at archMinute. 769 */ 770 tm = *localtime(now*60); 771 min = tm.hour*60+tm.min; 772 if(s->archMinute != ~0 && min == s->archMinute && now != s->lastArch){ 773 if(0)fprint(2, "archive %02d%02d\n", now/60, now%60); 774 if(!fsSnapshot(s->fs, 1)) 775 fprint(2, "fsSnapshot arch: %R\n"); 776 s->lastArch = now; 777 } 778 vtUnlock(s->lk); 779 } 780 781 static Snap* 782 snapInit(Fs *fs) 783 { 784 Snap *s; 785 786 s = vtMemAllocZ(sizeof(Snap)); 787 s->fs = fs; 788 s->tick = periodicAlloc(snapEvent, s, 10*1000); 789 s->lk = vtLockAlloc(); 790 s->snapMinutes = -1; 791 s->archMinute = -1; 792 s->ignore = 5*2; /* wait five minutes for clock to stabilize */ 793 return s; 794 } 795 796 void 797 snapGetTimes(Snap *s, u32int *arch, u32int *snap) 798 { 799 vtLock(s->lk); 800 *snap = s->snapMinutes; 801 *arch = s->archMinute; 802 vtUnlock(s->lk); 803 } 804 805 void 806 snapSetTimes(Snap *s, u32int arch, u32int snap) 807 { 808 vtLock(s->lk); 809 s->snapMinutes = snap; 810 s->archMinute = arch; 811 vtUnlock(s->lk); 812 } 813 814 static void 815 snapClose(Snap *s) 816 { 817 if(s == nil) 818 return; 819 820 periodicKill(s->tick); 821 vtMemFree(s); 822 } 823 824