1 typedef struct Config Config; 2 typedef struct AMap AMap; 3 typedef struct AMapN AMapN; 4 typedef struct Arena Arena; 5 typedef struct AState AState; 6 typedef struct ArenaCIG ArenaCIG; 7 typedef struct ArenaHead ArenaHead; 8 typedef struct ArenaPart ArenaPart; 9 typedef struct ArenaTail ArenaTail; 10 typedef struct ATailStats ATailStats; 11 typedef struct CIBlock CIBlock; 12 typedef struct Clump Clump; 13 typedef struct ClumpInfo ClumpInfo; 14 typedef struct Graph Graph; 15 typedef struct IAddr IAddr; 16 typedef struct IBucket IBucket; 17 typedef struct IEStream IEStream; 18 typedef struct IEntry IEntry; 19 typedef struct IFile IFile; 20 typedef struct ISect ISect; 21 typedef struct Index Index; 22 typedef struct Lump Lump; 23 typedef struct DBlock DBlock; 24 typedef struct Part Part; 25 typedef struct Statbin Statbin; 26 typedef struct Statdesc Statdesc; 27 typedef struct Stats Stats; 28 typedef struct ZBlock ZBlock; 29 typedef struct Round Round; 30 typedef struct Bloom Bloom; 31 32 #pragma incomplete IEStream 33 34 #define TWID32 ((u32int)~(u32int)0) 35 #define TWID64 ((u64int)~(u64int)0) 36 #define TWID8 ((u8int)~(u8int)0) 37 38 enum 39 { 40 ABlockLog = 9, /* log2(512), the quantum for reading arenas */ 41 ANameSize = 64, 42 MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */ 43 MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */ 44 PartBlank = 256*1024, /* untouched section at beginning of partition */ 45 HeadSize = 512, /* size of a header after PartBlank */ 46 MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */ 47 IndexBase = 1024*1024, /* initial address to use in an index */ 48 MaxIo = 64*1024, /* max size of a single read or write operation */ 49 ICacheBits = 16, /* default bits for indexing icache */ 50 MaxAMap = 31*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */ 51 Unspecified = TWID32, 52 53 /* 54 * return codes from syncarena 55 */ 56 SyncDataErr = 1 << 0, /* problem reading the clump data */ 57 SyncCIErr = 1 << 1, /* found erroneous clump directory entries */ 58 SyncCIZero = 1 << 2, /* found unwritten clump directory entries */ 59 SyncFixErr = 1 << 3, /* error writing fixed data */ 60 SyncHeader = 1 << 4, /* altered header fields */ 61 62 /* 63 * error severity 64 */ 65 EOk = 0, /* error expected in normal operation */ 66 EStrange, /* strange error that should be logged */ 67 ECorrupt, /* corrupted data found in arenas */ 68 EICorrupt, /* corrupted data found in index */ 69 EAdmin, /* should be brought to administrators' attention */ 70 ECrash, /* really bad internal error */ 71 EBug, /* a limitation which should be fixed */ 72 EInconsist, /* inconsistencies between index and arena */ 73 EMax, 74 75 /* 76 * internal disk formats for the venti archival storage system 77 */ 78 /* 79 * magic numbers on disk 80 */ 81 _ClumpMagic = 0xd15cb10cU, /* clump header, deprecated */ 82 ClumpFreeMagic = 0, /* free clump; terminates active clump log */ 83 84 ArenaPartMagic = 0xa9e4a5e7U, /* arena partition header */ 85 ArenaMagic = 0xf2a14eadU, /* arena trailer */ 86 ArenaHeadMagic = 0xd15c4eadU, /* arena header */ 87 88 BloomMagic = 0xb1004eadU, /* bloom filter header */ 89 BloomMaxHash = 32, 90 91 ISectMagic = 0xd15c5ec7U, /* index header */ 92 93 ArenaPartVersion = 3, 94 ArenaVersion4 = 4, 95 ArenaVersion5 = 5, 96 BloomVersion = 1, 97 IndexVersion = 1, 98 ISectVersion1 = 1, 99 ISectVersion2 = 2, 100 101 /* 102 * encodings of clumps on disk 103 */ 104 ClumpEErr = 0, /* can't happen */ 105 ClumpENone, /* plain */ 106 ClumpECompress, /* compressed */ 107 ClumpEMax, 108 109 /* 110 * sizes in bytes on disk 111 */ 112 U8Size = 1, 113 U16Size = 2, 114 U32Size = 4, 115 U64Size = 8, 116 117 ArenaPartSize = 4 * U32Size, 118 ArenaSize4 = 2 * U64Size + 6 * U32Size + ANameSize + U8Size, 119 ArenaSize5 = ArenaSize4 + U32Size, 120 ArenaSize5a = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size, 121 ArenaHeadSize4 = U64Size + 3 * U32Size + ANameSize, 122 ArenaHeadSize5 = ArenaHeadSize4 + U32Size, 123 BloomHeadSize = 4 * U32Size, 124 ISectSize1 = 7 * U32Size + 2 * ANameSize, 125 ISectSize2 = ISectSize1 + U32Size, 126 ClumpInfoSize = U8Size + 2 * U16Size + VtScoreSize, 127 ClumpSize = ClumpInfoSize + U8Size + 3 * U32Size, 128 MaxBloomSize = 1<<(32-3), /* 2^32 bits */ 129 MaxBloomHash = 32, /* bits per score */ 130 /* 131 * BUG - The various block copies that manipulate entry buckets 132 * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40, 133 * so that everything is word-aligned. Buildindex is actually cpu-bound 134 * by the (byte at a time) copying in qsort. 135 */ 136 IBucketSize = U32Size + U16Size, 137 IEntrySize = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize, 138 IEntryTypeOff = VtScoreSize + U32Size + U16Size + U64Size + U16Size, 139 IEntryAddrOff = VtScoreSize + U32Size + U16Size, 140 141 MaxClumpBlocks = (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog, 142 143 IcacheFrac = 1000000, /* denominator */ 144 145 SleepForever = 1000000000, /* magic value for sleep time */ 146 /* 147 * dirty flags - order controls disk write order 148 */ 149 DirtyArena = 1, 150 DirtyArenaCib, 151 DirtyArenaTrailer, 152 DirtyMax, 153 154 ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry. 155 156 VentiZZZZZZZZ 157 }; 158 159 extern char TraceDisk[]; 160 extern char TraceLump[]; 161 extern char TraceBlock[]; 162 extern char TraceProc[]; 163 extern char TraceWork[]; 164 extern char TraceQuiet[]; 165 extern char TraceRpc[]; 166 167 /* 168 * results of parsing and initializing a config file 169 */ 170 struct Config 171 { 172 char *index; /* name of the index to initialize */ 173 int naparts; /* arena partitions initialized */ 174 ArenaPart **aparts; 175 int nsects; /* index sections initialized */ 176 ISect **sects; 177 Bloom *bloom; /* bloom filter */ 178 u32int bcmem; 179 u32int mem; 180 u32int icmem; 181 int queuewrites; 182 char* haddr; 183 char* vaddr; 184 char* webroot; 185 }; 186 187 /* 188 * a Part is the low level interface to files or disks. 189 * there are two main types of partitions 190 * arena paritions, which some number of arenas, each in a sub-partition. 191 * index partition, which only have one subpartition. 192 */ 193 struct Part 194 { 195 int fd; /* rock for accessing the disk */ 196 int mode; 197 u64int offset; 198 u64int size; /* size of the partiton */ 199 u32int blocksize; /* block size for reads and writes */ 200 u32int fsblocksize; /* minimum file system block size */ 201 char *name; 202 char *filename; 203 Channel *writechan; /* chan[dcache.nblock](DBlock*) */ 204 }; 205 206 /* 207 * a cached block from the partition 208 * yuck -- most of this is internal structure for the cache 209 * all other routines should only use data 210 */ 211 struct DBlock 212 { 213 u8int *data; 214 215 Part *part; /* partition in which cached */ 216 u64int addr; /* base address on the partition */ 217 u32int size; /* amount of data available, not amount allocated; should go away */ 218 u32int mode; 219 u32int dirty; 220 u32int dirtying; 221 DBlock *next; /* doubly linked hash chains */ 222 DBlock *prev; 223 u32int heap; /* index in heap table */ 224 u32int used; /* last reference times */ 225 u32int used2; 226 u32int ref; /* reference count */ 227 RWLock lock; /* for access to data only */ 228 Channel *writedonechan; 229 void* chanbuf[1]; /* buffer for the chan! */ 230 }; 231 232 /* 233 * a cached block from the partition 234 * yuck -- most of this is internal structure for the cache 235 * all other routines should only use data 236 * double yuck -- this is mostly the same as a DBlock 237 */ 238 struct Lump 239 { 240 Packet *data; 241 242 Part *part; /* partition in which cached */ 243 u8int score[VtScoreSize]; /* score of packet */ 244 u8int type; /* type of packet */ 245 u32int size; /* amount of data allocated to hold packet */ 246 Lump *next; /* doubly linked hash chains */ 247 Lump *prev; 248 u32int heap; /* index in heap table */ 249 u32int used; /* last reference times */ 250 u32int used2; 251 u32int ref; /* reference count */ 252 QLock lock; /* for access to data only */ 253 }; 254 255 /* 256 * mapping between names and address ranges 257 */ 258 struct AMap 259 { 260 u64int start; 261 u64int stop; 262 char name[ANameSize]; 263 }; 264 265 /* 266 * an AMap along with a length 267 */ 268 struct AMapN 269 { 270 int n; 271 AMap *map; 272 }; 273 274 /* 275 * an ArenaPart is a partition made up of Arenas 276 * it exists because most os's don't support many partitions, 277 * and we want to have many different Arenas 278 */ 279 struct ArenaPart 280 { 281 Part *part; 282 u64int size; /* size of underlying partition, rounded down to blocks */ 283 Arena **arenas; 284 u32int tabbase; /* base address of arena table on disk */ 285 u32int tabsize; /* max. bytes in arena table */ 286 287 /* 288 * fields stored on disk 289 */ 290 u32int version; 291 u32int blocksize; /* "optimal" block size for reads and writes */ 292 u32int arenabase; /* base address of first arena */ 293 294 /* 295 * stored in the arena mapping table on disk 296 */ 297 AMap *map; 298 int narenas; 299 }; 300 301 /* 302 * info about one block in the clump info cache 303 */ 304 struct CIBlock 305 { 306 u32int block; /* blocks in the directory */ 307 int offset; /* offsets of one clump in the data */ 308 DBlock *data; 309 }; 310 311 /* 312 * Statistics kept in the tail. 313 */ 314 struct ATailStats 315 { 316 u32int clumps; /* number of clumps */ 317 u32int cclumps; /* number of compressed clumps */ 318 u64int used; 319 u64int uncsize; 320 u8int sealed; 321 }; 322 323 /* 324 * Arena state - represents a point in the data log 325 */ 326 struct AState 327 { 328 Arena *arena; 329 u64int aa; /* index address */ 330 ATailStats stats; 331 }; 332 333 /* 334 * an Arena is a log of Clumps, preceeded by an ArenaHeader, 335 * and followed by a Arena, each in one disk block. 336 * struct on disk is not always up to date, but should be self-consistent. 337 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found. 338 * <struct name="Arena" type="Arena *"> 339 * <field name="name" val="s->name" type="AName"/> 340 * <field name="version" val="s->version" type="U32int"/> 341 * <field name="partition" val="s->part->name" type="AName"/> 342 * <field name="blocksize" val="s->blocksize" type="U32int"/> 343 * <field name="start" val="s->base" type="U64int"/> 344 * <field name="stop" val="s->base+2*s->blocksize" type="U64int"/> 345 * <field name="created" val="s->ctime" type="U32int"/> 346 * <field name="modified" val="s->wtime" type="U32int"/> 347 * <field name="sealed" val="s->sealed" type="Sealed"/> 348 * <field name="score" val="s->score" type="Score"/> 349 * <field name="clumps" val="s->clumps" type="U32int"/> 350 * <field name="compressedclumps" val="s->cclumps" type="U32int"/> 351 * <field name="data" val="s->uncsize" type="U64int"/> 352 * <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/> 353 * <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/> 354 * </struct> 355 */ 356 struct Arena 357 { 358 QLock lock; /* lock for arena fields, writing to disk */ 359 Part *part; /* partition in which arena lives */ 360 int blocksize; /* size of block to read or write */ 361 u64int base; /* base address on disk */ 362 u64int size; /* total space in the arena */ 363 u8int score[VtScoreSize]; /* score of the entire sealed & summed arena */ 364 365 int clumpmax; /* ClumpInfos per block */ 366 AState mem; 367 int inqueue; 368 369 /* 370 * fields stored on disk 371 */ 372 u32int version; 373 char name[ANameSize]; /* text label */ 374 ATailStats memstats; 375 ATailStats diskstats; 376 u32int ctime; /* first time a block was written */ 377 u32int wtime; /* last time a block was written */ 378 u32int clumpmagic; 379 380 ArenaCIG *cig; 381 int ncig; 382 }; 383 384 struct ArenaCIG 385 { 386 u64int offset; // from arena base 387 }; 388 389 /* 390 * redundant storage of some fields at the beginning of each arena 391 */ 392 struct ArenaHead 393 { 394 u32int version; 395 char name[ANameSize]; 396 u32int blocksize; 397 u64int size; 398 u32int clumpmagic; 399 }; 400 401 /* 402 * most interesting meta information for a clump. 403 * stored in each clump's header and in the Arena's directory, 404 * stored in reverse order just prior to the arena trailer 405 */ 406 struct ClumpInfo 407 { 408 u8int type; 409 u16int size; /* size of disk data, not including header */ 410 u16int uncsize; /* size of uncompressed data */ 411 u8int score[VtScoreSize]; /* score of the uncompressed data only */ 412 }; 413 414 /* 415 * header for an immutable clump of data 416 */ 417 struct Clump 418 { 419 ClumpInfo info; 420 u8int encoding; 421 u32int creator; /* initial client which wrote the block */ 422 u32int time; /* creation at gmt seconds since 1/1/1970 */ 423 }; 424 425 /* 426 * index of all clumps according to their score 427 * this is just a wrapper to tie together the index sections 428 * <struct name="Index" type="Index *"> 429 * <field name="name" val="s->name" type="AName"/> 430 * <field name="version" val="s->version" type="U32int"/> 431 * <field name="blocksize" val="s->blocksize" type="U32int"/> 432 * <field name="tabsize" val="s->tabsize" type="U32int"/> 433 * <field name="buckets" val="s->buckets" type="U32int"/> 434 * <field name="buckdiv" val="s->div" type="U32int"/> 435 * <field name="bitblocks" val="s->div" type="U32int"/> 436 * <field name="maxdepth" val="s->div" type="U32int"/> 437 * <field name="bitkeylog" val="s->div" type="U32int"/> 438 * <field name="bitkeymask" val="s->div" type="U32int"/> 439 * <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/> 440 * <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/> 441 * <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/> 442 * </struct> 443 * <struct name="Amap" type="AMap *"> 444 * <field name="name" val="s->name" type="AName"/> 445 * <field name="start" val="s->start" type="U64int"/> 446 * <field name="stop" val="s->stop" type="U64int"/> 447 * </struct> 448 */ 449 struct Index 450 { 451 u32int div; /* divisor for mapping score to bucket */ 452 u32int buckets; /* last bucket used in disk hash table */ 453 u32int blocksize; 454 u32int tabsize; /* max. bytes in index config */ 455 456 int mapalloc; /* first arena to check when adding a lump */ 457 Arena **arenas; /* arenas in the mapping */ 458 ISect **sects; /* sections which hold the buckets */ 459 Bloom *bloom; /* bloom filter */ 460 461 /* 462 * fields stored in config file 463 */ 464 u32int version; 465 char name[ANameSize]; /* text label */ 466 int nsects; 467 AMap *smap; /* mapping of buckets to index sections */ 468 int narenas; 469 AMap *amap; /* mapping from index addesses to arenas */ 470 471 QLock writing; 472 }; 473 474 /* 475 * one part of the bucket storage for an index. 476 * the index blocks are sequentially allocated 477 * across all of the sections. 478 */ 479 struct ISect 480 { 481 Part *part; 482 int blocklog; /* log2(blocksize) */ 483 int buckmax; /* max. entries in a index bucket */ 484 u32int tabbase; /* base address of index config table on disk */ 485 u32int tabsize; /* max. bytes in index config */ 486 Channel *writechan; 487 Channel *writedonechan; 488 void *ig; /* used by buildindex only */ 489 int ng; 490 491 /* 492 * fields stored on disk 493 */ 494 u32int version; 495 u32int bucketmagic; 496 char name[ANameSize]; /* text label */ 497 char index[ANameSize]; /* index owning the section */ 498 u32int blocksize; /* size of hash buckets in index */ 499 u32int blockbase; /* address of start of on disk index table */ 500 u32int blocks; /* total blocks on disk; some may be unused */ 501 u32int start; /* first bucket in this section */ 502 u32int stop; /* limit of buckets in this section */ 503 }; 504 505 /* 506 * externally interesting part of an IEntry 507 */ 508 struct IAddr 509 { 510 u64int addr; 511 u16int size; /* uncompressed size */ 512 u8int type; /* type of block */ 513 u8int blocks; /* arena io quanta for Clump + data */ 514 }; 515 516 /* 517 * entries in the index 518 * kept in IBuckets in the disk index table, 519 * cached in the memory ICache. 520 */ 521 struct IEntry 522 { 523 /* on disk data - 32 bytes*/ 524 u8int score[VtScoreSize]; 525 IAddr ia; 526 527 IEntry *nexthash; 528 IEntry *nextdirty; 529 IEntry *next; 530 IEntry *prev; 531 u8int state; 532 }; 533 enum { 534 IEClean = 0, 535 IEDirty = 1, 536 IESummary = 2, 537 }; 538 539 /* 540 * buckets in the on disk index table 541 */ 542 struct IBucket 543 { 544 u16int n; /* number of active indices */ 545 u32int buck; /* used by buildindex/checkindex only */ 546 u8int *data; 547 }; 548 549 /* 550 * temporary buffers used by individual threads 551 */ 552 struct ZBlock 553 { 554 u32int len; 555 u32int _size; 556 u8int *data; 557 u8int *free; 558 }; 559 560 /* 561 * simple input buffer for a '\0' terminated text file 562 */ 563 struct IFile 564 { 565 char *name; /* name of the file */ 566 ZBlock *b; /* entire contents of file */ 567 u32int pos; /* current position in the file */ 568 }; 569 570 struct Statdesc 571 { 572 char *name; 573 ulong max; 574 }; 575 576 /* keep in sync with stats.c:/statdesc and httpd.c:/graphname*/ 577 enum 578 { 579 StatRpcTotal, 580 StatRpcRead, 581 StatRpcReadOk, 582 StatRpcReadFail, 583 StatRpcReadBytes, 584 StatRpcReadTime, 585 StatRpcReadCached, 586 StatRpcReadCachedTime, 587 StatRpcReadUncached, 588 StatRpcReadUncachedTime, 589 StatRpcWrite, 590 StatRpcWriteNew, 591 StatRpcWriteOld, 592 StatRpcWriteFail, 593 StatRpcWriteBytes, 594 StatRpcWriteTime, 595 StatRpcWriteNewTime, 596 StatRpcWriteOldTime, 597 598 StatLcacheHit, 599 StatLcacheMiss, 600 StatLcacheRead, 601 StatLcacheWrite, 602 StatLcacheSize, 603 StatLcacheStall, 604 StatLcacheReadTime, 605 606 StatDcacheHit, 607 StatDcacheMiss, 608 StatDcacheLookup, 609 StatDcacheRead, 610 StatDcacheWrite, 611 StatDcacheDirty, 612 StatDcacheSize, 613 StatDcacheFlush, 614 StatDcacheStall, 615 StatDcacheLookupTime, 616 617 StatDblockStall, 618 StatLumpStall, 619 620 StatIcacheHit, 621 StatIcacheMiss, 622 StatIcacheRead, 623 StatIcacheWrite, 624 StatIcacheFill, 625 StatIcachePrefetch, 626 StatIcacheDirty, 627 StatIcacheSize, 628 StatIcacheFlush, 629 StatIcacheStall, 630 StatIcacheReadTime, 631 StatIcacheLookup, 632 StatScacheHit, 633 StatScachePrefetch, 634 635 StatBloomHit, 636 StatBloomMiss, 637 StatBloomFalseMiss, 638 StatBloomLookup, 639 StatBloomOnes, 640 StatBloomBits, 641 642 StatApartRead, 643 StatApartReadBytes, 644 StatApartWrite, 645 StatApartWriteBytes, 646 647 StatIsectRead, 648 StatIsectReadBytes, 649 StatIsectWrite, 650 StatIsectWriteBytes, 651 652 StatSumRead, 653 StatSumReadBytes, 654 655 StatCigLoad, 656 StatCigLoadTime, 657 658 NStat 659 }; 660 661 extern Statdesc statdesc[NStat]; 662 663 /* 664 * statistics about the operation of the server 665 * mainly for performance monitoring and profiling. 666 */ 667 struct Stats 668 { 669 ulong now; 670 ulong n[NStat]; 671 }; 672 673 struct Statbin 674 { 675 uint nsamp; 676 uint min; 677 uint max; 678 uint avg; 679 }; 680 681 struct Graph 682 { 683 long (*fn)(Stats*, Stats*, void*); 684 void *arg; 685 long t0; 686 long t1; 687 long min; 688 long max; 689 long wid; 690 long ht; 691 int fill; 692 }; 693 694 /* 695 * for kicking background processes that run one round after another after another 696 */ 697 struct Round 698 { 699 QLock lock; 700 Rendez start; 701 Rendez finish; 702 Rendez delaywait; 703 int delaytime; 704 int delaykick; 705 char* name; 706 int last; 707 int current; 708 int next; 709 int doanother; 710 }; 711 712 /* 713 * Bloom filter of stored block hashes 714 */ 715 struct Bloom 716 { 717 RWLock lk; /* protects nhash, nbits, tab, mb */ 718 QLock mod; /* one marker at a time, protects nb */ 719 int nhash; 720 ulong size; /* bytes in tab */ 721 ulong bitmask; /* to produce bit index */ 722 u8int *data; 723 Part *part; 724 Channel *writechan; 725 Channel *writedonechan; 726 }; 727 728 extern Index *mainindex; 729 extern u32int maxblocksize; /* max. block size used by any partition */ 730 extern int paranoid; /* should verify hashes on disk read */ 731 extern int queuewrites; /* put all lump writes on a queue and finish later */ 732 extern int readonly; /* only allowed to read the disk data */ 733 extern Stats stats; 734 extern u8int zeroscore[VtScoreSize]; 735 extern int compressblocks; 736 extern int writestodevnull; /* dangerous - for performance debugging */ 737 extern int collectstats; 738 extern QLock memdrawlock; 739 extern int icachesleeptime; 740 extern int minicachesleeptime; 741 extern int arenasumsleeptime; 742 extern int manualscheduling; 743 extern int l0quantum; 744 extern int l1quantum; 745 extern int ignorebloom; 746 extern int icacheprefetch; 747 extern int syncwrites; 748 extern int debugarena; /* print in arena error msgs; -1==unknown */ 749 750 extern Stats *stathist; 751 extern int nstathist; 752 extern ulong stattime; 753 754 #ifndef PLAN9PORT 755 #pragma varargck type "V" uchar* 756 #define ODIRECT 0 757 #endif 758 759