xref: /dflybsd-src/sys/vfs/hammer2/hammer2_disk.h (revision 9443de1e697b0500d2fe9da1bb2a23769759c93b)
1  /*
2   * Copyright (c) 2011-2019 The DragonFly Project.  All rights reserved.
3   *
4   * This code is derived from software contributed to The DragonFly Project
5   * by Matthew Dillon <dillon@dragonflybsd.org>
6   * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7   *
8   * Redistribution and use in source and binary forms, with or without
9   * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer.
14   * 2. Redistributions in binary form must reproduce the above copyright
15   *    notice, this list of conditions and the following disclaimer in
16   *    the documentation and/or other materials provided with the
17   *    distribution.
18   * 3. Neither the name of The DragonFly Project nor the names of its
19   *    contributors may be used to endorse or promote products derived
20   *    from this software without specific, prior written permission.
21   *
22   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23   * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26   * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27   * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30   * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33   * SUCH DAMAGE.
34   */
35  
36  #ifndef _VFS_HAMMER2_DISK_H_
37  #define _VFS_HAMMER2_DISK_H_
38  
39  #ifndef _SYS_UUID_H_
40  #include <sys/uuid.h>
41  #endif
42  #ifndef _SYS_DMSG_H_
43  #include <sys/dmsg.h>
44  #endif
45  
46  /*
47   * The structures below represent the on-disk media structures for the HAMMER2
48   * filesystem.  Note that all fields for on-disk structures are naturally
49   * aligned.  The host endian format is typically used - compatibility is
50   * possible if the implementation detects reversed endian and adjusts accesses
51   * accordingly.
52   *
53   * HAMMER2 primarily revolves around the directory topology:  inodes,
54   * directory entries, and block tables.  Block device buffer cache buffers
55   * are always 64KB.  Logical file buffers are typically 16KB.  All data
56   * references utilize 64-bit byte offsets.
57   *
58   * Free block management is handled independently using blocks reserved by
59   * the media topology.
60   */
61  
62  /*
63   * The data at the end of a file or directory may be a fragment in order
64   * to optimize storage efficiency.  The minimum fragment size is 1KB.
65   * Since allocations are in powers of 2, fragments must also be sized in
66   * powers of 2 (1024, 2048, ... 65536).
67   *
68   * For the moment the maximum allocation size is HAMMER2_PBUFSIZE (64K),
69   * which is 2^16.  Larger extents may be supported in the future.  Smaller
70   * fragments might be supported in the future (down to 64 bytes is possible),
71   * but probably will not be.
72   *
73   * A full indirect block supports 512 x 128-byte blockrefs in a 64KB
74   * buffer.  Indirect blocks down to 1KB are supported to keep small
75   * directories small.
76   *
77   * A maximally sized file (2^64-1 bytes) requires ~6 indirect block levels
78   * using 64KB indirect blocks (128 byte refs, 512 or radix 9 per indblk).
79   *
80   *	16(datablk) + 9 + 9 + 9 + 9 + 9 + 9 = ~70.
81   *	16(datablk) + 7 + 9 + 9 + 9 + 9 + 9 = ~68.  (smaller top level indblk)
82   *
83   * The actual depth depends on copies redundancy and whether the filesystem
84   * has chosen to use a smaller indirect block size at the top level or not.
85   */
86  #define HAMMER2_ALLOC_MIN	1024	/* minimum allocation size (1 << RADIX_MIN) */
87  #define HAMMER2_RADIX_MIN	10	/* minimum allocation size 2^N */
88  #define HAMMER2_ALLOC_MAX	65536	/* maximum allocation size (1 << RADIX_MAX) */
89  #define HAMMER2_RADIX_MAX	16	/* maximum allocation size 2^N */
90  #define HAMMER2_RADIX_KEY	64	/* number of bits in key */
91  
92  /*
93   * HAMMER2_LBUFSIZE	- Nominal buffer size for I/O rollups.
94   *
95   * HAMMER2_PBUFSIZE	- Topological block size used by files for all
96   *			  blocks except the block straddling EOF.
97   *
98   * HAMMER2_SEGSIZE	- Allocation map segment size, typically 4MB
99   *			  (space represented by a level0 bitmap).
100   */
101  
102  #define HAMMER2_SEGSIZE		(1 << HAMMER2_FREEMAP_LEVEL0_RADIX)	/* 4MB */
103  #define HAMMER2_SEGRADIX	HAMMER2_FREEMAP_LEVEL0_RADIX	/* 22 */
104  
105  #define HAMMER2_PBUFRADIX	16	/* physical buf (1<<16) bytes */
106  #define HAMMER2_PBUFSIZE	65536	/* 1 << HAMMER2_PBUFRADIX */
107  #define HAMMER2_LBUFRADIX	14	/* logical buf (1<<14) bytes */
108  #define HAMMER2_LBUFSIZE	16384	/* 1 << HAMMER2_LBUFRADIX */
109  
110  #define HAMMER2_IND_BYTES_MIN	4096	/* 1 << HAMMER2_IND_RADIX_MIN */
111  #define HAMMER2_IND_BYTES_NOM	HAMMER2_LBUFSIZE	/* 16KB */
112  #define HAMMER2_IND_BYTES_MAX	HAMMER2_PBUFSIZE	/* 64KB */
113  #define HAMMER2_IND_RADIX_MIN	12
114  #define HAMMER2_IND_RADIX_NOM	HAMMER2_LBUFRADIX
115  #define HAMMER2_IND_RADIX_MAX	HAMMER2_PBUFRADIX
116  #define HAMMER2_IND_COUNT_MIN	(HAMMER2_IND_BYTES_MIN / \
117  				 sizeof(hammer2_blockref_t))	/* blockrefs in IND_BYTES_MIN */
118  #define HAMMER2_IND_COUNT_NOM	(HAMMER2_IND_BYTES_NOM / \
119  				 sizeof(hammer2_blockref_t))	/* blockrefs in IND_BYTES_NOM */
120  #define HAMMER2_IND_COUNT_MAX	(HAMMER2_IND_BYTES_MAX / \
121  				 sizeof(hammer2_blockref_t))	/* blockrefs in IND_BYTES_MAX */
122  
123  /*
124   * In HAMMER2, arrays of blockrefs are fully set-associative, meaning that
125   * any element can occur at any index and holes can be anywhere.
126   *
127   * Inodes embed either 512 bytes of direct data or an array of 4 blockrefs,
128   * resulting in highly efficient storage for files <= 512 bytes and for files
129   * <= 256KB.  Up to 4 directory entries can be referenced from a directory
130   * without requiring an indirect block.
131   */
132  #define HAMMER2_SET_RADIX		2	/* radix 2 = 4 entries */
133  #define HAMMER2_SET_COUNT		(1 << HAMMER2_SET_RADIX)	/* blockrefs per set */
134  #define HAMMER2_EMBEDDED_BYTES		512	/* inode blockset/dd size */
135  #define HAMMER2_EMBEDDED_RADIX		9	/* 1 << 9 == EMBEDDED_BYTES */
136  
137  #define HAMMER2_PBUFMASK	(HAMMER2_PBUFSIZE - 1)	/* low bits within a pbuf */
138  #define HAMMER2_LBUFMASK	(HAMMER2_LBUFSIZE - 1)	/* low bits within an lbuf */
139  #define HAMMER2_SEGMASK		(HAMMER2_SEGSIZE - 1)	/* low bits within a segment */
140  
141  #define HAMMER2_LBUFMASK64	((hammer2_off_t)HAMMER2_LBUFMASK)	/* hammer2_off_t forms */
142  #define HAMMER2_PBUFSIZE64	((hammer2_off_t)HAMMER2_PBUFSIZE)
143  #define HAMMER2_PBUFMASK64	((hammer2_off_t)HAMMER2_PBUFMASK)
144  #define HAMMER2_SEGSIZE64	((hammer2_off_t)HAMMER2_SEGSIZE)
145  #define HAMMER2_SEGMASK64	((hammer2_off_t)HAMMER2_SEGMASK)
146  
147  #define HAMMER2_UUID_STRING	"5cbb9ad1-862d-11dc-a94d-01301bb8a9f5"
148  
149  /*
150   * A 4MB segment is reserved at the beginning of each 1GB.  This segment
151   * contains the volume header (or backup volume header), the free block
152   * table, and possibly other information in the future.
153   *
154   * 4MB = 64 x 64K blocks.  Each 4MB segment is broken down as follows:
155   *
156   * ==========
157   *  0 volume header (for the first four 2GB zones)
158   *  1 freemap00 level1 FREEMAP_LEAF (256 x 128B bitmap data per 1GB)
159   *  2           level2 FREEMAP_NODE (256 x 128B indirect block per 256GB)
160   *  3           level3 FREEMAP_NODE (256 x 128B indirect block per 64TB)
161   *  4           level4 FREEMAP_NODE (256 x 128B indirect block per 16PB)
162   *  5           level5 FREEMAP_NODE (256 x 128B indirect block per 4EB)
163   *  6 freemap01 level1 (rotation)
164   *  7           level2
165   *  8           level3
166   *  9           level4
167   * 10           level5
168   * 11 freemap02 level1 (rotation)
169   * 12           level2
170   * 13           level3
171   * 14           level4
172   * 15           level5
173   * 16 freemap03 level1 (rotation)
174   * 17           level2
175   * 18           level3
176   * 19           level4
177   * 20           level5
178   * 21 freemap04 level1 (rotation)
179   * 22           level2
180   * 23           level3
181   * 24           level4
182   * 25           level5
183   * 26 freemap05 level1 (rotation)
184   * 27           level2
185   * 28           level3
186   * 29           level4
187   * 30           level5
188   * 31 freemap06 level1 (rotation)
189   * 32           level2
190   * 33           level3
191   * 34           level4
192   * 35           level5
193   * 36 freemap07 level1 (rotation)
194   * 37           level2
195   * 38           level3
196   * 39           level4
197   * 40           level5
198   * 41 unused
199   * .. unused
200   * 63 unused
201   * ==========
202   *
203   * The first four 2GB zones contain volume headers and volume header backups.
204   * After that the volume header block# is reserved for future use.  Similarly,
205   * there are many blocks related to various Freemap levels which are not
206   * used in every segment and those are also reserved for future use.
207   * Note that each FREEMAP_LEAF or FREEMAP_NODE uses 32KB out of 64KB slot.
208   *
209   *			Freemap (see the FREEMAP document)
210   *
211   * The freemap utilizes blocks #1-40 in 8 sets of 5 blocks.  Each block in
212   * a set represents a level of depth in the freemap topology.  Eight sets
213   * exist to prevent live updates from disturbing the state of the freemap
214   * were a crash/reboot to occur.  That is, a live update is not committed
215   * until the update's flush reaches the volume root.  There are FOUR volume
216   * roots representing the last four synchronization points, so the freemap
217   * must be consistent no matter which volume root is chosen by the mount
218   * code.
219   *
220   * Each freemap set is 5 x 64K blocks and represents the 1GB, 256GB, 64TB,
221   * 16PB and 4EB indirect map.  The volume header itself has a set of 4 freemap
222   * blockrefs representing another 2 bits, giving us a total 64 bits of
223   * representable address space.
224   *
225   * The Level 0 64KB block represents 1GB of storage represented by 32KB
226   * (256 x struct hammer2_bmap_data).  Each structure represents 4MB of storage
227   * and has a 512 bit bitmap, using 2 bits to represent a 16KB chunk of
228   * storage.  These 2 bits represent the following states:
229   *
230   *	00	Free
231   *	01	(reserved) (Possibly partially allocated)
232   *	10	Possibly free
233   *	11	Allocated
234   *
235   * One important thing to note here is that the freemap resolution is 16KB,
236   * but the minimum storage allocation size is 1KB.  The hammer2 vfs keeps
237   * track of sub-allocations in memory, which means that on an unmount or reboot
238   * the entire 16KB of a partially allocated block will be considered fully
239   * allocated.  It is possible for fragmentation to build up over time, but
240   * defragmentation is fairly easy to accomplish since all modifications
241   * allocate a new block.
242   *
243   * The second thing to note is that due to the way snapshots and inode
244   * replication works, deleting a file cannot immediately free the related
245   * space.  Furthermore, deletions often do not bother to traverse the
246   * block subhierarchy being deleted.  And to go even further, whole
247   * sub-directory trees can be deleted simply by deleting the directory inode
248   * at the top.  So even though we have a symbol to represent a 'possibly free'
249   * block (binary 10), only the bulk free scanning code can actually use it.
250   * Normal 'rm's or other deletions do not.
251   *
252   * WARNING!  ZONE_SEG and VOLUME_ALIGN must be a multiple of 1<<LEVEL0_RADIX
253   *	     (i.e. a multiple of 4MB).  VOLUME_ALIGN must be >= ZONE_SEG.
254   *
255   * In Summary:
256   *
257   * (1) Modifications to freemap blocks 'allocate' a new copy (aka use a block
258   *     from the next set).  The new copy is reused until a flush occurs at
259   *     which point the next modification will then rotate to the next set.
260   */
261  #define HAMMER2_VOLUME_ALIGN		(8 * 1024 * 1024)	/* 8MB */
262  #define HAMMER2_VOLUME_ALIGN64		((hammer2_off_t)HAMMER2_VOLUME_ALIGN)
263  #define HAMMER2_VOLUME_ALIGNMASK	(HAMMER2_VOLUME_ALIGN - 1)
264  #define HAMMER2_VOLUME_ALIGNMASK64	((hammer2_off_t)HAMMER2_VOLUME_ALIGNMASK)
265  
266  #define HAMMER2_NEWFS_ALIGN		(HAMMER2_VOLUME_ALIGN)	/* same as VOLUME_ALIGN */
267  #define HAMMER2_NEWFS_ALIGN64		((hammer2_off_t)HAMMER2_VOLUME_ALIGN)
268  #define HAMMER2_NEWFS_ALIGNMASK		(HAMMER2_VOLUME_ALIGN - 1)
269  #define HAMMER2_NEWFS_ALIGNMASK64	((hammer2_off_t)HAMMER2_NEWFS_ALIGNMASK)
270  
271  #define HAMMER2_ZONE_BYTES64		(2LLU * 1024 * 1024 * 1024)	/* 2GB */
272  #define HAMMER2_ZONE_MASK64		(HAMMER2_ZONE_BYTES64 - 1)
273  #define HAMMER2_ZONE_SEG		(4 * 1024 * 1024)	/* 4MB */
274  #define HAMMER2_ZONE_SEG64		((hammer2_off_t)HAMMER2_ZONE_SEG)
275  #define HAMMER2_ZONE_BLOCKS_SEG		(HAMMER2_ZONE_SEG / HAMMER2_PBUFSIZE)	/* 64 blocks */
276  
277  #define HAMMER2_ZONE_FREEMAP_INC	5	/* blocks per freemap set (5 deep) */
278  
279  #define HAMMER2_ZONE_VOLHDR		0	/* volume header or backup */
280  #define HAMMER2_ZONE_FREEMAP_00		1	/* normal freemap rotation */
281  #define HAMMER2_ZONE_FREEMAP_01		6	/* normal freemap rotation */
282  #define HAMMER2_ZONE_FREEMAP_02		11	/* normal freemap rotation */
283  #define HAMMER2_ZONE_FREEMAP_03		16	/* normal freemap rotation */
284  #define HAMMER2_ZONE_FREEMAP_04		21	/* normal freemap rotation */
285  #define HAMMER2_ZONE_FREEMAP_05		26	/* normal freemap rotation */
286  #define HAMMER2_ZONE_FREEMAP_06		31	/* normal freemap rotation */
287  #define HAMMER2_ZONE_FREEMAP_07		36	/* normal freemap rotation */
288  #define HAMMER2_ZONE_FREEMAP_END	41	/* (non-inclusive) 1 + 8 sets x 5 */
289  
290  #define HAMMER2_ZONE_UNUSED41		41
291  #define HAMMER2_ZONE_UNUSED42		42
292  #define HAMMER2_ZONE_UNUSED43		43
293  #define HAMMER2_ZONE_UNUSED44		44
294  #define HAMMER2_ZONE_UNUSED45		45
295  #define HAMMER2_ZONE_UNUSED46		46
296  #define HAMMER2_ZONE_UNUSED47		47
297  #define HAMMER2_ZONE_UNUSED48		48
298  #define HAMMER2_ZONE_UNUSED49		49
299  #define HAMMER2_ZONE_UNUSED50		50
300  #define HAMMER2_ZONE_UNUSED51		51
301  #define HAMMER2_ZONE_UNUSED52		52
302  #define HAMMER2_ZONE_UNUSED53		53
303  #define HAMMER2_ZONE_UNUSED54		54
304  #define HAMMER2_ZONE_UNUSED55		55
305  #define HAMMER2_ZONE_UNUSED56		56
306  #define HAMMER2_ZONE_UNUSED57		57
307  #define HAMMER2_ZONE_UNUSED58		58
308  #define HAMMER2_ZONE_UNUSED59		59
309  #define HAMMER2_ZONE_UNUSED60		60
310  #define HAMMER2_ZONE_UNUSED61		61
311  #define HAMMER2_ZONE_UNUSED62		62
312  #define HAMMER2_ZONE_UNUSED63		63
313  #define HAMMER2_ZONE_END		64	/* non-inclusive; 64 x 64KB = 4MB */
314  
315  #define HAMMER2_NFREEMAPS		8	/* FREEMAP_00 - FREEMAP_07 */
316  
317  						/* block index relative to FREEMAP_x */
318  #define HAMMER2_ZONEFM_LEVEL1		0	/* 1GB leafmap */
319  #define HAMMER2_ZONEFM_LEVEL2		1	/* 256GB indmap */
320  #define HAMMER2_ZONEFM_LEVEL3		2	/* 64TB indmap */
321  #define HAMMER2_ZONEFM_LEVEL4		3	/* 16PB indmap */
322  #define HAMMER2_ZONEFM_LEVEL5		4	/* 4EB indmap */
323  /* LEVEL6 is the set of 4 freemap blockrefs in the volume header (16EB) */
324  
325  /*
326   * Freemap radix.  Assumes a set-count of 4, 128-byte blockrefs,
327   * 32KB indirect block for freemap (LEVELN_PSIZE below).
328   *
329   * Leaf entry represents 4MB of storage broken down into a 512-bit
330   * bitmap, 2-bits per entry.  So each bitmap entry represents 16KB.
331   */
332  #if HAMMER2_SET_COUNT != 4
333  #error "hammer2_disk.h - freemap assumes SET_COUNT is 4"
334  #endif
335  #define HAMMER2_FREEMAP_LEVEL6_RADIX	64	/* 16EB (end) */
336  #define HAMMER2_FREEMAP_LEVEL5_RADIX	62	/* 4EB */
337  #define HAMMER2_FREEMAP_LEVEL4_RADIX	54	/* 16PB */
338  #define HAMMER2_FREEMAP_LEVEL3_RADIX	46	/* 64TB */
339  #define HAMMER2_FREEMAP_LEVEL2_RADIX	38	/* 256GB */
340  #define HAMMER2_FREEMAP_LEVEL1_RADIX	30	/* 1GB */
341  #define HAMMER2_FREEMAP_LEVEL0_RADIX	22	/* 4MB (x 256 in level-1 leaf) */
342  
343  #define HAMMER2_FREEMAP_LEVELN_PSIZE	32768	/* physical bytes (32KB of a 64KB slot) */
344  
345  #define HAMMER2_FREEMAP_LEVEL5_SIZE	((hammer2_off_t)1 <<		\
346  					 HAMMER2_FREEMAP_LEVEL5_RADIX)	/* 4EB */
347  #define HAMMER2_FREEMAP_LEVEL4_SIZE	((hammer2_off_t)1 <<		\
348  					 HAMMER2_FREEMAP_LEVEL4_RADIX)	/* 16PB */
349  #define HAMMER2_FREEMAP_LEVEL3_SIZE	((hammer2_off_t)1 <<		\
350  					 HAMMER2_FREEMAP_LEVEL3_RADIX)	/* 64TB */
351  #define HAMMER2_FREEMAP_LEVEL2_SIZE	((hammer2_off_t)1 <<		\
352  					 HAMMER2_FREEMAP_LEVEL2_RADIX)	/* 256GB */
353  #define HAMMER2_FREEMAP_LEVEL1_SIZE	((hammer2_off_t)1 <<		\
354  					 HAMMER2_FREEMAP_LEVEL1_RADIX)	/* 1GB */
355  #define HAMMER2_FREEMAP_LEVEL0_SIZE	((hammer2_off_t)1 <<		\
356  					 HAMMER2_FREEMAP_LEVEL0_RADIX)	/* 4MB */
357  
358  #define HAMMER2_FREEMAP_LEVEL5_MASK	(HAMMER2_FREEMAP_LEVEL5_SIZE - 1)	/* SIZE - 1 masks */
359  #define HAMMER2_FREEMAP_LEVEL4_MASK	(HAMMER2_FREEMAP_LEVEL4_SIZE - 1)
360  #define HAMMER2_FREEMAP_LEVEL3_MASK	(HAMMER2_FREEMAP_LEVEL3_SIZE - 1)
361  #define HAMMER2_FREEMAP_LEVEL2_MASK	(HAMMER2_FREEMAP_LEVEL2_SIZE - 1)
362  #define HAMMER2_FREEMAP_LEVEL1_MASK	(HAMMER2_FREEMAP_LEVEL1_SIZE - 1)
363  #define HAMMER2_FREEMAP_LEVEL0_MASK	(HAMMER2_FREEMAP_LEVEL0_SIZE - 1)
364  
365  #define HAMMER2_FREEMAP_COUNT		((int)(HAMMER2_FREEMAP_LEVELN_PSIZE / \
366  					 sizeof(hammer2_bmap_data_t)))	/* bmap_data entries in LEVELN_PSIZE bytes; fully parenthesized */
367  
368  /*
369   * XXX I made a mistake and made the reserved area begin at each LEVEL1 zone,
370   *     which is on a 1GB demark.  This will eat a little more space but for
371   *     now we retain compatibility and make FMZONEBASE every 1GB
372   */
373  #define H2FMZONEBASE(key)	((key) & ~HAMMER2_FREEMAP_LEVEL1_MASK)	/* clear low 30 bits (1GB base) */
374  #define H2FMBASE(key, radix)	rounddown2(key, (hammer2_off_t)1 << (radix))	/* NOTE(review): rounddown2() comes from outside this header; presumably rounds key down to a 2^radix boundary */
375  
376  /*
377   * 16KB bitmap granularity (x2 bits per entry).
378   */
379  #define HAMMER2_FREEMAP_BLOCK_RADIX	14	/* log2 of bitmap granularity */
380  #define HAMMER2_FREEMAP_BLOCK_SIZE	(1 << HAMMER2_FREEMAP_BLOCK_RADIX)	/* 16KB */
381  #define HAMMER2_FREEMAP_BLOCK_MASK	(HAMMER2_FREEMAP_BLOCK_SIZE - 1)
382  
383  /*
384   * bitmap[] structure.  2 bits per HAMMER2_FREEMAP_BLOCK_SIZE.
385   *
386   * 8 x 64-bit elements, 2 bits per block.
387   * 32 blocks (radix 5) per element.
388   * representing INDEX_SIZE bytes worth of storage per element.
389   */
390  
391  typedef uint64_t hammer2_bitmap_t;	/* one bitmap[] element */
392  
393  #define HAMMER2_BMAP_ALLONES		((hammer2_bitmap_t)-1)	/* every bit set */
394  #define HAMMER2_BMAP_ELEMENTS		8	/* 64-bit elements per bitmap[] */
395  #define HAMMER2_BMAP_BITS_PER_ELEMENT	64
396  #define HAMMER2_BMAP_INDEX_RADIX	5	/* 32 blocks per element */
397  #define HAMMER2_BMAP_BLOCKS_PER_ELEMENT	(1 << HAMMER2_BMAP_INDEX_RADIX)
398  
399  #define HAMMER2_BMAP_INDEX_SIZE		(HAMMER2_FREEMAP_BLOCK_SIZE * \
400  					 HAMMER2_BMAP_BLOCKS_PER_ELEMENT)	/* 16KB x 32 = 512KB per element */
401  #define HAMMER2_BMAP_INDEX_MASK		(HAMMER2_BMAP_INDEX_SIZE - 1)
402  
403  #define HAMMER2_BMAP_SIZE		(HAMMER2_BMAP_INDEX_SIZE * \
404  					 HAMMER2_BMAP_ELEMENTS)	/* 512KB x 8 = 4MB per bitmap[] */
405  #define HAMMER2_BMAP_MASK		(HAMMER2_BMAP_SIZE - 1)
406  
407  /*
408   * Two linear areas can be reserved after the initial 4MB segment in the base
409   * zone (the one starting at offset 0).  These areas are NOT managed by the
410   * block allocator and do not fall under HAMMER2 crc checking rules based
411   * at the volume header (but can be self-CRCd internally, depending).
412   */
413  #define HAMMER2_BOOT_MIN_BYTES		HAMMER2_VOLUME_ALIGN	/* 8MB */
414  #define HAMMER2_BOOT_NOM_BYTES		(64*1024*1024)	/* 64MB */
415  #define HAMMER2_BOOT_MAX_BYTES		(256*1024*1024)	/* 256MB */
416  
417  #define HAMMER2_AUX_MIN_BYTES		HAMMER2_VOLUME_ALIGN	/* 8MB */
418  #define HAMMER2_AUX_NOM_BYTES		(256*1024*1024)	/* 256MB */
419  #define HAMMER2_AUX_MAX_BYTES		(1024*1024*1024)	/* 1GB */
420  
421  /*
422   * Most HAMMER2 types are implemented as unsigned 64-bit integers.
423   * Transaction ids are monotonic.
424   *
425   * We utilize 32-bit iSCSI CRCs.
426   */
427  typedef uint64_t hammer2_tid_t;		/* transaction id (monotonic) */
428  typedef uint64_t hammer2_off_t;		/* 64-bit byte offset */
429  typedef uint64_t hammer2_key_t;		/* 64-bit key */
430  typedef uint32_t hammer2_crc32_t;	/* 32-bit iSCSI CRC */
431  
432  /*
433   * Miscellaneous ranges (all are unsigned).
434   */
435  #define HAMMER2_TID_MIN		1ULL			/* lowest tid value */
436  #define HAMMER2_TID_MAX		0xFFFFFFFFFFFFFFFFULL	/* all 64 bits set */
437  #define HAMMER2_KEY_MIN		0ULL			/* lowest key value */
438  #define HAMMER2_KEY_MAX		0xFFFFFFFFFFFFFFFFULL	/* all 64 bits set */
439  
440  /*
441   * HAMMER2 data offset special cases and masking.
442   *
443   * All HAMMER2 data offsets have to be broken down into a 64K buffer base
444   * offset (HAMMER2_OFF_MASK_HI) and a 64K buffer index (HAMMER2_OFF_MASK_LO).
445   *
446   * Indexes into physical buffers are always 64-byte aligned.  The low 6 bits
447   * of the data offset field specify how large the data chunk being pointed
448   * to is, as a power of 2.  The theoretical minimum radix is thus 6 (the space
449   * needed in the low bits of the data offset field).  However, the practical
450   * minimum allocation chunk size is 1KB (a radix of 10), so HAMMER2 sets
451   * HAMMER2_RADIX_MIN to 10.  The maximum radix is currently 16 (64KB), but
452   * we fully intend to support larger extents in the future.
453   *
454   * WARNING! A radix of 0 (such as when data_off is all 0's) is a special
455   *	    case which means no data associated with the blockref, and
456   *	    not the '1 byte' it would otherwise calculate to.
457   */
458  #define HAMMER2_OFF_MASK	0xFFFFFFFFFFFFFFC0ULL	/* offset bits (all but low 6) */
459  #define HAMMER2_OFF_MASK_LO	(HAMMER2_OFF_MASK & HAMMER2_PBUFMASK64)	/* index within 64K buffer */
460  #define HAMMER2_OFF_MASK_HI	(~HAMMER2_PBUFMASK64)	/* 64K buffer base offset */
461  #define HAMMER2_OFF_MASK_RADIX	0x000000000000003FULL	/* low 6 bits: size radix */
462  
463  /*
464   * HAMMER2 directory support and pre-defined keys
465   */
466  #define HAMMER2_DIRHASH_VISIBLE	0x8000000000000000ULL	/* bit 63 */
467  #define HAMMER2_DIRHASH_USERMSK	0x7FFFFFFFFFFFFFFFULL	/* bits 0-62 */
468  #define HAMMER2_DIRHASH_LOMASK	0x0000000000007FFFULL	/* bits 0-14 */
469  #if 0
470  #define HAMMER2_DIRHASH_HIMASK	0xFFFFFFFFFFFF0000ULL
471  #define HAMMER2_DIRHASH_FORCED	0x0000000000008000ULL	/* bit forced on */
472  #endif
473  
474  #define HAMMER2_SROOT_KEY	0x0000000000000000ULL	/* volume to sroot */
475  #define HAMMER2_BOOT_KEY	0xd9b36ce135528000ULL	/* sroot to BOOT PFS */
476  
477  /************************************************************************
478   *				DMSG SUPPORT				*
479   ************************************************************************
480   * LNK_VOLCONF
481   *
482   * All HAMMER2 directories directly under the super-root on your local
483   * media can be mounted separately, even if they share the same physical
484   * device.
485   *
486   * When you do a HAMMER2 mount you are effectively tying into a HAMMER2
487   * cluster via local media.  The local media does not have to participate
488   * in the cluster, other than to provide the hammer2_volconf[] array and
489   * root inode for the mount.
490   *
491   * This is important: The mount device path you specify serves to bootstrap
492   * your entry into the cluster, but your mount will make active connections
493   * to ALL copy elements in the hammer2_volconf[] array which match the
494   * PFSID of the directory in the super-root that you specified.  The local
495   * media path does not have to be mentioned in this array but becomes part
496   * of the cluster based on its type and access rights.  ALL ELEMENTS ARE
497   * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM.
498   *
499   * The actual cluster may be far larger than the elements you list in the
500   * hammer2_volconf[] array.  You list only the elements you wish to
501   * directly connect to and you are able to access the rest of the cluster
502   * indirectly through those connections.
503   *
504   * WARNING!  This structure must be exactly 128 bytes long for its config
505   *	     array to fit in the volume header.
506   */
507  struct hammer2_volconf {	/* 128 bytes; leading hex in comments = byte offset */
508  	uint8_t	copyid;		/* 00	 copyid 0-255 (must match slot) */
509  	uint8_t inprog;		/* 01	 operation in progress, or 0 */
510  	uint8_t chain_to;	/* 02	 operation chaining to, or 0 */
511  	uint8_t chain_from;	/* 03	 operation chaining from, or 0 */
512  	uint16_t flags;		/* 04-05 flags field (presumably DMSG_VOLF_ENABLED/INPROG; confirm) */
513  	uint8_t error;		/* 06	 last operational error */
514  	uint8_t priority;	/* 07	 priority and round-robin flag (presumably DMSG_VOLF_CONN_*; confirm) */
515  	uint8_t remote_pfs_type;/* 08	 probed direct remote PFS type */
516  	uint8_t reserved08[23];	/* 09-1F reserved (23 bytes; starts at 09 despite the name) */
517  	uuid_t	pfs_clid;	/* 20-2F copy target must match this uuid */
518  	uint8_t label[16];	/* 30-3F import/export label */
519  	uint8_t path[64];	/* 40-7F target specification string or key */
520  } __packed;
521  
522  typedef struct hammer2_volconf hammer2_volconf_t;
523  
524  #define DMSG_VOLF_ENABLED	0x0001	/* 16-bit value; NOTE(review): presumably for volconf flags - confirm */
525  #define DMSG_VOLF_INPROG	0x0002	/* 16-bit value; NOTE(review): presumably for volconf flags - confirm */
526  #define DMSG_VOLF_CONN_RR	0x80	/* round-robin at same priority */
527  #define DMSG_VOLF_CONN_EF	0x40	/* media errors flagged */
528  #define DMSG_VOLF_CONN_PRI	0x0F	/* select priority 0-15 (15=best) */
529  
530  struct dmsg_lnk_hammer2_volconf {	/* LNK_VOLCONF message carrying one volconf */
531  	dmsg_hdr_t		head;	/* message header */
532  	hammer2_volconf_t	copy;	/* copy spec */
533  	int32_t			index;	/* NOTE(review): presumably the volconf slot index - confirm */
534  	int32_t			unused01;
535  	uuid_t			mediaid;
536  	int64_t			reserved02[32];	/* 256 reserved bytes */
537  } __packed;
538  
539  typedef struct dmsg_lnk_hammer2_volconf dmsg_lnk_hammer2_volconf_t;
540  
541  #define DMSG_LNK_HAMMER2_VOLCONF DMSG_LNK(DMSG_LNK_CMD_HAMMER2_VOLCONF, \
542  					  dmsg_lnk_hammer2_volconf)
543  
544  #define H2_LNK_VOLCONF(msg)	((dmsg_lnk_hammer2_volconf_t *)(msg)->any.buf)	/* view msg->any.buf as this struct */
545  
546  /*
547   * HAMMER2 directory entry header (embedded in blockref)  exactly 16 bytes
548   */
549  struct hammer2_dirent_head {	/* 16 bytes; leading hex in comments = byte offset */
550  	hammer2_tid_t		inum;		/* 00-07 inode number */
551  	uint16_t		namlen;		/* 08-09 name length */
552  	uint8_t			type;		/* 0A    OBJTYPE_*	*/
553  	uint8_t			unused0B;	/* 0B    */
554  	uint8_t			unused0C[4];	/* 0C-0F */
555  } __packed;
556  
557  typedef struct hammer2_dirent_head hammer2_dirent_head_t;
558  
559  /*
560   * The media block reference structure.  This forms the core of the HAMMER2
561   * media topology recursion.  This 128-byte data structure is embedded in the
562   * volume header, in inodes (which are also directory entries), and in
563   * indirect blocks.
564   *
565   * A blockref references a single media item, which typically can be a
566   * directory entry (aka inode), indirect block, or data block.
567   *
568   * The primary feature a blockref represents is the ability to validate
569   * the entire tree underneath it via its check code.  Any modification to
570   * anything propagates up the blockref tree all the way to the root, replacing
571   * the related blocks and compounding the generated check code.
572   *
573   * The check code can be a simple 32-bit iscsi code, a 64-bit crc, or as
574   * complex as a 512 bit cryptographic hash.  I originally used a 64-byte
575   * blockref but later expanded it to 128 bytes to be able to support the
576   * larger check code as well as to embed statistics for quota operation.
577   *
578   * Simple check codes are not sufficient for unverified dedup.  Even with
579   * a maximally-sized check code unverified dedup should only be used in
580   * subdirectory trees where you do not need 100% data integrity.
581   *
582   * Unverified dedup is deduping based on meta-data only without verifying
583   * that the data blocks are actually identical.  Verified dedup guarantees
584   * integrity but is a far more I/O-expensive operation.
585   *
586   * --
587   *
588   * mirror_tid - per cluster node modified (propagated upward by flush)
589   * modify_tid - clc record modified (not propagated).
590   * update_tid - clc record updated (propagated upward on verification)
591   *
592   * CLC - Stands for 'Cluster Level Change', identifiers which are identical
593   *	 within the topology across all cluster nodes (when fully
594   *	 synchronized).
595   *
596   * NOTE: The range of keys represented by the blockref is (key) to
597   *	 ((key) + (1LL << keybits) - 1).  HAMMER2 usually populates
598   *	 blocks bottom-up, inserting a new root when radix expansion
599   *	 is required.
600   *
601   * leaf_count  - Helps manage leaf collapse calculations when indirect
602   *		 blocks become mostly empty.  This value caps out at
603   *		 HAMMER2_BLOCKREF_LEAF_MAX (65535).
604   *
605   *		 Used by the chain code to determine when to pull leafs up
606   *		 from nearly empty indirect blocks.  For the purposes of this
607   *		 calculation, BREF_TYPE_INODE is considered a leaf, along
608   *		 with DIRENT and DATA.
609   *
610   *				    RESERVED FIELDS
611   *
612   * A number of blockref fields are reserved and should generally be set to
613   * 0 for future compatibility.
614   *
615   *				FUTURE BLOCKREF EXPANSION
616   *
617   * CONTENT ADDRESSABLE INDEXING (future) - Using a 256 or 512-bit check code.
618   */
619  struct hammer2_blockref {		/* MUST BE EXACTLY 128 BYTES */
620  	uint8_t		type;		/* 00 type of underlying item (HAMMER2_BREF_TYPE_*) */
621  	uint8_t		methods;	/* 01 check method & compression method */
622  	uint8_t		copyid;		/* 02 specify which copy this is */
623  	uint8_t		keybits;	/* 03 #of keybits masked off 0=leaf */
624  	uint8_t		vradix;		/* 04 virtual data/meta-data size */
625  	uint8_t		flags;		/* 05 blockref flags (HAMMER2_BREF_FLAG_*) */
626  	uint16_t	leaf_count;	/* 06-07 leaf aggregation count */
627  	hammer2_key_t	key;		/* 08-0F key specification */
628  	hammer2_tid_t	mirror_tid;	/* 10-17 media flush topology & freemap */
629  	hammer2_tid_t	modify_tid;	/* 18-1F clc modify (not propagated) */
630  	hammer2_off_t	data_off;	/* 20-27 low 6 bits is phys size (radix)*/
631  	hammer2_tid_t	update_tid;	/* 28-2F clc update (propagated upward) */
632  	union {
633  		char	buf[16];
634  
635  		/*
636  		 * Directory entry header (BREF_TYPE_DIRENT)
637  		 *
638  		 * NOTE: check.buf contains filename if <= 64 bytes.  Longer
639  		 *	 filenames are stored in a data reference of size
640  		 *	 HAMMER2_ALLOC_MIN (at least 256, typically 1024).
641  		 *
642  		 * NOTE: inode structure may contain a copy of a recently
643  		 *	 associated filename, for recovery purposes.
644  		 *
645  		 * NOTE: Superroot entries are INODEs, not DIRENTs.  Code
646  		 *	 allows both cases.
647  		 */
648  		hammer2_dirent_head_t dirent;
649  
650  		/*
651  		 * Statistics aggregation (BREF_TYPE_INODE, BREF_TYPE_INDIRECT)
652  		 */
653  		struct {
654  			hammer2_key_t	data_count;
655  			hammer2_key_t	inode_count;
656  		} stats;
657  	} embed;			/* 30-3F 16 bytes, every member is 16 bytes */
658  	union {				/* 40-7F check info, every member is 64 bytes */
659  		char	buf[64];
660  		struct {
661  			uint32_t value;
662  			uint32_t reserved[15];
663  		} iscsi32;
664  		struct {
665  			uint64_t value;
666  			uint64_t reserved[7];
667  		} xxhash64;
668  		struct {
669  			char data[24];
670  			char reserved[40];
671  		} sha192;
672  		struct {
673  			char data[32];
674  			char reserved[32];
675  		} sha256;
676  		struct {
677  			char data[64];
678  		} sha512;
679  
680  		/*
681  		 * Freemap hints are embedded in addition to the icrc32.
682  		 *
683  		 * bigmask - Radixes available for allocation (0-31).
684  		 *	     Heuristical (may be permissive but not
685  		 *	     restrictive).  Typically only radix values
686  		 *	     10-16 are used (i.e. (1<<10) through (1<<16)).
687  		 *
688  		 * avail   - Total available space remaining, in bytes
689  		 */
690  		struct {
691  			uint32_t icrc32;
692  			uint32_t bigmask;	/* available radixes */
693  			uint64_t avail;		/* total available bytes */
694  			char reserved[48];
695  		} freemap;
696  	} check;
697  } __packed;
698  
699  typedef struct hammer2_blockref hammer2_blockref_t;
700  
701  #define HAMMER2_BLOCKREF_BYTES		128	/* blockref struct in bytes */
702  #define HAMMER2_BLOCKREF_RADIX		7	/* 1 << 7 == BLOCKREF_BYTES */
703  
704  #define HAMMER2_BLOCKREF_LEAF_MAX	65535	/* cap for the 16-bit leaf_count */
705  
706  /*
707   * On-media and off-media blockref types.
708   *
709   * types >= 128 are pseudo values that should never be present on-media.
710   */
711  #define HAMMER2_BREF_TYPE_EMPTY		0
712  #define HAMMER2_BREF_TYPE_INODE		1
713  #define HAMMER2_BREF_TYPE_INDIRECT	2
714  #define HAMMER2_BREF_TYPE_DATA		3
715  #define HAMMER2_BREF_TYPE_DIRENT	4
716  #define HAMMER2_BREF_TYPE_FREEMAP_NODE	5
717  #define HAMMER2_BREF_TYPE_FREEMAP_LEAF	6
718  #define HAMMER2_BREF_TYPE_INVALID	7
719  #define HAMMER2_BREF_TYPE_FREEMAP	254	/* pseudo-type (>= 128, never on-media) */
720  #define HAMMER2_BREF_TYPE_VOLUME	255	/* pseudo-type (>= 128, never on-media) */
721  
722  #define HAMMER2_BREF_FLAG_PFSROOT	0x01	/* see also related opflag */
723  #define HAMMER2_BREF_FLAG_UNUSED	0x02
724  #define HAMMER2_BREF_FLAG_EMERG_MIP	0x04	/* emerg modified-in-place */
725  
726  /*
727   * Check mode defaults to xxhash64.
728   */
729  #define HAMMER2_CHECK_NONE		0
730  #define HAMMER2_CHECK_DISABLED		1
731  #define HAMMER2_CHECK_ISCSI32		2
732  #define HAMMER2_CHECK_XXHASH64		3
733  #define HAMMER2_CHECK_SHA192		4
734  #define HAMMER2_CHECK_FREEMAP		5
735  
736  #define HAMMER2_CHECK_DEFAULT		HAMMER2_CHECK_XXHASH64
737  
738  /*
739   * Compression mode defaults to LZ4.
740   */
741  #define HAMMER2_COMP_NONE		0
742  #define HAMMER2_COMP_AUTOZERO		1
743  #define HAMMER2_COMP_LZ4		2
744  #define HAMMER2_COMP_ZLIB		3
745  
746  #define HAMMER2_COMP_DEFAULT		HAMMER2_COMP_LZ4
747  
748  /*
749   * Encode/decode check mode (high 4 bits) and compression mode (low 4 bits)
750   * for bref.methods.  The compression level is not encoded in bref.methods.
751   */
752  #define HAMMER2_ENC_CHECK(n)		(((n) & 15) << 4)
753  #define HAMMER2_DEC_CHECK(n)		(((n) >> 4) & 15)
754  #define HAMMER2_ENC_COMP(n)		((n) & 15)
755  #define HAMMER2_DEC_COMP(n)		((n) & 15)
756  
757  /*
758   * Encode/decode check or compression algorithm request (algo in low 4 bits,
759   * level in high 4 bits) in ipdata->meta.check_algo and ipdata->meta.comp_algo.
760   */
761  #define HAMMER2_ENC_ALGO(n)		(n)		/* NOTE: not masked */
762  #define HAMMER2_DEC_ALGO(n)		((n) & 15)
763  #define HAMMER2_ENC_LEVEL(n)		((n) << 4)	/* NOTE: not masked */
764  #define HAMMER2_DEC_LEVEL(n)		(((n) >> 4) & 15)
765  
766  /*
767   * HAMMER2 block references are collected into sets of 4 blockrefs.  These
768   * sets are fully associative, meaning the elements making up a set may
769   * contain duplicate entries and holes, but valid elements are always sorted.
770   *
771   * When redundancy is desired a set may contain several duplicate
772   * entries pointing to different copies of the same data.  Up to 4 copies
773   * are supported.  (Redundant copies are not implemented.)
774   *
775   * When a set fills up another level of indirection is inserted, moving
776   * some or all of the set's contents into indirect blocks placed under the
777   * set.  This is a top-down approach in that indirect blocks are not created
778   * until the set actually becomes full (that is, the entries in the set can
779   * shortcut the indirect blocks when the set is not full).  Depending on how
780   * things are filled multiple indirect blocks will eventually be created.
781   */
782  struct hammer2_blockset {
783  	hammer2_blockref_t	blockref[HAMMER2_SET_COUNT];
784  };
785  
786  typedef struct hammer2_blockset hammer2_blockset_t;
787  
788  /*
789   * Compile-time sanity checks: (1 << radix) must equal its count or size.
790   */
791  #if (1 << HAMMER2_SET_RADIX) != HAMMER2_SET_COUNT
792  #error "hammer2 direct radix is incorrect"
793  #endif
794  #if (1 << HAMMER2_PBUFRADIX) != HAMMER2_PBUFSIZE
795  #error "HAMMER2_PBUFRADIX and HAMMER2_PBUFSIZE are inconsistent"
796  #endif
797  #if (1 << HAMMER2_RADIX_MIN) != HAMMER2_ALLOC_MIN
798  #error "HAMMER2_RADIX_MIN and HAMMER2_ALLOC_MIN are inconsistent"
799  #endif
800  
801  /*
802   * hammer2_bmap_data - A freemap entry in the LEVEL1 block.
803   *
804   * Each 128-byte entry contains the bitmap and meta-data required to manage
805   * a LEVEL0 (4MB) block of storage.  The storage is managed in 256 x 16KB
806   * chunks.
807   *
808   * A smaller allocation granularity is supported via a linear iterator and/or
809   * must otherwise be tracked in ram.
810   *
811   * (data structure must be 128 bytes exactly)
812   *
813   * linear  - A BYTE linear allocation offset used for sub-16KB allocations
814   *	     only.  May contain values between 0 and 4MB.  Must be ignored
815   *	     if 16KB-aligned (i.e. force bitmap scan), otherwise may be
816   *	     used to sub-allocate within the 16KB block (which is already
817   *	     marked as allocated in the bitmap).
818   *
819   *	     Sub-allocations need only be 1KB-aligned and do not have to be
820   *	     size-aligned, and 16KB or larger allocations do not update this
821   *	     field, resulting in pretty good packing.
822   *
823   *	     Please note that file data granularity may be limited by
824   *	     other issues such as buffer cache direct-mapping and the
825   *	     desire to support sector sizes up to 16KB (so H2 only issues
826   *	     I/O's in multiples of 16KB anyway).
827   *
828   * class   - Clustering class.  Cleared to 0 only if the entire leaf becomes
829   *	     free.  Used to cluster device buffers so all elements must have
830   *	     the same device block size, but may mix logical sizes.
831   *
832   *	     Typically integrated with the blockref type in the upper 8 bits
833   *	     to localize inodes and indirect blocks, improving bulk free scans
834   *	     and directory scans.
835   *
836   * bitmapq - Two bits per 16KB allocation block arranged in arrays of
837   *	     64-bit elements, 256x2 bits representing ~4MB worth of media
838   *	     storage.  Bit patterns are as follows:
839   *
840   *	     00	Unallocated
841   *	     01 (reserved)
842   *	     10 Possibly free
843   *           11 Allocated
844   *
845   * ==========
846   * level6 freemap
847   * blockref[0]       : 4EB
848   * blockref[1]       : 4EB
849   * blockref[2]       : 4EB
850   * blockref[3]       : 4EB
851   * -----------------------------------------------------------------------
852   * 4 x 128B = 512B   : 4 x 4EB = 16EB
853   *
854   * level2-5 FREEMAP_NODE
855   * blockref[0]       : 1GB,256GB,64TB,16PB
856   * blockref[1]       : 1GB,256GB,64TB,16PB
857   * ...
858   * blockref[255]     : 1GB,256GB,64TB,16PB
859   * -----------------------------------------------------------------------
860   * 256 x 128B = 32KB : 256 x 1GB,256GB,64TB,16PB = 256GB,64TB,16PB,4EB
861   *
862   * level1 FREEMAP_LEAF
863   * bmap_data[0]      : 8 x 8B = 512bits = 256 x 2bits -> 256 x 16KB = 4MB
864   * bmap_data[1]      : 8 x 8B = 512bits = 256 x 2bits -> 256 x 16KB = 4MB
865   * ...
866   * bmap_data[255]    : 8 x 8B = 512bits = 256 x 2bits -> 256 x 16KB = 4MB
867   * -----------------------------------------------------------------------
868   * 256 x 128B = 32KB : 256 x 4MB = 1GB
869   * ==========
870   */
871  struct hammer2_bmap_data {
872  	int32_t linear;		/* 00-03 linear sub-granular allocation offset */
873  	uint16_t class;		/* 04-05 clustering class ((type<<8)|radix) */
874  	uint8_t reserved06;	/* 06 reserved */
875  	uint8_t reserved07;	/* 07 reserved */
876  	uint32_t reserved08;	/* 08 reserved */
877  	uint32_t reserved0C;	/* 0C reserved */
878  	uint32_t reserved10;	/* 10 reserved */
879  	uint32_t reserved14;	/* 14 reserved */
880  	uint32_t reserved18;	/* 18 reserved */
881  	uint32_t avail;		/* 1C-1F NOTE(review): units not documented here */
882  	uint32_t reserved20[8];	/* 20-3F reserved */
883  				/* 40-7F 512 bits (2 per 16KB chunk) manage 4MB */
884  	hammer2_bitmap_t bitmapq[HAMMER2_BMAP_ELEMENTS];
885  } __packed;			/* 00-7F: must be exactly 128 bytes */
886  
887  typedef struct hammer2_bmap_data hammer2_bmap_data_t;
888  
889  /*
890   * The inode number is stored in the inode rather than being
891   * based on the location of the inode (since the location moves every time
892   * the inode or anything underneath the inode is modified).
893   *
894   * The inode is 1024 bytes, made up of 256 bytes of meta-data, 256 bytes
895   * for the filename, and 512 bytes worth of direct file data OR an embedded
896   * blockset.  The in-memory hammer2_inode structure contains only the mostly-
897   * node-independent meta-data portion (some flags are node-specific and will
898   * not be synchronized).  The rest of the inode is node-specific and chain I/O
899   * is required to obtain it.
900   *
901   * Directories represent one inode per blockref.  Inodes are not laid out
902   * as a file but instead are represented by the related blockrefs.  The
903   * blockrefs, in turn, are indexed by the 64-bit directory hash key.  Remember
904   * that blocksets are fully associative, so a certain degree of efficiency is
905   * achieved just from that.
906   *
907   * Up to 512 bytes of direct data can be embedded in an inode, and since
908   * inodes are essentially directory entries this also means that small data
909   * files end up simply being laid out linearly in the directory, resulting
910   * in fewer seeks and highly optimal access.
911   *
912   * The compression mode can be changed at any time in the inode and is
913   * recorded on a blockref-by-blockref basis.
914   */
915  #define HAMMER2_INODE_BYTES		1024	/* (asserted by code) */
916  #define HAMMER2_INODE_MAXNAME		256	/* maximum name in bytes */
917  #define HAMMER2_INODE_VERSION_ONE	1
918  
919  #define HAMMER2_INODE_START		1024	/* dynamically allocated */
920  
921  struct hammer2_inode_meta {
922  	uint16_t	version;	/* 0000 inode data version */
923  	uint8_t		reserved02;	/* 0002 */
924  	uint8_t		pfs_subtype;	/* 0003 pfs sub-type */
925  
926  	/*
927  	 * core inode attributes, inode type, misc flags
928  	 */
929  	uint32_t	uflags;		/* 0004 chflags */
930  	uint32_t	rmajor;		/* 0008 available for device nodes */
931  	uint32_t	rminor;		/* 000C available for device nodes */
932  	uint64_t	ctime;		/* 0010 inode change time */
933  	uint64_t	mtime;		/* 0018 modified time */
934  	uint64_t	atime;		/* 0020 access time (unsupported) */
935  	uint64_t	btime;		/* 0028 birth time */
936  	uuid_t		uid;		/* 0030 uid / degenerate unix uid */
937  	uuid_t		gid;		/* 0040 gid / degenerate unix gid */
938  
939  	uint8_t		type;		/* 0050 object type */
940  	uint8_t		op_flags;	/* 0051 operational flags */
941  	uint16_t	cap_flags;	/* 0052 capability flags */
942  	uint32_t	mode;		/* 0054 unix modes (typ low 16 bits) */
943  
944  	/*
945  	 * inode size, identification, localized recursive configuration
946  	 * for compression and backup copies.
947  	 *
948  	 * NOTE: Nominal parent inode number (iparent) is only applicable
949  	 *	 for directories but can also help for files during
950  	 *	 catastrophic recovery.
951  	 */
952  	hammer2_tid_t	inum;		/* 0058 inode number */
953  	hammer2_off_t	size;		/* 0060 size of file */
954  	uint64_t	nlinks;		/* 0068 hard links (typ only dirs) */
955  	hammer2_tid_t	iparent;	/* 0070 nominal parent inum */
956  	hammer2_key_t	name_key;	/* 0078 full filename key */
957  	uint16_t	name_len;	/* 0080 filename length */
958  	uint8_t		ncopies;	/* 0082 ncopies to local media */
959  	uint8_t		comp_algo;	/* 0083 compression request & algo */
960  	uint8_t		unused84;	/* 0084 */
961  	uint8_t		check_algo;	/* 0085 check code request & algo */
962  
963  	/*
964  	 * These fields are currently only applicable to PFSROOTs.
965  	 *
966  	 * NOTE: We can't use {volume_data->fsid, pfs_clid} to uniquely
967  	 *	 identify an instance of a PFS in the cluster because
968  	 *	 a mount may contain more than one copy of the PFS as
969  	 *	 a separate node.  {pfs_clid, pfs_fsid} must be used for
970  	 *	 registration in the cluster.
971  	 */
972  	uint8_t		pfs_nmasters;	/* 0086 (if PFSROOT) if multi-master */
973  	uint8_t		pfs_type;	/* 0087 (if PFSROOT) node type */
974  	hammer2_tid_t	pfs_inum;	/* 0088 (if PFSROOT) inum allocator */
975  	uuid_t		pfs_clid;	/* 0090 (if PFSROOT) cluster uuid */
976  	uuid_t		pfs_fsid;	/* 00A0 (if PFSROOT) unique uuid */
977  
978  	/*
979  	 * Quotas and aggregate sub-tree inode and data counters.  Note that
980  	 * quotas are not replicated downward, they are explicitly set by
981  	 * the sysop and in-memory structures keep track of inheritance.
982  	 */
983  	hammer2_key_t	data_quota;	/* 00B0 subtree quota in bytes */
984  	hammer2_key_t	unusedB8;	/* 00B8 */
985  	hammer2_key_t	inode_quota;	/* 00C0 subtree quota inode count */
986  	hammer2_key_t	unusedC8;	/* 00C8 */
987  
988  	/*
989  	 * The last snapshot tid is tested against modify_tid to determine
990  	 * when a copy must be made of a data block whose check mode has been
991  	 * disabled (a disabled check mode allows data blocks to be updated
992  	 * in place instead of copy-on-write).
993  	 */
994  	hammer2_tid_t	pfs_lsnap_tid;	/* 00D0 last snapshot tid */
995  	hammer2_tid_t	reservedD8;	/* 00D8 (avail) */
996  
997  	/*
998  	 * Tracks (possibly degenerate) free areas covering all sub-tree
999  	 * allocations under inode, not counting the inode itself.
1000  	 * 0/0 indicates empty entry.  fully set-associative.
1001  	 * (not yet implemented)  NOTE(review): this text does not appear to
1002  	 * describe the decrypt_check/reserved fields below -- confirm.
1003  	 */
1004  	uint64_t	decrypt_check;	/* 00E0 decryption validator */
1005  	hammer2_off_t	reservedE8[3];	/* 00E8/F0/F8 */
1006  } __packed;
1007  
1008  typedef struct hammer2_inode_meta hammer2_inode_meta_t;
1009  
1010  struct hammer2_inode_data {
1011  	hammer2_inode_meta_t	meta;	/* 0000-00FF meta-data */
1012  	unsigned char	filename[HAMMER2_INODE_MAXNAME];
1013  					/* 0100-01FF (256 char, unterminated) */
1014  	union {				/* 0200-03FF (64x8 = 512 bytes) */
1015  		hammer2_blockset_t blockset;	/* embedded blockset, OR */
1016  		char data[HAMMER2_EMBEDDED_BYTES];	/* direct file data */
1017  	} u;
1018  } __packed;				/* 0000-03FF: 1024 bytes total */
1019  
1020  typedef struct hammer2_inode_data hammer2_inode_data_t;
1021  
1022  #define HAMMER2_OPFLAG_DIRECTDATA	0x01
1023  #define HAMMER2_OPFLAG_PFSROOT		0x02	/* see also HAMMER2_BREF_FLAG_PFSROOT */
1024  #define HAMMER2_OPFLAG_COPYIDS		0x04	/* copyids override parent */
1025  
1026  #define HAMMER2_OBJTYPE_UNKNOWN		0
1027  #define HAMMER2_OBJTYPE_DIRECTORY	1
1028  #define HAMMER2_OBJTYPE_REGFILE		2
1029  #define HAMMER2_OBJTYPE_FIFO		4	/* (value 3 is not defined) */
1030  #define HAMMER2_OBJTYPE_CDEV		5
1031  #define HAMMER2_OBJTYPE_BDEV		6
1032  #define HAMMER2_OBJTYPE_SOFTLINK	7
1033  #define HAMMER2_OBJTYPE_UNUSED08	8
1034  #define HAMMER2_OBJTYPE_SOCKET		9
1035  #define HAMMER2_OBJTYPE_WHITEOUT	10
1036  
1037  #define HAMMER2_COPYID_NONE		0		/* empty slot */
1038  #define HAMMER2_COPYID_LOCAL		((uint8_t)-1)	/* 255, local media */
1039  
1040  #define HAMMER2_COPYID_COUNT		256
1041  
1042  /*
1043   * PFS types identify the role of a PFS within a cluster.  The PFS type
1044   * is stored on media and in LNK_SPAN messages and used in other places.
1045   *
1046   * The low 4 bits specify the current active type while the high 4 bits
1047   * specify the transition target if the PFS is being upgraded or downgraded.
1048   * If the upper 4 bits are not zero it may affect how a PFS is used during
1049   * the transition.
1050   *
1051   * Generally speaking, downgrading a MASTER to a SLAVE cannot complete until
1052   * at least all MASTERs have updated their pfs_nmasters field.  And upgrading
1053   * a SLAVE to a MASTER cannot complete until the new prospective master has
1054   * been fully synchronized (though theoretically full synchronization is
1055   * not required if a (new) quorum of other masters are fully synchronized).
1056   *
1057   * It generally does not matter which PFS element you actually mount, you
1058   * are mounting 'the cluster'.  So, for example, a network mount will mount
1059   * a DUMMY PFS type on a memory filesystem.  However, there are two exceptions.
1060   * In order to gain the benefits of a SOFT_MASTER or SOFT_SLAVE, those PFSs
1061   * must be directly mounted.
1062   */
1063  #define HAMMER2_PFSTYPE_NONE		0x00
1064  #define HAMMER2_PFSTYPE_CACHE		0x01
1065  #define HAMMER2_PFSTYPE_UNUSED02	0x02
1066  #define HAMMER2_PFSTYPE_SLAVE		0x03
1067  #define HAMMER2_PFSTYPE_SOFT_SLAVE	0x04
1068  #define HAMMER2_PFSTYPE_SOFT_MASTER	0x05
1069  #define HAMMER2_PFSTYPE_MASTER		0x06
1070  #define HAMMER2_PFSTYPE_UNUSED07	0x07
1071  #define HAMMER2_PFSTYPE_SUPROOT		0x08
1072  #define HAMMER2_PFSTYPE_DUMMY		0x09
1073  #define HAMMER2_PFSTYPE_MAX		16	/* types occupy the low 4 bits */
1074  
1075  #define HAMMER2_PFSTRAN_NONE		0x00	/* no transition in progress */
1076  #define HAMMER2_PFSTRAN_CACHE		0x10
1077  #define HAMMER2_PFSTRAN_UNUSED20	0x20
1078  #define HAMMER2_PFSTRAN_SLAVE		0x30
1079  #define HAMMER2_PFSTRAN_SOFT_SLAVE	0x40
1080  #define HAMMER2_PFSTRAN_SOFT_MASTER	0x50
1081  #define HAMMER2_PFSTRAN_MASTER		0x60
1082  #define HAMMER2_PFSTRAN_UNUSED70	0x70
1083  #define HAMMER2_PFSTRAN_SUPROOT		0x80
1084  #define HAMMER2_PFSTRAN_DUMMY		0x90
1085  
1086  #define HAMMER2_PFS_DEC(n)		((n) & 0x0F)
1087  #define HAMMER2_PFS_DEC_TRANSITION(n)	(((n) >> 4) & 0x0F)
1088  #define HAMMER2_PFS_ENC_TRANSITION(n)	(((n) & 0x0F) << 4)
1089  
1090  #define HAMMER2_PFSSUBTYPE_NONE		0
1091  #define HAMMER2_PFSSUBTYPE_SNAPSHOT	1	/* manual/managed snapshot */
1092  #define HAMMER2_PFSSUBTYPE_AUTOSNAP	2	/* automatic snapshot */
1093  
1094  /*
1095   * PFS mode of operation is a bitmask.  This is typically not stored
1096   * on-media, but defined here because the field may be used in dmsgs.
1097   */
1098  #define HAMMER2_PFSMODE_QUORUM		0x01
1099  #define HAMMER2_PFSMODE_RW		0x02
1100  
1101  /*
1102   * The volume header eats a 64K block at the beginning of each 2GB zone
1103   * up to four copies.
1104   *
1105   * All information is stored in host byte order.  The volume header's magic
1106   * number may be checked to determine the byte order.  If you wish to mount
1107   * between machines w/ different endian modes you'll need filesystem code
1108   * which acts on the media data consistently (either all one way or all the
1109   * other).  Our code currently does not do that.
1110   *
1111   * A read-write mount may have to recover missing allocations by doing an
1112   * incremental mirror scan looking for modifications made after alloc_tid.
1113   * If alloc_tid == last_tid then no recovery operation is needed.  Recovery
1114   * operations are usually very, very fast.
1115   *
1116   * Read-only mounts do not need to do any recovery, access to the filesystem
1117   * topology is always consistent after a crash (is always consistent, period).
1118   * However, there may be shortcutted blockref updates present from deep in
1119   * the tree which are stored in the volume header and must be tracked on
1120   * the fly.
1121   *
1122   * NOTE: The copyinfo[] array contains the configuration for both the
1123   *	 cluster connections and any local media copies.  The volume
1124   *	 header will be replicated for each local media copy.
1125   *
1126   *	 The mount command may specify multiple medias or just one and
1127   *	 allow HAMMER2 to pick up the others when it checks the copyinfo[]
1128   *	 array on mount.
1129   *
1130   * NOTE: sroot_blockset points to the super-root directory, not the root
1131   *	 directory.  The root directory will be a subdirectory under the
1132   *	 super-root.
1133   *
1134   *	 The super-root directory contains all root directories and all
1135   *	 snapshots (readonly or writable).  It is possible to do a
1136   *	 null-mount of the super-root using special path constructions
1137   *	 relative to your mounted root.
1138   */
1139  #define HAMMER2_VOLUME_ID_HBO	0x48414d3205172011LLU
1140  #define HAMMER2_VOLUME_ID_ABO	0x11201705324d4148LLU	/* HBO byte-swapped */
1141  
1142  /*
1143   * If volume version is HAMMER2_VOL_VERSION_MULTI_VOLUMES or above, max
1144   * HAMMER2_MAX_VOLUMES volumes are supported. There must be 1 (and only 1)
1145   * volume with volume id HAMMER2_ROOT_VOLUME.
1146   * Otherwise filesystem only supports 1 volume, and that volume must have
1147   * volume id HAMMER2_ROOT_VOLUME(0) which was a reserved field then.
1148   */
1149  #define HAMMER2_MAX_VOLUMES	64
1150  #define HAMMER2_ROOT_VOLUME	0
1151  
1152  struct hammer2_volume_data {
1153  	/*
1154  	 * sector #0 - 512 bytes
1155  	 */
1156  	uint64_t	magic;			/* 0000 Signature */
1157  	hammer2_off_t	boot_beg;		/* 0008 Boot area (future) */
1158  	hammer2_off_t	boot_end;		/* 0010 (size = end - beg) */
1159  	hammer2_off_t	aux_beg;		/* 0018 Aux area (future) */
1160  	hammer2_off_t	aux_end;		/* 0020 (size = end - beg) */
1161  	hammer2_off_t	volu_size;		/* 0028 Volume size, bytes */
1162  
1163  	uint32_t	version;		/* 0030 HAMMER2_VOL_VERSION_* */
1164  	uint32_t	flags;			/* 0034 */
1165  	uint8_t		copyid;			/* 0038 copyid of phys vol */
1166  	uint8_t		freemap_version;	/* 0039 freemap algorithm */
1167  	uint8_t		peer_type;		/* 003A HAMMER2_PEER_xxx */
1168  	uint8_t		volu_id;		/* 003B volume id (root is 0) */
1169  	uint8_t		nvolumes;		/* 003C volume count? (confirm) */
1170  	uint8_t		reserved003D;		/* 003D */
1171  	uint16_t	reserved003E;		/* 003E */
1172  
1173  	uuid_t		fsid;			/* 0040 */
1174  	uuid_t		fstype;			/* 0050 */
1175  
1176  	/*
1177  	 * allocator_size is precalculated at newfs time and does not include
1178  	 * reserved blocks, boot, or aux areas.
1179  	 *
1180  	 * Initial non-reserved-area allocations do not use the freemap
1181  	 * but instead adjust alloc_iterator.  Dynamic allocations take
1182  	 * over starting at (allocator_beg).  This makes newfs_hammer2's
1183  	 * job a lot easier and can also serve as a testing jig.
1184  	 */
1185  	hammer2_off_t	allocator_size;		/* 0060 Total data space */
1186  	hammer2_off_t   allocator_free;		/* 0068 Free space */
1187  	hammer2_off_t	allocator_beg;		/* 0070 Initial allocations */
1188  
1189  	/*
1190  	 * mirror_tid reflects the highest committed change for this
1191  	 * block device regardless of whether it is to the super-root
1192  	 * or to a PFS or whatever.
1193  	 *
1194  	 * freemap_tid reflects the highest committed freemap change for
1195  	 * this block device.
1196  	 */
1197  	hammer2_tid_t	mirror_tid;		/* 0078 committed tid (vol) */
1198  	hammer2_tid_t	reserved0080;		/* 0080 */
1199  	hammer2_tid_t	reserved0088;		/* 0088 */
1200  	hammer2_tid_t	freemap_tid;		/* 0090 committed tid (fmap) */
1201  	hammer2_tid_t	bulkfree_tid;		/* 0098 bulkfree incremental */
1202  	hammer2_tid_t	reserved00A0[4];	/* 00A0-00BF */
1203  
1204  	hammer2_off_t	total_size;		/* 00C0 Total volume size, bytes */
1205  
1206  	/*
1207  	 * Copyids are allocated dynamically from the copyexists bitmap.
1208  	 * An id from the active copies set (up to 8, see copyinfo later on)
1209  	 * may still exist after the copy set has been removed from the
1210  	 * volume header and its bit will remain active in the bitmap and
1211  	 * cannot be reused until it is 100% removed from the hierarchy.
1212  	 */
1213  	uint32_t	copyexists[8];		/* 00C8-00E7 copy exists bmap */
1214  	char		reserved00E8[248];	/* 00E8-01DF */
1215  
1216  	/*
1217  	 * 32 bit CRC array at the end of the first 512 byte sector.
1218  	 *
1219  	 * icrc_sects[7] - First 512-4 bytes of volume header (including all
1220  	 *		   the other icrc's except this one).
1221  	 *
1222  	 * icrc_sects[6] - Sector 1 (512 bytes) of volume header, which is
1223  	 *		   the blockset for the root.
1224  	 *
1225  	 * icrc_sects[5] - Sector 2
1226  	 * icrc_sects[4] - Sector 3
1227  	 * icrc_sects[3] - Sector 4 (the freemap blockset)
1228  	 */
1229  	hammer2_crc32_t	icrc_sects[8];		/* 01E0-01FF */
1230  
1231  	/*
1232  	 * sector #1 - 512 bytes
1233  	 *
1234  	 * The entire sector is used by a blockset, but currently only first
1235  	 * blockref is used.
1236  	 */
1237  	hammer2_blockset_t sroot_blockset;	/* 0200-03FF Superroot dir */
1238  
1239  	/*
1240  	 * sector #2-6
1241  	 */
1242  	char	sector2[512];			/* 0400-05FF reserved */
1243  	char	sector3[512];			/* 0600-07FF reserved */
1244  	hammer2_blockset_t freemap_blockset;	/* 0800-09FF freemap  */
1245  	char	sector5[512];			/* 0A00-0BFF reserved */
1246  	char	sector6[512];			/* 0C00-0DFF reserved */
1247  
1248  	/*
1249  	 * sector #7 - 512 bytes
1250  	 * Maximum 64 volume offsets within logical offset.
1251  	 */
1252  	hammer2_off_t volu_loff[HAMMER2_MAX_VOLUMES];	/* 0E00-0FFF */
1253  
1254  	/*
1255  	 * sector #8-71	- 32768 bytes
1256  	 *
1257  	 * Contains the configuration for up to 256 copyinfo targets.  These
1258  	 * specify local and remote copies operating as masters or slaves.
1259  	 * copyid's 0 and 255 are reserved (0 indicates an empty slot and 255
1260  	 * indicates the local media).
1261  	 */
1262  						/* 1000-8FFF copyinfo config */
1263  	hammer2_volconf_t copyinfo[HAMMER2_COPYID_COUNT];
1264  
1265  	/*
1266  	 * Remaining sections are reserved for future use.
1267  	 */
1268  	char		reserved9000[0x6FFC];	/* 9000-FFFB reserved */
1269  
1270  	/*
1271  	 * icrc on entire volume header
1272  	 */
1273  	hammer2_crc32_t	icrc_volheader;		/* FFFC-FFFF full volume icrc*/
1274  } __packed;
1275  
1276  typedef struct hammer2_volume_data hammer2_volume_data_t;
1277  
1278  /*
1279   * Various parts of the volume header have their own iCRCs.
1280   *
1281   * The first 512 bytes has its own iCRC stored at the end of the 512 bytes
1282   * and is not included in the icrc calculation.
1283   *
1284   * The second 512 bytes also has its own iCRC but it is stored in the first
1285   * 512 bytes so it covers the entire second 512 bytes.
1286   *
1287   * The whole volume block (64KB) has an iCRC covering all but the last 4 bytes,
1288   * which is where the iCRC for the whole volume is stored.  This is currently
1289   * a catch-all for anything not individually iCRCd.
1290   */
1291  #define HAMMER2_VOL_ICRC_SECT0		7	/* index into icrc_sects[] */
1292  #define HAMMER2_VOL_ICRC_SECT1		6	/* index into icrc_sects[] */
1293  
1294  #define HAMMER2_VOLUME_BYTES		65536
1295  
1296  #define HAMMER2_VOLUME_ICRC0_OFF	0
1297  #define HAMMER2_VOLUME_ICRC1_OFF	512
1298  #define HAMMER2_VOLUME_ICRCVH_OFF	0
1299  
1300  #define HAMMER2_VOLUME_ICRC0_SIZE	(512 - 4)
1301  #define HAMMER2_VOLUME_ICRC1_SIZE	(512)
1302  #define HAMMER2_VOLUME_ICRCVH_SIZE	(65536 - 4)
1303  
1304  #define HAMMER2_VOL_VERSION_MULTI_VOLUMES	2
1305  
1306  #define HAMMER2_VOL_VERSION_MIN		1
1307  #define HAMMER2_VOL_VERSION_DEFAULT	HAMMER2_VOL_VERSION_MULTI_VOLUMES
1308  #define HAMMER2_VOL_VERSION_WIP		(HAMMER2_VOL_VERSION_MULTI_VOLUMES + 1)
1309  
1310  #define HAMMER2_NUM_VOLHDRS		4
1311  
1312  union hammer2_media_data {	/* overlay of the on-media data formats */
1313  	hammer2_volume_data_t	voldata;	/* volume header */
1314          hammer2_inode_data_t    ipdata;	/* inode */
1315  	hammer2_blockset_t	blkset;		/* blockset */
1316  	hammer2_blockref_t	npdata[HAMMER2_IND_COUNT_MAX];	/* blockref array */
1317  	hammer2_bmap_data_t	bmdata[HAMMER2_FREEMAP_COUNT];	/* bmap entries */
1318  	char			buf[HAMMER2_PBUFSIZE];	/* raw bytes */
1319  } __packed;
1320  
1321  typedef union hammer2_media_data hammer2_media_data_t;
1322  
1323  #endif /* !_VFS_HAMMER2_DISK_H_ */
1324