xref: /spdk/lib/blob/blobstore.h (revision 2dc4a231ac65d10dd2e1a96684094bef1b7ebb95)
1488570ebSJim Harris /*   SPDX-License-Identifier: BSD-3-Clause
2a6dbe372Spaul luse  *   Copyright (C) 2017 Intel Corporation.
3d89352a9SBen Walker  *   All rights reserved.
4ce67e0c7SMike Gerdts  *   Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5d89352a9SBen Walker  */
6d89352a9SBen Walker 
7d89352a9SBen Walker #ifndef SPDK_BLOBSTORE_H
8d89352a9SBen Walker #define SPDK_BLOBSTORE_H
9d89352a9SBen Walker 
10d89352a9SBen Walker #include "spdk/assert.h"
11d89352a9SBen Walker #include "spdk/blob.h"
12d89352a9SBen Walker #include "spdk/queue.h"
13d89352a9SBen Walker #include "spdk/util.h"
147de351f1SLiu Xiaodong #include "spdk/tree.h"
15316cf9efSMike Gerdts #include "spdk/thread.h"
16d89352a9SBen Walker 
17130d278aSPaul Luse #include "request.h"
18130d278aSPaul Luse 
19d89352a9SBen Walker /* In Memory Data Structures
20d89352a9SBen Walker  *
21d89352a9SBen Walker  * The following data structures exist only in memory.
22d89352a9SBen Walker  */
23d89352a9SBen Walker 
/* Option constants.
 * NOTE(review): presumably the defaults applied when the caller does not
 * override the corresponding blobstore option - confirm against the opts
 * initialization code.
 */
#define SPDK_BLOB_OPTS_CLUSTER_SZ		(1024 * 1024)
#define SPDK_BLOB_OPTS_NUM_MD_PAGES		UINT32_MAX
#define SPDK_BLOB_OPTS_MAX_MD_OPS		32
#define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS	512

/* Bit 32 of a blob id. bs_page_to_blobid() ORs it into the page index. */
#define SPDK_BLOB_BLOBID_HIGH_BIT		(1ULL << 32)
29d89352a9SBen Walker 
/* In-memory representation of one extended attribute (name/value pair).
 * Entries are chained together through 'link'.
 */
struct spdk_xattr {
	uint32_t		index;
	/* NOTE(review): presumably the size of 'value' in bytes - confirm. */
	uint16_t		value_len;
	char			*name;
	void			*value;
	TAILQ_ENTRY(spdk_xattr)	link;
};
37d89352a9SBen Walker 
/* Blob data that is both mutable and persistent: the part of a blob's
 * state that changes at runtime and is sync'd to disk.
 */
struct spdk_blob_mut_data {
	/* How many data clusters the blob has. */
	uint64_t	num_clusters;

	/* LBA of the start of each cluster, listed in the order the
	 * clusters appear in the blob.
	 */
	uint64_t	*clusters;

	/* Capacity of the 'clusters' array; always >= 'num_clusters'. */
	size_t		cluster_array_size;

	/* How many entries of the 'clusters' array are allocated clusters. */
	uint64_t	num_allocated_clusters;

	/* How many extent pages the blob has. */
	uint64_t	num_extent_pages;

	/* Page offsets (into the metadata region) of the pages holding
	 * extents. Entries may exist for pages that are not yet allocated.
	 */
	uint32_t	*extent_pages;

	/* Capacity of the 'extent_pages' array; always >= 'num_extent_pages'. */
	size_t		extent_pages_array_size;

	/* How many metadata pages the blob has. */
	uint32_t	num_pages;

	/* Page offsets (into the metadata region), ordered by position in
	 * the metadata page sequence.
	 */
	uint32_t	*pages;
};
78d89352a9SBen Walker 
/* Relationship between a blob's in-memory copy and its on-disk copy. */
enum spdk_blob_state {
	/* In-memory version differs from the on-disk version. */
	SPDK_BLOB_STATE_DIRTY,

	/* In-memory version is the same as the on-disk version. */
	SPDK_BLOB_STATE_CLEAN,

	/* In-memory state is being synchronized with the on-disk
	 * blob state.
	 */
	SPDK_BLOB_STATE_LOADING,
};
94d89352a9SBen Walker 
/* List head type for a tail queue of struct spdk_xattr entries. */
TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr);
967ba8c006SPiotr Pelplinski 
97d7e065beSTomasz Kulasek struct spdk_blob_list {
98d7e065beSTomasz Kulasek 	spdk_blob_id id;
99d7e065beSTomasz Kulasek 	size_t clone_count;
100d7e065beSTomasz Kulasek 	TAILQ_HEAD(, spdk_blob_list) clones;
101d7e065beSTomasz Kulasek 	TAILQ_ENTRY(spdk_blob_list) link;
102d7e065beSTomasz Kulasek };
103d7e065beSTomasz Kulasek 
104c8efd8a8SJim Harris struct spdk_blob {
105d89352a9SBen Walker 	struct spdk_blob_store *bs;
106d89352a9SBen Walker 
107d89352a9SBen Walker 	uint32_t	open_ref;
108d89352a9SBen Walker 
109d89352a9SBen Walker 	spdk_blob_id	id;
110d7e065beSTomasz Kulasek 	spdk_blob_id	parent_id;
111d89352a9SBen Walker 
112d89352a9SBen Walker 	enum spdk_blob_state		state;
113d89352a9SBen Walker 
114d89352a9SBen Walker 	/* Two copies of the mutable data. One is a version
115d89352a9SBen Walker 	 * that matches the last known data on disk (clean).
116d89352a9SBen Walker 	 * The other (active) is the current data. Syncing
117d89352a9SBen Walker 	 * a blob makes the clean match the active.
118d89352a9SBen Walker 	 */
119d89352a9SBen Walker 	struct spdk_blob_mut_data	clean;
120d89352a9SBen Walker 	struct spdk_blob_mut_data	active;
121d89352a9SBen Walker 
12275cb2da9SJim Harris 	bool		invalid;
123f2223d7dSJim Harris 	bool		data_ro;
124f2223d7dSJim Harris 	bool		md_ro;
125f2223d7dSJim Harris 
126d12ba75bSJim Harris 	uint64_t	invalid_flags;
127d12ba75bSJim Harris 	uint64_t	data_ro_flags;
128d12ba75bSJim Harris 	uint64_t	md_ro_flags;
129d12ba75bSJim Harris 
1304132ac52SMaciej Szwed 	struct spdk_bs_dev *back_bs_dev;
1314132ac52SMaciej Szwed 
132d89352a9SBen Walker 	/* TODO: The xattrs are mutable, but we don't want to be
1336fa48bbfSChen Wang 	 * copying them unnecessarily. Figure this out.
134d89352a9SBen Walker 	 */
1357ba8c006SPiotr Pelplinski 	struct spdk_xattr_tailq xattrs;
1367ba8c006SPiotr Pelplinski 	struct spdk_xattr_tailq xattrs_internal;
137d89352a9SBen Walker 
1387de351f1SLiu Xiaodong 	RB_ENTRY(spdk_blob) link;
1398c45ed38SPiotr Pelplinski 
1408c45ed38SPiotr Pelplinski 	uint32_t frozen_refcnt;
1418256cecfSMaciej Szwed 	bool locked_operation_in_progress;
142adb39585SMaciej Szwed 	enum blob_clear_method clear_method;
143c33840b7STomasz Zawadzki 	bool extent_rle_found;
144c33840b7STomasz Zawadzki 	bool extent_table_found;
145c33840b7STomasz Zawadzki 	bool use_extent_table;
146f60b4a7eSTomasz Zawadzki 
147030be573STomasz Zawadzki 	/* A list of pending metadata pending_persists */
148030be573STomasz Zawadzki 	TAILQ_HEAD(, spdk_blob_persist_ctx) pending_persists;
149ceaa0c7fSTomasz Zawadzki 	TAILQ_HEAD(, spdk_blob_persist_ctx) persists_to_complete;
150030be573STomasz Zawadzki 
151cc6920a4SJosh Soref 	/* Number of data clusters retrieved from extent table,
152f60b4a7eSTomasz Zawadzki 	 * that many have to be read from extent pages. */
15378257ab6STomasz Zawadzki 	uint64_t	remaining_clusters_in_et;
154d89352a9SBen Walker };
155d89352a9SBen Walker 
156d89352a9SBen Walker struct spdk_blob_store {
157d89352a9SBen Walker 	uint64_t			md_start; /* Offset from beginning of disk, in pages */
158d89352a9SBen Walker 	uint32_t			md_len; /* Count, in pages */
159*2dc4a231SAtul Malakar 	uint32_t                        md_page_size; /* Metadata page size */
16060e8fb49SBen Walker 
161d89352a9SBen Walker 	struct spdk_io_channel		*md_channel;
16260e8fb49SBen Walker 	uint32_t			max_channel_ops;
163d89352a9SBen Walker 
164dfb102b7SJim Harris 	struct spdk_thread		*md_thread;
165dfb102b7SJim Harris 
166d89352a9SBen Walker 	struct spdk_bs_dev		*dev;
167d89352a9SBen Walker 
1682a608d02SMike Gerdts 	struct spdk_bit_array		*used_md_pages;		/* Protected by used_lock */
1692a608d02SMike Gerdts 	struct spdk_bit_pool		*used_clusters;		/* Protected by used_lock */
17040c911b9SJim Harris 	struct spdk_bit_array		*used_blobids;
17130ee8137SBen Walker 	struct spdk_bit_array		*open_blobids;
172d89352a9SBen Walker 
173316cf9efSMike Gerdts 	struct spdk_spinlock		used_lock;
1749103821dSMaciej Szwed 
175d89352a9SBen Walker 	uint32_t			cluster_sz;
176d89352a9SBen Walker 	uint64_t			total_clusters;
1775eb52b95STomasz Zawadzki 	uint64_t			total_data_clusters;
1782a608d02SMike Gerdts 	uint64_t			num_free_clusters;	/* Protected by used_lock */
179f3001308SJim Harris 	uint64_t			pages_per_cluster;
1803299bf6dSJim Harris 	uint64_t			io_units_per_cluster;
181b3348624STomasz Zawadzki 	uint8_t				pages_per_cluster_shift;
1823299bf6dSJim Harris 	uint8_t				io_units_per_cluster_shift;
1836609b776SPiotr Pelplinski 	uint32_t			io_unit_size;
184d89352a9SBen Walker 
185d89352a9SBen Walker 	spdk_blob_id			super_blob;
186eb8b1e20SMaciej Szwed 	struct spdk_bs_type		bstype;
187d89352a9SBen Walker 
188130d278aSPaul Luse 	struct spdk_bs_cpl		unload_cpl;
189130d278aSPaul Luse 	int				unload_err;
190130d278aSPaul Luse 
1917de351f1SLiu Xiaodong 	RB_HEAD(spdk_blob_tree, spdk_blob) open_blobs;
192d7e065beSTomasz Kulasek 	TAILQ_HEAD(, spdk_blob_list)	snapshots;
193bc8f2cd9SPiotr Pelplinski 
194bc8f2cd9SPiotr Pelplinski 	bool				clean;
195ce67e0c7SMike Gerdts 
196ce67e0c7SMike Gerdts 	spdk_bs_esnap_dev_create	esnap_bs_dev_create;
197a4a73fecSMike Gerdts 	void				*esnap_ctx;
198ba91ffbaSMike Gerdts 
199ba91ffbaSMike Gerdts 	/* If external snapshot channels are being destroyed while
200ba91ffbaSMike Gerdts 	 * the blobstore is unloaded, the unload is deferred until
201ba91ffbaSMike Gerdts 	 * after the channel destruction completes.
202ba91ffbaSMike Gerdts 	 */
203ba91ffbaSMike Gerdts 	uint32_t			esnap_channels_unloading;
204ba91ffbaSMike Gerdts 	spdk_bs_op_complete		esnap_unload_cb_fn;
205ba91ffbaSMike Gerdts 	void				*esnap_unload_cb_arg;
206d89352a9SBen Walker };
207d89352a9SBen Walker 
208d89352a9SBen Walker struct spdk_bs_channel {
209d89352a9SBen Walker 	struct spdk_bs_request_set	*req_mem;
210d89352a9SBen Walker 	TAILQ_HEAD(, spdk_bs_request_set) reqs;
211d89352a9SBen Walker 
212d89352a9SBen Walker 	struct spdk_blob_store		*bs;
213d89352a9SBen Walker 
214d89352a9SBen Walker 	struct spdk_bs_dev		*dev;
215d89352a9SBen Walker 	struct spdk_io_channel		*dev_channel;
2164132ac52SMaciej Szwed 
2171eca87c3SAlexey Marchuk 	/* This page is only used during insert of a new cluster. */
2181eca87c3SAlexey Marchuk 	struct spdk_blob_md_page	*new_cluster_page;
2191eca87c3SAlexey Marchuk 
2204132ac52SMaciej Szwed 	TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc;
2218c45ed38SPiotr Pelplinski 	TAILQ_HEAD(, spdk_bs_request_set) queued_io;
222b47cee6cSMike Gerdts 
223b47cee6cSMike Gerdts 	RB_HEAD(blob_esnap_channel_tree, blob_esnap_channel) esnap_channels;
224d89352a9SBen Walker };
225d89352a9SBen Walker 
/** Kind of operation requested on a blob. */
enum spdk_blob_op_type {
	SPDK_BLOB_WRITE,
	SPDK_BLOB_READ,
	SPDK_BLOB_UNMAP,
	SPDK_BLOB_WRITE_ZEROES,
	SPDK_BLOB_WRITEV,
	SPDK_BLOB_READV,
};
235f6e075cdSMaciej Szwed 
/* back bs_dev
 *
 * NOTE(review): these strings look like names of internal xattrs that
 * describe a blob's snapshot relationship - confirm against their users.
 */
#define BLOB_SNAPSHOT			"SNAP"
#define SNAPSHOT_IN_PROGRESS		"SNAPTMP"
#define SNAPSHOT_PENDING_REMOVAL	"SNAPRM"
#define BLOB_EXTERNAL_SNAPSHOT_ID	"EXTSNAP"
242c26c4e9fSPiotr Pelplinski 
243c26c4e9fSPiotr Pelplinski struct spdk_blob_bs_dev {
244c26c4e9fSPiotr Pelplinski 	struct spdk_bs_dev bs_dev;
245c26c4e9fSPiotr Pelplinski 	struct spdk_blob *blob;
246c26c4e9fSPiotr Pelplinski };
247c26c4e9fSPiotr Pelplinski 
/* On-Disk Data Structures
 *
 * Everything defined from here on describes data as it is laid out on disk.
 */
#define SPDK_BS_INITIAL_VERSION	1
#define SPDK_BS_VERSION		3 /* current version */
254d89352a9SBen Walker 
255d89352a9SBen Walker #pragma pack(push, 1)
256d89352a9SBen Walker 
/* Values for spdk_bs_md_mask.type. */
#define SPDK_MD_MASK_TYPE_USED_PAGES	0
#define SPDK_MD_MASK_TYPE_USED_CLUSTERS	1
#define SPDK_MD_MASK_TYPE_USED_BLOBIDS	2

/* Header of an on-disk bit mask; the mask bytes follow the header. */
struct spdk_bs_md_mask {
	uint8_t		type;
	uint32_t	length; /* In bits */
	uint8_t		mask[0];
};
266d89352a9SBen Walker 
/* Metadata descriptor type identifiers (values for a descriptor's
 * 'type' field).
 */
#define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0
#define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2
#define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3
#define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4

/* The following descriptors define the cluster layout of a blob.
 * EXTENT_RLE cannot be present in a blob's metadata at the same
 * time as EXTENT_TABLE and EXTENT_PAGE descriptors. */

/* EXTENT_RLE descriptor holds an array of LBAs that point to the
 * beginning of allocated clusters. The array is run-length encoded,
 * with 0's being unallocated clusters. It is part of the serialized
 * metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE 1
/* EXTENT_TABLE descriptor holds an array of md page offsets that
 * point to pages with an EXTENT_PAGE descriptor. Entries equal to 0
 * stand for extent pages that are not allocated yet and are
 * run-length encoded; non-zero entries are the offsets of allocated
 * pages. It is part of the serialized metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE 5
/* EXTENT_PAGE descriptor holds an array of LBAs that point to the
 * beginning of allocated clusters. The array is run-length encoded,
 * with 0's being unallocated clusters. It is NOT part of the
 * serialized metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE 6
2913dadb79eSTomasz Zawadzki 
/* On-disk descriptor carrying one extended attribute. */
struct spdk_blob_md_descriptor_xattr {
	uint8_t		type;
	uint32_t	length;

	uint16_t	name_length;
	uint16_t	value_length;

	/* The name string, with the value string directly after it. */
	char		name[0];
};
302d89352a9SBen Walker 
/* On-disk EXTENT_RLE descriptor: a header followed by an array of
 * (cluster_idx, length) runs.
 */
struct spdk_blob_md_descriptor_extent_rle {
	uint8_t		type;
	uint32_t	length;

	struct {
		uint32_t	cluster_idx;
		uint32_t	length; /* In units of clusters */
	} extents[0];
};
312d89352a9SBen Walker 
/* On-disk EXTENT_TABLE descriptor: a header, the blob's cluster count and
 * an array of (page_idx, num_pages) entries.
 */
struct spdk_blob_md_descriptor_extent_table {
	uint8_t		type;
	uint32_t	length;

	/* How many data clusters the blob has. */
	uint64_t	num_clusters;

	struct {
		uint32_t	page_idx;
		uint32_t	num_pages; /* In units of pages */
	} extent_page[0];
};
325f60b4a7eSTomasz Zawadzki 
/* On-disk EXTENT_PAGE descriptor: a header, the index of the first cluster
 * covered by the page, and an array of per-cluster entries.
 */
struct spdk_blob_md_descriptor_extent_page {
	uint8_t		type;
	uint32_t	length;

	/* Index of the first cluster described by this extent page. */
	uint32_t	start_cluster_idx;

	uint32_t	cluster_idx[0];
};
335f4e58993STomasz Zawadzki 
/* Bits of the 'invalid_flags' word and the mask of all of them. */
#define SPDK_BLOB_THIN_PROV		(1ULL << 0)
#define SPDK_BLOB_INTERNAL_XATTR	(1ULL << 1)
#define SPDK_BLOB_EXTENT_TABLE		(1ULL << 2)
#define SPDK_BLOB_EXTERNAL_SNAPSHOT	(1ULL << 3)
#define SPDK_BLOB_INVALID_FLAGS_MASK	(SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR | \
					 SPDK_BLOB_EXTENT_TABLE | SPDK_BLOB_EXTERNAL_SNAPSHOT)

/* Bits of the 'data_ro_flags' word and their mask. */
#define SPDK_BLOB_READ_ONLY (1ULL << 0)
#define SPDK_BLOB_DATA_RO_FLAGS_MASK	SPDK_BLOB_READ_ONLY

/* Bit field of the 'md_ro_flags' word (two bits starting at the shift)
 * and its mask.
 */
#define SPDK_BLOB_CLEAR_METHOD_SHIFT 0
#define SPDK_BLOB_CLEAR_METHOD (3ULL << SPDK_BLOB_CLEAR_METHOD_SHIFT)
#define SPDK_BLOB_MD_RO_FLAGS_MASK	SPDK_BLOB_CLEAR_METHOD
349d12ba75bSJim Harris 
/* On-disk FLAGS descriptor: three flag words with different
 * compatibility rules.
 */
struct spdk_blob_md_descriptor_flags {
	uint8_t		type;
	uint32_t	length;

	/*
	 * A set bit here that the application does not know about
	 *  prevents the blob from being opened.
	 */
	uint64_t	invalid_flags;

	/*
	 * A set bit here that the application does not know about
	 *  still lets the blob be opened, in data_read_only and
	 *  md_read_only mode.
	 */
	uint64_t	data_ro_flags;

	/*
	 * A set bit here that the application does not know about
	 *  still lets the blob be opened, in md_read_only mode.
	 */
	uint64_t	md_ro_flags;
};
372d12ba75bSJim Harris 
/* Generic descriptor header: the 'type' and 'length' fields that every
 * spdk_blob_md_descriptor_* structure in this file starts with.
 */
struct spdk_blob_md_descriptor {
	uint8_t		type;
	uint32_t	length;
};
377d89352a9SBen Walker 
378d89352a9SBen Walker #define SPDK_INVALID_MD_PAGE UINT32_MAX
379d89352a9SBen Walker 
380d89352a9SBen Walker struct spdk_blob_md_page {
381d89352a9SBen Walker 	spdk_blob_id     id;
382d89352a9SBen Walker 
383d89352a9SBen Walker 	uint32_t	sequence_num;
384d89352a9SBen Walker 	uint32_t	reserved0;
385d89352a9SBen Walker 
386d89352a9SBen Walker 	/* Descriptors here */
38797b3efa3SBen Walker 	uint8_t		descriptors[4072];
388d89352a9SBen Walker 
389d89352a9SBen Walker 	uint32_t	next;
390d89352a9SBen Walker 	uint32_t	crc;
391d89352a9SBen Walker };
39226e9b6eaSPaul Luse #define SPDK_BS_PAGE_SIZE 0x1000
39326e9b6eaSPaul Luse SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size");
394d89352a9SBen Walker 
/* Size in bytes of the descriptor area of one metadata page. */
#define SPDK_BS_MAX_DESC_SIZE SPDK_SIZEOF_MEMBER(struct spdk_blob_md_page, descriptors)

/* SPDK_EXTENTS_PER_EP_MAX is the largest number of extents that fit in a
 * single Extent Page. With an SPDK_BS_PAGE_SIZE of 4K, SPDK_EXTENTS_PER_EP
 * comes out as 512. */
#define SPDK_EXTENTS_PER_EP_MAX ((SPDK_BS_MAX_DESC_SIZE - sizeof(struct spdk_blob_md_descriptor_extent_page)) / sizeof(uint32_t))
#define SPDK_EXTENTS_PER_EP (spdk_align64pow2(SPDK_EXTENTS_PER_EP_MAX + 1) >> 1u)

/* Content of spdk_bs_super_block.signature (8 characters). */
#define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB"
403d89352a9SBen Walker 
404d89352a9SBen Walker struct spdk_bs_super_block {
405d89352a9SBen Walker 	uint8_t		signature[8];
406d89352a9SBen Walker 	uint32_t	version;
407d89352a9SBen Walker 	uint32_t	length;
408d89352a9SBen Walker 	uint32_t	clean; /* If there was a clean shutdown, this is 1. */
409d89352a9SBen Walker 	spdk_blob_id	super_blob;
410d89352a9SBen Walker 
411d89352a9SBen Walker 	uint32_t	cluster_size; /* In bytes */
412d89352a9SBen Walker 
413d89352a9SBen Walker 	uint32_t	used_page_mask_start; /* Offset from beginning of disk, in pages */
414d89352a9SBen Walker 	uint32_t	used_page_mask_len; /* Count, in pages */
415d89352a9SBen Walker 
416d89352a9SBen Walker 	uint32_t	used_cluster_mask_start; /* Offset from beginning of disk, in pages */
417d89352a9SBen Walker 	uint32_t	used_cluster_mask_len; /* Count, in pages */
418d89352a9SBen Walker 
419d89352a9SBen Walker 	uint32_t	md_start; /* Offset from beginning of disk, in pages */
420d89352a9SBen Walker 	uint32_t	md_len; /* Count, in pages */
421d89352a9SBen Walker 
422eb8b1e20SMaciej Szwed 	struct spdk_bs_type	bstype; /* blobstore type */
423eb8b1e20SMaciej Szwed 
42440c911b9SJim Harris 	uint32_t	used_blobid_mask_start; /* Offset from beginning of disk, in pages */
42540c911b9SJim Harris 	uint32_t	used_blobid_mask_len; /* Count, in pages */
42640c911b9SJim Harris 
4272c91e919SPiotr Pelplinski 	uint64_t	size; /* size of blobstore in bytes */
4286609b776SPiotr Pelplinski 	uint32_t	io_unit_size; /* Size of io unit in bytes */
4292c91e919SPiotr Pelplinski 
430*2dc4a231SAtul Malakar 	uint32_t        md_page_size; /* Size in bytes */
431*2dc4a231SAtul Malakar 	uint8_t		reserved[3996];
432*2dc4a231SAtul Malakar 
43329bcd5a9SCunyin Chang 	uint32_t	crc;
434d89352a9SBen Walker };
435d89352a9SBen Walker SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size");
436d89352a9SBen Walker 
437d89352a9SBen Walker #pragma pack(pop)
438d89352a9SBen Walker 
/* Helpers defined outside this header.
 * NOTE(review): their behavior is not visible here; consult the
 * definitions before relying on what the names suggest.
 */
struct spdk_bs_dev *bs_create_zeroes_dev(void);
struct spdk_bs_dev *bs_create_blob_bs_dev(struct spdk_blob *blob);
struct spdk_io_channel *blob_esnap_get_io_channel(struct spdk_io_channel *ch,
		struct spdk_blob *blob);
bool blob_backed_with_zeroes_dev(struct spdk_blob *blob);
4448970f868SBen Walker 
/* Unit Conversions
 *
 * The blobstore works with several different units:
 * - Byte: Self explanatory
 * - LBA: The logical blocks on the backing storage device.
 * - Page: The read/write units of blobs and metadata. This is
 *         an offset in units of the blobstore's metadata page size
 *         (bs->md_page_size).
 * - IO unit: An offset into a blob in units of bs->io_unit_size bytes.
 * - Cluster Index: The disk is broken into a sequential list of
 *		    clusters. This is the offset from the beginning.
 *
 * NOTE: These conversions all act on simple magnitudes, not with any sort
 *        of knowledge about the blobs themselves. For instance, converting
 *        a page to an lba with the conversion function below simply converts
 *        a number of pages to an equivalent number of lbas, but that
 *        lba certainly isn't the right lba that corresponds to a page offset
 *        for a particular blob.
 */
462d89352a9SBen Walker static inline uint64_t
463b5d68d59SSeth Howell bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length)
464d89352a9SBen Walker {
465d89352a9SBen Walker 	assert(length % bs->dev->blocklen == 0);
466d89352a9SBen Walker 
467d89352a9SBen Walker 	return length / bs->dev->blocklen;
468d89352a9SBen Walker }
469d89352a9SBen Walker 
470d89352a9SBen Walker static inline uint64_t
471b5d68d59SSeth Howell bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length)
4724132ac52SMaciej Szwed {
4734132ac52SMaciej Szwed 	assert(length % bs_dev->blocklen == 0);
4744132ac52SMaciej Szwed 
4754132ac52SMaciej Szwed 	return length / bs_dev->blocklen;
4764132ac52SMaciej Szwed }
4774132ac52SMaciej Szwed 
4784132ac52SMaciej Szwed static inline uint64_t
479b5d68d59SSeth Howell bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page)
480d89352a9SBen Walker {
481*2dc4a231SAtul Malakar 	return page * bs->md_page_size / bs->dev->blocklen;
482d89352a9SBen Walker }
483d89352a9SBen Walker 
4844132ac52SMaciej Szwed static inline uint64_t
485b5d68d59SSeth Howell bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page)
4864b8db27bSTomasz Zawadzki {
4874b8db27bSTomasz Zawadzki 	assert(page < bs->md_len);
488b5d68d59SSeth Howell 	return bs_page_to_lba(bs, page + bs->md_start);
4894b8db27bSTomasz Zawadzki }
4904b8db27bSTomasz Zawadzki 
4914b8db27bSTomasz Zawadzki static inline uint64_t
4923299bf6dSJim Harris bs_dev_io_unit_to_lba(struct spdk_blob *blob, struct spdk_bs_dev *bs_dev, uint64_t io_unit)
4934132ac52SMaciej Szwed {
4943299bf6dSJim Harris 	return io_unit * blob->bs->io_unit_size / bs_dev->blocklen;
4954132ac52SMaciej Szwed }
4964132ac52SMaciej Szwed 
497f3001308SJim Harris static inline uint64_t
4983299bf6dSJim Harris bs_cluster_to_io_unit(struct spdk_blob_store *bs, uint32_t cluster)
499d89352a9SBen Walker {
5003299bf6dSJim Harris 	return (uint64_t)cluster * bs->io_units_per_cluster;
501d89352a9SBen Walker }
502d89352a9SBen Walker 
503d89352a9SBen Walker static inline uint32_t
5043299bf6dSJim Harris bs_io_unit_to_cluster(struct spdk_blob_store *bs, uint64_t io_unit)
505d89352a9SBen Walker {
5063299bf6dSJim Harris 	assert(io_unit % bs->io_units_per_cluster == 0);
507d89352a9SBen Walker 
5083299bf6dSJim Harris 	return io_unit / bs->io_units_per_cluster;
509d89352a9SBen Walker }
510d89352a9SBen Walker 
511d89352a9SBen Walker static inline uint64_t
512b5d68d59SSeth Howell bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster)
513d89352a9SBen Walker {
514ffa82355SJim Harris 	assert(bs->cluster_sz / bs->dev->blocklen > 0);
515ffa82355SJim Harris 
51689426e9bSDaniel Verkamp 	return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen);
517d89352a9SBen Walker }
518d89352a9SBen Walker 
519d89352a9SBen Walker static inline uint32_t
520b5d68d59SSeth Howell bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba)
521d89352a9SBen Walker {
522d89352a9SBen Walker 	assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0);
523d89352a9SBen Walker 
524d89352a9SBen Walker 	return lba / (bs->cluster_sz / bs->dev->blocklen);
525d89352a9SBen Walker }
526d89352a9SBen Walker 
5274132ac52SMaciej Szwed static inline uint64_t
528b5d68d59SSeth Howell bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit)
5294132ac52SMaciej Szwed {
5306609b776SPiotr Pelplinski 	return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen);
5314132ac52SMaciej Szwed }
5324132ac52SMaciej Szwed 
5334132ac52SMaciej Szwed static inline uint64_t
534b5d68d59SSeth Howell bs_cluster_to_extent_table_id(uint64_t cluster_num)
5351b23560fSTomasz Zawadzki {
5361b23560fSTomasz Zawadzki 	return cluster_num / SPDK_EXTENTS_PER_EP;
5371b23560fSTomasz Zawadzki }
5381b23560fSTomasz Zawadzki 
5391b23560fSTomasz Zawadzki static inline uint32_t *
540b5d68d59SSeth Howell bs_cluster_to_extent_page(struct spdk_blob *blob, uint64_t cluster_num)
5411b23560fSTomasz Zawadzki {
542b5d68d59SSeth Howell 	uint64_t extent_table_id = bs_cluster_to_extent_table_id(cluster_num);
5431b23560fSTomasz Zawadzki 
5442bccb7c9STomasz Zawadzki 	assert(blob->use_extent_table);
5452bccb7c9STomasz Zawadzki 	assert(extent_table_id < blob->active.extent_pages_array_size);
5461b23560fSTomasz Zawadzki 
5471b23560fSTomasz Zawadzki 	return &blob->active.extent_pages[extent_table_id];
5481b23560fSTomasz Zawadzki }
5491b23560fSTomasz Zawadzki 
550ddf5a8daSDamiano Cipriani static inline uint64_t
551ddf5a8daSDamiano Cipriani bs_io_units_per_cluster(struct spdk_blob *blob)
552ddf5a8daSDamiano Cipriani {
5533299bf6dSJim Harris 	return blob->bs->io_units_per_cluster;
554ddf5a8daSDamiano Cipriani }
555ddf5a8daSDamiano Cipriani 
556d89352a9SBen Walker /* End basic conversions */
557d89352a9SBen Walker 
558f3001308SJim Harris static inline uint64_t
559b5d68d59SSeth Howell bs_blobid_to_page(spdk_blob_id id)
560d89352a9SBen Walker {
561d89352a9SBen Walker 	return id & 0xFFFFFFFF;
562d89352a9SBen Walker }
563d89352a9SBen Walker 
564721695e1SPaul Luse /* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper
565721695e1SPaul Luse  * 32 bits are not currently used. Stick a 1 there just to catch bugs where the
566721695e1SPaul Luse  * code assumes blob id == page_idx.
567721695e1SPaul Luse  */
568721695e1SPaul Luse static inline spdk_blob_id
569b5d68d59SSeth Howell bs_page_to_blobid(uint64_t page_idx)
570721695e1SPaul Luse {
571f3001308SJim Harris 	if (page_idx > UINT32_MAX) {
572f3001308SJim Harris 		return SPDK_BLOBID_INVALID;
573f3001308SJim Harris 	}
574721695e1SPaul Luse 	return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx;
575721695e1SPaul Luse }
576721695e1SPaul Luse 
5776609b776SPiotr Pelplinski /* Given an io unit offset into a blob, look up the LBA for the
5786609b776SPiotr Pelplinski  * start of that io unit.
579d89352a9SBen Walker  */
580d89352a9SBen Walker static inline uint64_t
581b5d68d59SSeth Howell bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit)
582d89352a9SBen Walker {
583d89352a9SBen Walker 	uint64_t	lba;
584b3348624STomasz Zawadzki 	uint8_t		shift;
5853299bf6dSJim Harris 	uint64_t	io_units_per_cluster = blob->bs->io_units_per_cluster;
5866609b776SPiotr Pelplinski 
5873299bf6dSJim Harris 	shift = blob->bs->io_units_per_cluster_shift;
5883299bf6dSJim Harris 	assert(io_unit < blob->active.num_clusters * io_units_per_cluster);
589b3348624STomasz Zawadzki 	if (shift != 0) {
5903299bf6dSJim Harris 		lba = blob->active.clusters[io_unit >> shift];
591b3348624STomasz Zawadzki 	} else {
5923299bf6dSJim Harris 		lba = blob->active.clusters[io_unit / io_units_per_cluster];
593b3348624STomasz Zawadzki 	}
5943299bf6dSJim Harris 	if (lba == 0) {
5953299bf6dSJim Harris 		return 0;
5963299bf6dSJim Harris 	} else {
5973299bf6dSJim Harris 		return lba + io_unit % io_units_per_cluster;
5983299bf6dSJim Harris 	}
599d89352a9SBen Walker }
600d89352a9SBen Walker 
/* For an io_unit offset into a blob, return how many io_units remain until
 * the next cluster boundary. An offset that sits exactly on a boundary
 * yields a full cluster's worth of io_units.
 */
static inline uint32_t
bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit)
{
	const uint64_t	per_cluster = bs_io_units_per_cluster(blob);
	uint64_t	offset_in_cluster = io_unit % per_cluster;

	return per_cluster - offset_in_cluster;
}
6136609b776SPiotr Pelplinski 
6143299bf6dSJim Harris /* Given an io_unit offset into a blob, look up the number of io_unit into blob to beginning of current cluster */
615952532afSJinlong Chen static inline uint64_t
616b5d68d59SSeth Howell bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit)
6174132ac52SMaciej Szwed {
6183299bf6dSJim Harris 	uint64_t	io_units_per_cluster = blob->bs->io_units_per_cluster;
6194132ac52SMaciej Szwed 
6203299bf6dSJim Harris 	return io_unit - (io_unit % io_units_per_cluster);
6214132ac52SMaciej Szwed }
6224132ac52SMaciej Szwed 
6236609b776SPiotr Pelplinski /* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */
6246609b776SPiotr Pelplinski static inline uint32_t
625b5d68d59SSeth Howell bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit)
6266609b776SPiotr Pelplinski {
6273299bf6dSJim Harris 	uint64_t	io_units_per_cluster = blob->bs->io_units_per_cluster;
6283299bf6dSJim Harris 	uint8_t		shift = blob->bs->io_units_per_cluster_shift;
629b3348624STomasz Zawadzki 
630b3348624STomasz Zawadzki 	if (shift != 0) {
6313299bf6dSJim Harris 		return io_unit >> shift;
632b3348624STomasz Zawadzki 	} else {
6333299bf6dSJim Harris 		return io_unit / io_units_per_cluster;
634b3348624STomasz Zawadzki 	}
6356609b776SPiotr Pelplinski }
6366609b776SPiotr Pelplinski 
/* Report whether the io unit at the given offset into a blob belongs to an
 * allocated cluster, i.e. whether bs_blob_io_unit_to_lba() yields a
 * non-zero LBA for it. An unallocated io unit is only expected on a
 * thin-provisioned blob (checked with assert()).
 */
static inline bool
bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit)
{
	if (bs_blob_io_unit_to_lba(blob, io_unit) != 0) {
		return true;
	}

	assert(spdk_blob_is_thin_provisioned(blob));
	return false;
}
6504132ac52SMaciej Szwed 
651d89352a9SBen Walker #endif
652