1488570ebSJim Harris /* SPDX-License-Identifier: BSD-3-Clause 2a6dbe372Spaul luse * Copyright (C) 2017 Intel Corporation. 3d89352a9SBen Walker * All rights reserved. 4ce67e0c7SMike Gerdts * Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5d89352a9SBen Walker */ 6d89352a9SBen Walker 7d89352a9SBen Walker #ifndef SPDK_BLOBSTORE_H 8d89352a9SBen Walker #define SPDK_BLOBSTORE_H 9d89352a9SBen Walker 10d89352a9SBen Walker #include "spdk/assert.h" 11d89352a9SBen Walker #include "spdk/blob.h" 12d89352a9SBen Walker #include "spdk/queue.h" 13d89352a9SBen Walker #include "spdk/util.h" 147de351f1SLiu Xiaodong #include "spdk/tree.h" 15316cf9efSMike Gerdts #include "spdk/thread.h" 16d89352a9SBen Walker 17130d278aSPaul Luse #include "request.h" 18130d278aSPaul Luse 19d89352a9SBen Walker /* In Memory Data Structures 20d89352a9SBen Walker * 21d89352a9SBen Walker * The following data structures exist only in memory. 22d89352a9SBen Walker */ 23d89352a9SBen Walker 24d89352a9SBen Walker #define SPDK_BLOB_OPTS_CLUSTER_SZ (1024 * 1024) 25d89352a9SBen Walker #define SPDK_BLOB_OPTS_NUM_MD_PAGES UINT32_MAX 264a3182b8SBen Walker #define SPDK_BLOB_OPTS_MAX_MD_OPS 32 274ebe8214SZiye Yang #define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS 512 28721695e1SPaul Luse #define SPDK_BLOB_BLOBID_HIGH_BIT (1ULL << 32) 29d89352a9SBen Walker 30d89352a9SBen Walker struct spdk_xattr { 31d89352a9SBen Walker uint32_t index; 32ee9db7daSZiye Yang uint16_t value_len; 33d89352a9SBen Walker char *name; 34d89352a9SBen Walker void *value; 35d89352a9SBen Walker TAILQ_ENTRY(spdk_xattr) link; 36d89352a9SBen Walker }; 37d89352a9SBen Walker 38d89352a9SBen Walker /* The mutable part of the blob data that is sync'd to 39d89352a9SBen Walker * disk. The data in here is both mutable and persistent. 40d89352a9SBen Walker */ 41d89352a9SBen Walker struct spdk_blob_mut_data { 42d89352a9SBen Walker /* Number of data clusters in the blob */ 43d89352a9SBen Walker uint64_t num_clusters; 44d89352a9SBen Walker 45d89352a9SBen Walker /* Array LBAs that are the beginning of a cluster, in 46d89352a9SBen Walker * the order they appear in the blob. 47d89352a9SBen Walker */ 48d89352a9SBen Walker uint64_t *clusters; 49d89352a9SBen Walker 50d89352a9SBen Walker /* The size of the clusters array. This is greater than or 51d89352a9SBen Walker * equal to 'num_clusters'. 52d89352a9SBen Walker */ 53d89352a9SBen Walker size_t cluster_array_size; 54d89352a9SBen Walker 552dbbbbd8SDamiano Cipriani /* The number of allocated clusters in the clusters array */ 562dbbbbd8SDamiano Cipriani uint64_t num_allocated_clusters; 572dbbbbd8SDamiano Cipriani 58f60b4a7eSTomasz Zawadzki /* Number of extent pages */ 59f60b4a7eSTomasz Zawadzki uint64_t num_extent_pages; 60f60b4a7eSTomasz Zawadzki 61f60b4a7eSTomasz Zawadzki /* Array of page offsets into the metadata region, 62f60b4a7eSTomasz Zawadzki * containing extents. Can contain entries for not yet 63f60b4a7eSTomasz Zawadzki * allocated pages. */ 64f60b4a7eSTomasz Zawadzki uint32_t *extent_pages; 65f60b4a7eSTomasz Zawadzki 66f60b4a7eSTomasz Zawadzki /* The size of the extent page array. This is greater than or 67f60b4a7eSTomasz Zawadzki * equal to 'num_extent_pages'. */ 68f60b4a7eSTomasz Zawadzki size_t extent_pages_array_size; 69f60b4a7eSTomasz Zawadzki 70d89352a9SBen Walker /* Number of metadata pages */ 71d89352a9SBen Walker uint32_t num_pages; 72d89352a9SBen Walker 73d89352a9SBen Walker /* Array of page offsets into the metadata region, in 74d89352a9SBen Walker * the order of the metadata page sequence. 75d89352a9SBen Walker */ 76d89352a9SBen Walker uint32_t *pages; 77d89352a9SBen Walker }; 78d89352a9SBen Walker 79d89352a9SBen Walker enum spdk_blob_state { 80d89352a9SBen Walker /* The blob in-memory version does not match the on-disk 81d89352a9SBen Walker * version. 82d89352a9SBen Walker */ 83d89352a9SBen Walker SPDK_BLOB_STATE_DIRTY, 84d89352a9SBen Walker 85d89352a9SBen Walker /* The blob in memory version of the blob matches the on disk 86d89352a9SBen Walker * version. 87d89352a9SBen Walker */ 88d89352a9SBen Walker SPDK_BLOB_STATE_CLEAN, 89d89352a9SBen Walker 90d89352a9SBen Walker /* The in-memory state being synchronized with the on-disk 91d89352a9SBen Walker * blob state. */ 92d89352a9SBen Walker SPDK_BLOB_STATE_LOADING, 93d89352a9SBen Walker }; 94d89352a9SBen Walker 957ba8c006SPiotr Pelplinski TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr); 967ba8c006SPiotr Pelplinski 97d7e065beSTomasz Kulasek struct spdk_blob_list { 98d7e065beSTomasz Kulasek spdk_blob_id id; 99d7e065beSTomasz Kulasek size_t clone_count; 100d7e065beSTomasz Kulasek TAILQ_HEAD(, spdk_blob_list) clones; 101d7e065beSTomasz Kulasek TAILQ_ENTRY(spdk_blob_list) link; 102d7e065beSTomasz Kulasek }; 103d7e065beSTomasz Kulasek 104c8efd8a8SJim Harris struct spdk_blob { 105d89352a9SBen Walker struct spdk_blob_store *bs; 106d89352a9SBen Walker 107d89352a9SBen Walker uint32_t open_ref; 108d89352a9SBen Walker 109d89352a9SBen Walker spdk_blob_id id; 110d7e065beSTomasz Kulasek spdk_blob_id parent_id; 111d89352a9SBen Walker 112d89352a9SBen Walker enum spdk_blob_state state; 113d89352a9SBen Walker 114d89352a9SBen Walker /* Two copies of the mutable data. One is a version 115d89352a9SBen Walker * that matches the last known data on disk (clean). 116d89352a9SBen Walker * The other (active) is the current data. Syncing 117d89352a9SBen Walker * a blob makes the clean match the active. 118d89352a9SBen Walker */ 119d89352a9SBen Walker struct spdk_blob_mut_data clean; 120d89352a9SBen Walker struct spdk_blob_mut_data active; 121d89352a9SBen Walker 12275cb2da9SJim Harris bool invalid; 123f2223d7dSJim Harris bool data_ro; 124f2223d7dSJim Harris bool md_ro; 125f2223d7dSJim Harris 126d12ba75bSJim Harris uint64_t invalid_flags; 127d12ba75bSJim Harris uint64_t data_ro_flags; 128d12ba75bSJim Harris uint64_t md_ro_flags; 129d12ba75bSJim Harris 1304132ac52SMaciej Szwed struct spdk_bs_dev *back_bs_dev; 1314132ac52SMaciej Szwed 132d89352a9SBen Walker /* TODO: The xattrs are mutable, but we don't want to be 1336fa48bbfSChen Wang * copying them unnecessarily. Figure this out. 134d89352a9SBen Walker */ 1357ba8c006SPiotr Pelplinski struct spdk_xattr_tailq xattrs; 1367ba8c006SPiotr Pelplinski struct spdk_xattr_tailq xattrs_internal; 137d89352a9SBen Walker 1387de351f1SLiu Xiaodong RB_ENTRY(spdk_blob) link; 1398c45ed38SPiotr Pelplinski 1408c45ed38SPiotr Pelplinski uint32_t frozen_refcnt; 1418256cecfSMaciej Szwed bool locked_operation_in_progress; 142adb39585SMaciej Szwed enum blob_clear_method clear_method; 143c33840b7STomasz Zawadzki bool extent_rle_found; 144c33840b7STomasz Zawadzki bool extent_table_found; 145c33840b7STomasz Zawadzki bool use_extent_table; 146f60b4a7eSTomasz Zawadzki 147030be573STomasz Zawadzki /* A list of pending metadata pending_persists */ 148030be573STomasz Zawadzki TAILQ_HEAD(, spdk_blob_persist_ctx) pending_persists; 149ceaa0c7fSTomasz Zawadzki TAILQ_HEAD(, spdk_blob_persist_ctx) persists_to_complete; 150030be573STomasz Zawadzki 151cc6920a4SJosh Soref /* Number of data clusters retrieved from extent table, 152f60b4a7eSTomasz Zawadzki * that many have to be read from extent pages. */ 15378257ab6STomasz Zawadzki uint64_t remaining_clusters_in_et; 154d89352a9SBen Walker }; 155d89352a9SBen Walker 156d89352a9SBen Walker struct spdk_blob_store { 157d89352a9SBen Walker uint64_t md_start; /* Offset from beginning of disk, in pages */ 158d89352a9SBen Walker uint32_t md_len; /* Count, in pages */ 159*2dc4a231SAtul Malakar uint32_t md_page_size; /* Metadata page size */ 16060e8fb49SBen Walker 161d89352a9SBen Walker struct spdk_io_channel *md_channel; 16260e8fb49SBen Walker uint32_t max_channel_ops; 163d89352a9SBen Walker 164dfb102b7SJim Harris struct spdk_thread *md_thread; 165dfb102b7SJim Harris 166d89352a9SBen Walker struct spdk_bs_dev *dev; 167d89352a9SBen Walker 1682a608d02SMike Gerdts struct spdk_bit_array *used_md_pages; /* Protected by used_lock */ 1692a608d02SMike Gerdts struct spdk_bit_pool *used_clusters; /* Protected by used_lock */ 17040c911b9SJim Harris struct spdk_bit_array *used_blobids; 17130ee8137SBen Walker struct spdk_bit_array *open_blobids; 172d89352a9SBen Walker 173316cf9efSMike Gerdts struct spdk_spinlock used_lock; 1749103821dSMaciej Szwed 175d89352a9SBen Walker uint32_t cluster_sz; 176d89352a9SBen Walker uint64_t total_clusters; 1775eb52b95STomasz Zawadzki uint64_t total_data_clusters; 1782a608d02SMike Gerdts uint64_t num_free_clusters; /* Protected by used_lock */ 179f3001308SJim Harris uint64_t pages_per_cluster; 1803299bf6dSJim Harris uint64_t io_units_per_cluster; 181b3348624STomasz Zawadzki uint8_t pages_per_cluster_shift; 1823299bf6dSJim Harris uint8_t io_units_per_cluster_shift; 1836609b776SPiotr Pelplinski uint32_t io_unit_size; 184d89352a9SBen Walker 185d89352a9SBen Walker spdk_blob_id super_blob; 186eb8b1e20SMaciej Szwed struct spdk_bs_type bstype; 187d89352a9SBen Walker 188130d278aSPaul Luse struct spdk_bs_cpl unload_cpl; 189130d278aSPaul Luse int unload_err; 190130d278aSPaul Luse 1917de351f1SLiu Xiaodong RB_HEAD(spdk_blob_tree, spdk_blob) open_blobs; 192d7e065beSTomasz Kulasek TAILQ_HEAD(, spdk_blob_list) snapshots; 193bc8f2cd9SPiotr Pelplinski 194bc8f2cd9SPiotr Pelplinski bool clean; 195ce67e0c7SMike Gerdts 196ce67e0c7SMike Gerdts spdk_bs_esnap_dev_create esnap_bs_dev_create; 197a4a73fecSMike Gerdts void *esnap_ctx; 198ba91ffbaSMike Gerdts 199ba91ffbaSMike Gerdts /* If external snapshot channels are being destroyed while 200ba91ffbaSMike Gerdts * the blobstore is unloaded, the unload is deferred until 201ba91ffbaSMike Gerdts * after the channel destruction completes. 202ba91ffbaSMike Gerdts */ 203ba91ffbaSMike Gerdts uint32_t esnap_channels_unloading; 204ba91ffbaSMike Gerdts spdk_bs_op_complete esnap_unload_cb_fn; 205ba91ffbaSMike Gerdts void *esnap_unload_cb_arg; 206d89352a9SBen Walker }; 207d89352a9SBen Walker 208d89352a9SBen Walker struct spdk_bs_channel { 209d89352a9SBen Walker struct spdk_bs_request_set *req_mem; 210d89352a9SBen Walker TAILQ_HEAD(, spdk_bs_request_set) reqs; 211d89352a9SBen Walker 212d89352a9SBen Walker struct spdk_blob_store *bs; 213d89352a9SBen Walker 214d89352a9SBen Walker struct spdk_bs_dev *dev; 215d89352a9SBen Walker struct spdk_io_channel *dev_channel; 2164132ac52SMaciej Szwed 2171eca87c3SAlexey Marchuk /* This page is only used during insert of a new cluster. */ 2181eca87c3SAlexey Marchuk struct spdk_blob_md_page *new_cluster_page; 2191eca87c3SAlexey Marchuk 2204132ac52SMaciej Szwed TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc; 2218c45ed38SPiotr Pelplinski TAILQ_HEAD(, spdk_bs_request_set) queued_io; 222b47cee6cSMike Gerdts 223b47cee6cSMike Gerdts RB_HEAD(blob_esnap_channel_tree, blob_esnap_channel) esnap_channels; 224d89352a9SBen Walker }; 225d89352a9SBen Walker 226f6e075cdSMaciej Szwed /** operation type */ 227f6e075cdSMaciej Szwed enum spdk_blob_op_type { 228f6e075cdSMaciej Szwed SPDK_BLOB_WRITE, 229f6e075cdSMaciej Szwed SPDK_BLOB_READ, 230f6e075cdSMaciej Szwed SPDK_BLOB_UNMAP, 231f6e075cdSMaciej Szwed SPDK_BLOB_WRITE_ZEROES, 232b2503cb3SJim Harris SPDK_BLOB_WRITEV, 233b2503cb3SJim Harris SPDK_BLOB_READV, 234f6e075cdSMaciej Szwed }; 235f6e075cdSMaciej Szwed 236c26c4e9fSPiotr Pelplinski /* back bs_dev */ 237c26c4e9fSPiotr Pelplinski 238c26c4e9fSPiotr Pelplinski #define BLOB_SNAPSHOT "SNAP" 239777627e0SPiotr Pelplinski #define SNAPSHOT_IN_PROGRESS "SNAPTMP" 24092cafd15SMaciej Szwed #define SNAPSHOT_PENDING_REMOVAL "SNAPRM" 241ce67e0c7SMike Gerdts #define BLOB_EXTERNAL_SNAPSHOT_ID "EXTSNAP" 242c26c4e9fSPiotr Pelplinski 243c26c4e9fSPiotr Pelplinski struct spdk_blob_bs_dev { 244c26c4e9fSPiotr Pelplinski struct spdk_bs_dev bs_dev; 245c26c4e9fSPiotr Pelplinski struct spdk_blob *blob; 246c26c4e9fSPiotr Pelplinski }; 247c26c4e9fSPiotr Pelplinski 248d89352a9SBen Walker /* On-Disk Data Structures 249d89352a9SBen Walker * 250d89352a9SBen Walker * The following data structures exist on disk. 251d89352a9SBen Walker */ 252eb8b1e20SMaciej Szwed #define SPDK_BS_INITIAL_VERSION 1 253d12ba75bSJim Harris #define SPDK_BS_VERSION 3 /* current version */ 254d89352a9SBen Walker 255d89352a9SBen Walker #pragma pack(push, 1) 256d89352a9SBen Walker 257d89352a9SBen Walker #define SPDK_MD_MASK_TYPE_USED_PAGES 0 258d89352a9SBen Walker #define SPDK_MD_MASK_TYPE_USED_CLUSTERS 1 25940c911b9SJim Harris #define SPDK_MD_MASK_TYPE_USED_BLOBIDS 2 260d89352a9SBen Walker 261d89352a9SBen Walker struct spdk_bs_md_mask { 262d89352a9SBen Walker uint8_t type; 263d89352a9SBen Walker uint32_t length; /* In bits */ 264d89352a9SBen Walker uint8_t mask[0]; 265d89352a9SBen Walker }; 266d89352a9SBen Walker 267d89352a9SBen Walker #define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0 268d89352a9SBen Walker #define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2 269d12ba75bSJim Harris #define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3 2707ba8c006SPiotr Pelplinski #define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4 271d89352a9SBen Walker 272f60b4a7eSTomasz Zawadzki /* Following descriptors define cluster layout in a blob. 273f60b4a7eSTomasz Zawadzki * EXTENT_RLE cannot be present in blobs metadata, 274f60b4a7eSTomasz Zawadzki * at the same time as EXTENT_TABLE and EXTENT_PAGE descriptors. */ 2753dadb79eSTomasz Zawadzki 2763dadb79eSTomasz Zawadzki /* EXTENT_RLE descriptor holds an array of LBA that points to 2773dadb79eSTomasz Zawadzki * beginning of allocated clusters. The array is run-length encoded, 2783dadb79eSTomasz Zawadzki * with 0's being unallocated clusters. It is part of serialized 2793dadb79eSTomasz Zawadzki * metadata chain for a blob. */ 2803dadb79eSTomasz Zawadzki #define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE 1 281f60b4a7eSTomasz Zawadzki /* EXTENT_TABLE descriptor holds array of md page offsets that 282f60b4a7eSTomasz Zawadzki * point to pages with EXTENT_PAGE descriptor. The 0's in the array 283f60b4a7eSTomasz Zawadzki * are run-length encoded, non-zero values are unallocated pages. 284f60b4a7eSTomasz Zawadzki * It is part of serialized metadata chain for a blob. */ 285f60b4a7eSTomasz Zawadzki #define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE 5 286f4e58993STomasz Zawadzki /* EXTENT_PAGE descriptor holds an array of LBAs that point to 287f4e58993STomasz Zawadzki * beginning of allocated clusters. The array is run-length encoded, 288f4e58993STomasz Zawadzki * with 0's being unallocated clusters. It is NOT part of 289f4e58993STomasz Zawadzki * serialized metadata chain for a blob. */ 290f4e58993STomasz Zawadzki #define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE 6 2913dadb79eSTomasz Zawadzki 292d89352a9SBen Walker struct spdk_blob_md_descriptor_xattr { 293d89352a9SBen Walker uint8_t type; 294d89352a9SBen Walker uint32_t length; 295d89352a9SBen Walker 296d89352a9SBen Walker uint16_t name_length; 297d89352a9SBen Walker uint16_t value_length; 298d89352a9SBen Walker 299d89352a9SBen Walker char name[0]; 300d89352a9SBen Walker /* String name immediately followed by string value. */ 301d89352a9SBen Walker }; 302d89352a9SBen Walker 3033e372f35STomasz Zawadzki struct spdk_blob_md_descriptor_extent_rle { 304d89352a9SBen Walker uint8_t type; 305d89352a9SBen Walker uint32_t length; 306d89352a9SBen Walker 307d89352a9SBen Walker struct { 308d89352a9SBen Walker uint32_t cluster_idx; 309d89352a9SBen Walker uint32_t length; /* In units of clusters */ 310d89352a9SBen Walker } extents[0]; 311d89352a9SBen Walker }; 312d89352a9SBen Walker 313f60b4a7eSTomasz Zawadzki struct spdk_blob_md_descriptor_extent_table { 314f60b4a7eSTomasz Zawadzki uint8_t type; 315f60b4a7eSTomasz Zawadzki uint32_t length; 316f60b4a7eSTomasz Zawadzki 317f60b4a7eSTomasz Zawadzki /* Number of data clusters in the blob */ 318f60b4a7eSTomasz Zawadzki uint64_t num_clusters; 319f60b4a7eSTomasz Zawadzki 320f60b4a7eSTomasz Zawadzki struct { 321f60b4a7eSTomasz Zawadzki uint32_t page_idx; 322f60b4a7eSTomasz Zawadzki uint32_t num_pages; /* In units of pages */ 323f60b4a7eSTomasz Zawadzki } extent_page[0]; 324f60b4a7eSTomasz Zawadzki }; 325f60b4a7eSTomasz Zawadzki 326f4e58993STomasz Zawadzki struct spdk_blob_md_descriptor_extent_page { 327f4e58993STomasz Zawadzki uint8_t type; 328f4e58993STomasz Zawadzki uint32_t length; 329f4e58993STomasz Zawadzki 33042109157STomasz Zawadzki /* First cluster index in this extent page */ 33142109157STomasz Zawadzki uint32_t start_cluster_idx; 33242109157STomasz Zawadzki 333f4e58993STomasz Zawadzki uint32_t cluster_idx[0]; 334f4e58993STomasz Zawadzki }; 335f4e58993STomasz Zawadzki 336489ea86eSPiotr Pelplinski #define SPDK_BLOB_THIN_PROV (1ULL << 0) 3377ba8c006SPiotr Pelplinski #define SPDK_BLOB_INTERNAL_XATTR (1ULL << 1) 33829bd5020STomasz Zawadzki #define SPDK_BLOB_EXTENT_TABLE (1ULL << 2) 339ce67e0c7SMike Gerdts #define SPDK_BLOB_EXTERNAL_SNAPSHOT (1ULL << 3) 340ce67e0c7SMike Gerdts #define SPDK_BLOB_INVALID_FLAGS_MASK (SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR | \ 341ce67e0c7SMike Gerdts SPDK_BLOB_EXTENT_TABLE | SPDK_BLOB_EXTERNAL_SNAPSHOT) 342c315d8e8SPiotr Pelplinski 343c315d8e8SPiotr Pelplinski #define SPDK_BLOB_READ_ONLY (1ULL << 0) 344c315d8e8SPiotr Pelplinski #define SPDK_BLOB_DATA_RO_FLAGS_MASK SPDK_BLOB_READ_ONLY 345ea69d6d6Spaul luse 346ea69d6d6Spaul luse #define SPDK_BLOB_CLEAR_METHOD_SHIFT 0 347ea69d6d6Spaul luse #define SPDK_BLOB_CLEAR_METHOD (3ULL << SPDK_BLOB_CLEAR_METHOD_SHIFT) 348ea69d6d6Spaul luse #define SPDK_BLOB_MD_RO_FLAGS_MASK SPDK_BLOB_CLEAR_METHOD 349d12ba75bSJim Harris 350d12ba75bSJim Harris struct spdk_blob_md_descriptor_flags { 351d12ba75bSJim Harris uint8_t type; 352d12ba75bSJim Harris uint32_t length; 353d12ba75bSJim Harris 354d12ba75bSJim Harris /* 355d12ba75bSJim Harris * If a flag in invalid_flags is set that the application is not aware of, 356d12ba75bSJim Harris * it will not allow the blob to be opened. 357d12ba75bSJim Harris */ 358d12ba75bSJim Harris uint64_t invalid_flags; 359d12ba75bSJim Harris 360d12ba75bSJim Harris /* 361d12ba75bSJim Harris * If a flag in data_ro_flags is set that the application is not aware of, 362d12ba75bSJim Harris * allow the blob to be opened in data_read_only and md_read_only mode. 363d12ba75bSJim Harris */ 364d12ba75bSJim Harris uint64_t data_ro_flags; 365d12ba75bSJim Harris 366d12ba75bSJim Harris /* 3679b72cda8SMike Gerdts * If a flag in md_ro_flags is set the application is not aware of, 368d12ba75bSJim Harris * allow the blob to be opened in md_read_only mode. 369d12ba75bSJim Harris */ 370d12ba75bSJim Harris uint64_t md_ro_flags; 371d12ba75bSJim Harris }; 372d12ba75bSJim Harris 373d89352a9SBen Walker struct spdk_blob_md_descriptor { 374d89352a9SBen Walker uint8_t type; 375d89352a9SBen Walker uint32_t length; 376d89352a9SBen Walker }; 377d89352a9SBen Walker 378d89352a9SBen Walker #define SPDK_INVALID_MD_PAGE UINT32_MAX 379d89352a9SBen Walker 380d89352a9SBen Walker struct spdk_blob_md_page { 381d89352a9SBen Walker spdk_blob_id id; 382d89352a9SBen Walker 383d89352a9SBen Walker uint32_t sequence_num; 384d89352a9SBen Walker uint32_t reserved0; 385d89352a9SBen Walker 386d89352a9SBen Walker /* Descriptors here */ 38797b3efa3SBen Walker uint8_t descriptors[4072]; 388d89352a9SBen Walker 389d89352a9SBen Walker uint32_t next; 390d89352a9SBen Walker uint32_t crc; 391d89352a9SBen Walker }; 39226e9b6eaSPaul Luse #define SPDK_BS_PAGE_SIZE 0x1000 39326e9b6eaSPaul Luse SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size"); 394d89352a9SBen Walker 395320ab72fSShuhei Matsumoto #define SPDK_BS_MAX_DESC_SIZE SPDK_SIZEOF_MEMBER(struct spdk_blob_md_page, descriptors) 39669a8877eSTomasz Zawadzki 397e1ce5515STomasz Zawadzki /* Maximum number of extents a single Extent Page can fit. 398e1ce5515STomasz Zawadzki * For an SPDK_BS_PAGE_SIZE of 4K SPDK_EXTENTS_PER_EP would be 512. */ 399e1ce5515STomasz Zawadzki #define SPDK_EXTENTS_PER_EP_MAX ((SPDK_BS_MAX_DESC_SIZE - sizeof(struct spdk_blob_md_descriptor_extent_page)) / sizeof(uint32_t)) 400e1ce5515STomasz Zawadzki #define SPDK_EXTENTS_PER_EP (spdk_align64pow2(SPDK_EXTENTS_PER_EP_MAX + 1) >> 1u) 40159f7f3f7STomasz Zawadzki 402d89352a9SBen Walker #define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB" 403d89352a9SBen Walker 404d89352a9SBen Walker struct spdk_bs_super_block { 405d89352a9SBen Walker uint8_t signature[8]; 406d89352a9SBen Walker uint32_t version; 407d89352a9SBen Walker uint32_t length; 408d89352a9SBen Walker uint32_t clean; /* If there was a clean shutdown, this is 1. */ 409d89352a9SBen Walker spdk_blob_id super_blob; 410d89352a9SBen Walker 411d89352a9SBen Walker uint32_t cluster_size; /* In bytes */ 412d89352a9SBen Walker 413d89352a9SBen Walker uint32_t used_page_mask_start; /* Offset from beginning of disk, in pages */ 414d89352a9SBen Walker uint32_t used_page_mask_len; /* Count, in pages */ 415d89352a9SBen Walker 416d89352a9SBen Walker uint32_t used_cluster_mask_start; /* Offset from beginning of disk, in pages */ 417d89352a9SBen Walker uint32_t used_cluster_mask_len; /* Count, in pages */ 418d89352a9SBen Walker 419d89352a9SBen Walker uint32_t md_start; /* Offset from beginning of disk, in pages */ 420d89352a9SBen Walker uint32_t md_len; /* Count, in pages */ 421d89352a9SBen Walker 422eb8b1e20SMaciej Szwed struct spdk_bs_type bstype; /* blobstore type */ 423eb8b1e20SMaciej Szwed 42440c911b9SJim Harris uint32_t used_blobid_mask_start; /* Offset from beginning of disk, in pages */ 42540c911b9SJim Harris uint32_t used_blobid_mask_len; /* Count, in pages */ 42640c911b9SJim Harris 4272c91e919SPiotr Pelplinski uint64_t size; /* size of blobstore in bytes */ 4286609b776SPiotr Pelplinski uint32_t io_unit_size; /* Size of io unit in bytes */ 4292c91e919SPiotr Pelplinski 430*2dc4a231SAtul Malakar uint32_t md_page_size; /* Size in bytes */ 431*2dc4a231SAtul Malakar uint8_t reserved[3996]; 432*2dc4a231SAtul Malakar 43329bcd5a9SCunyin Chang uint32_t crc; 434d89352a9SBen Walker }; 435d89352a9SBen Walker SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size"); 436d89352a9SBen Walker 437d89352a9SBen Walker #pragma pack(pop) 438d89352a9SBen Walker 439ad7fdd12SSeth Howell struct spdk_bs_dev *bs_create_zeroes_dev(void); 440ad7fdd12SSeth Howell struct spdk_bs_dev *bs_create_blob_bs_dev(struct spdk_blob *blob); 441b47cee6cSMike Gerdts struct spdk_io_channel *blob_esnap_get_io_channel(struct spdk_io_channel *ch, 442b47cee6cSMike Gerdts struct spdk_blob *blob); 443d3594f84Sxupeng-mingtu bool blob_backed_with_zeroes_dev(struct spdk_blob *blob); 4448970f868SBen Walker 445d89352a9SBen Walker /* Unit Conversions 446d89352a9SBen Walker * 447d89352a9SBen Walker * The blobstore works with several different units: 448d89352a9SBen Walker * - Byte: Self explanatory 449d89352a9SBen Walker * - LBA: The logical blocks on the backing storage device. 450d89352a9SBen Walker * - Page: The read/write units of blobs and metadata. This is 451d89352a9SBen Walker * an offset into a blob in units of 4KiB. 452d89352a9SBen Walker * - Cluster Index: The disk is broken into a sequential list of 453d89352a9SBen Walker * clusters. This is the offset from the beginning. 454d89352a9SBen Walker * 455d89352a9SBen Walker * NOTE: These conversions all act on simple magnitudes, not with any sort 456d89352a9SBen Walker * of knowledge about the blobs themselves. For instance, converting 457d89352a9SBen Walker * a page to an lba with the conversion function below simply converts 458d89352a9SBen Walker * a number of pages to an equivalent number of lbas, but that 459d89352a9SBen Walker * lba certainly isn't the right lba that corresponds to a page offset 460d89352a9SBen Walker * for a particular blob. 461d89352a9SBen Walker */ 462d89352a9SBen Walker static inline uint64_t 463b5d68d59SSeth Howell bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length) 464d89352a9SBen Walker { 465d89352a9SBen Walker assert(length % bs->dev->blocklen == 0); 466d89352a9SBen Walker 467d89352a9SBen Walker return length / bs->dev->blocklen; 468d89352a9SBen Walker } 469d89352a9SBen Walker 470d89352a9SBen Walker static inline uint64_t 471b5d68d59SSeth Howell bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length) 4724132ac52SMaciej Szwed { 4734132ac52SMaciej Szwed assert(length % bs_dev->blocklen == 0); 4744132ac52SMaciej Szwed 4754132ac52SMaciej Szwed return length / bs_dev->blocklen; 4764132ac52SMaciej Szwed } 4774132ac52SMaciej Szwed 4784132ac52SMaciej Szwed static inline uint64_t 479b5d68d59SSeth Howell bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page) 480d89352a9SBen Walker { 481*2dc4a231SAtul Malakar return page * bs->md_page_size / bs->dev->blocklen; 482d89352a9SBen Walker } 483d89352a9SBen Walker 4844132ac52SMaciej Szwed static inline uint64_t 485b5d68d59SSeth Howell bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page) 4864b8db27bSTomasz Zawadzki { 4874b8db27bSTomasz Zawadzki assert(page < bs->md_len); 488b5d68d59SSeth Howell return bs_page_to_lba(bs, page + bs->md_start); 4894b8db27bSTomasz Zawadzki } 4904b8db27bSTomasz Zawadzki 4914b8db27bSTomasz Zawadzki static inline uint64_t 4923299bf6dSJim Harris bs_dev_io_unit_to_lba(struct spdk_blob *blob, struct spdk_bs_dev *bs_dev, uint64_t io_unit) 4934132ac52SMaciej Szwed { 4943299bf6dSJim Harris return io_unit * blob->bs->io_unit_size / bs_dev->blocklen; 4954132ac52SMaciej Szwed } 4964132ac52SMaciej Szwed 497f3001308SJim Harris static inline uint64_t 4983299bf6dSJim Harris bs_cluster_to_io_unit(struct spdk_blob_store *bs, uint32_t cluster) 499d89352a9SBen Walker { 5003299bf6dSJim Harris return (uint64_t)cluster * bs->io_units_per_cluster; 501d89352a9SBen Walker } 502d89352a9SBen Walker 503d89352a9SBen Walker static inline uint32_t 5043299bf6dSJim Harris bs_io_unit_to_cluster(struct spdk_blob_store *bs, uint64_t io_unit) 505d89352a9SBen Walker { 5063299bf6dSJim Harris assert(io_unit % bs->io_units_per_cluster == 0); 507d89352a9SBen Walker 5083299bf6dSJim Harris return io_unit / bs->io_units_per_cluster; 509d89352a9SBen Walker } 510d89352a9SBen Walker 511d89352a9SBen Walker static inline uint64_t 512b5d68d59SSeth Howell bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster) 513d89352a9SBen Walker { 514ffa82355SJim Harris assert(bs->cluster_sz / bs->dev->blocklen > 0); 515ffa82355SJim Harris 51689426e9bSDaniel Verkamp return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen); 517d89352a9SBen Walker } 518d89352a9SBen Walker 519d89352a9SBen Walker static inline uint32_t 520b5d68d59SSeth Howell bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba) 521d89352a9SBen Walker { 522d89352a9SBen Walker assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0); 523d89352a9SBen Walker 524d89352a9SBen Walker return lba / (bs->cluster_sz / bs->dev->blocklen); 525d89352a9SBen Walker } 526d89352a9SBen Walker 5274132ac52SMaciej Szwed static inline uint64_t 528b5d68d59SSeth Howell bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit) 5294132ac52SMaciej Szwed { 5306609b776SPiotr Pelplinski return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen); 5314132ac52SMaciej Szwed } 5324132ac52SMaciej Szwed 5334132ac52SMaciej Szwed static inline uint64_t 534b5d68d59SSeth Howell bs_cluster_to_extent_table_id(uint64_t cluster_num) 5351b23560fSTomasz Zawadzki { 5361b23560fSTomasz Zawadzki return cluster_num / SPDK_EXTENTS_PER_EP; 5371b23560fSTomasz Zawadzki } 5381b23560fSTomasz Zawadzki 5391b23560fSTomasz Zawadzki static inline uint32_t * 540b5d68d59SSeth Howell bs_cluster_to_extent_page(struct spdk_blob *blob, uint64_t cluster_num) 5411b23560fSTomasz Zawadzki { 542b5d68d59SSeth Howell uint64_t extent_table_id = bs_cluster_to_extent_table_id(cluster_num); 5431b23560fSTomasz Zawadzki 5442bccb7c9STomasz Zawadzki assert(blob->use_extent_table); 5452bccb7c9STomasz Zawadzki assert(extent_table_id < blob->active.extent_pages_array_size); 5461b23560fSTomasz Zawadzki 5471b23560fSTomasz Zawadzki return &blob->active.extent_pages[extent_table_id]; 5481b23560fSTomasz Zawadzki } 5491b23560fSTomasz Zawadzki 550ddf5a8daSDamiano Cipriani static inline uint64_t 551ddf5a8daSDamiano Cipriani bs_io_units_per_cluster(struct spdk_blob *blob) 552ddf5a8daSDamiano Cipriani { 5533299bf6dSJim Harris return blob->bs->io_units_per_cluster; 554ddf5a8daSDamiano Cipriani } 555ddf5a8daSDamiano Cipriani 556d89352a9SBen Walker /* End basic conversions */ 557d89352a9SBen Walker 558f3001308SJim Harris static inline uint64_t 559b5d68d59SSeth Howell bs_blobid_to_page(spdk_blob_id id) 560d89352a9SBen Walker { 561d89352a9SBen Walker return id & 0xFFFFFFFF; 562d89352a9SBen Walker } 563d89352a9SBen Walker 564721695e1SPaul Luse /* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper 565721695e1SPaul Luse * 32 bits are not currently used. Stick a 1 there just to catch bugs where the 566721695e1SPaul Luse * code assumes blob id == page_idx. 567721695e1SPaul Luse */ 568721695e1SPaul Luse static inline spdk_blob_id 569b5d68d59SSeth Howell bs_page_to_blobid(uint64_t page_idx) 570721695e1SPaul Luse { 571f3001308SJim Harris if (page_idx > UINT32_MAX) { 572f3001308SJim Harris return SPDK_BLOBID_INVALID; 573f3001308SJim Harris } 574721695e1SPaul Luse return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx; 575721695e1SPaul Luse } 576721695e1SPaul Luse 5776609b776SPiotr Pelplinski /* Given an io unit offset into a blob, look up the LBA for the 5786609b776SPiotr Pelplinski * start of that io unit. 579d89352a9SBen Walker */ 580d89352a9SBen Walker static inline uint64_t 581b5d68d59SSeth Howell bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit) 582d89352a9SBen Walker { 583d89352a9SBen Walker uint64_t lba; 584b3348624STomasz Zawadzki uint8_t shift; 5853299bf6dSJim Harris uint64_t io_units_per_cluster = blob->bs->io_units_per_cluster; 5866609b776SPiotr Pelplinski 5873299bf6dSJim Harris shift = blob->bs->io_units_per_cluster_shift; 5883299bf6dSJim Harris assert(io_unit < blob->active.num_clusters * io_units_per_cluster); 589b3348624STomasz Zawadzki if (shift != 0) { 5903299bf6dSJim Harris lba = blob->active.clusters[io_unit >> shift]; 591b3348624STomasz Zawadzki } else { 5923299bf6dSJim Harris lba = blob->active.clusters[io_unit / io_units_per_cluster]; 593b3348624STomasz Zawadzki } 5943299bf6dSJim Harris if (lba == 0) { 5953299bf6dSJim Harris return 0; 5963299bf6dSJim Harris } else { 5973299bf6dSJim Harris return lba + io_unit % io_units_per_cluster; 5983299bf6dSJim Harris } 599d89352a9SBen Walker } 600d89352a9SBen Walker 6016609b776SPiotr Pelplinski /* Given an io_unit offset into a blob, look up the number of io_units until the 6026609b776SPiotr Pelplinski * next cluster boundary. 6036609b776SPiotr Pelplinski */ 6046609b776SPiotr Pelplinski static inline uint32_t 605b5d68d59SSeth Howell bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit) 6066609b776SPiotr Pelplinski { 6076609b776SPiotr Pelplinski uint64_t io_units_per_cluster; 6086609b776SPiotr Pelplinski 609ddf5a8daSDamiano Cipriani io_units_per_cluster = bs_io_units_per_cluster(blob); 6106609b776SPiotr Pelplinski 6116609b776SPiotr Pelplinski return io_units_per_cluster - (io_unit % io_units_per_cluster); 6126609b776SPiotr Pelplinski } 6136609b776SPiotr Pelplinski 6143299bf6dSJim Harris /* Given an io_unit offset into a blob, look up the number of io_unit into blob to beginning of current cluster */ 615952532afSJinlong Chen static inline uint64_t 616b5d68d59SSeth Howell bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit) 6174132ac52SMaciej Szwed { 6183299bf6dSJim Harris uint64_t io_units_per_cluster = blob->bs->io_units_per_cluster; 6194132ac52SMaciej Szwed 6203299bf6dSJim Harris return io_unit - (io_unit % io_units_per_cluster); 6214132ac52SMaciej Szwed } 6224132ac52SMaciej Szwed 6236609b776SPiotr Pelplinski /* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ 6246609b776SPiotr Pelplinski static inline uint32_t 625b5d68d59SSeth Howell bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit) 6266609b776SPiotr Pelplinski { 6273299bf6dSJim Harris uint64_t io_units_per_cluster = blob->bs->io_units_per_cluster; 6283299bf6dSJim Harris uint8_t shift = blob->bs->io_units_per_cluster_shift; 629b3348624STomasz Zawadzki 630b3348624STomasz Zawadzki if (shift != 0) { 6313299bf6dSJim Harris return io_unit >> shift; 632b3348624STomasz Zawadzki } else { 6333299bf6dSJim Harris return io_unit / io_units_per_cluster; 634b3348624STomasz Zawadzki } 6356609b776SPiotr Pelplinski } 6366609b776SPiotr Pelplinski 6376609b776SPiotr Pelplinski /* Given an io unit offset into a blob, look up if it is from allocated cluster. */ 6384132ac52SMaciej Szwed static inline bool 639b5d68d59SSeth Howell bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit) 6404132ac52SMaciej Szwed { 6413299bf6dSJim Harris uint64_t lba = bs_blob_io_unit_to_lba(blob, io_unit); 6424132ac52SMaciej Szwed 6434132ac52SMaciej Szwed if (lba == 0) { 6444132ac52SMaciej Szwed assert(spdk_blob_is_thin_provisioned(blob)); 6454132ac52SMaciej Szwed return false; 6464132ac52SMaciej Szwed } else { 6474132ac52SMaciej Szwed return true; 6484132ac52SMaciej Szwed } 6494132ac52SMaciej Szwed } 6504132ac52SMaciej Szwed 651d89352a9SBen Walker #endif 652