/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#ifndef SPDK_BLOBSTORE_H
#define SPDK_BLOBSTORE_H

#include "spdk/assert.h"
#include "spdk/blob.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/tree.h"

#include "request.h"

/* In-Memory Data Structures
 *
 * The following data structures exist only in memory.
 */

#define SPDK_BLOB_OPTS_CLUSTER_SZ (1024 * 1024)
#define SPDK_BLOB_OPTS_NUM_MD_PAGES UINT32_MAX
#define SPDK_BLOB_OPTS_MAX_MD_OPS 32
#define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS 512
#define SPDK_BLOB_BLOBID_HIGH_BIT (1ULL << 32)

struct spdk_xattr {
	uint32_t	index;
	uint16_t	value_len;
	char		*name;
	void		*value;
	TAILQ_ENTRY(spdk_xattr)	link;
};

/* The mutable part of the blob data that is sync'd to
 * disk. The data in here is both mutable and persistent.
 */
struct spdk_blob_mut_data {
	/* Number of data clusters in the blob */
	uint64_t	num_clusters;

	/* Array of LBAs that are the beginning of a cluster, in
	 * the order they appear in the blob.
	 */
	uint64_t	*clusters;

	/* The size of the clusters array. This is greater than or
	 * equal to 'num_clusters'.
	 */
	size_t		cluster_array_size;

	/* Number of extent pages */
	uint64_t	num_extent_pages;

	/* Array of page offsets into the metadata region,
	 * containing extents. May contain entries for pages that
	 * are not yet allocated. */
	uint32_t	*extent_pages;

	/* The size of the extent page array. This is greater than or
	 * equal to 'num_extent_pages'. */
	size_t		extent_pages_array_size;

	/* Number of metadata pages */
	uint32_t	num_pages;

	/* Array of page offsets into the metadata region, in
	 * the order of the metadata page sequence.
	 */
	uint32_t	*pages;
};

enum spdk_blob_state {
	/* The in-memory version of the blob does not match the on-disk
	 * version.
	 */
	SPDK_BLOB_STATE_DIRTY,

	/* The in-memory version of the blob matches the on-disk
	 * version.
	 */
	SPDK_BLOB_STATE_CLEAN,

	/* The in-memory state is being synchronized with the on-disk
	 * blob state. */
	SPDK_BLOB_STATE_LOADING,
};

TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr);

struct spdk_blob_list {
	spdk_blob_id id;
	size_t clone_count;
	TAILQ_HEAD(, spdk_blob_list) clones;
	TAILQ_ENTRY(spdk_blob_list) link;
};

struct spdk_blob {
	struct spdk_blob_store	*bs;

	uint32_t	open_ref;

	spdk_blob_id	id;
	spdk_blob_id	parent_id;

	enum spdk_blob_state	state;

	/* Two copies of the mutable data. One is a version
	 * that matches the last known data on disk (clean).
	 * The other (active) is the current data. Syncing
	 * a blob makes the clean match the active.
	 */
	struct spdk_blob_mut_data	clean;
	struct spdk_blob_mut_data	active;

	bool		invalid;
	bool		data_ro;
	bool		md_ro;

	uint64_t	invalid_flags;
	uint64_t	data_ro_flags;
	uint64_t	md_ro_flags;

	struct spdk_bs_dev	*back_bs_dev;

	/* TODO: The xattrs are mutable, but we don't want to be
	 * copying them unnecessarily. Figure this out.
	 */
	struct spdk_xattr_tailq	xattrs;
	struct spdk_xattr_tailq	xattrs_internal;

	RB_ENTRY(spdk_blob)	link;

	uint32_t	frozen_refcnt;
	bool		locked_operation_in_progress;
	enum blob_clear_method	clear_method;
	bool		extent_rle_found;
	bool		extent_table_found;
	bool		use_extent_table;

	/* Lists of pending metadata persist operations */
	TAILQ_HEAD(, spdk_blob_persist_ctx) pending_persists;
	TAILQ_HEAD(, spdk_blob_persist_ctx) persists_to_complete;

	/* Number of data clusters retrieved from the extent table;
	 * that many cluster entries still have to be read from extent pages. */
	uint64_t	remaining_clusters_in_et;
};
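/* Illustrative sketch (not part of this header): the clean/active pair
 * above is what a metadata sync operates on. Mutating a blob changes
 * only 'active' and marks the blob SPDK_BLOB_STATE_DIRTY; once a sync
 * completes, 'clean' matches 'active' again. Assuming an open 'blob'
 * on the metadata thread, with hypothetical callbacks:
 *
 *	spdk_blob_resize(blob, new_num_clusters, resize_done, ctx);
 *	// in resize_done: the blob is dirty, active differs from clean
 *	spdk_blob_sync_md(blob, sync_done, ctx);
 *	// in sync_done: active has been persisted and copied to clean
 *
 * 'new_num_clusters', 'resize_done', 'sync_done' and 'ctx' are
 * placeholder application names, not SPDK symbols.
 */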
struct spdk_blob_store {
	uint64_t	md_start; /* Offset from beginning of disk, in pages */
	uint32_t	md_len; /* Count, in pages */

	struct spdk_io_channel	*md_channel;
	uint32_t	max_channel_ops;

	struct spdk_thread	*md_thread;

	struct spdk_bs_dev	*dev;

	struct spdk_bit_array	*used_md_pages;
	struct spdk_bit_pool	*used_clusters;
	struct spdk_bit_array	*used_blobids;
	struct spdk_bit_array	*open_blobids;

	pthread_mutex_t		used_clusters_mutex;

	uint32_t	cluster_sz;
	uint64_t	total_clusters;
	uint64_t	total_data_clusters;
	uint64_t	num_free_clusters;
	uint64_t	pages_per_cluster;
	uint8_t		pages_per_cluster_shift;
	uint32_t	io_unit_size;

	spdk_blob_id	super_blob;
	struct spdk_bs_type	bstype;

	struct spdk_bs_cpl	unload_cpl;
	int			unload_err;

	RB_HEAD(spdk_blob_tree, spdk_blob) open_blobs;
	TAILQ_HEAD(, spdk_blob_list) snapshots;

	bool	clean;
};

struct spdk_bs_channel {
	struct spdk_bs_request_set	*req_mem;
	TAILQ_HEAD(, spdk_bs_request_set) reqs;

	struct spdk_blob_store		*bs;

	struct spdk_bs_dev		*dev;
	struct spdk_io_channel		*dev_channel;

	/* This page is only used during insert of a new cluster. */
	struct spdk_blob_md_page	*new_cluster_page;

	TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc;
	TAILQ_HEAD(, spdk_bs_request_set) queued_io;
};

/** operation type */
enum spdk_blob_op_type {
	SPDK_BLOB_WRITE,
	SPDK_BLOB_READ,
	SPDK_BLOB_UNMAP,
	SPDK_BLOB_WRITE_ZEROES,
	SPDK_BLOB_WRITEV,
	SPDK_BLOB_READV,
};

/* back bs_dev */

#define BLOB_SNAPSHOT "SNAP"
#define SNAPSHOT_IN_PROGRESS "SNAPTMP"
#define SNAPSHOT_PENDING_REMOVAL "SNAPRM"

struct spdk_blob_bs_dev {
	struct spdk_bs_dev bs_dev;
	struct spdk_blob *blob;
};

/* On-Disk Data Structures
 *
 * The following data structures exist on disk.
 */
#define SPDK_BS_INITIAL_VERSION 1
#define SPDK_BS_VERSION 3 /* current version */

#pragma pack(push, 1)

#define SPDK_MD_MASK_TYPE_USED_PAGES 0
#define SPDK_MD_MASK_TYPE_USED_CLUSTERS 1
#define SPDK_MD_MASK_TYPE_USED_BLOBIDS 2

struct spdk_bs_md_mask {
	uint8_t		type;
	uint32_t	length; /* In bits */
	uint8_t		mask[0];
};
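/* Illustrative sketch (not an API in this header): each mask is
 * serialized as a spdk_bs_md_mask header followed by 'length' bits, so
 * the number of metadata pages a mask occupies can be computed as
 * below; 'mask_size_pages' is a hypothetical helper name.
 *
 *	static uint32_t
 *	mask_size_pages(uint64_t bit_count)
 *	{
 *		uint64_t bytes = sizeof(struct spdk_bs_md_mask) +
 *				 spdk_divide_round_up(bit_count, 8);
 *
 *		return spdk_divide_round_up(bytes, SPDK_BS_PAGE_SIZE);
 *	}
 *
 * For example, a blobstore with 2^20 clusters needs 128 KiB of
 * used_cluster mask bits, which is 33 pages once the header is counted.
 */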
#define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0
#define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2
#define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3
#define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4

/* The following descriptors define the cluster layout of a blob.
 * EXTENT_RLE cannot be present in a blob's metadata at the same time
 * as the EXTENT_TABLE and EXTENT_PAGE descriptors. */

/* An EXTENT_RLE descriptor holds an array of LBAs that point to the
 * beginnings of allocated clusters. The array is run-length encoded,
 * with 0's representing unallocated clusters. It is part of the
 * serialized metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE 1
/* An EXTENT_TABLE descriptor holds an array of md page offsets that
 * point to pages containing an EXTENT_PAGE descriptor. The 0's in the
 * array, which represent unallocated extent pages, are run-length
 * encoded; non-zero values point to allocated pages. It is part of
 * the serialized metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE 5
/* An EXTENT_PAGE descriptor holds an array of LBAs that point to the
 * beginnings of allocated clusters. The array is run-length encoded,
 * with 0's representing unallocated clusters. It is NOT part of the
 * serialized metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE 6

struct spdk_blob_md_descriptor_xattr {
	uint8_t		type;
	uint32_t	length;

	uint16_t	name_length;
	uint16_t	value_length;

	char		name[0];
	/* String name immediately followed by string value. */
};

struct spdk_blob_md_descriptor_extent_rle {
	uint8_t		type;
	uint32_t	length;

	struct {
		uint32_t	cluster_idx;
		uint32_t	length; /* In units of clusters */
	} extents[0];
};

struct spdk_blob_md_descriptor_extent_table {
	uint8_t		type;
	uint32_t	length;

	/* Number of data clusters in the blob */
	uint64_t	num_clusters;

	struct {
		uint32_t	page_idx;
		uint32_t	num_pages; /* In units of pages */
	} extent_page[0];
};

struct spdk_blob_md_descriptor_extent_page {
	uint8_t		type;
	uint32_t	length;

	/* First cluster index in this extent page */
	uint32_t	start_cluster_idx;

	uint32_t	cluster_idx[0];
};

#define SPDK_BLOB_THIN_PROV (1ULL << 0)
#define SPDK_BLOB_INTERNAL_XATTR (1ULL << 1)
#define SPDK_BLOB_EXTENT_TABLE (1ULL << 2)
#define SPDK_BLOB_INVALID_FLAGS_MASK	(SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR | SPDK_BLOB_EXTENT_TABLE)

#define SPDK_BLOB_READ_ONLY (1ULL << 0)
#define SPDK_BLOB_DATA_RO_FLAGS_MASK	SPDK_BLOB_READ_ONLY

#define SPDK_BLOB_CLEAR_METHOD_SHIFT 0
#define SPDK_BLOB_CLEAR_METHOD (3ULL << SPDK_BLOB_CLEAR_METHOD_SHIFT)
#define SPDK_BLOB_MD_RO_FLAGS_MASK	SPDK_BLOB_CLEAR_METHOD

struct spdk_blob_md_descriptor_flags {
	uint8_t		type;
	uint32_t	length;

	/*
	 * If a flag in invalid_flags is set that the application is not aware of,
	 * the blob will not be allowed to be opened.
	 */
	uint64_t	invalid_flags;

	/*
	 * If a flag in data_ro_flags is set that the application is not aware of,
	 * allow the blob to be opened in data_read_only and md_read_only mode.
	 */
	uint64_t	data_ro_flags;

	/*
	 * If a flag in md_ro_flags is set that the application is not aware of,
	 * allow the blob to be opened in md_read_only mode.
	 */
	uint64_t	md_ro_flags;
};
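/* Illustrative sketch of how the three flag words above gate opening a
 * blob when its metadata is parsed (the real checks live in
 * blobstore.c; this is not itself an API):
 *
 *	if (flags->invalid_flags & ~SPDK_BLOB_INVALID_FLAGS_MASK) {
 *		// unknown critical flag: refuse to open the blob
 *	}
 *	if (flags->data_ro_flags & ~SPDK_BLOB_DATA_RO_FLAGS_MASK) {
 *		// unknown data flag: force data_ro and md_ro on open
 *	}
 *	if (flags->md_ro_flags & ~SPDK_BLOB_MD_RO_FLAGS_MASK) {
 *		// unknown md flag: force md_ro on open
 *	}
 */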
struct spdk_blob_md_descriptor {
	uint8_t		type;
	uint32_t	length;
};

#define SPDK_INVALID_MD_PAGE UINT32_MAX

struct spdk_blob_md_page {
	spdk_blob_id	id;

	uint32_t	sequence_num;
	uint32_t	reserved0;

	/* Descriptors here */
	uint8_t		descriptors[4072];

	uint32_t	next;
	uint32_t	crc;
};
#define SPDK_BS_PAGE_SIZE 0x1000
SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size");

#define SPDK_BS_MAX_DESC_SIZE SPDK_SIZEOF_MEMBER(struct spdk_blob_md_page, descriptors)

/* Maximum number of extents a single Extent Page can hold.
 * For an SPDK_BS_PAGE_SIZE of 4K, SPDK_EXTENTS_PER_EP would be 512. */
#define SPDK_EXTENTS_PER_EP_MAX ((SPDK_BS_MAX_DESC_SIZE - sizeof(struct spdk_blob_md_descriptor_extent_page)) / sizeof(uint32_t))
#define SPDK_EXTENTS_PER_EP (spdk_align64pow2(SPDK_EXTENTS_PER_EP_MAX + 1) >> 1u)

#define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB"

struct spdk_bs_super_block {
	uint8_t		signature[8];
	uint32_t	version;
	uint32_t	length;
	uint32_t	clean; /* If there was a clean shutdown, this is 1. */
	spdk_blob_id	super_blob;

	uint32_t	cluster_size; /* In bytes */

	uint32_t	used_page_mask_start; /* Offset from beginning of disk, in pages */
	uint32_t	used_page_mask_len; /* Count, in pages */

	uint32_t	used_cluster_mask_start; /* Offset from beginning of disk, in pages */
	uint32_t	used_cluster_mask_len; /* Count, in pages */

	uint32_t	md_start; /* Offset from beginning of disk, in pages */
	uint32_t	md_len; /* Count, in pages */

	struct spdk_bs_type	bstype; /* blobstore type */

	uint32_t	used_blobid_mask_start; /* Offset from beginning of disk, in pages */
	uint32_t	used_blobid_mask_len; /* Count, in pages */

	uint64_t	size; /* size of blobstore in bytes */
	uint32_t	io_unit_size; /* Size of io unit in bytes */

	uint8_t		reserved[4000];
	uint32_t	crc;
};
SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size");

#pragma pack(pop)

struct spdk_bs_dev *bs_create_zeroes_dev(void);
struct spdk_bs_dev *bs_create_blob_bs_dev(struct spdk_blob *blob);
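/* Illustrative sketch of validating a freshly read super block
 * ('super' is a hypothetical local; the real checks live in
 * blobstore.c):
 *
 *	if (memcmp(super->signature, SPDK_BS_SUPER_BLOCK_SIG,
 *		   sizeof(super->signature)) != 0) {
 *		// not a blobstore
 *	}
 *	if (super->version > SPDK_BS_VERSION ||
 *	    super->version < SPDK_BS_INITIAL_VERSION) {
 *		// unsupported on-disk version
 *	}
 *	// 'crc' is a CRC-32C computed over the block excluding the
 *	// crc field itself; recompute it and compare before trusting
 *	// the rest of the fields.
 */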
/* Unit Conversions
 *
 * The blobstore works with several different units:
 * - Byte: Self explanatory.
 * - LBA: The logical blocks on the backing storage device.
 * - Page: The read/write units of blobs and metadata. This is
 *   an offset into a blob in units of 4KiB.
 * - Cluster Index: The disk is broken into a sequential list of
 *   clusters. This is the offset from the beginning.
 *
 * NOTE: These conversions all act on simple magnitudes, not with any sort
 * of knowledge about the blobs themselves. For instance, converting
 * a page to an lba with the conversion function below simply converts
 * a number of pages to an equivalent number of lbas, but that
 * lba certainly isn't the right lba that corresponds to a page offset
 * for a particular blob.
 */
static inline uint64_t
bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length)
{
	assert(length % bs->dev->blocklen == 0);

	return length / bs->dev->blocklen;
}

static inline uint64_t
bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length)
{
	assert(length % bs_dev->blocklen == 0);

	return length / bs_dev->blocklen;
}

static inline uint64_t
bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page)
{
	return page * SPDK_BS_PAGE_SIZE / bs->dev->blocklen;
}

static inline uint64_t
bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page)
{
	assert(page < bs->md_len);
	return bs_page_to_lba(bs, page + bs->md_start);
}

static inline uint64_t
bs_dev_page_to_lba(struct spdk_bs_dev *bs_dev, uint64_t page)
{
	return page * SPDK_BS_PAGE_SIZE / bs_dev->blocklen;
}

static inline uint64_t
bs_io_unit_per_page(struct spdk_blob_store *bs)
{
	return SPDK_BS_PAGE_SIZE / bs->io_unit_size;
}

static inline uint64_t
bs_io_unit_to_page(struct spdk_blob_store *bs, uint64_t io_unit)
{
	return io_unit / bs_io_unit_per_page(bs);
}

static inline uint64_t
bs_cluster_to_page(struct spdk_blob_store *bs, uint32_t cluster)
{
	return (uint64_t)cluster * bs->pages_per_cluster;
}

static inline uint32_t
bs_page_to_cluster(struct spdk_blob_store *bs, uint64_t page)
{
	assert(page % bs->pages_per_cluster == 0);

	return page / bs->pages_per_cluster;
}

static inline uint64_t
bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster)
{
	return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen);
}

static inline uint32_t
bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba)
{
	assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0);

	return lba / (bs->cluster_sz / bs->dev->blocklen);
}

static inline uint64_t
bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit)
{
	return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen);
}

static inline uint64_t
bs_cluster_to_extent_table_id(uint64_t cluster_num)
{
	return cluster_num / SPDK_EXTENTS_PER_EP;
}

static inline uint32_t *
bs_cluster_to_extent_page(struct spdk_blob *blob, uint64_t cluster_num)
{
	uint64_t extent_table_id = bs_cluster_to_extent_table_id(cluster_num);

	assert(blob->use_extent_table);
	assert(extent_table_id < blob->active.extent_pages_array_size);

	return &blob->active.extent_pages[extent_table_id];
}

/* End basic conversions */

static inline uint64_t
bs_blobid_to_page(spdk_blob_id id)
{
	return id & 0xFFFFFFFF;
}

/* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper
 * 32 bits are not currently used. Stick a 1 there just to catch bugs where the
 * code assumes blob id == page_idx.
 */
static inline spdk_blob_id
bs_page_to_blobid(uint64_t page_idx)
{
	if (page_idx > UINT32_MAX) {
		return SPDK_BLOBID_INVALID;
	}
	return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx;
}
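/* Worked example of the id encoding above (illustrative):
 *
 *	spdk_blob_id id = bs_page_to_blobid(10);
 *
 *	assert(id == (SPDK_BLOB_BLOBID_HIGH_BIT | 10));	// 0x10000000A
 *	assert(bs_blobid_to_page(id) == 10);
 *	assert(bs_page_to_blobid(1ULL << 32) == SPDK_BLOBID_INVALID);
 */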
/* Given an io unit offset into a blob, look up the LBA for the
 * start of that io unit.
 */
static inline uint64_t
bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit)
{
	uint64_t lba;
	uint64_t pages_per_cluster;
	uint8_t shift;
	uint64_t io_units_per_cluster;
	uint64_t io_units_per_page;
	uint64_t page;

	page = bs_io_unit_to_page(blob->bs, io_unit);

	pages_per_cluster = blob->bs->pages_per_cluster;
	shift = blob->bs->pages_per_cluster_shift;
	io_units_per_page = bs_io_unit_per_page(blob->bs);

	assert(page < blob->active.num_clusters * pages_per_cluster);

	if (shift != 0) {
		io_units_per_cluster = io_units_per_page << shift;
		lba = blob->active.clusters[page >> shift];
	} else {
		io_units_per_cluster = io_units_per_page * pages_per_cluster;
		lba = blob->active.clusters[page / pages_per_cluster];
	}
	lba += io_unit % io_units_per_cluster;
	return lba;
}

/* Given an io_unit offset into a blob, look up the number of io_units until the
 * next cluster boundary.
 */
static inline uint32_t
bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit)
{
	uint64_t io_units_per_cluster;
	uint8_t shift = blob->bs->pages_per_cluster_shift;

	if (shift != 0) {
		io_units_per_cluster = bs_io_unit_per_page(blob->bs) << shift;
	} else {
		io_units_per_cluster = bs_io_unit_per_page(blob->bs) * blob->bs->pages_per_cluster;
	}

	return io_units_per_cluster - (io_unit % io_units_per_cluster);
}

/* Given an io_unit offset into a blob, look up the page offset (in pages from
 * the start of the blob) of the beginning of the containing cluster. */
static inline uint32_t
bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit)
{
	uint64_t pages_per_cluster;
	uint64_t page;

	pages_per_cluster = blob->bs->pages_per_cluster;
	page = bs_io_unit_to_page(blob->bs, io_unit);

	return page - (page % pages_per_cluster);
}

/* Given an io_unit offset into a blob, look up the index of the cluster that
 * contains it. */
static inline uint32_t
bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit)
{
	uint64_t pages_per_cluster = blob->bs->pages_per_cluster;
	uint8_t shift = blob->bs->pages_per_cluster_shift;
	uint32_t page_offset;

	page_offset = io_unit / bs_io_unit_per_page(blob->bs);
	if (shift != 0) {
		return page_offset >> shift;
	} else {
		return page_offset / pages_per_cluster;
	}
}

/* Given an io unit offset into a blob, look up whether it falls within an
 * allocated cluster. */
static inline bool
bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit)
{
	uint64_t lba;
	uint64_t page;
	uint64_t pages_per_cluster;
	uint8_t shift;

	shift = blob->bs->pages_per_cluster_shift;
	pages_per_cluster = blob->bs->pages_per_cluster;
	page = bs_io_unit_to_page(blob->bs, io_unit);

	assert(page < blob->active.num_clusters * pages_per_cluster);

	if (shift != 0) {
		lba = blob->active.clusters[page >> shift];
	} else {
		lba = blob->active.clusters[page / pages_per_cluster];
	}

	if (lba == 0) {
		assert(spdk_blob_is_thin_provisioned(blob));
		return false;
	} else {
		return true;
	}
}

#endif