/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2017 Intel Corporation.
 *   All rights reserved.
 *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#ifndef SPDK_BLOBSTORE_H
#define SPDK_BLOBSTORE_H

#include "spdk/assert.h"
#include "spdk/blob.h"
#include "spdk/queue.h"
#include "spdk/util.h"
#include "spdk/tree.h"

#include "request.h"

/* In Memory Data Structures
 *
 * The following data structures exist only in memory.
 */

#define SPDK_BLOB_OPTS_CLUSTER_SZ (1024 * 1024)
#define SPDK_BLOB_OPTS_NUM_MD_PAGES UINT32_MAX
#define SPDK_BLOB_OPTS_MAX_MD_OPS 32
#define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS 512
/* Bit 32 of a blob id; bs_page_to_blobid() ORs it into the page index. */
#define SPDK_BLOB_BLOBID_HIGH_BIT (1ULL << 32)

/* A single extended attribute, linked into a struct spdk_xattr_tailq. */
struct spdk_xattr {
	uint32_t	index;
	/* NOTE(review): presumably the length of 'value' in bytes - confirm. */
	uint16_t	value_len;
	char		*name;
	void		*value;
	TAILQ_ENTRY(spdk_xattr)	link;
};

/* The mutable part of the blob data that is sync'd to
 * disk. The data in here is both mutable and persistent.
 */
struct spdk_blob_mut_data {
	/* Number of data clusters in the blob */
	uint64_t	num_clusters;

	/* Array of LBAs that are the beginning of a cluster, in
	 * the order they appear in the blob.
	 */
	uint64_t	*clusters;

	/* The size of the clusters array. This is greater than or
	 * equal to 'num_clusters'.
	 */
	size_t		cluster_array_size;

	/* Number of extent pages */
	uint64_t	num_extent_pages;

	/* Array of page offsets into the metadata region,
	 * containing extents. Can contain entries for not yet
	 * allocated pages. */
	uint32_t	*extent_pages;

	/* The size of the extent page array. This is greater than or
	 * equal to 'num_extent_pages'. */
	size_t		extent_pages_array_size;

	/* Number of metadata pages */
	uint32_t	num_pages;

	/* Array of page offsets into the metadata region, in
	 * the order of the metadata page sequence.
	 */
	uint32_t	*pages;
};

enum spdk_blob_state {
	/* The in-memory version of the blob does not match the
	 * on-disk version.
	 */
	SPDK_BLOB_STATE_DIRTY,

	/* The in-memory version of the blob matches the on-disk
	 * version.
	 */
	SPDK_BLOB_STATE_CLEAN,

	/* The in-memory state is being synchronized with the
	 * on-disk blob state. */
	SPDK_BLOB_STATE_LOADING,
};

TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr);

/* List node identifying a blob by id, together with a list of its clones
 * and the number of them. The blob store keeps a list of these nodes in
 * its 'snapshots' member.
 */
struct spdk_blob_list {
	spdk_blob_id id;
	size_t clone_count;
	TAILQ_HEAD(, spdk_blob_list) clones;
	TAILQ_ENTRY(spdk_blob_list) link;
};

/* In-memory representation of a blob. */
struct spdk_blob {
	/* Blob store this blob belongs to */
	struct spdk_blob_store *bs;

	uint32_t	open_ref;

	spdk_blob_id	id;
	spdk_blob_id	parent_id;

	enum spdk_blob_state		state;

	/* Two copies of the mutable data. One is a version
	 * that matches the last known data on disk (clean).
	 * The other (active) is the current data. Syncing
	 * a blob makes the clean match the active.
	 */
	struct spdk_blob_mut_data	clean;
	struct spdk_blob_mut_data	active;

	bool		invalid;
	bool		data_ro;
	bool		md_ro;

	/* In-memory copies of the flag words that are stored on disk in
	 * struct spdk_blob_md_descriptor_flags.
	 */
	uint64_t	invalid_flags;
	uint64_t	data_ro_flags;
	uint64_t	md_ro_flags;

	struct spdk_bs_dev *back_bs_dev;

	/* TODO: The xattrs are mutable, but we don't want to be
	 * copying them unnecessarily. Figure this out.
	 */
	struct spdk_xattr_tailq xattrs;
	struct spdk_xattr_tailq xattrs_internal;

	/* Linkage for the blob store's 'open_blobs' red-black tree */
	RB_ENTRY(spdk_blob) link;

	uint32_t frozen_refcnt;
	bool locked_operation_in_progress;
	enum blob_clear_method clear_method;
	bool extent_rle_found;
	bool extent_table_found;
	bool use_extent_table;

	/* Lists of metadata persist contexts (struct spdk_blob_persist_ctx) */
	TAILQ_HEAD(, spdk_blob_persist_ctx) pending_persists;
	TAILQ_HEAD(, spdk_blob_persist_ctx) persists_to_complete;

	/* Number of data clusters retrieved from extent table,
	 * that many have to be read from extent pages. */
	uint64_t	remaining_clusters_in_et;
};

/* In-memory representation of a blob store. */
struct spdk_blob_store {
	uint64_t			md_start; /* Offset from beginning of disk, in pages */
	uint32_t			md_len; /* Count, in pages */

	struct spdk_io_channel		*md_channel;
	uint32_t			max_channel_ops;

	struct spdk_thread		*md_thread;

	struct spdk_bs_dev		*dev;

	struct spdk_bit_array		*used_md_pages;
	struct spdk_bit_pool		*used_clusters;
	struct spdk_bit_array		*used_blobids;
	struct spdk_bit_array		*open_blobids;

	pthread_mutex_t			used_clusters_mutex;

	uint32_t			cluster_sz;
	uint64_t			total_clusters;
	uint64_t			total_data_clusters;
	uint64_t			num_free_clusters;
	uint64_t			pages_per_cluster;
	/* Shift used in place of pages_per_cluster by the conversion helpers
	 * in this header; when it is 0 they multiply/divide by
	 * pages_per_cluster instead.
	 */
	uint8_t				pages_per_cluster_shift;
	uint32_t			io_unit_size;

	spdk_blob_id			super_blob;
	struct spdk_bs_type		bstype;

	struct spdk_bs_cpl		unload_cpl;
	int				unload_err;

	/* Red-black tree of struct spdk_blob, linked through spdk_blob::link */
	RB_HEAD(spdk_blob_tree, spdk_blob) open_blobs;
	TAILQ_HEAD(, spdk_blob_list)	snapshots;

	bool				clean;
};

/* Per-channel blob store state. */
struct spdk_bs_channel {
	struct spdk_bs_request_set	*req_mem;
	TAILQ_HEAD(, spdk_bs_request_set) reqs;

	struct spdk_blob_store		*bs;

	struct spdk_bs_dev		*dev;
	struct spdk_io_channel		*dev_channel;

	/* This page is only used during insert of a new cluster. */
	struct spdk_blob_md_page	*new_cluster_page;

	TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc;
	TAILQ_HEAD(, spdk_bs_request_set) queued_io;
};

/** operation type */
enum spdk_blob_op_type {
	SPDK_BLOB_WRITE,
	SPDK_BLOB_READ,
	SPDK_BLOB_UNMAP,
	SPDK_BLOB_WRITE_ZEROES,
	SPDK_BLOB_WRITEV,
	SPDK_BLOB_READV,
};

/* back bs_dev */

/* NOTE(review): these look like names of internal xattrs used for snapshot
 * bookkeeping - confirm against their users. */
#define BLOB_SNAPSHOT "SNAP"
#define SNAPSHOT_IN_PROGRESS "SNAPTMP"
#define SNAPSHOT_PENDING_REMOVAL "SNAPRM"

/* A struct spdk_bs_dev paired with the blob it refers to. */
struct spdk_blob_bs_dev {
	struct spdk_bs_dev bs_dev;
	struct spdk_blob *blob;
};

/* On-Disk Data Structures
 *
 * The following data structures exist on disk.
 */
#define SPDK_BS_INITIAL_VERSION 1
#define SPDK_BS_VERSION 3 /* current version */

/* Everything up to the matching pack(pop) is laid out without padding. */
#pragma pack(push, 1)

#define SPDK_MD_MASK_TYPE_USED_PAGES 0
#define SPDK_MD_MASK_TYPE_USED_CLUSTERS 1
#define SPDK_MD_MASK_TYPE_USED_BLOBIDS 2

/* Header of an on-disk mask; 'type' is one of SPDK_MD_MASK_TYPE_*. */
struct spdk_bs_md_mask {
	uint8_t		type;
	uint32_t	length; /* In bits */
	uint8_t		mask[0];
};

#define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0
#define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2
#define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3
#define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4

/* Following descriptors define cluster layout in a blob.
 * EXTENT_RLE cannot be present in a blob's metadata
 * at the same time as EXTENT_TABLE and EXTENT_PAGE descriptors. */

/* EXTENT_RLE descriptor holds an array of LBAs that point to
 * beginning of allocated clusters. The array is run-length encoded,
 * with 0's being unallocated clusters. It is part of serialized
 * metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE 1
/* EXTENT_TABLE descriptor holds array of md page offsets that
 * point to pages with EXTENT_PAGE descriptor. The 0's in the array
 * are run-length encoded, non-zero values are unallocated pages.
 * NOTE(review): "non-zero values are unallocated pages" looks inverted
 * relative to the EXTENT_RLE/EXTENT_PAGE comments, where 0 marks the
 * unallocated entries - confirm against the serialization code.
* It is part of serialized metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE 5
/* EXTENT_PAGE descriptor holds an array of LBAs that point to
 * beginning of allocated clusters. The array is run-length encoded,
 * with 0's being unallocated clusters. It is NOT part of
 * serialized metadata chain for a blob. */
#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE 6

/* Every descriptor below starts with the same { type, length } header as
 * struct spdk_blob_md_descriptor. */

struct spdk_blob_md_descriptor_xattr {
	uint8_t		type;
	uint32_t	length;

	uint16_t	name_length;
	uint16_t	value_length;

	char		name[0];
	/* String name immediately followed by string value. */
};

struct spdk_blob_md_descriptor_extent_rle {
	uint8_t		type;
	uint32_t	length;

	struct {
		uint32_t	cluster_idx;
		uint32_t	length; /* In units of clusters */
	} extents[0];
};

struct spdk_blob_md_descriptor_extent_table {
	uint8_t		type;
	uint32_t	length;

	/* Number of data clusters in the blob */
	uint64_t	num_clusters;

	struct {
		uint32_t	page_idx;
		uint32_t	num_pages; /* In units of pages */
	} extent_page[0];
};

struct spdk_blob_md_descriptor_extent_page {
	uint8_t		type;
	uint32_t	length;

	/* First cluster index in this extent page */
	uint32_t	start_cluster_idx;

	uint32_t	cluster_idx[0];
};

/* Bits of spdk_blob_md_descriptor_flags::invalid_flags */
#define SPDK_BLOB_THIN_PROV (1ULL << 0)
#define SPDK_BLOB_INTERNAL_XATTR (1ULL << 1)
#define SPDK_BLOB_EXTENT_TABLE (1ULL << 2)
#define SPDK_BLOB_INVALID_FLAGS_MASK	(SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR | SPDK_BLOB_EXTENT_TABLE)

/* Bits of spdk_blob_md_descriptor_flags::data_ro_flags */
#define SPDK_BLOB_READ_ONLY (1ULL << 0)
#define SPDK_BLOB_DATA_RO_FLAGS_MASK	SPDK_BLOB_READ_ONLY

/* Bits of spdk_blob_md_descriptor_flags::md_ro_flags: a two-bit field
 * starting at SPDK_BLOB_CLEAR_METHOD_SHIFT. */
#define SPDK_BLOB_CLEAR_METHOD_SHIFT 0
#define SPDK_BLOB_CLEAR_METHOD (3ULL << SPDK_BLOB_CLEAR_METHOD_SHIFT)
#define SPDK_BLOB_MD_RO_FLAGS_MASK	SPDK_BLOB_CLEAR_METHOD

struct spdk_blob_md_descriptor_flags {
	uint8_t		type;
	uint32_t	length;

	/*
	 * If a flag in invalid_flags is set that the application is not aware of,
	 *  it will not allow the blob to be opened.
	 */
	uint64_t	invalid_flags;

	/*
	 * If a flag in data_ro_flags is set that the application is not aware of,
	 *  allow the blob to be opened in data_read_only and md_read_only mode.
	 */
	uint64_t	data_ro_flags;

	/*
	 * If a flag in md_ro_flags is set that the application is not aware of,
	 *  allow the blob to be opened in md_read_only mode.
	 */
	uint64_t	md_ro_flags;
};

/* Common header shared by all metadata descriptors. */
struct spdk_blob_md_descriptor {
	uint8_t		type;
	uint32_t	length;
};

#define SPDK_INVALID_MD_PAGE UINT32_MAX

/* One on-disk metadata page; the static assert below pins its size to
 * SPDK_BS_PAGE_SIZE. */
struct spdk_blob_md_page {
	spdk_blob_id	id;

	uint32_t	sequence_num;
	uint32_t	reserved0;

	/* Descriptors here */
	uint8_t		descriptors[4072];

	uint32_t	next;
	uint32_t	crc;
};
#define SPDK_BS_PAGE_SIZE 0x1000
SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size");

#define SPDK_BS_MAX_DESC_SIZE SPDK_SIZEOF_MEMBER(struct spdk_blob_md_page, descriptors)

/* Maximum number of extents a single Extent Page can fit.
 * For an SPDK_BS_PAGE_SIZE of 4K SPDK_EXTENTS_PER_EP would be 512:
 * (4072 - 9 byte packed descriptor header) / 4 = 1015 entries at most,
 * which SPDK_EXTENTS_PER_EP reduces to 512 (assuming spdk_align64pow2()
 * rounds up to a power of two). */
#define SPDK_EXTENTS_PER_EP_MAX ((SPDK_BS_MAX_DESC_SIZE - sizeof(struct spdk_blob_md_descriptor_extent_page)) / sizeof(uint32_t))
#define SPDK_EXTENTS_PER_EP (spdk_align64pow2(SPDK_EXTENTS_PER_EP_MAX + 1) >> 1u)

#define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB"

/* On-disk super block; the static assert below pins its size to 0x1000 bytes. */
struct spdk_bs_super_block {
	uint8_t		signature[8];
	uint32_t	version;
	uint32_t	length;
	uint32_t	clean; /* If there was a clean shutdown, this is 1. */
	spdk_blob_id	super_blob;

	uint32_t	cluster_size; /* In bytes */

	uint32_t	used_page_mask_start; /* Offset from beginning of disk, in pages */
	uint32_t	used_page_mask_len; /* Count, in pages */

	uint32_t	used_cluster_mask_start; /* Offset from beginning of disk, in pages */
	uint32_t	used_cluster_mask_len; /* Count, in pages */

	uint32_t	md_start; /* Offset from beginning of disk, in pages */
	uint32_t	md_len; /* Count, in pages */

	struct spdk_bs_type	bstype; /* blobstore type */

	uint32_t	used_blobid_mask_start; /* Offset from beginning of disk, in pages */
	uint32_t	used_blobid_mask_len; /* Count, in pages */

	uint64_t	size; /* size of blobstore in bytes */
	uint32_t	io_unit_size; /* Size of io unit in bytes */

	uint8_t		reserved[4000];
	uint32_t	crc;
};
SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size");

#pragma pack(pop)

struct spdk_bs_dev *bs_create_zeroes_dev(void);
struct spdk_bs_dev *bs_create_blob_bs_dev(struct spdk_blob *blob);

/* Unit Conversions
 *
 * The blobstore works with several different units:
 * - Byte: Self explanatory
 * - LBA: The logical blocks on the backing storage device.
 * - Page: The read/write units of blobs and metadata. This is
 *         an offset into a blob in units of 4KiB.
 * - Cluster Index: The disk is broken into a sequential list of
 *		    clusters. This is the offset from the beginning.
 *
 * NOTE: These conversions all act on simple magnitudes, not with any sort
 *        of knowledge about the blobs themselves. For instance, converting
 *        a page to an lba with the conversion function below simply converts
 *        a number of pages to an equivalent number of lbas, but that
 *        lba certainly isn't the right lba that corresponds to a page offset
 *        for a particular blob.
433 */ 434 static inline uint64_t 435 bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length) 436 { 437 assert(length % bs->dev->blocklen == 0); 438 439 return length / bs->dev->blocklen; 440 } 441 442 static inline uint64_t 443 bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length) 444 { 445 assert(length % bs_dev->blocklen == 0); 446 447 return length / bs_dev->blocklen; 448 } 449 450 static inline uint64_t 451 bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page) 452 { 453 return page * SPDK_BS_PAGE_SIZE / bs->dev->blocklen; 454 } 455 456 static inline uint64_t 457 bs_md_page_to_lba(struct spdk_blob_store *bs, uint32_t page) 458 { 459 assert(page < bs->md_len); 460 return bs_page_to_lba(bs, page + bs->md_start); 461 } 462 463 static inline uint64_t 464 bs_dev_page_to_lba(struct spdk_bs_dev *bs_dev, uint64_t page) 465 { 466 return page * SPDK_BS_PAGE_SIZE / bs_dev->blocklen; 467 } 468 469 static inline uint64_t 470 bs_io_unit_per_page(struct spdk_blob_store *bs) 471 { 472 return SPDK_BS_PAGE_SIZE / bs->io_unit_size; 473 } 474 475 static inline uint64_t 476 bs_io_unit_to_page(struct spdk_blob_store *bs, uint64_t io_unit) 477 { 478 return io_unit / bs_io_unit_per_page(bs); 479 } 480 481 static inline uint64_t 482 bs_cluster_to_page(struct spdk_blob_store *bs, uint32_t cluster) 483 { 484 return (uint64_t)cluster * bs->pages_per_cluster; 485 } 486 487 static inline uint32_t 488 bs_page_to_cluster(struct spdk_blob_store *bs, uint64_t page) 489 { 490 assert(page % bs->pages_per_cluster == 0); 491 492 return page / bs->pages_per_cluster; 493 } 494 495 static inline uint64_t 496 bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster) 497 { 498 assert(bs->cluster_sz / bs->dev->blocklen > 0); 499 500 return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen); 501 } 502 503 static inline uint32_t 504 bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba) 505 { 506 assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0); 507 508 return lba 
/ (bs->cluster_sz / bs->dev->blocklen); 509 } 510 511 static inline uint64_t 512 bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit) 513 { 514 return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen); 515 } 516 517 static inline uint64_t 518 bs_cluster_to_extent_table_id(uint64_t cluster_num) 519 { 520 return cluster_num / SPDK_EXTENTS_PER_EP; 521 } 522 523 static inline uint32_t * 524 bs_cluster_to_extent_page(struct spdk_blob *blob, uint64_t cluster_num) 525 { 526 uint64_t extent_table_id = bs_cluster_to_extent_table_id(cluster_num); 527 528 assert(blob->use_extent_table); 529 assert(extent_table_id < blob->active.extent_pages_array_size); 530 531 return &blob->active.extent_pages[extent_table_id]; 532 } 533 534 static inline uint64_t 535 bs_io_units_per_cluster(struct spdk_blob *blob) 536 { 537 uint64_t io_units_per_cluster; 538 uint8_t shift = blob->bs->pages_per_cluster_shift; 539 540 if (shift != 0) { 541 io_units_per_cluster = bs_io_unit_per_page(blob->bs) << shift; 542 } else { 543 io_units_per_cluster = bs_io_unit_per_page(blob->bs) * blob->bs->pages_per_cluster; 544 } 545 546 return io_units_per_cluster; 547 } 548 549 /* End basic conversions */ 550 551 static inline uint64_t 552 bs_blobid_to_page(spdk_blob_id id) 553 { 554 return id & 0xFFFFFFFF; 555 } 556 557 /* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper 558 * 32 bits are not currently used. Stick a 1 there just to catch bugs where the 559 * code assumes blob id == page_idx. 560 */ 561 static inline spdk_blob_id 562 bs_page_to_blobid(uint64_t page_idx) 563 { 564 if (page_idx > UINT32_MAX) { 565 return SPDK_BLOBID_INVALID; 566 } 567 return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx; 568 } 569 570 /* Given an io unit offset into a blob, look up the LBA for the 571 * start of that io unit. 
572 */ 573 static inline uint64_t 574 bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit) 575 { 576 uint64_t lba; 577 uint64_t pages_per_cluster; 578 uint8_t shift; 579 uint64_t io_units_per_cluster; 580 uint64_t io_units_per_page; 581 uint64_t page; 582 583 page = bs_io_unit_to_page(blob->bs, io_unit); 584 585 pages_per_cluster = blob->bs->pages_per_cluster; 586 shift = blob->bs->pages_per_cluster_shift; 587 io_units_per_page = bs_io_unit_per_page(blob->bs); 588 589 assert(page < blob->active.num_clusters * pages_per_cluster); 590 591 if (shift != 0) { 592 io_units_per_cluster = io_units_per_page << shift; 593 lba = blob->active.clusters[page >> shift]; 594 } else { 595 io_units_per_cluster = io_units_per_page * pages_per_cluster; 596 lba = blob->active.clusters[page / pages_per_cluster]; 597 } 598 lba += io_unit % io_units_per_cluster; 599 return lba; 600 } 601 602 /* Given an io_unit offset into a blob, look up the number of io_units until the 603 * next cluster boundary. 
604 */ 605 static inline uint32_t 606 bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit) 607 { 608 uint64_t io_units_per_cluster; 609 610 io_units_per_cluster = bs_io_units_per_cluster(blob); 611 612 return io_units_per_cluster - (io_unit % io_units_per_cluster); 613 } 614 615 /* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ 616 static inline uint32_t 617 bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit) 618 { 619 uint64_t pages_per_cluster; 620 uint64_t page; 621 622 pages_per_cluster = blob->bs->pages_per_cluster; 623 page = bs_io_unit_to_page(blob->bs, io_unit); 624 625 return page - (page % pages_per_cluster); 626 } 627 628 /* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ 629 static inline uint32_t 630 bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit) 631 { 632 uint64_t pages_per_cluster = blob->bs->pages_per_cluster; 633 uint8_t shift = blob->bs->pages_per_cluster_shift; 634 uint32_t page_offset; 635 636 page_offset = io_unit / bs_io_unit_per_page(blob->bs); 637 if (shift != 0) { 638 return page_offset >> shift; 639 } else { 640 return page_offset / pages_per_cluster; 641 } 642 } 643 644 /* Given an io unit offset into a blob, look up if it is from allocated cluster. 
*/ 645 static inline bool 646 bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit) 647 { 648 uint64_t lba; 649 uint64_t page; 650 uint64_t pages_per_cluster; 651 uint8_t shift; 652 653 shift = blob->bs->pages_per_cluster_shift; 654 pages_per_cluster = blob->bs->pages_per_cluster; 655 page = bs_io_unit_to_page(blob->bs, io_unit); 656 657 assert(page < blob->active.num_clusters * pages_per_cluster); 658 659 if (shift != 0) { 660 lba = blob->active.clusters[page >> shift]; 661 } else { 662 lba = blob->active.clusters[page / pages_per_cluster]; 663 } 664 665 if (lba == 0) { 666 assert(spdk_blob_is_thin_provisioned(blob)); 667 return false; 668 } else { 669 return true; 670 } 671 } 672 673 #endif 674