1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "spdk/blob.h" 37 #include "spdk/crc32.h" 38 #include "spdk/env.h" 39 #include "spdk/queue.h" 40 #include "spdk/thread.h" 41 #include "spdk/bit_array.h" 42 #include "spdk/bit_pool.h" 43 #include "spdk/likely.h" 44 #include "spdk/util.h" 45 #include "spdk/string.h" 46 47 #include "spdk_internal/assert.h" 48 #include "spdk/log.h" 49 50 #include "blobstore.h" 51 52 #define BLOB_CRC32C_INITIAL 0xffffffffUL 53 54 static int bs_register_md_thread(struct spdk_blob_store *bs); 55 static int bs_unregister_md_thread(struct spdk_blob_store *bs); 56 static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno); 57 static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 58 uint64_t cluster, uint32_t extent, spdk_blob_op_complete cb_fn, void *cb_arg); 59 60 static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 61 uint16_t value_len, bool internal); 62 static int blob_get_xattr_value(struct spdk_blob *blob, const char *name, 63 const void **value, size_t *value_len, bool internal); 64 static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal); 65 66 static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 67 spdk_blob_op_complete cb_fn, void *cb_arg); 68 69 static void 70 blob_verify_md_op(struct spdk_blob *blob) 71 { 72 assert(blob != NULL); 73 assert(spdk_get_thread() == blob->bs->md_thread); 74 assert(blob->state != SPDK_BLOB_STATE_LOADING); 75 } 76 77 static struct spdk_blob_list * 78 bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid) 79 { 80 struct spdk_blob_list *snapshot_entry = NULL; 81 82 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 83 if (snapshot_entry->id == blobid) { 84 break; 85 } 86 } 87 88 return snapshot_entry; 89 } 90 91 static void 92 bs_claim_md_page(struct 
spdk_blob_store *bs, uint32_t page) 93 { 94 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 95 assert(spdk_bit_array_get(bs->used_md_pages, page) == false); 96 97 spdk_bit_array_set(bs->used_md_pages, page); 98 } 99 100 static void 101 bs_release_md_page(struct spdk_blob_store *bs, uint32_t page) 102 { 103 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 104 assert(spdk_bit_array_get(bs->used_md_pages, page) == true); 105 106 spdk_bit_array_clear(bs->used_md_pages, page); 107 } 108 109 static uint32_t 110 bs_claim_cluster(struct spdk_blob_store *bs) 111 { 112 uint32_t cluster_num; 113 114 cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters); 115 if (cluster_num == UINT32_MAX) { 116 return UINT32_MAX; 117 } 118 119 SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num); 120 bs->num_free_clusters--; 121 122 return cluster_num; 123 } 124 125 static void 126 bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) 127 { 128 assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters)); 129 assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true); 130 assert(bs->num_free_clusters < bs->total_clusters); 131 132 SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num); 133 134 spdk_bit_pool_free_bit(bs->used_clusters, cluster_num); 135 bs->num_free_clusters++; 136 } 137 138 static int 139 blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster) 140 { 141 uint64_t *cluster_lba = &blob->active.clusters[cluster_num]; 142 143 blob_verify_md_op(blob); 144 145 if (*cluster_lba != 0) { 146 return -EEXIST; 147 } 148 149 *cluster_lba = bs_cluster_to_lba(blob->bs, cluster); 150 return 0; 151 } 152 153 static int 154 bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, 155 uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map) 156 { 157 uint32_t *extent_page = 0; 158 159 *cluster = bs_claim_cluster(blob->bs); 160 if (*cluster == UINT32_MAX) { 161 /* No more free clusters. Cannot satisfy the request */ 162 return -ENOSPC; 163 } 164 165 if (blob->use_extent_table) { 166 extent_page = bs_cluster_to_extent_page(blob, cluster_num); 167 if (*extent_page == 0) { 168 /* Extent page shall never occupy md_page so start the search from 1 */ 169 if (*lowest_free_md_page == 0) { 170 *lowest_free_md_page = 1; 171 } 172 /* No extent_page is allocated for the cluster */ 173 *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, 174 *lowest_free_md_page); 175 if (*lowest_free_md_page == UINT32_MAX) { 176 /* No more free md pages. 
Cannot satisfy the request */ 177 bs_release_cluster(blob->bs, *cluster); 178 return -ENOSPC; 179 } 180 bs_claim_md_page(blob->bs, *lowest_free_md_page); 181 } 182 } 183 184 SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob %" PRIu64 "\n", *cluster, blob->id); 185 186 if (update_map) { 187 blob_insert_cluster(blob, cluster_num, *cluster); 188 if (blob->use_extent_table && *extent_page == 0) { 189 *extent_page = *lowest_free_md_page; 190 } 191 } 192 193 return 0; 194 } 195 196 static void 197 blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) 198 { 199 xattrs->count = 0; 200 xattrs->names = NULL; 201 xattrs->ctx = NULL; 202 xattrs->get_value = NULL; 203 } 204 205 void 206 spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size) 207 { 208 if (!opts) { 209 SPDK_ERRLOG("opts should not be NULL\n"); 210 return; 211 } 212 213 if (!opts_size) { 214 SPDK_ERRLOG("opts_size should not be zero value\n"); 215 return; 216 } 217 218 memset(opts, 0, opts_size); 219 opts->opts_size = opts_size; 220 221 #define FIELD_OK(field) \ 222 offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size 223 224 #define SET_FIELD(field, value) \ 225 if (FIELD_OK(field)) { \ 226 opts->field = value; \ 227 } \ 228 229 SET_FIELD(num_clusters, 0); 230 SET_FIELD(thin_provision, false); 231 SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT); 232 233 if (FIELD_OK(xattrs)) { 234 blob_xattrs_init(&opts->xattrs); 235 } 236 237 SET_FIELD(use_extent_table, true); 238 239 #undef FIELD_OK 240 #undef SET_FIELD 241 } 242 243 void 244 spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size) 245 { 246 if (!opts) { 247 SPDK_ERRLOG("opts should not be NULL\n"); 248 return; 249 } 250 251 if (!opts_size) { 252 SPDK_ERRLOG("opts_size should not be zero value\n"); 253 return; 254 } 255 256 memset(opts, 0, opts_size); 257 opts->opts_size = opts_size; 258 259 #define FIELD_OK(field) \ 260 offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size 261 262 #define SET_FIELD(field, value) \ 263 if (FIELD_OK(field)) { \ 264 opts->field = value; \ 265 } \ 266 267 SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT); 268 269 #undef FIELD_OK 270 #undef SET_FILED 271 } 272 273 static struct spdk_blob * 274 blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id) 275 { 276 struct spdk_blob *blob; 277 278 blob = calloc(1, sizeof(*blob)); 279 if (!blob) { 280 return NULL; 281 } 282 283 blob->id = id; 284 blob->bs = bs; 285 286 blob->parent_id = SPDK_BLOBID_INVALID; 287 288 blob->state = SPDK_BLOB_STATE_DIRTY; 289 blob->extent_rle_found = false; 290 blob->extent_table_found = false; 291 blob->active.num_pages = 1; 292 blob->active.pages = calloc(1, sizeof(*blob->active.pages)); 293 if (!blob->active.pages) { 294 free(blob); 295 return NULL; 296 } 297 298 blob->active.pages[0] = bs_blobid_to_page(id); 299 300 TAILQ_INIT(&blob->xattrs); 301 TAILQ_INIT(&blob->xattrs_internal); 302 TAILQ_INIT(&blob->pending_persists); 303 304 return blob; 305 } 306 307 static void 308 xattrs_free(struct spdk_xattr_tailq *xattrs) 309 { 310 struct spdk_xattr *xattr, *xattr_tmp; 311 312 TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) { 313 TAILQ_REMOVE(xattrs, xattr, link); 314 free(xattr->name); 315 free(xattr->value); 316 free(xattr); 317 } 318 } 319 320 static void 321 blob_free(struct spdk_blob *blob) 322 { 323 assert(blob != NULL); 324 assert(TAILQ_EMPTY(&blob->pending_persists)); 325 326 free(blob->active.extent_pages); 327 free(blob->clean.extent_pages); 328 free(blob->active.clusters); 329 
free(blob->clean.clusters); 330 free(blob->active.pages); 331 free(blob->clean.pages); 332 333 xattrs_free(&blob->xattrs); 334 xattrs_free(&blob->xattrs_internal); 335 336 if (blob->back_bs_dev) { 337 blob->back_bs_dev->destroy(blob->back_bs_dev); 338 } 339 340 free(blob); 341 } 342 343 struct freeze_io_ctx { 344 struct spdk_bs_cpl cpl; 345 struct spdk_blob *blob; 346 }; 347 348 static void 349 blob_io_sync(struct spdk_io_channel_iter *i) 350 { 351 spdk_for_each_channel_continue(i, 0); 352 } 353 354 static void 355 blob_execute_queued_io(struct spdk_io_channel_iter *i) 356 { 357 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 358 struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch); 359 struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 360 struct spdk_bs_request_set *set; 361 struct spdk_bs_user_op_args *args; 362 spdk_bs_user_op_t *op, *tmp; 363 364 TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) { 365 set = (struct spdk_bs_request_set *)op; 366 args = &set->u.user_op; 367 368 if (args->blob == ctx->blob) { 369 TAILQ_REMOVE(&ch->queued_io, op, link); 370 bs_user_op_execute(op); 371 } 372 } 373 374 spdk_for_each_channel_continue(i, 0); 375 } 376 377 static void 378 blob_io_cpl(struct spdk_io_channel_iter *i, int status) 379 { 380 struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 381 382 ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0); 383 384 free(ctx); 385 } 386 387 static void 388 blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 389 { 390 struct freeze_io_ctx *ctx; 391 392 ctx = calloc(1, sizeof(*ctx)); 393 if (!ctx) { 394 cb_fn(cb_arg, -ENOMEM); 395 return; 396 } 397 398 ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 399 ctx->cpl.u.blob_basic.cb_fn = cb_fn; 400 ctx->cpl.u.blob_basic.cb_arg = cb_arg; 401 ctx->blob = blob; 402 403 /* Freeze I/O on blob */ 404 blob->frozen_refcnt++; 405 406 if (blob->frozen_refcnt == 1) { 407 spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl); 408 } else { 409 cb_fn(cb_arg, 0); 410 free(ctx); 411 } 412 } 413 414 static void 415 blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 416 { 417 struct freeze_io_ctx *ctx; 418 419 ctx = calloc(1, sizeof(*ctx)); 420 if (!ctx) { 421 cb_fn(cb_arg, -ENOMEM); 422 return; 423 } 424 425 ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 426 ctx->cpl.u.blob_basic.cb_fn = cb_fn; 427 ctx->cpl.u.blob_basic.cb_arg = cb_arg; 428 ctx->blob = blob; 429 430 assert(blob->frozen_refcnt > 0); 431 432 blob->frozen_refcnt--; 433 434 if (blob->frozen_refcnt == 0) { 435 spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl); 436 } else { 437 cb_fn(cb_arg, 0); 438 free(ctx); 439 } 440 } 441 442 static int 443 blob_mark_clean(struct spdk_blob *blob) 444 { 445 uint32_t *extent_pages = NULL; 446 uint64_t *clusters = NULL; 447 uint32_t *pages = NULL; 448 449 assert(blob != NULL); 450 451 if (blob->active.num_extent_pages) { 452 assert(blob->active.extent_pages); 453 extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages)); 454 if (!extent_pages) { 455 return -ENOMEM; 456 } 457 memcpy(extent_pages, blob->active.extent_pages, 458 blob->active.num_extent_pages * sizeof(*extent_pages)); 459 } 460 461 if (blob->active.num_clusters) { 462 assert(blob->active.clusters); 463 clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters)); 464 if (!clusters) { 465 free(extent_pages); 466 return -ENOMEM; 467 } 468 memcpy(clusters, blob->active.clusters, 
blob->active.num_clusters * sizeof(*blob->active.clusters)); 469 } 470 471 if (blob->active.num_pages) { 472 assert(blob->active.pages); 473 pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages)); 474 if (!pages) { 475 free(extent_pages); 476 free(clusters); 477 return -ENOMEM; 478 } 479 memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 480 } 481 482 free(blob->clean.extent_pages); 483 free(blob->clean.clusters); 484 free(blob->clean.pages); 485 486 blob->clean.num_extent_pages = blob->active.num_extent_pages; 487 blob->clean.extent_pages = blob->active.extent_pages; 488 blob->clean.num_clusters = blob->active.num_clusters; 489 blob->clean.clusters = blob->active.clusters; 490 blob->clean.num_pages = blob->active.num_pages; 491 blob->clean.pages = blob->active.pages; 492 493 blob->active.extent_pages = extent_pages; 494 blob->active.clusters = clusters; 495 blob->active.pages = pages; 496 497 /* If the metadata was dirtied again while the metadata was being written to disk, 498 * we do not want to revert the DIRTY state back to CLEAN here. 499 */ 500 if (blob->state == SPDK_BLOB_STATE_LOADING) { 501 blob->state = SPDK_BLOB_STATE_CLEAN; 502 } 503 504 return 0; 505 } 506 507 static int 508 blob_deserialize_xattr(struct spdk_blob *blob, 509 struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal) 510 { 511 struct spdk_xattr *xattr; 512 513 if (desc_xattr->length != sizeof(desc_xattr->name_length) + 514 sizeof(desc_xattr->value_length) + 515 desc_xattr->name_length + desc_xattr->value_length) { 516 return -EINVAL; 517 } 518 519 xattr = calloc(1, sizeof(*xattr)); 520 if (xattr == NULL) { 521 return -ENOMEM; 522 } 523 524 xattr->name = malloc(desc_xattr->name_length + 1); 525 if (xattr->name == NULL) { 526 free(xattr); 527 return -ENOMEM; 528 } 529 memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length); 530 xattr->name[desc_xattr->name_length] = '\0'; 531 532 xattr->value = malloc(desc_xattr->value_length); 533 if (xattr->value == NULL) { 534 free(xattr->name); 535 free(xattr); 536 return -ENOMEM; 537 } 538 xattr->value_len = desc_xattr->value_length; 539 memcpy(xattr->value, 540 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 541 desc_xattr->value_length); 542 543 TAILQ_INSERT_TAIL(internal ? 
&blob->xattrs_internal : &blob->xattrs, xattr, link); 544 545 return 0; 546 } 547 548 549 static int 550 blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) 551 { 552 struct spdk_blob_md_descriptor *desc; 553 size_t cur_desc = 0; 554 void *tmp; 555 556 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 557 while (cur_desc < sizeof(page->descriptors)) { 558 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 559 if (desc->length == 0) { 560 /* If padding and length are 0, this terminates the page */ 561 break; 562 } 563 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 564 struct spdk_blob_md_descriptor_flags *desc_flags; 565 566 desc_flags = (struct spdk_blob_md_descriptor_flags *)desc; 567 568 if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) { 569 return -EINVAL; 570 } 571 572 if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) != 573 SPDK_BLOB_INVALID_FLAGS_MASK) { 574 return -EINVAL; 575 } 576 577 if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) != 578 SPDK_BLOB_DATA_RO_FLAGS_MASK) { 579 blob->data_ro = true; 580 blob->md_ro = true; 581 } 582 583 if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) != 584 SPDK_BLOB_MD_RO_FLAGS_MASK) { 585 blob->md_ro = true; 586 } 587 588 if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 589 blob->data_ro = true; 590 blob->md_ro = true; 591 } 592 593 blob->invalid_flags = desc_flags->invalid_flags; 594 blob->data_ro_flags = desc_flags->data_ro_flags; 595 blob->md_ro_flags = desc_flags->md_ro_flags; 596 597 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 598 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 599 unsigned int i, j; 600 unsigned int cluster_count = blob->active.num_clusters; 601 602 if (blob->extent_table_found) { 603 /* Extent Table already present in the md, 604 * both descriptors should never be at the same time. 
*/ 605 return -EINVAL; 606 } 607 blob->extent_rle_found = true; 608 609 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 610 611 if (desc_extent_rle->length == 0 || 612 (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) { 613 return -EINVAL; 614 } 615 616 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 617 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 618 if (desc_extent_rle->extents[i].cluster_idx != 0) { 619 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, 620 desc_extent_rle->extents[i].cluster_idx + j)) { 621 return -EINVAL; 622 } 623 } 624 cluster_count++; 625 } 626 } 627 628 if (cluster_count == 0) { 629 return -EINVAL; 630 } 631 tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters)); 632 if (tmp == NULL) { 633 return -ENOMEM; 634 } 635 blob->active.clusters = tmp; 636 blob->active.cluster_array_size = cluster_count; 637 638 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 639 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 640 if (desc_extent_rle->extents[i].cluster_idx != 0) { 641 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 642 desc_extent_rle->extents[i].cluster_idx + j); 643 } else if (spdk_blob_is_thin_provisioned(blob)) { 644 blob->active.clusters[blob->active.num_clusters++] = 0; 645 } else { 646 return -EINVAL; 647 } 648 } 649 } 650 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 651 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 652 uint32_t num_extent_pages = blob->active.num_extent_pages; 653 uint32_t i, j; 654 size_t extent_pages_length; 655 656 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 657 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 658 659 if (blob->extent_rle_found) { 660 /* This means that Extent RLE is present in MD, 661 * both should never be at the same time. */ 662 return -EINVAL; 663 } else if (blob->extent_table_found && 664 desc_extent_table->num_clusters != blob->remaining_clusters_in_et) { 665 /* Number of clusters in this ET does not match number 666 * from previously read EXTENT_TABLE. */ 667 return -EINVAL; 668 } 669 670 blob->extent_table_found = true; 671 672 if (desc_extent_table->length == 0 || 673 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 674 return -EINVAL; 675 } 676 677 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 678 num_extent_pages += desc_extent_table->extent_page[i].num_pages; 679 } 680 681 tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t)); 682 if (tmp == NULL) { 683 return -ENOMEM; 684 } 685 blob->active.extent_pages = tmp; 686 blob->active.extent_pages_array_size = num_extent_pages; 687 688 blob->remaining_clusters_in_et = desc_extent_table->num_clusters; 689 690 /* Extent table entries contain md page numbers for extent pages. 691 * Zeroes represent unallocated extent pages, those are run-length-encoded. 
692 */ 693 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 694 if (desc_extent_table->extent_page[i].page_idx != 0) { 695 assert(desc_extent_table->extent_page[i].num_pages == 1); 696 blob->active.extent_pages[blob->active.num_extent_pages++] = 697 desc_extent_table->extent_page[i].page_idx; 698 } else if (spdk_blob_is_thin_provisioned(blob)) { 699 for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) { 700 blob->active.extent_pages[blob->active.num_extent_pages++] = 0; 701 } 702 } else { 703 return -EINVAL; 704 } 705 } 706 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 707 struct spdk_blob_md_descriptor_extent_page *desc_extent; 708 unsigned int i; 709 unsigned int cluster_count = 0; 710 size_t cluster_idx_length; 711 712 if (blob->extent_rle_found) { 713 /* This means that Extent RLE is present in MD, 714 * both should never be at the same time. */ 715 return -EINVAL; 716 } 717 718 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 719 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 720 721 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 722 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 723 return -EINVAL; 724 } 725 726 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 727 if (desc_extent->cluster_idx[i] != 0) { 728 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { 729 return -EINVAL; 730 } 731 } 732 cluster_count++; 733 } 734 735 if (cluster_count == 0) { 736 return -EINVAL; 737 } 738 739 /* When reading extent pages sequentially starting cluster idx should match 740 * current size of a blob. 741 * If changed to batch reading, this check shall be removed. */ 742 if (desc_extent->start_cluster_idx != blob->active.num_clusters) { 743 return -EINVAL; 744 } 745 746 tmp = realloc(blob->active.clusters, 747 (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters)); 748 if (tmp == NULL) { 749 return -ENOMEM; 750 } 751 blob->active.clusters = tmp; 752 blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters); 753 754 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 755 if (desc_extent->cluster_idx[i] != 0) { 756 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 757 desc_extent->cluster_idx[i]); 758 } else if (spdk_blob_is_thin_provisioned(blob)) { 759 blob->active.clusters[blob->active.num_clusters++] = 0; 760 } else { 761 return -EINVAL; 762 } 763 } 764 assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters); 765 assert(blob->remaining_clusters_in_et >= cluster_count); 766 blob->remaining_clusters_in_et -= cluster_count; 767 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 768 int rc; 769 770 rc = blob_deserialize_xattr(blob, 771 (struct spdk_blob_md_descriptor_xattr *) desc, false); 772 if (rc != 0) { 773 return rc; 774 } 775 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 776 int rc; 777 778 rc = blob_deserialize_xattr(blob, 779 (struct spdk_blob_md_descriptor_xattr *) desc, true); 780 if (rc != 0) { 781 return rc; 782 } 783 } else { 784 /* Unrecognized descriptor type. Do not fail - just continue to the 785 * next descriptor. 
If this descriptor is associated with some feature 786 * defined in a newer version of blobstore, that version of blobstore 787 * should create and set an associated feature flag to specify if this 788 * blob can be loaded or not. 789 */ 790 } 791 792 /* Advance to the next descriptor */ 793 cur_desc += sizeof(*desc) + desc->length; 794 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 795 break; 796 } 797 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 798 } 799 800 return 0; 801 } 802 803 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page); 804 805 static int 806 blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob) 807 { 808 assert(blob != NULL); 809 assert(blob->state == SPDK_BLOB_STATE_LOADING); 810 811 if (bs_load_cur_extent_page_valid(extent_page) == false) { 812 return -ENOENT; 813 } 814 815 return blob_parse_page(extent_page, blob); 816 } 817 818 static int 819 blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count, 820 struct spdk_blob *blob) 821 { 822 const struct spdk_blob_md_page *page; 823 uint32_t i; 824 int rc; 825 void *tmp; 826 827 assert(page_count > 0); 828 assert(pages[0].sequence_num == 0); 829 assert(blob != NULL); 830 assert(blob->state == SPDK_BLOB_STATE_LOADING); 831 assert(blob->active.clusters == NULL); 832 833 /* The blobid provided doesn't match what's in the MD, this can 834 * happen for example if a bogus blobid is passed in through open. 835 */ 836 if (blob->id != pages[0].id) { 837 SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n", 838 blob->id, pages[0].id); 839 return -ENOENT; 840 } 841 842 tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages)); 843 if (!tmp) { 844 return -ENOMEM; 845 } 846 blob->active.pages = tmp; 847 848 blob->active.pages[0] = pages[0].id; 849 850 for (i = 1; i < page_count; i++) { 851 assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next)); 852 blob->active.pages[i] = pages[i - 1].next; 853 } 854 blob->active.num_pages = page_count; 855 856 for (i = 0; i < page_count; i++) { 857 page = &pages[i]; 858 859 assert(page->id == blob->id); 860 assert(page->sequence_num == i); 861 862 rc = blob_parse_page(page, blob); 863 if (rc != 0) { 864 return rc; 865 } 866 } 867 868 return 0; 869 } 870 871 static int 872 blob_serialize_add_page(const struct spdk_blob *blob, 873 struct spdk_blob_md_page **pages, 874 uint32_t *page_count, 875 struct spdk_blob_md_page **last_page) 876 { 877 struct spdk_blob_md_page *page; 878 879 assert(pages != NULL); 880 assert(page_count != NULL); 881 882 if (*page_count == 0) { 883 assert(*pages == NULL); 884 *page_count = 1; 885 *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0, 886 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 887 } else { 888 assert(*pages != NULL); 889 (*page_count)++; 890 *pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count), 0); 891 } 892 893 if (*pages == NULL) { 894 *page_count = 0; 895 *last_page = NULL; 896 return -ENOMEM; 897 } 898 899 page = &(*pages)[*page_count - 1]; 900 memset(page, 0, sizeof(*page)); 901 page->id = blob->id; 902 page->sequence_num = *page_count - 1; 903 page->next = SPDK_INVALID_MD_PAGE; 904 *last_page = page; 905 906 return 0; 907 } 908 909 /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor. 910 * Update required_sz on both success and failure. 
911 * 912 */ 913 static int 914 blob_serialize_xattr(const struct spdk_xattr *xattr, 915 uint8_t *buf, size_t buf_sz, 916 size_t *required_sz, bool internal) 917 { 918 struct spdk_blob_md_descriptor_xattr *desc; 919 920 *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) + 921 strlen(xattr->name) + 922 xattr->value_len; 923 924 if (buf_sz < *required_sz) { 925 return -1; 926 } 927 928 desc = (struct spdk_blob_md_descriptor_xattr *)buf; 929 930 desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR; 931 desc->length = sizeof(desc->name_length) + 932 sizeof(desc->value_length) + 933 strlen(xattr->name) + 934 xattr->value_len; 935 desc->name_length = strlen(xattr->name); 936 desc->value_length = xattr->value_len; 937 938 memcpy(desc->name, xattr->name, desc->name_length); 939 memcpy((void *)((uintptr_t)desc->name + desc->name_length), 940 xattr->value, 941 desc->value_length); 942 943 return 0; 944 } 945 946 static void 947 blob_serialize_extent_table_entry(const struct spdk_blob *blob, 948 uint64_t start_ep, uint64_t *next_ep, 949 uint8_t **buf, size_t *remaining_sz) 950 { 951 struct spdk_blob_md_descriptor_extent_table *desc; 952 size_t cur_sz; 953 uint64_t i, et_idx; 954 uint32_t extent_page, ep_len; 955 956 /* The buffer must have room for at least num_clusters entry */ 957 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters); 958 if (*remaining_sz < cur_sz) { 959 *next_ep = start_ep; 960 return; 961 } 962 963 desc = (struct spdk_blob_md_descriptor_extent_table *)*buf; 964 desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE; 965 966 desc->num_clusters = blob->active.num_clusters; 967 968 ep_len = 1; 969 et_idx = 0; 970 for (i = start_ep; i < blob->active.num_extent_pages; i++) { 971 if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) { 972 /* If we ran out of buffer space, return */ 973 break; 974 } 975 976 extent_page = blob->active.extent_pages[i]; 977 /* Verify that next extent_page is unallocated */ 978 if (extent_page == 0 && 979 (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) { 980 ep_len++; 981 continue; 982 } 983 desc->extent_page[et_idx].page_idx = extent_page; 984 desc->extent_page[et_idx].num_pages = ep_len; 985 et_idx++; 986 987 ep_len = 1; 988 cur_sz += sizeof(desc->extent_page[et_idx]); 989 } 990 *next_ep = i; 991 992 desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx; 993 *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length; 994 *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length; 995 } 996 997 static int 998 blob_serialize_extent_table(const struct spdk_blob *blob, 999 struct spdk_blob_md_page **pages, 1000 struct spdk_blob_md_page *cur_page, 1001 uint32_t *page_count, uint8_t **buf, 1002 size_t *remaining_sz) 1003 { 1004 uint64_t last_extent_page; 1005 int rc; 1006 1007 last_extent_page = 0; 1008 /* At least single extent table entry has to be always persisted. 1009 * Such case occurs with num_extent_pages == 0. 
*/ 1010 while (last_extent_page <= blob->active.num_extent_pages) { 1011 blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf, 1012 remaining_sz); 1013 1014 if (last_extent_page == blob->active.num_extent_pages) { 1015 break; 1016 } 1017 1018 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1019 if (rc < 0) { 1020 return rc; 1021 } 1022 1023 *buf = (uint8_t *)cur_page->descriptors; 1024 *remaining_sz = sizeof(cur_page->descriptors); 1025 } 1026 1027 return 0; 1028 } 1029 1030 static void 1031 blob_serialize_extent_rle(const struct spdk_blob *blob, 1032 uint64_t start_cluster, uint64_t *next_cluster, 1033 uint8_t **buf, size_t *buf_sz) 1034 { 1035 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 1036 size_t cur_sz; 1037 uint64_t i, extent_idx; 1038 uint64_t lba, lba_per_cluster, lba_count; 1039 1040 /* The buffer must have room for at least one extent */ 1041 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]); 1042 if (*buf_sz < cur_sz) { 1043 *next_cluster = start_cluster; 1044 return; 1045 } 1046 1047 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf; 1048 desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE; 1049 1050 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1051 1052 lba = blob->active.clusters[start_cluster]; 1053 lba_count = lba_per_cluster; 1054 extent_idx = 0; 1055 for (i = start_cluster + 1; i < blob->active.num_clusters; i++) { 1056 if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) { 1057 /* Run-length encode sequential non-zero LBA */ 1058 lba_count += lba_per_cluster; 1059 continue; 1060 } else if (lba == 0 && blob->active.clusters[i] == 0) { 1061 /* Run-length encode unallocated clusters */ 1062 lba_count += lba_per_cluster; 1063 continue; 1064 } 1065 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1066 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1067 extent_idx++; 1068 1069 cur_sz += sizeof(desc_extent_rle->extents[extent_idx]); 1070 1071 if (*buf_sz < cur_sz) { 1072 /* If we ran out of buffer space, return */ 1073 *next_cluster = i; 1074 break; 1075 } 1076 1077 lba = blob->active.clusters[i]; 1078 lba_count = lba_per_cluster; 1079 } 1080 1081 if (*buf_sz >= cur_sz) { 1082 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1083 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1084 extent_idx++; 1085 1086 *next_cluster = blob->active.num_clusters; 1087 } 1088 1089 desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx; 1090 *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1091 *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1092 } 1093 1094 static int 1095 blob_serialize_extents_rle(const struct spdk_blob *blob, 1096 struct spdk_blob_md_page **pages, 1097 struct spdk_blob_md_page *cur_page, 1098 uint32_t *page_count, uint8_t **buf, 1099 size_t *remaining_sz) 1100 { 1101 uint64_t last_cluster; 1102 int rc; 1103 1104 last_cluster = 0; 1105 while (last_cluster < blob->active.num_clusters) { 1106 blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz); 1107 1108 if (last_cluster == blob->active.num_clusters) { 1109 break; 1110 } 1111 1112 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1113 if (rc < 0) { 1114 return rc; 1115 } 1116 1117 *buf = (uint8_t *)cur_page->descriptors; 1118 *remaining_sz = sizeof(cur_page->descriptors); 
1119 } 1120 1121 return 0; 1122 } 1123 1124 static void 1125 blob_serialize_extent_page(const struct spdk_blob *blob, 1126 uint64_t cluster, struct spdk_blob_md_page *page) 1127 { 1128 struct spdk_blob_md_descriptor_extent_page *desc_extent; 1129 uint64_t i, extent_idx; 1130 uint64_t lba, lba_per_cluster; 1131 uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP; 1132 1133 desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors; 1134 desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE; 1135 1136 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1137 1138 desc_extent->start_cluster_idx = start_cluster_idx; 1139 extent_idx = 0; 1140 for (i = start_cluster_idx; i < blob->active.num_clusters; i++) { 1141 lba = blob->active.clusters[i]; 1142 desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster; 1143 if (extent_idx >= SPDK_EXTENTS_PER_EP) { 1144 break; 1145 } 1146 } 1147 desc_extent->length = sizeof(desc_extent->start_cluster_idx) + 1148 sizeof(desc_extent->cluster_idx[0]) * extent_idx; 1149 } 1150 1151 static void 1152 blob_serialize_flags(const struct spdk_blob *blob, 1153 uint8_t *buf, size_t *buf_sz) 1154 { 1155 struct spdk_blob_md_descriptor_flags *desc; 1156 1157 /* 1158 * Flags get serialized first, so we should always have room for the flags 1159 * descriptor. 1160 */ 1161 assert(*buf_sz >= sizeof(*desc)); 1162 1163 desc = (struct spdk_blob_md_descriptor_flags *)buf; 1164 desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS; 1165 desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor); 1166 desc->invalid_flags = blob->invalid_flags; 1167 desc->data_ro_flags = blob->data_ro_flags; 1168 desc->md_ro_flags = blob->md_ro_flags; 1169 1170 *buf_sz -= sizeof(*desc); 1171 } 1172 1173 static int 1174 blob_serialize_xattrs(const struct spdk_blob *blob, 1175 const struct spdk_xattr_tailq *xattrs, bool internal, 1176 struct spdk_blob_md_page **pages, 1177 struct spdk_blob_md_page *cur_page, 1178 uint32_t *page_count, uint8_t **buf, 1179 size_t *remaining_sz) 1180 { 1181 const struct spdk_xattr *xattr; 1182 int rc; 1183 1184 TAILQ_FOREACH(xattr, xattrs, link) { 1185 size_t required_sz = 0; 1186 1187 rc = blob_serialize_xattr(xattr, 1188 *buf, *remaining_sz, 1189 &required_sz, internal); 1190 if (rc < 0) { 1191 /* Need to add a new page to the chain */ 1192 rc = blob_serialize_add_page(blob, pages, page_count, 1193 &cur_page); 1194 if (rc < 0) { 1195 spdk_free(*pages); 1196 *pages = NULL; 1197 *page_count = 0; 1198 return rc; 1199 } 1200 1201 *buf = (uint8_t *)cur_page->descriptors; 1202 *remaining_sz = sizeof(cur_page->descriptors); 1203 1204 /* Try again */ 1205 required_sz = 0; 1206 rc = blob_serialize_xattr(xattr, 1207 *buf, *remaining_sz, 1208 &required_sz, internal); 1209 1210 if (rc < 0) { 1211 spdk_free(*pages); 1212 *pages = NULL; 1213 *page_count = 0; 1214 return rc; 1215 } 1216 } 1217 1218 *remaining_sz -= required_sz; 1219 *buf += required_sz; 1220 } 1221 1222 return 0; 1223 } 1224 1225 static int 1226 blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages, 1227 uint32_t *page_count) 1228 { 1229 struct spdk_blob_md_page *cur_page; 1230 int rc; 1231 uint8_t *buf; 1232 size_t remaining_sz; 1233 1234 assert(pages != NULL); 1235 assert(page_count != NULL); 1236 assert(blob != NULL); 1237 assert(blob->state == SPDK_BLOB_STATE_DIRTY); 1238 1239 *pages = NULL; 1240 *page_count = 0; 1241 1242 /* A blob always has at least 1 page, even if it has no descriptors */ 1243 rc = blob_serialize_add_page(blob, 
pages, page_count, &cur_page); 1244 if (rc < 0) { 1245 return rc; 1246 } 1247 1248 buf = (uint8_t *)cur_page->descriptors; 1249 remaining_sz = sizeof(cur_page->descriptors); 1250 1251 /* Serialize flags */ 1252 blob_serialize_flags(blob, buf, &remaining_sz); 1253 buf += sizeof(struct spdk_blob_md_descriptor_flags); 1254 1255 /* Serialize xattrs */ 1256 rc = blob_serialize_xattrs(blob, &blob->xattrs, false, 1257 pages, cur_page, page_count, &buf, &remaining_sz); 1258 if (rc < 0) { 1259 return rc; 1260 } 1261 1262 /* Serialize internal xattrs */ 1263 rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true, 1264 pages, cur_page, page_count, &buf, &remaining_sz); 1265 if (rc < 0) { 1266 return rc; 1267 } 1268 1269 if (blob->use_extent_table) { 1270 /* Serialize extent table */ 1271 rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1272 } else { 1273 /* Serialize extents */ 1274 rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1275 } 1276 1277 return rc; 1278 } 1279 1280 struct spdk_blob_load_ctx { 1281 struct spdk_blob *blob; 1282 1283 struct spdk_blob_md_page *pages; 1284 uint32_t num_pages; 1285 uint32_t next_extent_page; 1286 spdk_bs_sequence_t *seq; 1287 1288 spdk_bs_sequence_cpl cb_fn; 1289 void *cb_arg; 1290 }; 1291 1292 static uint32_t 1293 blob_md_page_calc_crc(void *page) 1294 { 1295 uint32_t crc; 1296 1297 crc = BLOB_CRC32C_INITIAL; 1298 crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc); 1299 crc ^= BLOB_CRC32C_INITIAL; 1300 1301 return crc; 1302 1303 } 1304 1305 static void 1306 blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno) 1307 { 1308 struct spdk_blob *blob = ctx->blob; 1309 1310 if (bserrno == 0) { 1311 blob_mark_clean(blob); 1312 } 1313 1314 ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno); 1315 1316 /* Free the memory */ 1317 spdk_free(ctx->pages); 1318 free(ctx); 1319 } 1320 1321 static void 1322 blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno) 1323 { 1324 struct spdk_blob_load_ctx *ctx = cb_arg; 1325 struct spdk_blob *blob = ctx->blob; 1326 1327 if (bserrno == 0) { 1328 blob->back_bs_dev = bs_create_blob_bs_dev(snapshot); 1329 if (blob->back_bs_dev == NULL) { 1330 bserrno = -ENOMEM; 1331 } 1332 } 1333 if (bserrno != 0) { 1334 SPDK_ERRLOG("Snapshot fail\n"); 1335 } 1336 1337 blob_load_final(ctx, bserrno); 1338 } 1339 1340 static void blob_update_clear_method(struct spdk_blob *blob); 1341 1342 static void 1343 blob_load_backing_dev(void *cb_arg) 1344 { 1345 struct spdk_blob_load_ctx *ctx = cb_arg; 1346 struct spdk_blob *blob = ctx->blob; 1347 const void *value; 1348 size_t len; 1349 int rc; 1350 1351 if (spdk_blob_is_thin_provisioned(blob)) { 1352 rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true); 1353 if (rc == 0) { 1354 if (len != sizeof(spdk_blob_id)) { 1355 blob_load_final(ctx, -EINVAL); 1356 return; 1357 } 1358 /* open snapshot blob and continue in the callback function */ 1359 blob->parent_id = *(spdk_blob_id *)value; 1360 spdk_bs_open_blob(blob->bs, blob->parent_id, 1361 blob_load_snapshot_cpl, ctx); 1362 return; 1363 } else { 1364 /* add zeroes_dev for thin provisioned blob */ 1365 blob->back_bs_dev = bs_create_zeroes_dev(); 1366 } 1367 } else { 1368 /* standard blob */ 1369 blob->back_bs_dev = NULL; 1370 } 1371 blob_load_final(ctx, 0); 1372 } 1373 1374 static void 1375 blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1376 { 1377 struct spdk_blob_load_ctx *ctx = cb_arg; 1378 struct spdk_blob 
*blob = ctx->blob; 1379 struct spdk_blob_md_page *page; 1380 uint64_t i; 1381 uint32_t crc; 1382 uint64_t lba; 1383 void *tmp; 1384 uint64_t sz; 1385 1386 if (bserrno) { 1387 SPDK_ERRLOG("Extent page read failed: %d\n", bserrno); 1388 blob_load_final(ctx, bserrno); 1389 return; 1390 } 1391 1392 if (ctx->pages == NULL) { 1393 /* First iteration of this function, allocate buffer for single EXTENT_PAGE */ 1394 ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 1395 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 1396 if (!ctx->pages) { 1397 blob_load_final(ctx, -ENOMEM); 1398 return; 1399 } 1400 ctx->num_pages = 1; 1401 ctx->next_extent_page = 0; 1402 } else { 1403 page = &ctx->pages[0]; 1404 crc = blob_md_page_calc_crc(page); 1405 if (crc != page->crc) { 1406 blob_load_final(ctx, -EINVAL); 1407 return; 1408 } 1409 1410 if (page->next != SPDK_INVALID_MD_PAGE) { 1411 blob_load_final(ctx, -EINVAL); 1412 return; 1413 } 1414 1415 bserrno = blob_parse_extent_page(page, blob); 1416 if (bserrno) { 1417 blob_load_final(ctx, bserrno); 1418 return; 1419 } 1420 } 1421 1422 for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { 1423 if (blob->active.extent_pages[i] != 0) { 1424 /* Extent page was allocated, read and parse it. */ 1425 lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]); 1426 ctx->next_extent_page = i + 1; 1427 1428 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1429 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 1430 blob_load_cpl_extents_cpl, ctx); 1431 return; 1432 } else { 1433 /* Thin provisioned blobs can point to unallocated extent pages. 1434 * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */ 1435 1436 sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP); 1437 blob->active.num_clusters += sz; 1438 blob->remaining_clusters_in_et -= sz; 1439 1440 assert(spdk_blob_is_thin_provisioned(blob)); 1441 assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0); 1442 1443 tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); 1444 if (tmp == NULL) { 1445 blob_load_final(ctx, -ENOMEM); 1446 return; 1447 } 1448 memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0, 1449 sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size)); 1450 blob->active.clusters = tmp; 1451 blob->active.cluster_array_size = blob->active.num_clusters; 1452 } 1453 } 1454 1455 blob_load_backing_dev(ctx); 1456 } 1457 1458 static void 1459 blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1460 { 1461 struct spdk_blob_load_ctx *ctx = cb_arg; 1462 struct spdk_blob *blob = ctx->blob; 1463 struct spdk_blob_md_page *page; 1464 int rc; 1465 uint32_t crc; 1466 uint32_t current_page; 1467 1468 if (ctx->num_pages == 1) { 1469 current_page = bs_blobid_to_page(blob->id); 1470 } else { 1471 assert(ctx->num_pages != 0); 1472 page = &ctx->pages[ctx->num_pages - 2]; 1473 current_page = page->next; 1474 } 1475 1476 if (bserrno) { 1477 SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n", 1478 current_page, blob->id, bserrno); 1479 blob_load_final(ctx, bserrno); 1480 return; 1481 } 1482 1483 page = &ctx->pages[ctx->num_pages - 1]; 1484 crc = blob_md_page_calc_crc(page); 1485 if (crc != page->crc) { 1486 SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n", 1487 current_page, blob->id); 1488 blob_load_final(ctx, -EINVAL); 1489 return; 1490 } 1491 1492 if (page->next != 
SPDK_INVALID_MD_PAGE) { 1493 uint32_t next_page = page->next; 1494 uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page); 1495 1496 /* Read the next page */ 1497 ctx->num_pages++; 1498 ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages), 0); 1499 if (ctx->pages == NULL) { 1500 blob_load_final(ctx, -ENOMEM); 1501 return; 1502 } 1503 1504 bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1], 1505 next_lba, 1506 bs_byte_to_lba(blob->bs, sizeof(*page)), 1507 blob_load_cpl, ctx); 1508 return; 1509 } 1510 1511 /* Parse the pages */ 1512 rc = blob_parse(ctx->pages, ctx->num_pages, blob); 1513 if (rc) { 1514 blob_load_final(ctx, rc); 1515 return; 1516 } 1517 1518 if (blob->extent_table_found == true) { 1519 /* If EXTENT_TABLE was found, that means support for it should be enabled. */ 1520 assert(blob->extent_rle_found == false); 1521 blob->use_extent_table = true; 1522 } else { 1523 /* If EXTENT_RLE or no extent_* descriptor was found disable support 1524 * for extent table. No extent_* descriptors means that blob has length of 0 1525 * and no extent_rle descriptors were persisted for it. 1526 * EXTENT_TABLE if used, is always present in metadata regardless of length. */ 1527 blob->use_extent_table = false; 1528 } 1529 1530 /* Check the clear_method stored in metadata vs what may have been passed 1531 * via spdk_bs_open_blob_ext() and update accordingly. 1532 */ 1533 blob_update_clear_method(blob); 1534 1535 spdk_free(ctx->pages); 1536 ctx->pages = NULL; 1537 1538 if (blob->extent_table_found) { 1539 blob_load_cpl_extents_cpl(seq, ctx, 0); 1540 } else { 1541 blob_load_backing_dev(ctx); 1542 } 1543 } 1544 1545 /* Load a blob from disk given a blobid */ 1546 static void 1547 blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 1548 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 1549 { 1550 struct spdk_blob_load_ctx *ctx; 1551 struct spdk_blob_store *bs; 1552 uint32_t page_num; 1553 uint64_t lba; 1554 1555 blob_verify_md_op(blob); 1556 1557 bs = blob->bs; 1558 1559 ctx = calloc(1, sizeof(*ctx)); 1560 if (!ctx) { 1561 cb_fn(seq, cb_arg, -ENOMEM); 1562 return; 1563 } 1564 1565 ctx->blob = blob; 1566 ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0); 1567 if (!ctx->pages) { 1568 free(ctx); 1569 cb_fn(seq, cb_arg, -ENOMEM); 1570 return; 1571 } 1572 ctx->num_pages = 1; 1573 ctx->cb_fn = cb_fn; 1574 ctx->cb_arg = cb_arg; 1575 ctx->seq = seq; 1576 1577 page_num = bs_blobid_to_page(blob->id); 1578 lba = bs_md_page_to_lba(blob->bs, page_num); 1579 1580 blob->state = SPDK_BLOB_STATE_LOADING; 1581 1582 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1583 bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), 1584 blob_load_cpl, ctx); 1585 } 1586 1587 struct spdk_blob_persist_ctx { 1588 struct spdk_blob *blob; 1589 1590 struct spdk_bs_super_block *super; 1591 1592 struct spdk_blob_md_page *pages; 1593 uint32_t next_extent_page; 1594 struct spdk_blob_md_page *extent_page; 1595 1596 spdk_bs_sequence_t *seq; 1597 spdk_bs_sequence_cpl cb_fn; 1598 void *cb_arg; 1599 TAILQ_ENTRY(spdk_blob_persist_ctx) link; 1600 }; 1601 1602 static void 1603 bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba, 1604 uint32_t lba_count) 1605 { 1606 switch (ctx->blob->clear_method) { 1607 case BLOB_CLEAR_WITH_DEFAULT: 1608 case BLOB_CLEAR_WITH_UNMAP: 1609 bs_batch_unmap_dev(batch, lba, lba_count); 1610 break; 1611 case BLOB_CLEAR_WITH_WRITE_ZEROES: 1612 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1613 break; 1614 case BLOB_CLEAR_WITH_NONE: 1615 default: 1616 break; 1617 
} 1618 } 1619 1620 static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx); 1621 1622 static void 1623 blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno) 1624 { 1625 struct spdk_blob_persist_ctx *next_persist; 1626 struct spdk_blob *blob = ctx->blob; 1627 1628 if (bserrno == 0) { 1629 blob_mark_clean(blob); 1630 } 1631 1632 assert(ctx == TAILQ_FIRST(&blob->pending_persists)); 1633 TAILQ_REMOVE(&blob->pending_persists, ctx, link); 1634 1635 next_persist = TAILQ_FIRST(&blob->pending_persists); 1636 1637 /* Call user callback */ 1638 ctx->cb_fn(seq, ctx->cb_arg, bserrno); 1639 1640 /* Free the memory */ 1641 spdk_free(ctx->pages); 1642 free(ctx); 1643 1644 if (next_persist != NULL) { 1645 blob_persist_check_dirty(next_persist); 1646 } 1647 } 1648 1649 static void 1650 blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1651 { 1652 struct spdk_blob_persist_ctx *ctx = cb_arg; 1653 struct spdk_blob *blob = ctx->blob; 1654 struct spdk_blob_store *bs = blob->bs; 1655 size_t i; 1656 1657 if (bserrno != 0) { 1658 blob_persist_complete(seq, ctx, bserrno); 1659 return; 1660 } 1661 1662 /* Release all extent_pages that were truncated */ 1663 for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1664 /* Nothing to release if it was not allocated */ 1665 if (blob->active.extent_pages[i] != 0) { 1666 bs_release_md_page(bs, blob->active.extent_pages[i]); 1667 } 1668 } 1669 1670 if (blob->active.num_extent_pages == 0) { 1671 free(blob->active.extent_pages); 1672 blob->active.extent_pages = NULL; 1673 blob->active.extent_pages_array_size = 0; 1674 } else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) { 1675 #ifndef __clang_analyzer__ 1676 void *tmp; 1677 1678 /* scan-build really can't figure reallocs, workaround it */ 1679 tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); 1680 assert(tmp != NULL); 1681 blob->active.extent_pages = tmp; 1682 #endif 1683 blob->active.extent_pages_array_size = blob->active.num_extent_pages; 1684 } 1685 1686 blob_persist_complete(seq, ctx, bserrno); 1687 } 1688 1689 static void 1690 blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1691 { 1692 struct spdk_blob *blob = ctx->blob; 1693 struct spdk_blob_store *bs = blob->bs; 1694 size_t i; 1695 uint64_t lba; 1696 uint32_t lba_count; 1697 spdk_bs_batch_t *batch; 1698 1699 batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx); 1700 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1701 1702 /* Clear all extent_pages that were truncated */ 1703 for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1704 /* Nothing to clear if it was not allocated */ 1705 if (blob->active.extent_pages[i] != 0) { 1706 lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]); 1707 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1708 } 1709 } 1710 1711 bs_batch_close(batch); 1712 } 1713 1714 static void 1715 blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1716 { 1717 struct spdk_blob_persist_ctx *ctx = cb_arg; 1718 struct spdk_blob *blob = ctx->blob; 1719 struct spdk_blob_store *bs = blob->bs; 1720 size_t i; 1721 1722 if (bserrno != 0) { 1723 blob_persist_complete(seq, ctx, bserrno); 1724 return; 1725 } 1726 1727 pthread_mutex_lock(&bs->used_clusters_mutex); 1728 /* Release all clusters that were truncated */ 1729 for (i = 
blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1730 uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]); 1731 1732 /* Nothing to release if it was not allocated */ 1733 if (blob->active.clusters[i] != 0) { 1734 bs_release_cluster(bs, cluster_num); 1735 } 1736 } 1737 pthread_mutex_unlock(&bs->used_clusters_mutex); 1738 1739 if (blob->active.num_clusters == 0) { 1740 free(blob->active.clusters); 1741 blob->active.clusters = NULL; 1742 blob->active.cluster_array_size = 0; 1743 } else if (blob->active.num_clusters != blob->active.cluster_array_size) { 1744 #ifndef __clang_analyzer__ 1745 void *tmp; 1746 1747 /* scan-build really can't figure reallocs, workaround it */ 1748 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters); 1749 assert(tmp != NULL); 1750 blob->active.clusters = tmp; 1751 1752 #endif 1753 blob->active.cluster_array_size = blob->active.num_clusters; 1754 } 1755 1756 /* Move on to clearing extent pages */ 1757 blob_persist_clear_extents(seq, ctx); 1758 } 1759 1760 static void 1761 blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1762 { 1763 struct spdk_blob *blob = ctx->blob; 1764 struct spdk_blob_store *bs = blob->bs; 1765 spdk_bs_batch_t *batch; 1766 size_t i; 1767 uint64_t lba; 1768 uint32_t lba_count; 1769 1770 /* Clusters don't move around in blobs. The list shrinks or grows 1771 * at the end, but no changes ever occur in the middle of the list. 1772 */ 1773 1774 batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx); 1775 1776 /* Clear all clusters that were truncated */ 1777 lba = 0; 1778 lba_count = 0; 1779 for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1780 uint64_t next_lba = blob->active.clusters[i]; 1781 uint32_t next_lba_count = bs_cluster_to_lba(bs, 1); 1782 1783 if (next_lba > 0 && (lba + lba_count) == next_lba) { 1784 /* This cluster is contiguous with the previous one. */ 1785 lba_count += next_lba_count; 1786 continue; 1787 } else if (next_lba == 0) { 1788 continue; 1789 } 1790 1791 /* This cluster is not contiguous with the previous one. */ 1792 1793 /* If a run of LBAs previously existing, clear them now */ 1794 if (lba_count > 0) { 1795 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1796 } 1797 1798 /* Start building the next batch */ 1799 lba = next_lba; 1800 if (next_lba > 0) { 1801 lba_count = next_lba_count; 1802 } else { 1803 lba_count = 0; 1804 } 1805 } 1806 1807 /* If we ended with a contiguous set of LBAs, clear them now */ 1808 if (lba_count > 0) { 1809 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1810 } 1811 1812 bs_batch_close(batch); 1813 } 1814 1815 static void 1816 blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1817 { 1818 struct spdk_blob_persist_ctx *ctx = cb_arg; 1819 struct spdk_blob *blob = ctx->blob; 1820 struct spdk_blob_store *bs = blob->bs; 1821 size_t i; 1822 1823 if (bserrno != 0) { 1824 blob_persist_complete(seq, ctx, bserrno); 1825 return; 1826 } 1827 1828 /* This loop starts at 1 because the first page is special and handled 1829 * below. The pages (except the first) are never written in place, 1830 * so any pages in the clean list must be zeroed. 
1831 */ 1832 for (i = 1; i < blob->clean.num_pages; i++) { 1833 bs_release_md_page(bs, blob->clean.pages[i]); 1834 } 1835 1836 if (blob->active.num_pages == 0) { 1837 uint32_t page_num; 1838 1839 page_num = bs_blobid_to_page(blob->id); 1840 bs_release_md_page(bs, page_num); 1841 } 1842 1843 /* Move on to clearing clusters */ 1844 blob_persist_clear_clusters(seq, ctx); 1845 } 1846 1847 static void 1848 blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1849 { 1850 struct spdk_blob_persist_ctx *ctx = cb_arg; 1851 struct spdk_blob *blob = ctx->blob; 1852 struct spdk_blob_store *bs = blob->bs; 1853 uint64_t lba; 1854 uint32_t lba_count; 1855 spdk_bs_batch_t *batch; 1856 size_t i; 1857 1858 if (bserrno != 0) { 1859 blob_persist_complete(seq, ctx, bserrno); 1860 return; 1861 } 1862 1863 batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx); 1864 1865 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1866 1867 /* This loop starts at 1 because the first page is special and handled 1868 * below. The pages (except the first) are never written in place, 1869 * so any pages in the clean list must be zeroed. 1870 */ 1871 for (i = 1; i < blob->clean.num_pages; i++) { 1872 lba = bs_md_page_to_lba(bs, blob->clean.pages[i]); 1873 1874 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1875 } 1876 1877 /* The first page will only be zeroed if this is a delete. */ 1878 if (blob->active.num_pages == 0) { 1879 uint32_t page_num; 1880 1881 /* The first page in the metadata goes where the blobid indicates */ 1882 page_num = bs_blobid_to_page(blob->id); 1883 lba = bs_md_page_to_lba(bs, page_num); 1884 1885 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1886 } 1887 1888 bs_batch_close(batch); 1889 } 1890 1891 static void 1892 blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1893 { 1894 struct spdk_blob_persist_ctx *ctx = cb_arg; 1895 struct spdk_blob *blob = ctx->blob; 1896 struct spdk_blob_store *bs = blob->bs; 1897 uint64_t lba; 1898 uint32_t lba_count; 1899 struct spdk_blob_md_page *page; 1900 1901 if (bserrno != 0) { 1902 blob_persist_complete(seq, ctx, bserrno); 1903 return; 1904 } 1905 1906 if (blob->active.num_pages == 0) { 1907 /* Move on to the next step */ 1908 blob_persist_zero_pages(seq, ctx, 0); 1909 return; 1910 } 1911 1912 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1913 1914 page = &ctx->pages[0]; 1915 /* The first page in the metadata goes where the blobid indicates */ 1916 lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id)); 1917 1918 bs_sequence_write_dev(seq, page, lba, lba_count, 1919 blob_persist_zero_pages, ctx); 1920 } 1921 1922 static void 1923 blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1924 { 1925 struct spdk_blob *blob = ctx->blob; 1926 struct spdk_blob_store *bs = blob->bs; 1927 uint64_t lba; 1928 uint32_t lba_count; 1929 struct spdk_blob_md_page *page; 1930 spdk_bs_batch_t *batch; 1931 size_t i; 1932 1933 /* Clusters don't move around in blobs. The list shrinks or grows 1934 * at the end, but no changes ever occur in the middle of the list. 1935 */ 1936 1937 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1938 1939 batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx); 1940 1941 /* This starts at 1. 
The root page is not written until 1942 * all of the others are finished 1943 */ 1944 for (i = 1; i < blob->active.num_pages; i++) { 1945 page = &ctx->pages[i]; 1946 assert(page->sequence_num == i); 1947 1948 lba = bs_md_page_to_lba(bs, blob->active.pages[i]); 1949 1950 bs_batch_write_dev(batch, page, lba, lba_count); 1951 } 1952 1953 bs_batch_close(batch); 1954 } 1955 1956 static int 1957 blob_resize(struct spdk_blob *blob, uint64_t sz) 1958 { 1959 uint64_t i; 1960 uint64_t *tmp; 1961 uint64_t cluster; 1962 uint32_t lfmd; /* lowest free md page */ 1963 uint64_t num_clusters; 1964 uint32_t *ep_tmp; 1965 uint64_t new_num_ep = 0, current_num_ep = 0; 1966 struct spdk_blob_store *bs; 1967 1968 bs = blob->bs; 1969 1970 blob_verify_md_op(blob); 1971 1972 if (blob->active.num_clusters == sz) { 1973 return 0; 1974 } 1975 1976 if (blob->active.num_clusters < blob->active.cluster_array_size) { 1977 /* If this blob was resized to be larger, then smaller, then 1978 * larger without syncing, then the cluster array already 1979 * contains spare assigned clusters we can use. 1980 */ 1981 num_clusters = spdk_min(blob->active.cluster_array_size, 1982 sz); 1983 } else { 1984 num_clusters = blob->active.num_clusters; 1985 } 1986 1987 if (blob->use_extent_table) { 1988 /* Round up since every cluster beyond current Extent Table size, 1989 * requires new extent page. */ 1990 new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP); 1991 current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); 1992 } 1993 1994 /* Check first that we have enough clusters and md pages before we start claiming them. */ 1995 if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) { 1996 if ((sz - num_clusters) > bs->num_free_clusters) { 1997 return -ENOSPC; 1998 } 1999 lfmd = 0; 2000 for (i = current_num_ep; i < new_num_ep ; i++) { 2001 lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); 2002 if (lfmd == UINT32_MAX) { 2003 /* No more free md pages. Cannot satisfy the request */ 2004 return -ENOSPC; 2005 } 2006 } 2007 } 2008 2009 if (sz > num_clusters) { 2010 /* Expand the cluster array if necessary. 2011 * We only shrink the array when persisting. 
2012 */ 2013 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); 2014 if (sz > 0 && tmp == NULL) { 2015 return -ENOMEM; 2016 } 2017 memset(tmp + blob->active.cluster_array_size, 0, 2018 sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); 2019 blob->active.clusters = tmp; 2020 blob->active.cluster_array_size = sz; 2021 2022 /* Expand the extents table, only if enough clusters were added */ 2023 if (new_num_ep > current_num_ep && blob->use_extent_table) { 2024 ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); 2025 if (new_num_ep > 0 && ep_tmp == NULL) { 2026 return -ENOMEM; 2027 } 2028 memset(ep_tmp + blob->active.extent_pages_array_size, 0, 2029 sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); 2030 blob->active.extent_pages = ep_tmp; 2031 blob->active.extent_pages_array_size = new_num_ep; 2032 } 2033 } 2034 2035 blob->state = SPDK_BLOB_STATE_DIRTY; 2036 2037 if (spdk_blob_is_thin_provisioned(blob) == false) { 2038 cluster = 0; 2039 lfmd = 0; 2040 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2041 for (i = num_clusters; i < sz; i++) { 2042 bs_allocate_cluster(blob, i, &cluster, &lfmd, true); 2043 lfmd++; 2044 } 2045 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2046 } 2047 2048 blob->active.num_clusters = sz; 2049 blob->active.num_extent_pages = new_num_ep; 2050 2051 return 0; 2052 } 2053 2054 static void 2055 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) 2056 { 2057 spdk_bs_sequence_t *seq = ctx->seq; 2058 struct spdk_blob *blob = ctx->blob; 2059 struct spdk_blob_store *bs = blob->bs; 2060 uint64_t i; 2061 uint32_t page_num; 2062 void *tmp; 2063 int rc; 2064 2065 /* Generate the new metadata */ 2066 rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); 2067 if (rc < 0) { 2068 blob_persist_complete(seq, ctx, rc); 2069 return; 2070 } 2071 2072 assert(blob->active.num_pages >= 1); 2073 2074 /* Resize the cache of page indices */ 2075 tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 2076 if (!tmp) { 2077 blob_persist_complete(seq, ctx, -ENOMEM); 2078 return; 2079 } 2080 blob->active.pages = tmp; 2081 2082 /* Assign this metadata to pages. This requires two passes - 2083 * one to verify that there are enough pages and a second 2084 * to actually claim them. */ 2085 page_num = 0; 2086 /* Note that this loop starts at one. The first page location is fixed by the blobid. */ 2087 for (i = 1; i < blob->active.num_pages; i++) { 2088 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2089 if (page_num == UINT32_MAX) { 2090 blob_persist_complete(seq, ctx, -ENOMEM); 2091 return; 2092 } 2093 page_num++; 2094 } 2095 2096 page_num = 0; 2097 blob->active.pages[0] = bs_blobid_to_page(blob->id); 2098 for (i = 1; i < blob->active.num_pages; i++) { 2099 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2100 ctx->pages[i - 1].next = page_num; 2101 /* Now that previous metadata page is complete, calculate the crc for it. 
*/ 2102 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2103 blob->active.pages[i] = page_num; 2104 bs_claim_md_page(bs, page_num); 2105 SPDK_DEBUGLOG(blob, "Claiming page %u for blob %" PRIu64 "\n", page_num, blob->id); 2106 page_num++; 2107 } 2108 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2109 /* Start writing the metadata from last page to first */ 2110 blob->state = SPDK_BLOB_STATE_CLEAN; 2111 blob_persist_write_page_chain(seq, ctx); 2112 } 2113 2114 static void 2115 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2116 { 2117 struct spdk_blob_persist_ctx *ctx = cb_arg; 2118 struct spdk_blob *blob = ctx->blob; 2119 size_t i; 2120 uint32_t extent_page_id; 2121 uint32_t page_count = 0; 2122 int rc; 2123 2124 if (ctx->extent_page != NULL) { 2125 spdk_free(ctx->extent_page); 2126 ctx->extent_page = NULL; 2127 } 2128 2129 if (bserrno != 0) { 2130 blob_persist_complete(seq, ctx, bserrno); 2131 return; 2132 } 2133 2134 /* Only write out Extent Pages when blob was resized. */ 2135 for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) { 2136 extent_page_id = blob->active.extent_pages[i]; 2137 if (extent_page_id == 0) { 2138 /* No Extent Page to persist */ 2139 assert(spdk_blob_is_thin_provisioned(blob)); 2140 continue; 2141 } 2142 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); 2143 ctx->next_extent_page = i + 1; 2144 rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); 2145 if (rc < 0) { 2146 blob_persist_complete(seq, ctx, rc); 2147 return; 2148 } 2149 2150 blob->state = SPDK_BLOB_STATE_DIRTY; 2151 blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); 2152 2153 ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); 2154 2155 bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), 2156 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 2157 blob_persist_write_extent_pages, ctx); 2158 return; 2159 } 2160 2161 blob_persist_generate_new_md(ctx); 2162 } 2163 2164 static void 2165 blob_persist_start(struct spdk_blob_persist_ctx *ctx) 2166 { 2167 spdk_bs_sequence_t *seq = ctx->seq; 2168 struct spdk_blob *blob = ctx->blob; 2169 2170 if (blob->active.num_pages == 0) { 2171 /* This is the signal that the blob should be deleted. 2172 * Immediately jump to the clean up routine. 
*/ 2173 assert(blob->clean.num_pages > 0); 2174 blob->state = SPDK_BLOB_STATE_CLEAN; 2175 blob_persist_zero_pages(seq, ctx, 0); 2176 return; 2177 2178 } 2179 2180 if (blob->clean.num_clusters < blob->active.num_clusters) { 2181 /* Blob was resized up */ 2182 assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages); 2183 ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1; 2184 } else if (blob->active.num_clusters < blob->active.cluster_array_size) { 2185 /* Blob was resized down */ 2186 assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages); 2187 ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1; 2188 } else { 2189 /* No change in size occurred */ 2190 blob_persist_generate_new_md(ctx); 2191 return; 2192 } 2193 2194 blob_persist_write_extent_pages(seq, ctx, 0); 2195 } 2196 2197 static void 2198 blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2199 { 2200 struct spdk_blob_persist_ctx *ctx = cb_arg; 2201 2202 spdk_free(ctx->super); 2203 2204 if (bserrno != 0) { 2205 blob_persist_complete(seq, ctx, bserrno); 2206 return; 2207 } 2208 2209 ctx->blob->bs->clean = 0; 2210 2211 blob_persist_start(ctx); 2212 } 2213 2214 static void 2215 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 2216 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); 2217 2218 2219 static void 2220 blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2221 { 2222 struct spdk_blob_persist_ctx *ctx = cb_arg; 2223 2224 if (bserrno != 0) { 2225 spdk_free(ctx->super); 2226 blob_persist_complete(seq, ctx, bserrno); 2227 return; 2228 } 2229 2230 ctx->super->clean = 0; 2231 if (ctx->super->size == 0) { 2232 ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; 2233 } 2234 2235 bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); 2236 } 2237 2238 static void 2239 blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) 2240 { 2241 if (ctx->blob->bs->clean) { 2242 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 2243 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2244 if (!ctx->super) { 2245 blob_persist_complete(ctx->seq, ctx, -ENOMEM); 2246 return; 2247 } 2248 2249 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), 2250 bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), 2251 blob_persist_dirty, ctx); 2252 } else { 2253 blob_persist_start(ctx); 2254 } 2255 } 2256 2257 /* Write a blob to disk */ 2258 static void 2259 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 2260 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 2261 { 2262 struct spdk_blob_persist_ctx *ctx; 2263 2264 blob_verify_md_op(blob); 2265 2266 if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->pending_persists)) { 2267 cb_fn(seq, cb_arg, 0); 2268 return; 2269 } 2270 2271 ctx = calloc(1, sizeof(*ctx)); 2272 if (!ctx) { 2273 cb_fn(seq, cb_arg, -ENOMEM); 2274 return; 2275 } 2276 ctx->blob = blob; 2277 ctx->seq = seq; 2278 ctx->cb_fn = cb_fn; 2279 ctx->cb_arg = cb_arg; 2280 2281 /* Multiple blob persists can affect one another, via blob->state or 2282 * blob mutable data changes. To prevent it, queue up the persists.
*/ 2283 if (!TAILQ_EMPTY(&blob->pending_persists)) { 2284 TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); 2285 return; 2286 } 2287 TAILQ_INSERT_HEAD(&blob->pending_persists, ctx, link); 2288 2289 blob_persist_check_dirty(ctx); 2290 } 2291 2292 struct spdk_blob_copy_cluster_ctx { 2293 struct spdk_blob *blob; 2294 uint8_t *buf; 2295 uint64_t page; 2296 uint64_t new_cluster; 2297 uint32_t new_extent_page; 2298 spdk_bs_sequence_t *seq; 2299 }; 2300 2301 static void 2302 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) 2303 { 2304 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2305 struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; 2306 TAILQ_HEAD(, spdk_bs_request_set) requests; 2307 spdk_bs_user_op_t *op; 2308 2309 TAILQ_INIT(&requests); 2310 TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); 2311 2312 while (!TAILQ_EMPTY(&requests)) { 2313 op = TAILQ_FIRST(&requests); 2314 TAILQ_REMOVE(&requests, op, link); 2315 if (bserrno == 0) { 2316 bs_user_op_execute(op); 2317 } else { 2318 bs_user_op_abort(op); 2319 } 2320 } 2321 2322 spdk_free(ctx->buf); 2323 free(ctx); 2324 } 2325 2326 static void 2327 blob_insert_cluster_cpl(void *cb_arg, int bserrno) 2328 { 2329 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2330 2331 if (bserrno) { 2332 if (bserrno == -EEXIST) { 2333 /* The metadata insert failed because another thread 2334 * allocated the cluster first. Free our cluster 2335 * but continue without error. */ 2336 bserrno = 0; 2337 } 2338 pthread_mutex_lock(&ctx->blob->bs->used_clusters_mutex); 2339 bs_release_cluster(ctx->blob->bs, ctx->new_cluster); 2340 pthread_mutex_unlock(&ctx->blob->bs->used_clusters_mutex); 2341 if (ctx->new_extent_page != 0) { 2342 bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); 2343 } 2344 } 2345 2346 bs_sequence_finish(ctx->seq, bserrno); 2347 } 2348 2349 static void 2350 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2351 { 2352 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2353 uint32_t cluster_number; 2354 2355 if (bserrno) { 2356 /* The write failed, so jump to the final completion handler */ 2357 bs_sequence_finish(seq, bserrno); 2358 return; 2359 } 2360 2361 cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); 2362 2363 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2364 ctx->new_extent_page, blob_insert_cluster_cpl, ctx); 2365 } 2366 2367 static void 2368 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2369 { 2370 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2371 2372 if (bserrno != 0) { 2373 /* The read failed, so jump to the final completion handler */ 2374 bs_sequence_finish(seq, bserrno); 2375 return; 2376 } 2377 2378 /* Write whole cluster */ 2379 bs_sequence_write_dev(seq, ctx->buf, 2380 bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), 2381 bs_cluster_to_lba(ctx->blob->bs, 1), 2382 blob_write_copy_cpl, ctx); 2383 } 2384 2385 static void 2386 bs_allocate_and_copy_cluster(struct spdk_blob *blob, 2387 struct spdk_io_channel *_ch, 2388 uint64_t io_unit, spdk_bs_user_op_t *op) 2389 { 2390 struct spdk_bs_cpl cpl; 2391 struct spdk_bs_channel *ch; 2392 struct spdk_blob_copy_cluster_ctx *ctx; 2393 uint32_t cluster_start_page; 2394 uint32_t cluster_number; 2395 int rc; 2396 2397 ch = spdk_io_channel_get_ctx(_ch); 2398 2399 if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { 2400 /* There are already operations pending. 
Queue this user op 2401 * and return because it will be re-executed when the outstanding 2402 * cluster allocation completes. */ 2403 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2404 return; 2405 } 2406 2407 /* Round the io_unit offset down to the first page in the cluster */ 2408 cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); 2409 2410 /* Calculate which index in the metadata cluster array the corresponding 2411 * cluster is supposed to be at. */ 2412 cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); 2413 2414 ctx = calloc(1, sizeof(*ctx)); 2415 if (!ctx) { 2416 bs_user_op_abort(op); 2417 return; 2418 } 2419 2420 assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); 2421 2422 ctx->blob = blob; 2423 ctx->page = cluster_start_page; 2424 2425 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2426 ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, 2427 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2428 if (!ctx->buf) { 2429 SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", 2430 blob->bs->cluster_sz); 2431 free(ctx); 2432 bs_user_op_abort(op); 2433 return; 2434 } 2435 } 2436 2437 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2438 rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, 2439 false); 2440 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2441 if (rc != 0) { 2442 spdk_free(ctx->buf); 2443 free(ctx); 2444 bs_user_op_abort(op); 2445 return; 2446 } 2447 2448 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2449 cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; 2450 cpl.u.blob_basic.cb_arg = ctx; 2451 2452 ctx->seq = bs_sequence_start(_ch, &cpl); 2453 if (!ctx->seq) { 2454 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2455 bs_release_cluster(blob->bs, ctx->new_cluster); 2456 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2457 spdk_free(ctx->buf); 2458 free(ctx); 2459 bs_user_op_abort(op); 2460 return; 2461 } 2462 2463 /* Queue the user op to block other incoming operations */ 2464 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2465 2466 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2467 /* Read cluster from backing device */ 2468 bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, 2469 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2470 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), 2471 blob_write_copy, ctx); 2472 } else { 2473 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2474 ctx->new_extent_page, blob_insert_cluster_cpl, ctx); 2475 } 2476 } 2477 2478 static inline bool 2479 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, 2480 uint64_t *lba, uint32_t *lba_count) 2481 { 2482 *lba_count = length; 2483 2484 if (!bs_io_unit_is_allocated(blob, io_unit)) { 2485 assert(blob->back_bs_dev != NULL); 2486 *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); 2487 *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); 2488 return false; 2489 } else { 2490 *lba = bs_blob_io_unit_to_lba(blob, io_unit); 2491 return true; 2492 } 2493 } 2494 2495 struct op_split_ctx { 2496 struct spdk_blob *blob; 2497 struct spdk_io_channel *channel; 2498 uint64_t io_unit_offset; 2499 uint64_t io_units_remaining; 2500 void *curr_payload; 2501 enum spdk_blob_op_type op_type; 2502 spdk_bs_sequence_t *seq; 2503 }; 2504 2505 static void 2506 blob_request_submit_op_split_next(void *cb_arg, int bserrno) 2507 { 2508 struct op_split_ctx *ctx = cb_arg; 
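/* Illustrative example (values assumed here, not taken from the code): with a 1 MiB cluster and a 4 KiB io_unit_size there are 256 io_units per cluster, so a 300 io_unit request starting at io_unit 200 is split by this callback into chunks of 56 and 244 io_units; op_length below is always capped at the distance to the next cluster boundary. */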
2509 struct spdk_blob *blob = ctx->blob; 2510 struct spdk_io_channel *ch = ctx->channel; 2511 enum spdk_blob_op_type op_type = ctx->op_type; 2512 uint8_t *buf = ctx->curr_payload; 2513 uint64_t offset = ctx->io_unit_offset; 2514 uint64_t length = ctx->io_units_remaining; 2515 uint64_t op_length; 2516 2517 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2518 bs_sequence_finish(ctx->seq, bserrno); 2519 free(ctx); 2520 return; 2521 } 2522 2523 op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, 2524 offset)); 2525 2526 /* Update length and payload for next operation */ 2527 ctx->io_units_remaining -= op_length; 2528 ctx->io_unit_offset += op_length; 2529 if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { 2530 ctx->curr_payload += op_length * blob->bs->io_unit_size; 2531 } 2532 2533 switch (op_type) { 2534 case SPDK_BLOB_READ: 2535 spdk_blob_io_read(blob, ch, buf, offset, op_length, 2536 blob_request_submit_op_split_next, ctx); 2537 break; 2538 case SPDK_BLOB_WRITE: 2539 spdk_blob_io_write(blob, ch, buf, offset, op_length, 2540 blob_request_submit_op_split_next, ctx); 2541 break; 2542 case SPDK_BLOB_UNMAP: 2543 spdk_blob_io_unmap(blob, ch, offset, op_length, 2544 blob_request_submit_op_split_next, ctx); 2545 break; 2546 case SPDK_BLOB_WRITE_ZEROES: 2547 spdk_blob_io_write_zeroes(blob, ch, offset, op_length, 2548 blob_request_submit_op_split_next, ctx); 2549 break; 2550 case SPDK_BLOB_READV: 2551 case SPDK_BLOB_WRITEV: 2552 SPDK_ERRLOG("readv/write not valid\n"); 2553 bs_sequence_finish(ctx->seq, -EINVAL); 2554 free(ctx); 2555 break; 2556 } 2557 } 2558 2559 static void 2560 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, 2561 void *payload, uint64_t offset, uint64_t length, 2562 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2563 { 2564 struct op_split_ctx *ctx; 2565 spdk_bs_sequence_t *seq; 2566 struct spdk_bs_cpl cpl; 2567 2568 assert(blob != NULL); 2569 2570 ctx = calloc(1, sizeof(struct op_split_ctx)); 2571 if (ctx == NULL) { 2572 cb_fn(cb_arg, -ENOMEM); 2573 return; 2574 } 2575 2576 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2577 cpl.u.blob_basic.cb_fn = cb_fn; 2578 cpl.u.blob_basic.cb_arg = cb_arg; 2579 2580 seq = bs_sequence_start(ch, &cpl); 2581 if (!seq) { 2582 free(ctx); 2583 cb_fn(cb_arg, -ENOMEM); 2584 return; 2585 } 2586 2587 ctx->blob = blob; 2588 ctx->channel = ch; 2589 ctx->curr_payload = payload; 2590 ctx->io_unit_offset = offset; 2591 ctx->io_units_remaining = length; 2592 ctx->op_type = op_type; 2593 ctx->seq = seq; 2594 2595 blob_request_submit_op_split_next(ctx, 0); 2596 } 2597 2598 static void 2599 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, 2600 void *payload, uint64_t offset, uint64_t length, 2601 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2602 { 2603 struct spdk_bs_cpl cpl; 2604 uint64_t lba; 2605 uint32_t lba_count; 2606 bool is_allocated; 2607 2608 assert(blob != NULL); 2609 2610 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2611 cpl.u.blob_basic.cb_fn = cb_fn; 2612 cpl.u.blob_basic.cb_arg = cb_arg; 2613 2614 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2615 2616 if (blob->frozen_refcnt) { 2617 /* This blob I/O is frozen */ 2618 spdk_bs_user_op_t *op; 2619 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); 2620 2621 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2622 if (!op) { 2623 cb_fn(cb_arg, -ENOMEM); 2624 return; 2625 
} 2626 2627 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2628 2629 return; 2630 } 2631 2632 switch (op_type) { 2633 case SPDK_BLOB_READ: { 2634 spdk_bs_batch_t *batch; 2635 2636 batch = bs_batch_open(_ch, &cpl); 2637 if (!batch) { 2638 cb_fn(cb_arg, -ENOMEM); 2639 return; 2640 } 2641 2642 if (is_allocated) { 2643 /* Read from the blob */ 2644 bs_batch_read_dev(batch, payload, lba, lba_count); 2645 } else { 2646 /* Read from the backing block device */ 2647 bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); 2648 } 2649 2650 bs_batch_close(batch); 2651 break; 2652 } 2653 case SPDK_BLOB_WRITE: 2654 case SPDK_BLOB_WRITE_ZEROES: { 2655 if (is_allocated) { 2656 /* Write to the blob */ 2657 spdk_bs_batch_t *batch; 2658 2659 if (lba_count == 0) { 2660 cb_fn(cb_arg, 0); 2661 return; 2662 } 2663 2664 batch = bs_batch_open(_ch, &cpl); 2665 if (!batch) { 2666 cb_fn(cb_arg, -ENOMEM); 2667 return; 2668 } 2669 2670 if (op_type == SPDK_BLOB_WRITE) { 2671 bs_batch_write_dev(batch, payload, lba, lba_count); 2672 } else { 2673 bs_batch_write_zeroes_dev(batch, lba, lba_count); 2674 } 2675 2676 bs_batch_close(batch); 2677 } else { 2678 /* Queue this operation and allocate the cluster */ 2679 spdk_bs_user_op_t *op; 2680 2681 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2682 if (!op) { 2683 cb_fn(cb_arg, -ENOMEM); 2684 return; 2685 } 2686 2687 bs_allocate_and_copy_cluster(blob, _ch, offset, op); 2688 } 2689 break; 2690 } 2691 case SPDK_BLOB_UNMAP: { 2692 spdk_bs_batch_t *batch; 2693 2694 batch = bs_batch_open(_ch, &cpl); 2695 if (!batch) { 2696 cb_fn(cb_arg, -ENOMEM); 2697 return; 2698 } 2699 2700 if (is_allocated) { 2701 bs_batch_unmap_dev(batch, lba, lba_count); 2702 } 2703 2704 bs_batch_close(batch); 2705 break; 2706 } 2707 case SPDK_BLOB_READV: 2708 case SPDK_BLOB_WRITEV: 2709 SPDK_ERRLOG("readv/write not valid\n"); 2710 cb_fn(cb_arg, -EINVAL); 2711 break; 2712 } 2713 } 2714 2715 static void 2716 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2717 void *payload, uint64_t offset, uint64_t length, 2718 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2719 { 2720 assert(blob != NULL); 2721 2722 if (blob->data_ro && op_type != SPDK_BLOB_READ) { 2723 cb_fn(cb_arg, -EPERM); 2724 return; 2725 } 2726 2727 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2728 cb_fn(cb_arg, -EINVAL); 2729 return; 2730 } 2731 if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { 2732 blob_request_submit_op_single(_channel, blob, payload, offset, length, 2733 cb_fn, cb_arg, op_type); 2734 } else { 2735 blob_request_submit_op_split(_channel, blob, payload, offset, length, 2736 cb_fn, cb_arg, op_type); 2737 } 2738 } 2739 2740 struct rw_iov_ctx { 2741 struct spdk_blob *blob; 2742 struct spdk_io_channel *channel; 2743 spdk_blob_op_complete cb_fn; 2744 void *cb_arg; 2745 bool read; 2746 int iovcnt; 2747 struct iovec *orig_iov; 2748 uint64_t io_unit_offset; 2749 uint64_t io_units_remaining; 2750 uint64_t io_units_done; 2751 struct iovec iov[0]; 2752 }; 2753 2754 static void 2755 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2756 { 2757 assert(cb_arg == NULL); 2758 bs_sequence_finish(seq, bserrno); 2759 } 2760 2761 static void 2762 rw_iov_split_next(void *cb_arg, int bserrno) 2763 { 2764 struct rw_iov_ctx *ctx = cb_arg; 2765 struct spdk_blob *blob = ctx->blob; 2766 struct iovec *iov, *orig_iov; 2767 int iovcnt; 2768 size_t orig_iovoff; 2769 uint64_t 
io_units_count, io_units_to_boundary, io_unit_offset; 2770 uint64_t byte_count; 2771 2772 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2773 ctx->cb_fn(ctx->cb_arg, bserrno); 2774 free(ctx); 2775 return; 2776 } 2777 2778 io_unit_offset = ctx->io_unit_offset; 2779 io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); 2780 io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); 2781 /* 2782 * Get index and offset into the original iov array for our current position in the I/O sequence. 2783 * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will 2784 * point to the current position in the I/O sequence. 2785 */ 2786 byte_count = ctx->io_units_done * blob->bs->io_unit_size; 2787 orig_iov = &ctx->orig_iov[0]; 2788 orig_iovoff = 0; 2789 while (byte_count > 0) { 2790 if (byte_count >= orig_iov->iov_len) { 2791 byte_count -= orig_iov->iov_len; 2792 orig_iov++; 2793 } else { 2794 orig_iovoff = byte_count; 2795 byte_count = 0; 2796 } 2797 } 2798 2799 /* 2800 * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many 2801 * bytes of this next I/O remain to be accounted for in the new iov array. 2802 */ 2803 byte_count = io_units_count * blob->bs->io_unit_size; 2804 iov = &ctx->iov[0]; 2805 iovcnt = 0; 2806 while (byte_count > 0) { 2807 assert(iovcnt < ctx->iovcnt); 2808 iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); 2809 iov->iov_base = orig_iov->iov_base + orig_iovoff; 2810 byte_count -= iov->iov_len; 2811 orig_iovoff = 0; 2812 orig_iov++; 2813 iov++; 2814 iovcnt++; 2815 } 2816 2817 ctx->io_unit_offset += io_units_count; 2818 ctx->io_units_remaining -= io_units_count; 2819 ctx->io_units_done += io_units_count; 2820 iov = &ctx->iov[0]; 2821 2822 if (ctx->read) { 2823 spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2824 io_units_count, rw_iov_split_next, ctx); 2825 } else { 2826 spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2827 io_units_count, rw_iov_split_next, ctx); 2828 } 2829 } 2830 2831 static void 2832 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2833 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 2834 spdk_blob_op_complete cb_fn, void *cb_arg, bool read) 2835 { 2836 struct spdk_bs_cpl cpl; 2837 2838 assert(blob != NULL); 2839 2840 if (!read && blob->data_ro) { 2841 cb_fn(cb_arg, -EPERM); 2842 return; 2843 } 2844 2845 if (length == 0) { 2846 cb_fn(cb_arg, 0); 2847 return; 2848 } 2849 2850 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2851 cb_fn(cb_arg, -EINVAL); 2852 return; 2853 } 2854 2855 /* 2856 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having 2857 * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, 2858 * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster 2859 * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need 2860 * to allocate a separate iov array and split the I/O such that none of the resulting 2861 * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) 2862 * but since this case happens very infrequently, any performance impact will be negligible. 
2863 * 2864 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs 2865 * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them 2866 * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called 2867 * when the batch was completed, to allow for freeing the memory for the iov arrays. 2868 */ 2869 if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { 2870 uint32_t lba_count; 2871 uint64_t lba; 2872 bool is_allocated; 2873 2874 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2875 cpl.u.blob_basic.cb_fn = cb_fn; 2876 cpl.u.blob_basic.cb_arg = cb_arg; 2877 2878 if (blob->frozen_refcnt) { 2879 /* This blob I/O is frozen */ 2880 enum spdk_blob_op_type op_type; 2881 spdk_bs_user_op_t *op; 2882 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); 2883 2884 op_type = read ? SPDK_BLOB_READV : SPDK_BLOB_WRITEV; 2885 op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); 2886 if (!op) { 2887 cb_fn(cb_arg, -ENOMEM); 2888 return; 2889 } 2890 2891 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2892 2893 return; 2894 } 2895 2896 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2897 2898 if (read) { 2899 spdk_bs_sequence_t *seq; 2900 2901 seq = bs_sequence_start(_channel, &cpl); 2902 if (!seq) { 2903 cb_fn(cb_arg, -ENOMEM); 2904 return; 2905 } 2906 2907 if (is_allocated) { 2908 bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2909 } else { 2910 bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, 2911 rw_iov_done, NULL); 2912 } 2913 } else { 2914 if (is_allocated) { 2915 spdk_bs_sequence_t *seq; 2916 2917 seq = bs_sequence_start(_channel, &cpl); 2918 if (!seq) { 2919 cb_fn(cb_arg, -ENOMEM); 2920 return; 2921 } 2922 2923 bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2924 } else { 2925 /* Queue this operation and allocate the cluster */ 2926 spdk_bs_user_op_t *op; 2927 2928 op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, 2929 length); 2930 if (!op) { 2931 cb_fn(cb_arg, -ENOMEM); 2932 return; 2933 } 2934 2935 bs_allocate_and_copy_cluster(blob, _channel, offset, op); 2936 } 2937 } 2938 } else { 2939 struct rw_iov_ctx *ctx; 2940 2941 ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); 2942 if (ctx == NULL) { 2943 cb_fn(cb_arg, -ENOMEM); 2944 return; 2945 } 2946 2947 ctx->blob = blob; 2948 ctx->channel = _channel; 2949 ctx->cb_fn = cb_fn; 2950 ctx->cb_arg = cb_arg; 2951 ctx->read = read; 2952 ctx->orig_iov = iov; 2953 ctx->iovcnt = iovcnt; 2954 ctx->io_unit_offset = offset; 2955 ctx->io_units_remaining = length; 2956 ctx->io_units_done = 0; 2957 2958 rw_iov_split_next(ctx, 0); 2959 } 2960 } 2961 2962 static struct spdk_blob * 2963 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) 2964 { 2965 struct spdk_blob *blob; 2966 2967 if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { 2968 return NULL; 2969 } 2970 2971 TAILQ_FOREACH(blob, &bs->blobs, link) { 2972 if (blob->id == blobid) { 2973 return blob; 2974 } 2975 } 2976 2977 return NULL; 2978 } 2979 2980 static void 2981 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, 2982 struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) 2983 { 2984 assert(blob != NULL); 2985 *snapshot_entry = NULL; 2986 *clone_entry = NULL; 2987 2988 if 
(blob->parent_id == SPDK_BLOBID_INVALID) { 2989 return; 2990 } 2991 2992 TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { 2993 if ((*snapshot_entry)->id == blob->parent_id) { 2994 break; 2995 } 2996 } 2997 2998 if (*snapshot_entry != NULL) { 2999 TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { 3000 if ((*clone_entry)->id == blob->id) { 3001 break; 3002 } 3003 } 3004 3005 assert(*clone_entry != NULL); 3006 } 3007 } 3008 3009 static int 3010 bs_channel_create(void *io_device, void *ctx_buf) 3011 { 3012 struct spdk_blob_store *bs = io_device; 3013 struct spdk_bs_channel *channel = ctx_buf; 3014 struct spdk_bs_dev *dev; 3015 uint32_t max_ops = bs->max_channel_ops; 3016 uint32_t i; 3017 3018 dev = bs->dev; 3019 3020 channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); 3021 if (!channel->req_mem) { 3022 return -1; 3023 } 3024 3025 TAILQ_INIT(&channel->reqs); 3026 3027 for (i = 0; i < max_ops; i++) { 3028 TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); 3029 } 3030 3031 channel->bs = bs; 3032 channel->dev = dev; 3033 channel->dev_channel = dev->create_channel(dev); 3034 3035 if (!channel->dev_channel) { 3036 SPDK_ERRLOG("Failed to create device channel.\n"); 3037 free(channel->req_mem); 3038 return -1; 3039 } 3040 3041 TAILQ_INIT(&channel->need_cluster_alloc); 3042 TAILQ_INIT(&channel->queued_io); 3043 3044 return 0; 3045 } 3046 3047 static void 3048 bs_channel_destroy(void *io_device, void *ctx_buf) 3049 { 3050 struct spdk_bs_channel *channel = ctx_buf; 3051 spdk_bs_user_op_t *op; 3052 3053 while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { 3054 op = TAILQ_FIRST(&channel->need_cluster_alloc); 3055 TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); 3056 bs_user_op_abort(op); 3057 } 3058 3059 while (!TAILQ_EMPTY(&channel->queued_io)) { 3060 op = TAILQ_FIRST(&channel->queued_io); 3061 TAILQ_REMOVE(&channel->queued_io, op, link); 3062 bs_user_op_abort(op); 3063 } 3064 3065 free(channel->req_mem); 3066 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3067 } 3068 3069 static void 3070 bs_dev_destroy(void *io_device) 3071 { 3072 struct spdk_blob_store *bs = io_device; 3073 struct spdk_blob *blob, *blob_tmp; 3074 3075 bs->dev->destroy(bs->dev); 3076 3077 TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) { 3078 TAILQ_REMOVE(&bs->blobs, blob, link); 3079 spdk_bit_array_clear(bs->open_blobids, blob->id); 3080 blob_free(blob); 3081 } 3082 3083 pthread_mutex_destroy(&bs->used_clusters_mutex); 3084 3085 spdk_bit_array_free(&bs->open_blobids); 3086 spdk_bit_array_free(&bs->used_blobids); 3087 spdk_bit_array_free(&bs->used_md_pages); 3088 spdk_bit_pool_free(&bs->used_clusters); 3089 /* 3090 * If this function is called for any reason except a successful unload, 3091 * the unload_cpl type will be NONE and this will be a nop.
3092 */ 3093 bs_call_cpl(&bs->unload_cpl, bs->unload_err); 3094 3095 free(bs); 3096 } 3097 3098 static int 3099 bs_blob_list_add(struct spdk_blob *blob) 3100 { 3101 spdk_blob_id snapshot_id; 3102 struct spdk_blob_list *snapshot_entry = NULL; 3103 struct spdk_blob_list *clone_entry = NULL; 3104 3105 assert(blob != NULL); 3106 3107 snapshot_id = blob->parent_id; 3108 if (snapshot_id == SPDK_BLOBID_INVALID) { 3109 return 0; 3110 } 3111 3112 snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); 3113 if (snapshot_entry == NULL) { 3114 /* Snapshot not found */ 3115 snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); 3116 if (snapshot_entry == NULL) { 3117 return -ENOMEM; 3118 } 3119 snapshot_entry->id = snapshot_id; 3120 TAILQ_INIT(&snapshot_entry->clones); 3121 TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); 3122 } else { 3123 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 3124 if (clone_entry->id == blob->id) { 3125 break; 3126 } 3127 } 3128 } 3129 3130 if (clone_entry == NULL) { 3131 /* Clone not found */ 3132 clone_entry = calloc(1, sizeof(struct spdk_blob_list)); 3133 if (clone_entry == NULL) { 3134 return -ENOMEM; 3135 } 3136 clone_entry->id = blob->id; 3137 TAILQ_INIT(&clone_entry->clones); 3138 TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); 3139 snapshot_entry->clone_count++; 3140 } 3141 3142 return 0; 3143 } 3144 3145 static void 3146 bs_blob_list_remove(struct spdk_blob *blob) 3147 { 3148 struct spdk_blob_list *snapshot_entry = NULL; 3149 struct spdk_blob_list *clone_entry = NULL; 3150 3151 blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); 3152 3153 if (snapshot_entry == NULL) { 3154 return; 3155 } 3156 3157 blob->parent_id = SPDK_BLOBID_INVALID; 3158 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3159 free(clone_entry); 3160 3161 snapshot_entry->clone_count--; 3162 } 3163 3164 static int 3165 bs_blob_list_free(struct spdk_blob_store *bs) 3166 { 3167 struct spdk_blob_list *snapshot_entry; 3168 struct spdk_blob_list *snapshot_entry_tmp; 3169 struct spdk_blob_list *clone_entry; 3170 struct spdk_blob_list *clone_entry_tmp; 3171 3172 TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { 3173 TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { 3174 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3175 free(clone_entry); 3176 } 3177 TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); 3178 free(snapshot_entry); 3179 } 3180 3181 return 0; 3182 } 3183 3184 static void 3185 bs_free(struct spdk_blob_store *bs) 3186 { 3187 bs_blob_list_free(bs); 3188 3189 bs_unregister_md_thread(bs); 3190 spdk_io_device_unregister(bs, bs_dev_destroy); 3191 } 3192 3193 void 3194 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size) 3195 { 3196 3197 if (!opts) { 3198 SPDK_ERRLOG("opts should not be NULL\n"); 3199 return; 3200 } 3201 3202 if (!opts_size) { 3203 SPDK_ERRLOG("opts_size should not be zero value\n"); 3204 return; 3205 } 3206 3207 memset(opts, 0, opts_size); 3208 opts->opts_size = opts_size; 3209 3210 #define FIELD_OK(field) \ 3211 offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size 3212 3213 #define SET_FIELD(field, value) \ 3214 if (FIELD_OK(field)) { \ 3215 opts->field = value; \ 3216 } \ 3217 3218 SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ); 3219 SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3220 SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3221 SET_FIELD(max_channel_ops, 
SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS); 3222 SET_FIELD(clear_method, BS_CLEAR_WITH_UNMAP); 3223 3224 if (FIELD_OK(bstype)) { 3225 memset(&opts->bstype, 0, sizeof(opts->bstype)); 3226 } 3227 3228 SET_FIELD(iter_cb_fn, NULL); 3229 SET_FIELD(iter_cb_arg, NULL); 3230 3231 #undef FIELD_OK 3232 #undef SET_FIELD 3233 } 3234 3235 static int 3236 bs_opts_verify(struct spdk_bs_opts *opts) 3237 { 3238 if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || 3239 opts->max_channel_ops == 0) { 3240 SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); 3241 return -1; 3242 } 3243 3244 return 0; 3245 } 3246 3247 /* START spdk_bs_load */ 3248 3249 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */ 3250 3251 struct spdk_bs_load_ctx { 3252 struct spdk_blob_store *bs; 3253 struct spdk_bs_super_block *super; 3254 3255 struct spdk_bs_md_mask *mask; 3256 bool in_page_chain; 3257 uint32_t page_index; 3258 uint32_t cur_page; 3259 struct spdk_blob_md_page *page; 3260 3261 uint64_t num_extent_pages; 3262 uint32_t *extent_page_num; 3263 struct spdk_blob_md_page *extent_pages; 3264 struct spdk_bit_array *used_clusters; 3265 3266 spdk_bs_sequence_t *seq; 3267 spdk_blob_op_with_handle_complete iter_cb_fn; 3268 void *iter_cb_arg; 3269 struct spdk_blob *blob; 3270 spdk_blob_id blobid; 3271 3272 /* These fields are used in the spdk_bs_dump path. */ 3273 FILE *fp; 3274 spdk_bs_dump_print_xattr print_xattr_fn; 3275 char xattr_name[4096]; 3276 }; 3277 3278 static int 3279 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs, 3280 struct spdk_bs_load_ctx **_ctx) 3281 { 3282 struct spdk_blob_store *bs; 3283 struct spdk_bs_load_ctx *ctx; 3284 uint64_t dev_size; 3285 int rc; 3286 3287 dev_size = dev->blocklen * dev->blockcnt; 3288 if (dev_size < opts->cluster_sz) { 3289 /* Device size cannot be smaller than cluster size of blobstore */ 3290 SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", 3291 dev_size, opts->cluster_sz); 3292 return -ENOSPC; 3293 } 3294 if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { 3295 /* Cluster size cannot be smaller than page size */ 3296 SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", 3297 opts->cluster_sz, SPDK_BS_PAGE_SIZE); 3298 return -EINVAL; 3299 } 3300 bs = calloc(1, sizeof(struct spdk_blob_store)); 3301 if (!bs) { 3302 return -ENOMEM; 3303 } 3304 3305 ctx = calloc(1, sizeof(struct spdk_bs_load_ctx)); 3306 if (!ctx) { 3307 free(bs); 3308 return -ENOMEM; 3309 } 3310 3311 ctx->bs = bs; 3312 ctx->iter_cb_fn = opts->iter_cb_fn; 3313 ctx->iter_cb_arg = opts->iter_cb_arg; 3314 3315 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 3316 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3317 if (!ctx->super) { 3318 free(ctx); 3319 free(bs); 3320 return -ENOMEM; 3321 } 3322 3323 TAILQ_INIT(&bs->blobs); 3324 TAILQ_INIT(&bs->snapshots); 3325 bs->dev = dev; 3326 bs->md_thread = spdk_get_thread(); 3327 assert(bs->md_thread != NULL); 3328 3329 /* 3330 * Do not use bs_lba_to_cluster() here since blockcnt may not be an 3331 * even multiple of the cluster size. 
3332 */ 3333 bs->cluster_sz = opts->cluster_sz; 3334 bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); 3335 ctx->used_clusters = spdk_bit_array_create(bs->total_clusters); 3336 if (!ctx->used_clusters) { 3337 spdk_free(ctx->super); 3338 free(ctx); 3339 free(bs); 3340 return -ENOMEM; 3341 } 3342 3343 bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; 3344 if (spdk_u32_is_pow2(bs->pages_per_cluster)) { 3345 bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); 3346 } 3347 bs->num_free_clusters = bs->total_clusters; 3348 bs->io_unit_size = dev->blocklen; 3349 3350 bs->max_channel_ops = opts->max_channel_ops; 3351 bs->super_blob = SPDK_BLOBID_INVALID; 3352 memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); 3353 3354 /* The metadata is assumed to be at least 1 page */ 3355 bs->used_md_pages = spdk_bit_array_create(1); 3356 bs->used_blobids = spdk_bit_array_create(0); 3357 bs->open_blobids = spdk_bit_array_create(0); 3358 3359 pthread_mutex_init(&bs->used_clusters_mutex, NULL); 3360 3361 spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, 3362 sizeof(struct spdk_bs_channel), "blobstore"); 3363 rc = bs_register_md_thread(bs); 3364 if (rc == -1) { 3365 spdk_io_device_unregister(bs, NULL); 3366 pthread_mutex_destroy(&bs->used_clusters_mutex); 3367 spdk_bit_array_free(&bs->open_blobids); 3368 spdk_bit_array_free(&bs->used_blobids); 3369 spdk_bit_array_free(&bs->used_md_pages); 3370 spdk_bit_array_free(&ctx->used_clusters); 3371 spdk_free(ctx->super); 3372 free(ctx); 3373 free(bs); 3374 /* FIXME: this is a lie but don't know how to get a proper error code here */ 3375 return -ENOMEM; 3376 } 3377 3378 *_ctx = ctx; 3379 *_bs = bs; 3380 return 0; 3381 } 3382 3383 static void 3384 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) 3385 { 3386 assert(bserrno != 0); 3387 3388 spdk_free(ctx->super); 3389 bs_sequence_finish(ctx->seq, bserrno); 3390 bs_free(ctx->bs); 3391 spdk_bit_array_free(&ctx->used_clusters); 3392 free(ctx); 3393 } 3394 3395 static void 3396 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 3397 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) 3398 { 3399 /* Update the values in the super block */ 3400 super->super_blob = bs->super_blob; 3401 memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); 3402 super->crc = blob_md_page_calc_crc(super); 3403 bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), 3404 bs_byte_to_lba(bs, sizeof(*super)), 3405 cb_fn, cb_arg); 3406 } 3407 3408 static void 3409 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3410 { 3411 struct spdk_bs_load_ctx *ctx = arg; 3412 uint64_t mask_size, lba, lba_count; 3413 3414 /* Write out the used clusters mask */ 3415 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3416 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3417 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3418 if (!ctx->mask) { 3419 bs_load_ctx_fail(ctx, -ENOMEM); 3420 return; 3421 } 3422 3423 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; 3424 ctx->mask->length = ctx->bs->total_clusters; 3425 /* We could get here through the normal unload path, or through dirty 3426 * shutdown recovery. For the normal unload path, we use the mask from 3427 * the bit pool. For dirty shutdown recovery, we don't have a bit pool yet - 3428 * only the bit array from the load ctx. 
3429 */ 3430 if (ctx->bs->used_clusters) { 3431 assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters)); 3432 spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask); 3433 } else { 3434 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters)); 3435 spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask); 3436 } 3437 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3438 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3439 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3440 } 3441 3442 static void 3443 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3444 { 3445 struct spdk_bs_load_ctx *ctx = arg; 3446 uint64_t mask_size, lba, lba_count; 3447 3448 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3449 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3450 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3451 if (!ctx->mask) { 3452 bs_load_ctx_fail(ctx, -ENOMEM); 3453 return; 3454 } 3455 3456 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; 3457 ctx->mask->length = ctx->super->md_len; 3458 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); 3459 3460 spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3461 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3462 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3463 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3464 } 3465 3466 static void 3467 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3468 { 3469 struct spdk_bs_load_ctx *ctx = arg; 3470 uint64_t mask_size, lba, lba_count; 3471 3472 if (ctx->super->used_blobid_mask_len == 0) { 3473 /* 3474 * This is a pre-v3 on-disk format where the blobid mask does not get 3475 * written to disk. 
3476 */ 3477 cb_fn(seq, arg, 0); 3478 return; 3479 } 3480 3481 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3482 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3483 SPDK_MALLOC_DMA); 3484 if (!ctx->mask) { 3485 bs_load_ctx_fail(ctx, -ENOMEM); 3486 return; 3487 } 3488 3489 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS; 3490 ctx->mask->length = ctx->super->md_len; 3491 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); 3492 3493 spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask); 3494 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3495 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3496 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3497 } 3498 3499 static void 3500 blob_set_thin_provision(struct spdk_blob *blob) 3501 { 3502 blob_verify_md_op(blob); 3503 blob->invalid_flags |= SPDK_BLOB_THIN_PROV; 3504 blob->state = SPDK_BLOB_STATE_DIRTY; 3505 } 3506 3507 static void 3508 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method) 3509 { 3510 blob_verify_md_op(blob); 3511 blob->clear_method = clear_method; 3512 blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT); 3513 blob->state = SPDK_BLOB_STATE_DIRTY; 3514 } 3515 3516 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno); 3517 3518 static void 3519 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno) 3520 { 3521 struct spdk_bs_load_ctx *ctx = cb_arg; 3522 spdk_blob_id id; 3523 int64_t page_num; 3524 3525 /* Iterate to next blob (we can't use spdk_bs_iter_next function as our 3526 * last blob has been removed) */ 3527 page_num = bs_blobid_to_page(ctx->blobid); 3528 page_num++; 3529 page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num); 3530 if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) { 3531 bs_load_iter(ctx, NULL, -ENOENT); 3532 return; 3533 } 3534 3535 id = bs_page_to_blobid(page_num); 3536 3537 spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx); 3538 } 3539 3540 static void 3541 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno) 3542 { 3543 struct spdk_bs_load_ctx *ctx = cb_arg; 3544 3545 if (bserrno != 0) { 3546 SPDK_ERRLOG("Failed to close corrupted blob\n"); 3547 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3548 return; 3549 } 3550 3551 spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx); 3552 } 3553 3554 static void 3555 bs_delete_corrupted_blob(void *cb_arg, int bserrno) 3556 { 3557 struct spdk_bs_load_ctx *ctx = cb_arg; 3558 uint64_t i; 3559 3560 if (bserrno != 0) { 3561 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3562 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3563 return; 3564 } 3565 3566 /* Snapshot and clone have the same copy of cluster map and extent pages 3567 * at this point. Let's clear both for the snapshot now, 3568 * so that they won't be cleared for the clone later when we remove the snapshot.
3569 * Also set thin provision to pass data corruption check */ 3570 for (i = 0; i < ctx->blob->active.num_clusters; i++) { 3571 ctx->blob->active.clusters[i] = 0; 3572 } 3573 for (i = 0; i < ctx->blob->active.num_extent_pages; i++) { 3574 ctx->blob->active.extent_pages[i] = 0; 3575 } 3576 3577 ctx->blob->md_ro = false; 3578 3579 blob_set_thin_provision(ctx->blob); 3580 3581 ctx->blobid = ctx->blob->id; 3582 3583 spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx); 3584 } 3585 3586 static void 3587 bs_update_corrupted_blob(void *cb_arg, int bserrno) 3588 { 3589 struct spdk_bs_load_ctx *ctx = cb_arg; 3590 3591 if (bserrno != 0) { 3592 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3593 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3594 return; 3595 } 3596 3597 ctx->blob->md_ro = false; 3598 blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true); 3599 blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true); 3600 spdk_blob_set_read_only(ctx->blob); 3601 3602 if (ctx->iter_cb_fn) { 3603 ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0); 3604 } 3605 bs_blob_list_add(ctx->blob); 3606 3607 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3608 } 3609 3610 static void 3611 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno) 3612 { 3613 struct spdk_bs_load_ctx *ctx = cb_arg; 3614 3615 if (bserrno != 0) { 3616 SPDK_ERRLOG("Failed to open clone of a corrupted blob\n"); 3617 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3618 return; 3619 } 3620 3621 if (blob->parent_id == ctx->blob->id) { 3622 /* Power failure occurred before updating clone (snapshot delete case) 3623 * or after updating clone (creating snapshot case) - keep snapshot */ 3624 spdk_blob_close(blob, bs_update_corrupted_blob, ctx); 3625 } else { 3626 /* Power failure occurred after updating clone (snapshot delete case) 3627 * or before updating clone (creating snapshot case) - remove snapshot */ 3628 spdk_blob_close(blob, bs_delete_corrupted_blob, ctx); 3629 } 3630 } 3631 3632 static void 3633 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) 3634 { 3635 struct spdk_bs_load_ctx *ctx = arg; 3636 const void *value; 3637 size_t len; 3638 int rc = 0; 3639 3640 if (bserrno == 0) { 3641 /* Examine the blob to see if it was corrupted by a power failure. Fix 3642 * the ones that can be fixed and remove any other corrupted 3643 * ones. If it is not corrupted just process it */ 3644 rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true); 3645 if (rc != 0) { 3646 rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true); 3647 if (rc != 0) { 3648 /* Not corrupted - process it and continue with iterating through blobs */ 3649 if (ctx->iter_cb_fn) { 3650 ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0); 3651 } 3652 bs_blob_list_add(blob); 3653 spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx); 3654 return; 3655 } 3656 3657 } 3658 3659 assert(len == sizeof(spdk_blob_id)); 3660 3661 ctx->blob = blob; 3662 3663 /* Open the clone to check whether we are able to fix this blob or should remove it */ 3664 spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx); 3665 return; 3666 } else if (bserrno == -ENOENT) { 3667 bserrno = 0; 3668 } else { 3669 /* 3670 * This case needs to be looked at further. Same problem 3671 * exists with applications that rely on explicit blob 3672 * iteration. We should just skip the blob that failed 3673 * to load and continue on to the next one.
3674 */ 3675 SPDK_ERRLOG("Error in iterating blobs\n"); 3676 } 3677 3678 ctx->iter_cb_fn = NULL; 3679 3680 spdk_free(ctx->super); 3681 spdk_free(ctx->mask); 3682 bs_sequence_finish(ctx->seq, bserrno); 3683 free(ctx); 3684 } 3685 3686 static void 3687 bs_load_complete(struct spdk_bs_load_ctx *ctx) 3688 { 3689 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 3690 spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); 3691 } 3692 3693 static void 3694 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3695 { 3696 struct spdk_bs_load_ctx *ctx = cb_arg; 3697 int rc; 3698 3699 /* The type must be correct */ 3700 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); 3701 3702 /* The length of the mask (in bits) must not be greater than 3703 * the length of the buffer (converted to bits) */ 3704 assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); 3705 3706 /* The length of the mask must be exactly equal to the size 3707 * (in pages) of the metadata region */ 3708 assert(ctx->mask->length == ctx->super->md_len); 3709 3710 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length); 3711 if (rc < 0) { 3712 spdk_free(ctx->mask); 3713 bs_load_ctx_fail(ctx, rc); 3714 return; 3715 } 3716 3717 spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask); 3718 bs_load_complete(ctx); 3719 } 3720 3721 static void 3722 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3723 { 3724 struct spdk_bs_load_ctx *ctx = cb_arg; 3725 uint64_t lba, lba_count, mask_size; 3726 int rc; 3727 3728 if (bserrno != 0) { 3729 bs_load_ctx_fail(ctx, bserrno); 3730 return; 3731 } 3732 3733 /* The type must be correct */ 3734 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); 3735 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3736 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( 3737 struct spdk_blob_md_page) * 8)); 3738 /* The length of the mask must be exactly equal to the total number of clusters */ 3739 assert(ctx->mask->length == ctx->bs->total_clusters); 3740 3741 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length); 3742 if (rc < 0) { 3743 spdk_free(ctx->mask); 3744 bs_load_ctx_fail(ctx, rc); 3745 return; 3746 } 3747 3748 spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask); 3749 ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters); 3750 assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); 3751 3752 spdk_free(ctx->mask); 3753 3754 /* Read the used blobids mask */ 3755 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3756 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3757 SPDK_MALLOC_DMA); 3758 if (!ctx->mask) { 3759 bs_load_ctx_fail(ctx, -ENOMEM); 3760 return; 3761 } 3762 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3763 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3764 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3765 bs_load_used_blobids_cpl, ctx); 3766 } 3767 3768 static void 3769 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3770 { 3771 struct spdk_bs_load_ctx *ctx = cb_arg; 3772 uint64_t lba, lba_count, mask_size; 3773 int rc; 3774 3775 if (bserrno != 0) { 3776 bs_load_ctx_fail(ctx, bserrno); 3777 return; 3778 } 3779 3780 /* The type must be correct */ 3781 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); 
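/* The mask body is a bitmap with one bit per metadata page. After the length checks below it is loaded into bs->used_md_pages, and then the used-clusters mask is read in the same fashion. */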
3782 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3783 assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * 3784 8)); 3785 /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ 3786 if (ctx->mask->length != ctx->super->md_len) { 3787 SPDK_ERRLOG("mismatched md_len in used_pages mask: " 3788 "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n", 3789 ctx->mask->length, ctx->super->md_len); 3790 assert(false); 3791 } 3792 3793 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length); 3794 if (rc < 0) { 3795 spdk_free(ctx->mask); 3796 bs_load_ctx_fail(ctx, rc); 3797 return; 3798 } 3799 3800 spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3801 spdk_free(ctx->mask); 3802 3803 /* Read the used clusters mask */ 3804 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3805 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3806 SPDK_MALLOC_DMA); 3807 if (!ctx->mask) { 3808 bs_load_ctx_fail(ctx, -ENOMEM); 3809 return; 3810 } 3811 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3812 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3813 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3814 bs_load_used_clusters_cpl, ctx); 3815 } 3816 3817 static void 3818 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) 3819 { 3820 uint64_t lba, lba_count, mask_size; 3821 3822 /* Read the used pages mask */ 3823 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3824 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3825 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3826 if (!ctx->mask) { 3827 bs_load_ctx_fail(ctx, -ENOMEM); 3828 return; 3829 } 3830 3831 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3832 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3833 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, 3834 bs_load_used_pages_cpl, ctx); 3835 } 3836 3837 static int 3838 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) 3839 { 3840 struct spdk_blob_store *bs = ctx->bs; 3841 struct spdk_blob_md_descriptor *desc; 3842 size_t cur_desc = 0; 3843 3844 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 3845 while (cur_desc < sizeof(page->descriptors)) { 3846 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 3847 if (desc->length == 0) { 3848 /* If padding and length are 0, this terminates the page */ 3849 break; 3850 } 3851 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 3852 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 3853 unsigned int i, j; 3854 unsigned int cluster_count = 0; 3855 uint32_t cluster_idx; 3856 3857 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 3858 3859 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 3860 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 3861 cluster_idx = desc_extent_rle->extents[i].cluster_idx; 3862 /* 3863 * cluster_idx = 0 means an unallocated cluster - don't mark that 3864 * in the used cluster map. 
3865 */ 3866 if (cluster_idx != 0) { 3867 spdk_bit_array_set(ctx->used_clusters, cluster_idx + j); 3868 if (bs->num_free_clusters == 0) { 3869 return -ENOSPC; 3870 } 3871 bs->num_free_clusters--; 3872 } 3873 cluster_count++; 3874 } 3875 } 3876 if (cluster_count == 0) { 3877 return -EINVAL; 3878 } 3879 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 3880 struct spdk_blob_md_descriptor_extent_page *desc_extent; 3881 uint32_t i; 3882 uint32_t cluster_count = 0; 3883 uint32_t cluster_idx; 3884 size_t cluster_idx_length; 3885 3886 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 3887 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 3888 3889 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 3890 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 3891 return -EINVAL; 3892 } 3893 3894 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 3895 cluster_idx = desc_extent->cluster_idx[i]; 3896 /* 3897 * cluster_idx = 0 means an unallocated cluster - don't mark that 3898 * in the used cluster map. 3899 */ 3900 if (cluster_idx != 0) { 3901 if (cluster_idx < desc_extent->start_cluster_idx && 3902 cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { 3903 return -EINVAL; 3904 } 3905 spdk_bit_array_set(ctx->used_clusters, cluster_idx); 3906 if (bs->num_free_clusters == 0) { 3907 return -ENOSPC; 3908 } 3909 bs->num_free_clusters--; 3910 } 3911 cluster_count++; 3912 } 3913 3914 if (cluster_count == 0) { 3915 return -EINVAL; 3916 } 3917 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 3918 /* Skip this item */ 3919 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 3920 /* Skip this item */ 3921 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 3922 /* Skip this item */ 3923 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 3924 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 3925 uint32_t num_extent_pages = ctx->num_extent_pages; 3926 uint32_t i; 3927 size_t extent_pages_length; 3928 void *tmp; 3929 3930 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 3931 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 3932 3933 if (desc_extent_table->length == 0 || 3934 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 3935 return -EINVAL; 3936 } 3937 3938 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 3939 if (desc_extent_table->extent_page[i].page_idx != 0) { 3940 if (desc_extent_table->extent_page[i].num_pages != 1) { 3941 return -EINVAL; 3942 } 3943 num_extent_pages += 1; 3944 } 3945 } 3946 3947 if (num_extent_pages > 0) { 3948 tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); 3949 if (tmp == NULL) { 3950 return -ENOMEM; 3951 } 3952 ctx->extent_page_num = tmp; 3953 3954 /* Extent table entries contain md page numbers for extent pages. 3955 * Zeroes represent unallocated extent pages, those are run-length-encoded. 
3956 */ 3957 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 3958 if (desc_extent_table->extent_page[i].page_idx != 0) { 3959 ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; 3960 ctx->num_extent_pages += 1; 3961 } 3962 } 3963 } 3964 } else { 3965 /* Error */ 3966 return -EINVAL; 3967 } 3968 /* Advance to the next descriptor */ 3969 cur_desc += sizeof(*desc) + desc->length; 3970 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 3971 break; 3972 } 3973 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 3974 } 3975 return 0; 3976 } 3977 3978 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) 3979 { 3980 uint32_t crc; 3981 struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; 3982 size_t desc_len; 3983 3984 crc = blob_md_page_calc_crc(page); 3985 if (crc != page->crc) { 3986 return false; 3987 } 3988 3989 /* Extent page should always be of sequence num 0. */ 3990 if (page->sequence_num != 0) { 3991 return false; 3992 } 3993 3994 /* Descriptor type must be EXTENT_PAGE. */ 3995 if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 3996 return false; 3997 } 3998 3999 /* Descriptor length cannot exceed the page. */ 4000 desc_len = sizeof(*desc) + desc->length; 4001 if (desc_len > sizeof(page->descriptors)) { 4002 return false; 4003 } 4004 4005 /* It has to be the only descriptor in the page. */ 4006 if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { 4007 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); 4008 if (desc->length != 0) { 4009 return false; 4010 } 4011 } 4012 4013 return true; 4014 } 4015 4016 static bool bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) 4017 { 4018 uint32_t crc; 4019 struct spdk_blob_md_page *page = ctx->page; 4020 4021 crc = blob_md_page_calc_crc(page); 4022 if (crc != page->crc) { 4023 return false; 4024 } 4025 4026 /* First page of a sequence should match the blobid. 
*/ 4027 if (page->sequence_num == 0 && 4028 bs_page_to_blobid(ctx->cur_page) != page->id) { 4029 return false; 4030 } 4031 assert(bs_load_cur_extent_page_valid(page) == false); 4032 4033 return true; 4034 } 4035 4036 static void 4037 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); 4038 4039 static void 4040 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4041 { 4042 struct spdk_bs_load_ctx *ctx = cb_arg; 4043 4044 if (bserrno != 0) { 4045 bs_load_ctx_fail(ctx, bserrno); 4046 return; 4047 } 4048 4049 bs_load_complete(ctx); 4050 } 4051 4052 static void 4053 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4054 { 4055 struct spdk_bs_load_ctx *ctx = cb_arg; 4056 4057 spdk_free(ctx->mask); 4058 ctx->mask = NULL; 4059 4060 if (bserrno != 0) { 4061 bs_load_ctx_fail(ctx, bserrno); 4062 return; 4063 } 4064 4065 bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); 4066 } 4067 4068 static void 4069 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4070 { 4071 struct spdk_bs_load_ctx *ctx = cb_arg; 4072 4073 spdk_free(ctx->mask); 4074 ctx->mask = NULL; 4075 4076 if (bserrno != 0) { 4077 bs_load_ctx_fail(ctx, bserrno); 4078 return; 4079 } 4080 4081 bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); 4082 } 4083 4084 static void 4085 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) 4086 { 4087 bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); 4088 } 4089 4090 static void 4091 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) 4092 { 4093 uint64_t num_md_clusters; 4094 uint64_t i; 4095 4096 ctx->in_page_chain = false; 4097 4098 do { 4099 ctx->page_index++; 4100 } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); 4101 4102 if (ctx->page_index < ctx->super->md_len) { 4103 ctx->cur_page = ctx->page_index; 4104 bs_load_replay_cur_md_page(ctx); 4105 } else { 4106 /* Claim all of the clusters used by the metadata */ 4107 num_md_clusters = spdk_divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster); 4108 for (i = 0; i < num_md_clusters; i++) { 4109 spdk_bit_array_set(ctx->used_clusters, i); 4110 } 4111 ctx->bs->num_free_clusters -= num_md_clusters; 4112 spdk_free(ctx->page); 4113 bs_load_write_used_md(ctx); 4114 } 4115 } 4116 4117 static void 4118 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4119 { 4120 struct spdk_bs_load_ctx *ctx = cb_arg; 4121 uint32_t page_num; 4122 uint64_t i; 4123 4124 if (bserrno != 0) { 4125 spdk_free(ctx->extent_pages); 4126 bs_load_ctx_fail(ctx, bserrno); 4127 return; 4128 } 4129 4130 for (i = 0; i < ctx->num_extent_pages; i++) { 4131 /* Extent pages are only read when present within in chain md. 4132 * Integrity of md is not right if that page was not a valid extent page. 
*/ 4133 if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { 4134 spdk_free(ctx->extent_pages); 4135 bs_load_ctx_fail(ctx, -EILSEQ); 4136 return; 4137 } 4138 4139 page_num = ctx->extent_page_num[i]; 4140 spdk_bit_array_set(ctx->bs->used_md_pages, page_num); 4141 if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { 4142 spdk_free(ctx->extent_pages); 4143 bs_load_ctx_fail(ctx, -EILSEQ); 4144 return; 4145 } 4146 } 4147 4148 spdk_free(ctx->extent_pages); 4149 free(ctx->extent_page_num); 4150 ctx->extent_page_num = NULL; 4151 ctx->num_extent_pages = 0; 4152 4153 bs_load_replay_md_chain_cpl(ctx); 4154 } 4155 4156 static void 4157 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) 4158 { 4159 spdk_bs_batch_t *batch; 4160 uint32_t page; 4161 uint64_t lba; 4162 uint64_t i; 4163 4164 ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0, 4165 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4166 if (!ctx->extent_pages) { 4167 bs_load_ctx_fail(ctx, -ENOMEM); 4168 return; 4169 } 4170 4171 batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); 4172 4173 for (i = 0; i < ctx->num_extent_pages; i++) { 4174 page = ctx->extent_page_num[i]; 4175 assert(page < ctx->super->md_len); 4176 lba = bs_md_page_to_lba(ctx->bs, page); 4177 bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, 4178 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); 4179 } 4180 4181 bs_batch_close(batch); 4182 } 4183 4184 static void 4185 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4186 { 4187 struct spdk_bs_load_ctx *ctx = cb_arg; 4188 uint32_t page_num; 4189 struct spdk_blob_md_page *page; 4190 4191 if (bserrno != 0) { 4192 bs_load_ctx_fail(ctx, bserrno); 4193 return; 4194 } 4195 4196 page_num = ctx->cur_page; 4197 page = ctx->page; 4198 if (bs_load_cur_md_page_valid(ctx) == true) { 4199 if (page->sequence_num == 0 || ctx->in_page_chain == true) { 4200 bs_claim_md_page(ctx->bs, page_num); 4201 if (page->sequence_num == 0) { 4202 spdk_bit_array_set(ctx->bs->used_blobids, page_num); 4203 } 4204 if (bs_load_replay_md_parse_page(ctx, page)) { 4205 bs_load_ctx_fail(ctx, -EILSEQ); 4206 return; 4207 } 4208 if (page->next != SPDK_INVALID_MD_PAGE) { 4209 ctx->in_page_chain = true; 4210 ctx->cur_page = page->next; 4211 bs_load_replay_cur_md_page(ctx); 4212 return; 4213 } 4214 if (ctx->num_extent_pages != 0) { 4215 bs_load_replay_extent_pages(ctx); 4216 return; 4217 } 4218 } 4219 } 4220 bs_load_replay_md_chain_cpl(ctx); 4221 } 4222 4223 static void 4224 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) 4225 { 4226 uint64_t lba; 4227 4228 assert(ctx->cur_page < ctx->super->md_len); 4229 lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); 4230 bs_sequence_read_dev(ctx->seq, ctx->page, lba, 4231 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4232 bs_load_replay_md_cpl, ctx); 4233 } 4234 4235 static void 4236 bs_load_replay_md(struct spdk_bs_load_ctx *ctx) 4237 { 4238 ctx->page_index = 0; 4239 ctx->cur_page = 0; 4240 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4241 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4242 if (!ctx->page) { 4243 bs_load_ctx_fail(ctx, -ENOMEM); 4244 return; 4245 } 4246 bs_load_replay_cur_md_page(ctx); 4247 } 4248 4249 static void 4250 bs_recover(struct spdk_bs_load_ctx *ctx) 4251 { 4252 int rc; 4253 4254 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); 4255 if (rc < 0) { 4256 bs_load_ctx_fail(ctx, -ENOMEM); 4257 return; 4258 } 4259 4260 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, 
ctx->super->md_len); 4261 if (rc < 0) { 4262 bs_load_ctx_fail(ctx, -ENOMEM); 4263 return; 4264 } 4265 4266 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4267 if (rc < 0) { 4268 bs_load_ctx_fail(ctx, -ENOMEM); 4269 return; 4270 } 4271 4272 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); 4273 if (rc < 0) { 4274 bs_load_ctx_fail(ctx, -ENOMEM); 4275 return; 4276 } 4277 4278 ctx->bs->num_free_clusters = ctx->bs->total_clusters; 4279 bs_load_replay_md(ctx); 4280 } 4281 4282 static void 4283 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4284 { 4285 struct spdk_bs_load_ctx *ctx = cb_arg; 4286 uint32_t crc; 4287 int rc; 4288 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; 4289 4290 if (ctx->super->version > SPDK_BS_VERSION || 4291 ctx->super->version < SPDK_BS_INITIAL_VERSION) { 4292 bs_load_ctx_fail(ctx, -EILSEQ); 4293 return; 4294 } 4295 4296 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4297 sizeof(ctx->super->signature)) != 0) { 4298 bs_load_ctx_fail(ctx, -EILSEQ); 4299 return; 4300 } 4301 4302 crc = blob_md_page_calc_crc(ctx->super); 4303 if (crc != ctx->super->crc) { 4304 bs_load_ctx_fail(ctx, -EILSEQ); 4305 return; 4306 } 4307 4308 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4309 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n"); 4310 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4311 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n"); 4312 } else { 4313 SPDK_DEBUGLOG(blob, "Unexpected bstype\n"); 4314 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4315 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4316 bs_load_ctx_fail(ctx, -ENXIO); 4317 return; 4318 } 4319 4320 if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { 4321 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n", 4322 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); 4323 bs_load_ctx_fail(ctx, -EILSEQ); 4324 return; 4325 } 4326 4327 if (ctx->super->size == 0) { 4328 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 4329 } 4330 4331 if (ctx->super->io_unit_size == 0) { 4332 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 4333 } 4334 4335 /* Parse the super block */ 4336 ctx->bs->clean = 1; 4337 ctx->bs->cluster_sz = ctx->super->cluster_size; 4338 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 4339 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 4340 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 4341 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 4342 } 4343 ctx->bs->io_unit_size = ctx->super->io_unit_size; 4344 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4345 if (rc < 0) { 4346 bs_load_ctx_fail(ctx, -ENOMEM); 4347 return; 4348 } 4349 ctx->bs->md_start = ctx->super->md_start; 4350 ctx->bs->md_len = ctx->super->md_len; 4351 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 4352 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 4353 ctx->bs->super_blob = ctx->super->super_blob; 4354 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 4355 4356 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) { 4357 bs_recover(ctx); 4358 } else { 4359 bs_load_read_used_pages(ctx); 4360 } 
4361 } 4362 4363 static inline int 4364 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst) 4365 { 4366 4367 if (!src->opts_size) { 4368 SPDK_ERRLOG("opts_size should not be zero value\n"); 4369 return -1; 4370 } 4371 4372 #define FIELD_OK(field) \ 4373 offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size 4374 4375 #define SET_FIELD(field) \ 4376 if (FIELD_OK(field)) { \ 4377 dst->field = src->field; \ 4378 } \ 4379 4380 SET_FIELD(cluster_sz); 4381 SET_FIELD(num_md_pages); 4382 SET_FIELD(max_md_ops); 4383 SET_FIELD(max_channel_ops); 4384 SET_FIELD(clear_method); 4385 4386 if (FIELD_OK(bstype)) { 4387 memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype)); 4388 } 4389 SET_FIELD(iter_cb_fn); 4390 SET_FIELD(iter_cb_arg); 4391 4392 dst->opts_size = src->opts_size; 4393 4394 /* You should not remove this statement, but need to update the assert statement 4395 * if you add a new field, and also add a corresponding SET_FIELD statement */ 4396 SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 64, "Incorrect size"); 4397 4398 #undef FIELD_OK 4399 #undef SET_FIELD 4400 4401 return 0; 4402 } 4403 4404 void 4405 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4406 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4407 { 4408 struct spdk_blob_store *bs; 4409 struct spdk_bs_cpl cpl; 4410 struct spdk_bs_load_ctx *ctx; 4411 struct spdk_bs_opts opts = {}; 4412 int err; 4413 4414 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev); 4415 4416 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4417 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen); 4418 dev->destroy(dev); 4419 cb_fn(cb_arg, NULL, -EINVAL); 4420 return; 4421 } 4422 4423 spdk_bs_opts_init(&opts, sizeof(opts)); 4424 if (o) { 4425 if (bs_opts_copy(o, &opts)) { 4426 return; 4427 } 4428 } 4429 4430 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { 4431 dev->destroy(dev); 4432 cb_fn(cb_arg, NULL, -EINVAL); 4433 return; 4434 } 4435 4436 err = bs_alloc(dev, &opts, &bs, &ctx); 4437 if (err) { 4438 dev->destroy(dev); 4439 cb_fn(cb_arg, NULL, err); 4440 return; 4441 } 4442 4443 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4444 cpl.u.bs_handle.cb_fn = cb_fn; 4445 cpl.u.bs_handle.cb_arg = cb_arg; 4446 cpl.u.bs_handle.bs = bs; 4447 4448 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4449 if (!ctx->seq) { 4450 spdk_free(ctx->super); 4451 free(ctx); 4452 bs_free(bs); 4453 cb_fn(cb_arg, NULL, -ENOMEM); 4454 return; 4455 } 4456 4457 /* Read the super block */ 4458 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4459 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4460 bs_load_super_cpl, ctx); 4461 } 4462 4463 /* END spdk_bs_load */ 4464 4465 /* START spdk_bs_dump */ 4466 4467 static void 4468 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) 4469 { 4470 spdk_free(ctx->super); 4471 4472 /* 4473 * We need to defer calling bs_call_cpl() until after 4474 * dev destruction, so tuck these away for later use. 
4475 */ 4476 ctx->bs->unload_err = bserrno; 4477 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4478 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4479 4480 bs_sequence_finish(seq, 0); 4481 bs_free(ctx->bs); 4482 free(ctx); 4483 } 4484 4485 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); 4486 4487 static void 4488 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx) 4489 { 4490 uint32_t page_idx = ctx->cur_page; 4491 struct spdk_blob_md_page *page = ctx->page; 4492 struct spdk_blob_md_descriptor *desc; 4493 size_t cur_desc = 0; 4494 uint32_t crc; 4495 4496 fprintf(ctx->fp, "=========\n"); 4497 fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); 4498 fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); 4499 4500 crc = blob_md_page_calc_crc(page); 4501 fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch"); 4502 4503 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4504 while (cur_desc < sizeof(page->descriptors)) { 4505 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4506 if (desc->length == 0) { 4507 /* If padding and length are 0, this terminates the page */ 4508 break; 4509 } 4510 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4511 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4512 unsigned int i; 4513 4514 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4515 4516 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4517 if (desc_extent_rle->extents[i].cluster_idx != 0) { 4518 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4519 desc_extent_rle->extents[i].cluster_idx); 4520 } else { 4521 fprintf(ctx->fp, "Unallocated Extent - "); 4522 } 4523 fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); 4524 fprintf(ctx->fp, "\n"); 4525 } 4526 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4527 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4528 unsigned int i; 4529 4530 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4531 4532 for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { 4533 if (desc_extent->cluster_idx[i] != 0) { 4534 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4535 desc_extent->cluster_idx[i]); 4536 } else { 4537 fprintf(ctx->fp, "Unallocated Extent"); 4538 } 4539 fprintf(ctx->fp, "\n"); 4540 } 4541 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4542 struct spdk_blob_md_descriptor_xattr *desc_xattr; 4543 uint32_t i; 4544 4545 desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; 4546 4547 if (desc_xattr->length != 4548 sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + 4549 desc_xattr->name_length + desc_xattr->value_length) { 4550 } 4551 4552 memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); 4553 ctx->xattr_name[desc_xattr->name_length] = '\0'; 4554 fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name); 4555 fprintf(ctx->fp, " value = \""); 4556 ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, 4557 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 4558 desc_xattr->value_length); 4559 fprintf(ctx->fp, "\"\n"); 4560 for (i = 0; i < desc_xattr->value_length; i++) { 4561 if (i % 16 == 0) { 4562 fprintf(ctx->fp, " "); 4563 } 4564 fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); 4565 if ((i + 1) % 16 == 0) { 4566 
fprintf(ctx->fp, "\n"); 4567 } 4568 } 4569 if (i % 16 != 0) { 4570 fprintf(ctx->fp, "\n"); 4571 } 4572 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4573 /* TODO */ 4574 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4575 /* TODO */ 4576 } else { 4577 /* Error */ 4578 } 4579 /* Advance to the next descriptor */ 4580 cur_desc += sizeof(*desc) + desc->length; 4581 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4582 break; 4583 } 4584 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4585 } 4586 } 4587 4588 static void 4589 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4590 { 4591 struct spdk_bs_load_ctx *ctx = cb_arg; 4592 4593 if (bserrno != 0) { 4594 bs_dump_finish(seq, ctx, bserrno); 4595 return; 4596 } 4597 4598 if (ctx->page->id != 0) { 4599 bs_dump_print_md_page(ctx); 4600 } 4601 4602 ctx->cur_page++; 4603 4604 if (ctx->cur_page < ctx->super->md_len) { 4605 bs_dump_read_md_page(seq, ctx); 4606 } else { 4607 spdk_free(ctx->page); 4608 bs_dump_finish(seq, ctx, 0); 4609 } 4610 } 4611 4612 static void 4613 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) 4614 { 4615 struct spdk_bs_load_ctx *ctx = cb_arg; 4616 uint64_t lba; 4617 4618 assert(ctx->cur_page < ctx->super->md_len); 4619 lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); 4620 bs_sequence_read_dev(seq, ctx->page, lba, 4621 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4622 bs_dump_read_md_page_cpl, ctx); 4623 } 4624 4625 static void 4626 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4627 { 4628 struct spdk_bs_load_ctx *ctx = cb_arg; 4629 4630 fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature); 4631 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4632 sizeof(ctx->super->signature)) != 0) { 4633 fprintf(ctx->fp, "(Mismatch)\n"); 4634 bs_dump_finish(seq, ctx, bserrno); 4635 return; 4636 } else { 4637 fprintf(ctx->fp, "(OK)\n"); 4638 } 4639 fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); 4640 fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, 4641 (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? 
"OK" : "Mismatch"); 4642 fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); 4643 fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); 4644 fprintf(ctx->fp, "Super Blob ID: "); 4645 if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { 4646 fprintf(ctx->fp, "(None)\n"); 4647 } else { 4648 fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob); 4649 } 4650 fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); 4651 fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); 4652 fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); 4653 fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); 4654 fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); 4655 fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); 4656 fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); 4657 fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); 4658 fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); 4659 4660 ctx->cur_page = 0; 4661 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4662 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4663 if (!ctx->page) { 4664 bs_dump_finish(seq, ctx, -ENOMEM); 4665 return; 4666 } 4667 bs_dump_read_md_page(seq, ctx); 4668 } 4669 4670 void 4671 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, 4672 spdk_bs_op_complete cb_fn, void *cb_arg) 4673 { 4674 struct spdk_blob_store *bs; 4675 struct spdk_bs_cpl cpl; 4676 spdk_bs_sequence_t *seq; 4677 struct spdk_bs_load_ctx *ctx; 4678 struct spdk_bs_opts opts = {}; 4679 int err; 4680 4681 SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev); 4682 4683 spdk_bs_opts_init(&opts, sizeof(opts)); 4684 4685 err = bs_alloc(dev, &opts, &bs, &ctx); 4686 if (err) { 4687 dev->destroy(dev); 4688 cb_fn(cb_arg, err); 4689 return; 4690 } 4691 4692 ctx->fp = fp; 4693 ctx->print_xattr_fn = print_xattr_fn; 4694 4695 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 4696 cpl.u.bs_basic.cb_fn = cb_fn; 4697 cpl.u.bs_basic.cb_arg = cb_arg; 4698 4699 seq = bs_sequence_start(bs->md_channel, &cpl); 4700 if (!seq) { 4701 spdk_free(ctx->super); 4702 free(ctx); 4703 bs_free(bs); 4704 cb_fn(cb_arg, -ENOMEM); 4705 return; 4706 } 4707 4708 /* Read the super block */ 4709 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 4710 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4711 bs_dump_super_cpl, ctx); 4712 } 4713 4714 /* END spdk_bs_dump */ 4715 4716 /* START spdk_bs_init */ 4717 4718 static void 4719 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4720 { 4721 struct spdk_bs_load_ctx *ctx = cb_arg; 4722 4723 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 4724 spdk_free(ctx->super); 4725 free(ctx); 4726 4727 bs_sequence_finish(seq, bserrno); 4728 } 4729 4730 static void 4731 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4732 { 4733 struct spdk_bs_load_ctx *ctx = cb_arg; 4734 4735 /* Write super block */ 4736 bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), 4737 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), 4738 bs_init_persist_super_cpl, ctx); 4739 } 4740 4741 void 4742 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4743 spdk_bs_op_with_handle_complete cb_fn, 
void *cb_arg) 4744 { 4745 struct spdk_bs_load_ctx *ctx; 4746 struct spdk_blob_store *bs; 4747 struct spdk_bs_cpl cpl; 4748 spdk_bs_sequence_t *seq; 4749 spdk_bs_batch_t *batch; 4750 uint64_t num_md_lba; 4751 uint64_t num_md_pages; 4752 uint64_t num_md_clusters; 4753 uint32_t i; 4754 struct spdk_bs_opts opts = {}; 4755 int rc; 4756 uint64_t lba, lba_count; 4757 4758 SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev); 4759 4760 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4761 SPDK_ERRLOG("unsupported dev block length of %d\n", 4762 dev->blocklen); 4763 dev->destroy(dev); 4764 cb_fn(cb_arg, NULL, -EINVAL); 4765 return; 4766 } 4767 4768 spdk_bs_opts_init(&opts, sizeof(opts)); 4769 if (o) { 4770 if (bs_opts_copy(o, &opts)) { 4771 return; 4772 } 4773 } 4774 4775 if (bs_opts_verify(&opts) != 0) { 4776 dev->destroy(dev); 4777 cb_fn(cb_arg, NULL, -EINVAL); 4778 return; 4779 } 4780 4781 rc = bs_alloc(dev, &opts, &bs, &ctx); 4782 if (rc) { 4783 dev->destroy(dev); 4784 cb_fn(cb_arg, NULL, rc); 4785 return; 4786 } 4787 4788 if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { 4789 /* By default, allocate 1 page per cluster. 4790 * Technically, this over-allocates metadata 4791 * because more metadata will reduce the number 4792 * of usable clusters. This can be addressed with 4793 * more complex math in the future. 4794 */ 4795 bs->md_len = bs->total_clusters; 4796 } else { 4797 bs->md_len = opts.num_md_pages; 4798 } 4799 rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); 4800 if (rc < 0) { 4801 spdk_free(ctx->super); 4802 free(ctx); 4803 bs_free(bs); 4804 cb_fn(cb_arg, NULL, -ENOMEM); 4805 return; 4806 } 4807 4808 rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); 4809 if (rc < 0) { 4810 spdk_free(ctx->super); 4811 free(ctx); 4812 bs_free(bs); 4813 cb_fn(cb_arg, NULL, -ENOMEM); 4814 return; 4815 } 4816 4817 rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); 4818 if (rc < 0) { 4819 spdk_free(ctx->super); 4820 free(ctx); 4821 bs_free(bs); 4822 cb_fn(cb_arg, NULL, -ENOMEM); 4823 return; 4824 } 4825 4826 memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4827 sizeof(ctx->super->signature)); 4828 ctx->super->version = SPDK_BS_VERSION; 4829 ctx->super->length = sizeof(*ctx->super); 4830 ctx->super->super_blob = bs->super_blob; 4831 ctx->super->clean = 0; 4832 ctx->super->cluster_size = bs->cluster_sz; 4833 ctx->super->io_unit_size = bs->io_unit_size; 4834 memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); 4835 4836 /* Calculate how many pages the metadata consumes at the front 4837 * of the disk. 4838 */ 4839 4840 /* The super block uses 1 page */ 4841 num_md_pages = 1; 4842 4843 /* The used_md_pages mask requires 1 bit per metadata page, rounded 4844 * up to the nearest page, plus a header. 4845 */ 4846 ctx->super->used_page_mask_start = num_md_pages; 4847 ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 4848 spdk_divide_round_up(bs->md_len, 8), 4849 SPDK_BS_PAGE_SIZE); 4850 num_md_pages += ctx->super->used_page_mask_len; 4851 4852 /* The used_clusters mask requires 1 bit per cluster, rounded 4853 * up to the nearest page, plus a header. 
4854 */ 4855 ctx->super->used_cluster_mask_start = num_md_pages; 4856 ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 4857 spdk_divide_round_up(bs->total_clusters, 8), 4858 SPDK_BS_PAGE_SIZE); 4859 num_md_pages += ctx->super->used_cluster_mask_len; 4860 4861 /* The used_blobids mask requires 1 bit per metadata page, rounded 4862 * up to the nearest page, plus a header. 4863 */ 4864 ctx->super->used_blobid_mask_start = num_md_pages; 4865 ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 4866 spdk_divide_round_up(bs->md_len, 8), 4867 SPDK_BS_PAGE_SIZE); 4868 num_md_pages += ctx->super->used_blobid_mask_len; 4869 4870 /* The metadata region size was chosen above */ 4871 ctx->super->md_start = bs->md_start = num_md_pages; 4872 ctx->super->md_len = bs->md_len; 4873 num_md_pages += bs->md_len; 4874 4875 num_md_lba = bs_page_to_lba(bs, num_md_pages); 4876 4877 ctx->super->size = dev->blockcnt * dev->blocklen; 4878 4879 ctx->super->crc = blob_md_page_calc_crc(ctx->super); 4880 4881 num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); 4882 if (num_md_clusters > bs->total_clusters) { 4883 SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " 4884 "please decrease number of pages reserved for metadata " 4885 "or increase cluster size.\n"); 4886 spdk_free(ctx->super); 4887 spdk_bit_array_free(&ctx->used_clusters); 4888 free(ctx); 4889 bs_free(bs); 4890 cb_fn(cb_arg, NULL, -ENOMEM); 4891 return; 4892 } 4893 /* Claim all of the clusters used by the metadata */ 4894 for (i = 0; i < num_md_clusters; i++) { 4895 spdk_bit_array_set(ctx->used_clusters, i); 4896 } 4897 4898 bs->num_free_clusters -= num_md_clusters; 4899 bs->total_data_clusters = bs->num_free_clusters; 4900 4901 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4902 cpl.u.bs_handle.cb_fn = cb_fn; 4903 cpl.u.bs_handle.cb_arg = cb_arg; 4904 cpl.u.bs_handle.bs = bs; 4905 4906 seq = bs_sequence_start(bs->md_channel, &cpl); 4907 if (!seq) { 4908 spdk_free(ctx->super); 4909 free(ctx); 4910 bs_free(bs); 4911 cb_fn(cb_arg, NULL, -ENOMEM); 4912 return; 4913 } 4914 4915 batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); 4916 4917 /* Clear metadata space */ 4918 bs_batch_write_zeroes_dev(batch, 0, num_md_lba); 4919 4920 lba = num_md_lba; 4921 while (lba < ctx->bs->dev->blockcnt) { 4922 lba_count = spdk_min(UINT32_MAX, ctx->bs->dev->blockcnt - lba); 4923 switch (opts.clear_method) { 4924 case BS_CLEAR_WITH_UNMAP: 4925 /* Trim data clusters */ 4926 bs_batch_unmap_dev(batch, lba, lba_count); 4927 break; 4928 case BS_CLEAR_WITH_WRITE_ZEROES: 4929 /* Write_zeroes to data clusters */ 4930 bs_batch_write_zeroes_dev(batch, lba, lba_count); 4931 break; 4932 case BS_CLEAR_WITH_NONE: 4933 default: 4934 break; 4935 } 4936 lba += lba_count; 4937 } 4938 4939 bs_batch_close(batch); 4940 } 4941 4942 /* END spdk_bs_init */ 4943 4944 /* START spdk_bs_destroy */ 4945 4946 static void 4947 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4948 { 4949 struct spdk_bs_load_ctx *ctx = cb_arg; 4950 struct spdk_blob_store *bs = ctx->bs; 4951 4952 /* 4953 * We need to defer calling bs_call_cpl() until after 4954 * dev destruction, so tuck these away for later use. 
4955 */ 4956 bs->unload_err = bserrno; 4957 memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4958 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4959 4960 bs_sequence_finish(seq, bserrno); 4961 4962 bs_free(bs); 4963 free(ctx); 4964 } 4965 4966 void 4967 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, 4968 void *cb_arg) 4969 { 4970 struct spdk_bs_cpl cpl; 4971 spdk_bs_sequence_t *seq; 4972 struct spdk_bs_load_ctx *ctx; 4973 4974 SPDK_DEBUGLOG(blob, "Destroying blobstore\n"); 4975 4976 if (!TAILQ_EMPTY(&bs->blobs)) { 4977 SPDK_ERRLOG("Blobstore still has open blobs\n"); 4978 cb_fn(cb_arg, -EBUSY); 4979 return; 4980 } 4981 4982 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 4983 cpl.u.bs_basic.cb_fn = cb_fn; 4984 cpl.u.bs_basic.cb_arg = cb_arg; 4985 4986 ctx = calloc(1, sizeof(*ctx)); 4987 if (!ctx) { 4988 cb_fn(cb_arg, -ENOMEM); 4989 return; 4990 } 4991 4992 ctx->bs = bs; 4993 4994 seq = bs_sequence_start(bs->md_channel, &cpl); 4995 if (!seq) { 4996 free(ctx); 4997 cb_fn(cb_arg, -ENOMEM); 4998 return; 4999 } 5000 5001 /* Write zeroes to the super block */ 5002 bs_sequence_write_zeroes_dev(seq, 5003 bs_page_to_lba(bs, 0), 5004 bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), 5005 bs_destroy_trim_cpl, ctx); 5006 } 5007 5008 /* END spdk_bs_destroy */ 5009 5010 /* START spdk_bs_unload */ 5011 5012 static void 5013 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) 5014 { 5015 spdk_bs_sequence_t *seq = ctx->seq; 5016 5017 spdk_free(ctx->super); 5018 5019 /* 5020 * We need to defer calling bs_call_cpl() until after 5021 * dev destruction, so tuck these away for later use. 5022 */ 5023 ctx->bs->unload_err = bserrno; 5024 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5025 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5026 5027 bs_sequence_finish(seq, bserrno); 5028 5029 bs_free(ctx->bs); 5030 free(ctx); 5031 } 5032 5033 static void 5034 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5035 { 5036 struct spdk_bs_load_ctx *ctx = cb_arg; 5037 5038 bs_unload_finish(ctx, bserrno); 5039 } 5040 5041 static void 5042 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5043 { 5044 struct spdk_bs_load_ctx *ctx = cb_arg; 5045 5046 spdk_free(ctx->mask); 5047 5048 if (bserrno != 0) { 5049 bs_unload_finish(ctx, bserrno); 5050 return; 5051 } 5052 5053 ctx->super->clean = 1; 5054 5055 bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); 5056 } 5057 5058 static void 5059 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5060 { 5061 struct spdk_bs_load_ctx *ctx = cb_arg; 5062 5063 spdk_free(ctx->mask); 5064 ctx->mask = NULL; 5065 5066 if (bserrno != 0) { 5067 bs_unload_finish(ctx, bserrno); 5068 return; 5069 } 5070 5071 bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); 5072 } 5073 5074 static void 5075 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5076 { 5077 struct spdk_bs_load_ctx *ctx = cb_arg; 5078 5079 spdk_free(ctx->mask); 5080 ctx->mask = NULL; 5081 5082 if (bserrno != 0) { 5083 bs_unload_finish(ctx, bserrno); 5084 return; 5085 } 5086 5087 bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); 5088 } 5089 5090 static void 5091 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5092 { 5093 struct spdk_bs_load_ctx *ctx = cb_arg; 5094 5095 if (bserrno != 0) { 5096 bs_unload_finish(ctx, bserrno); 5097 return; 5098 } 5099 5100 bs_write_used_md(seq, 
cb_arg, bs_unload_write_used_pages_cpl); 5101 } 5102 5103 void 5104 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) 5105 { 5106 struct spdk_bs_cpl cpl; 5107 struct spdk_bs_load_ctx *ctx; 5108 5109 SPDK_DEBUGLOG(blob, "Syncing blobstore\n"); 5110 5111 if (!TAILQ_EMPTY(&bs->blobs)) { 5112 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5113 cb_fn(cb_arg, -EBUSY); 5114 return; 5115 } 5116 5117 ctx = calloc(1, sizeof(*ctx)); 5118 if (!ctx) { 5119 cb_fn(cb_arg, -ENOMEM); 5120 return; 5121 } 5122 5123 ctx->bs = bs; 5124 5125 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5126 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5127 if (!ctx->super) { 5128 free(ctx); 5129 cb_fn(cb_arg, -ENOMEM); 5130 return; 5131 } 5132 5133 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5134 cpl.u.bs_basic.cb_fn = cb_fn; 5135 cpl.u.bs_basic.cb_arg = cb_arg; 5136 5137 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5138 if (!ctx->seq) { 5139 spdk_free(ctx->super); 5140 free(ctx); 5141 cb_fn(cb_arg, -ENOMEM); 5142 return; 5143 } 5144 5145 /* Read super block */ 5146 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5147 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5148 bs_unload_read_super_cpl, ctx); 5149 } 5150 5151 /* END spdk_bs_unload */ 5152 5153 /* START spdk_bs_set_super */ 5154 5155 struct spdk_bs_set_super_ctx { 5156 struct spdk_blob_store *bs; 5157 struct spdk_bs_super_block *super; 5158 }; 5159 5160 static void 5161 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5162 { 5163 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5164 5165 if (bserrno != 0) { 5166 SPDK_ERRLOG("Unable to write to super block of blobstore\n"); 5167 } 5168 5169 spdk_free(ctx->super); 5170 5171 bs_sequence_finish(seq, bserrno); 5172 5173 free(ctx); 5174 } 5175 5176 static void 5177 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5178 { 5179 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5180 5181 if (bserrno != 0) { 5182 SPDK_ERRLOG("Unable to read super block of blobstore\n"); 5183 spdk_free(ctx->super); 5184 bs_sequence_finish(seq, bserrno); 5185 free(ctx); 5186 return; 5187 } 5188 5189 bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); 5190 } 5191 5192 void 5193 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, 5194 spdk_bs_op_complete cb_fn, void *cb_arg) 5195 { 5196 struct spdk_bs_cpl cpl; 5197 spdk_bs_sequence_t *seq; 5198 struct spdk_bs_set_super_ctx *ctx; 5199 5200 SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n"); 5201 5202 ctx = calloc(1, sizeof(*ctx)); 5203 if (!ctx) { 5204 cb_fn(cb_arg, -ENOMEM); 5205 return; 5206 } 5207 5208 ctx->bs = bs; 5209 5210 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5211 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5212 if (!ctx->super) { 5213 free(ctx); 5214 cb_fn(cb_arg, -ENOMEM); 5215 return; 5216 } 5217 5218 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5219 cpl.u.bs_basic.cb_fn = cb_fn; 5220 cpl.u.bs_basic.cb_arg = cb_arg; 5221 5222 seq = bs_sequence_start(bs->md_channel, &cpl); 5223 if (!seq) { 5224 spdk_free(ctx->super); 5225 free(ctx); 5226 cb_fn(cb_arg, -ENOMEM); 5227 return; 5228 } 5229 5230 bs->super_blob = blobid; 5231 5232 /* Read super block */ 5233 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 5234 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5235 bs_set_super_read_cpl, ctx); 5236 } 5237 5238 /* END spdk_bs_set_super */ 5239 5240 void 5241 spdk_bs_get_super(struct spdk_blob_store *bs, 5242 spdk_blob_op_with_id_complete 
cb_fn, void *cb_arg) 5243 { 5244 if (bs->super_blob == SPDK_BLOBID_INVALID) { 5245 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); 5246 } else { 5247 cb_fn(cb_arg, bs->super_blob, 0); 5248 } 5249 } 5250 5251 uint64_t 5252 spdk_bs_get_cluster_size(struct spdk_blob_store *bs) 5253 { 5254 return bs->cluster_sz; 5255 } 5256 5257 uint64_t 5258 spdk_bs_get_page_size(struct spdk_blob_store *bs) 5259 { 5260 return SPDK_BS_PAGE_SIZE; 5261 } 5262 5263 uint64_t 5264 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) 5265 { 5266 return bs->io_unit_size; 5267 } 5268 5269 uint64_t 5270 spdk_bs_free_cluster_count(struct spdk_blob_store *bs) 5271 { 5272 return bs->num_free_clusters; 5273 } 5274 5275 uint64_t 5276 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) 5277 { 5278 return bs->total_data_clusters; 5279 } 5280 5281 static int 5282 bs_register_md_thread(struct spdk_blob_store *bs) 5283 { 5284 bs->md_channel = spdk_get_io_channel(bs); 5285 if (!bs->md_channel) { 5286 SPDK_ERRLOG("Failed to get IO channel.\n"); 5287 return -1; 5288 } 5289 5290 return 0; 5291 } 5292 5293 static int 5294 bs_unregister_md_thread(struct spdk_blob_store *bs) 5295 { 5296 spdk_put_io_channel(bs->md_channel); 5297 5298 return 0; 5299 } 5300 5301 spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob) 5302 { 5303 assert(blob != NULL); 5304 5305 return blob->id; 5306 } 5307 5308 uint64_t spdk_blob_get_num_pages(struct spdk_blob *blob) 5309 { 5310 assert(blob != NULL); 5311 5312 return bs_cluster_to_page(blob->bs, blob->active.num_clusters); 5313 } 5314 5315 uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob) 5316 { 5317 assert(blob != NULL); 5318 5319 return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); 5320 } 5321 5322 uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob) 5323 { 5324 assert(blob != NULL); 5325 5326 return blob->active.num_clusters; 5327 } 5328 5329 /* START spdk_bs_create_blob */ 5330 5331 static void 5332 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5333 { 5334 struct spdk_blob *blob = cb_arg; 5335 uint32_t page_idx = bs_blobid_to_page(blob->id); 5336 5337 if (bserrno != 0) { 5338 spdk_bit_array_clear(blob->bs->used_blobids, page_idx); 5339 bs_release_md_page(blob->bs, page_idx); 5340 } 5341 5342 blob_free(blob); 5343 5344 bs_sequence_finish(seq, bserrno); 5345 } 5346 5347 static int 5348 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, 5349 bool internal) 5350 { 5351 uint64_t i; 5352 size_t value_len = 0; 5353 int rc; 5354 const void *value = NULL; 5355 if (xattrs->count > 0 && xattrs->get_value == NULL) { 5356 return -EINVAL; 5357 } 5358 for (i = 0; i < xattrs->count; i++) { 5359 xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); 5360 if (value == NULL || value_len == 0) { 5361 return -EINVAL; 5362 } 5363 rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); 5364 if (rc < 0) { 5365 return rc; 5366 } 5367 } 5368 return 0; 5369 } 5370 5371 static void 5372 blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst) 5373 { 5374 #define FIELD_OK(field) \ 5375 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 5376 5377 #define SET_FIELD(field) \ 5378 if (FIELD_OK(field)) { \ 5379 dst->field = src->field; \ 5380 } \ 5381 5382 SET_FIELD(num_clusters); 5383 SET_FIELD(thin_provision); 5384 SET_FIELD(clear_method); 5385 5386 if (FIELD_OK(xattrs)) { 5387 memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs)); 5388 } 5389 5390 
SET_FIELD(use_extent_table); 5391 5392 dst->opts_size = src->opts_size; 5393 5394 /* You should not remove this statement, but need to update the assert statement 5395 * if you add a new field, and also add a corresponding SET_FIELD statement */ 5396 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 64, "Incorrect size"); 5397 5398 #undef FIELD_OK 5399 #undef SET_FIELD 5400 } 5401 5402 static void 5403 bs_create_blob(struct spdk_blob_store *bs, 5404 const struct spdk_blob_opts *opts, 5405 const struct spdk_blob_xattr_opts *internal_xattrs, 5406 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5407 { 5408 struct spdk_blob *blob; 5409 uint32_t page_idx; 5410 struct spdk_bs_cpl cpl; 5411 struct spdk_blob_opts opts_local; 5412 struct spdk_blob_xattr_opts internal_xattrs_default; 5413 spdk_bs_sequence_t *seq; 5414 spdk_blob_id id; 5415 int rc; 5416 5417 assert(spdk_get_thread() == bs->md_thread); 5418 5419 page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); 5420 if (page_idx == UINT32_MAX) { 5421 cb_fn(cb_arg, 0, -ENOMEM); 5422 return; 5423 } 5424 spdk_bit_array_set(bs->used_blobids, page_idx); 5425 bs_claim_md_page(bs, page_idx); 5426 5427 id = bs_page_to_blobid(page_idx); 5428 5429 SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx); 5430 5431 blob = blob_alloc(bs, id); 5432 if (!blob) { 5433 spdk_bit_array_clear(bs->used_blobids, page_idx); 5434 bs_release_md_page(bs, page_idx); 5435 cb_fn(cb_arg, 0, -ENOMEM); 5436 return; 5437 } 5438 5439 spdk_blob_opts_init(&opts_local, sizeof(opts_local)); 5440 if (opts) { 5441 blob_opts_copy(opts, &opts_local); 5442 } 5443 5444 blob->use_extent_table = opts_local.use_extent_table; 5445 if (blob->use_extent_table) { 5446 blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; 5447 } 5448 5449 if (!internal_xattrs) { 5450 blob_xattrs_init(&internal_xattrs_default); 5451 internal_xattrs = &internal_xattrs_default; 5452 } 5453 5454 rc = blob_set_xattrs(blob, &opts_local.xattrs, false); 5455 if (rc < 0) { 5456 blob_free(blob); 5457 spdk_bit_array_clear(bs->used_blobids, page_idx); 5458 bs_release_md_page(bs, page_idx); 5459 cb_fn(cb_arg, 0, rc); 5460 return; 5461 } 5462 5463 rc = blob_set_xattrs(blob, internal_xattrs, true); 5464 if (rc < 0) { 5465 blob_free(blob); 5466 spdk_bit_array_clear(bs->used_blobids, page_idx); 5467 bs_release_md_page(bs, page_idx); 5468 cb_fn(cb_arg, 0, rc); 5469 return; 5470 } 5471 5472 if (opts_local.thin_provision) { 5473 blob_set_thin_provision(blob); 5474 } 5475 5476 blob_set_clear_method(blob, opts_local.clear_method); 5477 5478 rc = blob_resize(blob, opts_local.num_clusters); 5479 if (rc < 0) { 5480 blob_free(blob); 5481 spdk_bit_array_clear(bs->used_blobids, page_idx); 5482 bs_release_md_page(bs, page_idx); 5483 cb_fn(cb_arg, 0, rc); 5484 return; 5485 } 5486 cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5487 cpl.u.blobid.cb_fn = cb_fn; 5488 cpl.u.blobid.cb_arg = cb_arg; 5489 cpl.u.blobid.blobid = blob->id; 5490 5491 seq = bs_sequence_start(bs->md_channel, &cpl); 5492 if (!seq) { 5493 blob_free(blob); 5494 spdk_bit_array_clear(bs->used_blobids, page_idx); 5495 bs_release_md_page(bs, page_idx); 5496 cb_fn(cb_arg, 0, -ENOMEM); 5497 return; 5498 } 5499 5500 blob_persist(seq, blob, bs_create_blob_cpl, blob); 5501 } 5502 5503 void spdk_bs_create_blob(struct spdk_blob_store *bs, 5504 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5505 { 5506 bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); 5507 } 5508 5509 void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, 
5510 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5511 { 5512 bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); 5513 } 5514 5515 /* END spdk_bs_create_blob */ 5516 5517 /* START blob_cleanup */ 5518 5519 struct spdk_clone_snapshot_ctx { 5520 struct spdk_bs_cpl cpl; 5521 int bserrno; 5522 bool frozen; 5523 5524 struct spdk_io_channel *channel; 5525 5526 /* Current cluster for inflate operation */ 5527 uint64_t cluster; 5528 5529 /* For inflation force allocation of all unallocated clusters and remove 5530 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ 5531 bool allocate_all; 5532 5533 struct { 5534 spdk_blob_id id; 5535 struct spdk_blob *blob; 5536 } original; 5537 struct { 5538 spdk_blob_id id; 5539 struct spdk_blob *blob; 5540 } new; 5541 5542 /* xattrs specified for snapshot/clones only. They have no impact on 5543 * the original blobs xattrs. */ 5544 const struct spdk_blob_xattr_opts *xattrs; 5545 }; 5546 5547 static void 5548 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) 5549 { 5550 struct spdk_clone_snapshot_ctx *ctx = cb_arg; 5551 struct spdk_bs_cpl *cpl = &ctx->cpl; 5552 5553 if (bserrno != 0) { 5554 if (ctx->bserrno != 0) { 5555 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5556 } else { 5557 ctx->bserrno = bserrno; 5558 } 5559 } 5560 5561 switch (cpl->type) { 5562 case SPDK_BS_CPL_TYPE_BLOBID: 5563 cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); 5564 break; 5565 case SPDK_BS_CPL_TYPE_BLOB_BASIC: 5566 cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); 5567 break; 5568 default: 5569 SPDK_UNREACHABLE(); 5570 break; 5571 } 5572 5573 free(ctx); 5574 } 5575 5576 static void 5577 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 5578 { 5579 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5580 struct spdk_blob *origblob = ctx->original.blob; 5581 5582 if (bserrno != 0) { 5583 if (ctx->bserrno != 0) { 5584 SPDK_ERRLOG("Unfreeze error %d\n", bserrno); 5585 } else { 5586 ctx->bserrno = bserrno; 5587 } 5588 } 5589 5590 ctx->original.id = origblob->id; 5591 origblob->locked_operation_in_progress = false; 5592 5593 spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); 5594 } 5595 5596 static void 5597 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) 5598 { 5599 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5600 struct spdk_blob *origblob = ctx->original.blob; 5601 5602 if (bserrno != 0) { 5603 if (ctx->bserrno != 0) { 5604 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5605 } else { 5606 ctx->bserrno = bserrno; 5607 } 5608 } 5609 5610 if (ctx->frozen) { 5611 /* Unfreeze any outstanding I/O */ 5612 blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); 5613 } else { 5614 bs_snapshot_unfreeze_cpl(ctx, 0); 5615 } 5616 5617 } 5618 5619 static void 5620 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno) 5621 { 5622 struct spdk_blob *newblob = ctx->new.blob; 5623 5624 if (bserrno != 0) { 5625 if (ctx->bserrno != 0) { 5626 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5627 } else { 5628 ctx->bserrno = bserrno; 5629 } 5630 } 5631 5632 ctx->new.id = newblob->id; 5633 spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 5634 } 5635 5636 /* END blob_cleanup */ 5637 5638 /* START spdk_bs_create_snapshot */ 5639 5640 static void 5641 bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2) 5642 { 5643 uint64_t *cluster_temp; 5644 uint32_t *extent_page_temp; 
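	/* Used during snapshot creation (see bs_snapshot_freeze_cpl and the
	 * bs_snapshot_*_sync_cpl callbacks below): swapping the maps lets the new
	 * snapshot take ownership of the clusters already allocated to the original
	 * blob, leaving the original with an empty, thin-provisioned map. Error
	 * paths call this again to swap the maps back. */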
5645
5646	cluster_temp = blob1->active.clusters;
5647	blob1->active.clusters = blob2->active.clusters;
5648	blob2->active.clusters = cluster_temp;
5649
5650	extent_page_temp = blob1->active.extent_pages;
5651	blob1->active.extent_pages = blob2->active.extent_pages;
5652	blob2->active.extent_pages = extent_page_temp;
5653 }
5654
5655 static void
5656 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
5657 {
5658	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5659	struct spdk_blob *origblob = ctx->original.blob;
5660	struct spdk_blob *newblob = ctx->new.blob;
5661
5662	if (bserrno != 0) {
5663		bs_snapshot_swap_cluster_maps(newblob, origblob);
5664		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5665		return;
5666	}
5667
5668	/* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
5669	bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
5670	if (bserrno != 0) {
5671		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5672		return;
5673	}
5674
5675	bs_blob_list_add(ctx->original.blob);
5676
5677	spdk_blob_set_read_only(newblob);
5678
5679	/* sync snapshot metadata */
5680	spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
5681 }
5682
5683 static void
5684 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
5685 {
5686	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5687	struct spdk_blob *origblob = ctx->original.blob;
5688	struct spdk_blob *newblob = ctx->new.blob;
5689
5690	if (bserrno != 0) {
5691		/* return cluster map back to original */
5692		bs_snapshot_swap_cluster_maps(newblob, origblob);
5693
5694		/* Newblob md sync failed. Valid clusters are only present in origblob.
5695		 * Since I/O is frozen on origblob, no changes to the zeroed-out cluster map should have occurred.
5696		 * Newblob needs to be reverted to the thin-provisioned state it had at creation to close properly.
*/ 5697 blob_set_thin_provision(newblob); 5698 assert(spdk_mem_all_zero(newblob->active.clusters, 5699 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 5700 assert(spdk_mem_all_zero(newblob->active.extent_pages, 5701 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 5702 5703 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5704 return; 5705 } 5706 5707 /* Set internal xattr for snapshot id */ 5708 bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); 5709 if (bserrno != 0) { 5710 /* return cluster map back to original */ 5711 bs_snapshot_swap_cluster_maps(newblob, origblob); 5712 blob_set_thin_provision(newblob); 5713 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5714 return; 5715 } 5716 5717 /* Create new back_bs_dev for snapshot */ 5718 origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); 5719 if (origblob->back_bs_dev == NULL) { 5720 /* return cluster map back to original */ 5721 bs_snapshot_swap_cluster_maps(newblob, origblob); 5722 blob_set_thin_provision(newblob); 5723 bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); 5724 return; 5725 } 5726 5727 bs_blob_list_remove(origblob); 5728 origblob->parent_id = newblob->id; 5729 /* set clone blob as thin provisioned */ 5730 blob_set_thin_provision(origblob); 5731 5732 bs_blob_list_add(newblob); 5733 5734 /* sync clone metadata */ 5735 spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); 5736 } 5737 5738 static void 5739 bs_snapshot_freeze_cpl(void *cb_arg, int rc) 5740 { 5741 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5742 struct spdk_blob *origblob = ctx->original.blob; 5743 struct spdk_blob *newblob = ctx->new.blob; 5744 int bserrno; 5745 5746 if (rc != 0) { 5747 bs_clone_snapshot_newblob_cleanup(ctx, rc); 5748 return; 5749 } 5750 5751 ctx->frozen = true; 5752 5753 /* set new back_bs_dev for snapshot */ 5754 newblob->back_bs_dev = origblob->back_bs_dev; 5755 /* Set invalid flags from origblob */ 5756 newblob->invalid_flags = origblob->invalid_flags; 5757 5758 /* inherit parent from original blob if set */ 5759 newblob->parent_id = origblob->parent_id; 5760 if (origblob->parent_id != SPDK_BLOBID_INVALID) { 5761 /* Set internal xattr for snapshot id */ 5762 bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, 5763 &origblob->parent_id, sizeof(spdk_blob_id), true); 5764 if (bserrno != 0) { 5765 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5766 return; 5767 } 5768 } 5769 5770 /* swap cluster maps */ 5771 bs_snapshot_swap_cluster_maps(newblob, origblob); 5772 5773 /* Set the clear method on the new blob to match the original. 
*/ 5774 blob_set_clear_method(newblob, origblob->clear_method); 5775 5776 /* sync snapshot metadata */ 5777 spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); 5778 } 5779 5780 static void 5781 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5782 { 5783 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5784 struct spdk_blob *origblob = ctx->original.blob; 5785 struct spdk_blob *newblob = _blob; 5786 5787 if (bserrno != 0) { 5788 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5789 return; 5790 } 5791 5792 ctx->new.blob = newblob; 5793 assert(spdk_blob_is_thin_provisioned(newblob)); 5794 assert(spdk_mem_all_zero(newblob->active.clusters, 5795 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 5796 assert(spdk_mem_all_zero(newblob->active.extent_pages, 5797 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 5798 5799 blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); 5800 } 5801 5802 static void 5803 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 5804 { 5805 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5806 struct spdk_blob *origblob = ctx->original.blob; 5807 5808 if (bserrno != 0) { 5809 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5810 return; 5811 } 5812 5813 ctx->new.id = blobid; 5814 ctx->cpl.u.blobid.blobid = blobid; 5815 5816 spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); 5817 } 5818 5819 5820 static void 5821 bs_xattr_snapshot(void *arg, const char *name, 5822 const void **value, size_t *value_len) 5823 { 5824 assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); 5825 5826 struct spdk_blob *blob = (struct spdk_blob *)arg; 5827 *value = &blob->id; 5828 *value_len = sizeof(blob->id); 5829 } 5830 5831 static void 5832 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5833 { 5834 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5835 struct spdk_blob_opts opts; 5836 struct spdk_blob_xattr_opts internal_xattrs; 5837 char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; 5838 5839 if (bserrno != 0) { 5840 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 5841 return; 5842 } 5843 5844 ctx->original.blob = _blob; 5845 5846 if (_blob->data_ro || _blob->md_ro) { 5847 SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n", 5848 _blob->id); 5849 ctx->bserrno = -EINVAL; 5850 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 5851 return; 5852 } 5853 5854 if (_blob->locked_operation_in_progress) { 5855 SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n"); 5856 ctx->bserrno = -EBUSY; 5857 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 5858 return; 5859 } 5860 5861 _blob->locked_operation_in_progress = true; 5862 5863 spdk_blob_opts_init(&opts, sizeof(opts)); 5864 blob_xattrs_init(&internal_xattrs); 5865 5866 /* Change the size of new blob to the same as in original blob, 5867 * but do not allocate clusters */ 5868 opts.thin_provision = true; 5869 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 5870 opts.use_extent_table = _blob->use_extent_table; 5871 5872 /* If there are any xattrs specified for snapshot, set them now */ 5873 if (ctx->xattrs) { 5874 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 5875 } 5876 /* Set internal xattr SNAPSHOT_IN_PROGRESS */ 5877 internal_xattrs.count = 1; 5878 
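/* The xattr value is not stored in the opts; it is produced on demand by the
 * bs_xattr_snapshot() callback, which receives the original blob as its context and
 * returns the original blob's id. */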
internal_xattrs.ctx = _blob; 5879 internal_xattrs.names = xattrs_names; 5880 internal_xattrs.get_value = bs_xattr_snapshot; 5881 5882 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 5883 bs_snapshot_newblob_create_cpl, ctx); 5884 } 5885 5886 void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, 5887 const struct spdk_blob_xattr_opts *snapshot_xattrs, 5888 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5889 { 5890 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 5891 5892 if (!ctx) { 5893 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 5894 return; 5895 } 5896 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5897 ctx->cpl.u.blobid.cb_fn = cb_fn; 5898 ctx->cpl.u.blobid.cb_arg = cb_arg; 5899 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 5900 ctx->bserrno = 0; 5901 ctx->frozen = false; 5902 ctx->original.id = blobid; 5903 ctx->xattrs = snapshot_xattrs; 5904 5905 spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); 5906 } 5907 /* END spdk_bs_create_snapshot */ 5908 5909 /* START spdk_bs_create_clone */ 5910 5911 static void 5912 bs_xattr_clone(void *arg, const char *name, 5913 const void **value, size_t *value_len) 5914 { 5915 assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); 5916 5917 struct spdk_blob *blob = (struct spdk_blob *)arg; 5918 *value = &blob->id; 5919 *value_len = sizeof(blob->id); 5920 } 5921 5922 static void 5923 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5924 { 5925 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5926 struct spdk_blob *clone = _blob; 5927 5928 ctx->new.blob = clone; 5929 bs_blob_list_add(clone); 5930 5931 spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); 5932 } 5933 5934 static void 5935 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 5936 { 5937 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5938 5939 ctx->cpl.u.blobid.blobid = blobid; 5940 spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); 5941 } 5942 5943 static void 5944 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5945 { 5946 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5947 struct spdk_blob_opts opts; 5948 struct spdk_blob_xattr_opts internal_xattrs; 5949 char *xattr_names[] = { BLOB_SNAPSHOT }; 5950 5951 if (bserrno != 0) { 5952 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 5953 return; 5954 } 5955 5956 ctx->original.blob = _blob; 5957 5958 if (!_blob->data_ro || !_blob->md_ro) { 5959 SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n"); 5960 ctx->bserrno = -EINVAL; 5961 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 5962 return; 5963 } 5964 5965 if (_blob->locked_operation_in_progress) { 5966 SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n"); 5967 ctx->bserrno = -EBUSY; 5968 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 5969 return; 5970 } 5971 5972 _blob->locked_operation_in_progress = true; 5973 5974 spdk_blob_opts_init(&opts, sizeof(opts)); 5975 blob_xattrs_init(&internal_xattrs); 5976 5977 opts.thin_provision = true; 5978 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 5979 opts.use_extent_table = _blob->use_extent_table; 5980 if (ctx->xattrs) { 5981 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 5982 } 5983 5984 /* Set internal xattr BLOB_SNAPSHOT */ 5985 internal_xattrs.count = 1; 5986 internal_xattrs.ctx 
= _blob; 5987 internal_xattrs.names = xattr_names; 5988 internal_xattrs.get_value = bs_xattr_clone; 5989 5990 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 5991 bs_clone_newblob_create_cpl, ctx); 5992 } 5993 5994 void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, 5995 const struct spdk_blob_xattr_opts *clone_xattrs, 5996 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5997 { 5998 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 5999 6000 if (!ctx) { 6001 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6002 return; 6003 } 6004 6005 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6006 ctx->cpl.u.blobid.cb_fn = cb_fn; 6007 ctx->cpl.u.blobid.cb_arg = cb_arg; 6008 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6009 ctx->bserrno = 0; 6010 ctx->xattrs = clone_xattrs; 6011 ctx->original.id = blobid; 6012 6013 spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); 6014 } 6015 6016 /* END spdk_bs_create_clone */ 6017 6018 /* START spdk_bs_inflate_blob */ 6019 6020 static void 6021 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) 6022 { 6023 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6024 struct spdk_blob *_blob = ctx->original.blob; 6025 6026 if (bserrno != 0) { 6027 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6028 return; 6029 } 6030 6031 assert(_parent != NULL); 6032 6033 bs_blob_list_remove(_blob); 6034 _blob->parent_id = _parent->id; 6035 blob_set_xattr(_blob, BLOB_SNAPSHOT, &_blob->parent_id, 6036 sizeof(spdk_blob_id), true); 6037 6038 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6039 _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); 6040 bs_blob_list_add(_blob); 6041 6042 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6043 } 6044 6045 static void 6046 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx) 6047 { 6048 struct spdk_blob *_blob = ctx->original.blob; 6049 struct spdk_blob *_parent; 6050 6051 if (ctx->allocate_all) { 6052 /* remove thin provisioning */ 6053 bs_blob_list_remove(_blob); 6054 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6055 _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; 6056 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6057 _blob->back_bs_dev = NULL; 6058 _blob->parent_id = SPDK_BLOBID_INVALID; 6059 } else { 6060 _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; 6061 if (_parent->parent_id != SPDK_BLOBID_INVALID) { 6062 /* We must change the parent of the inflated blob */ 6063 spdk_bs_open_blob(_blob->bs, _parent->parent_id, 6064 bs_inflate_blob_set_parent_cpl, ctx); 6065 return; 6066 } 6067 6068 bs_blob_list_remove(_blob); 6069 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6070 _blob->parent_id = SPDK_BLOBID_INVALID; 6071 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6072 _blob->back_bs_dev = bs_create_zeroes_dev(); 6073 } 6074 6075 _blob->state = SPDK_BLOB_STATE_DIRTY; 6076 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6077 } 6078 6079 /* Check if cluster needs allocation */ 6080 static inline bool 6081 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) 6082 { 6083 struct spdk_blob_bs_dev *b; 6084 6085 assert(blob != NULL); 6086 6087 if (blob->active.clusters[cluster] != 0) { 6088 /* Cluster is already allocated */ 6089 return false; 6090 } 6091 6092 if (blob->parent_id == SPDK_BLOBID_INVALID) { 6093 /* Blob have no parent blob */ 6094 return allocate_all; 6095 } 6096 6097 b = (struct 
spdk_blob_bs_dev *)blob->back_bs_dev; 6098 return (allocate_all || b->blob->active.clusters[cluster] != 0); 6099 } 6100 6101 static void 6102 bs_inflate_blob_touch_next(void *cb_arg, int bserrno) 6103 { 6104 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6105 struct spdk_blob *_blob = ctx->original.blob; 6106 uint64_t offset; 6107 6108 if (bserrno != 0) { 6109 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6110 return; 6111 } 6112 6113 for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { 6114 if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { 6115 break; 6116 } 6117 } 6118 6119 if (ctx->cluster < _blob->active.num_clusters) { 6120 offset = bs_cluster_to_lba(_blob->bs, ctx->cluster); 6121 6122 /* We may safely increment the cluster index before the write */ 6123 ctx->cluster++; 6124 6125 /* Use zero length write to touch a cluster */ 6126 spdk_blob_io_write(_blob, ctx->channel, NULL, offset, 0, 6127 bs_inflate_blob_touch_next, ctx); 6128 } else { 6129 bs_inflate_blob_done(ctx); 6130 } 6131 } 6132 6133 static void 6134 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6135 { 6136 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6137 uint64_t clusters_needed; 6138 uint64_t i; 6139 6140 if (bserrno != 0) { 6141 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6142 return; 6143 } 6144 6145 ctx->original.blob = _blob; 6146 6147 if (_blob->locked_operation_in_progress) { 6148 SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n"); 6149 ctx->bserrno = -EBUSY; 6150 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6151 return; 6152 } 6153 6154 _blob->locked_operation_in_progress = true; 6155 6156 if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { 6157 /* This blob has no parent, so we cannot decouple it. */ 6158 SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); 6159 bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); 6160 return; 6161 } 6162 6163 if (spdk_blob_is_thin_provisioned(_blob) == false) { 6164 /* This is not a thin provisioned blob. No need to inflate. */ 6165 bs_clone_snapshot_origblob_cleanup(ctx, 0); 6166 return; 6167 } 6168 6169 /* Do two passes - one to verify that we can obtain enough clusters 6170 * and another to actually claim them. 6171 */ 6172 clusters_needed = 0; 6173 for (i = 0; i < _blob->active.num_clusters; i++) { 6174 if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { 6175 clusters_needed++; 6176 } 6177 } 6178 6179 if (clusters_needed > _blob->bs->num_free_clusters) { 6180 /* Not enough free clusters. Cannot satisfy the request.
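 * The second pass is driven by bs_inflate_blob_touch_next(), which claims the
 * clusters one at a time using zero-length writes.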
*/ 6181 bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); 6182 return; 6183 } 6184 6185 ctx->cluster = 0; 6186 bs_inflate_blob_touch_next(ctx, 0); 6187 } 6188 6189 static void 6190 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6191 spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) 6192 { 6193 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6194 6195 if (!ctx) { 6196 cb_fn(cb_arg, -ENOMEM); 6197 return; 6198 } 6199 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6200 ctx->cpl.u.bs_basic.cb_fn = cb_fn; 6201 ctx->cpl.u.bs_basic.cb_arg = cb_arg; 6202 ctx->bserrno = 0; 6203 ctx->original.id = blobid; 6204 ctx->channel = channel; 6205 ctx->allocate_all = allocate_all; 6206 6207 spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); 6208 } 6209 6210 void 6211 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6212 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6213 { 6214 bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); 6215 } 6216 6217 void 6218 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6219 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6220 { 6221 bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); 6222 } 6223 /* END spdk_bs_inflate_blob */ 6224 6225 /* START spdk_blob_resize */ 6226 struct spdk_bs_resize_ctx { 6227 spdk_blob_op_complete cb_fn; 6228 void *cb_arg; 6229 struct spdk_blob *blob; 6230 uint64_t sz; 6231 int rc; 6232 }; 6233 6234 static void 6235 bs_resize_unfreeze_cpl(void *cb_arg, int rc) 6236 { 6237 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6238 6239 if (rc != 0) { 6240 SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); 6241 } 6242 6243 if (ctx->rc != 0) { 6244 SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); 6245 rc = ctx->rc; 6246 } 6247 6248 ctx->blob->locked_operation_in_progress = false; 6249 6250 ctx->cb_fn(ctx->cb_arg, rc); 6251 free(ctx); 6252 } 6253 6254 static void 6255 bs_resize_freeze_cpl(void *cb_arg, int rc) 6256 { 6257 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6258 6259 if (rc != 0) { 6260 ctx->blob->locked_operation_in_progress = false; 6261 ctx->cb_fn(ctx->cb_arg, rc); 6262 free(ctx); 6263 return; 6264 } 6265 6266 ctx->rc = blob_resize(ctx->blob, ctx->sz); 6267 6268 blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); 6269 } 6270 6271 void 6272 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) 6273 { 6274 struct spdk_bs_resize_ctx *ctx; 6275 6276 blob_verify_md_op(blob); 6277 6278 SPDK_DEBUGLOG(blob, "Resizing blob %" PRIu64 " to %" PRIu64 " clusters\n", blob->id, sz); 6279 6280 if (blob->md_ro) { 6281 cb_fn(cb_arg, -EPERM); 6282 return; 6283 } 6284 6285 if (sz == blob->active.num_clusters) { 6286 cb_fn(cb_arg, 0); 6287 return; 6288 } 6289 6290 if (blob->locked_operation_in_progress) { 6291 cb_fn(cb_arg, -EBUSY); 6292 return; 6293 } 6294 6295 ctx = calloc(1, sizeof(*ctx)); 6296 if (!ctx) { 6297 cb_fn(cb_arg, -ENOMEM); 6298 return; 6299 } 6300 6301 blob->locked_operation_in_progress = true; 6302 ctx->cb_fn = cb_fn; 6303 ctx->cb_arg = cb_arg; 6304 ctx->blob = blob; 6305 ctx->sz = sz; 6306 blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); 6307 } 6308 6309 /* END spdk_blob_resize */ 6310 6311 6312 /* START spdk_bs_delete_blob */ 6313 6314 static void 6315 bs_delete_close_cpl(void *cb_arg, int bserrno) 6316 { 6317 spdk_bs_sequence_t *seq = 
cb_arg; 6318 6319 bs_sequence_finish(seq, bserrno); 6320 } 6321 6322 static void 6323 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6324 { 6325 struct spdk_blob *blob = cb_arg; 6326 6327 if (bserrno != 0) { 6328 /* 6329 * We already removed this blob from the blobstore tailq, so 6330 * we need to free it here since this is the last reference 6331 * to it. 6332 */ 6333 blob_free(blob); 6334 bs_delete_close_cpl(seq, bserrno); 6335 return; 6336 } 6337 6338 /* 6339 * This will immediately decrement the ref_count and call 6340 * the completion routine since the metadata state is clean. 6341 * By calling spdk_blob_close, we reduce the number of call 6342 * points into code that touches the blob->open_ref count 6343 * and the blobstore's blob list. 6344 */ 6345 spdk_blob_close(blob, bs_delete_close_cpl, seq); 6346 } 6347 6348 struct delete_snapshot_ctx { 6349 struct spdk_blob_list *parent_snapshot_entry; 6350 struct spdk_blob *snapshot; 6351 bool snapshot_md_ro; 6352 struct spdk_blob *clone; 6353 bool clone_md_ro; 6354 spdk_blob_op_with_handle_complete cb_fn; 6355 void *cb_arg; 6356 int bserrno; 6357 uint32_t next_extent_page; 6358 }; 6359 6360 static void 6361 delete_blob_cleanup_finish(void *cb_arg, int bserrno) 6362 { 6363 struct delete_snapshot_ctx *ctx = cb_arg; 6364 6365 if (bserrno != 0) { 6366 SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); 6367 } 6368 6369 assert(ctx != NULL); 6370 6371 if (bserrno != 0 && ctx->bserrno == 0) { 6372 ctx->bserrno = bserrno; 6373 } 6374 6375 ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); 6376 free(ctx); 6377 } 6378 6379 static void 6380 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) 6381 { 6382 struct delete_snapshot_ctx *ctx = cb_arg; 6383 6384 if (bserrno != 0) { 6385 ctx->bserrno = bserrno; 6386 SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); 6387 } 6388 6389 if (ctx->bserrno != 0) { 6390 assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); 6391 TAILQ_INSERT_HEAD(&ctx->snapshot->bs->blobs, ctx->snapshot, link); 6392 spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); 6393 } 6394 6395 ctx->snapshot->locked_operation_in_progress = false; 6396 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6397 6398 spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); 6399 } 6400 6401 static void 6402 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) 6403 { 6404 struct delete_snapshot_ctx *ctx = cb_arg; 6405 6406 ctx->clone->locked_operation_in_progress = false; 6407 ctx->clone->md_ro = ctx->clone_md_ro; 6408 6409 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6410 } 6411 6412 static void 6413 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 6414 { 6415 struct delete_snapshot_ctx *ctx = cb_arg; 6416 6417 if (bserrno) { 6418 ctx->bserrno = bserrno; 6419 delete_snapshot_cleanup_clone(ctx, 0); 6420 return; 6421 } 6422 6423 ctx->clone->locked_operation_in_progress = false; 6424 spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); 6425 } 6426 6427 static void 6428 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) 6429 { 6430 struct delete_snapshot_ctx *ctx = cb_arg; 6431 struct spdk_blob_list *parent_snapshot_entry = NULL; 6432 struct spdk_blob_list *snapshot_entry = NULL; 6433 struct spdk_blob_list *clone_entry = NULL; 6434 struct spdk_blob_list *snapshot_clone_entry = NULL; 6435 6436 if (bserrno) { 6437 SPDK_ERRLOG("Failed to sync MD on blob\n"); 6438 ctx->bserrno = bserrno; 6439 delete_snapshot_cleanup_clone(ctx, 0); 6440 
return; 6441 } 6442 6443 /* Get snapshot entry for the snapshot we want to remove */ 6444 snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); 6445 6446 assert(snapshot_entry != NULL); 6447 6448 /* Remove clone entry in this snapshot (at this point there can be only one clone) */ 6449 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6450 assert(clone_entry != NULL); 6451 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 6452 snapshot_entry->clone_count--; 6453 assert(TAILQ_EMPTY(&snapshot_entry->clones)); 6454 6455 if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { 6456 /* This snapshot is at the same time a clone of another snapshot - we need to 6457 * update parent snapshot (remove current clone, add new one inherited from 6458 * the snapshot that is being removed) */ 6459 6460 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6461 * snapshot that we are removing */ 6462 blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, 6463 &snapshot_clone_entry); 6464 6465 /* Switch clone entry in parent snapshot */ 6466 TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); 6467 TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); 6468 free(snapshot_clone_entry); 6469 } else { 6470 /* No parent snapshot - just remove clone entry */ 6471 free(clone_entry); 6472 } 6473 6474 /* Restore md_ro flags */ 6475 ctx->clone->md_ro = ctx->clone_md_ro; 6476 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6477 6478 blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); 6479 } 6480 6481 static void 6482 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) 6483 { 6484 struct delete_snapshot_ctx *ctx = cb_arg; 6485 uint64_t i; 6486 6487 ctx->snapshot->md_ro = false; 6488 6489 if (bserrno) { 6490 SPDK_ERRLOG("Failed to sync MD on clone\n"); 6491 ctx->bserrno = bserrno; 6492 6493 /* Restore snapshot to previous state */ 6494 bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); 6495 if (bserrno != 0) { 6496 delete_snapshot_cleanup_clone(ctx, bserrno); 6497 return; 6498 } 6499 6500 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); 6501 return; 6502 } 6503 6504 /* Clear cluster map entries for snapshot */ 6505 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6506 if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { 6507 ctx->snapshot->active.clusters[i] = 0; 6508 } 6509 } 6510 for (i = 0; i < ctx->snapshot->active.num_extent_pages && 6511 i < ctx->clone->active.num_extent_pages; i++) { 6512 if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { 6513 ctx->snapshot->active.extent_pages[i] = 0; 6514 } 6515 } 6516 6517 blob_set_thin_provision(ctx->snapshot); 6518 ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; 6519 6520 if (ctx->parent_snapshot_entry != NULL) { 6521 ctx->snapshot->back_bs_dev = NULL; 6522 } 6523 6524 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); 6525 } 6526 6527 static void 6528 delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx) 6529 { 6530 /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ 6531 ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); 6532 6533 /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... 
*/ 6534 if (ctx->parent_snapshot_entry != NULL) { 6535 /* ...to parent snapshot */ 6536 ctx->clone->parent_id = ctx->parent_snapshot_entry->id; 6537 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; 6538 blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, 6539 sizeof(spdk_blob_id), 6540 true); 6541 } else { 6542 /* ...to blobid invalid and zeroes dev */ 6543 ctx->clone->parent_id = SPDK_BLOBID_INVALID; 6544 ctx->clone->back_bs_dev = bs_create_zeroes_dev(); 6545 blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); 6546 } 6547 6548 spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); 6549 } 6550 6551 static void 6552 delete_snapshot_update_extent_pages(void *cb_arg, int bserrno) 6553 { 6554 struct delete_snapshot_ctx *ctx = cb_arg; 6555 uint32_t *extent_page; 6556 uint64_t i; 6557 6558 for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages && 6559 i < ctx->clone->active.num_extent_pages; i++) { 6560 if (ctx->snapshot->active.extent_pages[i] == 0) { 6561 /* No extent page to use from snapshot */ 6562 continue; 6563 } 6564 6565 extent_page = &ctx->clone->active.extent_pages[i]; 6566 if (*extent_page == 0) { 6567 /* Copy extent page from snapshot when clone did not have a matching one */ 6568 *extent_page = ctx->snapshot->active.extent_pages[i]; 6569 continue; 6570 } 6571 6572 /* Clone and snapshot both contain partialy filled matching extent pages. 6573 * Update the clone extent page in place with cluster map containing the mix of both. */ 6574 ctx->next_extent_page = i + 1; 6575 6576 blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, 6577 delete_snapshot_update_extent_pages, ctx); 6578 return; 6579 } 6580 delete_snapshot_update_extent_pages_cpl(ctx); 6581 } 6582 6583 static void 6584 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) 6585 { 6586 struct delete_snapshot_ctx *ctx = cb_arg; 6587 uint64_t i; 6588 6589 /* Temporarily override md_ro flag for clone for MD modification */ 6590 ctx->clone_md_ro = ctx->clone->md_ro; 6591 ctx->clone->md_ro = false; 6592 6593 if (bserrno) { 6594 SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); 6595 ctx->bserrno = bserrno; 6596 delete_snapshot_cleanup_clone(ctx, 0); 6597 return; 6598 } 6599 6600 /* Copy snapshot map to clone map (only unallocated clusters in clone) */ 6601 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6602 if (ctx->clone->active.clusters[i] == 0) { 6603 ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; 6604 } 6605 } 6606 ctx->next_extent_page = 0; 6607 delete_snapshot_update_extent_pages(ctx, 0); 6608 } 6609 6610 static void 6611 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) 6612 { 6613 struct delete_snapshot_ctx *ctx = cb_arg; 6614 6615 if (bserrno) { 6616 SPDK_ERRLOG("Failed to freeze I/O on clone\n"); 6617 ctx->bserrno = bserrno; 6618 delete_snapshot_cleanup_clone(ctx, 0); 6619 return; 6620 } 6621 6622 /* Temporarily override md_ro flag for snapshot for MD modification */ 6623 ctx->snapshot_md_ro = ctx->snapshot->md_ro; 6624 ctx->snapshot->md_ro = false; 6625 6626 /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ 6627 ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, 6628 sizeof(spdk_blob_id), true); 6629 if (ctx->bserrno != 0) { 6630 delete_snapshot_cleanup_clone(ctx, 0); 6631 return; 6632 } 6633 6634 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, 
ctx); 6635 } 6636 6637 static void 6638 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) 6639 { 6640 struct delete_snapshot_ctx *ctx = cb_arg; 6641 6642 if (bserrno) { 6643 SPDK_ERRLOG("Failed to open clone\n"); 6644 ctx->bserrno = bserrno; 6645 delete_snapshot_cleanup_snapshot(ctx, 0); 6646 return; 6647 } 6648 6649 ctx->clone = clone; 6650 6651 if (clone->locked_operation_in_progress) { 6652 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n"); 6653 ctx->bserrno = -EBUSY; 6654 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6655 return; 6656 } 6657 6658 clone->locked_operation_in_progress = true; 6659 6660 blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); 6661 } 6662 6663 static void 6664 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) 6665 { 6666 struct spdk_blob_list *snapshot_entry = NULL; 6667 struct spdk_blob_list *clone_entry = NULL; 6668 struct spdk_blob_list *snapshot_clone_entry = NULL; 6669 6670 /* Get snapshot entry for the snapshot we want to remove */ 6671 snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); 6672 6673 assert(snapshot_entry != NULL); 6674 6675 /* Get clone of the snapshot (at this point there can be only one clone) */ 6676 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6677 assert(snapshot_entry->clone_count == 1); 6678 assert(clone_entry != NULL); 6679 6680 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6681 * snapshot that we are removing */ 6682 blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, 6683 &snapshot_clone_entry); 6684 6685 spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); 6686 } 6687 6688 static void 6689 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) 6690 { 6691 spdk_bs_sequence_t *seq = cb_arg; 6692 struct spdk_blob_list *snapshot_entry = NULL; 6693 uint32_t page_num; 6694 6695 if (bserrno) { 6696 SPDK_ERRLOG("Failed to remove blob\n"); 6697 bs_sequence_finish(seq, bserrno); 6698 return; 6699 } 6700 6701 /* Remove snapshot from the list */ 6702 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 6703 if (snapshot_entry != NULL) { 6704 TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); 6705 free(snapshot_entry); 6706 } 6707 6708 page_num = bs_blobid_to_page(blob->id); 6709 spdk_bit_array_clear(blob->bs->used_blobids, page_num); 6710 blob->state = SPDK_BLOB_STATE_DIRTY; 6711 blob->active.num_pages = 0; 6712 blob_resize(blob, 0); 6713 6714 blob_persist(seq, blob, bs_delete_persist_cpl, blob); 6715 } 6716 6717 static int 6718 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) 6719 { 6720 struct spdk_blob_list *snapshot_entry = NULL; 6721 struct spdk_blob_list *clone_entry = NULL; 6722 struct spdk_blob *clone = NULL; 6723 bool has_one_clone = false; 6724 6725 /* Check if this is a snapshot with clones */ 6726 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 6727 if (snapshot_entry != NULL) { 6728 if (snapshot_entry->clone_count > 1) { 6729 SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); 6730 return -EBUSY; 6731 } else if (snapshot_entry->clone_count == 1) { 6732 has_one_clone = true; 6733 } 6734 } 6735 6736 /* Check if someone has this blob open (besides this delete context): 6737 * - open_ref = 1 - only this context opened blob, so it is ok to remove it 6738 * - open_ref <= 2 && has_one_clone = true - clone is 
holding snapshot 6739 * and that is ok, because we will update it accordingly */ 6740 if (blob->open_ref <= 2 && has_one_clone) { 6741 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6742 assert(clone_entry != NULL); 6743 clone = blob_lookup(blob->bs, clone_entry->id); 6744 6745 if (blob->open_ref == 2 && clone == NULL) { 6746 /* Clone is closed and someone else opened this blob */ 6747 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 6748 return -EBUSY; 6749 } 6750 6751 *update_clone = true; 6752 return 0; 6753 } 6754 6755 if (blob->open_ref > 1) { 6756 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 6757 return -EBUSY; 6758 } 6759 6760 assert(has_one_clone == false); 6761 *update_clone = false; 6762 return 0; 6763 } 6764 6765 static void 6766 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) 6767 { 6768 spdk_bs_sequence_t *seq = cb_arg; 6769 6770 bs_sequence_finish(seq, -ENOMEM); 6771 } 6772 6773 static void 6774 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) 6775 { 6776 spdk_bs_sequence_t *seq = cb_arg; 6777 struct delete_snapshot_ctx *ctx; 6778 bool update_clone = false; 6779 6780 if (bserrno != 0) { 6781 bs_sequence_finish(seq, bserrno); 6782 return; 6783 } 6784 6785 blob_verify_md_op(blob); 6786 6787 ctx = calloc(1, sizeof(*ctx)); 6788 if (ctx == NULL) { 6789 spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); 6790 return; 6791 } 6792 6793 ctx->snapshot = blob; 6794 ctx->cb_fn = bs_delete_blob_finish; 6795 ctx->cb_arg = seq; 6796 6797 /* Check if blob can be removed and if it is a snapshot with clone on top of it */ 6798 ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); 6799 if (ctx->bserrno) { 6800 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 6801 return; 6802 } 6803 6804 if (blob->locked_operation_in_progress) { 6805 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n"); 6806 ctx->bserrno = -EBUSY; 6807 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 6808 return; 6809 } 6810 6811 blob->locked_operation_in_progress = true; 6812 6813 /* 6814 * Remove the blob from the blob_store list now, to ensure it does not 6815 * get returned after this point by blob_lookup(). 
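 * If the deletion has to be rolled back, delete_snapshot_cleanup_snapshot() reinserts
 * the blob into this list and sets its open_blobids bit again.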
6816 */ 6817 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 6818 TAILQ_REMOVE(&blob->bs->blobs, blob, link); 6819 6820 if (update_clone) { 6821 /* This blob is a snapshot with active clone - update clone first */ 6822 update_clone_on_snapshot_deletion(blob, ctx); 6823 } else { 6824 /* This blob does not have any clones - just remove it */ 6825 bs_blob_list_remove(blob); 6826 bs_delete_blob_finish(seq, blob, 0); 6827 free(ctx); 6828 } 6829 } 6830 6831 void 6832 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 6833 spdk_blob_op_complete cb_fn, void *cb_arg) 6834 { 6835 struct spdk_bs_cpl cpl; 6836 spdk_bs_sequence_t *seq; 6837 6838 SPDK_DEBUGLOG(blob, "Deleting blob %" PRIu64 "\n", blobid); 6839 6840 assert(spdk_get_thread() == bs->md_thread); 6841 6842 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6843 cpl.u.blob_basic.cb_fn = cb_fn; 6844 cpl.u.blob_basic.cb_arg = cb_arg; 6845 6846 seq = bs_sequence_start(bs->md_channel, &cpl); 6847 if (!seq) { 6848 cb_fn(cb_arg, -ENOMEM); 6849 return; 6850 } 6851 6852 spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); 6853 } 6854 6855 /* END spdk_bs_delete_blob */ 6856 6857 /* START spdk_bs_open_blob */ 6858 6859 static void 6860 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6861 { 6862 struct spdk_blob *blob = cb_arg; 6863 struct spdk_blob *existing; 6864 6865 if (bserrno != 0) { 6866 blob_free(blob); 6867 seq->cpl.u.blob_handle.blob = NULL; 6868 bs_sequence_finish(seq, bserrno); 6869 return; 6870 } 6871 6872 existing = blob_lookup(blob->bs, blob->id); 6873 if (existing) { 6874 blob_free(blob); 6875 existing->open_ref++; 6876 seq->cpl.u.blob_handle.blob = existing; 6877 bs_sequence_finish(seq, 0); 6878 return; 6879 } 6880 6881 blob->open_ref++; 6882 6883 spdk_bit_array_set(blob->bs->open_blobids, blob->id); 6884 TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link); 6885 6886 bs_sequence_finish(seq, bserrno); 6887 } 6888 6889 static inline void 6890 blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst) 6891 { 6892 #define FIELD_OK(field) \ 6893 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 6894 6895 #define SET_FIELD(field) \ 6896 if (FIELD_OK(field)) { \ 6897 dst->field = src->field; \ 6898 } \ 6899 6900 SET_FIELD(clear_method); 6901 6902 dst->opts_size = src->opts_size; 6903 6904 /* You should not remove this statement, but need to update the assert statement 6905 * if you add a new field, and also add a corresponding SET_FIELD statement */ 6906 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 16, "Incorrect size"); 6907 6908 #undef FIELD_OK 6909 #undef SET_FIELD 6910 } 6911 6912 static void 6913 bs_open_blob(struct spdk_blob_store *bs, 6914 spdk_blob_id blobid, 6915 struct spdk_blob_open_opts *opts, 6916 spdk_blob_op_with_handle_complete cb_fn, 6917 void *cb_arg) 6918 { 6919 struct spdk_blob *blob; 6920 struct spdk_bs_cpl cpl; 6921 struct spdk_blob_open_opts opts_local; 6922 spdk_bs_sequence_t *seq; 6923 uint32_t page_num; 6924 6925 SPDK_DEBUGLOG(blob, "Opening blob %" PRIu64 "\n", blobid); 6926 assert(spdk_get_thread() == bs->md_thread); 6927 6928 page_num = bs_blobid_to_page(blobid); 6929 if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { 6930 /* Invalid blobid */ 6931 cb_fn(cb_arg, NULL, -ENOENT); 6932 return; 6933 } 6934 6935 blob = blob_lookup(bs, blobid); 6936 if (blob) { 6937 blob->open_ref++; 6938 cb_fn(cb_arg, blob, 0); 6939 return; 6940 } 6941 6942 blob = blob_alloc(bs, blobid); 6943 if (!blob) { 6944 
cb_fn(cb_arg, NULL, -ENOMEM); 6945 return; 6946 } 6947 6948 spdk_blob_open_opts_init(&opts_local, sizeof(opts_local)); 6949 if (opts) { 6950 blob_open_opts_copy(opts, &opts_local); 6951 } 6952 6953 blob->clear_method = opts_local.clear_method; 6954 6955 cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; 6956 cpl.u.blob_handle.cb_fn = cb_fn; 6957 cpl.u.blob_handle.cb_arg = cb_arg; 6958 cpl.u.blob_handle.blob = blob; 6959 6960 seq = bs_sequence_start(bs->md_channel, &cpl); 6961 if (!seq) { 6962 blob_free(blob); 6963 cb_fn(cb_arg, NULL, -ENOMEM); 6964 return; 6965 } 6966 6967 blob_load(seq, blob, bs_open_blob_cpl, blob); 6968 } 6969 6970 void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 6971 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 6972 { 6973 bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); 6974 } 6975 6976 void spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, 6977 struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 6978 { 6979 bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); 6980 } 6981 6982 /* END spdk_bs_open_blob */ 6983 6984 /* START spdk_blob_set_read_only */ 6985 int spdk_blob_set_read_only(struct spdk_blob *blob) 6986 { 6987 blob_verify_md_op(blob); 6988 6989 blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; 6990 6991 blob->state = SPDK_BLOB_STATE_DIRTY; 6992 return 0; 6993 } 6994 /* END spdk_blob_set_read_only */ 6995 6996 /* START spdk_blob_sync_md */ 6997 6998 static void 6999 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7000 { 7001 struct spdk_blob *blob = cb_arg; 7002 7003 if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 7004 blob->data_ro = true; 7005 blob->md_ro = true; 7006 } 7007 7008 bs_sequence_finish(seq, bserrno); 7009 } 7010 7011 static void 7012 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7013 { 7014 struct spdk_bs_cpl cpl; 7015 spdk_bs_sequence_t *seq; 7016 7017 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7018 cpl.u.blob_basic.cb_fn = cb_fn; 7019 cpl.u.blob_basic.cb_arg = cb_arg; 7020 7021 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7022 if (!seq) { 7023 cb_fn(cb_arg, -ENOMEM); 7024 return; 7025 } 7026 7027 blob_persist(seq, blob, blob_sync_md_cpl, blob); 7028 } 7029 7030 void 7031 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7032 { 7033 blob_verify_md_op(blob); 7034 7035 SPDK_DEBUGLOG(blob, "Syncing blob %" PRIu64 "\n", blob->id); 7036 7037 if (blob->md_ro) { 7038 assert(blob->state == SPDK_BLOB_STATE_CLEAN); 7039 cb_fn(cb_arg, 0); 7040 return; 7041 } 7042 7043 blob_sync_md(blob, cb_fn, cb_arg); 7044 } 7045 7046 /* END spdk_blob_sync_md */ 7047 7048 struct spdk_blob_insert_cluster_ctx { 7049 struct spdk_thread *thread; 7050 struct spdk_blob *blob; 7051 uint32_t cluster_num; /* cluster index in blob */ 7052 uint32_t cluster; /* cluster on disk */ 7053 uint32_t extent_page; /* extent page on disk */ 7054 int rc; 7055 spdk_blob_op_complete cb_fn; 7056 void *cb_arg; 7057 }; 7058 7059 static void 7060 blob_insert_cluster_msg_cpl(void *arg) 7061 { 7062 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7063 7064 ctx->cb_fn(ctx->cb_arg, ctx->rc); 7065 free(ctx); 7066 } 7067 7068 static void 7069 blob_insert_cluster_msg_cb(void *arg, int bserrno) 7070 { 7071 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7072 7073 ctx->rc = bserrno; 7074 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7075 } 7076 7077 static void 7078 blob_insert_new_ep_cb(void *arg, int 
bserrno) 7079 { 7080 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7081 uint32_t *extent_page; 7082 7083 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7084 *extent_page = ctx->extent_page; 7085 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7086 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7087 } 7088 7089 static void 7090 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7091 { 7092 struct spdk_blob_md_page *page = cb_arg; 7093 7094 bs_sequence_finish(seq, bserrno); 7095 spdk_free(page); 7096 } 7097 7098 static void 7099 blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 7100 spdk_blob_op_complete cb_fn, void *cb_arg) 7101 { 7102 spdk_bs_sequence_t *seq; 7103 struct spdk_bs_cpl cpl; 7104 struct spdk_blob_md_page *page = NULL; 7105 uint32_t page_count = 0; 7106 int rc; 7107 7108 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7109 cpl.u.blob_basic.cb_fn = cb_fn; 7110 cpl.u.blob_basic.cb_arg = cb_arg; 7111 7112 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7113 if (!seq) { 7114 cb_fn(cb_arg, -ENOMEM); 7115 return; 7116 } 7117 rc = blob_serialize_add_page(blob, &page, &page_count, &page); 7118 if (rc < 0) { 7119 bs_sequence_finish(seq, rc); 7120 return; 7121 } 7122 7123 blob_serialize_extent_page(blob, cluster_num, page); 7124 7125 page->crc = blob_md_page_calc_crc(page); 7126 7127 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); 7128 7129 bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), 7130 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 7131 blob_persist_extent_page_cpl, page); 7132 } 7133 7134 static void 7135 blob_insert_cluster_msg(void *arg) 7136 { 7137 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7138 uint32_t *extent_page; 7139 7140 ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); 7141 if (ctx->rc != 0) { 7142 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7143 return; 7144 } 7145 7146 if (ctx->blob->use_extent_table == false) { 7147 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */ 7148 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7149 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7150 return; 7151 } 7152 7153 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7154 if (*extent_page == 0) { 7155 /* Extent page requires allocation. 7156 * It was already claimed in the used_md_pages map and placed in ctx. */ 7157 assert(ctx->extent_page != 0); 7158 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7159 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, 7160 blob_insert_new_ep_cb, ctx); 7161 } else { 7162 /* It is possible for original thread to allocate extent page for 7163 * different cluster in the same extent page. In such case proceed with 7164 * updating the existing extent page, but release the additional one. */ 7165 if (ctx->extent_page != 0) { 7166 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7167 bs_release_md_page(ctx->blob->bs, ctx->extent_page); 7168 ctx->extent_page = 0; 7169 } 7170 /* Extent page already allocated. 7171 * Every cluster allocation, requires just an update of single extent page. 
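 * (Each extent page covers SPDK_EXTENTS_PER_EP clusters, so only the page holding
 * this cluster's entry needs to be rewritten.)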
*/ 7172 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, 7173 blob_insert_cluster_msg_cb, ctx); 7174 } 7175 } 7176 7177 static void 7178 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 7179 uint64_t cluster, uint32_t extent_page, spdk_blob_op_complete cb_fn, void *cb_arg) 7180 { 7181 struct spdk_blob_insert_cluster_ctx *ctx; 7182 7183 ctx = calloc(1, sizeof(*ctx)); 7184 if (ctx == NULL) { 7185 cb_fn(cb_arg, -ENOMEM); 7186 return; 7187 } 7188 7189 ctx->thread = spdk_get_thread(); 7190 ctx->blob = blob; 7191 ctx->cluster_num = cluster_num; 7192 ctx->cluster = cluster; 7193 ctx->extent_page = extent_page; 7194 ctx->cb_fn = cb_fn; 7195 ctx->cb_arg = cb_arg; 7196 7197 spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); 7198 } 7199 7200 /* START spdk_blob_close */ 7201 7202 static void 7203 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7204 { 7205 struct spdk_blob *blob = cb_arg; 7206 7207 if (bserrno == 0) { 7208 blob->open_ref--; 7209 if (blob->open_ref == 0) { 7210 /* 7211 * Blobs with active.num_pages == 0 are deleted blobs. 7212 * these blobs are removed from the blob_store list 7213 * when the deletion process starts - so don't try to 7214 * remove them again. 7215 */ 7216 if (blob->active.num_pages > 0) { 7217 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7218 TAILQ_REMOVE(&blob->bs->blobs, blob, link); 7219 } 7220 blob_free(blob); 7221 } 7222 } 7223 7224 bs_sequence_finish(seq, bserrno); 7225 } 7226 7227 void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7228 { 7229 struct spdk_bs_cpl cpl; 7230 spdk_bs_sequence_t *seq; 7231 7232 blob_verify_md_op(blob); 7233 7234 SPDK_DEBUGLOG(blob, "Closing blob %" PRIu64 "\n", blob->id); 7235 7236 if (blob->open_ref == 0) { 7237 cb_fn(cb_arg, -EBADF); 7238 return; 7239 } 7240 7241 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7242 cpl.u.blob_basic.cb_fn = cb_fn; 7243 cpl.u.blob_basic.cb_arg = cb_arg; 7244 7245 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7246 if (!seq) { 7247 cb_fn(cb_arg, -ENOMEM); 7248 return; 7249 } 7250 7251 /* Sync metadata */ 7252 blob_persist(seq, blob, blob_close_cpl, blob); 7253 } 7254 7255 /* END spdk_blob_close */ 7256 7257 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) 7258 { 7259 return spdk_get_io_channel(bs); 7260 } 7261 7262 void spdk_bs_free_io_channel(struct spdk_io_channel *channel) 7263 { 7264 spdk_put_io_channel(channel); 7265 } 7266 7267 void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, 7268 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7269 { 7270 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7271 SPDK_BLOB_UNMAP); 7272 } 7273 7274 void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, 7275 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7276 { 7277 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7278 SPDK_BLOB_WRITE_ZEROES); 7279 } 7280 7281 void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, 7282 void *payload, uint64_t offset, uint64_t length, 7283 spdk_blob_op_complete cb_fn, void *cb_arg) 7284 { 7285 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7286 SPDK_BLOB_WRITE); 7287 } 7288 7289 void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, 7290 void *payload, uint64_t offset, 
uint64_t length, 7291 spdk_blob_op_complete cb_fn, void *cb_arg) 7292 { 7293 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7294 SPDK_BLOB_READ); 7295 } 7296 7297 void spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel, 7298 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7299 spdk_blob_op_complete cb_fn, void *cb_arg) 7300 { 7301 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false); 7302 } 7303 7304 void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, 7305 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7306 spdk_blob_op_complete cb_fn, void *cb_arg) 7307 { 7308 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true); 7309 } 7310 7311 struct spdk_bs_iter_ctx { 7312 int64_t page_num; 7313 struct spdk_blob_store *bs; 7314 7315 spdk_blob_op_with_handle_complete cb_fn; 7316 void *cb_arg; 7317 }; 7318 7319 static void 7320 bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 7321 { 7322 struct spdk_bs_iter_ctx *ctx = cb_arg; 7323 struct spdk_blob_store *bs = ctx->bs; 7324 spdk_blob_id id; 7325 7326 if (bserrno == 0) { 7327 ctx->cb_fn(ctx->cb_arg, _blob, bserrno); 7328 free(ctx); 7329 return; 7330 } 7331 7332 ctx->page_num++; 7333 ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num); 7334 if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) { 7335 ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT); 7336 free(ctx); 7337 return; 7338 } 7339 7340 id = bs_page_to_blobid(ctx->page_num); 7341 7342 spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx); 7343 } 7344 7345 void 7346 spdk_bs_iter_first(struct spdk_blob_store *bs, 7347 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7348 { 7349 struct spdk_bs_iter_ctx *ctx; 7350 7351 ctx = calloc(1, sizeof(*ctx)); 7352 if (!ctx) { 7353 cb_fn(cb_arg, NULL, -ENOMEM); 7354 return; 7355 } 7356 7357 ctx->page_num = -1; 7358 ctx->bs = bs; 7359 ctx->cb_fn = cb_fn; 7360 ctx->cb_arg = cb_arg; 7361 7362 bs_iter_cpl(ctx, NULL, -1); 7363 } 7364 7365 static void 7366 bs_iter_close_cpl(void *cb_arg, int bserrno) 7367 { 7368 struct spdk_bs_iter_ctx *ctx = cb_arg; 7369 7370 bs_iter_cpl(ctx, NULL, -1); 7371 } 7372 7373 void 7374 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob, 7375 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7376 { 7377 struct spdk_bs_iter_ctx *ctx; 7378 7379 assert(blob != NULL); 7380 7381 ctx = calloc(1, sizeof(*ctx)); 7382 if (!ctx) { 7383 cb_fn(cb_arg, NULL, -ENOMEM); 7384 return; 7385 } 7386 7387 ctx->page_num = bs_blobid_to_page(blob->id); 7388 ctx->bs = bs; 7389 ctx->cb_fn = cb_fn; 7390 ctx->cb_arg = cb_arg; 7391 7392 /* Close the existing blob */ 7393 spdk_blob_close(blob, bs_iter_close_cpl, ctx); 7394 } 7395 7396 static int 7397 blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 7398 uint16_t value_len, bool internal) 7399 { 7400 struct spdk_xattr_tailq *xattrs; 7401 struct spdk_xattr *xattr; 7402 size_t desc_size; 7403 void *tmp; 7404 7405 blob_verify_md_op(blob); 7406 7407 if (blob->md_ro) { 7408 return -EPERM; 7409 } 7410 7411 desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len; 7412 if (desc_size > SPDK_BS_MAX_DESC_SIZE) { 7413 SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fix into single page %zu\n", name, 7414 desc_size, SPDK_BS_MAX_DESC_SIZE); 7415 return -ENOMEM; 7416 } 7417 7418 if (internal) { 7419 xattrs = 
&blob->xattrs_internal; 7420 blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR; 7421 } else { 7422 xattrs = &blob->xattrs; 7423 } 7424 7425 TAILQ_FOREACH(xattr, xattrs, link) { 7426 if (!strcmp(name, xattr->name)) { 7427 tmp = malloc(value_len); 7428 if (!tmp) { 7429 return -ENOMEM; 7430 } 7431 7432 free(xattr->value); 7433 xattr->value_len = value_len; 7434 xattr->value = tmp; 7435 memcpy(xattr->value, value, value_len); 7436 7437 blob->state = SPDK_BLOB_STATE_DIRTY; 7438 7439 return 0; 7440 } 7441 } 7442 7443 xattr = calloc(1, sizeof(*xattr)); 7444 if (!xattr) { 7445 return -ENOMEM; 7446 } 7447 7448 xattr->name = strdup(name); 7449 if (!xattr->name) { 7450 free(xattr); 7451 return -ENOMEM; 7452 } 7453 7454 xattr->value_len = value_len; 7455 xattr->value = malloc(value_len); 7456 if (!xattr->value) { 7457 free(xattr->name); 7458 free(xattr); 7459 return -ENOMEM; 7460 } 7461 memcpy(xattr->value, value, value_len); 7462 TAILQ_INSERT_TAIL(xattrs, xattr, link); 7463 7464 blob->state = SPDK_BLOB_STATE_DIRTY; 7465 7466 return 0; 7467 } 7468 7469 int 7470 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 7471 uint16_t value_len) 7472 { 7473 return blob_set_xattr(blob, name, value, value_len, false); 7474 } 7475 7476 static int 7477 blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal) 7478 { 7479 struct spdk_xattr_tailq *xattrs; 7480 struct spdk_xattr *xattr; 7481 7482 blob_verify_md_op(blob); 7483 7484 if (blob->md_ro) { 7485 return -EPERM; 7486 } 7487 xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; 7488 7489 TAILQ_FOREACH(xattr, xattrs, link) { 7490 if (!strcmp(name, xattr->name)) { 7491 TAILQ_REMOVE(xattrs, xattr, link); 7492 free(xattr->value); 7493 free(xattr->name); 7494 free(xattr); 7495 7496 if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) { 7497 blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR; 7498 } 7499 blob->state = SPDK_BLOB_STATE_DIRTY; 7500 7501 return 0; 7502 } 7503 } 7504 7505 return -ENOENT; 7506 } 7507 7508 int 7509 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name) 7510 { 7511 return blob_remove_xattr(blob, name, false); 7512 } 7513 7514 static int 7515 blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7516 const void **value, size_t *value_len, bool internal) 7517 { 7518 struct spdk_xattr *xattr; 7519 struct spdk_xattr_tailq *xattrs; 7520 7521 xattrs = internal ? 
&blob->xattrs_internal : &blob->xattrs; 7522 7523 TAILQ_FOREACH(xattr, xattrs, link) { 7524 if (!strcmp(name, xattr->name)) { 7525 *value = xattr->value; 7526 *value_len = xattr->value_len; 7527 return 0; 7528 } 7529 } 7530 return -ENOENT; 7531 } 7532 7533 int 7534 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7535 const void **value, size_t *value_len) 7536 { 7537 blob_verify_md_op(blob); 7538 7539 return blob_get_xattr_value(blob, name, value, value_len, false); 7540 } 7541 7542 struct spdk_xattr_names { 7543 uint32_t count; 7544 const char *names[0]; 7545 }; 7546 7547 static int 7548 blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names) 7549 { 7550 struct spdk_xattr *xattr; 7551 int count = 0; 7552 7553 TAILQ_FOREACH(xattr, xattrs, link) { 7554 count++; 7555 } 7556 7557 *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *)); 7558 if (*names == NULL) { 7559 return -ENOMEM; 7560 } 7561 7562 TAILQ_FOREACH(xattr, xattrs, link) { 7563 (*names)->names[(*names)->count++] = xattr->name; 7564 } 7565 7566 return 0; 7567 } 7568 7569 int 7570 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names) 7571 { 7572 blob_verify_md_op(blob); 7573 7574 return blob_get_xattr_names(&blob->xattrs, names); 7575 } 7576 7577 uint32_t 7578 spdk_xattr_names_get_count(struct spdk_xattr_names *names) 7579 { 7580 assert(names != NULL); 7581 7582 return names->count; 7583 } 7584 7585 const char * 7586 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index) 7587 { 7588 if (index >= names->count) { 7589 return NULL; 7590 } 7591 7592 return names->names[index]; 7593 } 7594 7595 void 7596 spdk_xattr_names_free(struct spdk_xattr_names *names) 7597 { 7598 free(names); 7599 } 7600 7601 struct spdk_bs_type 7602 spdk_bs_get_bstype(struct spdk_blob_store *bs) 7603 { 7604 return bs->bstype; 7605 } 7606 7607 void 7608 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype) 7609 { 7610 memcpy(&bs->bstype, &bstype, sizeof(bstype)); 7611 } 7612 7613 bool 7614 spdk_blob_is_read_only(struct spdk_blob *blob) 7615 { 7616 assert(blob != NULL); 7617 return (blob->data_ro || blob->md_ro); 7618 } 7619 7620 bool 7621 spdk_blob_is_snapshot(struct spdk_blob *blob) 7622 { 7623 struct spdk_blob_list *snapshot_entry; 7624 7625 assert(blob != NULL); 7626 7627 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7628 if (snapshot_entry == NULL) { 7629 return false; 7630 } 7631 7632 return true; 7633 } 7634 7635 bool 7636 spdk_blob_is_clone(struct spdk_blob *blob) 7637 { 7638 assert(blob != NULL); 7639 7640 if (blob->parent_id != SPDK_BLOBID_INVALID) { 7641 assert(spdk_blob_is_thin_provisioned(blob)); 7642 return true; 7643 } 7644 7645 return false; 7646 } 7647 7648 bool 7649 spdk_blob_is_thin_provisioned(struct spdk_blob *blob) 7650 { 7651 assert(blob != NULL); 7652 return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV); 7653 } 7654 7655 static void 7656 blob_update_clear_method(struct spdk_blob *blob) 7657 { 7658 enum blob_clear_method stored_cm; 7659 7660 assert(blob != NULL); 7661 7662 /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored 7663 * in metadata previously. If something other than the default was 7664 * specified, ignore stored value and used what was passed in. 
7665 */ 7666 stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT); 7667 7668 if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) { 7669 blob->clear_method = stored_cm; 7670 } else if (blob->clear_method != stored_cm) { 7671 SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n", 7672 blob->clear_method, stored_cm); 7673 } 7674 } 7675 7676 spdk_blob_id 7677 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id) 7678 { 7679 struct spdk_blob_list *snapshot_entry = NULL; 7680 struct spdk_blob_list *clone_entry = NULL; 7681 7682 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 7683 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 7684 if (clone_entry->id == blob_id) { 7685 return snapshot_entry->id; 7686 } 7687 } 7688 } 7689 7690 return SPDK_BLOBID_INVALID; 7691 } 7692 7693 int 7694 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids, 7695 size_t *count) 7696 { 7697 struct spdk_blob_list *snapshot_entry, *clone_entry; 7698 size_t n; 7699 7700 snapshot_entry = bs_get_snapshot_entry(bs, blobid); 7701 if (snapshot_entry == NULL) { 7702 *count = 0; 7703 return 0; 7704 } 7705 7706 if (ids == NULL || *count < snapshot_entry->clone_count) { 7707 *count = snapshot_entry->clone_count; 7708 return -ENOMEM; 7709 } 7710 *count = snapshot_entry->clone_count; 7711 7712 n = 0; 7713 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 7714 ids[n++] = clone_entry->id; 7715 } 7716 7717 return 0; 7718 } 7719 7720 SPDK_LOG_REGISTER_COMPONENT(blob) 7721
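
/*
 * Illustrative sketch, not part of the library: how a caller running on the blobstore
 * md thread might use the asynchronous snapshot API above. The my_app_* names and the
 * context structure are hypothetical; error handling is reduced to logging.
 *
 *	struct my_app_ctx {
 *		struct spdk_blob_store *bs;
 *	};
 *
 *	static void
 *	my_app_snapshot_done(void *cb_arg, spdk_blob_id snapshot_id, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("snapshot failed: %d\n", bserrno);
 *			return;
 *		}
 *		SPDK_NOTICELOG("created snapshot %" PRIu64 "\n", snapshot_id);
 *	}
 *
 *	static void
 *	my_app_take_snapshot(struct my_app_ctx *ctx, spdk_blob_id blobid)
 *	{
 *		// No xattrs are attached to the snapshot in this sketch.
 *		spdk_bs_create_snapshot(ctx->bs, blobid, NULL, my_app_snapshot_done, ctx);
 *	}
 */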
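
/*
 * Illustrative sketch, not part of the library: walking every blob with
 * spdk_bs_iter_first()/spdk_bs_iter_next(). Note that spdk_bs_iter_next() closes the
 * blob passed to it before opening the next one, so the callback must not close it
 * itself. The my_iter_* names are hypothetical.
 *
 *	static void
 *	my_iter_cb(void *cb_arg, struct spdk_blob *blob, int bserrno)
 *	{
 *		struct spdk_blob_store *bs = cb_arg;
 *
 *		if (bserrno == -ENOENT) {
 *			SPDK_NOTICELOG("iteration complete\n");
 *			return;
 *		} else if (bserrno != 0) {
 *			SPDK_ERRLOG("iteration failed: %d\n", bserrno);
 *			return;
 *		}
 *
 *		SPDK_NOTICELOG("blob %" PRIu64 ": %" PRIu64 " clusters\n",
 *			       spdk_blob_get_id(blob), spdk_blob_get_num_clusters(blob));
 *		spdk_bs_iter_next(bs, blob, my_iter_cb, bs);
 *	}
 *
 *	static void
 *	my_iter_start(struct spdk_blob_store *bs)
 *	{
 *		spdk_bs_iter_first(bs, my_iter_cb, bs);
 *	}
 */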
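
/*
 * Illustrative sketch, not part of the library: spdk_blob_set_xattr() and
 * spdk_blob_remove_xattr() only modify the in-memory blob and mark it dirty; the
 * change reaches the disk when the metadata is synced. The xattr name "owner" and
 * the my_* names are hypothetical.
 *
 *	static void
 *	my_xattr_synced(void *cb_arg, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("md sync failed: %d\n", bserrno);
 *		}
 *	}
 *
 *	static void
 *	my_tag_blob(struct spdk_blob *blob)
 *	{
 *		const char *owner = "my_app";
 *		int rc;
 *
 *		rc = spdk_blob_set_xattr(blob, "owner", owner, (uint16_t)(strlen(owner) + 1));
 *		if (rc != 0) {
 *			SPDK_ERRLOG("set_xattr failed: %d\n", rc);
 *			return;
 *		}
 *		spdk_blob_sync_md(blob, my_xattr_synced, NULL);
 *	}
 */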