1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/blob.h" 10 #include "spdk/crc32.h" 11 #include "spdk/env.h" 12 #include "spdk/queue.h" 13 #include "spdk/thread.h" 14 #include "spdk/bit_array.h" 15 #include "spdk/bit_pool.h" 16 #include "spdk/likely.h" 17 #include "spdk/util.h" 18 #include "spdk/string.h" 19 20 #include "spdk_internal/assert.h" 21 #include "spdk/log.h" 22 23 #include "blobstore.h" 24 25 #define BLOB_CRC32C_INITIAL 0xffffffffUL 26 27 static int bs_register_md_thread(struct spdk_blob_store *bs); 28 static int bs_unregister_md_thread(struct spdk_blob_store *bs); 29 static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno); 30 static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 31 uint64_t cluster, uint32_t extent, struct spdk_blob_md_page *page, 32 spdk_blob_op_complete cb_fn, void *cb_arg); 33 34 static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 35 uint16_t value_len, bool internal); 36 static int blob_get_xattr_value(struct spdk_blob *blob, const char *name, 37 const void **value, size_t *value_len, bool internal); 38 static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal); 39 40 static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 41 struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg); 42 43 static int 44 blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2) 45 { 46 return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id); 47 } 48 49 RB_GENERATE_STATIC(spdk_blob_tree, spdk_blob, link, blob_id_cmp); 50 51 static void 52 blob_verify_md_op(struct spdk_blob *blob) 53 { 54 assert(blob != NULL); 55 assert(spdk_get_thread() == blob->bs->md_thread); 56 assert(blob->state != SPDK_BLOB_STATE_LOADING); 57 } 58 59 static struct spdk_blob_list * 60 bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid) 61 { 62 struct spdk_blob_list *snapshot_entry = NULL; 63 64 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 65 if (snapshot_entry->id == blobid) { 66 break; 67 } 68 } 69 70 return snapshot_entry; 71 } 72 73 static void 74 bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page) 75 { 76 assert(spdk_spin_held(&bs->used_lock)); 77 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 78 assert(spdk_bit_array_get(bs->used_md_pages, page) == false); 79 80 spdk_bit_array_set(bs->used_md_pages, page); 81 } 82 83 static void 84 bs_release_md_page(struct spdk_blob_store *bs, uint32_t page) 85 { 86 assert(spdk_spin_held(&bs->used_lock)); 87 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 88 assert(spdk_bit_array_get(bs->used_md_pages, page) == true); 89 90 spdk_bit_array_clear(bs->used_md_pages, page); 91 } 92 93 static uint32_t 94 bs_claim_cluster(struct spdk_blob_store *bs) 95 { 96 uint32_t cluster_num; 97 98 assert(spdk_spin_held(&bs->used_lock)); 99 100 cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters); 101 if (cluster_num == UINT32_MAX) { 102 return UINT32_MAX; 103 } 104 105 SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num); 106 bs->num_free_clusters--; 107 108 return cluster_num; 109 } 110 111 static void 112 bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) 113 { 114 assert(spdk_spin_held(&bs->used_lock)); 115 
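	/* Releasing a cluster only updates the in-memory used_clusters bit pool and the
	 * free-cluster count; the index passed in must refer to a currently allocated cluster.
	 */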
assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters)); 116 assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true); 117 assert(bs->num_free_clusters < bs->total_clusters); 118 119 SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num); 120 121 spdk_bit_pool_free_bit(bs->used_clusters, cluster_num); 122 bs->num_free_clusters++; 123 } 124 125 static int 126 blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster) 127 { 128 uint64_t *cluster_lba = &blob->active.clusters[cluster_num]; 129 130 blob_verify_md_op(blob); 131 132 if (*cluster_lba != 0) { 133 return -EEXIST; 134 } 135 136 *cluster_lba = bs_cluster_to_lba(blob->bs, cluster); 137 return 0; 138 } 139 140 static int 141 bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, 142 uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map) 143 { 144 uint32_t *extent_page = 0; 145 146 assert(spdk_spin_held(&blob->bs->used_lock)); 147 148 *cluster = bs_claim_cluster(blob->bs); 149 if (*cluster == UINT32_MAX) { 150 /* No more free clusters. Cannot satisfy the request */ 151 return -ENOSPC; 152 } 153 154 if (blob->use_extent_table) { 155 extent_page = bs_cluster_to_extent_page(blob, cluster_num); 156 if (*extent_page == 0) { 157 /* Extent page shall never occupy md_page so start the search from 1 */ 158 if (*lowest_free_md_page == 0) { 159 *lowest_free_md_page = 1; 160 } 161 /* No extent_page is allocated for the cluster */ 162 *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, 163 *lowest_free_md_page); 164 if (*lowest_free_md_page == UINT32_MAX) { 165 /* No more free md pages. Cannot satisfy the request */ 166 bs_release_cluster(blob->bs, *cluster); 167 return -ENOSPC; 168 } 169 bs_claim_md_page(blob->bs, *lowest_free_md_page); 170 } 171 } 172 173 SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob 0x%" PRIx64 "\n", *cluster, 174 blob->id); 175 176 if (update_map) { 177 blob_insert_cluster(blob, cluster_num, *cluster); 178 if (blob->use_extent_table && *extent_page == 0) { 179 *extent_page = *lowest_free_md_page; 180 } 181 } 182 183 return 0; 184 } 185 186 static void 187 blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) 188 { 189 xattrs->count = 0; 190 xattrs->names = NULL; 191 xattrs->ctx = NULL; 192 xattrs->get_value = NULL; 193 } 194 195 void 196 spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size) 197 { 198 if (!opts) { 199 SPDK_ERRLOG("opts should not be NULL\n"); 200 return; 201 } 202 203 if (!opts_size) { 204 SPDK_ERRLOG("opts_size should not be zero value\n"); 205 return; 206 } 207 208 memset(opts, 0, opts_size); 209 opts->opts_size = opts_size; 210 211 #define FIELD_OK(field) \ 212 offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size 213 214 #define SET_FIELD(field, value) \ 215 if (FIELD_OK(field)) { \ 216 opts->field = value; \ 217 } \ 218 219 SET_FIELD(num_clusters, 0); 220 SET_FIELD(thin_provision, false); 221 SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT); 222 223 if (FIELD_OK(xattrs)) { 224 blob_xattrs_init(&opts->xattrs); 225 } 226 227 SET_FIELD(use_extent_table, true); 228 229 #undef FIELD_OK 230 #undef SET_FIELD 231 } 232 233 void 234 spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size) 235 { 236 if (!opts) { 237 SPDK_ERRLOG("opts should not be NULL\n"); 238 return; 239 } 240 241 if (!opts_size) { 242 SPDK_ERRLOG("opts_size should not be zero value\n"); 243 return; 244 } 245 246 memset(opts, 0, opts_size); 247 opts->opts_size = opts_size; 248 249 
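	/* Only fields that fall within the caller-supplied opts_size are initialized below,
	 * which keeps newer library code compatible with callers built against a smaller
	 * struct spdk_blob_open_opts. Illustrative caller-side sketch (not part of this file;
	 * the callback names are placeholders):
	 *
	 *   struct spdk_blob_open_opts opts;
	 *
	 *   spdk_blob_open_opts_init(&opts, sizeof(opts));
	 *   opts.clear_method = BLOB_CLEAR_WITH_NONE;
	 *   spdk_bs_open_blob_ext(bs, blobid, &opts, open_cb, cb_arg);
	 */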
#define FIELD_OK(field) \
	offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size

#define SET_FIELD(field, value) \
	if (FIELD_OK(field)) { \
		opts->field = value; \
	} \

	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);

#undef FIELD_OK
#undef SET_FIELD
}

static struct spdk_blob *
blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
{
	struct spdk_blob *blob;

	blob = calloc(1, sizeof(*blob));
	if (!blob) {
		return NULL;
	}

	blob->id = id;
	blob->bs = bs;

	blob->parent_id = SPDK_BLOBID_INVALID;

	blob->state = SPDK_BLOB_STATE_DIRTY;
	blob->extent_rle_found = false;
	blob->extent_table_found = false;
	blob->active.num_pages = 1;
	blob->active.pages = calloc(1, sizeof(*blob->active.pages));
	if (!blob->active.pages) {
		free(blob);
		return NULL;
	}

	blob->active.pages[0] = bs_blobid_to_page(id);

	TAILQ_INIT(&blob->xattrs);
	TAILQ_INIT(&blob->xattrs_internal);
	TAILQ_INIT(&blob->pending_persists);
	TAILQ_INIT(&blob->persists_to_complete);

	return blob;
}

static void
xattrs_free(struct spdk_xattr_tailq *xattrs)
{
	struct spdk_xattr *xattr, *xattr_tmp;

	TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
		TAILQ_REMOVE(xattrs, xattr, link);
		free(xattr->name);
		free(xattr->value);
		free(xattr);
	}
}

static void
blob_free(struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(TAILQ_EMPTY(&blob->pending_persists));
	assert(TAILQ_EMPTY(&blob->persists_to_complete));

	free(blob->active.extent_pages);
	free(blob->clean.extent_pages);
	free(blob->active.clusters);
	free(blob->clean.clusters);
	free(blob->active.pages);
	free(blob->clean.pages);

	xattrs_free(&blob->xattrs);
	xattrs_free(&blob->xattrs_internal);

	if (blob->back_bs_dev) {
		blob->back_bs_dev->destroy(blob->back_bs_dev);
	}

	free(blob);
}

struct freeze_io_ctx {
	struct spdk_bs_cpl cpl;
	struct spdk_blob *blob;
};

static void
blob_io_sync(struct spdk_io_channel_iter *i)
{
	spdk_for_each_channel_continue(i, 0);
}

static void
blob_execute_queued_io(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bs_request_set *set;
	struct spdk_bs_user_op_args *args;
	spdk_bs_user_op_t *op, *tmp;

	TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
		set = (struct spdk_bs_request_set *)op;
		args = &set->u.user_op;

		if (args->blob == ctx->blob) {
			TAILQ_REMOVE(&ch->queued_io, op, link);
			bs_user_op_execute(op);
		}
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
blob_io_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);

	free(ctx);
}

static void
blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct freeze_io_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
ctx->cpl.u.blob_basic.cb_arg = cb_arg; 393 ctx->blob = blob; 394 395 /* Freeze I/O on blob */ 396 blob->frozen_refcnt++; 397 398 if (blob->frozen_refcnt == 1) { 399 spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl); 400 } else { 401 cb_fn(cb_arg, 0); 402 free(ctx); 403 } 404 } 405 406 static void 407 blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 408 { 409 struct freeze_io_ctx *ctx; 410 411 ctx = calloc(1, sizeof(*ctx)); 412 if (!ctx) { 413 cb_fn(cb_arg, -ENOMEM); 414 return; 415 } 416 417 ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 418 ctx->cpl.u.blob_basic.cb_fn = cb_fn; 419 ctx->cpl.u.blob_basic.cb_arg = cb_arg; 420 ctx->blob = blob; 421 422 assert(blob->frozen_refcnt > 0); 423 424 blob->frozen_refcnt--; 425 426 if (blob->frozen_refcnt == 0) { 427 spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl); 428 } else { 429 cb_fn(cb_arg, 0); 430 free(ctx); 431 } 432 } 433 434 static int 435 blob_mark_clean(struct spdk_blob *blob) 436 { 437 uint32_t *extent_pages = NULL; 438 uint64_t *clusters = NULL; 439 uint32_t *pages = NULL; 440 441 assert(blob != NULL); 442 443 if (blob->active.num_extent_pages) { 444 assert(blob->active.extent_pages); 445 extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages)); 446 if (!extent_pages) { 447 return -ENOMEM; 448 } 449 memcpy(extent_pages, blob->active.extent_pages, 450 blob->active.num_extent_pages * sizeof(*extent_pages)); 451 } 452 453 if (blob->active.num_clusters) { 454 assert(blob->active.clusters); 455 clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters)); 456 if (!clusters) { 457 free(extent_pages); 458 return -ENOMEM; 459 } 460 memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); 461 } 462 463 if (blob->active.num_pages) { 464 assert(blob->active.pages); 465 pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages)); 466 if (!pages) { 467 free(extent_pages); 468 free(clusters); 469 return -ENOMEM; 470 } 471 memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 472 } 473 474 free(blob->clean.extent_pages); 475 free(blob->clean.clusters); 476 free(blob->clean.pages); 477 478 blob->clean.num_extent_pages = blob->active.num_extent_pages; 479 blob->clean.extent_pages = blob->active.extent_pages; 480 blob->clean.num_clusters = blob->active.num_clusters; 481 blob->clean.clusters = blob->active.clusters; 482 blob->clean.num_pages = blob->active.num_pages; 483 blob->clean.pages = blob->active.pages; 484 485 blob->active.extent_pages = extent_pages; 486 blob->active.clusters = clusters; 487 blob->active.pages = pages; 488 489 /* If the metadata was dirtied again while the metadata was being written to disk, 490 * we do not want to revert the DIRTY state back to CLEAN here. 
491 */ 492 if (blob->state == SPDK_BLOB_STATE_LOADING) { 493 blob->state = SPDK_BLOB_STATE_CLEAN; 494 } 495 496 return 0; 497 } 498 499 static int 500 blob_deserialize_xattr(struct spdk_blob *blob, 501 struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal) 502 { 503 struct spdk_xattr *xattr; 504 505 if (desc_xattr->length != sizeof(desc_xattr->name_length) + 506 sizeof(desc_xattr->value_length) + 507 desc_xattr->name_length + desc_xattr->value_length) { 508 return -EINVAL; 509 } 510 511 xattr = calloc(1, sizeof(*xattr)); 512 if (xattr == NULL) { 513 return -ENOMEM; 514 } 515 516 xattr->name = malloc(desc_xattr->name_length + 1); 517 if (xattr->name == NULL) { 518 free(xattr); 519 return -ENOMEM; 520 } 521 522 xattr->value = malloc(desc_xattr->value_length); 523 if (xattr->value == NULL) { 524 free(xattr->name); 525 free(xattr); 526 return -ENOMEM; 527 } 528 529 memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length); 530 xattr->name[desc_xattr->name_length] = '\0'; 531 xattr->value_len = desc_xattr->value_length; 532 memcpy(xattr->value, 533 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 534 desc_xattr->value_length); 535 536 TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link); 537 538 return 0; 539 } 540 541 542 static int 543 blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) 544 { 545 struct spdk_blob_md_descriptor *desc; 546 size_t cur_desc = 0; 547 void *tmp; 548 549 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 550 while (cur_desc < sizeof(page->descriptors)) { 551 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 552 if (desc->length == 0) { 553 /* If padding and length are 0, this terminates the page */ 554 break; 555 } 556 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 557 struct spdk_blob_md_descriptor_flags *desc_flags; 558 559 desc_flags = (struct spdk_blob_md_descriptor_flags *)desc; 560 561 if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) { 562 return -EINVAL; 563 } 564 565 if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) != 566 SPDK_BLOB_INVALID_FLAGS_MASK) { 567 return -EINVAL; 568 } 569 570 if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) != 571 SPDK_BLOB_DATA_RO_FLAGS_MASK) { 572 blob->data_ro = true; 573 blob->md_ro = true; 574 } 575 576 if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) != 577 SPDK_BLOB_MD_RO_FLAGS_MASK) { 578 blob->md_ro = true; 579 } 580 581 if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 582 blob->data_ro = true; 583 blob->md_ro = true; 584 } 585 586 blob->invalid_flags = desc_flags->invalid_flags; 587 blob->data_ro_flags = desc_flags->data_ro_flags; 588 blob->md_ro_flags = desc_flags->md_ro_flags; 589 590 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 591 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 592 unsigned int i, j; 593 unsigned int cluster_count = blob->active.num_clusters; 594 595 if (blob->extent_table_found) { 596 /* Extent Table already present in the md, 597 * both descriptors should never be at the same time. 
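			 * A blob's cluster map is stored either as EXTENT_RLE descriptors or as an
			 * EXTENT_TABLE with EXTENT_PAGE descriptors, never both, so finding both
			 * makes the metadata invalid.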
*/ 598 return -EINVAL; 599 } 600 blob->extent_rle_found = true; 601 602 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 603 604 if (desc_extent_rle->length == 0 || 605 (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) { 606 return -EINVAL; 607 } 608 609 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 610 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 611 if (desc_extent_rle->extents[i].cluster_idx != 0) { 612 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, 613 desc_extent_rle->extents[i].cluster_idx + j)) { 614 return -EINVAL; 615 } 616 } 617 cluster_count++; 618 } 619 } 620 621 if (cluster_count == 0) { 622 return -EINVAL; 623 } 624 tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters)); 625 if (tmp == NULL) { 626 return -ENOMEM; 627 } 628 blob->active.clusters = tmp; 629 blob->active.cluster_array_size = cluster_count; 630 631 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 632 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 633 if (desc_extent_rle->extents[i].cluster_idx != 0) { 634 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 635 desc_extent_rle->extents[i].cluster_idx + j); 636 } else if (spdk_blob_is_thin_provisioned(blob)) { 637 blob->active.clusters[blob->active.num_clusters++] = 0; 638 } else { 639 return -EINVAL; 640 } 641 } 642 } 643 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 644 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 645 uint32_t num_extent_pages = blob->active.num_extent_pages; 646 uint32_t i, j; 647 size_t extent_pages_length; 648 649 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 650 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 651 652 if (blob->extent_rle_found) { 653 /* This means that Extent RLE is present in MD, 654 * both should never be at the same time. */ 655 return -EINVAL; 656 } else if (blob->extent_table_found && 657 desc_extent_table->num_clusters != blob->remaining_clusters_in_et) { 658 /* Number of clusters in this ET does not match number 659 * from previously read EXTENT_TABLE. */ 660 return -EINVAL; 661 } 662 663 if (desc_extent_table->length == 0 || 664 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 665 return -EINVAL; 666 } 667 668 blob->extent_table_found = true; 669 670 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 671 num_extent_pages += desc_extent_table->extent_page[i].num_pages; 672 } 673 674 if (num_extent_pages > 0) { 675 tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t)); 676 if (tmp == NULL) { 677 return -ENOMEM; 678 } 679 blob->active.extent_pages = tmp; 680 } 681 blob->active.extent_pages_array_size = num_extent_pages; 682 683 blob->remaining_clusters_in_et = desc_extent_table->num_clusters; 684 685 /* Extent table entries contain md page numbers for extent pages. 686 * Zeroes represent unallocated extent pages, those are run-length-encoded. 
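			 * For example, an entry with page_idx == 0 and num_pages == 3 describes three
			 * consecutive unallocated extent pages, while entries for allocated extent
			 * pages always carry num_pages == 1.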
687 */ 688 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 689 if (desc_extent_table->extent_page[i].page_idx != 0) { 690 assert(desc_extent_table->extent_page[i].num_pages == 1); 691 blob->active.extent_pages[blob->active.num_extent_pages++] = 692 desc_extent_table->extent_page[i].page_idx; 693 } else if (spdk_blob_is_thin_provisioned(blob)) { 694 for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) { 695 blob->active.extent_pages[blob->active.num_extent_pages++] = 0; 696 } 697 } else { 698 return -EINVAL; 699 } 700 } 701 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 702 struct spdk_blob_md_descriptor_extent_page *desc_extent; 703 unsigned int i; 704 unsigned int cluster_count = 0; 705 size_t cluster_idx_length; 706 707 if (blob->extent_rle_found) { 708 /* This means that Extent RLE is present in MD, 709 * both should never be at the same time. */ 710 return -EINVAL; 711 } 712 713 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 714 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 715 716 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 717 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 718 return -EINVAL; 719 } 720 721 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 722 if (desc_extent->cluster_idx[i] != 0) { 723 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { 724 return -EINVAL; 725 } 726 } 727 cluster_count++; 728 } 729 730 if (cluster_count == 0) { 731 return -EINVAL; 732 } 733 734 /* When reading extent pages sequentially starting cluster idx should match 735 * current size of a blob. 736 * If changed to batch reading, this check shall be removed. */ 737 if (desc_extent->start_cluster_idx != blob->active.num_clusters) { 738 return -EINVAL; 739 } 740 741 tmp = realloc(blob->active.clusters, 742 (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters)); 743 if (tmp == NULL) { 744 return -ENOMEM; 745 } 746 blob->active.clusters = tmp; 747 blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters); 748 749 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 750 if (desc_extent->cluster_idx[i] != 0) { 751 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 752 desc_extent->cluster_idx[i]); 753 } else if (spdk_blob_is_thin_provisioned(blob)) { 754 blob->active.clusters[blob->active.num_clusters++] = 0; 755 } else { 756 return -EINVAL; 757 } 758 } 759 assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters); 760 assert(blob->remaining_clusters_in_et >= cluster_count); 761 blob->remaining_clusters_in_et -= cluster_count; 762 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 763 int rc; 764 765 rc = blob_deserialize_xattr(blob, 766 (struct spdk_blob_md_descriptor_xattr *) desc, false); 767 if (rc != 0) { 768 return rc; 769 } 770 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 771 int rc; 772 773 rc = blob_deserialize_xattr(blob, 774 (struct spdk_blob_md_descriptor_xattr *) desc, true); 775 if (rc != 0) { 776 return rc; 777 } 778 } else { 779 /* Unrecognized descriptor type. Do not fail - just continue to the 780 * next descriptor. 
If this descriptor is associated with some feature 781 * defined in a newer version of blobstore, that version of blobstore 782 * should create and set an associated feature flag to specify if this 783 * blob can be loaded or not. 784 */ 785 } 786 787 /* Advance to the next descriptor */ 788 cur_desc += sizeof(*desc) + desc->length; 789 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 790 break; 791 } 792 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 793 } 794 795 return 0; 796 } 797 798 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page); 799 800 static int 801 blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob) 802 { 803 assert(blob != NULL); 804 assert(blob->state == SPDK_BLOB_STATE_LOADING); 805 806 if (bs_load_cur_extent_page_valid(extent_page) == false) { 807 return -ENOENT; 808 } 809 810 return blob_parse_page(extent_page, blob); 811 } 812 813 static int 814 blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count, 815 struct spdk_blob *blob) 816 { 817 const struct spdk_blob_md_page *page; 818 uint32_t i; 819 int rc; 820 void *tmp; 821 822 assert(page_count > 0); 823 assert(pages[0].sequence_num == 0); 824 assert(blob != NULL); 825 assert(blob->state == SPDK_BLOB_STATE_LOADING); 826 assert(blob->active.clusters == NULL); 827 828 /* The blobid provided doesn't match what's in the MD, this can 829 * happen for example if a bogus blobid is passed in through open. 830 */ 831 if (blob->id != pages[0].id) { 832 SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n", 833 blob->id, pages[0].id); 834 return -ENOENT; 835 } 836 837 tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages)); 838 if (!tmp) { 839 return -ENOMEM; 840 } 841 blob->active.pages = tmp; 842 843 blob->active.pages[0] = pages[0].id; 844 845 for (i = 1; i < page_count; i++) { 846 assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next)); 847 blob->active.pages[i] = pages[i - 1].next; 848 } 849 blob->active.num_pages = page_count; 850 851 for (i = 0; i < page_count; i++) { 852 page = &pages[i]; 853 854 assert(page->id == blob->id); 855 assert(page->sequence_num == i); 856 857 rc = blob_parse_page(page, blob); 858 if (rc != 0) { 859 return rc; 860 } 861 } 862 863 return 0; 864 } 865 866 static int 867 blob_serialize_add_page(const struct spdk_blob *blob, 868 struct spdk_blob_md_page **pages, 869 uint32_t *page_count, 870 struct spdk_blob_md_page **last_page) 871 { 872 struct spdk_blob_md_page *page, *tmp_pages; 873 874 assert(pages != NULL); 875 assert(page_count != NULL); 876 877 *last_page = NULL; 878 if (*page_count == 0) { 879 assert(*pages == NULL); 880 *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0, 881 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 882 if (*pages == NULL) { 883 return -ENOMEM; 884 } 885 *page_count = 1; 886 } else { 887 assert(*pages != NULL); 888 tmp_pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count + 1), 0); 889 if (tmp_pages == NULL) { 890 return -ENOMEM; 891 } 892 (*page_count)++; 893 *pages = tmp_pages; 894 } 895 896 page = &(*pages)[*page_count - 1]; 897 memset(page, 0, sizeof(*page)); 898 page->id = blob->id; 899 page->sequence_num = *page_count - 1; 900 page->next = SPDK_INVALID_MD_PAGE; 901 *last_page = page; 902 903 return 0; 904 } 905 906 /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor. 907 * Update required_sz on both success and failure. 
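 * The on-disk layout produced here is: descriptor header (type, length), then
 * name_length and value_length, followed by the raw name bytes and the value bytes.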
908 * 909 */ 910 static int 911 blob_serialize_xattr(const struct spdk_xattr *xattr, 912 uint8_t *buf, size_t buf_sz, 913 size_t *required_sz, bool internal) 914 { 915 struct spdk_blob_md_descriptor_xattr *desc; 916 917 *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) + 918 strlen(xattr->name) + 919 xattr->value_len; 920 921 if (buf_sz < *required_sz) { 922 return -1; 923 } 924 925 desc = (struct spdk_blob_md_descriptor_xattr *)buf; 926 927 desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR; 928 desc->length = sizeof(desc->name_length) + 929 sizeof(desc->value_length) + 930 strlen(xattr->name) + 931 xattr->value_len; 932 desc->name_length = strlen(xattr->name); 933 desc->value_length = xattr->value_len; 934 935 memcpy(desc->name, xattr->name, desc->name_length); 936 memcpy((void *)((uintptr_t)desc->name + desc->name_length), 937 xattr->value, 938 desc->value_length); 939 940 return 0; 941 } 942 943 static void 944 blob_serialize_extent_table_entry(const struct spdk_blob *blob, 945 uint64_t start_ep, uint64_t *next_ep, 946 uint8_t **buf, size_t *remaining_sz) 947 { 948 struct spdk_blob_md_descriptor_extent_table *desc; 949 size_t cur_sz; 950 uint64_t i, et_idx; 951 uint32_t extent_page, ep_len; 952 953 /* The buffer must have room for at least num_clusters entry */ 954 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters); 955 if (*remaining_sz < cur_sz) { 956 *next_ep = start_ep; 957 return; 958 } 959 960 desc = (struct spdk_blob_md_descriptor_extent_table *)*buf; 961 desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE; 962 963 desc->num_clusters = blob->active.num_clusters; 964 965 ep_len = 1; 966 et_idx = 0; 967 for (i = start_ep; i < blob->active.num_extent_pages; i++) { 968 if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) { 969 /* If we ran out of buffer space, return */ 970 break; 971 } 972 973 extent_page = blob->active.extent_pages[i]; 974 /* Verify that next extent_page is unallocated */ 975 if (extent_page == 0 && 976 (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) { 977 ep_len++; 978 continue; 979 } 980 desc->extent_page[et_idx].page_idx = extent_page; 981 desc->extent_page[et_idx].num_pages = ep_len; 982 et_idx++; 983 984 ep_len = 1; 985 cur_sz += sizeof(desc->extent_page[et_idx]); 986 } 987 *next_ep = i; 988 989 desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx; 990 *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length; 991 *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length; 992 } 993 994 static int 995 blob_serialize_extent_table(const struct spdk_blob *blob, 996 struct spdk_blob_md_page **pages, 997 struct spdk_blob_md_page *cur_page, 998 uint32_t *page_count, uint8_t **buf, 999 size_t *remaining_sz) 1000 { 1001 uint64_t last_extent_page; 1002 int rc; 1003 1004 last_extent_page = 0; 1005 /* At least single extent table entry has to be always persisted. 1006 * Such case occurs with num_extent_pages == 0. 
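	 * That is also why the loop condition below uses <= rather than <.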
*/ 1007 while (last_extent_page <= blob->active.num_extent_pages) { 1008 blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf, 1009 remaining_sz); 1010 1011 if (last_extent_page == blob->active.num_extent_pages) { 1012 break; 1013 } 1014 1015 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1016 if (rc < 0) { 1017 return rc; 1018 } 1019 1020 *buf = (uint8_t *)cur_page->descriptors; 1021 *remaining_sz = sizeof(cur_page->descriptors); 1022 } 1023 1024 return 0; 1025 } 1026 1027 static void 1028 blob_serialize_extent_rle(const struct spdk_blob *blob, 1029 uint64_t start_cluster, uint64_t *next_cluster, 1030 uint8_t **buf, size_t *buf_sz) 1031 { 1032 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 1033 size_t cur_sz; 1034 uint64_t i, extent_idx; 1035 uint64_t lba, lba_per_cluster, lba_count; 1036 1037 /* The buffer must have room for at least one extent */ 1038 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]); 1039 if (*buf_sz < cur_sz) { 1040 *next_cluster = start_cluster; 1041 return; 1042 } 1043 1044 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf; 1045 desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE; 1046 1047 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1048 1049 lba = blob->active.clusters[start_cluster]; 1050 lba_count = lba_per_cluster; 1051 extent_idx = 0; 1052 for (i = start_cluster + 1; i < blob->active.num_clusters; i++) { 1053 if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) { 1054 /* Run-length encode sequential non-zero LBA */ 1055 lba_count += lba_per_cluster; 1056 continue; 1057 } else if (lba == 0 && blob->active.clusters[i] == 0) { 1058 /* Run-length encode unallocated clusters */ 1059 lba_count += lba_per_cluster; 1060 continue; 1061 } 1062 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1063 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1064 extent_idx++; 1065 1066 cur_sz += sizeof(desc_extent_rle->extents[extent_idx]); 1067 1068 if (*buf_sz < cur_sz) { 1069 /* If we ran out of buffer space, return */ 1070 *next_cluster = i; 1071 break; 1072 } 1073 1074 lba = blob->active.clusters[i]; 1075 lba_count = lba_per_cluster; 1076 } 1077 1078 if (*buf_sz >= cur_sz) { 1079 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1080 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1081 extent_idx++; 1082 1083 *next_cluster = blob->active.num_clusters; 1084 } 1085 1086 desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx; 1087 *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1088 *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1089 } 1090 1091 static int 1092 blob_serialize_extents_rle(const struct spdk_blob *blob, 1093 struct spdk_blob_md_page **pages, 1094 struct spdk_blob_md_page *cur_page, 1095 uint32_t *page_count, uint8_t **buf, 1096 size_t *remaining_sz) 1097 { 1098 uint64_t last_cluster; 1099 int rc; 1100 1101 last_cluster = 0; 1102 while (last_cluster < blob->active.num_clusters) { 1103 blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz); 1104 1105 if (last_cluster == blob->active.num_clusters) { 1106 break; 1107 } 1108 1109 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1110 if (rc < 0) { 1111 return rc; 1112 } 1113 1114 *buf = (uint8_t *)cur_page->descriptors; 1115 *remaining_sz = sizeof(cur_page->descriptors); 
1116 } 1117 1118 return 0; 1119 } 1120 1121 static void 1122 blob_serialize_extent_page(const struct spdk_blob *blob, 1123 uint64_t cluster, struct spdk_blob_md_page *page) 1124 { 1125 struct spdk_blob_md_descriptor_extent_page *desc_extent; 1126 uint64_t i, extent_idx; 1127 uint64_t lba, lba_per_cluster; 1128 uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP; 1129 1130 desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors; 1131 desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE; 1132 1133 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1134 1135 desc_extent->start_cluster_idx = start_cluster_idx; 1136 extent_idx = 0; 1137 for (i = start_cluster_idx; i < blob->active.num_clusters; i++) { 1138 lba = blob->active.clusters[i]; 1139 desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster; 1140 if (extent_idx >= SPDK_EXTENTS_PER_EP) { 1141 break; 1142 } 1143 } 1144 desc_extent->length = sizeof(desc_extent->start_cluster_idx) + 1145 sizeof(desc_extent->cluster_idx[0]) * extent_idx; 1146 } 1147 1148 static void 1149 blob_serialize_flags(const struct spdk_blob *blob, 1150 uint8_t *buf, size_t *buf_sz) 1151 { 1152 struct spdk_blob_md_descriptor_flags *desc; 1153 1154 /* 1155 * Flags get serialized first, so we should always have room for the flags 1156 * descriptor. 1157 */ 1158 assert(*buf_sz >= sizeof(*desc)); 1159 1160 desc = (struct spdk_blob_md_descriptor_flags *)buf; 1161 desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS; 1162 desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor); 1163 desc->invalid_flags = blob->invalid_flags; 1164 desc->data_ro_flags = blob->data_ro_flags; 1165 desc->md_ro_flags = blob->md_ro_flags; 1166 1167 *buf_sz -= sizeof(*desc); 1168 } 1169 1170 static int 1171 blob_serialize_xattrs(const struct spdk_blob *blob, 1172 const struct spdk_xattr_tailq *xattrs, bool internal, 1173 struct spdk_blob_md_page **pages, 1174 struct spdk_blob_md_page *cur_page, 1175 uint32_t *page_count, uint8_t **buf, 1176 size_t *remaining_sz) 1177 { 1178 const struct spdk_xattr *xattr; 1179 int rc; 1180 1181 TAILQ_FOREACH(xattr, xattrs, link) { 1182 size_t required_sz = 0; 1183 1184 rc = blob_serialize_xattr(xattr, 1185 *buf, *remaining_sz, 1186 &required_sz, internal); 1187 if (rc < 0) { 1188 /* Need to add a new page to the chain */ 1189 rc = blob_serialize_add_page(blob, pages, page_count, 1190 &cur_page); 1191 if (rc < 0) { 1192 spdk_free(*pages); 1193 *pages = NULL; 1194 *page_count = 0; 1195 return rc; 1196 } 1197 1198 *buf = (uint8_t *)cur_page->descriptors; 1199 *remaining_sz = sizeof(cur_page->descriptors); 1200 1201 /* Try again */ 1202 required_sz = 0; 1203 rc = blob_serialize_xattr(xattr, 1204 *buf, *remaining_sz, 1205 &required_sz, internal); 1206 1207 if (rc < 0) { 1208 spdk_free(*pages); 1209 *pages = NULL; 1210 *page_count = 0; 1211 return rc; 1212 } 1213 } 1214 1215 *remaining_sz -= required_sz; 1216 *buf += required_sz; 1217 } 1218 1219 return 0; 1220 } 1221 1222 static int 1223 blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages, 1224 uint32_t *page_count) 1225 { 1226 struct spdk_blob_md_page *cur_page; 1227 int rc; 1228 uint8_t *buf; 1229 size_t remaining_sz; 1230 1231 assert(pages != NULL); 1232 assert(page_count != NULL); 1233 assert(blob != NULL); 1234 assert(blob->state == SPDK_BLOB_STATE_DIRTY); 1235 1236 *pages = NULL; 1237 *page_count = 0; 1238 1239 /* A blob always has at least 1 page, even if it has no descriptors */ 1240 rc = blob_serialize_add_page(blob, 
pages, page_count, &cur_page); 1241 if (rc < 0) { 1242 return rc; 1243 } 1244 1245 buf = (uint8_t *)cur_page->descriptors; 1246 remaining_sz = sizeof(cur_page->descriptors); 1247 1248 /* Serialize flags */ 1249 blob_serialize_flags(blob, buf, &remaining_sz); 1250 buf += sizeof(struct spdk_blob_md_descriptor_flags); 1251 1252 /* Serialize xattrs */ 1253 rc = blob_serialize_xattrs(blob, &blob->xattrs, false, 1254 pages, cur_page, page_count, &buf, &remaining_sz); 1255 if (rc < 0) { 1256 return rc; 1257 } 1258 1259 /* Serialize internal xattrs */ 1260 rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true, 1261 pages, cur_page, page_count, &buf, &remaining_sz); 1262 if (rc < 0) { 1263 return rc; 1264 } 1265 1266 if (blob->use_extent_table) { 1267 /* Serialize extent table */ 1268 rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1269 } else { 1270 /* Serialize extents */ 1271 rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1272 } 1273 1274 return rc; 1275 } 1276 1277 struct spdk_blob_load_ctx { 1278 struct spdk_blob *blob; 1279 1280 struct spdk_blob_md_page *pages; 1281 uint32_t num_pages; 1282 uint32_t next_extent_page; 1283 spdk_bs_sequence_t *seq; 1284 1285 spdk_bs_sequence_cpl cb_fn; 1286 void *cb_arg; 1287 }; 1288 1289 static uint32_t 1290 blob_md_page_calc_crc(void *page) 1291 { 1292 uint32_t crc; 1293 1294 crc = BLOB_CRC32C_INITIAL; 1295 crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc); 1296 crc ^= BLOB_CRC32C_INITIAL; 1297 1298 return crc; 1299 1300 } 1301 1302 static void 1303 blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno) 1304 { 1305 struct spdk_blob *blob = ctx->blob; 1306 1307 if (bserrno == 0) { 1308 blob_mark_clean(blob); 1309 } 1310 1311 ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno); 1312 1313 /* Free the memory */ 1314 spdk_free(ctx->pages); 1315 free(ctx); 1316 } 1317 1318 static void 1319 blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno) 1320 { 1321 struct spdk_blob_load_ctx *ctx = cb_arg; 1322 struct spdk_blob *blob = ctx->blob; 1323 1324 if (bserrno == 0) { 1325 blob->back_bs_dev = bs_create_blob_bs_dev(snapshot); 1326 if (blob->back_bs_dev == NULL) { 1327 bserrno = -ENOMEM; 1328 } 1329 } 1330 if (bserrno != 0) { 1331 SPDK_ERRLOG("Snapshot fail\n"); 1332 } 1333 1334 blob_load_final(ctx, bserrno); 1335 } 1336 1337 static void blob_update_clear_method(struct spdk_blob *blob); 1338 1339 static void 1340 blob_load_backing_dev(void *cb_arg) 1341 { 1342 struct spdk_blob_load_ctx *ctx = cb_arg; 1343 struct spdk_blob *blob = ctx->blob; 1344 const void *value; 1345 size_t len; 1346 int rc; 1347 1348 if (spdk_blob_is_thin_provisioned(blob)) { 1349 rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true); 1350 if (rc == 0) { 1351 if (len != sizeof(spdk_blob_id)) { 1352 blob_load_final(ctx, -EINVAL); 1353 return; 1354 } 1355 /* open snapshot blob and continue in the callback function */ 1356 blob->parent_id = *(spdk_blob_id *)value; 1357 spdk_bs_open_blob(blob->bs, blob->parent_id, 1358 blob_load_snapshot_cpl, ctx); 1359 return; 1360 } else { 1361 /* add zeroes_dev for thin provisioned blob */ 1362 blob->back_bs_dev = bs_create_zeroes_dev(); 1363 } 1364 } else { 1365 /* standard blob */ 1366 blob->back_bs_dev = NULL; 1367 } 1368 blob_load_final(ctx, 0); 1369 } 1370 1371 static void 1372 blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1373 { 1374 struct spdk_blob_load_ctx *ctx = cb_arg; 1375 struct spdk_blob 
*blob = ctx->blob; 1376 struct spdk_blob_md_page *page; 1377 uint64_t i; 1378 uint32_t crc; 1379 uint64_t lba; 1380 void *tmp; 1381 uint64_t sz; 1382 1383 if (bserrno) { 1384 SPDK_ERRLOG("Extent page read failed: %d\n", bserrno); 1385 blob_load_final(ctx, bserrno); 1386 return; 1387 } 1388 1389 if (ctx->pages == NULL) { 1390 /* First iteration of this function, allocate buffer for single EXTENT_PAGE */ 1391 ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 1392 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 1393 if (!ctx->pages) { 1394 blob_load_final(ctx, -ENOMEM); 1395 return; 1396 } 1397 ctx->num_pages = 1; 1398 ctx->next_extent_page = 0; 1399 } else { 1400 page = &ctx->pages[0]; 1401 crc = blob_md_page_calc_crc(page); 1402 if (crc != page->crc) { 1403 blob_load_final(ctx, -EINVAL); 1404 return; 1405 } 1406 1407 if (page->next != SPDK_INVALID_MD_PAGE) { 1408 blob_load_final(ctx, -EINVAL); 1409 return; 1410 } 1411 1412 bserrno = blob_parse_extent_page(page, blob); 1413 if (bserrno) { 1414 blob_load_final(ctx, bserrno); 1415 return; 1416 } 1417 } 1418 1419 for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { 1420 if (blob->active.extent_pages[i] != 0) { 1421 /* Extent page was allocated, read and parse it. */ 1422 lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]); 1423 ctx->next_extent_page = i + 1; 1424 1425 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1426 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 1427 blob_load_cpl_extents_cpl, ctx); 1428 return; 1429 } else { 1430 /* Thin provisioned blobs can point to unallocated extent pages. 1431 * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */ 1432 1433 sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP); 1434 blob->active.num_clusters += sz; 1435 blob->remaining_clusters_in_et -= sz; 1436 1437 assert(spdk_blob_is_thin_provisioned(blob)); 1438 assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0); 1439 1440 tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); 1441 if (tmp == NULL) { 1442 blob_load_final(ctx, -ENOMEM); 1443 return; 1444 } 1445 memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0, 1446 sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size)); 1447 blob->active.clusters = tmp; 1448 blob->active.cluster_array_size = blob->active.num_clusters; 1449 } 1450 } 1451 1452 blob_load_backing_dev(ctx); 1453 } 1454 1455 static void 1456 blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1457 { 1458 struct spdk_blob_load_ctx *ctx = cb_arg; 1459 struct spdk_blob *blob = ctx->blob; 1460 struct spdk_blob_md_page *page; 1461 int rc; 1462 uint32_t crc; 1463 uint32_t current_page; 1464 1465 if (ctx->num_pages == 1) { 1466 current_page = bs_blobid_to_page(blob->id); 1467 } else { 1468 assert(ctx->num_pages != 0); 1469 page = &ctx->pages[ctx->num_pages - 2]; 1470 current_page = page->next; 1471 } 1472 1473 if (bserrno) { 1474 SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n", 1475 current_page, blob->id, bserrno); 1476 blob_load_final(ctx, bserrno); 1477 return; 1478 } 1479 1480 page = &ctx->pages[ctx->num_pages - 1]; 1481 crc = blob_md_page_calc_crc(page); 1482 if (crc != page->crc) { 1483 SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n", 1484 current_page, blob->id); 1485 blob_load_final(ctx, -EINVAL); 1486 return; 1487 } 1488 1489 if (page->next != 
SPDK_INVALID_MD_PAGE) { 1490 struct spdk_blob_md_page *tmp_pages; 1491 uint32_t next_page = page->next; 1492 uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page); 1493 1494 /* Read the next page */ 1495 tmp_pages = spdk_realloc(ctx->pages, (sizeof(*page) * (ctx->num_pages + 1)), 0); 1496 if (tmp_pages == NULL) { 1497 blob_load_final(ctx, -ENOMEM); 1498 return; 1499 } 1500 ctx->num_pages++; 1501 ctx->pages = tmp_pages; 1502 1503 bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1], 1504 next_lba, 1505 bs_byte_to_lba(blob->bs, sizeof(*page)), 1506 blob_load_cpl, ctx); 1507 return; 1508 } 1509 1510 /* Parse the pages */ 1511 rc = blob_parse(ctx->pages, ctx->num_pages, blob); 1512 if (rc) { 1513 blob_load_final(ctx, rc); 1514 return; 1515 } 1516 1517 if (blob->extent_table_found == true) { 1518 /* If EXTENT_TABLE was found, that means support for it should be enabled. */ 1519 assert(blob->extent_rle_found == false); 1520 blob->use_extent_table = true; 1521 } else { 1522 /* If EXTENT_RLE or no extent_* descriptor was found disable support 1523 * for extent table. No extent_* descriptors means that blob has length of 0 1524 * and no extent_rle descriptors were persisted for it. 1525 * EXTENT_TABLE if used, is always present in metadata regardless of length. */ 1526 blob->use_extent_table = false; 1527 } 1528 1529 /* Check the clear_method stored in metadata vs what may have been passed 1530 * via spdk_bs_open_blob_ext() and update accordingly. 1531 */ 1532 blob_update_clear_method(blob); 1533 1534 spdk_free(ctx->pages); 1535 ctx->pages = NULL; 1536 1537 if (blob->extent_table_found) { 1538 blob_load_cpl_extents_cpl(seq, ctx, 0); 1539 } else { 1540 blob_load_backing_dev(ctx); 1541 } 1542 } 1543 1544 /* Load a blob from disk given a blobid */ 1545 static void 1546 blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 1547 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 1548 { 1549 struct spdk_blob_load_ctx *ctx; 1550 struct spdk_blob_store *bs; 1551 uint32_t page_num; 1552 uint64_t lba; 1553 1554 blob_verify_md_op(blob); 1555 1556 bs = blob->bs; 1557 1558 ctx = calloc(1, sizeof(*ctx)); 1559 if (!ctx) { 1560 cb_fn(seq, cb_arg, -ENOMEM); 1561 return; 1562 } 1563 1564 ctx->blob = blob; 1565 ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0); 1566 if (!ctx->pages) { 1567 free(ctx); 1568 cb_fn(seq, cb_arg, -ENOMEM); 1569 return; 1570 } 1571 ctx->num_pages = 1; 1572 ctx->cb_fn = cb_fn; 1573 ctx->cb_arg = cb_arg; 1574 ctx->seq = seq; 1575 1576 page_num = bs_blobid_to_page(blob->id); 1577 lba = bs_md_page_to_lba(blob->bs, page_num); 1578 1579 blob->state = SPDK_BLOB_STATE_LOADING; 1580 1581 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1582 bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), 1583 blob_load_cpl, ctx); 1584 } 1585 1586 struct spdk_blob_persist_ctx { 1587 struct spdk_blob *blob; 1588 1589 struct spdk_bs_super_block *super; 1590 1591 struct spdk_blob_md_page *pages; 1592 uint32_t next_extent_page; 1593 struct spdk_blob_md_page *extent_page; 1594 1595 spdk_bs_sequence_t *seq; 1596 spdk_bs_sequence_cpl cb_fn; 1597 void *cb_arg; 1598 TAILQ_ENTRY(spdk_blob_persist_ctx) link; 1599 }; 1600 1601 static void 1602 bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba, 1603 uint64_t lba_count) 1604 { 1605 switch (ctx->blob->clear_method) { 1606 case BLOB_CLEAR_WITH_DEFAULT: 1607 case BLOB_CLEAR_WITH_UNMAP: 1608 bs_batch_unmap_dev(batch, lba, lba_count); 1609 break; 1610 case BLOB_CLEAR_WITH_WRITE_ZEROES: 1611 bs_batch_write_zeroes_dev(batch, lba, lba_count); 
1612 break; 1613 case BLOB_CLEAR_WITH_NONE: 1614 default: 1615 break; 1616 } 1617 } 1618 1619 static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx); 1620 1621 static void 1622 blob_persist_complete_cb(void *arg) 1623 { 1624 struct spdk_blob_persist_ctx *ctx = arg; 1625 1626 /* Call user callback */ 1627 ctx->cb_fn(ctx->seq, ctx->cb_arg, 0); 1628 1629 /* Free the memory */ 1630 spdk_free(ctx->pages); 1631 free(ctx); 1632 } 1633 1634 static void 1635 blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno) 1636 { 1637 struct spdk_blob_persist_ctx *next_persist, *tmp; 1638 struct spdk_blob *blob = ctx->blob; 1639 1640 if (bserrno == 0) { 1641 blob_mark_clean(blob); 1642 } 1643 1644 assert(ctx == TAILQ_FIRST(&blob->persists_to_complete)); 1645 1646 /* Complete all persists that were pending when the current persist started */ 1647 TAILQ_FOREACH_SAFE(next_persist, &blob->persists_to_complete, link, tmp) { 1648 TAILQ_REMOVE(&blob->persists_to_complete, next_persist, link); 1649 spdk_thread_send_msg(spdk_get_thread(), blob_persist_complete_cb, next_persist); 1650 } 1651 1652 if (TAILQ_EMPTY(&blob->pending_persists)) { 1653 return; 1654 } 1655 1656 /* Queue up all pending persists for completion and start blob persist with first one */ 1657 TAILQ_SWAP(&blob->persists_to_complete, &blob->pending_persists, spdk_blob_persist_ctx, link); 1658 next_persist = TAILQ_FIRST(&blob->persists_to_complete); 1659 1660 blob->state = SPDK_BLOB_STATE_DIRTY; 1661 blob_persist_check_dirty(next_persist); 1662 } 1663 1664 static void 1665 blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1666 { 1667 struct spdk_blob_persist_ctx *ctx = cb_arg; 1668 struct spdk_blob *blob = ctx->blob; 1669 struct spdk_blob_store *bs = blob->bs; 1670 size_t i; 1671 1672 if (bserrno != 0) { 1673 blob_persist_complete(seq, ctx, bserrno); 1674 return; 1675 } 1676 1677 spdk_spin_lock(&bs->used_lock); 1678 1679 /* Release all extent_pages that were truncated */ 1680 for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1681 /* Nothing to release if it was not allocated */ 1682 if (blob->active.extent_pages[i] != 0) { 1683 bs_release_md_page(bs, blob->active.extent_pages[i]); 1684 } 1685 } 1686 1687 spdk_spin_unlock(&bs->used_lock); 1688 1689 if (blob->active.num_extent_pages == 0) { 1690 free(blob->active.extent_pages); 1691 blob->active.extent_pages = NULL; 1692 blob->active.extent_pages_array_size = 0; 1693 } else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) { 1694 #ifndef __clang_analyzer__ 1695 void *tmp; 1696 1697 /* scan-build really can't figure reallocs, workaround it */ 1698 tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); 1699 assert(tmp != NULL); 1700 blob->active.extent_pages = tmp; 1701 #endif 1702 blob->active.extent_pages_array_size = blob->active.num_extent_pages; 1703 } 1704 1705 blob_persist_complete(seq, ctx, bserrno); 1706 } 1707 1708 static void 1709 blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1710 { 1711 struct spdk_blob *blob = ctx->blob; 1712 struct spdk_blob_store *bs = blob->bs; 1713 size_t i; 1714 uint64_t lba; 1715 uint64_t lba_count; 1716 spdk_bs_batch_t *batch; 1717 1718 batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx); 1719 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1720 1721 /* Clear all extent_pages that were truncated */ 1722 for (i = 
blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1723 /* Nothing to clear if it was not allocated */ 1724 if (blob->active.extent_pages[i] != 0) { 1725 lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]); 1726 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1727 } 1728 } 1729 1730 bs_batch_close(batch); 1731 } 1732 1733 static void 1734 blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1735 { 1736 struct spdk_blob_persist_ctx *ctx = cb_arg; 1737 struct spdk_blob *blob = ctx->blob; 1738 struct spdk_blob_store *bs = blob->bs; 1739 size_t i; 1740 1741 if (bserrno != 0) { 1742 blob_persist_complete(seq, ctx, bserrno); 1743 return; 1744 } 1745 1746 spdk_spin_lock(&bs->used_lock); 1747 /* Release all clusters that were truncated */ 1748 for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1749 uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]); 1750 1751 /* Nothing to release if it was not allocated */ 1752 if (blob->active.clusters[i] != 0) { 1753 bs_release_cluster(bs, cluster_num); 1754 } 1755 } 1756 spdk_spin_unlock(&bs->used_lock); 1757 1758 if (blob->active.num_clusters == 0) { 1759 free(blob->active.clusters); 1760 blob->active.clusters = NULL; 1761 blob->active.cluster_array_size = 0; 1762 } else if (blob->active.num_clusters != blob->active.cluster_array_size) { 1763 #ifndef __clang_analyzer__ 1764 void *tmp; 1765 1766 /* scan-build really can't figure reallocs, workaround it */ 1767 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters); 1768 assert(tmp != NULL); 1769 blob->active.clusters = tmp; 1770 1771 #endif 1772 blob->active.cluster_array_size = blob->active.num_clusters; 1773 } 1774 1775 /* Move on to clearing extent pages */ 1776 blob_persist_clear_extents(seq, ctx); 1777 } 1778 1779 static void 1780 blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1781 { 1782 struct spdk_blob *blob = ctx->blob; 1783 struct spdk_blob_store *bs = blob->bs; 1784 spdk_bs_batch_t *batch; 1785 size_t i; 1786 uint64_t lba; 1787 uint64_t lba_count; 1788 1789 /* Clusters don't move around in blobs. The list shrinks or grows 1790 * at the end, but no changes ever occur in the middle of the list. 1791 */ 1792 1793 batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx); 1794 1795 /* Clear all clusters that were truncated */ 1796 lba = 0; 1797 lba_count = 0; 1798 for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1799 uint64_t next_lba = blob->active.clusters[i]; 1800 uint64_t next_lba_count = bs_cluster_to_lba(bs, 1); 1801 1802 if (next_lba > 0 && (lba + lba_count) == next_lba) { 1803 /* This cluster is contiguous with the previous one. */ 1804 lba_count += next_lba_count; 1805 continue; 1806 } else if (next_lba == 0) { 1807 continue; 1808 } 1809 1810 /* This cluster is not contiguous with the previous one. 
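		 * Flush the run accumulated so far (if any) and start a new run at this LBA.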
*/ 1811 1812 /* If a run of LBAs previously existing, clear them now */ 1813 if (lba_count > 0) { 1814 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1815 } 1816 1817 /* Start building the next batch */ 1818 lba = next_lba; 1819 if (next_lba > 0) { 1820 lba_count = next_lba_count; 1821 } else { 1822 lba_count = 0; 1823 } 1824 } 1825 1826 /* If we ended with a contiguous set of LBAs, clear them now */ 1827 if (lba_count > 0) { 1828 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1829 } 1830 1831 bs_batch_close(batch); 1832 } 1833 1834 static void 1835 blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1836 { 1837 struct spdk_blob_persist_ctx *ctx = cb_arg; 1838 struct spdk_blob *blob = ctx->blob; 1839 struct spdk_blob_store *bs = blob->bs; 1840 size_t i; 1841 1842 if (bserrno != 0) { 1843 blob_persist_complete(seq, ctx, bserrno); 1844 return; 1845 } 1846 1847 spdk_spin_lock(&bs->used_lock); 1848 1849 /* This loop starts at 1 because the first page is special and handled 1850 * below. The pages (except the first) are never written in place, 1851 * so any pages in the clean list must be zeroed. 1852 */ 1853 for (i = 1; i < blob->clean.num_pages; i++) { 1854 bs_release_md_page(bs, blob->clean.pages[i]); 1855 } 1856 1857 if (blob->active.num_pages == 0) { 1858 uint32_t page_num; 1859 1860 page_num = bs_blobid_to_page(blob->id); 1861 bs_release_md_page(bs, page_num); 1862 } 1863 1864 spdk_spin_unlock(&bs->used_lock); 1865 1866 /* Move on to clearing clusters */ 1867 blob_persist_clear_clusters(seq, ctx); 1868 } 1869 1870 static void 1871 blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1872 { 1873 struct spdk_blob_persist_ctx *ctx = cb_arg; 1874 struct spdk_blob *blob = ctx->blob; 1875 struct spdk_blob_store *bs = blob->bs; 1876 uint64_t lba; 1877 uint64_t lba_count; 1878 spdk_bs_batch_t *batch; 1879 size_t i; 1880 1881 if (bserrno != 0) { 1882 blob_persist_complete(seq, ctx, bserrno); 1883 return; 1884 } 1885 1886 batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx); 1887 1888 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1889 1890 /* This loop starts at 1 because the first page is special and handled 1891 * below. The pages (except the first) are never written in place, 1892 * so any pages in the clean list must be zeroed. 1893 */ 1894 for (i = 1; i < blob->clean.num_pages; i++) { 1895 lba = bs_md_page_to_lba(bs, blob->clean.pages[i]); 1896 1897 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1898 } 1899 1900 /* The first page will only be zeroed if this is a delete. 
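	 * (blob->active.num_pages == 0 only happens when the blob is being deleted;
	 * see the comment in blob_persist_start.)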
*/ 1901 if (blob->active.num_pages == 0) { 1902 uint32_t page_num; 1903 1904 /* The first page in the metadata goes where the blobid indicates */ 1905 page_num = bs_blobid_to_page(blob->id); 1906 lba = bs_md_page_to_lba(bs, page_num); 1907 1908 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1909 } 1910 1911 bs_batch_close(batch); 1912 } 1913 1914 static void 1915 blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1916 { 1917 struct spdk_blob_persist_ctx *ctx = cb_arg; 1918 struct spdk_blob *blob = ctx->blob; 1919 struct spdk_blob_store *bs = blob->bs; 1920 uint64_t lba; 1921 uint32_t lba_count; 1922 struct spdk_blob_md_page *page; 1923 1924 if (bserrno != 0) { 1925 blob_persist_complete(seq, ctx, bserrno); 1926 return; 1927 } 1928 1929 if (blob->active.num_pages == 0) { 1930 /* Move on to the next step */ 1931 blob_persist_zero_pages(seq, ctx, 0); 1932 return; 1933 } 1934 1935 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1936 1937 page = &ctx->pages[0]; 1938 /* The first page in the metadata goes where the blobid indicates */ 1939 lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id)); 1940 1941 bs_sequence_write_dev(seq, page, lba, lba_count, 1942 blob_persist_zero_pages, ctx); 1943 } 1944 1945 static void 1946 blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1947 { 1948 struct spdk_blob *blob = ctx->blob; 1949 struct spdk_blob_store *bs = blob->bs; 1950 uint64_t lba; 1951 uint32_t lba_count; 1952 struct spdk_blob_md_page *page; 1953 spdk_bs_batch_t *batch; 1954 size_t i; 1955 1956 /* Clusters don't move around in blobs. The list shrinks or grows 1957 * at the end, but no changes ever occur in the middle of the list. 1958 */ 1959 1960 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1961 1962 batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx); 1963 1964 /* This starts at 1. The root page is not written until 1965 * all of the others are finished 1966 */ 1967 for (i = 1; i < blob->active.num_pages; i++) { 1968 page = &ctx->pages[i]; 1969 assert(page->sequence_num == i); 1970 1971 lba = bs_md_page_to_lba(bs, blob->active.pages[i]); 1972 1973 bs_batch_write_dev(batch, page, lba, lba_count); 1974 } 1975 1976 bs_batch_close(batch); 1977 } 1978 1979 static int 1980 blob_resize(struct spdk_blob *blob, uint64_t sz) 1981 { 1982 uint64_t i; 1983 uint64_t *tmp; 1984 uint64_t cluster; 1985 uint32_t lfmd; /* lowest free md page */ 1986 uint64_t num_clusters; 1987 uint32_t *ep_tmp; 1988 uint64_t new_num_ep = 0, current_num_ep = 0; 1989 struct spdk_blob_store *bs; 1990 int rc; 1991 1992 bs = blob->bs; 1993 1994 blob_verify_md_op(blob); 1995 1996 if (blob->active.num_clusters == sz) { 1997 return 0; 1998 } 1999 2000 if (blob->active.num_clusters < blob->active.cluster_array_size) { 2001 /* If this blob was resized to be larger, then smaller, then 2002 * larger without syncing, then the cluster array already 2003 * contains spare assigned clusters we can use. 2004 */ 2005 num_clusters = spdk_min(blob->active.cluster_array_size, 2006 sz); 2007 } else { 2008 num_clusters = blob->active.num_clusters; 2009 } 2010 2011 if (blob->use_extent_table) { 2012 /* Round up since every cluster beyond current Extent Table size, 2013 * requires new extent page. 
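		 * For example, growing one cluster past a multiple of SPDK_EXTENTS_PER_EP
		 * requires allocating one additional extent page.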
*/ 2014 new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP); 2015 current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); 2016 } 2017 2018 assert(!spdk_spin_held(&bs->used_lock)); 2019 2020 /* Check first that we have enough clusters and md pages before we start claiming them. 2021 * bs->used_lock is held to ensure that clusters we think are free are still free when we go 2022 * to claim them later in this function. 2023 */ 2024 if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) { 2025 spdk_spin_lock(&bs->used_lock); 2026 if ((sz - num_clusters) > bs->num_free_clusters) { 2027 rc = -ENOSPC; 2028 goto out; 2029 } 2030 lfmd = 0; 2031 for (i = current_num_ep; i < new_num_ep ; i++) { 2032 lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); 2033 if (lfmd == UINT32_MAX) { 2034 /* No more free md pages. Cannot satisfy the request */ 2035 rc = -ENOSPC; 2036 goto out; 2037 } 2038 } 2039 } 2040 2041 if (sz > num_clusters) { 2042 /* Expand the cluster array if necessary. 2043 * We only shrink the array when persisting. 2044 */ 2045 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); 2046 if (sz > 0 && tmp == NULL) { 2047 rc = -ENOMEM; 2048 goto out; 2049 } 2050 memset(tmp + blob->active.cluster_array_size, 0, 2051 sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); 2052 blob->active.clusters = tmp; 2053 blob->active.cluster_array_size = sz; 2054 2055 /* Expand the extents table, only if enough clusters were added */ 2056 if (new_num_ep > current_num_ep && blob->use_extent_table) { 2057 ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); 2058 if (new_num_ep > 0 && ep_tmp == NULL) { 2059 rc = -ENOMEM; 2060 goto out; 2061 } 2062 memset(ep_tmp + blob->active.extent_pages_array_size, 0, 2063 sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); 2064 blob->active.extent_pages = ep_tmp; 2065 blob->active.extent_pages_array_size = new_num_ep; 2066 } 2067 } 2068 2069 blob->state = SPDK_BLOB_STATE_DIRTY; 2070 2071 if (spdk_blob_is_thin_provisioned(blob) == false) { 2072 cluster = 0; 2073 lfmd = 0; 2074 for (i = num_clusters; i < sz; i++) { 2075 bs_allocate_cluster(blob, i, &cluster, &lfmd, true); 2076 lfmd++; 2077 } 2078 } 2079 2080 blob->active.num_clusters = sz; 2081 blob->active.num_extent_pages = new_num_ep; 2082 2083 rc = 0; 2084 out: 2085 if (spdk_spin_held(&bs->used_lock)) { 2086 spdk_spin_unlock(&bs->used_lock); 2087 } 2088 2089 return rc; 2090 } 2091 2092 static void 2093 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) 2094 { 2095 spdk_bs_sequence_t *seq = ctx->seq; 2096 struct spdk_blob *blob = ctx->blob; 2097 struct spdk_blob_store *bs = blob->bs; 2098 uint64_t i; 2099 uint32_t page_num; 2100 void *tmp; 2101 int rc; 2102 2103 /* Generate the new metadata */ 2104 rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); 2105 if (rc < 0) { 2106 blob_persist_complete(seq, ctx, rc); 2107 return; 2108 } 2109 2110 assert(blob->active.num_pages >= 1); 2111 2112 /* Resize the cache of page indices */ 2113 tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 2114 if (!tmp) { 2115 blob_persist_complete(seq, ctx, -ENOMEM); 2116 return; 2117 } 2118 blob->active.pages = tmp; 2119 2120 /* Assign this metadata to pages. This requires two passes - one to verify that there are 2121 * enough pages and a second to actually claim them. 
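* The first pass only scans used_md_pages for clear bits and claims nothing, so a failure there needs no rollback.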
The used_lock is held across 2122 * both passes to ensure things don't change in the middle. 2123 */ 2124 spdk_spin_lock(&bs->used_lock); 2125 page_num = 0; 2126 /* Note that this loop starts at one. The first page location is fixed by the blobid. */ 2127 for (i = 1; i < blob->active.num_pages; i++) { 2128 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2129 if (page_num == UINT32_MAX) { 2130 spdk_spin_unlock(&bs->used_lock); 2131 blob_persist_complete(seq, ctx, -ENOMEM); 2132 return; 2133 } 2134 page_num++; 2135 } 2136 2137 page_num = 0; 2138 blob->active.pages[0] = bs_blobid_to_page(blob->id); 2139 for (i = 1; i < blob->active.num_pages; i++) { 2140 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2141 ctx->pages[i - 1].next = page_num; 2142 /* Now that previous metadata page is complete, calculate the crc for it. */ 2143 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2144 blob->active.pages[i] = page_num; 2145 bs_claim_md_page(bs, page_num); 2146 SPDK_DEBUGLOG(blob, "Claiming page %u for blob 0x%" PRIx64 "\n", page_num, 2147 blob->id); 2148 page_num++; 2149 } 2150 spdk_spin_unlock(&bs->used_lock); 2151 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2152 /* Start writing the metadata from last page to first */ 2153 blob->state = SPDK_BLOB_STATE_CLEAN; 2154 blob_persist_write_page_chain(seq, ctx); 2155 } 2156 2157 static void 2158 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2159 { 2160 struct spdk_blob_persist_ctx *ctx = cb_arg; 2161 struct spdk_blob *blob = ctx->blob; 2162 size_t i; 2163 uint32_t extent_page_id; 2164 uint32_t page_count = 0; 2165 int rc; 2166 2167 if (ctx->extent_page != NULL) { 2168 spdk_free(ctx->extent_page); 2169 ctx->extent_page = NULL; 2170 } 2171 2172 if (bserrno != 0) { 2173 blob_persist_complete(seq, ctx, bserrno); 2174 return; 2175 } 2176 2177 /* Only write out Extent Pages when blob was resized. */ 2178 for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) { 2179 extent_page_id = blob->active.extent_pages[i]; 2180 if (extent_page_id == 0) { 2181 /* No Extent Page to persist */ 2182 assert(spdk_blob_is_thin_provisioned(blob)); 2183 continue; 2184 } 2185 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); 2186 ctx->next_extent_page = i + 1; 2187 rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); 2188 if (rc < 0) { 2189 blob_persist_complete(seq, ctx, rc); 2190 return; 2191 } 2192 2193 blob->state = SPDK_BLOB_STATE_DIRTY; 2194 blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); 2195 2196 ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); 2197 2198 bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), 2199 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 2200 blob_persist_write_extent_pages, ctx); 2201 return; 2202 } 2203 2204 blob_persist_generate_new_md(ctx); 2205 } 2206 2207 static void 2208 blob_persist_start(struct spdk_blob_persist_ctx *ctx) 2209 { 2210 spdk_bs_sequence_t *seq = ctx->seq; 2211 struct spdk_blob *blob = ctx->blob; 2212 2213 if (blob->active.num_pages == 0) { 2214 /* This is the signal that the blob should be deleted. 2215 * Immediately jump to the clean up routine. 
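* blob_persist_zero_pages() zeroes the old metadata pages on disk; its completion then releases those pages and clears the blob's clusters.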
*/ 2216 assert(blob->clean.num_pages > 0); 2217 blob->state = SPDK_BLOB_STATE_CLEAN; 2218 blob_persist_zero_pages(seq, ctx, 0); 2219 return; 2220 2221 } 2222 2223 if (blob->clean.num_clusters < blob->active.num_clusters) { 2224 /* Blob was resized up */ 2225 assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages); 2226 ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1; 2227 } else if (blob->active.num_clusters < blob->active.cluster_array_size) { 2228 /* Blob was resized down */ 2229 assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages); 2230 ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1; 2231 } else { 2232 /* No change in size occurred */ 2233 blob_persist_generate_new_md(ctx); 2234 return; 2235 } 2236 2237 blob_persist_write_extent_pages(seq, ctx, 0); 2238 } 2239 2240 static void 2241 blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2242 { 2243 struct spdk_blob_persist_ctx *ctx = cb_arg; 2244 2245 spdk_free(ctx->super); 2246 2247 if (bserrno != 0) { 2248 blob_persist_complete(seq, ctx, bserrno); 2249 return; 2250 } 2251 2252 ctx->blob->bs->clean = 0; 2253 2254 blob_persist_start(ctx); 2255 } 2256 2257 static void bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 2258 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); 2259 2260 2261 static void 2262 blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2263 { 2264 struct spdk_blob_persist_ctx *ctx = cb_arg; 2265 2266 if (bserrno != 0) { 2267 spdk_free(ctx->super); 2268 blob_persist_complete(seq, ctx, bserrno); 2269 return; 2270 } 2271 2272 ctx->super->clean = 0; 2273 if (ctx->super->size == 0) { 2274 ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; 2275 } 2276 2277 bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); 2278 } 2279 2280 static void 2281 blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) 2282 { 2283 if (ctx->blob->bs->clean) { 2284 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 2285 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2286 if (!ctx->super) { 2287 blob_persist_complete(ctx->seq, ctx, -ENOMEM); 2288 return; 2289 } 2290 2291 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), 2292 bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), 2293 blob_persist_dirty, ctx); 2294 } else { 2295 blob_persist_start(ctx); 2296 } 2297 } 2298 2299 /* Write a blob to disk */ 2300 static void 2301 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 2302 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 2303 { 2304 struct spdk_blob_persist_ctx *ctx; 2305 2306 blob_verify_md_op(blob); 2307 2308 if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->persists_to_complete)) { 2309 cb_fn(seq, cb_arg, 0); 2310 return; 2311 } 2312 2313 ctx = calloc(1, sizeof(*ctx)); 2314 if (!ctx) { 2315 cb_fn(seq, cb_arg, -ENOMEM); 2316 return; 2317 } 2318 ctx->blob = blob; 2319 ctx->seq = seq; 2320 ctx->cb_fn = cb_fn; 2321 ctx->cb_arg = cb_arg; 2322 2323 /* Multiple blob persists can affect one another, via blob->state or 2324 * blob mutable data changes. To prevent it, queue up the persists. 
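* Only the request at the head of persists_to_complete is actually started; a persist issued while another is outstanding is parked on pending_persists instead.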
*/ 2325 if (!TAILQ_EMPTY(&blob->persists_to_complete)) { 2326 TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); 2327 return; 2328 } 2329 TAILQ_INSERT_HEAD(&blob->persists_to_complete, ctx, link); 2330 2331 blob_persist_check_dirty(ctx); 2332 } 2333 2334 struct spdk_blob_copy_cluster_ctx { 2335 struct spdk_blob *blob; 2336 uint8_t *buf; 2337 uint64_t page; 2338 uint64_t new_cluster; 2339 uint32_t new_extent_page; 2340 spdk_bs_sequence_t *seq; 2341 struct spdk_blob_md_page *new_cluster_page; 2342 }; 2343 2344 static void 2345 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) 2346 { 2347 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2348 struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; 2349 TAILQ_HEAD(, spdk_bs_request_set) requests; 2350 spdk_bs_user_op_t *op; 2351 2352 TAILQ_INIT(&requests); 2353 TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); 2354 2355 while (!TAILQ_EMPTY(&requests)) { 2356 op = TAILQ_FIRST(&requests); 2357 TAILQ_REMOVE(&requests, op, link); 2358 if (bserrno == 0) { 2359 bs_user_op_execute(op); 2360 } else { 2361 bs_user_op_abort(op, bserrno); 2362 } 2363 } 2364 2365 spdk_free(ctx->buf); 2366 free(ctx); 2367 } 2368 2369 static void 2370 blob_insert_cluster_cpl(void *cb_arg, int bserrno) 2371 { 2372 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2373 2374 if (bserrno) { 2375 if (bserrno == -EEXIST) { 2376 /* The metadata insert failed because another thread 2377 * allocated the cluster first. Free our cluster 2378 * but continue without error. */ 2379 bserrno = 0; 2380 } 2381 spdk_spin_lock(&ctx->blob->bs->used_lock); 2382 bs_release_cluster(ctx->blob->bs, ctx->new_cluster); 2383 if (ctx->new_extent_page != 0) { 2384 bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); 2385 } 2386 spdk_spin_unlock(&ctx->blob->bs->used_lock); 2387 } 2388 2389 bs_sequence_finish(ctx->seq, bserrno); 2390 } 2391 2392 static void 2393 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2394 { 2395 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2396 uint32_t cluster_number; 2397 2398 if (bserrno) { 2399 /* The write failed, so jump to the final completion handler */ 2400 bs_sequence_finish(seq, bserrno); 2401 return; 2402 } 2403 2404 cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); 2405 2406 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2407 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2408 } 2409 2410 static void 2411 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2412 { 2413 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2414 2415 if (bserrno != 0) { 2416 /* The read failed, so jump to the final completion handler */ 2417 bs_sequence_finish(seq, bserrno); 2418 return; 2419 } 2420 2421 /* Write whole cluster */ 2422 bs_sequence_write_dev(seq, ctx->buf, 2423 bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), 2424 bs_cluster_to_lba(ctx->blob->bs, 1), 2425 blob_write_copy_cpl, ctx); 2426 } 2427 2428 static bool 2429 blob_can_copy(struct spdk_blob *blob, uint32_t cluster_start_page, uint64_t *base_lba) 2430 { 2431 uint64_t lba = bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page); 2432 2433 return (blob->bs->dev->copy != NULL) && 2434 blob->back_bs_dev->translate_lba(blob->back_bs_dev, lba, base_lba); 2435 } 2436 2437 static void 2438 blob_copy(struct spdk_blob_copy_cluster_ctx *ctx, spdk_bs_user_op_t *op, uint64_t src_lba) 2439 { 2440 struct spdk_blob *blob = ctx->blob; 2441 uint64_t 
lba_count = bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz); 2442 2443 bs_sequence_copy_dev(ctx->seq, 2444 bs_cluster_to_lba(blob->bs, ctx->new_cluster), 2445 src_lba, 2446 lba_count, 2447 blob_write_copy_cpl, ctx); 2448 } 2449 2450 static void 2451 bs_allocate_and_copy_cluster(struct spdk_blob *blob, 2452 struct spdk_io_channel *_ch, 2453 uint64_t io_unit, spdk_bs_user_op_t *op) 2454 { 2455 struct spdk_bs_cpl cpl; 2456 struct spdk_bs_channel *ch; 2457 struct spdk_blob_copy_cluster_ctx *ctx; 2458 uint32_t cluster_start_page; 2459 uint32_t cluster_number; 2460 bool is_zeroes; 2461 bool can_copy; 2462 uint64_t copy_src_lba; 2463 int rc; 2464 2465 ch = spdk_io_channel_get_ctx(_ch); 2466 2467 if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { 2468 /* There are already operations pending. Queue this user op 2469 * and return because it will be re-executed when the outstanding 2470 * cluster allocation completes. */ 2471 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2472 return; 2473 } 2474 2475 /* Round the io_unit offset down to the first page in the cluster */ 2476 cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); 2477 2478 /* Calculate which index in the metadata cluster array the corresponding 2479 * cluster is supposed to be at. */ 2480 cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); 2481 2482 ctx = calloc(1, sizeof(*ctx)); 2483 if (!ctx) { 2484 bs_user_op_abort(op, -ENOMEM); 2485 return; 2486 } 2487 2488 assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); 2489 2490 ctx->blob = blob; 2491 ctx->page = cluster_start_page; 2492 ctx->new_cluster_page = ch->new_cluster_page; 2493 memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE); 2494 can_copy = blob_can_copy(blob, cluster_start_page, &copy_src_lba); 2495 2496 is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev, 2497 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2498 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz)); 2499 if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) { 2500 ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, 2501 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2502 if (!ctx->buf) { 2503 SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", 2504 blob->bs->cluster_sz); 2505 free(ctx); 2506 bs_user_op_abort(op, -ENOMEM); 2507 return; 2508 } 2509 } 2510 2511 spdk_spin_lock(&blob->bs->used_lock); 2512 rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, 2513 false); 2514 spdk_spin_unlock(&blob->bs->used_lock); 2515 if (rc != 0) { 2516 spdk_free(ctx->buf); 2517 free(ctx); 2518 bs_user_op_abort(op, rc); 2519 return; 2520 } 2521 2522 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2523 cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; 2524 cpl.u.blob_basic.cb_arg = ctx; 2525 2526 ctx->seq = bs_sequence_start(_ch, &cpl); 2527 if (!ctx->seq) { 2528 spdk_spin_lock(&blob->bs->used_lock); 2529 bs_release_cluster(blob->bs, ctx->new_cluster); 2530 spdk_spin_unlock(&blob->bs->used_lock); 2531 spdk_free(ctx->buf); 2532 free(ctx); 2533 bs_user_op_abort(op, -ENOMEM); 2534 return; 2535 } 2536 2537 /* Queue the user op to block other incoming operations */ 2538 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2539 2540 if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) { 2541 if (can_copy) { 2542 blob_copy(ctx, op, copy_src_lba); 2543 } else { 2544 /* Read cluster from backing device */ 2545 bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev,
ctx->buf, 2546 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2547 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), 2548 blob_write_copy, ctx); 2549 } 2550 2551 } else { 2552 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2553 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2554 } 2555 } 2556 2557 static inline bool 2558 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, 2559 uint64_t *lba, uint64_t *lba_count) 2560 { 2561 *lba_count = length; 2562 2563 if (!bs_io_unit_is_allocated(blob, io_unit)) { 2564 assert(blob->back_bs_dev != NULL); 2565 *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); 2566 *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); 2567 return false; 2568 } else { 2569 *lba = bs_blob_io_unit_to_lba(blob, io_unit); 2570 return true; 2571 } 2572 } 2573 2574 struct op_split_ctx { 2575 struct spdk_blob *blob; 2576 struct spdk_io_channel *channel; 2577 uint64_t io_unit_offset; 2578 uint64_t io_units_remaining; 2579 void *curr_payload; 2580 enum spdk_blob_op_type op_type; 2581 spdk_bs_sequence_t *seq; 2582 bool in_submit_ctx; 2583 bool completed_in_submit_ctx; 2584 bool done; 2585 }; 2586 2587 static void 2588 blob_request_submit_op_split_next(void *cb_arg, int bserrno) 2589 { 2590 struct op_split_ctx *ctx = cb_arg; 2591 struct spdk_blob *blob = ctx->blob; 2592 struct spdk_io_channel *ch = ctx->channel; 2593 enum spdk_blob_op_type op_type = ctx->op_type; 2594 uint8_t *buf; 2595 uint64_t offset; 2596 uint64_t length; 2597 uint64_t op_length; 2598 2599 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2600 bs_sequence_finish(ctx->seq, bserrno); 2601 if (ctx->in_submit_ctx) { 2602 /* Defer freeing of the ctx object, since it will be 2603 * accessed when this unwinds back to the submission 2604 * context. 2605 */ 2606 ctx->done = true; 2607 } else { 2608 free(ctx); 2609 } 2610 return; 2611 } 2612 2613 if (ctx->in_submit_ctx) { 2614 /* If this split operation completed in the context 2615 * of its submission, mark the flag and return immediately 2616 * to avoid recursion.
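* The submitting frame sees completed_in_submit_ctx and continues its while loop, turning would-be recursion into iteration.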
2617 */ 2618 ctx->completed_in_submit_ctx = true; 2619 return; 2620 } 2621 2622 while (true) { 2623 ctx->completed_in_submit_ctx = false; 2624 2625 offset = ctx->io_unit_offset; 2626 length = ctx->io_units_remaining; 2627 buf = ctx->curr_payload; 2628 op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, 2629 offset)); 2630 2631 /* Update length and payload for next operation */ 2632 ctx->io_units_remaining -= op_length; 2633 ctx->io_unit_offset += op_length; 2634 if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { 2635 ctx->curr_payload += op_length * blob->bs->io_unit_size; 2636 } 2637 2638 assert(!ctx->in_submit_ctx); 2639 ctx->in_submit_ctx = true; 2640 2641 switch (op_type) { 2642 case SPDK_BLOB_READ: 2643 spdk_blob_io_read(blob, ch, buf, offset, op_length, 2644 blob_request_submit_op_split_next, ctx); 2645 break; 2646 case SPDK_BLOB_WRITE: 2647 spdk_blob_io_write(blob, ch, buf, offset, op_length, 2648 blob_request_submit_op_split_next, ctx); 2649 break; 2650 case SPDK_BLOB_UNMAP: 2651 spdk_blob_io_unmap(blob, ch, offset, op_length, 2652 blob_request_submit_op_split_next, ctx); 2653 break; 2654 case SPDK_BLOB_WRITE_ZEROES: 2655 spdk_blob_io_write_zeroes(blob, ch, offset, op_length, 2656 blob_request_submit_op_split_next, ctx); 2657 break; 2658 case SPDK_BLOB_READV: 2659 case SPDK_BLOB_WRITEV: 2660 SPDK_ERRLOG("readv/write not valid\n"); 2661 bs_sequence_finish(ctx->seq, -EINVAL); 2662 free(ctx); 2663 return; 2664 } 2665 2666 #ifndef __clang_analyzer__ 2667 /* scan-build reports a false positive around accessing the ctx here. It 2668 * forms a path that recursively calls this function, but then says 2669 * "assuming ctx->in_submit_ctx is false", when that isn't possible. 2670 * This path does free(ctx), returns to here, and reports a use-after-free 2671 * bug. Wrapping this bit of code so that scan-build doesn't see it 2672 * works around the scan-build bug. 2673 */ 2674 assert(ctx->in_submit_ctx); 2675 ctx->in_submit_ctx = false; 2676 2677 /* If the operation completed immediately, loop back and submit the 2678 * next operation. Otherwise we can return and the next split 2679 * operation will get submitted when this current operation is 2680 * later completed asynchronously. 
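* ctx->done is also checked below, so a final completion that ran inside this submission still gets ctx freed exactly once.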
2681 */ 2682 if (ctx->completed_in_submit_ctx) { 2683 continue; 2684 } else if (ctx->done) { 2685 free(ctx); 2686 } 2687 #endif 2688 break; 2689 } 2690 } 2691 2692 static void 2693 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, 2694 void *payload, uint64_t offset, uint64_t length, 2695 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2696 { 2697 struct op_split_ctx *ctx; 2698 spdk_bs_sequence_t *seq; 2699 struct spdk_bs_cpl cpl; 2700 2701 assert(blob != NULL); 2702 2703 ctx = calloc(1, sizeof(struct op_split_ctx)); 2704 if (ctx == NULL) { 2705 cb_fn(cb_arg, -ENOMEM); 2706 return; 2707 } 2708 2709 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2710 cpl.u.blob_basic.cb_fn = cb_fn; 2711 cpl.u.blob_basic.cb_arg = cb_arg; 2712 2713 seq = bs_sequence_start(ch, &cpl); 2714 if (!seq) { 2715 free(ctx); 2716 cb_fn(cb_arg, -ENOMEM); 2717 return; 2718 } 2719 2720 ctx->blob = blob; 2721 ctx->channel = ch; 2722 ctx->curr_payload = payload; 2723 ctx->io_unit_offset = offset; 2724 ctx->io_units_remaining = length; 2725 ctx->op_type = op_type; 2726 ctx->seq = seq; 2727 2728 blob_request_submit_op_split_next(ctx, 0); 2729 } 2730 2731 static void 2732 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, 2733 void *payload, uint64_t offset, uint64_t length, 2734 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2735 { 2736 struct spdk_bs_cpl cpl; 2737 uint64_t lba; 2738 uint64_t lba_count; 2739 bool is_allocated; 2740 2741 assert(blob != NULL); 2742 2743 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2744 cpl.u.blob_basic.cb_fn = cb_fn; 2745 cpl.u.blob_basic.cb_arg = cb_arg; 2746 2747 if (blob->frozen_refcnt) { 2748 /* This blob I/O is frozen */ 2749 spdk_bs_user_op_t *op; 2750 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); 2751 2752 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2753 if (!op) { 2754 cb_fn(cb_arg, -ENOMEM); 2755 return; 2756 } 2757 2758 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2759 2760 return; 2761 } 2762 2763 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2764 2765 switch (op_type) { 2766 case SPDK_BLOB_READ: { 2767 spdk_bs_batch_t *batch; 2768 2769 batch = bs_batch_open(_ch, &cpl); 2770 if (!batch) { 2771 cb_fn(cb_arg, -ENOMEM); 2772 return; 2773 } 2774 2775 if (is_allocated) { 2776 /* Read from the blob */ 2777 bs_batch_read_dev(batch, payload, lba, lba_count); 2778 } else { 2779 /* Read from the backing block device */ 2780 bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); 2781 } 2782 2783 bs_batch_close(batch); 2784 break; 2785 } 2786 case SPDK_BLOB_WRITE: 2787 case SPDK_BLOB_WRITE_ZEROES: { 2788 if (is_allocated) { 2789 /* Write to the blob */ 2790 spdk_bs_batch_t *batch; 2791 2792 if (lba_count == 0) { 2793 cb_fn(cb_arg, 0); 2794 return; 2795 } 2796 2797 batch = bs_batch_open(_ch, &cpl); 2798 if (!batch) { 2799 cb_fn(cb_arg, -ENOMEM); 2800 return; 2801 } 2802 2803 if (op_type == SPDK_BLOB_WRITE) { 2804 bs_batch_write_dev(batch, payload, lba, lba_count); 2805 } else { 2806 bs_batch_write_zeroes_dev(batch, lba, lba_count); 2807 } 2808 2809 bs_batch_close(batch); 2810 } else { 2811 /* Queue this operation and allocate the cluster */ 2812 spdk_bs_user_op_t *op; 2813 2814 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2815 if (!op) { 2816 cb_fn(cb_arg, -ENOMEM); 2817 return; 2818 } 2819 2820 bs_allocate_and_copy_cluster(blob, _ch, 
offset, op); 2821 } 2822 break; 2823 } 2824 case SPDK_BLOB_UNMAP: { 2825 spdk_bs_batch_t *batch; 2826 2827 batch = bs_batch_open(_ch, &cpl); 2828 if (!batch) { 2829 cb_fn(cb_arg, -ENOMEM); 2830 return; 2831 } 2832 2833 if (is_allocated) { 2834 bs_batch_unmap_dev(batch, lba, lba_count); 2835 } 2836 2837 bs_batch_close(batch); 2838 break; 2839 } 2840 case SPDK_BLOB_READV: 2841 case SPDK_BLOB_WRITEV: 2842 SPDK_ERRLOG("readv/write not valid\n"); 2843 cb_fn(cb_arg, -EINVAL); 2844 break; 2845 } 2846 } 2847 2848 static void 2849 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2850 void *payload, uint64_t offset, uint64_t length, 2851 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2852 { 2853 assert(blob != NULL); 2854 2855 if (blob->data_ro && op_type != SPDK_BLOB_READ) { 2856 cb_fn(cb_arg, -EPERM); 2857 return; 2858 } 2859 2860 if (length == 0) { 2861 cb_fn(cb_arg, 0); 2862 return; 2863 } 2864 2865 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2866 cb_fn(cb_arg, -EINVAL); 2867 return; 2868 } 2869 if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { 2870 blob_request_submit_op_single(_channel, blob, payload, offset, length, 2871 cb_fn, cb_arg, op_type); 2872 } else { 2873 blob_request_submit_op_split(_channel, blob, payload, offset, length, 2874 cb_fn, cb_arg, op_type); 2875 } 2876 } 2877 2878 struct rw_iov_ctx { 2879 struct spdk_blob *blob; 2880 struct spdk_io_channel *channel; 2881 spdk_blob_op_complete cb_fn; 2882 void *cb_arg; 2883 bool read; 2884 int iovcnt; 2885 struct iovec *orig_iov; 2886 uint64_t io_unit_offset; 2887 uint64_t io_units_remaining; 2888 uint64_t io_units_done; 2889 struct spdk_blob_ext_io_opts *ext_io_opts; 2890 struct iovec iov[0]; 2891 }; 2892 2893 static void 2894 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2895 { 2896 assert(cb_arg == NULL); 2897 bs_sequence_finish(seq, bserrno); 2898 } 2899 2900 static void 2901 rw_iov_split_next(void *cb_arg, int bserrno) 2902 { 2903 struct rw_iov_ctx *ctx = cb_arg; 2904 struct spdk_blob *blob = ctx->blob; 2905 struct iovec *iov, *orig_iov; 2906 int iovcnt; 2907 size_t orig_iovoff; 2908 uint64_t io_units_count, io_units_to_boundary, io_unit_offset; 2909 uint64_t byte_count; 2910 2911 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2912 ctx->cb_fn(ctx->cb_arg, bserrno); 2913 free(ctx); 2914 return; 2915 } 2916 2917 io_unit_offset = ctx->io_unit_offset; 2918 io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); 2919 io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); 2920 /* 2921 * Get index and offset into the original iov array for our current position in the I/O sequence. 2922 * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will 2923 * point to the current position in the I/O sequence. 2924 */ 2925 byte_count = ctx->io_units_done * blob->bs->io_unit_size; 2926 orig_iov = &ctx->orig_iov[0]; 2927 orig_iovoff = 0; 2928 while (byte_count > 0) { 2929 if (byte_count >= orig_iov->iov_len) { 2930 byte_count -= orig_iov->iov_len; 2931 orig_iov++; 2932 } else { 2933 orig_iovoff = byte_count; 2934 byte_count = 0; 2935 } 2936 } 2937 2938 /* 2939 * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many 2940 * bytes of this next I/O remain to be accounted for in the new iov array. 
2941 */ 2942 byte_count = io_units_count * blob->bs->io_unit_size; 2943 iov = &ctx->iov[0]; 2944 iovcnt = 0; 2945 while (byte_count > 0) { 2946 assert(iovcnt < ctx->iovcnt); 2947 iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); 2948 iov->iov_base = orig_iov->iov_base + orig_iovoff; 2949 byte_count -= iov->iov_len; 2950 orig_iovoff = 0; 2951 orig_iov++; 2952 iov++; 2953 iovcnt++; 2954 } 2955 2956 ctx->io_unit_offset += io_units_count; 2957 ctx->io_units_remaining -= io_units_count; 2958 ctx->io_units_done += io_units_count; 2959 iov = &ctx->iov[0]; 2960 2961 if (ctx->read) { 2962 spdk_blob_io_readv_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2963 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2964 } else { 2965 spdk_blob_io_writev_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2966 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2967 } 2968 } 2969 2970 static void 2971 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2972 struct iovec *iov, int iovcnt, 2973 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg, bool read, 2974 struct spdk_blob_ext_io_opts *ext_io_opts) 2975 { 2976 struct spdk_bs_cpl cpl; 2977 2978 assert(blob != NULL); 2979 2980 if (!read && blob->data_ro) { 2981 cb_fn(cb_arg, -EPERM); 2982 return; 2983 } 2984 2985 if (length == 0) { 2986 cb_fn(cb_arg, 0); 2987 return; 2988 } 2989 2990 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2991 cb_fn(cb_arg, -EINVAL); 2992 return; 2993 } 2994 2995 /* 2996 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having 2997 * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, 2998 * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster 2999 * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need 3000 * to allocate a separate iov array and split the I/O such that none of the resulting 3001 * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) 3002 * but since this case happens very infrequently, any performance impact will be negligible. 3003 * 3004 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs 3005 * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them 3006 * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called 3007 * when the batch was completed, to allow for freeing the memory for the iov arrays. 3008 */ 3009 if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { 3010 uint64_t lba_count; 3011 uint64_t lba; 3012 bool is_allocated; 3013 3014 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 3015 cpl.u.blob_basic.cb_fn = cb_fn; 3016 cpl.u.blob_basic.cb_arg = cb_arg; 3017 3018 if (blob->frozen_refcnt) { 3019 /* This blob I/O is frozen */ 3020 enum spdk_blob_op_type op_type; 3021 spdk_bs_user_op_t *op; 3022 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); 3023 3024 op_type = read ? 
SPDK_BLOB_READV : SPDK_BLOB_WRITEV; 3025 op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); 3026 if (!op) { 3027 cb_fn(cb_arg, -ENOMEM); 3028 return; 3029 } 3030 3031 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 3032 3033 return; 3034 } 3035 3036 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 3037 3038 if (read) { 3039 spdk_bs_sequence_t *seq; 3040 3041 seq = bs_sequence_start(_channel, &cpl); 3042 if (!seq) { 3043 cb_fn(cb_arg, -ENOMEM); 3044 return; 3045 } 3046 3047 seq->ext_io_opts = ext_io_opts; 3048 3049 if (is_allocated) { 3050 bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 3051 } else { 3052 bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, 3053 rw_iov_done, NULL); 3054 } 3055 } else { 3056 if (is_allocated) { 3057 spdk_bs_sequence_t *seq; 3058 3059 seq = bs_sequence_start(_channel, &cpl); 3060 if (!seq) { 3061 cb_fn(cb_arg, -ENOMEM); 3062 return; 3063 } 3064 3065 seq->ext_io_opts = ext_io_opts; 3066 3067 bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 3068 } else { 3069 /* Queue this operation and allocate the cluster */ 3070 spdk_bs_user_op_t *op; 3071 3072 op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, 3073 length); 3074 if (!op) { 3075 cb_fn(cb_arg, -ENOMEM); 3076 return; 3077 } 3078 3079 op->ext_io_opts = ext_io_opts; 3080 3081 bs_allocate_and_copy_cluster(blob, _channel, offset, op); 3082 } 3083 } 3084 } else { 3085 struct rw_iov_ctx *ctx; 3086 3087 ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); 3088 if (ctx == NULL) { 3089 cb_fn(cb_arg, -ENOMEM); 3090 return; 3091 } 3092 3093 ctx->blob = blob; 3094 ctx->channel = _channel; 3095 ctx->cb_fn = cb_fn; 3096 ctx->cb_arg = cb_arg; 3097 ctx->read = read; 3098 ctx->orig_iov = iov; 3099 ctx->iovcnt = iovcnt; 3100 ctx->io_unit_offset = offset; 3101 ctx->io_units_remaining = length; 3102 ctx->io_units_done = 0; 3103 ctx->ext_io_opts = ext_io_opts; 3104 3105 rw_iov_split_next(ctx, 0); 3106 } 3107 } 3108 3109 static struct spdk_blob * 3110 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) 3111 { 3112 struct spdk_blob find; 3113 3114 if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { 3115 return NULL; 3116 } 3117 3118 find.id = blobid; 3119 return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find); 3120 } 3121 3122 static void 3123 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, 3124 struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) 3125 { 3126 assert(blob != NULL); 3127 *snapshot_entry = NULL; 3128 *clone_entry = NULL; 3129 3130 if (blob->parent_id == SPDK_BLOBID_INVALID) { 3131 return; 3132 } 3133 3134 TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { 3135 if ((*snapshot_entry)->id == blob->parent_id) { 3136 break; 3137 } 3138 } 3139 3140 if (*snapshot_entry != NULL) { 3141 TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { 3142 if ((*clone_entry)->id == blob->id) { 3143 break; 3144 } 3145 } 3146 3147 assert(*clone_entry != NULL); 3148 } 3149 } 3150 3151 static int 3152 bs_channel_create(void *io_device, void *ctx_buf) 3153 { 3154 struct spdk_blob_store *bs = io_device; 3155 struct spdk_bs_channel *channel = ctx_buf; 3156 struct spdk_bs_dev *dev; 3157 uint32_t max_ops = bs->max_channel_ops; 3158 uint32_t i; 3159 3160 dev = bs->dev; 3161 3162 channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); 3163 if 
(!channel->req_mem) { 3164 return -1; 3165 } 3166 3167 TAILQ_INIT(&channel->reqs); 3168 3169 for (i = 0; i < max_ops; i++) { 3170 TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); 3171 } 3172 3173 channel->bs = bs; 3174 channel->dev = dev; 3175 channel->dev_channel = dev->create_channel(dev); 3176 3177 if (!channel->dev_channel) { 3178 SPDK_ERRLOG("Failed to create device channel.\n"); 3179 free(channel->req_mem); 3180 return -1; 3181 } 3182 3183 channel->new_cluster_page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, 3184 SPDK_MALLOC_DMA); 3185 if (!channel->new_cluster_page) { 3186 SPDK_ERRLOG("Failed to allocate new cluster page\n"); 3187 free(channel->req_mem); 3188 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3189 return -1; 3190 } 3191 3192 TAILQ_INIT(&channel->need_cluster_alloc); 3193 TAILQ_INIT(&channel->queued_io); 3194 3195 return 0; 3196 } 3197 3198 static void 3199 bs_channel_destroy(void *io_device, void *ctx_buf) 3200 { 3201 struct spdk_bs_channel *channel = ctx_buf; 3202 spdk_bs_user_op_t *op; 3203 3204 while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { 3205 op = TAILQ_FIRST(&channel->need_cluster_alloc); 3206 TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); 3207 bs_user_op_abort(op, -EIO); 3208 } 3209 3210 while (!TAILQ_EMPTY(&channel->queued_io)) { 3211 op = TAILQ_FIRST(&channel->queued_io); 3212 TAILQ_REMOVE(&channel->queued_io, op, link); 3213 bs_user_op_abort(op, -EIO); 3214 } 3215 3216 free(channel->req_mem); 3217 spdk_free(channel->new_cluster_page); 3218 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3219 } 3220 3221 static void 3222 bs_dev_destroy(void *io_device) 3223 { 3224 struct spdk_blob_store *bs = io_device; 3225 struct spdk_blob *blob, *blob_tmp; 3226 3227 bs->dev->destroy(bs->dev); 3228 3229 RB_FOREACH_SAFE(blob, spdk_blob_tree, &bs->open_blobs, blob_tmp) { 3230 RB_REMOVE(spdk_blob_tree, &bs->open_blobs, blob); 3231 spdk_bit_array_clear(bs->open_blobids, blob->id); 3232 blob_free(blob); 3233 } 3234 3235 spdk_spin_destroy(&bs->used_lock); 3236 3237 spdk_bit_array_free(&bs->open_blobids); 3238 spdk_bit_array_free(&bs->used_blobids); 3239 spdk_bit_array_free(&bs->used_md_pages); 3240 spdk_bit_pool_free(&bs->used_clusters); 3241 /* 3242 * If this function is called for any reason except a successful unload, 3243 * the unload_cpl type will be NONE and this will be a nop. 
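* On a successful unload, the completion stashed in bs->unload_cpl fires here with bs->unload_err, after all channels and the device have been torn down.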
3244 */ 3245 bs_call_cpl(&bs->unload_cpl, bs->unload_err); 3246 3247 free(bs); 3248 } 3249 3250 static int 3251 bs_blob_list_add(struct spdk_blob *blob) 3252 { 3253 spdk_blob_id snapshot_id; 3254 struct spdk_blob_list *snapshot_entry = NULL; 3255 struct spdk_blob_list *clone_entry = NULL; 3256 3257 assert(blob != NULL); 3258 3259 snapshot_id = blob->parent_id; 3260 if (snapshot_id == SPDK_BLOBID_INVALID) { 3261 return 0; 3262 } 3263 3264 snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); 3265 if (snapshot_entry == NULL) { 3266 /* Snapshot not found */ 3267 snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); 3268 if (snapshot_entry == NULL) { 3269 return -ENOMEM; 3270 } 3271 snapshot_entry->id = snapshot_id; 3272 TAILQ_INIT(&snapshot_entry->clones); 3273 TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); 3274 } else { 3275 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 3276 if (clone_entry->id == blob->id) { 3277 break; 3278 } 3279 } 3280 } 3281 3282 if (clone_entry == NULL) { 3283 /* Clone not found */ 3284 clone_entry = calloc(1, sizeof(struct spdk_blob_list)); 3285 if (clone_entry == NULL) { 3286 return -ENOMEM; 3287 } 3288 clone_entry->id = blob->id; 3289 TAILQ_INIT(&clone_entry->clones); 3290 TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); 3291 snapshot_entry->clone_count++; 3292 } 3293 3294 return 0; 3295 } 3296 3297 static void 3298 bs_blob_list_remove(struct spdk_blob *blob) 3299 { 3300 struct spdk_blob_list *snapshot_entry = NULL; 3301 struct spdk_blob_list *clone_entry = NULL; 3302 3303 blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); 3304 3305 if (snapshot_entry == NULL) { 3306 return; 3307 } 3308 3309 blob->parent_id = SPDK_BLOBID_INVALID; 3310 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3311 free(clone_entry); 3312 3313 snapshot_entry->clone_count--; 3314 } 3315 3316 static int 3317 bs_blob_list_free(struct spdk_blob_store *bs) 3318 { 3319 struct spdk_blob_list *snapshot_entry; 3320 struct spdk_blob_list *snapshot_entry_tmp; 3321 struct spdk_blob_list *clone_entry; 3322 struct spdk_blob_list *clone_entry_tmp; 3323 3324 TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { 3325 TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { 3326 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3327 free(clone_entry); 3328 } 3329 TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); 3330 free(snapshot_entry); 3331 } 3332 3333 return 0; 3334 } 3335 3336 static void 3337 bs_free(struct spdk_blob_store *bs) 3338 { 3339 bs_blob_list_free(bs); 3340 3341 bs_unregister_md_thread(bs); 3342 spdk_io_device_unregister(bs, bs_dev_destroy); 3343 } 3344 3345 void 3346 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size) 3347 { 3348 3349 if (!opts) { 3350 SPDK_ERRLOG("opts should not be NULL\n"); 3351 return; 3352 } 3353 3354 if (!opts_size) { 3355 SPDK_ERRLOG("opts_size should not be zero value\n"); 3356 return; 3357 } 3358 3359 memset(opts, 0, opts_size); 3360 opts->opts_size = opts_size; 3361 3362 #define FIELD_OK(field) \ 3363 offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size 3364 3365 #define SET_FIELD(field, value) \ 3366 if (FIELD_OK(field)) { \ 3367 opts->field = value; \ 3368 } \ 3369 3370 SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ); 3371 SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3372 SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3373 SET_FIELD(max_channel_ops, 
SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS); 3374 SET_FIELD(clear_method, BS_CLEAR_WITH_UNMAP); 3375 3376 if (FIELD_OK(bstype)) { 3377 memset(&opts->bstype, 0, sizeof(opts->bstype)); 3378 } 3379 3380 SET_FIELD(iter_cb_fn, NULL); 3381 SET_FIELD(iter_cb_arg, NULL); 3382 SET_FIELD(force_recover, false); 3383 3384 #undef FIELD_OK 3385 #undef SET_FIELD 3386 } 3387 3388 static int 3389 bs_opts_verify(struct spdk_bs_opts *opts) 3390 { 3391 if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || 3392 opts->max_channel_ops == 0) { 3393 SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); 3394 return -1; 3395 } 3396 3397 return 0; 3398 } 3399 3400 /* START spdk_bs_load */ 3401 3402 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */ 3403 3404 struct spdk_bs_load_ctx { 3405 struct spdk_blob_store *bs; 3406 struct spdk_bs_super_block *super; 3407 3408 struct spdk_bs_md_mask *mask; 3409 bool in_page_chain; 3410 uint32_t page_index; 3411 uint32_t cur_page; 3412 struct spdk_blob_md_page *page; 3413 3414 uint64_t num_extent_pages; 3415 uint32_t *extent_page_num; 3416 struct spdk_blob_md_page *extent_pages; 3417 struct spdk_bit_array *used_clusters; 3418 3419 spdk_bs_sequence_t *seq; 3420 spdk_blob_op_with_handle_complete iter_cb_fn; 3421 void *iter_cb_arg; 3422 struct spdk_blob *blob; 3423 spdk_blob_id blobid; 3424 3425 bool force_recover; 3426 3427 /* These fields are used in the spdk_bs_dump path. */ 3428 bool dumping; 3429 FILE *fp; 3430 spdk_bs_dump_print_xattr print_xattr_fn; 3431 char xattr_name[4096]; 3432 }; 3433 3434 static int 3435 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs, 3436 struct spdk_bs_load_ctx **_ctx) 3437 { 3438 struct spdk_blob_store *bs; 3439 struct spdk_bs_load_ctx *ctx; 3440 uint64_t dev_size; 3441 int rc; 3442 3443 dev_size = dev->blocklen * dev->blockcnt; 3444 if (dev_size < opts->cluster_sz) { 3445 /* Device size cannot be smaller than cluster size of blobstore */ 3446 SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", 3447 dev_size, opts->cluster_sz); 3448 return -ENOSPC; 3449 } 3450 if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { 3451 /* Cluster size cannot be smaller than page size */ 3452 SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", 3453 opts->cluster_sz, SPDK_BS_PAGE_SIZE); 3454 return -EINVAL; 3455 } 3456 bs = calloc(1, sizeof(struct spdk_blob_store)); 3457 if (!bs) { 3458 return -ENOMEM; 3459 } 3460 3461 ctx = calloc(1, sizeof(struct spdk_bs_load_ctx)); 3462 if (!ctx) { 3463 free(bs); 3464 return -ENOMEM; 3465 } 3466 3467 ctx->bs = bs; 3468 ctx->iter_cb_fn = opts->iter_cb_fn; 3469 ctx->iter_cb_arg = opts->iter_cb_arg; 3470 ctx->force_recover = opts->force_recover; 3471 3472 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 3473 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3474 if (!ctx->super) { 3475 free(ctx); 3476 free(bs); 3477 return -ENOMEM; 3478 } 3479 3480 RB_INIT(&bs->open_blobs); 3481 TAILQ_INIT(&bs->snapshots); 3482 bs->dev = dev; 3483 bs->md_thread = spdk_get_thread(); 3484 assert(bs->md_thread != NULL); 3485 3486 /* 3487 * Do not use bs_lba_to_cluster() here since blockcnt may not be an 3488 * even multiple of the cluster size. 
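* The integer division below deliberately drops any partial cluster at the end of the device.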
3489 */ 3490 bs->cluster_sz = opts->cluster_sz; 3491 bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); 3492 ctx->used_clusters = spdk_bit_array_create(bs->total_clusters); 3493 if (!ctx->used_clusters) { 3494 spdk_free(ctx->super); 3495 free(ctx); 3496 free(bs); 3497 return -ENOMEM; 3498 } 3499 3500 bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; 3501 if (spdk_u32_is_pow2(bs->pages_per_cluster)) { 3502 bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); 3503 } 3504 bs->num_free_clusters = bs->total_clusters; 3505 bs->io_unit_size = dev->blocklen; 3506 3507 bs->max_channel_ops = opts->max_channel_ops; 3508 bs->super_blob = SPDK_BLOBID_INVALID; 3509 memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); 3510 3511 /* The metadata is assumed to be at least 1 page */ 3512 bs->used_md_pages = spdk_bit_array_create(1); 3513 bs->used_blobids = spdk_bit_array_create(0); 3514 bs->open_blobids = spdk_bit_array_create(0); 3515 3516 spdk_spin_init(&bs->used_lock); 3517 3518 spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, 3519 sizeof(struct spdk_bs_channel), "blobstore"); 3520 rc = bs_register_md_thread(bs); 3521 if (rc == -1) { 3522 spdk_io_device_unregister(bs, NULL); 3523 spdk_spin_destroy(&bs->used_lock); 3524 spdk_bit_array_free(&bs->open_blobids); 3525 spdk_bit_array_free(&bs->used_blobids); 3526 spdk_bit_array_free(&bs->used_md_pages); 3527 spdk_bit_array_free(&ctx->used_clusters); 3528 spdk_free(ctx->super); 3529 free(ctx); 3530 free(bs); 3531 /* FIXME: this is a lie but don't know how to get a proper error code here */ 3532 return -ENOMEM; 3533 } 3534 3535 *_ctx = ctx; 3536 *_bs = bs; 3537 return 0; 3538 } 3539 3540 static void 3541 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) 3542 { 3543 assert(bserrno != 0); 3544 3545 spdk_free(ctx->super); 3546 bs_sequence_finish(ctx->seq, bserrno); 3547 bs_free(ctx->bs); 3548 spdk_bit_array_free(&ctx->used_clusters); 3549 free(ctx); 3550 } 3551 3552 static void 3553 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 3554 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) 3555 { 3556 /* Update the values in the super block */ 3557 super->super_blob = bs->super_blob; 3558 memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); 3559 super->crc = blob_md_page_calc_crc(super); 3560 bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), 3561 bs_byte_to_lba(bs, sizeof(*super)), 3562 cb_fn, cb_arg); 3563 } 3564 3565 static void 3566 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3567 { 3568 struct spdk_bs_load_ctx *ctx = arg; 3569 uint64_t mask_size, lba, lba_count; 3570 3571 /* Write out the used clusters mask */ 3572 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3573 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3574 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3575 if (!ctx->mask) { 3576 bs_load_ctx_fail(ctx, -ENOMEM); 3577 return; 3578 } 3579 3580 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; 3581 ctx->mask->length = ctx->bs->total_clusters; 3582 /* We could get here through the normal unload path, or through dirty 3583 * shutdown recovery. For the normal unload path, we use the mask from 3584 * the bit pool. For dirty shutdown recovery, we don't have a bit pool yet - 3585 * only the bit array from the load ctx. 
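* Either source is serialized the same way: one bit per cluster, stored into ctx->mask->mask.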
3586 */ 3587 if (ctx->bs->used_clusters) { 3588 assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters)); 3589 spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask); 3590 } else { 3591 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters)); 3592 spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask); 3593 } 3594 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3595 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3596 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3597 } 3598 3599 static void 3600 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3601 { 3602 struct spdk_bs_load_ctx *ctx = arg; 3603 uint64_t mask_size, lba, lba_count; 3604 3605 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3606 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3607 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3608 if (!ctx->mask) { 3609 bs_load_ctx_fail(ctx, -ENOMEM); 3610 return; 3611 } 3612 3613 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; 3614 ctx->mask->length = ctx->super->md_len; 3615 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); 3616 3617 spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3618 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3619 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3620 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3621 } 3622 3623 static void 3624 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3625 { 3626 struct spdk_bs_load_ctx *ctx = arg; 3627 uint64_t mask_size, lba, lba_count; 3628 3629 if (ctx->super->used_blobid_mask_len == 0) { 3630 /* 3631 * This is a pre-v3 on-disk format where the blobid mask does not get 3632 * written to disk. 
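* There is nothing to persist in that case, so this step completes immediately via cb_fn().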
3633 */ 3634 cb_fn(seq, arg, 0); 3635 return; 3636 } 3637 3638 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3639 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3640 SPDK_MALLOC_DMA); 3641 if (!ctx->mask) { 3642 bs_load_ctx_fail(ctx, -ENOMEM); 3643 return; 3644 } 3645 3646 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS; 3647 ctx->mask->length = ctx->super->md_len; 3648 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); 3649 3650 spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask); 3651 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3652 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3653 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3654 } 3655 3656 static void 3657 blob_set_thin_provision(struct spdk_blob *blob) 3658 { 3659 blob_verify_md_op(blob); 3660 blob->invalid_flags |= SPDK_BLOB_THIN_PROV; 3661 blob->state = SPDK_BLOB_STATE_DIRTY; 3662 } 3663 3664 static void 3665 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method) 3666 { 3667 blob_verify_md_op(blob); 3668 blob->clear_method = clear_method; 3669 blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT); 3670 blob->state = SPDK_BLOB_STATE_DIRTY; 3671 } 3672 3673 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno); 3674 3675 static void 3676 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno) 3677 { 3678 struct spdk_bs_load_ctx *ctx = cb_arg; 3679 spdk_blob_id id; 3680 int64_t page_num; 3681 3682 /* Iterate to next blob (we can't use spdk_bs_iter_next function as our 3683 * last blob has been removed */ 3684 page_num = bs_blobid_to_page(ctx->blobid); 3685 page_num++; 3686 page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num); 3687 if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) { 3688 bs_load_iter(ctx, NULL, -ENOENT); 3689 return; 3690 } 3691 3692 id = bs_page_to_blobid(page_num); 3693 3694 spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx); 3695 } 3696 3697 static void 3698 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno) 3699 { 3700 struct spdk_bs_load_ctx *ctx = cb_arg; 3701 3702 if (bserrno != 0) { 3703 SPDK_ERRLOG("Failed to close corrupted blob\n"); 3704 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3705 return; 3706 } 3707 3708 spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx); 3709 } 3710 3711 static void 3712 bs_delete_corrupted_blob(void *cb_arg, int bserrno) 3713 { 3714 struct spdk_bs_load_ctx *ctx = cb_arg; 3715 uint64_t i; 3716 3717 if (bserrno != 0) { 3718 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3719 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3720 return; 3721 } 3722 3723 /* Snapshot and clone have the same copy of cluster map and extent pages 3724 * at this point. Let's clear both for snapshot now, 3725 * so that it won't be cleared for clone later when we remove snapshot. 
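* The snapshot itself is closed and then deleted right below via bs_delete_corrupted_close_cb().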
3726 * Also set thin provision to pass data corruption check */ 3727 for (i = 0; i < ctx->blob->active.num_clusters; i++) { 3728 ctx->blob->active.clusters[i] = 0; 3729 } 3730 for (i = 0; i < ctx->blob->active.num_extent_pages; i++) { 3731 ctx->blob->active.extent_pages[i] = 0; 3732 } 3733 3734 ctx->blob->md_ro = false; 3735 3736 blob_set_thin_provision(ctx->blob); 3737 3738 ctx->blobid = ctx->blob->id; 3739 3740 spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx); 3741 } 3742 3743 static void 3744 bs_update_corrupted_blob(void *cb_arg, int bserrno) 3745 { 3746 struct spdk_bs_load_ctx *ctx = cb_arg; 3747 3748 if (bserrno != 0) { 3749 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3750 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3751 return; 3752 } 3753 3754 ctx->blob->md_ro = false; 3755 blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true); 3756 blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true); 3757 spdk_blob_set_read_only(ctx->blob); 3758 3759 if (ctx->iter_cb_fn) { 3760 ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0); 3761 } 3762 bs_blob_list_add(ctx->blob); 3763 3764 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3765 } 3766 3767 static void 3768 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno) 3769 { 3770 struct spdk_bs_load_ctx *ctx = cb_arg; 3771 3772 if (bserrno != 0) { 3773 SPDK_ERRLOG("Failed to open clone of a corrupted blob\n"); 3774 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3775 return; 3776 } 3777 3778 if (blob->parent_id == ctx->blob->id) { 3779 /* Power failure occurred before updating clone (snapshot delete case) 3780 * or after updating clone (creating snapshot case) - keep snapshot */ 3781 spdk_blob_close(blob, bs_update_corrupted_blob, ctx); 3782 } else { 3783 /* Power failure occurred after updating clone (snapshot delete case) 3784 * or before updating clone (creating snapshot case) - remove snapshot */ 3785 spdk_blob_close(blob, bs_delete_corrupted_blob, ctx); 3786 } 3787 } 3788 3789 static void 3790 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) 3791 { 3792 struct spdk_bs_load_ctx *ctx = arg; 3793 const void *value; 3794 size_t len; 3795 int rc = 0; 3796 3797 if (bserrno == 0) { 3798 /* Examine blob if it is corrupted after power failure. Fix 3799 * the ones that can be fixed and remove any other corrupted 3800 * ones. If it is not corrupted just process it */ 3801 rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true); 3802 if (rc != 0) { 3803 rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true); 3804 if (rc != 0) { 3805 /* Not corrupted - process it and continue with iterating through blobs */ 3806 if (ctx->iter_cb_fn) { 3807 ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0); 3808 } 3809 bs_blob_list_add(blob); 3810 spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx); 3811 return; 3812 } 3813 3814 } 3815 3816 assert(len == sizeof(spdk_blob_id)); 3817 3818 ctx->blob = blob; 3819 3820 /* Open clone to check if we are able to fix this blob or should we remove it */ 3821 spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx); 3822 return; 3823 } else if (bserrno == -ENOENT) { 3824 bserrno = 0; 3825 } else { 3826 /* 3827 * This case needs to be looked at further. Same problem 3828 * exists with applications that rely on explicit blob 3829 * iteration. We should just skip the blob that failed 3830 * to load and continue on to the next one. 
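* For now the failure is not skipped: iteration stops and the load finishes with this bserrno below.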
3831 */ 3832 SPDK_ERRLOG("Error in iterating blobs\n"); 3833 } 3834 3835 ctx->iter_cb_fn = NULL; 3836 3837 spdk_free(ctx->super); 3838 spdk_free(ctx->mask); 3839 bs_sequence_finish(ctx->seq, bserrno); 3840 free(ctx); 3841 } 3842 3843 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); 3844 3845 static void 3846 bs_load_complete(struct spdk_bs_load_ctx *ctx) 3847 { 3848 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 3849 if (ctx->dumping) { 3850 bs_dump_read_md_page(ctx->seq, ctx); 3851 return; 3852 } 3853 spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); 3854 } 3855 3856 static void 3857 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3858 { 3859 struct spdk_bs_load_ctx *ctx = cb_arg; 3860 int rc; 3861 3862 /* The type must be correct */ 3863 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); 3864 3865 /* The length of the mask (in bits) must not be greater than 3866 * the length of the buffer (converted to bits) */ 3867 assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); 3868 3869 /* The length of the mask must be exactly equal to the size 3870 * (in pages) of the metadata region */ 3871 assert(ctx->mask->length == ctx->super->md_len); 3872 3873 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length); 3874 if (rc < 0) { 3875 spdk_free(ctx->mask); 3876 bs_load_ctx_fail(ctx, rc); 3877 return; 3878 } 3879 3880 spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask); 3881 bs_load_complete(ctx); 3882 } 3883 3884 static void 3885 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3886 { 3887 struct spdk_bs_load_ctx *ctx = cb_arg; 3888 uint64_t lba, lba_count, mask_size; 3889 int rc; 3890 3891 if (bserrno != 0) { 3892 bs_load_ctx_fail(ctx, bserrno); 3893 return; 3894 } 3895 3896 /* The type must be correct */ 3897 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); 3898 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3899 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( 3900 struct spdk_blob_md_page) * 8)); 3901 /* 3902 * The length of the mask must be equal to or larger than the total number of clusters. It may be 3903 * larger than the total number of clusters due to a failure spdk_bs_grow. 
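* Any extra trailing bits are harmless; mask->length is clamped back to total_clusters just below.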
3904 */ 3905 assert(ctx->mask->length >= ctx->bs->total_clusters); 3906 if (ctx->mask->length > ctx->bs->total_clusters) { 3907 SPDK_WARNLOG("Shrinking the used_clusters mask length to total_clusters\n"); 3908 ctx->mask->length = ctx->bs->total_clusters; 3909 } 3910 3911 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length); 3912 if (rc < 0) { 3913 spdk_free(ctx->mask); 3914 bs_load_ctx_fail(ctx, rc); 3915 return; 3916 } 3917 3918 spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask); 3919 ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters); 3920 assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); 3921 3922 spdk_free(ctx->mask); 3923 3924 /* Read the used blobids mask */ 3925 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3926 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3927 SPDK_MALLOC_DMA); 3928 if (!ctx->mask) { 3929 bs_load_ctx_fail(ctx, -ENOMEM); 3930 return; 3931 } 3932 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3933 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3934 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3935 bs_load_used_blobids_cpl, ctx); 3936 } 3937 3938 static void 3939 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3940 { 3941 struct spdk_bs_load_ctx *ctx = cb_arg; 3942 uint64_t lba, lba_count, mask_size; 3943 int rc; 3944 3945 if (bserrno != 0) { 3946 bs_load_ctx_fail(ctx, bserrno); 3947 return; 3948 } 3949 3950 /* The type must be correct */ 3951 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); 3952 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3953 assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * 3954 8)); 3955 /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ 3956 if (ctx->mask->length != ctx->super->md_len) { 3957 SPDK_ERRLOG("mismatched md_len in used_pages mask: " 3958 "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n", 3959 ctx->mask->length, ctx->super->md_len); 3960 assert(false); 3961 } 3962 3963 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length); 3964 if (rc < 0) { 3965 spdk_free(ctx->mask); 3966 bs_load_ctx_fail(ctx, rc); 3967 return; 3968 } 3969 3970 spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3971 spdk_free(ctx->mask); 3972 3973 /* Read the used clusters mask */ 3974 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3975 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3976 SPDK_MALLOC_DMA); 3977 if (!ctx->mask) { 3978 bs_load_ctx_fail(ctx, -ENOMEM); 3979 return; 3980 } 3981 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3982 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3983 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3984 bs_load_used_clusters_cpl, ctx); 3985 } 3986 3987 static void 3988 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) 3989 { 3990 uint64_t lba, lba_count, mask_size; 3991 3992 /* Read the used pages mask */ 3993 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3994 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3995 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3996 if (!ctx->mask) { 3997 bs_load_ctx_fail(ctx, -ENOMEM); 3998 return; 3999 } 4000 4001 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 4002 lba_count = 
bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 4003 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, 4004 bs_load_used_pages_cpl, ctx); 4005 } 4006 4007 static int 4008 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) 4009 { 4010 struct spdk_blob_store *bs = ctx->bs; 4011 struct spdk_blob_md_descriptor *desc; 4012 size_t cur_desc = 0; 4013 4014 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4015 while (cur_desc < sizeof(page->descriptors)) { 4016 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4017 if (desc->length == 0) { 4018 /* If padding and length are 0, this terminates the page */ 4019 break; 4020 } 4021 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4022 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4023 unsigned int i, j; 4024 unsigned int cluster_count = 0; 4025 uint32_t cluster_idx; 4026 4027 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4028 4029 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4030 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 4031 cluster_idx = desc_extent_rle->extents[i].cluster_idx; 4032 /* 4033 * cluster_idx = 0 means an unallocated cluster - don't mark that 4034 * in the used cluster map. 4035 */ 4036 if (cluster_idx != 0) { 4037 SPDK_NOTICELOG("Recover: cluster %" PRIu32 "\n", cluster_idx + j); 4038 spdk_bit_array_set(ctx->used_clusters, cluster_idx + j); 4039 if (bs->num_free_clusters == 0) { 4040 return -ENOSPC; 4041 } 4042 bs->num_free_clusters--; 4043 } 4044 cluster_count++; 4045 } 4046 } 4047 if (cluster_count == 0) { 4048 return -EINVAL; 4049 } 4050 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4051 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4052 uint32_t i; 4053 uint32_t cluster_count = 0; 4054 uint32_t cluster_idx; 4055 size_t cluster_idx_length; 4056 4057 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4058 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 4059 4060 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 4061 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 4062 return -EINVAL; 4063 } 4064 4065 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 4066 cluster_idx = desc_extent->cluster_idx[i]; 4067 /* 4068 * cluster_idx = 0 means an unallocated cluster - don't mark that 4069 * in the used cluster map. 
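 * Unlike the EXTENT_RLE descriptor above, entries here are not run-length encoded: each cluster_idx[i] names exactly one cluster, and every non-zero entry is marked in ctx->used_clusters and charged against bs->num_free_clusters.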
4070 */ 4071 if (cluster_idx != 0) { 4072 if (cluster_idx < desc_extent->start_cluster_idx && 4073 cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { 4074 return -EINVAL; 4075 } 4076 spdk_bit_array_set(ctx->used_clusters, cluster_idx); 4077 if (bs->num_free_clusters == 0) { 4078 return -ENOSPC; 4079 } 4080 bs->num_free_clusters--; 4081 } 4082 cluster_count++; 4083 } 4084 4085 if (cluster_count == 0) { 4086 return -EINVAL; 4087 } 4088 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4089 /* Skip this item */ 4090 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4091 /* Skip this item */ 4092 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4093 /* Skip this item */ 4094 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4095 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 4096 uint32_t num_extent_pages = ctx->num_extent_pages; 4097 uint32_t i; 4098 size_t extent_pages_length; 4099 void *tmp; 4100 4101 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 4102 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 4103 4104 if (desc_extent_table->length == 0 || 4105 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 4106 return -EINVAL; 4107 } 4108 4109 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4110 if (desc_extent_table->extent_page[i].page_idx != 0) { 4111 if (desc_extent_table->extent_page[i].num_pages != 1) { 4112 return -EINVAL; 4113 } 4114 num_extent_pages += 1; 4115 } 4116 } 4117 4118 if (num_extent_pages > 0) { 4119 tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); 4120 if (tmp == NULL) { 4121 return -ENOMEM; 4122 } 4123 ctx->extent_page_num = tmp; 4124 4125 /* Extent table entries contain md page numbers for extent pages. 4126 * Zeroes represent unallocated extent pages, those are run-length-encoded. 4127 */ 4128 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4129 if (desc_extent_table->extent_page[i].page_idx != 0) { 4130 ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; 4131 ctx->num_extent_pages += 1; 4132 } 4133 } 4134 } 4135 } else { 4136 /* Error */ 4137 return -EINVAL; 4138 } 4139 /* Advance to the next descriptor */ 4140 cur_desc += sizeof(*desc) + desc->length; 4141 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4142 break; 4143 } 4144 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4145 } 4146 return 0; 4147 } 4148 4149 static bool 4150 bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) 4151 { 4152 uint32_t crc; 4153 struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4154 size_t desc_len; 4155 4156 crc = blob_md_page_calc_crc(page); 4157 if (crc != page->crc) { 4158 return false; 4159 } 4160 4161 /* Extent page should always be of sequence num 0. */ 4162 if (page->sequence_num != 0) { 4163 return false; 4164 } 4165 4166 /* Descriptor type must be EXTENT_PAGE. */ 4167 if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4168 return false; 4169 } 4170 4171 /* Descriptor length cannot exceed the page. */ 4172 desc_len = sizeof(*desc) + desc->length; 4173 if (desc_len > sizeof(page->descriptors)) { 4174 return false; 4175 } 4176 4177 /* It has to be the only descriptor in the page. 
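 * i.e. if there is still room for another descriptor header after this one, that space must hold a zero-length terminator; any other descriptor invalidates the page.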
*/ 4178 if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { 4179 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); 4180 if (desc->length != 0) { 4181 return false; 4182 } 4183 } 4184 4185 return true; 4186 } 4187 4188 static bool 4189 bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) 4190 { 4191 uint32_t crc; 4192 struct spdk_blob_md_page *page = ctx->page; 4193 4194 crc = blob_md_page_calc_crc(page); 4195 if (crc != page->crc) { 4196 return false; 4197 } 4198 4199 /* First page of a sequence should match the blobid. */ 4200 if (page->sequence_num == 0 && 4201 bs_page_to_blobid(ctx->cur_page) != page->id) { 4202 return false; 4203 } 4204 assert(bs_load_cur_extent_page_valid(page) == false); 4205 4206 return true; 4207 } 4208 4209 static void bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); 4210 4211 static void 4212 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4213 { 4214 struct spdk_bs_load_ctx *ctx = cb_arg; 4215 4216 if (bserrno != 0) { 4217 bs_load_ctx_fail(ctx, bserrno); 4218 return; 4219 } 4220 4221 bs_load_complete(ctx); 4222 } 4223 4224 static void 4225 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4226 { 4227 struct spdk_bs_load_ctx *ctx = cb_arg; 4228 4229 spdk_free(ctx->mask); 4230 ctx->mask = NULL; 4231 4232 if (bserrno != 0) { 4233 bs_load_ctx_fail(ctx, bserrno); 4234 return; 4235 } 4236 4237 bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); 4238 } 4239 4240 static void 4241 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4242 { 4243 struct spdk_bs_load_ctx *ctx = cb_arg; 4244 4245 spdk_free(ctx->mask); 4246 ctx->mask = NULL; 4247 4248 if (bserrno != 0) { 4249 bs_load_ctx_fail(ctx, bserrno); 4250 return; 4251 } 4252 4253 bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); 4254 } 4255 4256 static void 4257 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) 4258 { 4259 bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); 4260 } 4261 4262 static void 4263 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) 4264 { 4265 uint64_t num_md_clusters; 4266 uint64_t i; 4267 4268 ctx->in_page_chain = false; 4269 4270 do { 4271 ctx->page_index++; 4272 } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); 4273 4274 if (ctx->page_index < ctx->super->md_len) { 4275 ctx->cur_page = ctx->page_index; 4276 bs_load_replay_cur_md_page(ctx); 4277 } else { 4278 /* Claim all of the clusters used by the metadata */ 4279 num_md_clusters = spdk_divide_round_up( 4280 ctx->super->md_start + ctx->super->md_len, ctx->bs->pages_per_cluster); 4281 for (i = 0; i < num_md_clusters; i++) { 4282 spdk_bit_array_set(ctx->used_clusters, i); 4283 } 4284 ctx->bs->num_free_clusters -= num_md_clusters; 4285 spdk_free(ctx->page); 4286 bs_load_write_used_md(ctx); 4287 } 4288 } 4289 4290 static void 4291 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4292 { 4293 struct spdk_bs_load_ctx *ctx = cb_arg; 4294 uint32_t page_num; 4295 uint64_t i; 4296 4297 if (bserrno != 0) { 4298 spdk_free(ctx->extent_pages); 4299 bs_load_ctx_fail(ctx, bserrno); 4300 return; 4301 } 4302 4303 for (i = 0; i < ctx->num_extent_pages; i++) { 4304 /* Extent pages are only read when present within in chain md. 4305 * Integrity of md is not right if that page was not a valid extent page. 
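 * A page that fails validation here (bad CRC, non-zero sequence number, or a descriptor other than a single EXTENT_PAGE) aborts the whole load with -EILSEQ rather than being skipped.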
*/ 4306 if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { 4307 spdk_free(ctx->extent_pages); 4308 bs_load_ctx_fail(ctx, -EILSEQ); 4309 return; 4310 } 4311 4312 page_num = ctx->extent_page_num[i]; 4313 spdk_bit_array_set(ctx->bs->used_md_pages, page_num); 4314 if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { 4315 spdk_free(ctx->extent_pages); 4316 bs_load_ctx_fail(ctx, -EILSEQ); 4317 return; 4318 } 4319 } 4320 4321 spdk_free(ctx->extent_pages); 4322 free(ctx->extent_page_num); 4323 ctx->extent_page_num = NULL; 4324 ctx->num_extent_pages = 0; 4325 4326 bs_load_replay_md_chain_cpl(ctx); 4327 } 4328 4329 static void 4330 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) 4331 { 4332 spdk_bs_batch_t *batch; 4333 uint32_t page; 4334 uint64_t lba; 4335 uint64_t i; 4336 4337 ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0, 4338 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4339 if (!ctx->extent_pages) { 4340 bs_load_ctx_fail(ctx, -ENOMEM); 4341 return; 4342 } 4343 4344 batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); 4345 4346 for (i = 0; i < ctx->num_extent_pages; i++) { 4347 page = ctx->extent_page_num[i]; 4348 assert(page < ctx->super->md_len); 4349 lba = bs_md_page_to_lba(ctx->bs, page); 4350 bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, 4351 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); 4352 } 4353 4354 bs_batch_close(batch); 4355 } 4356 4357 static void 4358 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4359 { 4360 struct spdk_bs_load_ctx *ctx = cb_arg; 4361 uint32_t page_num; 4362 struct spdk_blob_md_page *page; 4363 4364 if (bserrno != 0) { 4365 bs_load_ctx_fail(ctx, bserrno); 4366 return; 4367 } 4368 4369 page_num = ctx->cur_page; 4370 page = ctx->page; 4371 if (bs_load_cur_md_page_valid(ctx) == true) { 4372 if (page->sequence_num == 0 || ctx->in_page_chain == true) { 4373 spdk_spin_lock(&ctx->bs->used_lock); 4374 bs_claim_md_page(ctx->bs, page_num); 4375 spdk_spin_unlock(&ctx->bs->used_lock); 4376 if (page->sequence_num == 0) { 4377 SPDK_NOTICELOG("Recover: blob 0x%" PRIx32 "\n", page_num); 4378 spdk_bit_array_set(ctx->bs->used_blobids, page_num); 4379 } 4380 if (bs_load_replay_md_parse_page(ctx, page)) { 4381 bs_load_ctx_fail(ctx, -EILSEQ); 4382 return; 4383 } 4384 if (page->next != SPDK_INVALID_MD_PAGE) { 4385 ctx->in_page_chain = true; 4386 ctx->cur_page = page->next; 4387 bs_load_replay_cur_md_page(ctx); 4388 return; 4389 } 4390 if (ctx->num_extent_pages != 0) { 4391 bs_load_replay_extent_pages(ctx); 4392 return; 4393 } 4394 } 4395 } 4396 bs_load_replay_md_chain_cpl(ctx); 4397 } 4398 4399 static void 4400 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) 4401 { 4402 uint64_t lba; 4403 4404 assert(ctx->cur_page < ctx->super->md_len); 4405 lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); 4406 bs_sequence_read_dev(ctx->seq, ctx->page, lba, 4407 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4408 bs_load_replay_md_cpl, ctx); 4409 } 4410 4411 static void 4412 bs_load_replay_md(struct spdk_bs_load_ctx *ctx) 4413 { 4414 ctx->page_index = 0; 4415 ctx->cur_page = 0; 4416 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4417 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4418 if (!ctx->page) { 4419 bs_load_ctx_fail(ctx, -ENOMEM); 4420 return; 4421 } 4422 bs_load_replay_cur_md_page(ctx); 4423 } 4424 4425 static void 4426 bs_recover(struct spdk_bs_load_ctx *ctx) 4427 { 4428 int rc; 4429 4430 SPDK_NOTICELOG("Performing recovery on blobstore\n"); 4431 rc = 
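	/* Recovery rebuilds the allocation state from scratch: the used_md_pages, used_blobids, used_clusters and open_blobids bitmaps are resized below and then repopulated by replaying every metadata page via bs_load_replay_md(). */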
spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); 4432 if (rc < 0) { 4433 bs_load_ctx_fail(ctx, -ENOMEM); 4434 return; 4435 } 4436 4437 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len); 4438 if (rc < 0) { 4439 bs_load_ctx_fail(ctx, -ENOMEM); 4440 return; 4441 } 4442 4443 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4444 if (rc < 0) { 4445 bs_load_ctx_fail(ctx, -ENOMEM); 4446 return; 4447 } 4448 4449 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); 4450 if (rc < 0) { 4451 bs_load_ctx_fail(ctx, -ENOMEM); 4452 return; 4453 } 4454 4455 ctx->bs->num_free_clusters = ctx->bs->total_clusters; 4456 bs_load_replay_md(ctx); 4457 } 4458 4459 static int 4460 bs_parse_super(struct spdk_bs_load_ctx *ctx) 4461 { 4462 int rc; 4463 4464 if (ctx->super->size == 0) { 4465 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 4466 } 4467 4468 if (ctx->super->io_unit_size == 0) { 4469 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 4470 } 4471 4472 ctx->bs->clean = 1; 4473 ctx->bs->cluster_sz = ctx->super->cluster_size; 4474 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 4475 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 4476 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 4477 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 4478 } 4479 ctx->bs->io_unit_size = ctx->super->io_unit_size; 4480 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4481 if (rc < 0) { 4482 return -ENOMEM; 4483 } 4484 ctx->bs->md_start = ctx->super->md_start; 4485 ctx->bs->md_len = ctx->super->md_len; 4486 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len); 4487 if (rc < 0) { 4488 return -ENOMEM; 4489 } 4490 4491 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 4492 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 4493 ctx->bs->super_blob = ctx->super->super_blob; 4494 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 4495 4496 return 0; 4497 } 4498 4499 static void 4500 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4501 { 4502 struct spdk_bs_load_ctx *ctx = cb_arg; 4503 uint32_t crc; 4504 int rc; 4505 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; 4506 4507 if (ctx->super->version > SPDK_BS_VERSION || 4508 ctx->super->version < SPDK_BS_INITIAL_VERSION) { 4509 bs_load_ctx_fail(ctx, -EILSEQ); 4510 return; 4511 } 4512 4513 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4514 sizeof(ctx->super->signature)) != 0) { 4515 bs_load_ctx_fail(ctx, -EILSEQ); 4516 return; 4517 } 4518 4519 crc = blob_md_page_calc_crc(ctx->super); 4520 if (crc != ctx->super->crc) { 4521 bs_load_ctx_fail(ctx, -EILSEQ); 4522 return; 4523 } 4524 4525 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4526 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n"); 4527 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4528 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n"); 4529 } else { 4530 SPDK_DEBUGLOG(blob, "Unexpected bstype\n"); 4531 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4532 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4533 bs_load_ctx_fail(ctx, -ENXIO); 4534 return; 4535 } 4536 4537 if (ctx->super->size > ctx->bs->dev->blockcnt * 
ctx->bs->dev->blocklen) { 4538 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n", 4539 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); 4540 bs_load_ctx_fail(ctx, -EILSEQ); 4541 return; 4542 } 4543 4544 rc = bs_parse_super(ctx); 4545 if (rc < 0) { 4546 bs_load_ctx_fail(ctx, rc); 4547 return; 4548 } 4549 4550 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0 || ctx->force_recover) { 4551 bs_recover(ctx); 4552 } else { 4553 bs_load_read_used_pages(ctx); 4554 } 4555 } 4556 4557 static inline int 4558 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst) 4559 { 4560 4561 if (!src->opts_size) { 4562 SPDK_ERRLOG("opts_size should not be zero value\n"); 4563 return -1; 4564 } 4565 4566 #define FIELD_OK(field) \ 4567 offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size 4568 4569 #define SET_FIELD(field) \ 4570 if (FIELD_OK(field)) { \ 4571 dst->field = src->field; \ 4572 } \ 4573 4574 SET_FIELD(cluster_sz); 4575 SET_FIELD(num_md_pages); 4576 SET_FIELD(max_md_ops); 4577 SET_FIELD(max_channel_ops); 4578 SET_FIELD(clear_method); 4579 4580 if (FIELD_OK(bstype)) { 4581 memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype)); 4582 } 4583 SET_FIELD(iter_cb_fn); 4584 SET_FIELD(iter_cb_arg); 4585 SET_FIELD(force_recover); 4586 4587 dst->opts_size = src->opts_size; 4588 4589 /* You should not remove this statement, but need to update the assert statement 4590 * if you add a new field, and also add a corresponding SET_FIELD statement */ 4591 SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 72, "Incorrect size"); 4592 4593 #undef FIELD_OK 4594 #undef SET_FIELD 4595 4596 return 0; 4597 } 4598 4599 void 4600 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4601 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4602 { 4603 struct spdk_blob_store *bs; 4604 struct spdk_bs_cpl cpl; 4605 struct spdk_bs_load_ctx *ctx; 4606 struct spdk_bs_opts opts = {}; 4607 int err; 4608 4609 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev); 4610 4611 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4612 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen); 4613 dev->destroy(dev); 4614 cb_fn(cb_arg, NULL, -EINVAL); 4615 return; 4616 } 4617 4618 spdk_bs_opts_init(&opts, sizeof(opts)); 4619 if (o) { 4620 if (bs_opts_copy(o, &opts)) { 4621 return; 4622 } 4623 } 4624 4625 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { 4626 dev->destroy(dev); 4627 cb_fn(cb_arg, NULL, -EINVAL); 4628 return; 4629 } 4630 4631 err = bs_alloc(dev, &opts, &bs, &ctx); 4632 if (err) { 4633 dev->destroy(dev); 4634 cb_fn(cb_arg, NULL, err); 4635 return; 4636 } 4637 4638 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4639 cpl.u.bs_handle.cb_fn = cb_fn; 4640 cpl.u.bs_handle.cb_arg = cb_arg; 4641 cpl.u.bs_handle.bs = bs; 4642 4643 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4644 if (!ctx->seq) { 4645 spdk_free(ctx->super); 4646 free(ctx); 4647 bs_free(bs); 4648 cb_fn(cb_arg, NULL, -ENOMEM); 4649 return; 4650 } 4651 4652 /* Read the super block */ 4653 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4654 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4655 bs_load_super_cpl, ctx); 4656 } 4657 4658 /* END spdk_bs_load */ 4659 4660 /* START spdk_bs_dump */ 4661 4662 static void 4663 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) 4664 { 4665 spdk_free(ctx->super); 4666 4667 /* 4668 * We need to defer calling bs_call_cpl() until after 4669 * dev destruction, so 
tuck these away for later use. 4670 */ 4671 ctx->bs->unload_err = bserrno; 4672 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4673 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4674 4675 bs_sequence_finish(seq, 0); 4676 bs_free(ctx->bs); 4677 free(ctx); 4678 } 4679 4680 static void 4681 bs_dump_print_xattr(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4682 { 4683 struct spdk_blob_md_descriptor_xattr *desc_xattr; 4684 uint32_t i; 4685 const char *type; 4686 4687 desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; 4688 4689 if (desc_xattr->length != 4690 sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + 4691 desc_xattr->name_length + desc_xattr->value_length) { 4692 } 4693 4694 memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); 4695 ctx->xattr_name[desc_xattr->name_length] = '\0'; 4696 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4697 type = "XATTR"; 4698 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4699 type = "XATTR_INTERNAL"; 4700 } else { 4701 assert(false); 4702 type = "XATTR_?"; 4703 } 4704 fprintf(ctx->fp, "%s: name = \"%s\"\n", type, ctx->xattr_name); 4705 fprintf(ctx->fp, " value = \""); 4706 ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, 4707 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 4708 desc_xattr->value_length); 4709 fprintf(ctx->fp, "\"\n"); 4710 for (i = 0; i < desc_xattr->value_length; i++) { 4711 if (i % 16 == 0) { 4712 fprintf(ctx->fp, " "); 4713 } 4714 fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); 4715 if ((i + 1) % 16 == 0) { 4716 fprintf(ctx->fp, "\n"); 4717 } 4718 } 4719 if (i % 16 != 0) { 4720 fprintf(ctx->fp, "\n"); 4721 } 4722 } 4723 4724 struct type_flag_desc { 4725 uint64_t mask; 4726 uint64_t val; 4727 const char *name; 4728 }; 4729 4730 static void 4731 bs_dump_print_type_bits(struct spdk_bs_load_ctx *ctx, uint64_t flags, 4732 struct type_flag_desc *desc, size_t numflags) 4733 { 4734 uint64_t covered = 0; 4735 size_t i; 4736 4737 for (i = 0; i < numflags; i++) { 4738 if ((desc[i].mask & flags) != desc[i].val) { 4739 continue; 4740 } 4741 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " %s", desc[i].val, desc[i].name); 4742 if (desc[i].mask != desc[i].val) { 4743 fprintf(ctx->fp, " (mask 0x%" PRIx64 " value 0x%" PRIx64 ")", 4744 desc[i].mask, desc[i].val); 4745 } 4746 fprintf(ctx->fp, "\n"); 4747 covered |= desc[i].mask; 4748 } 4749 if ((flags & ~covered) != 0) { 4750 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " Unknown\n", flags & ~covered); 4751 } 4752 } 4753 4754 static void 4755 bs_dump_print_type_flags(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4756 { 4757 struct spdk_blob_md_descriptor_flags *type_desc; 4758 #define ADD_FLAG(f) { f, f, #f } 4759 #define ADD_MASK_VAL(m, v) { m, v, #v } 4760 static struct type_flag_desc invalid[] = { 4761 ADD_FLAG(SPDK_BLOB_THIN_PROV), 4762 ADD_FLAG(SPDK_BLOB_INTERNAL_XATTR), 4763 ADD_FLAG(SPDK_BLOB_EXTENT_TABLE), 4764 }; 4765 static struct type_flag_desc data_ro[] = { 4766 ADD_FLAG(SPDK_BLOB_READ_ONLY), 4767 }; 4768 static struct type_flag_desc md_ro[] = { 4769 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_DEFAULT), 4770 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_NONE), 4771 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_UNMAP), 4772 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_WRITE_ZEROES), 4773 }; 4774 #undef ADD_FLAG 4775 #undef ADD_MASK_VAL 4776 4777 
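	/* For context (behavior implemented elsewhere in blobstore, not in this dump helper): unknown "invalid" bits prevent a blob from being loaded, unknown "data_ro" bits force the blob read-only, and unknown "md_ro" bits force only its metadata read-only. */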
type_desc = (struct spdk_blob_md_descriptor_flags *)desc; 4778 fprintf(ctx->fp, "Flags:\n"); 4779 fprintf(ctx->fp, "\tinvalid: 0x%016" PRIx64 "\n", type_desc->invalid_flags); 4780 bs_dump_print_type_bits(ctx, type_desc->invalid_flags, invalid, 4781 SPDK_COUNTOF(invalid)); 4782 fprintf(ctx->fp, "\tdata_ro: 0x%016" PRIx64 "\n", type_desc->data_ro_flags); 4783 bs_dump_print_type_bits(ctx, type_desc->data_ro_flags, data_ro, 4784 SPDK_COUNTOF(data_ro)); 4785 fprintf(ctx->fp, "\t md_ro: 0x%016" PRIx64 "\n", type_desc->md_ro_flags); 4786 bs_dump_print_type_bits(ctx, type_desc->md_ro_flags, md_ro, 4787 SPDK_COUNTOF(md_ro)); 4788 } 4789 4790 static void 4791 bs_dump_print_extent_table(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4792 { 4793 struct spdk_blob_md_descriptor_extent_table *et_desc; 4794 uint64_t num_extent_pages; 4795 uint32_t et_idx; 4796 4797 et_desc = (struct spdk_blob_md_descriptor_extent_table *)desc; 4798 num_extent_pages = (et_desc->length - sizeof(et_desc->num_clusters)) / 4799 sizeof(et_desc->extent_page[0]); 4800 4801 fprintf(ctx->fp, "Extent table:\n"); 4802 for (et_idx = 0; et_idx < num_extent_pages; et_idx++) { 4803 if (et_desc->extent_page[et_idx].page_idx == 0) { 4804 /* Zeroes represent unallocated extent pages. */ 4805 continue; 4806 } 4807 fprintf(ctx->fp, "\tExtent page: %5" PRIu32 " length %3" PRIu32 4808 " at LBA %" PRIu64 "\n", et_desc->extent_page[et_idx].page_idx, 4809 et_desc->extent_page[et_idx].num_pages, 4810 bs_md_page_to_lba(ctx->bs, et_desc->extent_page[et_idx].page_idx)); 4811 } 4812 } 4813 4814 static void 4815 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx) 4816 { 4817 uint32_t page_idx = ctx->cur_page; 4818 struct spdk_blob_md_page *page = ctx->page; 4819 struct spdk_blob_md_descriptor *desc; 4820 size_t cur_desc = 0; 4821 uint32_t crc; 4822 4823 fprintf(ctx->fp, "=========\n"); 4824 fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); 4825 fprintf(ctx->fp, "Start LBA: %" PRIu64 "\n", bs_md_page_to_lba(ctx->bs, page_idx)); 4826 fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); 4827 fprintf(ctx->fp, "Sequence: %" PRIu32 "\n", page->sequence_num); 4828 if (page->next == SPDK_INVALID_MD_PAGE) { 4829 fprintf(ctx->fp, "Next: None\n"); 4830 } else { 4831 fprintf(ctx->fp, "Next: %" PRIu32 "\n", page->next); 4832 } 4833 fprintf(ctx->fp, "In used bit array%s:", ctx->super->clean ? "" : " (not clean: dubious)"); 4834 if (spdk_bit_array_get(ctx->bs->used_md_pages, page_idx)) { 4835 fprintf(ctx->fp, " md"); 4836 } 4837 if (spdk_bit_array_get(ctx->bs->used_blobids, page_idx)) { 4838 fprintf(ctx->fp, " blob"); 4839 } 4840 fprintf(ctx->fp, "\n"); 4841 4842 crc = blob_md_page_calc_crc(page); 4843 fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? 
"OK" : "Mismatch"); 4844 4845 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4846 while (cur_desc < sizeof(page->descriptors)) { 4847 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4848 if (desc->length == 0) { 4849 /* If padding and length are 0, this terminates the page */ 4850 break; 4851 } 4852 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4853 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4854 unsigned int i; 4855 4856 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4857 4858 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4859 if (desc_extent_rle->extents[i].cluster_idx != 0) { 4860 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4861 desc_extent_rle->extents[i].cluster_idx); 4862 } else { 4863 fprintf(ctx->fp, "Unallocated Extent - "); 4864 } 4865 fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); 4866 fprintf(ctx->fp, "\n"); 4867 } 4868 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4869 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4870 unsigned int i; 4871 4872 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4873 4874 for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { 4875 if (desc_extent->cluster_idx[i] != 0) { 4876 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4877 desc_extent->cluster_idx[i]); 4878 } else { 4879 fprintf(ctx->fp, "Unallocated Extent"); 4880 } 4881 fprintf(ctx->fp, "\n"); 4882 } 4883 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4884 bs_dump_print_xattr(ctx, desc); 4885 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4886 bs_dump_print_xattr(ctx, desc); 4887 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4888 bs_dump_print_type_flags(ctx, desc); 4889 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4890 bs_dump_print_extent_table(ctx, desc); 4891 } else { 4892 /* Error */ 4893 fprintf(ctx->fp, "Unknown descriptor type %" PRIu8 "\n", desc->type); 4894 } 4895 /* Advance to the next descriptor */ 4896 cur_desc += sizeof(*desc) + desc->length; 4897 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4898 break; 4899 } 4900 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4901 } 4902 } 4903 4904 static void 4905 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4906 { 4907 struct spdk_bs_load_ctx *ctx = cb_arg; 4908 4909 if (bserrno != 0) { 4910 bs_dump_finish(seq, ctx, bserrno); 4911 return; 4912 } 4913 4914 if (ctx->page->id != 0) { 4915 bs_dump_print_md_page(ctx); 4916 } 4917 4918 ctx->cur_page++; 4919 4920 if (ctx->cur_page < ctx->super->md_len) { 4921 bs_dump_read_md_page(seq, ctx); 4922 } else { 4923 spdk_free(ctx->page); 4924 bs_dump_finish(seq, ctx, 0); 4925 } 4926 } 4927 4928 static void 4929 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) 4930 { 4931 struct spdk_bs_load_ctx *ctx = cb_arg; 4932 uint64_t lba; 4933 4934 assert(ctx->cur_page < ctx->super->md_len); 4935 lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); 4936 bs_sequence_read_dev(seq, ctx->page, lba, 4937 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4938 bs_dump_read_md_page_cpl, ctx); 4939 } 4940 4941 static void 4942 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4943 { 4944 struct spdk_bs_load_ctx *ctx = cb_arg; 4945 int rc; 4946 4947 fprintf(ctx->fp, "Signature: \"%.8s\" ", 
ctx->super->signature); 4948 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4949 sizeof(ctx->super->signature)) != 0) { 4950 fprintf(ctx->fp, "(Mismatch)\n"); 4951 bs_dump_finish(seq, ctx, bserrno); 4952 return; 4953 } else { 4954 fprintf(ctx->fp, "(OK)\n"); 4955 } 4956 fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); 4957 fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, 4958 (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch"); 4959 fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); 4960 fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); 4961 fprintf(ctx->fp, "Super Blob ID: "); 4962 if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { 4963 fprintf(ctx->fp, "(None)\n"); 4964 } else { 4965 fprintf(ctx->fp, "0x%" PRIx64 "\n", ctx->super->super_blob); 4966 } 4967 fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); 4968 fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); 4969 fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); 4970 fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); 4971 fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); 4972 fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); 4973 fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); 4974 fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); 4975 fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); 4976 4977 ctx->cur_page = 0; 4978 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4979 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4980 if (!ctx->page) { 4981 bs_dump_finish(seq, ctx, -ENOMEM); 4982 return; 4983 } 4984 4985 rc = bs_parse_super(ctx); 4986 if (rc < 0) { 4987 bs_load_ctx_fail(ctx, rc); 4988 return; 4989 } 4990 4991 bs_load_read_used_pages(ctx); 4992 } 4993 4994 void 4995 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, 4996 spdk_bs_op_complete cb_fn, void *cb_arg) 4997 { 4998 struct spdk_blob_store *bs; 4999 struct spdk_bs_cpl cpl; 5000 struct spdk_bs_load_ctx *ctx; 5001 struct spdk_bs_opts opts = {}; 5002 int err; 5003 5004 SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev); 5005 5006 spdk_bs_opts_init(&opts, sizeof(opts)); 5007 5008 err = bs_alloc(dev, &opts, &bs, &ctx); 5009 if (err) { 5010 dev->destroy(dev); 5011 cb_fn(cb_arg, err); 5012 return; 5013 } 5014 5015 ctx->dumping = true; 5016 ctx->fp = fp; 5017 ctx->print_xattr_fn = print_xattr_fn; 5018 5019 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5020 cpl.u.bs_basic.cb_fn = cb_fn; 5021 cpl.u.bs_basic.cb_arg = cb_arg; 5022 5023 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5024 if (!ctx->seq) { 5025 spdk_free(ctx->super); 5026 free(ctx); 5027 bs_free(bs); 5028 cb_fn(cb_arg, -ENOMEM); 5029 return; 5030 } 5031 5032 /* Read the super block */ 5033 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5034 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5035 bs_dump_super_cpl, ctx); 5036 } 5037 5038 /* END spdk_bs_dump */ 5039 5040 /* START spdk_bs_init */ 5041 5042 static void 5043 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5044 { 5045 struct spdk_bs_load_ctx *ctx = cb_arg; 5046 5047 ctx->bs->used_clusters = 
spdk_bit_pool_create_from_array(ctx->used_clusters); 5048 spdk_free(ctx->super); 5049 free(ctx); 5050 5051 bs_sequence_finish(seq, bserrno); 5052 } 5053 5054 static void 5055 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5056 { 5057 struct spdk_bs_load_ctx *ctx = cb_arg; 5058 5059 /* Write super block */ 5060 bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), 5061 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), 5062 bs_init_persist_super_cpl, ctx); 5063 } 5064 5065 void 5066 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 5067 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 5068 { 5069 struct spdk_bs_load_ctx *ctx; 5070 struct spdk_blob_store *bs; 5071 struct spdk_bs_cpl cpl; 5072 spdk_bs_sequence_t *seq; 5073 spdk_bs_batch_t *batch; 5074 uint64_t num_md_lba; 5075 uint64_t num_md_pages; 5076 uint64_t num_md_clusters; 5077 uint64_t max_used_cluster_mask_len; 5078 uint32_t i; 5079 struct spdk_bs_opts opts = {}; 5080 int rc; 5081 uint64_t lba, lba_count; 5082 5083 SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev); 5084 5085 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 5086 SPDK_ERRLOG("unsupported dev block length of %d\n", 5087 dev->blocklen); 5088 dev->destroy(dev); 5089 cb_fn(cb_arg, NULL, -EINVAL); 5090 return; 5091 } 5092 5093 spdk_bs_opts_init(&opts, sizeof(opts)); 5094 if (o) { 5095 if (bs_opts_copy(o, &opts)) { 5096 return; 5097 } 5098 } 5099 5100 if (bs_opts_verify(&opts) != 0) { 5101 dev->destroy(dev); 5102 cb_fn(cb_arg, NULL, -EINVAL); 5103 return; 5104 } 5105 5106 rc = bs_alloc(dev, &opts, &bs, &ctx); 5107 if (rc) { 5108 dev->destroy(dev); 5109 cb_fn(cb_arg, NULL, rc); 5110 return; 5111 } 5112 5113 if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { 5114 /* By default, allocate 1 page per cluster. 5115 * Technically, this over-allocates metadata 5116 * because more metadata will reduce the number 5117 * of usable clusters. This can be addressed with 5118 * more complex math in the future. 5119 */ 5120 bs->md_len = bs->total_clusters; 5121 } else { 5122 bs->md_len = opts.num_md_pages; 5123 } 5124 rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); 5125 if (rc < 0) { 5126 spdk_free(ctx->super); 5127 free(ctx); 5128 bs_free(bs); 5129 cb_fn(cb_arg, NULL, -ENOMEM); 5130 return; 5131 } 5132 5133 rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); 5134 if (rc < 0) { 5135 spdk_free(ctx->super); 5136 free(ctx); 5137 bs_free(bs); 5138 cb_fn(cb_arg, NULL, -ENOMEM); 5139 return; 5140 } 5141 5142 rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); 5143 if (rc < 0) { 5144 spdk_free(ctx->super); 5145 free(ctx); 5146 bs_free(bs); 5147 cb_fn(cb_arg, NULL, -ENOMEM); 5148 return; 5149 } 5150 5151 memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 5152 sizeof(ctx->super->signature)); 5153 ctx->super->version = SPDK_BS_VERSION; 5154 ctx->super->length = sizeof(*ctx->super); 5155 ctx->super->super_blob = bs->super_blob; 5156 ctx->super->clean = 0; 5157 ctx->super->cluster_size = bs->cluster_sz; 5158 ctx->super->io_unit_size = bs->io_unit_size; 5159 memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); 5160 5161 /* Calculate how many pages the metadata consumes at the front 5162 * of the disk. 5163 */ 5164 5165 /* The super block uses 1 page */ 5166 num_md_pages = 1; 5167 5168 /* The used_md_pages mask requires 1 bit per metadata page, rounded 5169 * up to the nearest page, plus a header. 
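 * For example (illustrative numbers only): with the 4 KiB metadata page size and bs->md_len == 1024 pages, the mask body needs 1024 / 8 = 128 bytes; adding the struct spdk_bs_md_mask header and rounding up to a whole page gives used_page_mask_len == 1.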
5170 */ 5171 ctx->super->used_page_mask_start = num_md_pages; 5172 ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5173 spdk_divide_round_up(bs->md_len, 8), 5174 SPDK_BS_PAGE_SIZE); 5175 num_md_pages += ctx->super->used_page_mask_len; 5176 5177 /* The used_clusters mask requires 1 bit per cluster, rounded 5178 * up to the nearest page, plus a header. 5179 */ 5180 ctx->super->used_cluster_mask_start = num_md_pages; 5181 ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5182 spdk_divide_round_up(bs->total_clusters, 8), 5183 SPDK_BS_PAGE_SIZE); 5184 /* The blobstore might be extended, then the used_cluster bitmap will need more space. 5185 * Here we calculate the max clusters we can support according to the 5186 * num_md_pages (bs->md_len). 5187 */ 5188 max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5189 spdk_divide_round_up(bs->md_len, 8), 5190 SPDK_BS_PAGE_SIZE); 5191 max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len, 5192 ctx->super->used_cluster_mask_len); 5193 num_md_pages += max_used_cluster_mask_len; 5194 5195 /* The used_blobids mask requires 1 bit per metadata page, rounded 5196 * up to the nearest page, plus a header. 5197 */ 5198 ctx->super->used_blobid_mask_start = num_md_pages; 5199 ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5200 spdk_divide_round_up(bs->md_len, 8), 5201 SPDK_BS_PAGE_SIZE); 5202 num_md_pages += ctx->super->used_blobid_mask_len; 5203 5204 /* The metadata region size was chosen above */ 5205 ctx->super->md_start = bs->md_start = num_md_pages; 5206 ctx->super->md_len = bs->md_len; 5207 num_md_pages += bs->md_len; 5208 5209 num_md_lba = bs_page_to_lba(bs, num_md_pages); 5210 5211 ctx->super->size = dev->blockcnt * dev->blocklen; 5212 5213 ctx->super->crc = blob_md_page_calc_crc(ctx->super); 5214 5215 num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); 5216 if (num_md_clusters > bs->total_clusters) { 5217 SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " 5218 "please decrease number of pages reserved for metadata " 5219 "or increase cluster size.\n"); 5220 spdk_free(ctx->super); 5221 spdk_bit_array_free(&ctx->used_clusters); 5222 free(ctx); 5223 bs_free(bs); 5224 cb_fn(cb_arg, NULL, -ENOMEM); 5225 return; 5226 } 5227 /* Claim all of the clusters used by the metadata */ 5228 for (i = 0; i < num_md_clusters; i++) { 5229 spdk_bit_array_set(ctx->used_clusters, i); 5230 } 5231 5232 bs->num_free_clusters -= num_md_clusters; 5233 bs->total_data_clusters = bs->num_free_clusters; 5234 5235 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 5236 cpl.u.bs_handle.cb_fn = cb_fn; 5237 cpl.u.bs_handle.cb_arg = cb_arg; 5238 cpl.u.bs_handle.bs = bs; 5239 5240 seq = bs_sequence_start(bs->md_channel, &cpl); 5241 if (!seq) { 5242 spdk_free(ctx->super); 5243 free(ctx); 5244 bs_free(bs); 5245 cb_fn(cb_arg, NULL, -ENOMEM); 5246 return; 5247 } 5248 5249 batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); 5250 5251 /* Clear metadata space */ 5252 bs_batch_write_zeroes_dev(batch, 0, num_md_lba); 5253 5254 lba = num_md_lba; 5255 lba_count = ctx->bs->dev->blockcnt - lba; 5256 switch (opts.clear_method) { 5257 case BS_CLEAR_WITH_UNMAP: 5258 /* Trim data clusters */ 5259 bs_batch_unmap_dev(batch, lba, lba_count); 5260 break; 5261 case BS_CLEAR_WITH_WRITE_ZEROES: 5262 /* Write_zeroes to data clusters */ 5263 bs_batch_write_zeroes_dev(batch, lba, lba_count); 5264 break; 5265 case 
BS_CLEAR_WITH_NONE: 5266 default: 5267 break; 5268 } 5269 5270 bs_batch_close(batch); 5271 } 5272 5273 /* END spdk_bs_init */ 5274 5275 /* START spdk_bs_destroy */ 5276 5277 static void 5278 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5279 { 5280 struct spdk_bs_load_ctx *ctx = cb_arg; 5281 struct spdk_blob_store *bs = ctx->bs; 5282 5283 /* 5284 * We need to defer calling bs_call_cpl() until after 5285 * dev destruction, so tuck these away for later use. 5286 */ 5287 bs->unload_err = bserrno; 5288 memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5289 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5290 5291 bs_sequence_finish(seq, bserrno); 5292 5293 bs_free(bs); 5294 free(ctx); 5295 } 5296 5297 void 5298 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, 5299 void *cb_arg) 5300 { 5301 struct spdk_bs_cpl cpl; 5302 spdk_bs_sequence_t *seq; 5303 struct spdk_bs_load_ctx *ctx; 5304 5305 SPDK_DEBUGLOG(blob, "Destroying blobstore\n"); 5306 5307 if (!RB_EMPTY(&bs->open_blobs)) { 5308 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5309 cb_fn(cb_arg, -EBUSY); 5310 return; 5311 } 5312 5313 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5314 cpl.u.bs_basic.cb_fn = cb_fn; 5315 cpl.u.bs_basic.cb_arg = cb_arg; 5316 5317 ctx = calloc(1, sizeof(*ctx)); 5318 if (!ctx) { 5319 cb_fn(cb_arg, -ENOMEM); 5320 return; 5321 } 5322 5323 ctx->bs = bs; 5324 5325 seq = bs_sequence_start(bs->md_channel, &cpl); 5326 if (!seq) { 5327 free(ctx); 5328 cb_fn(cb_arg, -ENOMEM); 5329 return; 5330 } 5331 5332 /* Write zeroes to the super block */ 5333 bs_sequence_write_zeroes_dev(seq, 5334 bs_page_to_lba(bs, 0), 5335 bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), 5336 bs_destroy_trim_cpl, ctx); 5337 } 5338 5339 /* END spdk_bs_destroy */ 5340 5341 /* START spdk_bs_unload */ 5342 5343 static void 5344 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) 5345 { 5346 spdk_bs_sequence_t *seq = ctx->seq; 5347 5348 spdk_free(ctx->super); 5349 5350 /* 5351 * We need to defer calling bs_call_cpl() until after 5352 * dev destruction, so tuck these away for later use. 
5353 */ 5354 ctx->bs->unload_err = bserrno; 5355 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5356 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5357 5358 bs_sequence_finish(seq, bserrno); 5359 5360 bs_free(ctx->bs); 5361 free(ctx); 5362 } 5363 5364 static void 5365 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5366 { 5367 struct spdk_bs_load_ctx *ctx = cb_arg; 5368 5369 bs_unload_finish(ctx, bserrno); 5370 } 5371 5372 static void 5373 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5374 { 5375 struct spdk_bs_load_ctx *ctx = cb_arg; 5376 5377 spdk_free(ctx->mask); 5378 5379 if (bserrno != 0) { 5380 bs_unload_finish(ctx, bserrno); 5381 return; 5382 } 5383 5384 ctx->super->clean = 1; 5385 5386 bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); 5387 } 5388 5389 static void 5390 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5391 { 5392 struct spdk_bs_load_ctx *ctx = cb_arg; 5393 5394 spdk_free(ctx->mask); 5395 ctx->mask = NULL; 5396 5397 if (bserrno != 0) { 5398 bs_unload_finish(ctx, bserrno); 5399 return; 5400 } 5401 5402 bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); 5403 } 5404 5405 static void 5406 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5407 { 5408 struct spdk_bs_load_ctx *ctx = cb_arg; 5409 5410 spdk_free(ctx->mask); 5411 ctx->mask = NULL; 5412 5413 if (bserrno != 0) { 5414 bs_unload_finish(ctx, bserrno); 5415 return; 5416 } 5417 5418 bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); 5419 } 5420 5421 static void 5422 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5423 { 5424 struct spdk_bs_load_ctx *ctx = cb_arg; 5425 5426 if (bserrno != 0) { 5427 bs_unload_finish(ctx, bserrno); 5428 return; 5429 } 5430 5431 bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl); 5432 } 5433 5434 void 5435 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) 5436 { 5437 struct spdk_bs_cpl cpl; 5438 struct spdk_bs_load_ctx *ctx; 5439 5440 SPDK_DEBUGLOG(blob, "Syncing blobstore\n"); 5441 5442 if (!RB_EMPTY(&bs->open_blobs)) { 5443 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5444 cb_fn(cb_arg, -EBUSY); 5445 return; 5446 } 5447 5448 ctx = calloc(1, sizeof(*ctx)); 5449 if (!ctx) { 5450 cb_fn(cb_arg, -ENOMEM); 5451 return; 5452 } 5453 5454 ctx->bs = bs; 5455 5456 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5457 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5458 if (!ctx->super) { 5459 free(ctx); 5460 cb_fn(cb_arg, -ENOMEM); 5461 return; 5462 } 5463 5464 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5465 cpl.u.bs_basic.cb_fn = cb_fn; 5466 cpl.u.bs_basic.cb_arg = cb_arg; 5467 5468 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5469 if (!ctx->seq) { 5470 spdk_free(ctx->super); 5471 free(ctx); 5472 cb_fn(cb_arg, -ENOMEM); 5473 return; 5474 } 5475 5476 /* Read super block */ 5477 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5478 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5479 bs_unload_read_super_cpl, ctx); 5480 } 5481 5482 /* END spdk_bs_unload */ 5483 5484 /* START spdk_bs_set_super */ 5485 5486 struct spdk_bs_set_super_ctx { 5487 struct spdk_blob_store *bs; 5488 struct spdk_bs_super_block *super; 5489 }; 5490 5491 static void 5492 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5493 { 5494 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5495 5496 if (bserrno != 
0) { 5497 SPDK_ERRLOG("Unable to write to super block of blobstore\n"); 5498 } 5499 5500 spdk_free(ctx->super); 5501 5502 bs_sequence_finish(seq, bserrno); 5503 5504 free(ctx); 5505 } 5506 5507 static void 5508 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5509 { 5510 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5511 5512 if (bserrno != 0) { 5513 SPDK_ERRLOG("Unable to read super block of blobstore\n"); 5514 spdk_free(ctx->super); 5515 bs_sequence_finish(seq, bserrno); 5516 free(ctx); 5517 return; 5518 } 5519 5520 bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); 5521 } 5522 5523 void 5524 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, 5525 spdk_bs_op_complete cb_fn, void *cb_arg) 5526 { 5527 struct spdk_bs_cpl cpl; 5528 spdk_bs_sequence_t *seq; 5529 struct spdk_bs_set_super_ctx *ctx; 5530 5531 SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n"); 5532 5533 ctx = calloc(1, sizeof(*ctx)); 5534 if (!ctx) { 5535 cb_fn(cb_arg, -ENOMEM); 5536 return; 5537 } 5538 5539 ctx->bs = bs; 5540 5541 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5542 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5543 if (!ctx->super) { 5544 free(ctx); 5545 cb_fn(cb_arg, -ENOMEM); 5546 return; 5547 } 5548 5549 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5550 cpl.u.bs_basic.cb_fn = cb_fn; 5551 cpl.u.bs_basic.cb_arg = cb_arg; 5552 5553 seq = bs_sequence_start(bs->md_channel, &cpl); 5554 if (!seq) { 5555 spdk_free(ctx->super); 5556 free(ctx); 5557 cb_fn(cb_arg, -ENOMEM); 5558 return; 5559 } 5560 5561 bs->super_blob = blobid; 5562 5563 /* Read super block */ 5564 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 5565 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5566 bs_set_super_read_cpl, ctx); 5567 } 5568 5569 /* END spdk_bs_set_super */ 5570 5571 void 5572 spdk_bs_get_super(struct spdk_blob_store *bs, 5573 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5574 { 5575 if (bs->super_blob == SPDK_BLOBID_INVALID) { 5576 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); 5577 } else { 5578 cb_fn(cb_arg, bs->super_blob, 0); 5579 } 5580 } 5581 5582 uint64_t 5583 spdk_bs_get_cluster_size(struct spdk_blob_store *bs) 5584 { 5585 return bs->cluster_sz; 5586 } 5587 5588 uint64_t 5589 spdk_bs_get_page_size(struct spdk_blob_store *bs) 5590 { 5591 return SPDK_BS_PAGE_SIZE; 5592 } 5593 5594 uint64_t 5595 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) 5596 { 5597 return bs->io_unit_size; 5598 } 5599 5600 uint64_t 5601 spdk_bs_free_cluster_count(struct spdk_blob_store *bs) 5602 { 5603 return bs->num_free_clusters; 5604 } 5605 5606 uint64_t 5607 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) 5608 { 5609 return bs->total_data_clusters; 5610 } 5611 5612 static int 5613 bs_register_md_thread(struct spdk_blob_store *bs) 5614 { 5615 bs->md_channel = spdk_get_io_channel(bs); 5616 if (!bs->md_channel) { 5617 SPDK_ERRLOG("Failed to get IO channel.\n"); 5618 return -1; 5619 } 5620 5621 return 0; 5622 } 5623 5624 static int 5625 bs_unregister_md_thread(struct spdk_blob_store *bs) 5626 { 5627 spdk_put_io_channel(bs->md_channel); 5628 5629 return 0; 5630 } 5631 5632 spdk_blob_id 5633 spdk_blob_get_id(struct spdk_blob *blob) 5634 { 5635 assert(blob != NULL); 5636 5637 return blob->id; 5638 } 5639 5640 uint64_t 5641 spdk_blob_get_num_pages(struct spdk_blob *blob) 5642 { 5643 assert(blob != NULL); 5644 5645 return bs_cluster_to_page(blob->bs, blob->active.num_clusters); 5646 } 5647 5648 uint64_t 5649 spdk_blob_get_num_io_units(struct spdk_blob *blob) 5650 { 
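	/* bs_io_unit_per_page() gives the number of io_units per SPDK_BS_PAGE_SIZE page; e.g. with a 4 KiB page and a 512-byte io_unit size (illustrative values) each page counts as 8 io_units. */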
5651 assert(blob != NULL); 5652 5653 return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); 5654 } 5655 5656 uint64_t 5657 spdk_blob_get_num_clusters(struct spdk_blob *blob) 5658 { 5659 assert(blob != NULL); 5660 5661 return blob->active.num_clusters; 5662 } 5663 5664 static uint64_t 5665 blob_find_io_unit(struct spdk_blob *blob, uint64_t offset, bool is_allocated) 5666 { 5667 uint64_t blob_io_unit_num = spdk_blob_get_num_io_units(blob); 5668 5669 while (offset < blob_io_unit_num) { 5670 if (bs_io_unit_is_allocated(blob, offset) == is_allocated) { 5671 return offset; 5672 } 5673 5674 offset += bs_num_io_units_to_cluster_boundary(blob, offset); 5675 } 5676 5677 return UINT64_MAX; 5678 } 5679 5680 uint64_t 5681 spdk_blob_get_next_allocated_io_unit(struct spdk_blob *blob, uint64_t offset) 5682 { 5683 return blob_find_io_unit(blob, offset, true); 5684 } 5685 5686 uint64_t 5687 spdk_blob_get_next_unallocated_io_unit(struct spdk_blob *blob, uint64_t offset) 5688 { 5689 return blob_find_io_unit(blob, offset, false); 5690 } 5691 5692 /* START spdk_bs_create_blob */ 5693 5694 static void 5695 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5696 { 5697 struct spdk_blob *blob = cb_arg; 5698 uint32_t page_idx = bs_blobid_to_page(blob->id); 5699 5700 if (bserrno != 0) { 5701 spdk_spin_lock(&blob->bs->used_lock); 5702 spdk_bit_array_clear(blob->bs->used_blobids, page_idx); 5703 bs_release_md_page(blob->bs, page_idx); 5704 spdk_spin_unlock(&blob->bs->used_lock); 5705 } 5706 5707 blob_free(blob); 5708 5709 bs_sequence_finish(seq, bserrno); 5710 } 5711 5712 static int 5713 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, 5714 bool internal) 5715 { 5716 uint64_t i; 5717 size_t value_len = 0; 5718 int rc; 5719 const void *value = NULL; 5720 if (xattrs->count > 0 && xattrs->get_value == NULL) { 5721 return -EINVAL; 5722 } 5723 for (i = 0; i < xattrs->count; i++) { 5724 xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); 5725 if (value == NULL || value_len == 0) { 5726 return -EINVAL; 5727 } 5728 rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); 5729 if (rc < 0) { 5730 return rc; 5731 } 5732 } 5733 return 0; 5734 } 5735 5736 static void 5737 blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst) 5738 { 5739 #define FIELD_OK(field) \ 5740 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 5741 5742 #define SET_FIELD(field) \ 5743 if (FIELD_OK(field)) { \ 5744 dst->field = src->field; \ 5745 } \ 5746 5747 SET_FIELD(num_clusters); 5748 SET_FIELD(thin_provision); 5749 SET_FIELD(clear_method); 5750 5751 if (FIELD_OK(xattrs)) { 5752 memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs)); 5753 } 5754 5755 SET_FIELD(use_extent_table); 5756 5757 dst->opts_size = src->opts_size; 5758 5759 /* You should not remove this statement, but need to update the assert statement 5760 * if you add a new field, and also add a corresponding SET_FIELD statement */ 5761 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 64, "Incorrect size"); 5762 5763 #undef FIELD_OK 5764 #undef SET_FIELD 5765 } 5766 5767 static void 5768 bs_create_blob(struct spdk_blob_store *bs, 5769 const struct spdk_blob_opts *opts, 5770 const struct spdk_blob_xattr_opts *internal_xattrs, 5771 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5772 { 5773 struct spdk_blob *blob; 5774 uint32_t page_idx; 5775 struct spdk_bs_cpl cpl; 5776 struct spdk_blob_opts opts_local; 5777 struct spdk_blob_xattr_opts 
internal_xattrs_default; 5778 spdk_bs_sequence_t *seq; 5779 spdk_blob_id id; 5780 int rc; 5781 5782 assert(spdk_get_thread() == bs->md_thread); 5783 5784 spdk_spin_lock(&bs->used_lock); 5785 page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); 5786 if (page_idx == UINT32_MAX) { 5787 spdk_spin_unlock(&bs->used_lock); 5788 cb_fn(cb_arg, 0, -ENOMEM); 5789 return; 5790 } 5791 spdk_bit_array_set(bs->used_blobids, page_idx); 5792 bs_claim_md_page(bs, page_idx); 5793 spdk_spin_unlock(&bs->used_lock); 5794 5795 id = bs_page_to_blobid(page_idx); 5796 5797 SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx); 5798 5799 blob = blob_alloc(bs, id); 5800 if (!blob) { 5801 rc = -ENOMEM; 5802 goto error; 5803 } 5804 5805 spdk_blob_opts_init(&opts_local, sizeof(opts_local)); 5806 if (opts) { 5807 blob_opts_copy(opts, &opts_local); 5808 } 5809 5810 blob->use_extent_table = opts_local.use_extent_table; 5811 if (blob->use_extent_table) { 5812 blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; 5813 } 5814 5815 if (!internal_xattrs) { 5816 blob_xattrs_init(&internal_xattrs_default); 5817 internal_xattrs = &internal_xattrs_default; 5818 } 5819 5820 rc = blob_set_xattrs(blob, &opts_local.xattrs, false); 5821 if (rc < 0) { 5822 goto error; 5823 } 5824 5825 rc = blob_set_xattrs(blob, internal_xattrs, true); 5826 if (rc < 0) { 5827 goto error; 5828 } 5829 5830 if (opts_local.thin_provision) { 5831 blob_set_thin_provision(blob); 5832 } 5833 5834 blob_set_clear_method(blob, opts_local.clear_method); 5835 5836 rc = blob_resize(blob, opts_local.num_clusters); 5837 if (rc < 0) { 5838 goto error; 5839 } 5840 cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5841 cpl.u.blobid.cb_fn = cb_fn; 5842 cpl.u.blobid.cb_arg = cb_arg; 5843 cpl.u.blobid.blobid = blob->id; 5844 5845 seq = bs_sequence_start(bs->md_channel, &cpl); 5846 if (!seq) { 5847 rc = -ENOMEM; 5848 goto error; 5849 } 5850 5851 blob_persist(seq, blob, bs_create_blob_cpl, blob); 5852 return; 5853 5854 error: 5855 if (blob != NULL) { 5856 blob_free(blob); 5857 } 5858 spdk_spin_lock(&bs->used_lock); 5859 spdk_bit_array_clear(bs->used_blobids, page_idx); 5860 bs_release_md_page(bs, page_idx); 5861 spdk_spin_unlock(&bs->used_lock); 5862 cb_fn(cb_arg, 0, rc); 5863 } 5864 5865 void 5866 spdk_bs_create_blob(struct spdk_blob_store *bs, 5867 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5868 { 5869 bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); 5870 } 5871 5872 void 5873 spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, 5874 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5875 { 5876 bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); 5877 } 5878 5879 /* END spdk_bs_create_blob */ 5880 5881 /* START blob_cleanup */ 5882 5883 struct spdk_clone_snapshot_ctx { 5884 struct spdk_bs_cpl cpl; 5885 int bserrno; 5886 bool frozen; 5887 5888 struct spdk_io_channel *channel; 5889 5890 /* Current cluster for inflate operation */ 5891 uint64_t cluster; 5892 5893 /* For inflation force allocation of all unallocated clusters and remove 5894 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ 5895 bool allocate_all; 5896 5897 struct { 5898 spdk_blob_id id; 5899 struct spdk_blob *blob; 5900 bool md_ro; 5901 } original; 5902 struct { 5903 spdk_blob_id id; 5904 struct spdk_blob *blob; 5905 } new; 5906 5907 /* xattrs specified for snapshot/clones only. They have no impact on 5908 * the original blobs xattrs. 
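 * (When provided, they are applied to the newly created snapshot or clone blob at creation time.)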
*/ 5909 const struct spdk_blob_xattr_opts *xattrs; 5910 }; 5911 5912 static void 5913 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) 5914 { 5915 struct spdk_clone_snapshot_ctx *ctx = cb_arg; 5916 struct spdk_bs_cpl *cpl = &ctx->cpl; 5917 5918 if (bserrno != 0) { 5919 if (ctx->bserrno != 0) { 5920 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5921 } else { 5922 ctx->bserrno = bserrno; 5923 } 5924 } 5925 5926 switch (cpl->type) { 5927 case SPDK_BS_CPL_TYPE_BLOBID: 5928 cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); 5929 break; 5930 case SPDK_BS_CPL_TYPE_BLOB_BASIC: 5931 cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); 5932 break; 5933 default: 5934 SPDK_UNREACHABLE(); 5935 break; 5936 } 5937 5938 free(ctx); 5939 } 5940 5941 static void 5942 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 5943 { 5944 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5945 struct spdk_blob *origblob = ctx->original.blob; 5946 5947 if (bserrno != 0) { 5948 if (ctx->bserrno != 0) { 5949 SPDK_ERRLOG("Unfreeze error %d\n", bserrno); 5950 } else { 5951 ctx->bserrno = bserrno; 5952 } 5953 } 5954 5955 ctx->original.id = origblob->id; 5956 origblob->locked_operation_in_progress = false; 5957 5958 /* Revert md_ro to original state */ 5959 origblob->md_ro = ctx->original.md_ro; 5960 5961 spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); 5962 } 5963 5964 static void 5965 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) 5966 { 5967 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5968 struct spdk_blob *origblob = ctx->original.blob; 5969 5970 if (bserrno != 0) { 5971 if (ctx->bserrno != 0) { 5972 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5973 } else { 5974 ctx->bserrno = bserrno; 5975 } 5976 } 5977 5978 if (ctx->frozen) { 5979 /* Unfreeze any outstanding I/O */ 5980 blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); 5981 } else { 5982 bs_snapshot_unfreeze_cpl(ctx, 0); 5983 } 5984 5985 } 5986 5987 static void 5988 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno) 5989 { 5990 struct spdk_blob *newblob = ctx->new.blob; 5991 5992 if (bserrno != 0) { 5993 if (ctx->bserrno != 0) { 5994 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5995 } else { 5996 ctx->bserrno = bserrno; 5997 } 5998 } 5999 6000 ctx->new.id = newblob->id; 6001 spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 6002 } 6003 6004 /* END blob_cleanup */ 6005 6006 /* START spdk_bs_create_snapshot */ 6007 6008 static void 6009 bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2) 6010 { 6011 uint64_t *cluster_temp; 6012 uint32_t *extent_page_temp; 6013 6014 cluster_temp = blob1->active.clusters; 6015 blob1->active.clusters = blob2->active.clusters; 6016 blob2->active.clusters = cluster_temp; 6017 6018 extent_page_temp = blob1->active.extent_pages; 6019 blob1->active.extent_pages = blob2->active.extent_pages; 6020 blob2->active.extent_pages = extent_page_temp; 6021 } 6022 6023 static void 6024 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno) 6025 { 6026 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6027 struct spdk_blob *origblob = ctx->original.blob; 6028 struct spdk_blob *newblob = ctx->new.blob; 6029 6030 if (bserrno != 0) { 6031 bs_snapshot_swap_cluster_maps(newblob, origblob); 6032 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6033 return; 6034 } 6035 6036 /* Remove metadata descriptor 
SNAPSHOT_IN_PROGRESS */ 6037 bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true); 6038 if (bserrno != 0) { 6039 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6040 return; 6041 } 6042 6043 bs_blob_list_add(ctx->original.blob); 6044 6045 spdk_blob_set_read_only(newblob); 6046 6047 /* sync snapshot metadata */ 6048 spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 6049 } 6050 6051 static void 6052 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno) 6053 { 6054 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6055 struct spdk_blob *origblob = ctx->original.blob; 6056 struct spdk_blob *newblob = ctx->new.blob; 6057 6058 if (bserrno != 0) { 6059 /* return cluster map back to original */ 6060 bs_snapshot_swap_cluster_maps(newblob, origblob); 6061 6062 /* Newblob md sync failed. Valid clusters are only present in origblob. 6063 * Since I/O is frozen on origblob, not changes to zeroed out cluster map should have occurred. 6064 * Newblob needs to be reverted to thin_provisioned state at creation to properly close. */ 6065 blob_set_thin_provision(newblob); 6066 assert(spdk_mem_all_zero(newblob->active.clusters, 6067 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 6068 assert(spdk_mem_all_zero(newblob->active.extent_pages, 6069 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 6070 6071 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6072 return; 6073 } 6074 6075 /* Set internal xattr for snapshot id */ 6076 bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); 6077 if (bserrno != 0) { 6078 /* return cluster map back to original */ 6079 bs_snapshot_swap_cluster_maps(newblob, origblob); 6080 blob_set_thin_provision(newblob); 6081 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6082 return; 6083 } 6084 6085 /* Create new back_bs_dev for snapshot */ 6086 origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); 6087 if (origblob->back_bs_dev == NULL) { 6088 /* return cluster map back to original */ 6089 bs_snapshot_swap_cluster_maps(newblob, origblob); 6090 blob_set_thin_provision(newblob); 6091 bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); 6092 return; 6093 } 6094 6095 bs_blob_list_remove(origblob); 6096 origblob->parent_id = newblob->id; 6097 /* set clone blob as thin provisioned */ 6098 blob_set_thin_provision(origblob); 6099 6100 bs_blob_list_add(newblob); 6101 6102 /* sync clone metadata */ 6103 spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); 6104 } 6105 6106 static void 6107 bs_snapshot_freeze_cpl(void *cb_arg, int rc) 6108 { 6109 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6110 struct spdk_blob *origblob = ctx->original.blob; 6111 struct spdk_blob *newblob = ctx->new.blob; 6112 int bserrno; 6113 6114 if (rc != 0) { 6115 bs_clone_snapshot_newblob_cleanup(ctx, rc); 6116 return; 6117 } 6118 6119 ctx->frozen = true; 6120 6121 if (newblob->back_bs_dev) { 6122 newblob->back_bs_dev->destroy(newblob->back_bs_dev); 6123 } 6124 /* set new back_bs_dev for snapshot */ 6125 newblob->back_bs_dev = origblob->back_bs_dev; 6126 /* Set invalid flags from origblob */ 6127 newblob->invalid_flags = origblob->invalid_flags; 6128 6129 /* inherit parent from original blob if set */ 6130 newblob->parent_id = origblob->parent_id; 6131 if (origblob->parent_id != SPDK_BLOBID_INVALID) { 6132 /* Set internal xattr for snapshot id */ 6133 bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, 6134 &origblob->parent_id, 
sizeof(spdk_blob_id), true); 6135 if (bserrno != 0) { 6136 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6137 return; 6138 } 6139 } 6140 6141 /* swap cluster maps */ 6142 bs_snapshot_swap_cluster_maps(newblob, origblob); 6143 6144 /* Set the clear method on the new blob to match the original. */ 6145 blob_set_clear_method(newblob, origblob->clear_method); 6146 6147 /* sync snapshot metadata */ 6148 spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); 6149 } 6150 6151 static void 6152 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6153 { 6154 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6155 struct spdk_blob *origblob = ctx->original.blob; 6156 struct spdk_blob *newblob = _blob; 6157 6158 if (bserrno != 0) { 6159 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6160 return; 6161 } 6162 6163 ctx->new.blob = newblob; 6164 assert(spdk_blob_is_thin_provisioned(newblob)); 6165 assert(spdk_mem_all_zero(newblob->active.clusters, 6166 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 6167 assert(spdk_mem_all_zero(newblob->active.extent_pages, 6168 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 6169 6170 blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); 6171 } 6172 6173 static void 6174 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6175 { 6176 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6177 struct spdk_blob *origblob = ctx->original.blob; 6178 6179 if (bserrno != 0) { 6180 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6181 return; 6182 } 6183 6184 ctx->new.id = blobid; 6185 ctx->cpl.u.blobid.blobid = blobid; 6186 6187 spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); 6188 } 6189 6190 6191 static void 6192 bs_xattr_snapshot(void *arg, const char *name, 6193 const void **value, size_t *value_len) 6194 { 6195 assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); 6196 6197 struct spdk_blob *blob = (struct spdk_blob *)arg; 6198 *value = &blob->id; 6199 *value_len = sizeof(blob->id); 6200 } 6201 6202 static void 6203 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6204 { 6205 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6206 struct spdk_blob_opts opts; 6207 struct spdk_blob_xattr_opts internal_xattrs; 6208 char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; 6209 6210 if (bserrno != 0) { 6211 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6212 return; 6213 } 6214 6215 ctx->original.blob = _blob; 6216 6217 if (_blob->data_ro || _blob->md_ro) { 6218 SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n", 6219 _blob->id); 6220 ctx->bserrno = -EINVAL; 6221 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6222 return; 6223 } 6224 6225 if (_blob->locked_operation_in_progress) { 6226 SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n"); 6227 ctx->bserrno = -EBUSY; 6228 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6229 return; 6230 } 6231 6232 _blob->locked_operation_in_progress = true; 6233 6234 spdk_blob_opts_init(&opts, sizeof(opts)); 6235 blob_xattrs_init(&internal_xattrs); 6236 6237 /* Change the size of new blob to the same as in original blob, 6238 * but do not allocate clusters */ 6239 opts.thin_provision = true; 6240 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6241 
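/* Use the same extent table setting as the original blob; the cluster maps
 * (and extent page arrays) of the two blobs are swapped later in
 * bs_snapshot_freeze_cpl(), so both blobs are expected to use the same
 * metadata layout. */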
opts.use_extent_table = _blob->use_extent_table; 6242 6243 /* If there are any xattrs specified for snapshot, set them now */ 6244 if (ctx->xattrs) { 6245 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6246 } 6247 /* Set internal xattr SNAPSHOT_IN_PROGRESS */ 6248 internal_xattrs.count = 1; 6249 internal_xattrs.ctx = _blob; 6250 internal_xattrs.names = xattrs_names; 6251 internal_xattrs.get_value = bs_xattr_snapshot; 6252 6253 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6254 bs_snapshot_newblob_create_cpl, ctx); 6255 } 6256 6257 void 6258 spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, 6259 const struct spdk_blob_xattr_opts *snapshot_xattrs, 6260 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6261 { 6262 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6263 6264 if (!ctx) { 6265 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6266 return; 6267 } 6268 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6269 ctx->cpl.u.blobid.cb_fn = cb_fn; 6270 ctx->cpl.u.blobid.cb_arg = cb_arg; 6271 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6272 ctx->bserrno = 0; 6273 ctx->frozen = false; 6274 ctx->original.id = blobid; 6275 ctx->xattrs = snapshot_xattrs; 6276 6277 spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); 6278 } 6279 /* END spdk_bs_create_snapshot */ 6280 6281 /* START spdk_bs_create_clone */ 6282 6283 static void 6284 bs_xattr_clone(void *arg, const char *name, 6285 const void **value, size_t *value_len) 6286 { 6287 assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); 6288 6289 struct spdk_blob *blob = (struct spdk_blob *)arg; 6290 *value = &blob->id; 6291 *value_len = sizeof(blob->id); 6292 } 6293 6294 static void 6295 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6296 { 6297 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6298 struct spdk_blob *clone = _blob; 6299 6300 ctx->new.blob = clone; 6301 bs_blob_list_add(clone); 6302 6303 spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); 6304 } 6305 6306 static void 6307 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6308 { 6309 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6310 6311 ctx->cpl.u.blobid.blobid = blobid; 6312 spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); 6313 } 6314 6315 static void 6316 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6317 { 6318 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6319 struct spdk_blob_opts opts; 6320 struct spdk_blob_xattr_opts internal_xattrs; 6321 char *xattr_names[] = { BLOB_SNAPSHOT }; 6322 6323 if (bserrno != 0) { 6324 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6325 return; 6326 } 6327 6328 ctx->original.blob = _blob; 6329 ctx->original.md_ro = _blob->md_ro; 6330 6331 if (!_blob->data_ro || !_blob->md_ro) { 6332 SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n"); 6333 ctx->bserrno = -EINVAL; 6334 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6335 return; 6336 } 6337 6338 if (_blob->locked_operation_in_progress) { 6339 SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n"); 6340 ctx->bserrno = -EBUSY; 6341 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6342 return; 6343 } 6344 6345 _blob->locked_operation_in_progress = true; 6346 6347 spdk_blob_opts_init(&opts, sizeof(opts)); 6348 
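/* The clone is created thin provisioned with the same logical size as the
 * snapshot; the internal BLOB_SNAPSHOT xattr set below records the snapshot
 * as the clone's parent, so unallocated clusters are read from the snapshot. */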
blob_xattrs_init(&internal_xattrs); 6349 6350 opts.thin_provision = true; 6351 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6352 opts.use_extent_table = _blob->use_extent_table; 6353 if (ctx->xattrs) { 6354 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6355 } 6356 6357 /* Set internal xattr BLOB_SNAPSHOT */ 6358 internal_xattrs.count = 1; 6359 internal_xattrs.ctx = _blob; 6360 internal_xattrs.names = xattr_names; 6361 internal_xattrs.get_value = bs_xattr_clone; 6362 6363 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6364 bs_clone_newblob_create_cpl, ctx); 6365 } 6366 6367 void 6368 spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, 6369 const struct spdk_blob_xattr_opts *clone_xattrs, 6370 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6371 { 6372 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6373 6374 if (!ctx) { 6375 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6376 return; 6377 } 6378 6379 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6380 ctx->cpl.u.blobid.cb_fn = cb_fn; 6381 ctx->cpl.u.blobid.cb_arg = cb_arg; 6382 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6383 ctx->bserrno = 0; 6384 ctx->xattrs = clone_xattrs; 6385 ctx->original.id = blobid; 6386 6387 spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); 6388 } 6389 6390 /* END spdk_bs_create_clone */ 6391 6392 /* START spdk_bs_inflate_blob */ 6393 6394 static void 6395 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) 6396 { 6397 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6398 struct spdk_blob *_blob = ctx->original.blob; 6399 6400 if (bserrno != 0) { 6401 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6402 return; 6403 } 6404 6405 /* Temporarily override md_ro flag for MD modification */ 6406 _blob->md_ro = false; 6407 6408 bserrno = blob_set_xattr(_blob, BLOB_SNAPSHOT, &_parent->id, sizeof(spdk_blob_id), true); 6409 if (bserrno != 0) { 6410 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6411 return; 6412 } 6413 6414 assert(_parent != NULL); 6415 6416 bs_blob_list_remove(_blob); 6417 _blob->parent_id = _parent->id; 6418 6419 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6420 _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); 6421 bs_blob_list_add(_blob); 6422 6423 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6424 } 6425 6426 static void 6427 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx) 6428 { 6429 struct spdk_blob *_blob = ctx->original.blob; 6430 struct spdk_blob *_parent; 6431 6432 if (ctx->allocate_all) { 6433 /* remove thin provisioning */ 6434 bs_blob_list_remove(_blob); 6435 _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; 6436 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6437 _blob->back_bs_dev = NULL; 6438 _blob->parent_id = SPDK_BLOBID_INVALID; 6439 } else { 6440 _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; 6441 if (_parent->parent_id != SPDK_BLOBID_INVALID) { 6442 /* We must change the parent of the inflated blob */ 6443 spdk_bs_open_blob(_blob->bs, _parent->parent_id, 6444 bs_inflate_blob_set_parent_cpl, ctx); 6445 return; 6446 } 6447 6448 bs_blob_list_remove(_blob); 6449 _blob->parent_id = SPDK_BLOBID_INVALID; 6450 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6451 _blob->back_bs_dev = bs_create_zeroes_dev(); 6452 } 6453 6454 /* Temporarily override md_ro flag for MD modification */ 6455 _blob->md_ro = false; 6456 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6457 
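/* At this point the blob either owns all of its clusters (inflate) or is
 * backed by a zeroes device (decouple of a blob whose parent had no parent),
 * so mark the metadata dirty and persist it below. */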
_blob->state = SPDK_BLOB_STATE_DIRTY; 6458 6459 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6460 } 6461 6462 /* Check if cluster needs allocation */ 6463 static inline bool 6464 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) 6465 { 6466 struct spdk_blob_bs_dev *b; 6467 6468 assert(blob != NULL); 6469 6470 if (blob->active.clusters[cluster] != 0) { 6471 /* Cluster is already allocated */ 6472 return false; 6473 } 6474 6475 if (blob->parent_id == SPDK_BLOBID_INVALID) { 6476 /* Blob have no parent blob */ 6477 return allocate_all; 6478 } 6479 6480 b = (struct spdk_blob_bs_dev *)blob->back_bs_dev; 6481 return (allocate_all || b->blob->active.clusters[cluster] != 0); 6482 } 6483 6484 static void 6485 bs_inflate_blob_touch_next(void *cb_arg, int bserrno) 6486 { 6487 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6488 struct spdk_blob *_blob = ctx->original.blob; 6489 struct spdk_bs_cpl cpl; 6490 spdk_bs_user_op_t *op; 6491 uint64_t offset; 6492 6493 if (bserrno != 0) { 6494 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6495 return; 6496 } 6497 6498 for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { 6499 if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { 6500 break; 6501 } 6502 } 6503 6504 if (ctx->cluster < _blob->active.num_clusters) { 6505 offset = bs_cluster_to_lba(_blob->bs, ctx->cluster); 6506 6507 /* We may safely increment a cluster before copying */ 6508 ctx->cluster++; 6509 6510 /* Use a dummy 0B read as a context for cluster copy */ 6511 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6512 cpl.u.blob_basic.cb_fn = bs_inflate_blob_touch_next; 6513 cpl.u.blob_basic.cb_arg = ctx; 6514 6515 op = bs_user_op_alloc(ctx->channel, &cpl, SPDK_BLOB_READ, _blob, 6516 NULL, 0, offset, 0); 6517 if (!op) { 6518 bs_clone_snapshot_origblob_cleanup(ctx, -ENOMEM); 6519 return; 6520 } 6521 6522 bs_allocate_and_copy_cluster(_blob, ctx->channel, offset, op); 6523 } else { 6524 bs_inflate_blob_done(ctx); 6525 } 6526 } 6527 6528 static void 6529 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6530 { 6531 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6532 uint64_t clusters_needed; 6533 uint64_t i; 6534 6535 if (bserrno != 0) { 6536 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6537 return; 6538 } 6539 6540 ctx->original.blob = _blob; 6541 ctx->original.md_ro = _blob->md_ro; 6542 6543 if (_blob->locked_operation_in_progress) { 6544 SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n"); 6545 ctx->bserrno = -EBUSY; 6546 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6547 return; 6548 } 6549 6550 _blob->locked_operation_in_progress = true; 6551 6552 if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { 6553 /* This blob have no parent, so we cannot decouple it. */ 6554 SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); 6555 bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); 6556 return; 6557 } 6558 6559 if (spdk_blob_is_thin_provisioned(_blob) == false) { 6560 /* This is not thin provisioned blob. No need to inflate. */ 6561 bs_clone_snapshot_origblob_cleanup(ctx, 0); 6562 return; 6563 } 6564 6565 /* Do two passes - one to verify that we can obtain enough clusters 6566 * and another to actually claim them. 
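 * The first pass below only counts the clusters that need allocation; the
 * actual claiming is done one cluster at a time in bs_inflate_blob_touch_next().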
6567 */ 6568 clusters_needed = 0; 6569 for (i = 0; i < _blob->active.num_clusters; i++) { 6570 if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { 6571 clusters_needed++; 6572 } 6573 } 6574 6575 if (clusters_needed > _blob->bs->num_free_clusters) { 6576 /* Not enough free clusters. Cannot satisfy the request. */ 6577 bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); 6578 return; 6579 } 6580 6581 ctx->cluster = 0; 6582 bs_inflate_blob_touch_next(ctx, 0); 6583 } 6584 6585 static void 6586 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6587 spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) 6588 { 6589 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6590 6591 if (!ctx) { 6592 cb_fn(cb_arg, -ENOMEM); 6593 return; 6594 } 6595 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6596 ctx->cpl.u.bs_basic.cb_fn = cb_fn; 6597 ctx->cpl.u.bs_basic.cb_arg = cb_arg; 6598 ctx->bserrno = 0; 6599 ctx->original.id = blobid; 6600 ctx->channel = channel; 6601 ctx->allocate_all = allocate_all; 6602 6603 spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); 6604 } 6605 6606 void 6607 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6608 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6609 { 6610 bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); 6611 } 6612 6613 void 6614 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6615 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6616 { 6617 bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); 6618 } 6619 /* END spdk_bs_inflate_blob */ 6620 6621 /* START spdk_blob_resize */ 6622 struct spdk_bs_resize_ctx { 6623 spdk_blob_op_complete cb_fn; 6624 void *cb_arg; 6625 struct spdk_blob *blob; 6626 uint64_t sz; 6627 int rc; 6628 }; 6629 6630 static void 6631 bs_resize_unfreeze_cpl(void *cb_arg, int rc) 6632 { 6633 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6634 6635 if (rc != 0) { 6636 SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); 6637 } 6638 6639 if (ctx->rc != 0) { 6640 SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); 6641 rc = ctx->rc; 6642 } 6643 6644 ctx->blob->locked_operation_in_progress = false; 6645 6646 ctx->cb_fn(ctx->cb_arg, rc); 6647 free(ctx); 6648 } 6649 6650 static void 6651 bs_resize_freeze_cpl(void *cb_arg, int rc) 6652 { 6653 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6654 6655 if (rc != 0) { 6656 ctx->blob->locked_operation_in_progress = false; 6657 ctx->cb_fn(ctx->cb_arg, rc); 6658 free(ctx); 6659 return; 6660 } 6661 6662 ctx->rc = blob_resize(ctx->blob, ctx->sz); 6663 6664 blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); 6665 } 6666 6667 void 6668 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) 6669 { 6670 struct spdk_bs_resize_ctx *ctx; 6671 6672 blob_verify_md_op(blob); 6673 6674 SPDK_DEBUGLOG(blob, "Resizing blob 0x%" PRIx64 " to %" PRIu64 " clusters\n", blob->id, sz); 6675 6676 if (blob->md_ro) { 6677 cb_fn(cb_arg, -EPERM); 6678 return; 6679 } 6680 6681 if (sz == blob->active.num_clusters) { 6682 cb_fn(cb_arg, 0); 6683 return; 6684 } 6685 6686 if (blob->locked_operation_in_progress) { 6687 cb_fn(cb_arg, -EBUSY); 6688 return; 6689 } 6690 6691 ctx = calloc(1, sizeof(*ctx)); 6692 if (!ctx) { 6693 cb_fn(cb_arg, -ENOMEM); 6694 return; 6695 } 6696 6697 blob->locked_operation_in_progress = true; 6698 ctx->cb_fn = cb_fn; 6699 
ctx->cb_arg = cb_arg; 6700 ctx->blob = blob; 6701 ctx->sz = sz; 6702 blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); 6703 } 6704 6705 /* END spdk_blob_resize */ 6706 6707 6708 /* START spdk_bs_delete_blob */ 6709 6710 static void 6711 bs_delete_close_cpl(void *cb_arg, int bserrno) 6712 { 6713 spdk_bs_sequence_t *seq = cb_arg; 6714 6715 bs_sequence_finish(seq, bserrno); 6716 } 6717 6718 static void 6719 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6720 { 6721 struct spdk_blob *blob = cb_arg; 6722 6723 if (bserrno != 0) { 6724 /* 6725 * We already removed this blob from the blobstore tailq, so 6726 * we need to free it here since this is the last reference 6727 * to it. 6728 */ 6729 blob_free(blob); 6730 bs_delete_close_cpl(seq, bserrno); 6731 return; 6732 } 6733 6734 /* 6735 * This will immediately decrement the ref_count and call 6736 * the completion routine since the metadata state is clean. 6737 * By calling spdk_blob_close, we reduce the number of call 6738 * points into code that touches the blob->open_ref count 6739 * and the blobstore's blob list. 6740 */ 6741 spdk_blob_close(blob, bs_delete_close_cpl, seq); 6742 } 6743 6744 struct delete_snapshot_ctx { 6745 struct spdk_blob_list *parent_snapshot_entry; 6746 struct spdk_blob *snapshot; 6747 struct spdk_blob_md_page *page; 6748 bool snapshot_md_ro; 6749 struct spdk_blob *clone; 6750 bool clone_md_ro; 6751 spdk_blob_op_with_handle_complete cb_fn; 6752 void *cb_arg; 6753 int bserrno; 6754 uint32_t next_extent_page; 6755 }; 6756 6757 static void 6758 delete_blob_cleanup_finish(void *cb_arg, int bserrno) 6759 { 6760 struct delete_snapshot_ctx *ctx = cb_arg; 6761 6762 if (bserrno != 0) { 6763 SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); 6764 } 6765 6766 assert(ctx != NULL); 6767 6768 if (bserrno != 0 && ctx->bserrno == 0) { 6769 ctx->bserrno = bserrno; 6770 } 6771 6772 ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); 6773 spdk_free(ctx->page); 6774 free(ctx); 6775 } 6776 6777 static void 6778 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) 6779 { 6780 struct delete_snapshot_ctx *ctx = cb_arg; 6781 6782 if (bserrno != 0) { 6783 ctx->bserrno = bserrno; 6784 SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); 6785 } 6786 6787 if (ctx->bserrno != 0) { 6788 assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); 6789 RB_INSERT(spdk_blob_tree, &ctx->snapshot->bs->open_blobs, ctx->snapshot); 6790 spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); 6791 } 6792 6793 ctx->snapshot->locked_operation_in_progress = false; 6794 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6795 6796 spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); 6797 } 6798 6799 static void 6800 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) 6801 { 6802 struct delete_snapshot_ctx *ctx = cb_arg; 6803 6804 ctx->clone->locked_operation_in_progress = false; 6805 ctx->clone->md_ro = ctx->clone_md_ro; 6806 6807 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6808 } 6809 6810 static void 6811 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 6812 { 6813 struct delete_snapshot_ctx *ctx = cb_arg; 6814 6815 if (bserrno) { 6816 ctx->bserrno = bserrno; 6817 delete_snapshot_cleanup_clone(ctx, 0); 6818 return; 6819 } 6820 6821 ctx->clone->locked_operation_in_progress = false; 6822 spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); 6823 } 6824 6825 static void 6826 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) 6827 { 6828 struct 
delete_snapshot_ctx *ctx = cb_arg; 6829 struct spdk_blob_list *parent_snapshot_entry = NULL; 6830 struct spdk_blob_list *snapshot_entry = NULL; 6831 struct spdk_blob_list *clone_entry = NULL; 6832 struct spdk_blob_list *snapshot_clone_entry = NULL; 6833 6834 if (bserrno) { 6835 SPDK_ERRLOG("Failed to sync MD on blob\n"); 6836 ctx->bserrno = bserrno; 6837 delete_snapshot_cleanup_clone(ctx, 0); 6838 return; 6839 } 6840 6841 /* Get snapshot entry for the snapshot we want to remove */ 6842 snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); 6843 6844 assert(snapshot_entry != NULL); 6845 6846 /* Remove clone entry in this snapshot (at this point there can be only one clone) */ 6847 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6848 assert(clone_entry != NULL); 6849 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 6850 snapshot_entry->clone_count--; 6851 assert(TAILQ_EMPTY(&snapshot_entry->clones)); 6852 6853 if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { 6854 /* This snapshot is at the same time a clone of another snapshot - we need to 6855 * update parent snapshot (remove current clone, add new one inherited from 6856 * the snapshot that is being removed) */ 6857 6858 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6859 * snapshot that we are removing */ 6860 blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, 6861 &snapshot_clone_entry); 6862 6863 /* Switch clone entry in parent snapshot */ 6864 TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); 6865 TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); 6866 free(snapshot_clone_entry); 6867 } else { 6868 /* No parent snapshot - just remove clone entry */ 6869 free(clone_entry); 6870 } 6871 6872 /* Restore md_ro flags */ 6873 ctx->clone->md_ro = ctx->clone_md_ro; 6874 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6875 6876 blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); 6877 } 6878 6879 static void 6880 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) 6881 { 6882 struct delete_snapshot_ctx *ctx = cb_arg; 6883 uint64_t i; 6884 6885 ctx->snapshot->md_ro = false; 6886 6887 if (bserrno) { 6888 SPDK_ERRLOG("Failed to sync MD on clone\n"); 6889 ctx->bserrno = bserrno; 6890 6891 /* Restore snapshot to previous state */ 6892 bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); 6893 if (bserrno != 0) { 6894 delete_snapshot_cleanup_clone(ctx, bserrno); 6895 return; 6896 } 6897 6898 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); 6899 return; 6900 } 6901 6902 /* Clear cluster map entries for snapshot */ 6903 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6904 if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { 6905 ctx->snapshot->active.clusters[i] = 0; 6906 } 6907 } 6908 for (i = 0; i < ctx->snapshot->active.num_extent_pages && 6909 i < ctx->clone->active.num_extent_pages; i++) { 6910 if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { 6911 ctx->snapshot->active.extent_pages[i] = 0; 6912 } 6913 } 6914 6915 blob_set_thin_provision(ctx->snapshot); 6916 ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; 6917 6918 if (ctx->parent_snapshot_entry != NULL) { 6919 ctx->snapshot->back_bs_dev = NULL; 6920 } 6921 6922 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); 6923 } 6924 6925 static void 6926 
delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx) 6927 { 6928 /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ 6929 ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); 6930 6931 /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */ 6932 if (ctx->parent_snapshot_entry != NULL) { 6933 /* ...to parent snapshot */ 6934 ctx->clone->parent_id = ctx->parent_snapshot_entry->id; 6935 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; 6936 blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, 6937 sizeof(spdk_blob_id), 6938 true); 6939 } else { 6940 /* ...to blobid invalid and zeroes dev */ 6941 ctx->clone->parent_id = SPDK_BLOBID_INVALID; 6942 ctx->clone->back_bs_dev = bs_create_zeroes_dev(); 6943 blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); 6944 } 6945 6946 spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); 6947 } 6948 6949 static void 6950 delete_snapshot_update_extent_pages(void *cb_arg, int bserrno) 6951 { 6952 struct delete_snapshot_ctx *ctx = cb_arg; 6953 uint32_t *extent_page; 6954 uint64_t i; 6955 6956 for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages && 6957 i < ctx->clone->active.num_extent_pages; i++) { 6958 if (ctx->snapshot->active.extent_pages[i] == 0) { 6959 /* No extent page to use from snapshot */ 6960 continue; 6961 } 6962 6963 extent_page = &ctx->clone->active.extent_pages[i]; 6964 if (*extent_page == 0) { 6965 /* Copy extent page from snapshot when clone did not have a matching one */ 6966 *extent_page = ctx->snapshot->active.extent_pages[i]; 6967 continue; 6968 } 6969 6970 /* Clone and snapshot both contain partially filled matching extent pages. 6971 * Update the clone extent page in place with cluster map containing the mix of both. 
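 * blob_write_extent_page() re-serializes the clone's current cluster map for
 * this extent range, so the rewritten page reflects the merged mapping.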
*/ 6972 ctx->next_extent_page = i + 1; 6973 memset(ctx->page, 0, SPDK_BS_PAGE_SIZE); 6974 6975 blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, ctx->page, 6976 delete_snapshot_update_extent_pages, ctx); 6977 return; 6978 } 6979 delete_snapshot_update_extent_pages_cpl(ctx); 6980 } 6981 6982 static void 6983 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) 6984 { 6985 struct delete_snapshot_ctx *ctx = cb_arg; 6986 uint64_t i; 6987 6988 /* Temporarily override md_ro flag for clone for MD modification */ 6989 ctx->clone_md_ro = ctx->clone->md_ro; 6990 ctx->clone->md_ro = false; 6991 6992 if (bserrno) { 6993 SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); 6994 ctx->bserrno = bserrno; 6995 delete_snapshot_cleanup_clone(ctx, 0); 6996 return; 6997 } 6998 6999 /* Copy snapshot map to clone map (only unallocated clusters in clone) */ 7000 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 7001 if (ctx->clone->active.clusters[i] == 0) { 7002 ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; 7003 } 7004 } 7005 ctx->next_extent_page = 0; 7006 delete_snapshot_update_extent_pages(ctx, 0); 7007 } 7008 7009 static void 7010 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) 7011 { 7012 struct delete_snapshot_ctx *ctx = cb_arg; 7013 7014 if (bserrno) { 7015 SPDK_ERRLOG("Failed to freeze I/O on clone\n"); 7016 ctx->bserrno = bserrno; 7017 delete_snapshot_cleanup_clone(ctx, 0); 7018 return; 7019 } 7020 7021 /* Temporarily override md_ro flag for snapshot for MD modification */ 7022 ctx->snapshot_md_ro = ctx->snapshot->md_ro; 7023 ctx->snapshot->md_ro = false; 7024 7025 /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ 7026 ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, 7027 sizeof(spdk_blob_id), true); 7028 if (ctx->bserrno != 0) { 7029 delete_snapshot_cleanup_clone(ctx, 0); 7030 return; 7031 } 7032 7033 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx); 7034 } 7035 7036 static void 7037 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) 7038 { 7039 struct delete_snapshot_ctx *ctx = cb_arg; 7040 7041 if (bserrno) { 7042 SPDK_ERRLOG("Failed to open clone\n"); 7043 ctx->bserrno = bserrno; 7044 delete_snapshot_cleanup_snapshot(ctx, 0); 7045 return; 7046 } 7047 7048 ctx->clone = clone; 7049 7050 if (clone->locked_operation_in_progress) { 7051 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n"); 7052 ctx->bserrno = -EBUSY; 7053 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 7054 return; 7055 } 7056 7057 clone->locked_operation_in_progress = true; 7058 7059 blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); 7060 } 7061 7062 static void 7063 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) 7064 { 7065 struct spdk_blob_list *snapshot_entry = NULL; 7066 struct spdk_blob_list *clone_entry = NULL; 7067 struct spdk_blob_list *snapshot_clone_entry = NULL; 7068 7069 /* Get snapshot entry for the snapshot we want to remove */ 7070 snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); 7071 7072 assert(snapshot_entry != NULL); 7073 7074 /* Get clone of the snapshot (at this point there can be only one clone) */ 7075 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 7076 assert(snapshot_entry->clone_count == 1); 7077 assert(clone_entry != NULL); 
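/* bs_is_blob_deletable() only permits deleting a snapshot with at most one
 * clone, so the first entry in the clone list is the only one. */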
7078 7079 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 7080 * snapshot that we are removing */ 7081 blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, 7082 &snapshot_clone_entry); 7083 7084 spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); 7085 } 7086 7087 static void 7088 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) 7089 { 7090 spdk_bs_sequence_t *seq = cb_arg; 7091 struct spdk_blob_list *snapshot_entry = NULL; 7092 uint32_t page_num; 7093 7094 if (bserrno) { 7095 SPDK_ERRLOG("Failed to remove blob\n"); 7096 bs_sequence_finish(seq, bserrno); 7097 return; 7098 } 7099 7100 /* Remove snapshot from the list */ 7101 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7102 if (snapshot_entry != NULL) { 7103 TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); 7104 free(snapshot_entry); 7105 } 7106 7107 page_num = bs_blobid_to_page(blob->id); 7108 spdk_bit_array_clear(blob->bs->used_blobids, page_num); 7109 blob->state = SPDK_BLOB_STATE_DIRTY; 7110 blob->active.num_pages = 0; 7111 blob_resize(blob, 0); 7112 7113 blob_persist(seq, blob, bs_delete_persist_cpl, blob); 7114 } 7115 7116 static int 7117 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) 7118 { 7119 struct spdk_blob_list *snapshot_entry = NULL; 7120 struct spdk_blob_list *clone_entry = NULL; 7121 struct spdk_blob *clone = NULL; 7122 bool has_one_clone = false; 7123 7124 /* Check if this is a snapshot with clones */ 7125 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7126 if (snapshot_entry != NULL) { 7127 if (snapshot_entry->clone_count > 1) { 7128 SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); 7129 return -EBUSY; 7130 } else if (snapshot_entry->clone_count == 1) { 7131 has_one_clone = true; 7132 } 7133 } 7134 7135 /* Check if someone has this blob open (besides this delete context): 7136 * - open_ref = 1 - only this context opened blob, so it is ok to remove it 7137 * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot 7138 * and that is ok, because we will update it accordingly */ 7139 if (blob->open_ref <= 2 && has_one_clone) { 7140 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 7141 assert(clone_entry != NULL); 7142 clone = blob_lookup(blob->bs, clone_entry->id); 7143 7144 if (blob->open_ref == 2 && clone == NULL) { 7145 /* Clone is closed and someone else opened this blob */ 7146 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7147 return -EBUSY; 7148 } 7149 7150 *update_clone = true; 7151 return 0; 7152 } 7153 7154 if (blob->open_ref > 1) { 7155 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7156 return -EBUSY; 7157 } 7158 7159 assert(has_one_clone == false); 7160 *update_clone = false; 7161 return 0; 7162 } 7163 7164 static void 7165 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) 7166 { 7167 spdk_bs_sequence_t *seq = cb_arg; 7168 7169 bs_sequence_finish(seq, -ENOMEM); 7170 } 7171 7172 static void 7173 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) 7174 { 7175 spdk_bs_sequence_t *seq = cb_arg; 7176 struct delete_snapshot_ctx *ctx; 7177 bool update_clone = false; 7178 7179 if (bserrno != 0) { 7180 bs_sequence_finish(seq, bserrno); 7181 return; 7182 } 7183 7184 blob_verify_md_op(blob); 7185 7186 ctx = calloc(1, sizeof(*ctx)); 7187 if (ctx == NULL) { 7188 spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); 7189 return; 7190 } 7191 7192 ctx->snapshot = blob; 7193 ctx->cb_fn 
= bs_delete_blob_finish; 7194 ctx->cb_arg = seq; 7195 7196 /* Check if blob can be removed and if it is a snapshot with clone on top of it */ 7197 ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); 7198 if (ctx->bserrno) { 7199 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7200 return; 7201 } 7202 7203 if (blob->locked_operation_in_progress) { 7204 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n"); 7205 ctx->bserrno = -EBUSY; 7206 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7207 return; 7208 } 7209 7210 blob->locked_operation_in_progress = true; 7211 7212 /* 7213 * Remove the blob from the blob_store list now, to ensure it does not 7214 * get returned after this point by blob_lookup(). 7215 */ 7216 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7217 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7218 7219 if (update_clone) { 7220 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 7221 if (!ctx->page) { 7222 ctx->bserrno = -ENOMEM; 7223 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7224 return; 7225 } 7226 /* This blob is a snapshot with active clone - update clone first */ 7227 update_clone_on_snapshot_deletion(blob, ctx); 7228 } else { 7229 /* This blob does not have any clones - just remove it */ 7230 bs_blob_list_remove(blob); 7231 bs_delete_blob_finish(seq, blob, 0); 7232 free(ctx); 7233 } 7234 } 7235 7236 void 7237 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7238 spdk_blob_op_complete cb_fn, void *cb_arg) 7239 { 7240 struct spdk_bs_cpl cpl; 7241 spdk_bs_sequence_t *seq; 7242 7243 SPDK_DEBUGLOG(blob, "Deleting blob 0x%" PRIx64 "\n", blobid); 7244 7245 assert(spdk_get_thread() == bs->md_thread); 7246 7247 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7248 cpl.u.blob_basic.cb_fn = cb_fn; 7249 cpl.u.blob_basic.cb_arg = cb_arg; 7250 7251 seq = bs_sequence_start(bs->md_channel, &cpl); 7252 if (!seq) { 7253 cb_fn(cb_arg, -ENOMEM); 7254 return; 7255 } 7256 7257 spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); 7258 } 7259 7260 /* END spdk_bs_delete_blob */ 7261 7262 /* START spdk_bs_open_blob */ 7263 7264 static void 7265 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7266 { 7267 struct spdk_blob *blob = cb_arg; 7268 struct spdk_blob *existing; 7269 7270 if (bserrno != 0) { 7271 blob_free(blob); 7272 seq->cpl.u.blob_handle.blob = NULL; 7273 bs_sequence_finish(seq, bserrno); 7274 return; 7275 } 7276 7277 existing = blob_lookup(blob->bs, blob->id); 7278 if (existing) { 7279 blob_free(blob); 7280 existing->open_ref++; 7281 seq->cpl.u.blob_handle.blob = existing; 7282 bs_sequence_finish(seq, 0); 7283 return; 7284 } 7285 7286 blob->open_ref++; 7287 7288 spdk_bit_array_set(blob->bs->open_blobids, blob->id); 7289 RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob); 7290 7291 bs_sequence_finish(seq, bserrno); 7292 } 7293 7294 static inline void 7295 blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst) 7296 { 7297 #define FIELD_OK(field) \ 7298 offsetof(struct spdk_blob_open_opts, field) + sizeof(src->field) <= src->opts_size 7299 7300 #define SET_FIELD(field) \ 7301 if (FIELD_OK(field)) { \ 7302 dst->field = src->field; \ 7303 } \ 7304 7305 SET_FIELD(clear_method); 7306 7307 dst->opts_size = src->opts_size; 7308 7309 /* You should not remove this statement, but need to update the assert statement 7310 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7311 
SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 16, "Incorrect size"); 7312 7313 #undef FIELD_OK 7314 #undef SET_FIELD 7315 } 7316 7317 static void 7318 bs_open_blob(struct spdk_blob_store *bs, 7319 spdk_blob_id blobid, 7320 struct spdk_blob_open_opts *opts, 7321 spdk_blob_op_with_handle_complete cb_fn, 7322 void *cb_arg) 7323 { 7324 struct spdk_blob *blob; 7325 struct spdk_bs_cpl cpl; 7326 struct spdk_blob_open_opts opts_local; 7327 spdk_bs_sequence_t *seq; 7328 uint32_t page_num; 7329 7330 SPDK_DEBUGLOG(blob, "Opening blob 0x%" PRIx64 "\n", blobid); 7331 assert(spdk_get_thread() == bs->md_thread); 7332 7333 page_num = bs_blobid_to_page(blobid); 7334 if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { 7335 /* Invalid blobid */ 7336 cb_fn(cb_arg, NULL, -ENOENT); 7337 return; 7338 } 7339 7340 blob = blob_lookup(bs, blobid); 7341 if (blob) { 7342 blob->open_ref++; 7343 cb_fn(cb_arg, blob, 0); 7344 return; 7345 } 7346 7347 blob = blob_alloc(bs, blobid); 7348 if (!blob) { 7349 cb_fn(cb_arg, NULL, -ENOMEM); 7350 return; 7351 } 7352 7353 spdk_blob_open_opts_init(&opts_local, sizeof(opts_local)); 7354 if (opts) { 7355 blob_open_opts_copy(opts, &opts_local); 7356 } 7357 7358 blob->clear_method = opts_local.clear_method; 7359 7360 cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; 7361 cpl.u.blob_handle.cb_fn = cb_fn; 7362 cpl.u.blob_handle.cb_arg = cb_arg; 7363 cpl.u.blob_handle.blob = blob; 7364 7365 seq = bs_sequence_start(bs->md_channel, &cpl); 7366 if (!seq) { 7367 blob_free(blob); 7368 cb_fn(cb_arg, NULL, -ENOMEM); 7369 return; 7370 } 7371 7372 blob_load(seq, blob, bs_open_blob_cpl, blob); 7373 } 7374 7375 void 7376 spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7377 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7378 { 7379 bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); 7380 } 7381 7382 void 7383 spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, 7384 struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7385 { 7386 bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); 7387 } 7388 7389 /* END spdk_bs_open_blob */ 7390 7391 /* START spdk_blob_set_read_only */ 7392 int 7393 spdk_blob_set_read_only(struct spdk_blob *blob) 7394 { 7395 blob_verify_md_op(blob); 7396 7397 blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; 7398 7399 blob->state = SPDK_BLOB_STATE_DIRTY; 7400 return 0; 7401 } 7402 /* END spdk_blob_set_read_only */ 7403 7404 /* START spdk_blob_sync_md */ 7405 7406 static void 7407 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7408 { 7409 struct spdk_blob *blob = cb_arg; 7410 7411 if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 7412 blob->data_ro = true; 7413 blob->md_ro = true; 7414 } 7415 7416 bs_sequence_finish(seq, bserrno); 7417 } 7418 7419 static void 7420 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7421 { 7422 struct spdk_bs_cpl cpl; 7423 spdk_bs_sequence_t *seq; 7424 7425 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7426 cpl.u.blob_basic.cb_fn = cb_fn; 7427 cpl.u.blob_basic.cb_arg = cb_arg; 7428 7429 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7430 if (!seq) { 7431 cb_fn(cb_arg, -ENOMEM); 7432 return; 7433 } 7434 7435 blob_persist(seq, blob, blob_sync_md_cpl, blob); 7436 } 7437 7438 void 7439 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7440 { 7441 blob_verify_md_op(blob); 7442 7443 SPDK_DEBUGLOG(blob, "Syncing blob 0x%" PRIx64 "\n", blob->id); 7444 7445 if (blob->md_ro) { 7446 
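/* Metadata of a read-only blob cannot have been modified, so it must already
 * be clean and there is nothing to persist. */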
assert(blob->state == SPDK_BLOB_STATE_CLEAN); 7447 cb_fn(cb_arg, 0); 7448 return; 7449 } 7450 7451 blob_sync_md(blob, cb_fn, cb_arg); 7452 } 7453 7454 /* END spdk_blob_sync_md */ 7455 7456 struct spdk_blob_insert_cluster_ctx { 7457 struct spdk_thread *thread; 7458 struct spdk_blob *blob; 7459 uint32_t cluster_num; /* cluster index in blob */ 7460 uint32_t cluster; /* cluster on disk */ 7461 uint32_t extent_page; /* extent page on disk */ 7462 struct spdk_blob_md_page *page; /* preallocated extent page */ 7463 int rc; 7464 spdk_blob_op_complete cb_fn; 7465 void *cb_arg; 7466 }; 7467 7468 static void 7469 blob_insert_cluster_msg_cpl(void *arg) 7470 { 7471 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7472 7473 ctx->cb_fn(ctx->cb_arg, ctx->rc); 7474 free(ctx); 7475 } 7476 7477 static void 7478 blob_insert_cluster_msg_cb(void *arg, int bserrno) 7479 { 7480 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7481 7482 ctx->rc = bserrno; 7483 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7484 } 7485 7486 static void 7487 blob_insert_new_ep_cb(void *arg, int bserrno) 7488 { 7489 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7490 uint32_t *extent_page; 7491 7492 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7493 *extent_page = ctx->extent_page; 7494 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7495 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7496 } 7497 7498 static void 7499 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7500 { 7501 bs_sequence_finish(seq, bserrno); 7502 } 7503 7504 static void 7505 blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 7506 struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg) 7507 { 7508 spdk_bs_sequence_t *seq; 7509 struct spdk_bs_cpl cpl; 7510 7511 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7512 cpl.u.blob_basic.cb_fn = cb_fn; 7513 cpl.u.blob_basic.cb_arg = cb_arg; 7514 7515 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7516 if (!seq) { 7517 cb_fn(cb_arg, -ENOMEM); 7518 return; 7519 } 7520 7521 assert(page); 7522 page->next = SPDK_INVALID_MD_PAGE; 7523 page->id = blob->id; 7524 page->sequence_num = 0; 7525 7526 blob_serialize_extent_page(blob, cluster_num, page); 7527 7528 page->crc = blob_md_page_calc_crc(page); 7529 7530 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); 7531 7532 bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), 7533 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 7534 blob_persist_extent_page_cpl, page); 7535 } 7536 7537 static void 7538 blob_insert_cluster_msg(void *arg) 7539 { 7540 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7541 uint32_t *extent_page; 7542 7543 ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); 7544 if (ctx->rc != 0) { 7545 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7546 return; 7547 } 7548 7549 if (ctx->blob->use_extent_table == false) { 7550 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */ 7551 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7552 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7553 return; 7554 } 7555 7556 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7557 if (*extent_page == 0) { 7558 /* Extent page requires allocation. 7559 * It was already claimed in the used_md_pages map and placed in ctx. 
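 * Write the page out first; blob_insert_new_ep_cb() then links it into the
 * blob's extent table and syncs the metadata.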
*/ 7560 assert(ctx->extent_page != 0); 7561 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7562 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page, 7563 blob_insert_new_ep_cb, ctx); 7564 } else { 7565 /* It is possible for original thread to allocate extent page for 7566 * different cluster in the same extent page. In such case proceed with 7567 * updating the existing extent page, but release the additional one. */ 7568 if (ctx->extent_page != 0) { 7569 spdk_spin_lock(&ctx->blob->bs->used_lock); 7570 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7571 bs_release_md_page(ctx->blob->bs, ctx->extent_page); 7572 spdk_spin_unlock(&ctx->blob->bs->used_lock); 7573 ctx->extent_page = 0; 7574 } 7575 /* Extent page already allocated. 7576 * Every cluster allocation, requires just an update of single extent page. */ 7577 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page, 7578 blob_insert_cluster_msg_cb, ctx); 7579 } 7580 } 7581 7582 static void 7583 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 7584 uint64_t cluster, uint32_t extent_page, struct spdk_blob_md_page *page, 7585 spdk_blob_op_complete cb_fn, void *cb_arg) 7586 { 7587 struct spdk_blob_insert_cluster_ctx *ctx; 7588 7589 ctx = calloc(1, sizeof(*ctx)); 7590 if (ctx == NULL) { 7591 cb_fn(cb_arg, -ENOMEM); 7592 return; 7593 } 7594 7595 ctx->thread = spdk_get_thread(); 7596 ctx->blob = blob; 7597 ctx->cluster_num = cluster_num; 7598 ctx->cluster = cluster; 7599 ctx->extent_page = extent_page; 7600 ctx->page = page; 7601 ctx->cb_fn = cb_fn; 7602 ctx->cb_arg = cb_arg; 7603 7604 spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); 7605 } 7606 7607 /* START spdk_blob_close */ 7608 7609 static void 7610 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7611 { 7612 struct spdk_blob *blob = cb_arg; 7613 7614 if (bserrno == 0) { 7615 blob->open_ref--; 7616 if (blob->open_ref == 0) { 7617 /* 7618 * Blobs with active.num_pages == 0 are deleted blobs. 7619 * these blobs are removed from the blob_store list 7620 * when the deletion process starts - so don't try to 7621 * remove them again. 
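 * (They were removed from open_blobids and the open_blobs tree in
 * bs_delete_open_cpl().)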
7622 */ 7623 if (blob->active.num_pages > 0) { 7624 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7625 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7626 } 7627 blob_free(blob); 7628 } 7629 } 7630 7631 bs_sequence_finish(seq, bserrno); 7632 } 7633 7634 void 7635 spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7636 { 7637 struct spdk_bs_cpl cpl; 7638 spdk_bs_sequence_t *seq; 7639 7640 blob_verify_md_op(blob); 7641 7642 SPDK_DEBUGLOG(blob, "Closing blob 0x%" PRIx64 "\n", blob->id); 7643 7644 if (blob->open_ref == 0) { 7645 cb_fn(cb_arg, -EBADF); 7646 return; 7647 } 7648 7649 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7650 cpl.u.blob_basic.cb_fn = cb_fn; 7651 cpl.u.blob_basic.cb_arg = cb_arg; 7652 7653 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7654 if (!seq) { 7655 cb_fn(cb_arg, -ENOMEM); 7656 return; 7657 } 7658 7659 /* Sync metadata */ 7660 blob_persist(seq, blob, blob_close_cpl, blob); 7661 } 7662 7663 /* END spdk_blob_close */ 7664 7665 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) 7666 { 7667 return spdk_get_io_channel(bs); 7668 } 7669 7670 void 7671 spdk_bs_free_io_channel(struct spdk_io_channel *channel) 7672 { 7673 spdk_put_io_channel(channel); 7674 } 7675 7676 void 7677 spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, 7678 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7679 { 7680 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7681 SPDK_BLOB_UNMAP); 7682 } 7683 7684 void 7685 spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, 7686 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7687 { 7688 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7689 SPDK_BLOB_WRITE_ZEROES); 7690 } 7691 7692 void 7693 spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, 7694 void *payload, uint64_t offset, uint64_t length, 7695 spdk_blob_op_complete cb_fn, void *cb_arg) 7696 { 7697 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7698 SPDK_BLOB_WRITE); 7699 } 7700 7701 void 7702 spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, 7703 void *payload, uint64_t offset, uint64_t length, 7704 spdk_blob_op_complete cb_fn, void *cb_arg) 7705 { 7706 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7707 SPDK_BLOB_READ); 7708 } 7709 7710 void 7711 spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel, 7712 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7713 spdk_blob_op_complete cb_fn, void *cb_arg) 7714 { 7715 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, NULL); 7716 } 7717 7718 void 7719 spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, 7720 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7721 spdk_blob_op_complete cb_fn, void *cb_arg) 7722 { 7723 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true, NULL); 7724 } 7725 7726 void 7727 spdk_blob_io_writev_ext(struct spdk_blob *blob, struct spdk_io_channel *channel, 7728 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7729 spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts) 7730 { 7731 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, 7732 io_opts); 
7733 }
7734
7735 void
7736 spdk_blob_io_readv_ext(struct spdk_blob *blob, struct spdk_io_channel *channel,
7737 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
7738 spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts)
7739 {
7740 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true,
7741 io_opts);
7742 }
7743
7744 struct spdk_bs_iter_ctx {
7745 int64_t page_num;
7746 struct spdk_blob_store *bs;
7747
7748 spdk_blob_op_with_handle_complete cb_fn;
7749 void *cb_arg;
7750 };
7751
7752 static void
7753 bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
7754 {
7755 struct spdk_bs_iter_ctx *ctx = cb_arg;
7756 struct spdk_blob_store *bs = ctx->bs;
7757 spdk_blob_id id;
7758
7759 if (bserrno == 0) {
7760 ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
7761 free(ctx);
7762 return;
7763 }
7764
7765 ctx->page_num++;
7766 ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
7767 if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
7768 ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
7769 free(ctx);
7770 return;
7771 }
7772
7773 id = bs_page_to_blobid(ctx->page_num);
7774
7775 spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx);
7776 }
7777
7778 void
7779 spdk_bs_iter_first(struct spdk_blob_store *bs,
7780 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7781 {
7782 struct spdk_bs_iter_ctx *ctx;
7783
7784 ctx = calloc(1, sizeof(*ctx));
7785 if (!ctx) {
7786 cb_fn(cb_arg, NULL, -ENOMEM);
7787 return;
7788 }
7789
7790 ctx->page_num = -1;
7791 ctx->bs = bs;
7792 ctx->cb_fn = cb_fn;
7793 ctx->cb_arg = cb_arg;
7794
7795 bs_iter_cpl(ctx, NULL, -1);
7796 }
7797
7798 static void
7799 bs_iter_close_cpl(void *cb_arg, int bserrno)
7800 {
7801 struct spdk_bs_iter_ctx *ctx = cb_arg;
7802
7803 bs_iter_cpl(ctx, NULL, -1);
7804 }
7805
7806 void
7807 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
7808 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7809 {
7810 struct spdk_bs_iter_ctx *ctx;
7811
7812 assert(blob != NULL);
7813
7814 ctx = calloc(1, sizeof(*ctx));
7815 if (!ctx) {
7816 cb_fn(cb_arg, NULL, -ENOMEM);
7817 return;
7818 }
7819
7820 ctx->page_num = bs_blobid_to_page(blob->id);
7821 ctx->bs = bs;
7822 ctx->cb_fn = cb_fn;
7823 ctx->cb_arg = cb_arg;
7824
7825 /* Close the existing blob */
7826 spdk_blob_close(blob, bs_iter_close_cpl, ctx);
7827 }
7828
7829 static int
7830 blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
7831 uint16_t value_len, bool internal)
7832 {
7833 struct spdk_xattr_tailq *xattrs;
7834 struct spdk_xattr *xattr;
7835 size_t desc_size;
7836 void *tmp;
7837
7838 blob_verify_md_op(blob);
7839
7840 if (blob->md_ro) {
7841 return -EPERM;
7842 }
7843
7844 desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
7845 if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
7846 SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fit into a single page %zu\n", name,
7847 desc_size, SPDK_BS_MAX_DESC_SIZE);
7848 return -ENOMEM;
7849 }
7850
7851 if (internal) {
7852 xattrs = &blob->xattrs_internal;
7853 blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
7854 } else {
7855 xattrs = &blob->xattrs;
7856 }
7857
7858 TAILQ_FOREACH(xattr, xattrs, link) {
7859 if (!strcmp(name, xattr->name)) {
7860 tmp = malloc(value_len);
7861 if (!tmp) {
7862 return -ENOMEM;
7863 }
7864
7865 free(xattr->value);
7866 xattr->value_len = value_len;
7867 xattr->value = tmp;
7868 memcpy(xattr->value, value, value_len);
7869
7870 blob->state =
SPDK_BLOB_STATE_DIRTY; 7871 7872 return 0; 7873 } 7874 } 7875 7876 xattr = calloc(1, sizeof(*xattr)); 7877 if (!xattr) { 7878 return -ENOMEM; 7879 } 7880 7881 xattr->name = strdup(name); 7882 if (!xattr->name) { 7883 free(xattr); 7884 return -ENOMEM; 7885 } 7886 7887 xattr->value_len = value_len; 7888 xattr->value = malloc(value_len); 7889 if (!xattr->value) { 7890 free(xattr->name); 7891 free(xattr); 7892 return -ENOMEM; 7893 } 7894 memcpy(xattr->value, value, value_len); 7895 TAILQ_INSERT_TAIL(xattrs, xattr, link); 7896 7897 blob->state = SPDK_BLOB_STATE_DIRTY; 7898 7899 return 0; 7900 } 7901 7902 int 7903 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 7904 uint16_t value_len) 7905 { 7906 return blob_set_xattr(blob, name, value, value_len, false); 7907 } 7908 7909 static int 7910 blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal) 7911 { 7912 struct spdk_xattr_tailq *xattrs; 7913 struct spdk_xattr *xattr; 7914 7915 blob_verify_md_op(blob); 7916 7917 if (blob->md_ro) { 7918 return -EPERM; 7919 } 7920 xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; 7921 7922 TAILQ_FOREACH(xattr, xattrs, link) { 7923 if (!strcmp(name, xattr->name)) { 7924 TAILQ_REMOVE(xattrs, xattr, link); 7925 free(xattr->value); 7926 free(xattr->name); 7927 free(xattr); 7928 7929 if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) { 7930 blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR; 7931 } 7932 blob->state = SPDK_BLOB_STATE_DIRTY; 7933 7934 return 0; 7935 } 7936 } 7937 7938 return -ENOENT; 7939 } 7940 7941 int 7942 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name) 7943 { 7944 return blob_remove_xattr(blob, name, false); 7945 } 7946 7947 static int 7948 blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7949 const void **value, size_t *value_len, bool internal) 7950 { 7951 struct spdk_xattr *xattr; 7952 struct spdk_xattr_tailq *xattrs; 7953 7954 xattrs = internal ? 
&blob->xattrs_internal : &blob->xattrs; 7955 7956 TAILQ_FOREACH(xattr, xattrs, link) { 7957 if (!strcmp(name, xattr->name)) { 7958 *value = xattr->value; 7959 *value_len = xattr->value_len; 7960 return 0; 7961 } 7962 } 7963 return -ENOENT; 7964 } 7965 7966 int 7967 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7968 const void **value, size_t *value_len) 7969 { 7970 blob_verify_md_op(blob); 7971 7972 return blob_get_xattr_value(blob, name, value, value_len, false); 7973 } 7974 7975 struct spdk_xattr_names { 7976 uint32_t count; 7977 const char *names[0]; 7978 }; 7979 7980 static int 7981 blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names) 7982 { 7983 struct spdk_xattr *xattr; 7984 int count = 0; 7985 7986 TAILQ_FOREACH(xattr, xattrs, link) { 7987 count++; 7988 } 7989 7990 *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *)); 7991 if (*names == NULL) { 7992 return -ENOMEM; 7993 } 7994 7995 TAILQ_FOREACH(xattr, xattrs, link) { 7996 (*names)->names[(*names)->count++] = xattr->name; 7997 } 7998 7999 return 0; 8000 } 8001 8002 int 8003 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names) 8004 { 8005 blob_verify_md_op(blob); 8006 8007 return blob_get_xattr_names(&blob->xattrs, names); 8008 } 8009 8010 uint32_t 8011 spdk_xattr_names_get_count(struct spdk_xattr_names *names) 8012 { 8013 assert(names != NULL); 8014 8015 return names->count; 8016 } 8017 8018 const char * 8019 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index) 8020 { 8021 if (index >= names->count) { 8022 return NULL; 8023 } 8024 8025 return names->names[index]; 8026 } 8027 8028 void 8029 spdk_xattr_names_free(struct spdk_xattr_names *names) 8030 { 8031 free(names); 8032 } 8033 8034 struct spdk_bs_type 8035 spdk_bs_get_bstype(struct spdk_blob_store *bs) 8036 { 8037 return bs->bstype; 8038 } 8039 8040 void 8041 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype) 8042 { 8043 memcpy(&bs->bstype, &bstype, sizeof(bstype)); 8044 } 8045 8046 bool 8047 spdk_blob_is_read_only(struct spdk_blob *blob) 8048 { 8049 assert(blob != NULL); 8050 return (blob->data_ro || blob->md_ro); 8051 } 8052 8053 bool 8054 spdk_blob_is_snapshot(struct spdk_blob *blob) 8055 { 8056 struct spdk_blob_list *snapshot_entry; 8057 8058 assert(blob != NULL); 8059 8060 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 8061 if (snapshot_entry == NULL) { 8062 return false; 8063 } 8064 8065 return true; 8066 } 8067 8068 bool 8069 spdk_blob_is_clone(struct spdk_blob *blob) 8070 { 8071 assert(blob != NULL); 8072 8073 if (blob->parent_id != SPDK_BLOBID_INVALID) { 8074 assert(spdk_blob_is_thin_provisioned(blob)); 8075 return true; 8076 } 8077 8078 return false; 8079 } 8080 8081 bool 8082 spdk_blob_is_thin_provisioned(struct spdk_blob *blob) 8083 { 8084 assert(blob != NULL); 8085 return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV); 8086 } 8087 8088 static void 8089 blob_update_clear_method(struct spdk_blob *blob) 8090 { 8091 enum blob_clear_method stored_cm; 8092 8093 assert(blob != NULL); 8094 8095 /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored 8096 * in metadata previously. If something other than the default was 8097 * specified, ignore stored value and used what was passed in. 
8098 */ 8099 stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT); 8100 8101 if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) { 8102 blob->clear_method = stored_cm; 8103 } else if (blob->clear_method != stored_cm) { 8104 SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n", 8105 blob->clear_method, stored_cm); 8106 } 8107 } 8108 8109 spdk_blob_id 8110 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id) 8111 { 8112 struct spdk_blob_list *snapshot_entry = NULL; 8113 struct spdk_blob_list *clone_entry = NULL; 8114 8115 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 8116 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 8117 if (clone_entry->id == blob_id) { 8118 return snapshot_entry->id; 8119 } 8120 } 8121 } 8122 8123 return SPDK_BLOBID_INVALID; 8124 } 8125 8126 int 8127 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids, 8128 size_t *count) 8129 { 8130 struct spdk_blob_list *snapshot_entry, *clone_entry; 8131 size_t n; 8132 8133 snapshot_entry = bs_get_snapshot_entry(bs, blobid); 8134 if (snapshot_entry == NULL) { 8135 *count = 0; 8136 return 0; 8137 } 8138 8139 if (ids == NULL || *count < snapshot_entry->clone_count) { 8140 *count = snapshot_entry->clone_count; 8141 return -ENOMEM; 8142 } 8143 *count = snapshot_entry->clone_count; 8144 8145 n = 0; 8146 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 8147 ids[n++] = clone_entry->id; 8148 } 8149 8150 return 0; 8151 } 8152 8153 static void 8154 bs_load_grow_continue(struct spdk_bs_load_ctx *ctx) 8155 { 8156 int rc; 8157 8158 if (ctx->super->size == 0) { 8159 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 8160 } 8161 8162 if (ctx->super->io_unit_size == 0) { 8163 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 8164 } 8165 8166 /* Parse the super block */ 8167 ctx->bs->clean = 1; 8168 ctx->bs->cluster_sz = ctx->super->cluster_size; 8169 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 8170 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 8171 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 8172 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 8173 } 8174 ctx->bs->io_unit_size = ctx->super->io_unit_size; 8175 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 8176 if (rc < 0) { 8177 bs_load_ctx_fail(ctx, -ENOMEM); 8178 return; 8179 } 8180 ctx->bs->md_start = ctx->super->md_start; 8181 ctx->bs->md_len = ctx->super->md_len; 8182 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len); 8183 if (rc < 0) { 8184 bs_load_ctx_fail(ctx, -ENOMEM); 8185 return; 8186 } 8187 8188 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 8189 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 8190 ctx->bs->super_blob = ctx->super->super_blob; 8191 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 8192 8193 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) { 8194 SPDK_ERRLOG("Can not grow an unclean blobstore, please load it normally to clean it.\n"); 8195 bs_load_ctx_fail(ctx, -EIO); 8196 return; 8197 } else { 8198 bs_load_read_used_pages(ctx); 8199 } 8200 } 8201 8202 static void 8203 bs_load_grow_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 8204 { 8205 struct spdk_bs_load_ctx *ctx = cb_arg; 8206 8207 if (bserrno != 0) { 8208 bs_load_ctx_fail(ctx, bserrno); 8209 return; 
8210 }
8211 bs_load_grow_continue(ctx);
8212 }
8213
8214 static void
8215 bs_load_grow_used_clusters_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8216 {
8217 struct spdk_bs_load_ctx *ctx = cb_arg;
8218
8219 if (bserrno != 0) {
8220 bs_load_ctx_fail(ctx, bserrno);
8221 return;
8222 }
8223
8224 spdk_free(ctx->mask);
8225
8226 bs_sequence_write_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
8227 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
8228 bs_load_grow_super_write_cpl, ctx);
8229 }
8230
8231 static void
8232 bs_load_grow_used_clusters_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8233 {
8234 struct spdk_bs_load_ctx *ctx = cb_arg;
8235 uint64_t lba, lba_count;
8236 uint64_t dev_size;
8237 uint64_t total_clusters;
8238
8239 if (bserrno != 0) {
8240 bs_load_ctx_fail(ctx, bserrno);
8241 return;
8242 }
8243
8244 /* The type must be correct */
8245 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
8246 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
8247 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
8248 struct spdk_blob_md_page) * 8));
8249 dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8250 total_clusters = dev_size / ctx->super->cluster_size;
8251 ctx->mask->length = total_clusters;
8252
8253 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
8254 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
8255 bs_sequence_write_dev(ctx->seq, ctx->mask, lba, lba_count,
8256 bs_load_grow_used_clusters_write_cpl, ctx);
8257 }
8258
8259 static void
8260 bs_load_try_to_grow(struct spdk_bs_load_ctx *ctx)
8261 {
8262 uint64_t dev_size, total_clusters, used_cluster_mask_len, max_used_cluster_mask;
8263 uint64_t lba, lba_count, mask_size;
8264
8265 dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8266 total_clusters = dev_size / ctx->super->cluster_size;
8267 used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
8268 spdk_divide_round_up(total_clusters, 8),
8269 SPDK_BS_PAGE_SIZE);
8270 max_used_cluster_mask = ctx->super->used_blobid_mask_start - ctx->super->used_cluster_mask_start;
8271 /* No need to grow, or no space to grow */
8272 if (ctx->super->size >= dev_size || used_cluster_mask_len > max_used_cluster_mask) {
8273 SPDK_DEBUGLOG(blob, "No grow\n");
8274 bs_load_grow_continue(ctx);
8275 return;
8276 }
8277
8278 SPDK_DEBUGLOG(blob, "Resize blobstore\n");
8279
8280 ctx->super->size = dev_size;
8281 ctx->super->used_cluster_mask_len = used_cluster_mask_len;
8282 ctx->super->crc = blob_md_page_calc_crc(ctx->super);
8283
8284 mask_size = used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
8285 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
8286 SPDK_MALLOC_DMA);
8287 if (!ctx->mask) {
8288 bs_load_ctx_fail(ctx, -ENOMEM);
8289 return;
8290 }
8291 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
8292 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
8293 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
8294 bs_load_grow_used_clusters_read_cpl, ctx);
8295 }
8296
8297 static void
8298 bs_grow_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8299 {
8300 struct spdk_bs_load_ctx *ctx = cb_arg;
8301 uint32_t crc;
8302 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
8303
8304 if (ctx->super->version > SPDK_BS_VERSION ||
8305 ctx->super->version < SPDK_BS_INITIAL_VERSION) {
8306 bs_load_ctx_fail(ctx,
-EILSEQ);
8307 return;
8308 }
8309
8310 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
8311 sizeof(ctx->super->signature)) != 0) {
8312 bs_load_ctx_fail(ctx, -EILSEQ);
8313 return;
8314 }
8315
8316 crc = blob_md_page_calc_crc(ctx->super);
8317 if (crc != ctx->super->crc) {
8318 bs_load_ctx_fail(ctx, -EILSEQ);
8319 return;
8320 }
8321
8322 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
8323 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n");
8324 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
8325 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless of bstype\n");
8326 } else {
8327 SPDK_DEBUGLOG(blob, "Unexpected bstype\n");
8328 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
8329 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
8330 bs_load_ctx_fail(ctx, -ENXIO);
8331 return;
8332 }
8333
8334 if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
8335 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
8336 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
8337 bs_load_ctx_fail(ctx, -EILSEQ);
8338 return;
8339 }
8340
8341 bs_load_try_to_grow(ctx);
8342
8343 }
8344
8345 void
8346 spdk_bs_grow(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
8347 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
8348 {
8349 struct spdk_blob_store *bs;
8350 struct spdk_bs_cpl cpl;
8351 struct spdk_bs_load_ctx *ctx;
8352 struct spdk_bs_opts opts = {};
8353 int err;
8354
8355 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);
8356
8357 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
8358 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
8359 dev->destroy(dev);
8360 cb_fn(cb_arg, NULL, -EINVAL);
8361 return;
8362 }
8363
8364 spdk_bs_opts_init(&opts, sizeof(opts));
8365 if (o) {
8366 if (bs_opts_copy(o, &opts)) {
8367 return;
8368 }
8369 }
8370
8371 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
8372 dev->destroy(dev);
8373 cb_fn(cb_arg, NULL, -EINVAL);
8374 return;
8375 }
8376
8377 err = bs_alloc(dev, &opts, &bs, &ctx);
8378 if (err) {
8379 dev->destroy(dev);
8380 cb_fn(cb_arg, NULL, err);
8381 return;
8382 }
8383
8384 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
8385 cpl.u.bs_handle.cb_fn = cb_fn;
8386 cpl.u.bs_handle.cb_arg = cb_arg;
8387 cpl.u.bs_handle.bs = bs;
8388
8389 ctx->seq = bs_sequence_start(bs->md_channel, &cpl);
8390 if (!ctx->seq) {
8391 spdk_free(ctx->super);
8392 free(ctx);
8393 bs_free(bs);
8394 cb_fn(cb_arg, NULL, -ENOMEM);
8395 return;
8396 }
8397
8398 /* Read the super block */
8399 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
8400 bs_byte_to_lba(bs, sizeof(*ctx->super)),
8401 bs_grow_load_super_cpl, ctx);
8402 }
8403
8404 SPDK_LOG_REGISTER_COMPONENT(blob)
8405
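
/*
 * Usage sketch (editorial addition, not part of the library): closing an open
 * blob with spdk_blob_close() as defined above. The callback name and the
 * surrounding context are hypothetical. The call must be made from the
 * blobstore's metadata thread, and the blob handle must not be used once the
 * completion callback reports success.
 *
 *	static void
 *	example_close_done(void *cb_arg, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("close failed: %s\n", spdk_strerror(-bserrno));
 *			return;
 *		}
 *		// The spdk_blob pointer passed to spdk_blob_close() is now invalid.
 *	}
 *
 *	spdk_blob_close(blob, example_close_done, NULL);
 */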
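
/*
 * Usage sketch (editorial addition): issuing blob I/O with a per-thread
 * channel from spdk_bs_alloc_io_channel(). Names are hypothetical. Offset and
 * length are expressed in io units (see spdk_bs_get_io_unit_size()), not
 * bytes, and the payload buffer is assumed to be DMA-safe memory obtained
 * from spdk_malloc()/spdk_zmalloc().
 *
 *	static void
 *	example_write_done(void *cb_arg, int bserrno)
 *	{
 *		struct spdk_io_channel *ch = cb_arg;
 *
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("blob write failed: %s\n", spdk_strerror(-bserrno));
 *		}
 *		spdk_bs_free_io_channel(ch);
 *	}
 *
 *	static void
 *	example_write_first_io_unit(struct spdk_blob_store *bs, struct spdk_blob *blob,
 *				    void *payload)
 *	{
 *		// Channels are per-thread; allocate one on the thread issuing the I/O.
 *		struct spdk_io_channel *ch = spdk_bs_alloc_io_channel(bs);
 *
 *		if (ch == NULL) {
 *			return;
 *		}
 *		// Write one io unit at offset 0; the channel is released on completion.
 *		spdk_blob_io_write(blob, ch, payload, 0, 1, example_write_done, ch);
 *	}
 */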
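
/*
 * Usage sketch (editorial addition): walking every blob in a blobstore with
 * spdk_bs_iter_first()/spdk_bs_iter_next() as defined above. The callback
 * re-arms itself until the iterator reports -ENOENT. Names are hypothetical.
 *
 *	static void
 *	example_iter_cb(void *cb_arg, struct spdk_blob *blob, int bserrno)
 *	{
 *		struct spdk_blob_store *bs = cb_arg;
 *
 *		if (bserrno == -ENOENT) {
 *			// No more blobs.
 *			return;
 *		} else if (bserrno != 0) {
 *			SPDK_ERRLOG("iteration failed: %s\n", spdk_strerror(-bserrno));
 *			return;
 *		}
 *
 *		SPDK_NOTICELOG("found blob 0x%" PRIx64 "\n", spdk_blob_get_id(blob));
 *
 *		// spdk_bs_iter_next() closes 'blob' before opening the next one.
 *		spdk_bs_iter_next(bs, blob, example_iter_cb, bs);
 *	}
 *
 *	spdk_bs_iter_first(bs, example_iter_cb, bs);
 */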
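
/*
 * Usage sketch (editorial addition): setting, reading and enumerating xattrs
 * on an open, writable blob from the metadata thread, using the functions
 * defined above. The attribute name "owner" and the helper name are
 * hypothetical. Xattr changes only mark the blob dirty; they are persisted
 * when the blob metadata is synced or the blob is closed.
 *
 *	static int
 *	example_tag_blob(struct spdk_blob *blob)
 *	{
 *		const char *owner = "example";
 *		const void *value;
 *		size_t value_len;
 *		struct spdk_xattr_names *names;
 *		int rc;
 *
 *		rc = spdk_blob_set_xattr(blob, "owner", owner, strlen(owner) + 1);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *
 *		rc = spdk_blob_get_xattr_value(blob, "owner", &value, &value_len);
 *		if (rc == 0) {
 *			SPDK_NOTICELOG("owner=%s (%zu bytes)\n", (const char *)value, value_len);
 *		}
 *
 *		if (spdk_blob_get_xattr_names(blob, &names) == 0) {
 *			for (uint32_t i = 0; i < spdk_xattr_names_get_count(names); i++) {
 *				SPDK_NOTICELOG("xattr: %s\n", spdk_xattr_names_get_name(names, i));
 *			}
 *			spdk_xattr_names_free(names);
 *		}
 *
 *		return rc;
 *	}
 */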
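
/*
 * Usage sketch (editorial addition): listing the clones of a snapshot with the
 * two-call sizing pattern of spdk_blob_get_clones() defined above. A first
 * call with ids == NULL returns -ENOMEM and reports the required count; the
 * parent of a clone can be looked up with spdk_blob_get_parent_snapshot().
 * Names are hypothetical.
 *
 *	static void
 *	example_list_clones(struct spdk_blob_store *bs, spdk_blob_id snapshot_id)
 *	{
 *		spdk_blob_id *ids;
 *		size_t count = 0;
 *		size_t i;
 *
 *		if (spdk_blob_get_clones(bs, snapshot_id, NULL, &count) == 0 || count == 0) {
 *			// Not a snapshot, or it has no clones.
 *			return;
 *		}
 *
 *		ids = calloc(count, sizeof(spdk_blob_id));
 *		if (ids == NULL) {
 *			return;
 *		}
 *
 *		if (spdk_blob_get_clones(bs, snapshot_id, ids, &count) == 0) {
 *			for (i = 0; i < count; i++) {
 *				SPDK_NOTICELOG("clone 0x%" PRIx64 " of snapshot 0x%" PRIx64 "\n",
 *					       ids[i], snapshot_id);
 *			}
 *		}
 *		free(ids);
 *	}
 */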
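
/*
 * Usage sketch (editorial addition): growing a blobstore to use capacity that
 * was added to the underlying device, using spdk_bs_grow() as defined above.
 * spdk_bs_grow() also loads the blobstore, so it is used in place of
 * spdk_bs_load(), not after it. Constructing the bs_dev (for example via
 * spdk_bdev_create_bs_dev_ext()) is the caller's responsibility and is only
 * assumed here; callback and variable names are hypothetical.
 *
 *	static void
 *	example_grow_done(void *cb_arg, struct spdk_blob_store *bs, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("blobstore grow failed: %s\n", spdk_strerror(-bserrno));
 *			return;
 *		}
 *		SPDK_NOTICELOG("blobstore grown, %" PRIu64 " free clusters\n",
 *			       spdk_bs_free_cluster_count(bs));
 *	}
 *
 *	// 'dev' is an spdk_bs_dev created by the caller; NULL opts use the defaults.
 *	spdk_bs_grow(dev, NULL, example_grow_done, NULL);
 */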