/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) Intel Corporation.
 * All rights reserved.
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/blob.h"
#include "spdk/crc32.h"
#include "spdk/env.h"
#include "spdk/queue.h"
#include "spdk/thread.h"
#include "spdk/bit_array.h"
#include "spdk/bit_pool.h"
#include "spdk/likely.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk_internal/assert.h"
#include "spdk/log.h"

#include "blobstore.h"

#define BLOB_CRC32C_INITIAL	0xffffffffUL

static int bs_register_md_thread(struct spdk_blob_store *bs);
static int bs_unregister_md_thread(struct spdk_blob_store *bs);
static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
		uint64_t cluster, uint32_t extent, struct spdk_blob_md_page *page,
		spdk_blob_op_complete cb_fn, void *cb_arg);

static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
			  uint16_t value_len, bool internal);
static int blob_get_xattr_value(struct spdk_blob *blob, const char *name,
				const void **value, size_t *value_len, bool internal);
static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal);

static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
				   struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg);

static int
blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2)
{
	return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id);
}

RB_GENERATE_STATIC(spdk_blob_tree, spdk_blob, link, blob_id_cmp);

static void
blob_verify_md_op(struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(spdk_get_thread() == blob->bs->md_thread);
	assert(blob->state != SPDK_BLOB_STATE_LOADING);
}

static struct spdk_blob_list *
bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid)
{
	struct spdk_blob_list *snapshot_entry = NULL;

	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
		if (snapshot_entry->id == blobid) {
			break;
		}
	}

	return snapshot_entry;
}

static void
bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page)
{
	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
	assert(spdk_bit_array_get(bs->used_md_pages, page) == false);

	spdk_bit_array_set(bs->used_md_pages, page);
}

static void
bs_release_md_page(struct spdk_blob_store *bs, uint32_t page)
{
	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
	assert(spdk_bit_array_get(bs->used_md_pages, page) == true);

	spdk_bit_array_clear(bs->used_md_pages, page);
}

static uint32_t
bs_claim_cluster(struct spdk_blob_store *bs)
{
	uint32_t cluster_num;

	cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters);
	if (cluster_num == UINT32_MAX) {
		return UINT32_MAX;
	}

	SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num);
	bs->num_free_clusters--;

	return cluster_num;
}

static void
bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
{
	assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters));
	assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true);
	assert(bs->num_free_clusters < bs->total_clusters);

	SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num);

	spdk_bit_pool_free_bit(bs->used_clusters, cluster_num);
	bs->num_free_clusters++;
}

static int
blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster)
{
	uint64_t *cluster_lba = &blob->active.clusters[cluster_num];

	blob_verify_md_op(blob);

	if (*cluster_lba != 0) {
		return -EEXIST;
	}

	*cluster_lba = bs_cluster_to_lba(blob->bs, cluster);
	return 0;
}

static int
bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num,
		    uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map)
{
	uint32_t *extent_page = 0;

	*cluster = bs_claim_cluster(blob->bs);
	if (*cluster == UINT32_MAX) {
		/* No more free clusters. Cannot satisfy the request */
		return -ENOSPC;
	}

	if (blob->use_extent_table) {
		extent_page = bs_cluster_to_extent_page(blob, cluster_num);
		if (*extent_page == 0) {
			/* Extent page shall never occupy md_page so start the search from 1 */
			if (*lowest_free_md_page == 0) {
				*lowest_free_md_page = 1;
			}
			/* No extent_page is allocated for the cluster */
			*lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages,
					       *lowest_free_md_page);
			if (*lowest_free_md_page == UINT32_MAX) {
				/* No more free md pages. Cannot satisfy the request */
				bs_release_cluster(blob->bs, *cluster);
				return -ENOSPC;
			}
			bs_claim_md_page(blob->bs, *lowest_free_md_page);
		}
	}

	SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob %" PRIu64 "\n", *cluster, blob->id);

	if (update_map) {
		blob_insert_cluster(blob, cluster_num, *cluster);
		if (blob->use_extent_table && *extent_page == 0) {
			*extent_page = *lowest_free_md_page;
		}
	}

	return 0;
}

static void
blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs)
{
	xattrs->count = 0;
	xattrs->names = NULL;
	xattrs->ctx = NULL;
	xattrs->get_value = NULL;
}

void
spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	memset(opts, 0, opts_size);
	opts->opts_size = opts_size;

#define FIELD_OK(field) \
	offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size

#define SET_FIELD(field, value) \
	if (FIELD_OK(field)) { \
		opts->field = value; \
	} \

	SET_FIELD(num_clusters, 0);
	SET_FIELD(thin_provision, false);
	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);

	if (FIELD_OK(xattrs)) {
		blob_xattrs_init(&opts->xattrs);
	}

	SET_FIELD(use_extent_table, true);

#undef FIELD_OK
#undef SET_FIELD
}

void
spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	memset(opts, 0, opts_size);
	opts->opts_size = opts_size;

#define FIELD_OK(field) \
	offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size

#define SET_FIELD(field, value) \
	if (FIELD_OK(field)) { \
		opts->field = value; \
	} \

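	/* Only fields that fit within the caller-provided opts_size are set below,
	 * so callers compiled against an older, smaller options struct keep working.
	 */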
	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);

#undef FIELD_OK
#undef SET_FIELD
}

static struct spdk_blob *
blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
{
	struct spdk_blob *blob;

	blob = calloc(1, sizeof(*blob));
	if (!blob) {
		return NULL;
	}

	blob->id = id;
	blob->bs = bs;

	blob->parent_id = SPDK_BLOBID_INVALID;

	blob->state = SPDK_BLOB_STATE_DIRTY;
	blob->extent_rle_found = false;
	blob->extent_table_found = false;
	blob->active.num_pages = 1;
	blob->active.pages = calloc(1, sizeof(*blob->active.pages));
	if (!blob->active.pages) {
		free(blob);
		return NULL;
	}

	blob->active.pages[0] = bs_blobid_to_page(id);

	TAILQ_INIT(&blob->xattrs);
	TAILQ_INIT(&blob->xattrs_internal);
	TAILQ_INIT(&blob->pending_persists);
	TAILQ_INIT(&blob->persists_to_complete);

	return blob;
}

static void
xattrs_free(struct spdk_xattr_tailq *xattrs)
{
	struct spdk_xattr *xattr, *xattr_tmp;

	TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
		TAILQ_REMOVE(xattrs, xattr, link);
		free(xattr->name);
		free(xattr->value);
		free(xattr);
	}
}

static void
blob_free(struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(TAILQ_EMPTY(&blob->pending_persists));
	assert(TAILQ_EMPTY(&blob->persists_to_complete));

	free(blob->active.extent_pages);
	free(blob->clean.extent_pages);
	free(blob->active.clusters);
	free(blob->clean.clusters);
	free(blob->active.pages);
	free(blob->clean.pages);

	xattrs_free(&blob->xattrs);
	xattrs_free(&blob->xattrs_internal);

	if (blob->back_bs_dev) {
		blob->back_bs_dev->destroy(blob->back_bs_dev);
	}

	free(blob);
}

struct freeze_io_ctx {
	struct spdk_bs_cpl cpl;
	struct spdk_blob *blob;
};

static void
blob_io_sync(struct spdk_io_channel_iter *i)
{
	spdk_for_each_channel_continue(i, 0);
}

static void
blob_execute_queued_io(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bs_request_set *set;
	struct spdk_bs_user_op_args *args;
	spdk_bs_user_op_t *op, *tmp;

	TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
		set = (struct spdk_bs_request_set *)op;
		args = &set->u.user_op;

		if (args->blob == ctx->blob) {
			TAILQ_REMOVE(&ch->queued_io, op, link);
			bs_user_op_execute(op);
		}
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
blob_io_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);

	free(ctx);
}

static void
blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct freeze_io_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
	ctx->blob = blob;

	/* Freeze I/O on blob */
	blob->frozen_refcnt++;

	if (blob->frozen_refcnt == 1) {
		spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl);
	} else {
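		/* I/O on this blob is already frozen by an earlier call, so there are no
		 * channels to synchronize - complete immediately.
		 */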
		cb_fn(cb_arg, 0);
		free(ctx);
	}
}

static void
blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct freeze_io_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
	ctx->blob = blob;

	assert(blob->frozen_refcnt > 0);

	blob->frozen_refcnt--;

	if (blob->frozen_refcnt == 0) {
		spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl);
	} else {
		cb_fn(cb_arg, 0);
		free(ctx);
	}
}

static int
blob_mark_clean(struct spdk_blob *blob)
{
	uint32_t *extent_pages = NULL;
	uint64_t *clusters = NULL;
	uint32_t *pages = NULL;

	assert(blob != NULL);

	if (blob->active.num_extent_pages) {
		assert(blob->active.extent_pages);
		extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages));
		if (!extent_pages) {
			return -ENOMEM;
		}
		memcpy(extent_pages, blob->active.extent_pages,
		       blob->active.num_extent_pages * sizeof(*extent_pages));
	}

	if (blob->active.num_clusters) {
		assert(blob->active.clusters);
		clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters));
		if (!clusters) {
			free(extent_pages);
			return -ENOMEM;
		}
		memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
	}

	if (blob->active.num_pages) {
		assert(blob->active.pages);
		pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages));
		if (!pages) {
			free(extent_pages);
			free(clusters);
			return -ENOMEM;
		}
		memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
	}

	free(blob->clean.extent_pages);
	free(blob->clean.clusters);
	free(blob->clean.pages);

	blob->clean.num_extent_pages = blob->active.num_extent_pages;
	blob->clean.extent_pages = blob->active.extent_pages;
	blob->clean.num_clusters = blob->active.num_clusters;
	blob->clean.clusters = blob->active.clusters;
	blob->clean.num_pages = blob->active.num_pages;
	blob->clean.pages = blob->active.pages;

	blob->active.extent_pages = extent_pages;
	blob->active.clusters = clusters;
	blob->active.pages = pages;

	/* If the metadata was dirtied again while the metadata was being written to disk,
	 * we do not want to revert the DIRTY state back to CLEAN here.
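	 * Only a blob still in the LOADING state is moved to CLEAN below.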
	 */
	if (blob->state == SPDK_BLOB_STATE_LOADING) {
		blob->state = SPDK_BLOB_STATE_CLEAN;
	}

	return 0;
}

static int
blob_deserialize_xattr(struct spdk_blob *blob,
		       struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal)
{
	struct spdk_xattr *xattr;

	if (desc_xattr->length != sizeof(desc_xattr->name_length) +
	    sizeof(desc_xattr->value_length) +
	    desc_xattr->name_length + desc_xattr->value_length) {
		return -EINVAL;
	}

	xattr = calloc(1, sizeof(*xattr));
	if (xattr == NULL) {
		return -ENOMEM;
	}

	xattr->name = malloc(desc_xattr->name_length + 1);
	if (xattr->name == NULL) {
		free(xattr);
		return -ENOMEM;
	}

	xattr->value = malloc(desc_xattr->value_length);
	if (xattr->value == NULL) {
		free(xattr->name);
		free(xattr);
		return -ENOMEM;
	}

	memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length);
	xattr->name[desc_xattr->name_length] = '\0';
	xattr->value_len = desc_xattr->value_length;
	memcpy(xattr->value,
	       (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
	       desc_xattr->value_length);

	TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link);

	return 0;
}


static int
blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob)
{
	struct spdk_blob_md_descriptor *desc;
	size_t cur_desc = 0;
	void *tmp;

	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
	while (cur_desc < sizeof(page->descriptors)) {
		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
			if (desc->length == 0) {
				/* If padding and length are 0, this terminates the page */
				break;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
			struct spdk_blob_md_descriptor_flags *desc_flags;

			desc_flags = (struct spdk_blob_md_descriptor_flags *)desc;

			if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) {
				return -EINVAL;
			}

			if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) !=
			    SPDK_BLOB_INVALID_FLAGS_MASK) {
				return -EINVAL;
			}

			if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) !=
			    SPDK_BLOB_DATA_RO_FLAGS_MASK) {
				blob->data_ro = true;
				blob->md_ro = true;
			}

			if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) !=
			    SPDK_BLOB_MD_RO_FLAGS_MASK) {
				blob->md_ro = true;
			}

			if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
				blob->data_ro = true;
				blob->md_ro = true;
			}

			blob->invalid_flags = desc_flags->invalid_flags;
			blob->data_ro_flags = desc_flags->data_ro_flags;
			blob->md_ro_flags = desc_flags->md_ro_flags;

		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
			struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
			unsigned int i, j;
			unsigned int cluster_count = blob->active.num_clusters;

			if (blob->extent_table_found) {
				/* Extent Table already present in the md,
				 * both descriptors should never be at the same time. */
				return -EINVAL;
			}
			blob->extent_rle_found = true;

			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;

			if (desc_extent_rle->length == 0 ||
			    (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) {
				return -EINVAL;
			}

			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
					if (desc_extent_rle->extents[i].cluster_idx != 0) {
						if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters,
										desc_extent_rle->extents[i].cluster_idx + j)) {
							return -EINVAL;
						}
					}
					cluster_count++;
				}
			}

			if (cluster_count == 0) {
				return -EINVAL;
			}
			tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				return -ENOMEM;
			}
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = cluster_count;

			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
					if (desc_extent_rle->extents[i].cluster_idx != 0) {
						blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
								desc_extent_rle->extents[i].cluster_idx + j);
					} else if (spdk_blob_is_thin_provisioned(blob)) {
						blob->active.clusters[blob->active.num_clusters++] = 0;
					} else {
						return -EINVAL;
					}
				}
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
			struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
			uint32_t num_extent_pages = blob->active.num_extent_pages;
			uint32_t i, j;
			size_t extent_pages_length;

			desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
			extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);

			if (blob->extent_rle_found) {
				/* This means that Extent RLE is present in MD,
				 * both should never be at the same time. */
				return -EINVAL;
			} else if (blob->extent_table_found &&
				   desc_extent_table->num_clusters != blob->remaining_clusters_in_et) {
				/* Number of clusters in this ET does not match number
				 * from previously read EXTENT_TABLE. */
				return -EINVAL;
			}

			if (desc_extent_table->length == 0 ||
			    (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
				return -EINVAL;
			}

			blob->extent_table_found = true;

			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
				num_extent_pages += desc_extent_table->extent_page[i].num_pages;
			}

			if (num_extent_pages > 0) {
				tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t));
				if (tmp == NULL) {
					return -ENOMEM;
				}
				blob->active.extent_pages = tmp;
			}
			blob->active.extent_pages_array_size = num_extent_pages;

			blob->remaining_clusters_in_et = desc_extent_table->num_clusters;

			/* Extent table entries contain md page numbers for extent pages.
			 * Zeroes represent unallocated extent pages, those are run-length-encoded.
			 */
			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
				if (desc_extent_table->extent_page[i].page_idx != 0) {
					assert(desc_extent_table->extent_page[i].num_pages == 1);
					blob->active.extent_pages[blob->active.num_extent_pages++] =
						desc_extent_table->extent_page[i].page_idx;
				} else if (spdk_blob_is_thin_provisioned(blob)) {
					for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) {
						blob->active.extent_pages[blob->active.num_extent_pages++] = 0;
					}
				} else {
					return -EINVAL;
				}
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
			struct spdk_blob_md_descriptor_extent_page *desc_extent;
			unsigned int i;
			unsigned int cluster_count = 0;
			size_t cluster_idx_length;

			if (blob->extent_rle_found) {
				/* This means that Extent RLE is present in MD,
				 * both should never be at the same time. */
				return -EINVAL;
			}

			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
			cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);

			if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
			    (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
				return -EINVAL;
			}

			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
				if (desc_extent->cluster_idx[i] != 0) {
					if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) {
						return -EINVAL;
					}
				}
				cluster_count++;
			}

			if (cluster_count == 0) {
				return -EINVAL;
			}

			/* When reading extent pages sequentially starting cluster idx should match
			 * current size of a blob.
			 * If changed to batch reading, this check shall be removed. */
			if (desc_extent->start_cluster_idx != blob->active.num_clusters) {
				return -EINVAL;
			}

			tmp = realloc(blob->active.clusters,
				      (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				return -ENOMEM;
			}
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters);

			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
				if (desc_extent->cluster_idx[i] != 0) {
					blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
							desc_extent->cluster_idx[i]);
				} else if (spdk_blob_is_thin_provisioned(blob)) {
					blob->active.clusters[blob->active.num_clusters++] = 0;
				} else {
					return -EINVAL;
				}
			}
			assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters);
			assert(blob->remaining_clusters_in_et >= cluster_count);
			blob->remaining_clusters_in_et -= cluster_count;
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
			int rc;

			rc = blob_deserialize_xattr(blob,
						    (struct spdk_blob_md_descriptor_xattr *) desc, false);
			if (rc != 0) {
				return rc;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
			int rc;

			rc = blob_deserialize_xattr(blob,
						    (struct spdk_blob_md_descriptor_xattr *) desc, true);
			if (rc != 0) {
				return rc;
			}
		} else {
			/* Unrecognized descriptor type. Do not fail - just continue to the
			 * next descriptor. If this descriptor is associated with some feature
			 * defined in a newer version of blobstore, that version of blobstore
			 * should create and set an associated feature flag to specify if this
			 * blob can be loaded or not.
			 */
		}

		/* Advance to the next descriptor */
		cur_desc += sizeof(*desc) + desc->length;
		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
			break;
		}
		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
	}

	return 0;
}

static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page);

static int
blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(blob->state == SPDK_BLOB_STATE_LOADING);

	if (bs_load_cur_extent_page_valid(extent_page) == false) {
		return -ENOENT;
	}

	return blob_parse_page(extent_page, blob);
}

static int
blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
	   struct spdk_blob *blob)
{
	const struct spdk_blob_md_page *page;
	uint32_t i;
	int rc;
	void *tmp;

	assert(page_count > 0);
	assert(pages[0].sequence_num == 0);
	assert(blob != NULL);
	assert(blob->state == SPDK_BLOB_STATE_LOADING);
	assert(blob->active.clusters == NULL);

	/* The blobid provided doesn't match what's in the MD, this can
	 * happen for example if a bogus blobid is passed in through open.
	 */
	if (blob->id != pages[0].id) {
		SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n",
			    blob->id, pages[0].id);
		return -ENOENT;
	}

	tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages));
	if (!tmp) {
		return -ENOMEM;
	}
	blob->active.pages = tmp;

	blob->active.pages[0] = pages[0].id;

	for (i = 1; i < page_count; i++) {
		assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next));
		blob->active.pages[i] = pages[i - 1].next;
	}
	blob->active.num_pages = page_count;

	for (i = 0; i < page_count; i++) {
		page = &pages[i];

		assert(page->id == blob->id);
		assert(page->sequence_num == i);

		rc = blob_parse_page(page, blob);
		if (rc != 0) {
			return rc;
		}
	}

	return 0;
}

static int
blob_serialize_add_page(const struct spdk_blob *blob,
			struct spdk_blob_md_page **pages,
			uint32_t *page_count,
			struct spdk_blob_md_page **last_page)
{
	struct spdk_blob_md_page *page, *tmp_pages;

	assert(pages != NULL);
	assert(page_count != NULL);

	*last_page = NULL;
	if (*page_count == 0) {
		assert(*pages == NULL);
		*pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0,
				     NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		if (*pages == NULL) {
			return -ENOMEM;
		}
		*page_count = 1;
	} else {
		assert(*pages != NULL);
		tmp_pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count + 1), 0);
		if (tmp_pages == NULL) {
			return -ENOMEM;
		}
		(*page_count)++;
		*pages = tmp_pages;
	}

	page = &(*pages)[*page_count - 1];
	memset(page, 0, sizeof(*page));
	page->id = blob->id;
	page->sequence_num = *page_count - 1;
	page->next = SPDK_INVALID_MD_PAGE;
	*last_page = page;

	return 0;
}

/* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor.
 * Update required_sz on both success and failure.
 *
 */
static int
blob_serialize_xattr(const struct spdk_xattr *xattr,
		     uint8_t *buf, size_t buf_sz,
		     size_t *required_sz, bool internal)
{
	struct spdk_blob_md_descriptor_xattr *desc;

	*required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) +
		       strlen(xattr->name) +
		       xattr->value_len;

	if (buf_sz < *required_sz) {
		return -1;
	}

	desc = (struct spdk_blob_md_descriptor_xattr *)buf;

	desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR;
	desc->length = sizeof(desc->name_length) +
		       sizeof(desc->value_length) +
		       strlen(xattr->name) +
		       xattr->value_len;
	desc->name_length = strlen(xattr->name);
	desc->value_length = xattr->value_len;

	memcpy(desc->name, xattr->name, desc->name_length);
	memcpy((void *)((uintptr_t)desc->name + desc->name_length),
	       xattr->value,
	       desc->value_length);

	return 0;
}

static void
blob_serialize_extent_table_entry(const struct spdk_blob *blob,
				  uint64_t start_ep, uint64_t *next_ep,
				  uint8_t **buf, size_t *remaining_sz)
{
	struct spdk_blob_md_descriptor_extent_table *desc;
	size_t cur_sz;
	uint64_t i, et_idx;
	uint32_t extent_page, ep_len;

	/* The buffer must have room for at least num_clusters entry */
	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters);
	if (*remaining_sz < cur_sz) {
		*next_ep = start_ep;
		return;
	}

	desc = (struct spdk_blob_md_descriptor_extent_table *)*buf;
	desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE;

	desc->num_clusters = blob->active.num_clusters;

	ep_len = 1;
	et_idx = 0;
	for (i = start_ep; i < blob->active.num_extent_pages; i++) {
		if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) {
			/* If we ran out of buffer space, return */
			break;
		}

		extent_page = blob->active.extent_pages[i];
		/* Verify that next extent_page is unallocated */
		if (extent_page == 0 &&
		    (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) {
			ep_len++;
			continue;
		}
		desc->extent_page[et_idx].page_idx = extent_page;
		desc->extent_page[et_idx].num_pages = ep_len;
		et_idx++;

		ep_len = 1;
		cur_sz += sizeof(desc->extent_page[et_idx]);
	}
	*next_ep = i;

	desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx;
	*remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length;
	*buf += sizeof(struct spdk_blob_md_descriptor) + desc->length;
}

static int
blob_serialize_extent_table(const struct spdk_blob *blob,
			    struct spdk_blob_md_page **pages,
			    struct spdk_blob_md_page *cur_page,
			    uint32_t *page_count, uint8_t **buf,
			    size_t *remaining_sz)
{
	uint64_t last_extent_page;
	int rc;

	last_extent_page = 0;
	/* At least single extent table entry has to be always persisted.
	 * Such case occurs with num_extent_pages == 0. */
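	/* Each pass below emits as many extent table entries as fit in the current
	 * metadata page; when the page fills up, a new page is chained on and the
	 * remaining entries continue there.
	 */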
	while (last_extent_page <= blob->active.num_extent_pages) {
		blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf,
						  remaining_sz);

		if (last_extent_page == blob->active.num_extent_pages) {
			break;
		}

		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
		if (rc < 0) {
			return rc;
		}

		*buf = (uint8_t *)cur_page->descriptors;
		*remaining_sz = sizeof(cur_page->descriptors);
	}

	return 0;
}

static void
blob_serialize_extent_rle(const struct spdk_blob *blob,
			  uint64_t start_cluster, uint64_t *next_cluster,
			  uint8_t **buf, size_t *buf_sz)
{
	struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
	size_t cur_sz;
	uint64_t i, extent_idx;
	uint64_t lba, lba_per_cluster, lba_count;

	/* The buffer must have room for at least one extent */
	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]);
	if (*buf_sz < cur_sz) {
		*next_cluster = start_cluster;
		return;
	}

	desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf;
	desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE;

	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);

	lba = blob->active.clusters[start_cluster];
	lba_count = lba_per_cluster;
	extent_idx = 0;
	for (i = start_cluster + 1; i < blob->active.num_clusters; i++) {
		if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) {
			/* Run-length encode sequential non-zero LBA */
			lba_count += lba_per_cluster;
			continue;
		} else if (lba == 0 && blob->active.clusters[i] == 0) {
			/* Run-length encode unallocated clusters */
			lba_count += lba_per_cluster;
			continue;
		}
		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
		extent_idx++;

		cur_sz += sizeof(desc_extent_rle->extents[extent_idx]);

		if (*buf_sz < cur_sz) {
			/* If we ran out of buffer space, return */
			*next_cluster = i;
			break;
		}

		lba = blob->active.clusters[i];
		lba_count = lba_per_cluster;
	}

	if (*buf_sz >= cur_sz) {
		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
		extent_idx++;

		*next_cluster = blob->active.num_clusters;
	}

	desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx;
	*buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
	*buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
}

static int
blob_serialize_extents_rle(const struct spdk_blob *blob,
			   struct spdk_blob_md_page **pages,
			   struct spdk_blob_md_page *cur_page,
			   uint32_t *page_count, uint8_t **buf,
			   size_t *remaining_sz)
{
	uint64_t last_cluster;
	int rc;

	last_cluster = 0;
	while (last_cluster < blob->active.num_clusters) {
		blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz);

		if (last_cluster == blob->active.num_clusters) {
			break;
		}

		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
		if (rc < 0) {
			return rc;
		}

		*buf = (uint8_t *)cur_page->descriptors;
		*remaining_sz = sizeof(cur_page->descriptors);
	}

	return 0;
}

static void
blob_serialize_extent_page(const struct spdk_blob *blob,
			   uint64_t cluster, struct spdk_blob_md_page *page)
{
	struct spdk_blob_md_descriptor_extent_page *desc_extent;
	uint64_t i, extent_idx;
	uint64_t lba, lba_per_cluster;
	uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;

	desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors;
	desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE;

	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);

	desc_extent->start_cluster_idx = start_cluster_idx;
	extent_idx = 0;
	for (i = start_cluster_idx; i < blob->active.num_clusters; i++) {
		lba = blob->active.clusters[i];
		desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster;
		if (extent_idx >= SPDK_EXTENTS_PER_EP) {
			break;
		}
	}
	desc_extent->length = sizeof(desc_extent->start_cluster_idx) +
			      sizeof(desc_extent->cluster_idx[0]) * extent_idx;
}

static void
blob_serialize_flags(const struct spdk_blob *blob,
		     uint8_t *buf, size_t *buf_sz)
{
	struct spdk_blob_md_descriptor_flags *desc;

	/*
	 * Flags get serialized first, so we should always have room for the flags
	 * descriptor.
	 */
	assert(*buf_sz >= sizeof(*desc));

	desc = (struct spdk_blob_md_descriptor_flags *)buf;
	desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS;
	desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor);
	desc->invalid_flags = blob->invalid_flags;
	desc->data_ro_flags = blob->data_ro_flags;
	desc->md_ro_flags = blob->md_ro_flags;

	*buf_sz -= sizeof(*desc);
}

static int
blob_serialize_xattrs(const struct spdk_blob *blob,
		      const struct spdk_xattr_tailq *xattrs, bool internal,
		      struct spdk_blob_md_page **pages,
		      struct spdk_blob_md_page *cur_page,
		      uint32_t *page_count, uint8_t **buf,
		      size_t *remaining_sz)
{
	const struct spdk_xattr *xattr;
	int rc;

	TAILQ_FOREACH(xattr, xattrs, link) {
		size_t required_sz = 0;

		rc = blob_serialize_xattr(xattr,
					  *buf, *remaining_sz,
					  &required_sz, internal);
		if (rc < 0) {
			/* Need to add a new page to the chain */
			rc = blob_serialize_add_page(blob, pages, page_count,
						     &cur_page);
			if (rc < 0) {
				spdk_free(*pages);
				*pages = NULL;
				*page_count = 0;
				return rc;
			}

			*buf = (uint8_t *)cur_page->descriptors;
			*remaining_sz = sizeof(cur_page->descriptors);

			/* Try again */
			required_sz = 0;
			rc = blob_serialize_xattr(xattr,
						  *buf, *remaining_sz,
						  &required_sz, internal);

			if (rc < 0) {
				spdk_free(*pages);
				*pages = NULL;
				*page_count = 0;
				return rc;
			}
		}

		*remaining_sz -= required_sz;
		*buf += required_sz;
	}

	return 0;
}

static int
blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages,
	       uint32_t *page_count)
{
	struct spdk_blob_md_page *cur_page;
	int rc;
	uint8_t *buf;
	size_t remaining_sz;

	assert(pages != NULL);
	assert(page_count != NULL);
	assert(blob != NULL);
	assert(blob->state == SPDK_BLOB_STATE_DIRTY);

	*pages = NULL;
	*page_count = 0;

	/* A blob always has at least 1 page, even if it has no descriptors */
	rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
	if (rc < 0) {
		return rc;
	}

	buf = (uint8_t *)cur_page->descriptors;
	remaining_sz = sizeof(cur_page->descriptors);

	/* Serialize flags */
	blob_serialize_flags(blob, buf, &remaining_sz);
	buf += sizeof(struct spdk_blob_md_descriptor_flags);

	/* Serialize xattrs */
	rc = blob_serialize_xattrs(blob, &blob->xattrs, false,
				   pages, cur_page, page_count, &buf, &remaining_sz);
	if (rc < 0) {
		return rc;
	}

	/* Serialize internal xattrs */
	rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true,
				   pages, cur_page, page_count, &buf, &remaining_sz);
	if (rc < 0) {
		return rc;
	}

	if (blob->use_extent_table) {
		/* Serialize extent table */
		rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz);
	} else {
		/* Serialize extents */
		rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz);
	}

	return rc;
}

struct spdk_blob_load_ctx {
	struct spdk_blob *blob;

	struct spdk_blob_md_page *pages;
	uint32_t num_pages;
	uint32_t next_extent_page;
	spdk_bs_sequence_t *seq;

	spdk_bs_sequence_cpl cb_fn;
	void *cb_arg;
};

static uint32_t
blob_md_page_calc_crc(void *page)
{
	uint32_t crc;

	crc = BLOB_CRC32C_INITIAL;
	crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc);
	crc ^= BLOB_CRC32C_INITIAL;

	return crc;

}

static void
blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno)
{
	struct spdk_blob *blob = ctx->blob;

	if (bserrno == 0) {
		blob_mark_clean(blob);
	}

	ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno);

	/* Free the memory */
	spdk_free(ctx->pages);
	free(ctx);
}

static void
blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;

	if (bserrno == 0) {
		blob->back_bs_dev = bs_create_blob_bs_dev(snapshot);
		if (blob->back_bs_dev == NULL) {
			bserrno = -ENOMEM;
		}
	}
	if (bserrno != 0) {
		SPDK_ERRLOG("Snapshot fail\n");
	}

	blob_load_final(ctx, bserrno);
}

static void blob_update_clear_method(struct spdk_blob *blob);

static void
blob_load_backing_dev(void *cb_arg)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	const void *value;
	size_t len;
	int rc;

	if (spdk_blob_is_thin_provisioned(blob)) {
		rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true);
		if (rc == 0) {
			if (len != sizeof(spdk_blob_id)) {
				blob_load_final(ctx, -EINVAL);
				return;
			}
			/* open snapshot blob and continue in the callback function */
			blob->parent_id = *(spdk_blob_id *)value;
			spdk_bs_open_blob(blob->bs, blob->parent_id,
					  blob_load_snapshot_cpl, ctx);
			return;
		} else {
			/* add zeroes_dev for thin provisioned blob */
			blob->back_bs_dev = bs_create_zeroes_dev();
		}
	} else {
		/* standard blob */
		blob->back_bs_dev = NULL;
	}
	blob_load_final(ctx, 0);
}

static void
blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_md_page *page;
	uint64_t i;
	uint32_t crc;
	uint64_t lba;
	void *tmp;
	uint64_t sz;

	if (bserrno) {
		SPDK_ERRLOG("Extent page read failed: %d\n", bserrno);
		blob_load_final(ctx, bserrno);
		return;
	}

	if (ctx->pages == NULL) {
		/* First iteration of this function, allocate buffer for single EXTENT_PAGE */
		ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
					  NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		if (!ctx->pages) {
			blob_load_final(ctx, -ENOMEM);
			return;
		}
		ctx->num_pages = 1;
		ctx->next_extent_page = 0;
	} else {
		page = &ctx->pages[0];
		crc = blob_md_page_calc_crc(page);
		if (crc != page->crc) {
			blob_load_final(ctx, -EINVAL);
			return;
		}

		if (page->next != SPDK_INVALID_MD_PAGE) {
			blob_load_final(ctx, -EINVAL);
			return;
		}

		bserrno = blob_parse_extent_page(page, blob);
		if (bserrno) {
			blob_load_final(ctx, bserrno);
			return;
		}
	}

	for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
		if (blob->active.extent_pages[i] != 0) {
			/* Extent page was allocated, read and parse it. */
			lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]);
			ctx->next_extent_page = i + 1;

			bs_sequence_read_dev(seq, &ctx->pages[0], lba,
					     bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
					     blob_load_cpl_extents_cpl, ctx);
			return;
		} else {
			/* Thin provisioned blobs can point to unallocated extent pages.
			 * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */

			sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP);
			blob->active.num_clusters += sz;
			blob->remaining_clusters_in_et -= sz;

			assert(spdk_blob_is_thin_provisioned(blob));
			assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0);

			tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				blob_load_final(ctx, -ENOMEM);
				return;
			}
			memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0,
			       sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size));
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = blob->active.num_clusters;
		}
	}

	blob_load_backing_dev(ctx);
}

static void
blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_md_page *page;
	int rc;
	uint32_t crc;
	uint32_t current_page;

	if (ctx->num_pages == 1) {
		current_page = bs_blobid_to_page(blob->id);
	} else {
		assert(ctx->num_pages != 0);
		page = &ctx->pages[ctx->num_pages - 2];
		current_page = page->next;
	}

	if (bserrno) {
		SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n",
			    current_page, blob->id, bserrno);
		blob_load_final(ctx, bserrno);
		return;
	}

	page = &ctx->pages[ctx->num_pages - 1];
	crc = blob_md_page_calc_crc(page);
	if (crc != page->crc) {
		SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n",
			    current_page, blob->id);
		blob_load_final(ctx, -EINVAL);
		return;
	}

	if (page->next != SPDK_INVALID_MD_PAGE) {
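		/* The metadata chain continues on another page - grow the buffer and
		 * read the next page before parsing.
		 */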
		struct spdk_blob_md_page *tmp_pages;
		uint32_t next_page = page->next;
		uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page);

		/* Read the next page */
		tmp_pages = spdk_realloc(ctx->pages, (sizeof(*page) * (ctx->num_pages + 1)), 0);
		if (tmp_pages == NULL) {
			blob_load_final(ctx, -ENOMEM);
			return;
		}
		ctx->num_pages++;
		ctx->pages = tmp_pages;

		bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1],
				     next_lba,
				     bs_byte_to_lba(blob->bs, sizeof(*page)),
				     blob_load_cpl, ctx);
		return;
	}

	/* Parse the pages */
	rc = blob_parse(ctx->pages, ctx->num_pages, blob);
	if (rc) {
		blob_load_final(ctx, rc);
		return;
	}

	if (blob->extent_table_found == true) {
		/* If EXTENT_TABLE was found, that means support for it should be enabled. */
		assert(blob->extent_rle_found == false);
		blob->use_extent_table = true;
	} else {
		/* If EXTENT_RLE or no extent_* descriptor was found disable support
		 * for extent table. No extent_* descriptors means that blob has length of 0
		 * and no extent_rle descriptors were persisted for it.
		 * EXTENT_TABLE if used, is always present in metadata regardless of length. */
		blob->use_extent_table = false;
	}

	/* Check the clear_method stored in metadata vs what may have been passed
	 * via spdk_bs_open_blob_ext() and update accordingly.
	 */
	blob_update_clear_method(blob);

	spdk_free(ctx->pages);
	ctx->pages = NULL;

	if (blob->extent_table_found) {
		blob_load_cpl_extents_cpl(seq, ctx, 0);
	} else {
		blob_load_backing_dev(ctx);
	}
}

/* Load a blob from disk given a blobid */
static void
blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
	  spdk_bs_sequence_cpl cb_fn, void *cb_arg)
{
	struct spdk_blob_load_ctx *ctx;
	struct spdk_blob_store *bs;
	uint32_t page_num;
	uint64_t lba;

	blob_verify_md_op(blob);

	bs = blob->bs;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(seq, cb_arg, -ENOMEM);
		return;
	}

	ctx->blob = blob;
	ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0);
	if (!ctx->pages) {
		free(ctx);
		cb_fn(seq, cb_arg, -ENOMEM);
		return;
	}
	ctx->num_pages = 1;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->seq = seq;

	page_num = bs_blobid_to_page(blob->id);
	lba = bs_md_page_to_lba(blob->bs, page_num);

	blob->state = SPDK_BLOB_STATE_LOADING;

	bs_sequence_read_dev(seq, &ctx->pages[0], lba,
			     bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE),
			     blob_load_cpl, ctx);
}

struct spdk_blob_persist_ctx {
	struct spdk_blob *blob;

	struct spdk_bs_super_block *super;

	struct spdk_blob_md_page *pages;
	uint32_t next_extent_page;
	struct spdk_blob_md_page *extent_page;

	spdk_bs_sequence_t *seq;
	spdk_bs_sequence_cpl cb_fn;
	void *cb_arg;
	TAILQ_ENTRY(spdk_blob_persist_ctx) link;
};

static void
bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba,
		   uint64_t lba_count)
{
	switch (ctx->blob->clear_method) {
	case BLOB_CLEAR_WITH_DEFAULT:
	case BLOB_CLEAR_WITH_UNMAP:
		bs_batch_unmap_dev(batch, lba, lba_count);
		break;
	case BLOB_CLEAR_WITH_WRITE_ZEROES:
		bs_batch_write_zeroes_dev(batch, lba, lba_count);
		break;
	case BLOB_CLEAR_WITH_NONE:
	default:
		break;
	}
}

static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx);

static void
blob_persist_complete_cb(void *arg)
{
	struct spdk_blob_persist_ctx *ctx = arg;

	/* Call user callback */
	ctx->cb_fn(ctx->seq, ctx->cb_arg, 0);

	/* Free the memory */
	spdk_free(ctx->pages);
	free(ctx);
}

static void
blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno)
{
	struct spdk_blob_persist_ctx *next_persist, *tmp;
	struct spdk_blob *blob = ctx->blob;

	if (bserrno == 0) {
		blob_mark_clean(blob);
	}

	assert(ctx == TAILQ_FIRST(&blob->persists_to_complete));

	/* Complete all persists that were pending when the current persist started */
	TAILQ_FOREACH_SAFE(next_persist, &blob->persists_to_complete, link, tmp) {
		TAILQ_REMOVE(&blob->persists_to_complete, next_persist, link);
		spdk_thread_send_msg(spdk_get_thread(), blob_persist_complete_cb, next_persist);
	}

	if (TAILQ_EMPTY(&blob->pending_persists)) {
		return;
	}

	/* Queue up all pending persists for completion and start blob persist with first one */
	TAILQ_SWAP(&blob->persists_to_complete, &blob->pending_persists, spdk_blob_persist_ctx, link);
	next_persist = TAILQ_FIRST(&blob->persists_to_complete);

	blob->state = SPDK_BLOB_STATE_DIRTY;
	blob_persist_check_dirty(next_persist);
}

static void
blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	/* Release all extent_pages that were truncated */
	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
		/* Nothing to release if it was not allocated */
		if (blob->active.extent_pages[i] != 0) {
			bs_release_md_page(bs, blob->active.extent_pages[i]);
		}
	}

	if (blob->active.num_extent_pages == 0) {
		free(blob->active.extent_pages);
		blob->active.extent_pages = NULL;
		blob->active.extent_pages_array_size = 0;
	} else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) {
#ifndef __clang_analyzer__
		void *tmp;

		/* scan-build really can't figure reallocs, workaround it */
		tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages);
		assert(tmp != NULL);
		blob->active.extent_pages = tmp;
#endif
		blob->active.extent_pages_array_size = blob->active.num_extent_pages;
	}

	blob_persist_complete(seq, ctx, bserrno);
}

static void
blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
{
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;
	uint64_t lba;
	uint64_t lba_count;
	spdk_bs_batch_t *batch;

	batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx);
	lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);

	/* Clear all extent_pages that were truncated */
	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
		/* Nothing to clear if it was not allocated */
		if (blob->active.extent_pages[i] != 0) {
			lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]);
			bs_batch_write_zeroes_dev(batch, lba, lba_count);
		}
	}

	bs_batch_close(batch);
}

static void
blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	pthread_mutex_lock(&bs->used_clusters_mutex);
	/* Release all clusters that were truncated */
	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
		uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]);

		/* Nothing to release if it was not allocated */
		if (blob->active.clusters[i] != 0) {
			bs_release_cluster(bs, cluster_num);
		}
	}
	pthread_mutex_unlock(&bs->used_clusters_mutex);

	if (blob->active.num_clusters == 0) {
		free(blob->active.clusters);
		blob->active.clusters = NULL;
		blob->active.cluster_array_size = 0;
	} else if (blob->active.num_clusters != blob->active.cluster_array_size) {
#ifndef __clang_analyzer__
		void *tmp;

		/* scan-build really can't figure reallocs, workaround it */
		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters);
		assert(tmp != NULL);
		blob->active.clusters = tmp;

#endif
		blob->active.cluster_array_size = blob->active.num_clusters;
	}

	/* Move on to clearing extent pages */
	blob_persist_clear_extents(seq, ctx);
}

static void
blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
{
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	spdk_bs_batch_t *batch;
	size_t i;
	uint64_t lba;
	uint64_t lba_count;

	/* Clusters don't move around in blobs. The list shrinks or grows
	 * at the end, but no changes ever occur in the middle of the list.
	 */

	batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx);

	/* Clear all clusters that were truncated */
	lba = 0;
	lba_count = 0;
	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
		uint64_t next_lba = blob->active.clusters[i];
		uint64_t next_lba_count = bs_cluster_to_lba(bs, 1);

		if (next_lba > 0 && (lba + lba_count) == next_lba) {
			/* This cluster is contiguous with the previous one. */
			lba_count += next_lba_count;
			continue;
		} else if (next_lba == 0) {
			continue;
		}

		/* This cluster is not contiguous with the previous one. */

		/* If a run of LBAs previously existed, clear them now */
		if (lba_count > 0) {
			bs_batch_clear_dev(ctx, batch, lba, lba_count);
		}

		/* Start building the next batch */
		lba = next_lba;
		if (next_lba > 0) {
			lba_count = next_lba_count;
		} else {
			lba_count = 0;
		}
	}

	/* If we ended with a contiguous set of LBAs, clear them now */
	if (lba_count > 0) {
		bs_batch_clear_dev(ctx, batch, lba, lba_count);
	}

	bs_batch_close(batch);
}

static void
blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	/* This loop starts at 1 because the first page is special and handled
	 * below. The pages (except the first) are never written in place,
	 * so any pages in the clean list must be zeroed.
	 */
	for (i = 1; i < blob->clean.num_pages; i++) {
		bs_release_md_page(bs, blob->clean.pages[i]);
	}

	if (blob->active.num_pages == 0) {
		uint32_t page_num;

		page_num = bs_blobid_to_page(blob->id);
		bs_release_md_page(bs, page_num);
	}

	/* Move on to clearing clusters */
	blob_persist_clear_clusters(seq, ctx);
}

static void
blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	uint64_t lba;
	uint64_t lba_count;
	spdk_bs_batch_t *batch;
	size_t i;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx);

	lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);

	/* This loop starts at 1 because the first page is special and handled
	 * below. The pages (except the first) are never written in place,
	 * so any pages in the clean list must be zeroed.
	 */
	for (i = 1; i < blob->clean.num_pages; i++) {
		lba = bs_md_page_to_lba(bs, blob->clean.pages[i]);

		bs_batch_write_zeroes_dev(batch, lba, lba_count);
	}

	/* The first page will only be zeroed if this is a delete. */
	if (blob->active.num_pages == 0) {
		uint32_t page_num;

		/* The first page in the metadata goes where the blobid indicates */
		page_num = bs_blobid_to_page(blob->id);
		lba = bs_md_page_to_lba(bs, page_num);

		bs_batch_write_zeroes_dev(batch, lba, lba_count);
	}

	bs_batch_close(batch);
}

static void
blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	uint64_t lba;
	uint32_t lba_count;
	struct spdk_blob_md_page *page;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	if (blob->active.num_pages == 0) {
		/* Move on to the next step */
		blob_persist_zero_pages(seq, ctx, 0);
		return;
	}

	lba_count = bs_byte_to_lba(bs, sizeof(*page));

	page = &ctx->pages[0];
	/* The first page in the metadata goes where the blobid indicates */
	lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id));

	bs_sequence_write_dev(seq, page, lba, lba_count,
			      blob_persist_zero_pages, ctx);
}

static void
blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
{
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	uint64_t lba;
	uint32_t lba_count;
	struct spdk_blob_md_page *page;
	spdk_bs_batch_t *batch;
	size_t i;

	/* Clusters don't move around in blobs. The list shrinks or grows
	 * at the end, but no changes ever occur in the middle of the list.
	 */

	lba_count = bs_byte_to_lba(bs, sizeof(*page));

	batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx);

	/* This starts at 1. The root page is not written until
	 * all of the others are finished
	 */
	for (i = 1; i < blob->active.num_pages; i++) {
		page = &ctx->pages[i];
		assert(page->sequence_num == i);

		lba = bs_md_page_to_lba(bs, blob->active.pages[i]);

		bs_batch_write_dev(batch, page, lba, lba_count);
	}

	bs_batch_close(batch);
}

static int
blob_resize(struct spdk_blob *blob, uint64_t sz)
{
	uint64_t i;
	uint64_t *tmp;
	uint64_t cluster;
	uint32_t lfmd; /* lowest free md page */
	uint64_t num_clusters;
	uint32_t *ep_tmp;
	uint64_t new_num_ep = 0, current_num_ep = 0;
	struct spdk_blob_store *bs;

	bs = blob->bs;

	blob_verify_md_op(blob);

	if (blob->active.num_clusters == sz) {
		return 0;
	}

	if (blob->active.num_clusters < blob->active.cluster_array_size) {
		/* If this blob was resized to be larger, then smaller, then
		 * larger without syncing, then the cluster array already
		 * contains spare assigned clusters we can use.
		 */
		num_clusters = spdk_min(blob->active.cluster_array_size,
					sz);
	} else {
		num_clusters = blob->active.num_clusters;
	}

	if (blob->use_extent_table) {
		/* Round up since every cluster beyond current Extent Table size,
		 * requires new extent page. */
		new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP);
		current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP);
	}

	/* Check first that we have enough clusters and md pages before we start claiming them. */
*/ 2002 if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) { 2003 if ((sz - num_clusters) > bs->num_free_clusters) { 2004 return -ENOSPC; 2005 } 2006 lfmd = 0; 2007 for (i = current_num_ep; i < new_num_ep ; i++) { 2008 lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); 2009 if (lfmd == UINT32_MAX) { 2010 /* No more free md pages. Cannot satisfy the request */ 2011 return -ENOSPC; 2012 } 2013 } 2014 } 2015 2016 if (sz > num_clusters) { 2017 /* Expand the cluster array if necessary. 2018 * We only shrink the array when persisting. 2019 */ 2020 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); 2021 if (sz > 0 && tmp == NULL) { 2022 return -ENOMEM; 2023 } 2024 memset(tmp + blob->active.cluster_array_size, 0, 2025 sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); 2026 blob->active.clusters = tmp; 2027 blob->active.cluster_array_size = sz; 2028 2029 /* Expand the extents table, only if enough clusters were added */ 2030 if (new_num_ep > current_num_ep && blob->use_extent_table) { 2031 ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); 2032 if (new_num_ep > 0 && ep_tmp == NULL) { 2033 return -ENOMEM; 2034 } 2035 memset(ep_tmp + blob->active.extent_pages_array_size, 0, 2036 sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); 2037 blob->active.extent_pages = ep_tmp; 2038 blob->active.extent_pages_array_size = new_num_ep; 2039 } 2040 } 2041 2042 blob->state = SPDK_BLOB_STATE_DIRTY; 2043 2044 if (spdk_blob_is_thin_provisioned(blob) == false) { 2045 cluster = 0; 2046 lfmd = 0; 2047 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2048 for (i = num_clusters; i < sz; i++) { 2049 bs_allocate_cluster(blob, i, &cluster, &lfmd, true); 2050 lfmd++; 2051 } 2052 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2053 } 2054 2055 blob->active.num_clusters = sz; 2056 blob->active.num_extent_pages = new_num_ep; 2057 2058 return 0; 2059 } 2060 2061 static void 2062 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) 2063 { 2064 spdk_bs_sequence_t *seq = ctx->seq; 2065 struct spdk_blob *blob = ctx->blob; 2066 struct spdk_blob_store *bs = blob->bs; 2067 uint64_t i; 2068 uint32_t page_num; 2069 void *tmp; 2070 int rc; 2071 2072 /* Generate the new metadata */ 2073 rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); 2074 if (rc < 0) { 2075 blob_persist_complete(seq, ctx, rc); 2076 return; 2077 } 2078 2079 assert(blob->active.num_pages >= 1); 2080 2081 /* Resize the cache of page indices */ 2082 tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 2083 if (!tmp) { 2084 blob_persist_complete(seq, ctx, -ENOMEM); 2085 return; 2086 } 2087 blob->active.pages = tmp; 2088 2089 /* Assign this metadata to pages. This requires two passes - 2090 * one to verify that there are enough pages and a second 2091 * to actually claim them. */ 2092 page_num = 0; 2093 /* Note that this loop starts at one. The first page location is fixed by the blobid. 
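* The two passes below mirror each other: the first only probes used_md_pages
* with spdk_bit_array_find_first_clear() to confirm enough free slots exist,
* the second claims them and chains each page to the following one through
* ctx->pages[i - 1].next, computing a page's CRC only once its 'next' field
* is final. The statement after the loop covers the last page, whose 'next'
* is never filled in here.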
*/ 2094 for (i = 1; i < blob->active.num_pages; i++) { 2095 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2096 if (page_num == UINT32_MAX) { 2097 blob_persist_complete(seq, ctx, -ENOMEM); 2098 return; 2099 } 2100 page_num++; 2101 } 2102 2103 page_num = 0; 2104 blob->active.pages[0] = bs_blobid_to_page(blob->id); 2105 for (i = 1; i < blob->active.num_pages; i++) { 2106 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2107 ctx->pages[i - 1].next = page_num; 2108 /* Now that previous metadata page is complete, calculate the crc for it. */ 2109 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2110 blob->active.pages[i] = page_num; 2111 bs_claim_md_page(bs, page_num); 2112 SPDK_DEBUGLOG(blob, "Claiming page %u for blob %" PRIu64 "\n", page_num, blob->id); 2113 page_num++; 2114 } 2115 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2116 /* Start writing the metadata from last page to first */ 2117 blob->state = SPDK_BLOB_STATE_CLEAN; 2118 blob_persist_write_page_chain(seq, ctx); 2119 } 2120 2121 static void 2122 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2123 { 2124 struct spdk_blob_persist_ctx *ctx = cb_arg; 2125 struct spdk_blob *blob = ctx->blob; 2126 size_t i; 2127 uint32_t extent_page_id; 2128 uint32_t page_count = 0; 2129 int rc; 2130 2131 if (ctx->extent_page != NULL) { 2132 spdk_free(ctx->extent_page); 2133 ctx->extent_page = NULL; 2134 } 2135 2136 if (bserrno != 0) { 2137 blob_persist_complete(seq, ctx, bserrno); 2138 return; 2139 } 2140 2141 /* Only write out Extent Pages when blob was resized. */ 2142 for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) { 2143 extent_page_id = blob->active.extent_pages[i]; 2144 if (extent_page_id == 0) { 2145 /* No Extent Page to persist */ 2146 assert(spdk_blob_is_thin_provisioned(blob)); 2147 continue; 2148 } 2149 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); 2150 ctx->next_extent_page = i + 1; 2151 rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); 2152 if (rc < 0) { 2153 blob_persist_complete(seq, ctx, rc); 2154 return; 2155 } 2156 2157 blob->state = SPDK_BLOB_STATE_DIRTY; 2158 blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); 2159 2160 ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); 2161 2162 bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), 2163 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 2164 blob_persist_write_extent_pages, ctx); 2165 return; 2166 } 2167 2168 blob_persist_generate_new_md(ctx); 2169 } 2170 2171 static void 2172 blob_persist_start(struct spdk_blob_persist_ctx *ctx) 2173 { 2174 spdk_bs_sequence_t *seq = ctx->seq; 2175 struct spdk_blob *blob = ctx->blob; 2176 2177 if (blob->active.num_pages == 0) { 2178 /* This is the signal that the blob should be deleted. 2179 * Immediately jump to the clean up routine. 
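* active.num_pages is presumably only 0 here because the delete path cleared
* it before persisting; the clean list still describes the on-disk pages
* (hence the assert below), so blob_persist_zero_pages() wipes that old
* metadata and then moves on to clearing the blob's clusters.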
*/ 2180 assert(blob->clean.num_pages > 0); 2181 blob->state = SPDK_BLOB_STATE_CLEAN; 2182 blob_persist_zero_pages(seq, ctx, 0); 2183 return; 2184 2185 } 2186 2187 if (blob->clean.num_clusters < blob->active.num_clusters) { 2188 /* Blob was resized up */ 2189 assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages); 2190 ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1; 2191 } else if (blob->active.num_clusters < blob->active.cluster_array_size) { 2192 /* Blob was resized down */ 2193 assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages); 2194 ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1; 2195 } else { 2196 /* No change in size occurred */ 2197 blob_persist_generate_new_md(ctx); 2198 return; 2199 } 2200 2201 blob_persist_write_extent_pages(seq, ctx, 0); 2202 } 2203 2204 static void 2205 blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2206 { 2207 struct spdk_blob_persist_ctx *ctx = cb_arg; 2208 2209 spdk_free(ctx->super); 2210 2211 if (bserrno != 0) { 2212 blob_persist_complete(seq, ctx, bserrno); 2213 return; 2214 } 2215 2216 ctx->blob->bs->clean = 0; 2217 2218 blob_persist_start(ctx); 2219 } 2220 2221 static void bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 2222 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); 2223 2224 2225 static void 2226 blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2227 { 2228 struct spdk_blob_persist_ctx *ctx = cb_arg; 2229 2230 if (bserrno != 0) { 2231 spdk_free(ctx->super); 2232 blob_persist_complete(seq, ctx, bserrno); 2233 return; 2234 } 2235 2236 ctx->super->clean = 0; 2237 if (ctx->super->size == 0) { 2238 ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; 2239 } 2240 2241 bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); 2242 } 2243 2244 static void 2245 blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) 2246 { 2247 if (ctx->blob->bs->clean) { 2248 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 2249 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2250 if (!ctx->super) { 2251 blob_persist_complete(ctx->seq, ctx, -ENOMEM); 2252 return; 2253 } 2254 2255 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), 2256 bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), 2257 blob_persist_dirty, ctx); 2258 } else { 2259 blob_persist_start(ctx); 2260 } 2261 } 2262 2263 /* Write a blob to disk */ 2264 static void 2265 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 2266 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 2267 { 2268 struct spdk_blob_persist_ctx *ctx; 2269 2270 blob_verify_md_op(blob); 2271 2272 if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->persists_to_complete)) { 2273 cb_fn(seq, cb_arg, 0); 2274 return; 2275 } 2276 2277 ctx = calloc(1, sizeof(*ctx)); 2278 if (!ctx) { 2279 cb_fn(seq, cb_arg, -ENOMEM); 2280 return; 2281 } 2282 ctx->blob = blob; 2283 ctx->seq = seq; 2284 ctx->cb_fn = cb_fn; 2285 ctx->cb_arg = cb_arg; 2286 2287 /* Multiple blob persists can affect one another, via blob->state or 2288 * blob mutable data changes. To prevent it, queue up the persists. 
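* Two queues cooperate here: persists_to_complete holds the request(s)
* currently being written out, while pending_persists collects the ones that
* arrive in the meantime; the pending entries are presumably moved over and
* started by blob_persist_complete() once the in-flight persist finishes.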
*/ 2289 if (!TAILQ_EMPTY(&blob->persists_to_complete)) { 2290 TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); 2291 return; 2292 } 2293 TAILQ_INSERT_HEAD(&blob->persists_to_complete, ctx, link); 2294 2295 blob_persist_check_dirty(ctx); 2296 } 2297 2298 struct spdk_blob_copy_cluster_ctx { 2299 struct spdk_blob *blob; 2300 uint8_t *buf; 2301 uint64_t page; 2302 uint64_t new_cluster; 2303 uint32_t new_extent_page; 2304 spdk_bs_sequence_t *seq; 2305 struct spdk_blob_md_page *new_cluster_page; 2306 }; 2307 2308 static void 2309 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) 2310 { 2311 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2312 struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; 2313 TAILQ_HEAD(, spdk_bs_request_set) requests; 2314 spdk_bs_user_op_t *op; 2315 2316 TAILQ_INIT(&requests); 2317 TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); 2318 2319 while (!TAILQ_EMPTY(&requests)) { 2320 op = TAILQ_FIRST(&requests); 2321 TAILQ_REMOVE(&requests, op, link); 2322 if (bserrno == 0) { 2323 bs_user_op_execute(op); 2324 } else { 2325 bs_user_op_abort(op, bserrno); 2326 } 2327 } 2328 2329 spdk_free(ctx->buf); 2330 free(ctx); 2331 } 2332 2333 static void 2334 blob_insert_cluster_cpl(void *cb_arg, int bserrno) 2335 { 2336 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2337 2338 if (bserrno) { 2339 if (bserrno == -EEXIST) { 2340 /* The metadata insert failed because another thread 2341 * allocated the cluster first. Free our cluster 2342 * but continue without error. */ 2343 bserrno = 0; 2344 } 2345 pthread_mutex_lock(&ctx->blob->bs->used_clusters_mutex); 2346 bs_release_cluster(ctx->blob->bs, ctx->new_cluster); 2347 pthread_mutex_unlock(&ctx->blob->bs->used_clusters_mutex); 2348 if (ctx->new_extent_page != 0) { 2349 bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); 2350 } 2351 } 2352 2353 bs_sequence_finish(ctx->seq, bserrno); 2354 } 2355 2356 static void 2357 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2358 { 2359 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2360 uint32_t cluster_number; 2361 2362 if (bserrno) { 2363 /* The write failed, so jump to the final completion handler */ 2364 bs_sequence_finish(seq, bserrno); 2365 return; 2366 } 2367 2368 cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); 2369 2370 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2371 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2372 } 2373 2374 static void 2375 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2376 { 2377 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2378 2379 if (bserrno != 0) { 2380 /* The read failed, so jump to the final completion handler */ 2381 bs_sequence_finish(seq, bserrno); 2382 return; 2383 } 2384 2385 /* Write whole cluster */ 2386 bs_sequence_write_dev(seq, ctx->buf, 2387 bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), 2388 bs_cluster_to_lba(ctx->blob->bs, 1), 2389 blob_write_copy_cpl, ctx); 2390 } 2391 2392 static void 2393 bs_allocate_and_copy_cluster(struct spdk_blob *blob, 2394 struct spdk_io_channel *_ch, 2395 uint64_t io_unit, spdk_bs_user_op_t *op) 2396 { 2397 struct spdk_bs_cpl cpl; 2398 struct spdk_bs_channel *ch; 2399 struct spdk_blob_copy_cluster_ctx *ctx; 2400 uint32_t cluster_start_page; 2401 uint32_t cluster_number; 2402 int rc; 2403 2404 ch = spdk_io_channel_get_ctx(_ch); 2405 2406 if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { 2407 /* There are already 
operations pending. Queue this user op 2408 * and return because it will be re-executed when the outstanding 2409 * cluster allocation completes. */ 2410 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2411 return; 2412 } 2413 2414 /* Round the io_unit offset down to the first page in the cluster */ 2415 cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); 2416 2417 /* Calculate which index in the metadata cluster array the corresponding 2418 * cluster is supposed to be at. */ 2419 cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); 2420 2421 ctx = calloc(1, sizeof(*ctx)); 2422 if (!ctx) { 2423 bs_user_op_abort(op, -ENOMEM); 2424 return; 2425 } 2426 2427 assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); 2428 2429 ctx->blob = blob; 2430 ctx->page = cluster_start_page; 2431 ctx->new_cluster_page = ch->new_cluster_page; 2432 memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE); 2433 2434 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2435 ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, 2436 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2437 if (!ctx->buf) { 2438 SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", 2439 blob->bs->cluster_sz); 2440 free(ctx); 2441 bs_user_op_abort(op, -ENOMEM); 2442 return; 2443 } 2444 } 2445 2446 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2447 rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, 2448 false); 2449 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2450 if (rc != 0) { 2451 spdk_free(ctx->buf); 2452 free(ctx); 2453 bs_user_op_abort(op, rc); 2454 return; 2455 } 2456 2457 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2458 cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; 2459 cpl.u.blob_basic.cb_arg = ctx; 2460 2461 ctx->seq = bs_sequence_start(_ch, &cpl); 2462 if (!ctx->seq) { 2463 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2464 bs_release_cluster(blob->bs, ctx->new_cluster); 2465 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2466 spdk_free(ctx->buf); 2467 free(ctx); 2468 bs_user_op_abort(op, -ENOMEM); 2469 return; 2470 } 2471 2472 /* Queue the user op to block other incoming operations */ 2473 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2474 2475 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2476 /* Read cluster from backing device */ 2477 bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, 2478 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2479 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), 2480 blob_write_copy, ctx); 2481 } else { 2482 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2483 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2484 } 2485 } 2486 2487 static inline bool 2488 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, 2489 uint64_t *lba, uint64_t *lba_count) 2490 { 2491 *lba_count = length; 2492 2493 if (!bs_io_unit_is_allocated(blob, io_unit)) { 2494 assert(blob->back_bs_dev != NULL); 2495 *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); 2496 *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); 2497 return false; 2498 } else { 2499 *lba = bs_blob_io_unit_to_lba(blob, io_unit); 2500 return true; 2501 } 2502 } 2503 2504 struct op_split_ctx { 2505 struct spdk_blob *blob; 2506 struct spdk_io_channel *channel; 2507 uint64_t io_unit_offset; 2508 uint64_t io_units_remaining; 2509 void *curr_payload; 2510 enum spdk_blob_op_type 
op_type; 2511 spdk_bs_sequence_t *seq; 2512 bool in_submit_ctx; 2513 bool completed_in_submit_ctx; 2514 bool done; 2515 }; 2516 2517 static void 2518 blob_request_submit_op_split_next(void *cb_arg, int bserrno) 2519 { 2520 struct op_split_ctx *ctx = cb_arg; 2521 struct spdk_blob *blob = ctx->blob; 2522 struct spdk_io_channel *ch = ctx->channel; 2523 enum spdk_blob_op_type op_type = ctx->op_type; 2524 uint8_t *buf; 2525 uint64_t offset; 2526 uint64_t length; 2527 uint64_t op_length; 2528 2529 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2530 bs_sequence_finish(ctx->seq, bserrno); 2531 if (ctx->in_submit_ctx) { 2532 /* Defer freeing of the ctx object, since it will be 2533 * accessed when this unwinds back to the submisison 2534 * context. 2535 */ 2536 ctx->done = true; 2537 } else { 2538 free(ctx); 2539 } 2540 return; 2541 } 2542 2543 if (ctx->in_submit_ctx) { 2544 /* If this split operation completed in the context 2545 * of its submission, mark the flag and return immediately 2546 * to avoid recursion. 2547 */ 2548 ctx->completed_in_submit_ctx = true; 2549 return; 2550 } 2551 2552 while (true) { 2553 ctx->completed_in_submit_ctx = false; 2554 2555 offset = ctx->io_unit_offset; 2556 length = ctx->io_units_remaining; 2557 buf = ctx->curr_payload; 2558 op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, 2559 offset)); 2560 2561 /* Update length and payload for next operation */ 2562 ctx->io_units_remaining -= op_length; 2563 ctx->io_unit_offset += op_length; 2564 if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { 2565 ctx->curr_payload += op_length * blob->bs->io_unit_size; 2566 } 2567 2568 assert(!ctx->in_submit_ctx); 2569 ctx->in_submit_ctx = true; 2570 2571 switch (op_type) { 2572 case SPDK_BLOB_READ: 2573 spdk_blob_io_read(blob, ch, buf, offset, op_length, 2574 blob_request_submit_op_split_next, ctx); 2575 break; 2576 case SPDK_BLOB_WRITE: 2577 spdk_blob_io_write(blob, ch, buf, offset, op_length, 2578 blob_request_submit_op_split_next, ctx); 2579 break; 2580 case SPDK_BLOB_UNMAP: 2581 spdk_blob_io_unmap(blob, ch, offset, op_length, 2582 blob_request_submit_op_split_next, ctx); 2583 break; 2584 case SPDK_BLOB_WRITE_ZEROES: 2585 spdk_blob_io_write_zeroes(blob, ch, offset, op_length, 2586 blob_request_submit_op_split_next, ctx); 2587 break; 2588 case SPDK_BLOB_READV: 2589 case SPDK_BLOB_WRITEV: 2590 SPDK_ERRLOG("readv/write not valid\n"); 2591 bs_sequence_finish(ctx->seq, -EINVAL); 2592 free(ctx); 2593 return; 2594 } 2595 2596 #ifndef __clang_analyzer__ 2597 /* scan-build reports a false positive around accessing the ctx here. It 2598 * forms a path that recursively calls this function, but then says 2599 * "assuming ctx->in_submit_ctx is false", when that isn't possible. 2600 * This path does free(ctx), returns to here, and reports a use-after-free 2601 * bug. Wrapping this bit of code so that scan-build doesn't see it 2602 * works around the scan-build bug. 2603 */ 2604 assert(ctx->in_submit_ctx); 2605 ctx->in_submit_ctx = false; 2606 2607 /* If the operation completed immediately, loop back and submit the 2608 * next operation. Otherwise we can return and the next split 2609 * operation will get submitted when this current operation is 2610 * later completed asynchronously. 
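* The in_submit_ctx / completed_in_submit_ctx / done flags turn what would be
* recursion into iteration: when a sub-I/O completes synchronously inside
* spdk_blob_io_read/write/unmap/write_zeroes(), this callback only records the
* completion and returns, and the surrounding while loop issues the next
* cluster-bounded chunk; if the synchronous completion is an error or the last
* chunk, only 'done' is set and the submitting frame frees ctx.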
2611 */ 2612 if (ctx->completed_in_submit_ctx) { 2613 continue; 2614 } else if (ctx->done) { 2615 free(ctx); 2616 } 2617 #endif 2618 break; 2619 } 2620 } 2621 2622 static void 2623 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, 2624 void *payload, uint64_t offset, uint64_t length, 2625 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2626 { 2627 struct op_split_ctx *ctx; 2628 spdk_bs_sequence_t *seq; 2629 struct spdk_bs_cpl cpl; 2630 2631 assert(blob != NULL); 2632 2633 ctx = calloc(1, sizeof(struct op_split_ctx)); 2634 if (ctx == NULL) { 2635 cb_fn(cb_arg, -ENOMEM); 2636 return; 2637 } 2638 2639 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2640 cpl.u.blob_basic.cb_fn = cb_fn; 2641 cpl.u.blob_basic.cb_arg = cb_arg; 2642 2643 seq = bs_sequence_start(ch, &cpl); 2644 if (!seq) { 2645 free(ctx); 2646 cb_fn(cb_arg, -ENOMEM); 2647 return; 2648 } 2649 2650 ctx->blob = blob; 2651 ctx->channel = ch; 2652 ctx->curr_payload = payload; 2653 ctx->io_unit_offset = offset; 2654 ctx->io_units_remaining = length; 2655 ctx->op_type = op_type; 2656 ctx->seq = seq; 2657 2658 blob_request_submit_op_split_next(ctx, 0); 2659 } 2660 2661 static void 2662 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, 2663 void *payload, uint64_t offset, uint64_t length, 2664 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2665 { 2666 struct spdk_bs_cpl cpl; 2667 uint64_t lba; 2668 uint64_t lba_count; 2669 bool is_allocated; 2670 2671 assert(blob != NULL); 2672 2673 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2674 cpl.u.blob_basic.cb_fn = cb_fn; 2675 cpl.u.blob_basic.cb_arg = cb_arg; 2676 2677 if (blob->frozen_refcnt) { 2678 /* This blob I/O is frozen */ 2679 spdk_bs_user_op_t *op; 2680 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); 2681 2682 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2683 if (!op) { 2684 cb_fn(cb_arg, -ENOMEM); 2685 return; 2686 } 2687 2688 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2689 2690 return; 2691 } 2692 2693 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2694 2695 switch (op_type) { 2696 case SPDK_BLOB_READ: { 2697 spdk_bs_batch_t *batch; 2698 2699 batch = bs_batch_open(_ch, &cpl); 2700 if (!batch) { 2701 cb_fn(cb_arg, -ENOMEM); 2702 return; 2703 } 2704 2705 if (is_allocated) { 2706 /* Read from the blob */ 2707 bs_batch_read_dev(batch, payload, lba, lba_count); 2708 } else { 2709 /* Read from the backing block device */ 2710 bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); 2711 } 2712 2713 bs_batch_close(batch); 2714 break; 2715 } 2716 case SPDK_BLOB_WRITE: 2717 case SPDK_BLOB_WRITE_ZEROES: { 2718 if (is_allocated) { 2719 /* Write to the blob */ 2720 spdk_bs_batch_t *batch; 2721 2722 if (lba_count == 0) { 2723 cb_fn(cb_arg, 0); 2724 return; 2725 } 2726 2727 batch = bs_batch_open(_ch, &cpl); 2728 if (!batch) { 2729 cb_fn(cb_arg, -ENOMEM); 2730 return; 2731 } 2732 2733 if (op_type == SPDK_BLOB_WRITE) { 2734 bs_batch_write_dev(batch, payload, lba, lba_count); 2735 } else { 2736 bs_batch_write_zeroes_dev(batch, lba, lba_count); 2737 } 2738 2739 bs_batch_close(batch); 2740 } else { 2741 /* Queue this operation and allocate the cluster */ 2742 spdk_bs_user_op_t *op; 2743 2744 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2745 if (!op) { 2746 cb_fn(cb_arg, -ENOMEM); 2747 return; 2748 } 2749 2750 bs_allocate_and_copy_cluster(blob, _ch, 
offset, op); 2751 } 2752 break; 2753 } 2754 case SPDK_BLOB_UNMAP: { 2755 spdk_bs_batch_t *batch; 2756 2757 batch = bs_batch_open(_ch, &cpl); 2758 if (!batch) { 2759 cb_fn(cb_arg, -ENOMEM); 2760 return; 2761 } 2762 2763 if (is_allocated) { 2764 bs_batch_unmap_dev(batch, lba, lba_count); 2765 } 2766 2767 bs_batch_close(batch); 2768 break; 2769 } 2770 case SPDK_BLOB_READV: 2771 case SPDK_BLOB_WRITEV: 2772 SPDK_ERRLOG("readv/write not valid\n"); 2773 cb_fn(cb_arg, -EINVAL); 2774 break; 2775 } 2776 } 2777 2778 static void 2779 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2780 void *payload, uint64_t offset, uint64_t length, 2781 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2782 { 2783 assert(blob != NULL); 2784 2785 if (blob->data_ro && op_type != SPDK_BLOB_READ) { 2786 cb_fn(cb_arg, -EPERM); 2787 return; 2788 } 2789 2790 if (length == 0) { 2791 cb_fn(cb_arg, 0); 2792 return; 2793 } 2794 2795 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2796 cb_fn(cb_arg, -EINVAL); 2797 return; 2798 } 2799 if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { 2800 blob_request_submit_op_single(_channel, blob, payload, offset, length, 2801 cb_fn, cb_arg, op_type); 2802 } else { 2803 blob_request_submit_op_split(_channel, blob, payload, offset, length, 2804 cb_fn, cb_arg, op_type); 2805 } 2806 } 2807 2808 struct rw_iov_ctx { 2809 struct spdk_blob *blob; 2810 struct spdk_io_channel *channel; 2811 spdk_blob_op_complete cb_fn; 2812 void *cb_arg; 2813 bool read; 2814 int iovcnt; 2815 struct iovec *orig_iov; 2816 uint64_t io_unit_offset; 2817 uint64_t io_units_remaining; 2818 uint64_t io_units_done; 2819 struct spdk_blob_ext_io_opts *ext_io_opts; 2820 struct iovec iov[0]; 2821 }; 2822 2823 static void 2824 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2825 { 2826 assert(cb_arg == NULL); 2827 bs_sequence_finish(seq, bserrno); 2828 } 2829 2830 static void 2831 rw_iov_split_next(void *cb_arg, int bserrno) 2832 { 2833 struct rw_iov_ctx *ctx = cb_arg; 2834 struct spdk_blob *blob = ctx->blob; 2835 struct iovec *iov, *orig_iov; 2836 int iovcnt; 2837 size_t orig_iovoff; 2838 uint64_t io_units_count, io_units_to_boundary, io_unit_offset; 2839 uint64_t byte_count; 2840 2841 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2842 ctx->cb_fn(ctx->cb_arg, bserrno); 2843 free(ctx); 2844 return; 2845 } 2846 2847 io_unit_offset = ctx->io_unit_offset; 2848 io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); 2849 io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); 2850 /* 2851 * Get index and offset into the original iov array for our current position in the I/O sequence. 2852 * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will 2853 * point to the current position in the I/O sequence. 2854 */ 2855 byte_count = ctx->io_units_done * blob->bs->io_unit_size; 2856 orig_iov = &ctx->orig_iov[0]; 2857 orig_iovoff = 0; 2858 while (byte_count > 0) { 2859 if (byte_count >= orig_iov->iov_len) { 2860 byte_count -= orig_iov->iov_len; 2861 orig_iov++; 2862 } else { 2863 orig_iovoff = byte_count; 2864 byte_count = 0; 2865 } 2866 } 2867 2868 /* 2869 * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many 2870 * bytes of this next I/O remain to be accounted for in the new iov array. 
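* For instance, assuming 512-byte io_units and an original iov of two 3 KiB
* buffers: with io_units_done == 4, the scan above stops at element 0 with
* orig_iovoff == 2048, and a following sub-I/O of 4 io_units then produces two
* entries here - the remaining 1 KiB of element 0 and the first 1 KiB of
* element 1.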
2871 */ 2872 byte_count = io_units_count * blob->bs->io_unit_size; 2873 iov = &ctx->iov[0]; 2874 iovcnt = 0; 2875 while (byte_count > 0) { 2876 assert(iovcnt < ctx->iovcnt); 2877 iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); 2878 iov->iov_base = orig_iov->iov_base + orig_iovoff; 2879 byte_count -= iov->iov_len; 2880 orig_iovoff = 0; 2881 orig_iov++; 2882 iov++; 2883 iovcnt++; 2884 } 2885 2886 ctx->io_unit_offset += io_units_count; 2887 ctx->io_units_remaining -= io_units_count; 2888 ctx->io_units_done += io_units_count; 2889 iov = &ctx->iov[0]; 2890 2891 if (ctx->read) { 2892 spdk_blob_io_readv_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2893 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2894 } else { 2895 spdk_blob_io_writev_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2896 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2897 } 2898 } 2899 2900 static void 2901 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2902 struct iovec *iov, int iovcnt, 2903 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg, bool read, 2904 struct spdk_blob_ext_io_opts *ext_io_opts) 2905 { 2906 struct spdk_bs_cpl cpl; 2907 2908 assert(blob != NULL); 2909 2910 if (!read && blob->data_ro) { 2911 cb_fn(cb_arg, -EPERM); 2912 return; 2913 } 2914 2915 if (length == 0) { 2916 cb_fn(cb_arg, 0); 2917 return; 2918 } 2919 2920 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2921 cb_fn(cb_arg, -EINVAL); 2922 return; 2923 } 2924 2925 /* 2926 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having 2927 * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, 2928 * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster 2929 * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need 2930 * to allocate a separate iov array and split the I/O such that none of the resulting 2931 * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) 2932 * but since this case happens very infrequently, any performance impact will be negligible. 2933 * 2934 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs 2935 * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them 2936 * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called 2937 * when the batch was completed, to allow for freeing the memory for the iov arrays. 2938 */ 2939 if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { 2940 uint64_t lba_count; 2941 uint64_t lba; 2942 bool is_allocated; 2943 2944 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2945 cpl.u.blob_basic.cb_fn = cb_fn; 2946 cpl.u.blob_basic.cb_arg = cb_arg; 2947 2948 if (blob->frozen_refcnt) { 2949 /* This blob I/O is frozen */ 2950 enum spdk_blob_op_type op_type; 2951 spdk_bs_user_op_t *op; 2952 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); 2953 2954 op_type = read ? 
SPDK_BLOB_READV : SPDK_BLOB_WRITEV; 2955 op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); 2956 if (!op) { 2957 cb_fn(cb_arg, -ENOMEM); 2958 return; 2959 } 2960 2961 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2962 2963 return; 2964 } 2965 2966 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2967 2968 if (read) { 2969 spdk_bs_sequence_t *seq; 2970 2971 seq = bs_sequence_start(_channel, &cpl); 2972 if (!seq) { 2973 cb_fn(cb_arg, -ENOMEM); 2974 return; 2975 } 2976 2977 seq->ext_io_opts = ext_io_opts; 2978 2979 if (is_allocated) { 2980 bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2981 } else { 2982 bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, 2983 rw_iov_done, NULL); 2984 } 2985 } else { 2986 if (is_allocated) { 2987 spdk_bs_sequence_t *seq; 2988 2989 seq = bs_sequence_start(_channel, &cpl); 2990 if (!seq) { 2991 cb_fn(cb_arg, -ENOMEM); 2992 return; 2993 } 2994 2995 seq->ext_io_opts = ext_io_opts; 2996 2997 bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2998 } else { 2999 /* Queue this operation and allocate the cluster */ 3000 spdk_bs_user_op_t *op; 3001 3002 op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, 3003 length); 3004 if (!op) { 3005 cb_fn(cb_arg, -ENOMEM); 3006 return; 3007 } 3008 3009 op->ext_io_opts = ext_io_opts; 3010 3011 bs_allocate_and_copy_cluster(blob, _channel, offset, op); 3012 } 3013 } 3014 } else { 3015 struct rw_iov_ctx *ctx; 3016 3017 ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); 3018 if (ctx == NULL) { 3019 cb_fn(cb_arg, -ENOMEM); 3020 return; 3021 } 3022 3023 ctx->blob = blob; 3024 ctx->channel = _channel; 3025 ctx->cb_fn = cb_fn; 3026 ctx->cb_arg = cb_arg; 3027 ctx->read = read; 3028 ctx->orig_iov = iov; 3029 ctx->iovcnt = iovcnt; 3030 ctx->io_unit_offset = offset; 3031 ctx->io_units_remaining = length; 3032 ctx->io_units_done = 0; 3033 ctx->ext_io_opts = ext_io_opts; 3034 3035 rw_iov_split_next(ctx, 0); 3036 } 3037 } 3038 3039 static struct spdk_blob * 3040 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) 3041 { 3042 struct spdk_blob find; 3043 3044 if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { 3045 return NULL; 3046 } 3047 3048 find.id = blobid; 3049 return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find); 3050 } 3051 3052 static void 3053 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, 3054 struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) 3055 { 3056 assert(blob != NULL); 3057 *snapshot_entry = NULL; 3058 *clone_entry = NULL; 3059 3060 if (blob->parent_id == SPDK_BLOBID_INVALID) { 3061 return; 3062 } 3063 3064 TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { 3065 if ((*snapshot_entry)->id == blob->parent_id) { 3066 break; 3067 } 3068 } 3069 3070 if (*snapshot_entry != NULL) { 3071 TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { 3072 if ((*clone_entry)->id == blob->id) { 3073 break; 3074 } 3075 } 3076 3077 assert(*clone_entry != NULL); 3078 } 3079 } 3080 3081 static int 3082 bs_channel_create(void *io_device, void *ctx_buf) 3083 { 3084 struct spdk_blob_store *bs = io_device; 3085 struct spdk_bs_channel *channel = ctx_buf; 3086 struct spdk_bs_dev *dev; 3087 uint32_t max_ops = bs->max_channel_ops; 3088 uint32_t i; 3089 3090 dev = bs->dev; 3091 3092 channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); 3093 if 
(!channel->req_mem) { 3094 return -1; 3095 } 3096 3097 TAILQ_INIT(&channel->reqs); 3098 3099 for (i = 0; i < max_ops; i++) { 3100 TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); 3101 } 3102 3103 channel->bs = bs; 3104 channel->dev = dev; 3105 channel->dev_channel = dev->create_channel(dev); 3106 3107 if (!channel->dev_channel) { 3108 SPDK_ERRLOG("Failed to create device channel.\n"); 3109 free(channel->req_mem); 3110 return -1; 3111 } 3112 3113 channel->new_cluster_page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, 3114 SPDK_MALLOC_DMA); 3115 if (!channel->new_cluster_page) { 3116 SPDK_ERRLOG("Failed to allocate new cluster page\n"); 3117 free(channel->req_mem); 3118 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3119 return -1; 3120 } 3121 3122 TAILQ_INIT(&channel->need_cluster_alloc); 3123 TAILQ_INIT(&channel->queued_io); 3124 3125 return 0; 3126 } 3127 3128 static void 3129 bs_channel_destroy(void *io_device, void *ctx_buf) 3130 { 3131 struct spdk_bs_channel *channel = ctx_buf; 3132 spdk_bs_user_op_t *op; 3133 3134 while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { 3135 op = TAILQ_FIRST(&channel->need_cluster_alloc); 3136 TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); 3137 bs_user_op_abort(op, -EIO); 3138 } 3139 3140 while (!TAILQ_EMPTY(&channel->queued_io)) { 3141 op = TAILQ_FIRST(&channel->queued_io); 3142 TAILQ_REMOVE(&channel->queued_io, op, link); 3143 bs_user_op_abort(op, -EIO); 3144 } 3145 3146 free(channel->req_mem); 3147 spdk_free(channel->new_cluster_page); 3148 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3149 } 3150 3151 static void 3152 bs_dev_destroy(void *io_device) 3153 { 3154 struct spdk_blob_store *bs = io_device; 3155 struct spdk_blob *blob, *blob_tmp; 3156 3157 bs->dev->destroy(bs->dev); 3158 3159 RB_FOREACH_SAFE(blob, spdk_blob_tree, &bs->open_blobs, blob_tmp) { 3160 RB_REMOVE(spdk_blob_tree, &bs->open_blobs, blob); 3161 spdk_bit_array_clear(bs->open_blobids, blob->id); 3162 blob_free(blob); 3163 } 3164 3165 pthread_mutex_destroy(&bs->used_clusters_mutex); 3166 3167 spdk_bit_array_free(&bs->open_blobids); 3168 spdk_bit_array_free(&bs->used_blobids); 3169 spdk_bit_array_free(&bs->used_md_pages); 3170 spdk_bit_pool_free(&bs->used_clusters); 3171 /* 3172 * If this function is called for any reason except a successful unload, 3173 * the unload_cpl type will be NONE and this will be a nop. 
3174 */ 3175 bs_call_cpl(&bs->unload_cpl, bs->unload_err); 3176 3177 free(bs); 3178 } 3179 3180 static int 3181 bs_blob_list_add(struct spdk_blob *blob) 3182 { 3183 spdk_blob_id snapshot_id; 3184 struct spdk_blob_list *snapshot_entry = NULL; 3185 struct spdk_blob_list *clone_entry = NULL; 3186 3187 assert(blob != NULL); 3188 3189 snapshot_id = blob->parent_id; 3190 if (snapshot_id == SPDK_BLOBID_INVALID) { 3191 return 0; 3192 } 3193 3194 snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); 3195 if (snapshot_entry == NULL) { 3196 /* Snapshot not found */ 3197 snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); 3198 if (snapshot_entry == NULL) { 3199 return -ENOMEM; 3200 } 3201 snapshot_entry->id = snapshot_id; 3202 TAILQ_INIT(&snapshot_entry->clones); 3203 TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); 3204 } else { 3205 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 3206 if (clone_entry->id == blob->id) { 3207 break; 3208 } 3209 } 3210 } 3211 3212 if (clone_entry == NULL) { 3213 /* Clone not found */ 3214 clone_entry = calloc(1, sizeof(struct spdk_blob_list)); 3215 if (clone_entry == NULL) { 3216 return -ENOMEM; 3217 } 3218 clone_entry->id = blob->id; 3219 TAILQ_INIT(&clone_entry->clones); 3220 TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); 3221 snapshot_entry->clone_count++; 3222 } 3223 3224 return 0; 3225 } 3226 3227 static void 3228 bs_blob_list_remove(struct spdk_blob *blob) 3229 { 3230 struct spdk_blob_list *snapshot_entry = NULL; 3231 struct spdk_blob_list *clone_entry = NULL; 3232 3233 blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); 3234 3235 if (snapshot_entry == NULL) { 3236 return; 3237 } 3238 3239 blob->parent_id = SPDK_BLOBID_INVALID; 3240 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3241 free(clone_entry); 3242 3243 snapshot_entry->clone_count--; 3244 } 3245 3246 static int 3247 bs_blob_list_free(struct spdk_blob_store *bs) 3248 { 3249 struct spdk_blob_list *snapshot_entry; 3250 struct spdk_blob_list *snapshot_entry_tmp; 3251 struct spdk_blob_list *clone_entry; 3252 struct spdk_blob_list *clone_entry_tmp; 3253 3254 TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { 3255 TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { 3256 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3257 free(clone_entry); 3258 } 3259 TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); 3260 free(snapshot_entry); 3261 } 3262 3263 return 0; 3264 } 3265 3266 static void 3267 bs_free(struct spdk_blob_store *bs) 3268 { 3269 bs_blob_list_free(bs); 3270 3271 bs_unregister_md_thread(bs); 3272 spdk_io_device_unregister(bs, bs_dev_destroy); 3273 } 3274 3275 void 3276 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size) 3277 { 3278 3279 if (!opts) { 3280 SPDK_ERRLOG("opts should not be NULL\n"); 3281 return; 3282 } 3283 3284 if (!opts_size) { 3285 SPDK_ERRLOG("opts_size should not be zero value\n"); 3286 return; 3287 } 3288 3289 memset(opts, 0, opts_size); 3290 opts->opts_size = opts_size; 3291 3292 #define FIELD_OK(field) \ 3293 offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size 3294 3295 #define SET_FIELD(field, value) \ 3296 if (FIELD_OK(field)) { \ 3297 opts->field = value; \ 3298 } \ 3299 3300 SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ); 3301 SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3302 SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3303 SET_FIELD(max_channel_ops, 
SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS); 3304 SET_FIELD(clear_method, BS_CLEAR_WITH_UNMAP); 3305 3306 if (FIELD_OK(bstype)) { 3307 memset(&opts->bstype, 0, sizeof(opts->bstype)); 3308 } 3309 3310 SET_FIELD(iter_cb_fn, NULL); 3311 SET_FIELD(iter_cb_arg, NULL); 3312 SET_FIELD(force_recover, false); 3313 3314 #undef FIELD_OK 3315 #undef SET_FIELD 3316 } 3317 3318 static int 3319 bs_opts_verify(struct spdk_bs_opts *opts) 3320 { 3321 if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || 3322 opts->max_channel_ops == 0) { 3323 SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); 3324 return -1; 3325 } 3326 3327 return 0; 3328 } 3329 3330 /* START spdk_bs_load */ 3331 3332 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */ 3333 3334 struct spdk_bs_load_ctx { 3335 struct spdk_blob_store *bs; 3336 struct spdk_bs_super_block *super; 3337 3338 struct spdk_bs_md_mask *mask; 3339 bool in_page_chain; 3340 uint32_t page_index; 3341 uint32_t cur_page; 3342 struct spdk_blob_md_page *page; 3343 3344 uint64_t num_extent_pages; 3345 uint32_t *extent_page_num; 3346 struct spdk_blob_md_page *extent_pages; 3347 struct spdk_bit_array *used_clusters; 3348 3349 spdk_bs_sequence_t *seq; 3350 spdk_blob_op_with_handle_complete iter_cb_fn; 3351 void *iter_cb_arg; 3352 struct spdk_blob *blob; 3353 spdk_blob_id blobid; 3354 3355 bool force_recover; 3356 3357 /* These fields are used in the spdk_bs_dump path. */ 3358 bool dumping; 3359 FILE *fp; 3360 spdk_bs_dump_print_xattr print_xattr_fn; 3361 char xattr_name[4096]; 3362 }; 3363 3364 static int 3365 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs, 3366 struct spdk_bs_load_ctx **_ctx) 3367 { 3368 struct spdk_blob_store *bs; 3369 struct spdk_bs_load_ctx *ctx; 3370 uint64_t dev_size; 3371 int rc; 3372 3373 dev_size = dev->blocklen * dev->blockcnt; 3374 if (dev_size < opts->cluster_sz) { 3375 /* Device size cannot be smaller than cluster size of blobstore */ 3376 SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", 3377 dev_size, opts->cluster_sz); 3378 return -ENOSPC; 3379 } 3380 if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { 3381 /* Cluster size cannot be smaller than page size */ 3382 SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", 3383 opts->cluster_sz, SPDK_BS_PAGE_SIZE); 3384 return -EINVAL; 3385 } 3386 bs = calloc(1, sizeof(struct spdk_blob_store)); 3387 if (!bs) { 3388 return -ENOMEM; 3389 } 3390 3391 ctx = calloc(1, sizeof(struct spdk_bs_load_ctx)); 3392 if (!ctx) { 3393 free(bs); 3394 return -ENOMEM; 3395 } 3396 3397 ctx->bs = bs; 3398 ctx->iter_cb_fn = opts->iter_cb_fn; 3399 ctx->iter_cb_arg = opts->iter_cb_arg; 3400 ctx->force_recover = opts->force_recover; 3401 3402 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 3403 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3404 if (!ctx->super) { 3405 free(ctx); 3406 free(bs); 3407 return -ENOMEM; 3408 } 3409 3410 RB_INIT(&bs->open_blobs); 3411 TAILQ_INIT(&bs->snapshots); 3412 bs->dev = dev; 3413 bs->md_thread = spdk_get_thread(); 3414 assert(bs->md_thread != NULL); 3415 3416 /* 3417 * Do not use bs_lba_to_cluster() here since blockcnt may not be an 3418 * even multiple of the cluster size. 
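* The plain integer division below just drops any trailing partial cluster.
* For instance, assuming a 512-byte blocklen, a 1 MiB cluster_sz and a
* blockcnt of 2,100,000 blocks: cluster_sz / blocklen is 2048, so
* total_clusters becomes 1025 and the last 800 blocks are left unused.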
3419 */ 3420 bs->cluster_sz = opts->cluster_sz; 3421 bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); 3422 ctx->used_clusters = spdk_bit_array_create(bs->total_clusters); 3423 if (!ctx->used_clusters) { 3424 spdk_free(ctx->super); 3425 free(ctx); 3426 free(bs); 3427 return -ENOMEM; 3428 } 3429 3430 bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; 3431 if (spdk_u32_is_pow2(bs->pages_per_cluster)) { 3432 bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); 3433 } 3434 bs->num_free_clusters = bs->total_clusters; 3435 bs->io_unit_size = dev->blocklen; 3436 3437 bs->max_channel_ops = opts->max_channel_ops; 3438 bs->super_blob = SPDK_BLOBID_INVALID; 3439 memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); 3440 3441 /* The metadata is assumed to be at least 1 page */ 3442 bs->used_md_pages = spdk_bit_array_create(1); 3443 bs->used_blobids = spdk_bit_array_create(0); 3444 bs->open_blobids = spdk_bit_array_create(0); 3445 3446 pthread_mutex_init(&bs->used_clusters_mutex, NULL); 3447 3448 spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, 3449 sizeof(struct spdk_bs_channel), "blobstore"); 3450 rc = bs_register_md_thread(bs); 3451 if (rc == -1) { 3452 spdk_io_device_unregister(bs, NULL); 3453 pthread_mutex_destroy(&bs->used_clusters_mutex); 3454 spdk_bit_array_free(&bs->open_blobids); 3455 spdk_bit_array_free(&bs->used_blobids); 3456 spdk_bit_array_free(&bs->used_md_pages); 3457 spdk_bit_array_free(&ctx->used_clusters); 3458 spdk_free(ctx->super); 3459 free(ctx); 3460 free(bs); 3461 /* FIXME: this is a lie but don't know how to get a proper error code here */ 3462 return -ENOMEM; 3463 } 3464 3465 *_ctx = ctx; 3466 *_bs = bs; 3467 return 0; 3468 } 3469 3470 static void 3471 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) 3472 { 3473 assert(bserrno != 0); 3474 3475 spdk_free(ctx->super); 3476 bs_sequence_finish(ctx->seq, bserrno); 3477 bs_free(ctx->bs); 3478 spdk_bit_array_free(&ctx->used_clusters); 3479 free(ctx); 3480 } 3481 3482 static void 3483 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 3484 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) 3485 { 3486 /* Update the values in the super block */ 3487 super->super_blob = bs->super_blob; 3488 memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); 3489 super->crc = blob_md_page_calc_crc(super); 3490 bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), 3491 bs_byte_to_lba(bs, sizeof(*super)), 3492 cb_fn, cb_arg); 3493 } 3494 3495 static void 3496 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3497 { 3498 struct spdk_bs_load_ctx *ctx = arg; 3499 uint64_t mask_size, lba, lba_count; 3500 3501 /* Write out the used clusters mask */ 3502 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3503 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3504 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3505 if (!ctx->mask) { 3506 bs_load_ctx_fail(ctx, -ENOMEM); 3507 return; 3508 } 3509 3510 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; 3511 ctx->mask->length = ctx->bs->total_clusters; 3512 /* We could get here through the normal unload path, or through dirty 3513 * shutdown recovery. For the normal unload path, we use the mask from 3514 * the bit pool. For dirty shutdown recovery, we don't have a bit pool yet - 3515 * only the bit array from the load ctx. 
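* Put differently: on a normal unload bs->used_clusters (the bit pool) already
* exists and spdk_bit_pool_store_mask() serializes it, while a dirty-shutdown
* recovery reaches this point before the pool has been built, so the bit array
* accumulated in the load ctx is written out instead - hence the branch below.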
3516 */ 3517 if (ctx->bs->used_clusters) { 3518 assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters)); 3519 spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask); 3520 } else { 3521 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters)); 3522 spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask); 3523 } 3524 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3525 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3526 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3527 } 3528 3529 static void 3530 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3531 { 3532 struct spdk_bs_load_ctx *ctx = arg; 3533 uint64_t mask_size, lba, lba_count; 3534 3535 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3536 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3537 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3538 if (!ctx->mask) { 3539 bs_load_ctx_fail(ctx, -ENOMEM); 3540 return; 3541 } 3542 3543 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; 3544 ctx->mask->length = ctx->super->md_len; 3545 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); 3546 3547 spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3548 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3549 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3550 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3551 } 3552 3553 static void 3554 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3555 { 3556 struct spdk_bs_load_ctx *ctx = arg; 3557 uint64_t mask_size, lba, lba_count; 3558 3559 if (ctx->super->used_blobid_mask_len == 0) { 3560 /* 3561 * This is a pre-v3 on-disk format where the blobid mask does not get 3562 * written to disk. 
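* Skipping the write keeps the older layout untouched rather than upgrading it
* in place; on such blobstores the used_blobids set is presumably rebuilt by
* scanning the metadata pages at the next load.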
3563 */ 3564 cb_fn(seq, arg, 0); 3565 return; 3566 } 3567 3568 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3569 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3570 SPDK_MALLOC_DMA); 3571 if (!ctx->mask) { 3572 bs_load_ctx_fail(ctx, -ENOMEM); 3573 return; 3574 } 3575 3576 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS; 3577 ctx->mask->length = ctx->super->md_len; 3578 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); 3579 3580 spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask); 3581 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3582 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3583 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3584 } 3585 3586 static void 3587 blob_set_thin_provision(struct spdk_blob *blob) 3588 { 3589 blob_verify_md_op(blob); 3590 blob->invalid_flags |= SPDK_BLOB_THIN_PROV; 3591 blob->state = SPDK_BLOB_STATE_DIRTY; 3592 } 3593 3594 static void 3595 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method) 3596 { 3597 blob_verify_md_op(blob); 3598 blob->clear_method = clear_method; 3599 blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT); 3600 blob->state = SPDK_BLOB_STATE_DIRTY; 3601 } 3602 3603 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno); 3604 3605 static void 3606 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno) 3607 { 3608 struct spdk_bs_load_ctx *ctx = cb_arg; 3609 spdk_blob_id id; 3610 int64_t page_num; 3611 3612 /* Iterate to next blob (we can't use spdk_bs_iter_next function as our 3613 * last blob has been removed */ 3614 page_num = bs_blobid_to_page(ctx->blobid); 3615 page_num++; 3616 page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num); 3617 if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) { 3618 bs_load_iter(ctx, NULL, -ENOENT); 3619 return; 3620 } 3621 3622 id = bs_page_to_blobid(page_num); 3623 3624 spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx); 3625 } 3626 3627 static void 3628 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno) 3629 { 3630 struct spdk_bs_load_ctx *ctx = cb_arg; 3631 3632 if (bserrno != 0) { 3633 SPDK_ERRLOG("Failed to close corrupted blob\n"); 3634 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3635 return; 3636 } 3637 3638 spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx); 3639 } 3640 3641 static void 3642 bs_delete_corrupted_blob(void *cb_arg, int bserrno) 3643 { 3644 struct spdk_bs_load_ctx *ctx = cb_arg; 3645 uint64_t i; 3646 3647 if (bserrno != 0) { 3648 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3649 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3650 return; 3651 } 3652 3653 /* Snapshot and clone have the same copy of cluster map and extent pages 3654 * at this point. Let's clear both for snapshot now, 3655 * so that it won't be cleared for clone later when we remove snapshot. 
3656 * Also set thin provision to pass data corruption check */ 3657 for (i = 0; i < ctx->blob->active.num_clusters; i++) { 3658 ctx->blob->active.clusters[i] = 0; 3659 } 3660 for (i = 0; i < ctx->blob->active.num_extent_pages; i++) { 3661 ctx->blob->active.extent_pages[i] = 0; 3662 } 3663 3664 ctx->blob->md_ro = false; 3665 3666 blob_set_thin_provision(ctx->blob); 3667 3668 ctx->blobid = ctx->blob->id; 3669 3670 spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx); 3671 } 3672 3673 static void 3674 bs_update_corrupted_blob(void *cb_arg, int bserrno) 3675 { 3676 struct spdk_bs_load_ctx *ctx = cb_arg; 3677 3678 if (bserrno != 0) { 3679 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3680 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3681 return; 3682 } 3683 3684 ctx->blob->md_ro = false; 3685 blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true); 3686 blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true); 3687 spdk_blob_set_read_only(ctx->blob); 3688 3689 if (ctx->iter_cb_fn) { 3690 ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0); 3691 } 3692 bs_blob_list_add(ctx->blob); 3693 3694 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3695 } 3696 3697 static void 3698 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno) 3699 { 3700 struct spdk_bs_load_ctx *ctx = cb_arg; 3701 3702 if (bserrno != 0) { 3703 SPDK_ERRLOG("Failed to open clone of a corrupted blob\n"); 3704 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3705 return; 3706 } 3707 3708 if (blob->parent_id == ctx->blob->id) { 3709 /* Power failure occurred before updating clone (snapshot delete case) 3710 * or after updating clone (creating snapshot case) - keep snapshot */ 3711 spdk_blob_close(blob, bs_update_corrupted_blob, ctx); 3712 } else { 3713 /* Power failure occurred after updating clone (snapshot delete case) 3714 * or before updating clone (creating snapshot case) - remove snapshot */ 3715 spdk_blob_close(blob, bs_delete_corrupted_blob, ctx); 3716 } 3717 } 3718 3719 static void 3720 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) 3721 { 3722 struct spdk_bs_load_ctx *ctx = arg; 3723 const void *value; 3724 size_t len; 3725 int rc = 0; 3726 3727 if (bserrno == 0) { 3728 /* Examine blob if it is corrupted after power failure. Fix 3729 * the ones that can be fixed and remove any other corrupted 3730 * ones. If it is not corrupted just process it */ 3731 rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true); 3732 if (rc != 0) { 3733 rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true); 3734 if (rc != 0) { 3735 /* Not corrupted - process it and continue with iterating through blobs */ 3736 if (ctx->iter_cb_fn) { 3737 ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0); 3738 } 3739 bs_blob_list_add(blob); 3740 spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx); 3741 return; 3742 } 3743 3744 } 3745 3746 assert(len == sizeof(spdk_blob_id)); 3747 3748 ctx->blob = blob; 3749 3750 /* Open clone to check if we are able to fix this blob or should we remove it */ 3751 spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx); 3752 return; 3753 } else if (bserrno == -ENOENT) { 3754 bserrno = 0; 3755 } else { 3756 /* 3757 * This case needs to be looked at further. Same problem 3758 * exists with applications that rely on explicit blob 3759 * iteration. We should just skip the blob that failed 3760 * to load and continue on to the next one. 
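* As written, though, any error other than -ENOENT still ends the whole
* iteration: the code below logs it and falls through to the common teardown,
* finishing the sequence with that error, so this comment records intended
* rather than current behavior.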
*/ 3762 SPDK_ERRLOG("Error in iterating blobs\n"); 3763 } 3764 3765 ctx->iter_cb_fn = NULL; 3766 3767 spdk_free(ctx->super); 3768 spdk_free(ctx->mask); 3769 bs_sequence_finish(ctx->seq, bserrno); 3770 free(ctx); 3771 } 3772 3773 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); 3774 3775 static void 3776 bs_load_complete(struct spdk_bs_load_ctx *ctx) 3777 { 3778 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 3779 if (ctx->dumping) { 3780 bs_dump_read_md_page(ctx->seq, ctx); 3781 return; 3782 } 3783 spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); 3784 } 3785 3786 static void 3787 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3788 { 3789 struct spdk_bs_load_ctx *ctx = cb_arg; 3790 int rc; 3791 3792 /* The type must be correct */ 3793 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); 3794 3795 /* The length of the mask (in bits) must not be greater than 3796 * the length of the buffer (converted to bits) */ 3797 assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); 3798 3799 /* The length of the mask must be exactly equal to the size 3800 * (in pages) of the metadata region */ 3801 assert(ctx->mask->length == ctx->super->md_len); 3802 3803 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length); 3804 if (rc < 0) { 3805 spdk_free(ctx->mask); 3806 bs_load_ctx_fail(ctx, rc); 3807 return; 3808 } 3809 3810 spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask); 3811 bs_load_complete(ctx); 3812 } 3813 3814 static void 3815 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3816 { 3817 struct spdk_bs_load_ctx *ctx = cb_arg; 3818 uint64_t lba, lba_count, mask_size; 3819 int rc; 3820 3821 if (bserrno != 0) { 3822 bs_load_ctx_fail(ctx, bserrno); 3823 return; 3824 } 3825 3826 /* The type must be correct */ 3827 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); 3828 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3829 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( 3830 struct spdk_blob_md_page) * 8)); 3831 /* 3832 * The length of the mask must be equal to or larger than the total number of clusters. It may be 3833 * larger than the total number of clusters due to a failed spdk_bs_grow.
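* (Presumably an interrupted spdk_bs_grow persisted the enlarged mask before
* the rest of the grow completed. The code below copes by clamping
* mask->length back to total_clusters before resizing the bit array, so the
* surplus bits are simply ignored.)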
3834 */ 3835 assert(ctx->mask->length >= ctx->bs->total_clusters); 3836 if (ctx->mask->length > ctx->bs->total_clusters) { 3837 SPDK_WARNLOG("Shrink the used_custers mask length to total_clusters"); 3838 ctx->mask->length = ctx->bs->total_clusters; 3839 } 3840 3841 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length); 3842 if (rc < 0) { 3843 spdk_free(ctx->mask); 3844 bs_load_ctx_fail(ctx, rc); 3845 return; 3846 } 3847 3848 spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask); 3849 ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters); 3850 assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); 3851 3852 spdk_free(ctx->mask); 3853 3854 /* Read the used blobids mask */ 3855 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3856 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3857 SPDK_MALLOC_DMA); 3858 if (!ctx->mask) { 3859 bs_load_ctx_fail(ctx, -ENOMEM); 3860 return; 3861 } 3862 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3863 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3864 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3865 bs_load_used_blobids_cpl, ctx); 3866 } 3867 3868 static void 3869 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3870 { 3871 struct spdk_bs_load_ctx *ctx = cb_arg; 3872 uint64_t lba, lba_count, mask_size; 3873 int rc; 3874 3875 if (bserrno != 0) { 3876 bs_load_ctx_fail(ctx, bserrno); 3877 return; 3878 } 3879 3880 /* The type must be correct */ 3881 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); 3882 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3883 assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * 3884 8)); 3885 /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ 3886 if (ctx->mask->length != ctx->super->md_len) { 3887 SPDK_ERRLOG("mismatched md_len in used_pages mask: " 3888 "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n", 3889 ctx->mask->length, ctx->super->md_len); 3890 assert(false); 3891 } 3892 3893 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length); 3894 if (rc < 0) { 3895 spdk_free(ctx->mask); 3896 bs_load_ctx_fail(ctx, rc); 3897 return; 3898 } 3899 3900 spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3901 spdk_free(ctx->mask); 3902 3903 /* Read the used clusters mask */ 3904 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3905 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3906 SPDK_MALLOC_DMA); 3907 if (!ctx->mask) { 3908 bs_load_ctx_fail(ctx, -ENOMEM); 3909 return; 3910 } 3911 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3912 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3913 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3914 bs_load_used_clusters_cpl, ctx); 3915 } 3916 3917 static void 3918 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) 3919 { 3920 uint64_t lba, lba_count, mask_size; 3921 3922 /* Read the used pages mask */ 3923 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3924 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3925 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3926 if (!ctx->mask) { 3927 bs_load_ctx_fail(ctx, -ENOMEM); 3928 return; 3929 } 3930 3931 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3932 lba_count = 
bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3933 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, 3934 bs_load_used_pages_cpl, ctx); 3935 } 3936 3937 static int 3938 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) 3939 { 3940 struct spdk_blob_store *bs = ctx->bs; 3941 struct spdk_blob_md_descriptor *desc; 3942 size_t cur_desc = 0; 3943 3944 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 3945 while (cur_desc < sizeof(page->descriptors)) { 3946 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 3947 if (desc->length == 0) { 3948 /* If padding and length are 0, this terminates the page */ 3949 break; 3950 } 3951 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 3952 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 3953 unsigned int i, j; 3954 unsigned int cluster_count = 0; 3955 uint32_t cluster_idx; 3956 3957 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 3958 3959 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 3960 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 3961 cluster_idx = desc_extent_rle->extents[i].cluster_idx; 3962 /* 3963 * cluster_idx = 0 means an unallocated cluster - don't mark that 3964 * in the used cluster map. 3965 */ 3966 if (cluster_idx != 0) { 3967 SPDK_NOTICELOG("Recover: cluster %" PRIu32 "\n", cluster_idx + j); 3968 spdk_bit_array_set(ctx->used_clusters, cluster_idx + j); 3969 if (bs->num_free_clusters == 0) { 3970 return -ENOSPC; 3971 } 3972 bs->num_free_clusters--; 3973 } 3974 cluster_count++; 3975 } 3976 } 3977 if (cluster_count == 0) { 3978 return -EINVAL; 3979 } 3980 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 3981 struct spdk_blob_md_descriptor_extent_page *desc_extent; 3982 uint32_t i; 3983 uint32_t cluster_count = 0; 3984 uint32_t cluster_idx; 3985 size_t cluster_idx_length; 3986 3987 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 3988 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 3989 3990 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 3991 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 3992 return -EINVAL; 3993 } 3994 3995 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 3996 cluster_idx = desc_extent->cluster_idx[i]; 3997 /* 3998 * cluster_idx = 0 means an unallocated cluster - don't mark that 3999 * in the used cluster map. 
4000 */ 4001 if (cluster_idx != 0) { 4002 if (cluster_idx < desc_extent->start_cluster_idx && 4003 cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { 4004 return -EINVAL; 4005 } 4006 spdk_bit_array_set(ctx->used_clusters, cluster_idx); 4007 if (bs->num_free_clusters == 0) { 4008 return -ENOSPC; 4009 } 4010 bs->num_free_clusters--; 4011 } 4012 cluster_count++; 4013 } 4014 4015 if (cluster_count == 0) { 4016 return -EINVAL; 4017 } 4018 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4019 /* Skip this item */ 4020 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4021 /* Skip this item */ 4022 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4023 /* Skip this item */ 4024 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4025 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 4026 uint32_t num_extent_pages = ctx->num_extent_pages; 4027 uint32_t i; 4028 size_t extent_pages_length; 4029 void *tmp; 4030 4031 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 4032 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 4033 4034 if (desc_extent_table->length == 0 || 4035 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 4036 return -EINVAL; 4037 } 4038 4039 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4040 if (desc_extent_table->extent_page[i].page_idx != 0) { 4041 if (desc_extent_table->extent_page[i].num_pages != 1) { 4042 return -EINVAL; 4043 } 4044 num_extent_pages += 1; 4045 } 4046 } 4047 4048 if (num_extent_pages > 0) { 4049 tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); 4050 if (tmp == NULL) { 4051 return -ENOMEM; 4052 } 4053 ctx->extent_page_num = tmp; 4054 4055 /* Extent table entries contain md page numbers for extent pages. 4056 * Zeroes represent unallocated extent pages, those are run-length-encoded. 4057 */ 4058 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4059 if (desc_extent_table->extent_page[i].page_idx != 0) { 4060 ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; 4061 ctx->num_extent_pages += 1; 4062 } 4063 } 4064 } 4065 } else { 4066 /* Error */ 4067 return -EINVAL; 4068 } 4069 /* Advance to the next descriptor */ 4070 cur_desc += sizeof(*desc) + desc->length; 4071 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4072 break; 4073 } 4074 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4075 } 4076 return 0; 4077 } 4078 4079 static bool 4080 bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) 4081 { 4082 uint32_t crc; 4083 struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4084 size_t desc_len; 4085 4086 crc = blob_md_page_calc_crc(page); 4087 if (crc != page->crc) { 4088 return false; 4089 } 4090 4091 /* Extent page should always be of sequence num 0. */ 4092 if (page->sequence_num != 0) { 4093 return false; 4094 } 4095 4096 /* Descriptor type must be EXTENT_PAGE. */ 4097 if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4098 return false; 4099 } 4100 4101 /* Descriptor length cannot exceed the page. */ 4102 desc_len = sizeof(*desc) + desc->length; 4103 if (desc_len > sizeof(page->descriptors)) { 4104 return false; 4105 } 4106 4107 /* It has to be the only descriptor in the page. 
*/ 4108 if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { 4109 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); 4110 if (desc->length != 0) { 4111 return false; 4112 } 4113 } 4114 4115 return true; 4116 } 4117 4118 static bool 4119 bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) 4120 { 4121 uint32_t crc; 4122 struct spdk_blob_md_page *page = ctx->page; 4123 4124 crc = blob_md_page_calc_crc(page); 4125 if (crc != page->crc) { 4126 return false; 4127 } 4128 4129 /* First page of a sequence should match the blobid. */ 4130 if (page->sequence_num == 0 && 4131 bs_page_to_blobid(ctx->cur_page) != page->id) { 4132 return false; 4133 } 4134 assert(bs_load_cur_extent_page_valid(page) == false); 4135 4136 return true; 4137 } 4138 4139 static void bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); 4140 4141 static void 4142 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4143 { 4144 struct spdk_bs_load_ctx *ctx = cb_arg; 4145 4146 if (bserrno != 0) { 4147 bs_load_ctx_fail(ctx, bserrno); 4148 return; 4149 } 4150 4151 bs_load_complete(ctx); 4152 } 4153 4154 static void 4155 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4156 { 4157 struct spdk_bs_load_ctx *ctx = cb_arg; 4158 4159 spdk_free(ctx->mask); 4160 ctx->mask = NULL; 4161 4162 if (bserrno != 0) { 4163 bs_load_ctx_fail(ctx, bserrno); 4164 return; 4165 } 4166 4167 bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); 4168 } 4169 4170 static void 4171 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4172 { 4173 struct spdk_bs_load_ctx *ctx = cb_arg; 4174 4175 spdk_free(ctx->mask); 4176 ctx->mask = NULL; 4177 4178 if (bserrno != 0) { 4179 bs_load_ctx_fail(ctx, bserrno); 4180 return; 4181 } 4182 4183 bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); 4184 } 4185 4186 static void 4187 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) 4188 { 4189 bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); 4190 } 4191 4192 static void 4193 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) 4194 { 4195 uint64_t num_md_clusters; 4196 uint64_t i; 4197 4198 ctx->in_page_chain = false; 4199 4200 do { 4201 ctx->page_index++; 4202 } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); 4203 4204 if (ctx->page_index < ctx->super->md_len) { 4205 ctx->cur_page = ctx->page_index; 4206 bs_load_replay_cur_md_page(ctx); 4207 } else { 4208 /* Claim all of the clusters used by the metadata */ 4209 num_md_clusters = spdk_divide_round_up( 4210 ctx->super->md_start + ctx->super->md_len, ctx->bs->pages_per_cluster); 4211 for (i = 0; i < num_md_clusters; i++) { 4212 spdk_bit_array_set(ctx->used_clusters, i); 4213 } 4214 ctx->bs->num_free_clusters -= num_md_clusters; 4215 spdk_free(ctx->page); 4216 bs_load_write_used_md(ctx); 4217 } 4218 } 4219 4220 static void 4221 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4222 { 4223 struct spdk_bs_load_ctx *ctx = cb_arg; 4224 uint32_t page_num; 4225 uint64_t i; 4226 4227 if (bserrno != 0) { 4228 spdk_free(ctx->extent_pages); 4229 bs_load_ctx_fail(ctx, bserrno); 4230 return; 4231 } 4232 4233 for (i = 0; i < ctx->num_extent_pages; i++) { 4234 /* Extent pages are only read when present within in chain md. 4235 * Integrity of md is not right if that page was not a valid extent page. 
*/ 4236 if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { 4237 spdk_free(ctx->extent_pages); 4238 bs_load_ctx_fail(ctx, -EILSEQ); 4239 return; 4240 } 4241 4242 page_num = ctx->extent_page_num[i]; 4243 spdk_bit_array_set(ctx->bs->used_md_pages, page_num); 4244 if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { 4245 spdk_free(ctx->extent_pages); 4246 bs_load_ctx_fail(ctx, -EILSEQ); 4247 return; 4248 } 4249 } 4250 4251 spdk_free(ctx->extent_pages); 4252 free(ctx->extent_page_num); 4253 ctx->extent_page_num = NULL; 4254 ctx->num_extent_pages = 0; 4255 4256 bs_load_replay_md_chain_cpl(ctx); 4257 } 4258 4259 static void 4260 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) 4261 { 4262 spdk_bs_batch_t *batch; 4263 uint32_t page; 4264 uint64_t lba; 4265 uint64_t i; 4266 4267 ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0, 4268 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4269 if (!ctx->extent_pages) { 4270 bs_load_ctx_fail(ctx, -ENOMEM); 4271 return; 4272 } 4273 4274 batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); 4275 4276 for (i = 0; i < ctx->num_extent_pages; i++) { 4277 page = ctx->extent_page_num[i]; 4278 assert(page < ctx->super->md_len); 4279 lba = bs_md_page_to_lba(ctx->bs, page); 4280 bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, 4281 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); 4282 } 4283 4284 bs_batch_close(batch); 4285 } 4286 4287 static void 4288 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4289 { 4290 struct spdk_bs_load_ctx *ctx = cb_arg; 4291 uint32_t page_num; 4292 struct spdk_blob_md_page *page; 4293 4294 if (bserrno != 0) { 4295 bs_load_ctx_fail(ctx, bserrno); 4296 return; 4297 } 4298 4299 page_num = ctx->cur_page; 4300 page = ctx->page; 4301 if (bs_load_cur_md_page_valid(ctx) == true) { 4302 if (page->sequence_num == 0 || ctx->in_page_chain == true) { 4303 bs_claim_md_page(ctx->bs, page_num); 4304 if (page->sequence_num == 0) { 4305 SPDK_NOTICELOG("Recover: blob %" PRIu32 "\n", page_num); 4306 spdk_bit_array_set(ctx->bs->used_blobids, page_num); 4307 } 4308 if (bs_load_replay_md_parse_page(ctx, page)) { 4309 bs_load_ctx_fail(ctx, -EILSEQ); 4310 return; 4311 } 4312 if (page->next != SPDK_INVALID_MD_PAGE) { 4313 ctx->in_page_chain = true; 4314 ctx->cur_page = page->next; 4315 bs_load_replay_cur_md_page(ctx); 4316 return; 4317 } 4318 if (ctx->num_extent_pages != 0) { 4319 bs_load_replay_extent_pages(ctx); 4320 return; 4321 } 4322 } 4323 } 4324 bs_load_replay_md_chain_cpl(ctx); 4325 } 4326 4327 static void 4328 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) 4329 { 4330 uint64_t lba; 4331 4332 assert(ctx->cur_page < ctx->super->md_len); 4333 lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); 4334 bs_sequence_read_dev(ctx->seq, ctx->page, lba, 4335 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4336 bs_load_replay_md_cpl, ctx); 4337 } 4338 4339 static void 4340 bs_load_replay_md(struct spdk_bs_load_ctx *ctx) 4341 { 4342 ctx->page_index = 0; 4343 ctx->cur_page = 0; 4344 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4345 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4346 if (!ctx->page) { 4347 bs_load_ctx_fail(ctx, -ENOMEM); 4348 return; 4349 } 4350 bs_load_replay_cur_md_page(ctx); 4351 } 4352 4353 static void 4354 bs_recover(struct spdk_bs_load_ctx *ctx) 4355 { 4356 int rc; 4357 4358 SPDK_NOTICELOG("Performing recovery on blobstore\n"); 4359 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); 4360 if (rc < 0) { 4361 
bs_load_ctx_fail(ctx, -ENOMEM); 4362 return; 4363 } 4364 4365 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len); 4366 if (rc < 0) { 4367 bs_load_ctx_fail(ctx, -ENOMEM); 4368 return; 4369 } 4370 4371 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4372 if (rc < 0) { 4373 bs_load_ctx_fail(ctx, -ENOMEM); 4374 return; 4375 } 4376 4377 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); 4378 if (rc < 0) { 4379 bs_load_ctx_fail(ctx, -ENOMEM); 4380 return; 4381 } 4382 4383 ctx->bs->num_free_clusters = ctx->bs->total_clusters; 4384 bs_load_replay_md(ctx); 4385 } 4386 4387 static int 4388 bs_parse_super(struct spdk_bs_load_ctx *ctx) 4389 { 4390 int rc; 4391 4392 if (ctx->super->size == 0) { 4393 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 4394 } 4395 4396 if (ctx->super->io_unit_size == 0) { 4397 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 4398 } 4399 4400 ctx->bs->clean = 1; 4401 ctx->bs->cluster_sz = ctx->super->cluster_size; 4402 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 4403 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 4404 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 4405 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 4406 } 4407 ctx->bs->io_unit_size = ctx->super->io_unit_size; 4408 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4409 if (rc < 0) { 4410 return -ENOMEM; 4411 } 4412 ctx->bs->md_start = ctx->super->md_start; 4413 ctx->bs->md_len = ctx->super->md_len; 4414 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len); 4415 if (rc < 0) { 4416 return -ENOMEM; 4417 } 4418 4419 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 4420 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 4421 ctx->bs->super_blob = ctx->super->super_blob; 4422 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 4423 4424 return 0; 4425 } 4426 4427 static void 4428 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4429 { 4430 struct spdk_bs_load_ctx *ctx = cb_arg; 4431 uint32_t crc; 4432 int rc; 4433 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; 4434 4435 if (ctx->super->version > SPDK_BS_VERSION || 4436 ctx->super->version < SPDK_BS_INITIAL_VERSION) { 4437 bs_load_ctx_fail(ctx, -EILSEQ); 4438 return; 4439 } 4440 4441 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4442 sizeof(ctx->super->signature)) != 0) { 4443 bs_load_ctx_fail(ctx, -EILSEQ); 4444 return; 4445 } 4446 4447 crc = blob_md_page_calc_crc(ctx->super); 4448 if (crc != ctx->super->crc) { 4449 bs_load_ctx_fail(ctx, -EILSEQ); 4450 return; 4451 } 4452 4453 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4454 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n"); 4455 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4456 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n"); 4457 } else { 4458 SPDK_DEBUGLOG(blob, "Unexpected bstype\n"); 4459 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4460 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4461 bs_load_ctx_fail(ctx, -ENXIO); 4462 return; 4463 } 4464 4465 if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { 4466 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 
"\n", 4467 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); 4468 bs_load_ctx_fail(ctx, -EILSEQ); 4469 return; 4470 } 4471 4472 rc = bs_parse_super(ctx); 4473 if (rc < 0) { 4474 bs_load_ctx_fail(ctx, rc); 4475 return; 4476 } 4477 4478 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0 || ctx->force_recover) { 4479 bs_recover(ctx); 4480 } else { 4481 bs_load_read_used_pages(ctx); 4482 } 4483 } 4484 4485 static inline int 4486 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst) 4487 { 4488 4489 if (!src->opts_size) { 4490 SPDK_ERRLOG("opts_size should not be zero value\n"); 4491 return -1; 4492 } 4493 4494 #define FIELD_OK(field) \ 4495 offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size 4496 4497 #define SET_FIELD(field) \ 4498 if (FIELD_OK(field)) { \ 4499 dst->field = src->field; \ 4500 } \ 4501 4502 SET_FIELD(cluster_sz); 4503 SET_FIELD(num_md_pages); 4504 SET_FIELD(max_md_ops); 4505 SET_FIELD(max_channel_ops); 4506 SET_FIELD(clear_method); 4507 4508 if (FIELD_OK(bstype)) { 4509 memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype)); 4510 } 4511 SET_FIELD(iter_cb_fn); 4512 SET_FIELD(iter_cb_arg); 4513 SET_FIELD(force_recover); 4514 4515 dst->opts_size = src->opts_size; 4516 4517 /* You should not remove this statement, but need to update the assert statement 4518 * if you add a new field, and also add a corresponding SET_FIELD statement */ 4519 SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 72, "Incorrect size"); 4520 4521 #undef FIELD_OK 4522 #undef SET_FIELD 4523 4524 return 0; 4525 } 4526 4527 void 4528 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4529 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4530 { 4531 struct spdk_blob_store *bs; 4532 struct spdk_bs_cpl cpl; 4533 struct spdk_bs_load_ctx *ctx; 4534 struct spdk_bs_opts opts = {}; 4535 int err; 4536 4537 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev); 4538 4539 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4540 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen); 4541 dev->destroy(dev); 4542 cb_fn(cb_arg, NULL, -EINVAL); 4543 return; 4544 } 4545 4546 spdk_bs_opts_init(&opts, sizeof(opts)); 4547 if (o) { 4548 if (bs_opts_copy(o, &opts)) { 4549 return; 4550 } 4551 } 4552 4553 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { 4554 dev->destroy(dev); 4555 cb_fn(cb_arg, NULL, -EINVAL); 4556 return; 4557 } 4558 4559 err = bs_alloc(dev, &opts, &bs, &ctx); 4560 if (err) { 4561 dev->destroy(dev); 4562 cb_fn(cb_arg, NULL, err); 4563 return; 4564 } 4565 4566 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4567 cpl.u.bs_handle.cb_fn = cb_fn; 4568 cpl.u.bs_handle.cb_arg = cb_arg; 4569 cpl.u.bs_handle.bs = bs; 4570 4571 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4572 if (!ctx->seq) { 4573 spdk_free(ctx->super); 4574 free(ctx); 4575 bs_free(bs); 4576 cb_fn(cb_arg, NULL, -ENOMEM); 4577 return; 4578 } 4579 4580 /* Read the super block */ 4581 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4582 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4583 bs_load_super_cpl, ctx); 4584 } 4585 4586 /* END spdk_bs_load */ 4587 4588 /* START spdk_bs_dump */ 4589 4590 static void 4591 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) 4592 { 4593 spdk_free(ctx->super); 4594 4595 /* 4596 * We need to defer calling bs_call_cpl() until after 4597 * dev destruction, so tuck these away for later use. 
4598 */ 4599 ctx->bs->unload_err = bserrno; 4600 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4601 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4602 4603 bs_sequence_finish(seq, 0); 4604 bs_free(ctx->bs); 4605 free(ctx); 4606 } 4607 4608 static void 4609 bs_dump_print_xattr(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4610 { 4611 struct spdk_blob_md_descriptor_xattr *desc_xattr; 4612 uint32_t i; 4613 const char *type; 4614 4615 desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; 4616 4617 if (desc_xattr->length != 4618 sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + 4619 desc_xattr->name_length + desc_xattr->value_length) { 4620 } 4621 4622 memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); 4623 ctx->xattr_name[desc_xattr->name_length] = '\0'; 4624 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4625 type = "XATTR"; 4626 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4627 type = "XATTR_INTERNAL"; 4628 } else { 4629 assert(false); 4630 type = "XATTR_?"; 4631 } 4632 fprintf(ctx->fp, "%s: name = \"%s\"\n", type, ctx->xattr_name); 4633 fprintf(ctx->fp, " value = \""); 4634 ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, 4635 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 4636 desc_xattr->value_length); 4637 fprintf(ctx->fp, "\"\n"); 4638 for (i = 0; i < desc_xattr->value_length; i++) { 4639 if (i % 16 == 0) { 4640 fprintf(ctx->fp, " "); 4641 } 4642 fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); 4643 if ((i + 1) % 16 == 0) { 4644 fprintf(ctx->fp, "\n"); 4645 } 4646 } 4647 if (i % 16 != 0) { 4648 fprintf(ctx->fp, "\n"); 4649 } 4650 } 4651 4652 struct type_flag_desc { 4653 uint64_t mask; 4654 uint64_t val; 4655 const char *name; 4656 }; 4657 4658 static void 4659 bs_dump_print_type_bits(struct spdk_bs_load_ctx *ctx, uint64_t flags, 4660 struct type_flag_desc *desc, size_t numflags) 4661 { 4662 uint64_t covered = 0; 4663 size_t i; 4664 4665 for (i = 0; i < numflags; i++) { 4666 if ((desc[i].mask & flags) != desc[i].val) { 4667 continue; 4668 } 4669 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " %s", desc[i].val, desc[i].name); 4670 if (desc[i].mask != desc[i].val) { 4671 fprintf(ctx->fp, " (mask 0x%" PRIx64 " value 0x%" PRIx64 ")", 4672 desc[i].mask, desc[i].val); 4673 } 4674 fprintf(ctx->fp, "\n"); 4675 covered |= desc[i].mask; 4676 } 4677 if ((flags & ~covered) != 0) { 4678 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " Unknown\n", flags & ~covered); 4679 } 4680 } 4681 4682 static void 4683 bs_dump_print_type_flags(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4684 { 4685 struct spdk_blob_md_descriptor_flags *type_desc; 4686 #define ADD_FLAG(f) { f, f, #f } 4687 #define ADD_MASK_VAL(m, v) { m, v, #v } 4688 static struct type_flag_desc invalid[] = { 4689 ADD_FLAG(SPDK_BLOB_THIN_PROV), 4690 ADD_FLAG(SPDK_BLOB_INTERNAL_XATTR), 4691 ADD_FLAG(SPDK_BLOB_EXTENT_TABLE), 4692 }; 4693 static struct type_flag_desc data_ro[] = { 4694 ADD_FLAG(SPDK_BLOB_READ_ONLY), 4695 }; 4696 static struct type_flag_desc md_ro[] = { 4697 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_DEFAULT), 4698 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_NONE), 4699 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_UNMAP), 4700 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_WRITE_ZEROES), 4701 }; 4702 #undef ADD_FLAG 4703 #undef ADD_MASK_VAL 4704 4705 type_desc = (struct 
spdk_blob_md_descriptor_flags *)desc; 4706 fprintf(ctx->fp, "Flags:\n"); 4707 fprintf(ctx->fp, "\tinvalid: 0x%016" PRIx64 "\n", type_desc->invalid_flags); 4708 bs_dump_print_type_bits(ctx, type_desc->invalid_flags, invalid, 4709 SPDK_COUNTOF(invalid)); 4710 fprintf(ctx->fp, "\tdata_ro: 0x%016" PRIx64 "\n", type_desc->data_ro_flags); 4711 bs_dump_print_type_bits(ctx, type_desc->data_ro_flags, data_ro, 4712 SPDK_COUNTOF(data_ro)); 4713 fprintf(ctx->fp, "\t md_ro: 0x%016" PRIx64 "\n", type_desc->md_ro_flags); 4714 bs_dump_print_type_bits(ctx, type_desc->md_ro_flags, md_ro, 4715 SPDK_COUNTOF(md_ro)); 4716 } 4717 4718 static void 4719 bs_dump_print_extent_table(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4720 { 4721 struct spdk_blob_md_descriptor_extent_table *et_desc; 4722 uint64_t num_extent_pages; 4723 uint32_t et_idx; 4724 4725 et_desc = (struct spdk_blob_md_descriptor_extent_table *)desc; 4726 num_extent_pages = (et_desc->length - sizeof(et_desc->num_clusters)) / 4727 sizeof(et_desc->extent_page[0]); 4728 4729 fprintf(ctx->fp, "Extent table:\n"); 4730 for (et_idx = 0; et_idx < num_extent_pages; et_idx++) { 4731 if (et_desc->extent_page[et_idx].page_idx == 0) { 4732 /* Zeroes represent unallocated extent pages. */ 4733 continue; 4734 } 4735 fprintf(ctx->fp, "\tExtent page: %5" PRIu32 " length %3" PRIu32 4736 " at LBA %" PRIu64 "\n", et_desc->extent_page[et_idx].page_idx, 4737 et_desc->extent_page[et_idx].num_pages, 4738 bs_md_page_to_lba(ctx->bs, et_desc->extent_page[et_idx].page_idx)); 4739 } 4740 } 4741 4742 static void 4743 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx) 4744 { 4745 uint32_t page_idx = ctx->cur_page; 4746 struct spdk_blob_md_page *page = ctx->page; 4747 struct spdk_blob_md_descriptor *desc; 4748 size_t cur_desc = 0; 4749 uint32_t crc; 4750 4751 fprintf(ctx->fp, "=========\n"); 4752 fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); 4753 fprintf(ctx->fp, "Start LBA: %" PRIu64 "\n", bs_md_page_to_lba(ctx->bs, page_idx)); 4754 fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); 4755 fprintf(ctx->fp, "Sequence: %" PRIu32 "\n", page->sequence_num); 4756 if (page->next == SPDK_INVALID_MD_PAGE) { 4757 fprintf(ctx->fp, "Next: None\n"); 4758 } else { 4759 fprintf(ctx->fp, "Next: %" PRIu32 "\n", page->next); 4760 } 4761 fprintf(ctx->fp, "In used bit array%s:", ctx->super->clean ? "" : " (not clean: dubious)"); 4762 if (spdk_bit_array_get(ctx->bs->used_md_pages, page_idx)) { 4763 fprintf(ctx->fp, " md"); 4764 } 4765 if (spdk_bit_array_get(ctx->bs->used_blobids, page_idx)) { 4766 fprintf(ctx->fp, " blob"); 4767 } 4768 fprintf(ctx->fp, "\n"); 4769 4770 crc = blob_md_page_calc_crc(page); 4771 fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? 
"OK" : "Mismatch"); 4772 4773 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4774 while (cur_desc < sizeof(page->descriptors)) { 4775 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4776 if (desc->length == 0) { 4777 /* If padding and length are 0, this terminates the page */ 4778 break; 4779 } 4780 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4781 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4782 unsigned int i; 4783 4784 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4785 4786 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4787 if (desc_extent_rle->extents[i].cluster_idx != 0) { 4788 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4789 desc_extent_rle->extents[i].cluster_idx); 4790 } else { 4791 fprintf(ctx->fp, "Unallocated Extent - "); 4792 } 4793 fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); 4794 fprintf(ctx->fp, "\n"); 4795 } 4796 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4797 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4798 unsigned int i; 4799 4800 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4801 4802 for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { 4803 if (desc_extent->cluster_idx[i] != 0) { 4804 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4805 desc_extent->cluster_idx[i]); 4806 } else { 4807 fprintf(ctx->fp, "Unallocated Extent"); 4808 } 4809 fprintf(ctx->fp, "\n"); 4810 } 4811 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4812 bs_dump_print_xattr(ctx, desc); 4813 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4814 bs_dump_print_xattr(ctx, desc); 4815 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4816 bs_dump_print_type_flags(ctx, desc); 4817 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4818 bs_dump_print_extent_table(ctx, desc); 4819 } else { 4820 /* Error */ 4821 fprintf(ctx->fp, "Unknown descriptor type %" PRIu8 "\n", desc->type); 4822 } 4823 /* Advance to the next descriptor */ 4824 cur_desc += sizeof(*desc) + desc->length; 4825 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4826 break; 4827 } 4828 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4829 } 4830 } 4831 4832 static void 4833 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4834 { 4835 struct spdk_bs_load_ctx *ctx = cb_arg; 4836 4837 if (bserrno != 0) { 4838 bs_dump_finish(seq, ctx, bserrno); 4839 return; 4840 } 4841 4842 if (ctx->page->id != 0) { 4843 bs_dump_print_md_page(ctx); 4844 } 4845 4846 ctx->cur_page++; 4847 4848 if (ctx->cur_page < ctx->super->md_len) { 4849 bs_dump_read_md_page(seq, ctx); 4850 } else { 4851 spdk_free(ctx->page); 4852 bs_dump_finish(seq, ctx, 0); 4853 } 4854 } 4855 4856 static void 4857 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) 4858 { 4859 struct spdk_bs_load_ctx *ctx = cb_arg; 4860 uint64_t lba; 4861 4862 assert(ctx->cur_page < ctx->super->md_len); 4863 lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); 4864 bs_sequence_read_dev(seq, ctx->page, lba, 4865 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4866 bs_dump_read_md_page_cpl, ctx); 4867 } 4868 4869 static void 4870 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4871 { 4872 struct spdk_bs_load_ctx *ctx = cb_arg; 4873 int rc; 4874 4875 fprintf(ctx->fp, "Signature: \"%.8s\" ", 
ctx->super->signature); 4876 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4877 sizeof(ctx->super->signature)) != 0) { 4878 fprintf(ctx->fp, "(Mismatch)\n"); 4879 bs_dump_finish(seq, ctx, bserrno); 4880 return; 4881 } else { 4882 fprintf(ctx->fp, "(OK)\n"); 4883 } 4884 fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); 4885 fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, 4886 (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch"); 4887 fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); 4888 fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); 4889 fprintf(ctx->fp, "Super Blob ID: "); 4890 if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { 4891 fprintf(ctx->fp, "(None)\n"); 4892 } else { 4893 fprintf(ctx->fp, "0x%" PRIx64 "\n", ctx->super->super_blob); 4894 } 4895 fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); 4896 fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); 4897 fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); 4898 fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); 4899 fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); 4900 fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); 4901 fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); 4902 fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); 4903 fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); 4904 4905 ctx->cur_page = 0; 4906 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4907 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4908 if (!ctx->page) { 4909 bs_dump_finish(seq, ctx, -ENOMEM); 4910 return; 4911 } 4912 4913 rc = bs_parse_super(ctx); 4914 if (rc < 0) { 4915 bs_load_ctx_fail(ctx, rc); 4916 return; 4917 } 4918 4919 bs_load_read_used_pages(ctx); 4920 } 4921 4922 void 4923 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, 4924 spdk_bs_op_complete cb_fn, void *cb_arg) 4925 { 4926 struct spdk_blob_store *bs; 4927 struct spdk_bs_cpl cpl; 4928 struct spdk_bs_load_ctx *ctx; 4929 struct spdk_bs_opts opts = {}; 4930 int err; 4931 4932 SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev); 4933 4934 spdk_bs_opts_init(&opts, sizeof(opts)); 4935 4936 err = bs_alloc(dev, &opts, &bs, &ctx); 4937 if (err) { 4938 dev->destroy(dev); 4939 cb_fn(cb_arg, err); 4940 return; 4941 } 4942 4943 ctx->dumping = true; 4944 ctx->fp = fp; 4945 ctx->print_xattr_fn = print_xattr_fn; 4946 4947 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 4948 cpl.u.bs_basic.cb_fn = cb_fn; 4949 cpl.u.bs_basic.cb_arg = cb_arg; 4950 4951 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4952 if (!ctx->seq) { 4953 spdk_free(ctx->super); 4954 free(ctx); 4955 bs_free(bs); 4956 cb_fn(cb_arg, -ENOMEM); 4957 return; 4958 } 4959 4960 /* Read the super block */ 4961 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4962 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4963 bs_dump_super_cpl, ctx); 4964 } 4965 4966 /* END spdk_bs_dump */ 4967 4968 /* START spdk_bs_init */ 4969 4970 static void 4971 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4972 { 4973 struct spdk_bs_load_ctx *ctx = cb_arg; 4974 4975 ctx->bs->used_clusters = 
spdk_bit_pool_create_from_array(ctx->used_clusters); 4976 spdk_free(ctx->super); 4977 free(ctx); 4978 4979 bs_sequence_finish(seq, bserrno); 4980 } 4981 4982 static void 4983 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4984 { 4985 struct spdk_bs_load_ctx *ctx = cb_arg; 4986 4987 /* Write super block */ 4988 bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), 4989 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), 4990 bs_init_persist_super_cpl, ctx); 4991 } 4992 4993 void 4994 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4995 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4996 { 4997 struct spdk_bs_load_ctx *ctx; 4998 struct spdk_blob_store *bs; 4999 struct spdk_bs_cpl cpl; 5000 spdk_bs_sequence_t *seq; 5001 spdk_bs_batch_t *batch; 5002 uint64_t num_md_lba; 5003 uint64_t num_md_pages; 5004 uint64_t num_md_clusters; 5005 uint64_t max_used_cluster_mask_len; 5006 uint32_t i; 5007 struct spdk_bs_opts opts = {}; 5008 int rc; 5009 uint64_t lba, lba_count; 5010 5011 SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev); 5012 5013 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 5014 SPDK_ERRLOG("unsupported dev block length of %d\n", 5015 dev->blocklen); 5016 dev->destroy(dev); 5017 cb_fn(cb_arg, NULL, -EINVAL); 5018 return; 5019 } 5020 5021 spdk_bs_opts_init(&opts, sizeof(opts)); 5022 if (o) { 5023 if (bs_opts_copy(o, &opts)) { 5024 return; 5025 } 5026 } 5027 5028 if (bs_opts_verify(&opts) != 0) { 5029 dev->destroy(dev); 5030 cb_fn(cb_arg, NULL, -EINVAL); 5031 return; 5032 } 5033 5034 rc = bs_alloc(dev, &opts, &bs, &ctx); 5035 if (rc) { 5036 dev->destroy(dev); 5037 cb_fn(cb_arg, NULL, rc); 5038 return; 5039 } 5040 5041 if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { 5042 /* By default, allocate 1 page per cluster. 5043 * Technically, this over-allocates metadata 5044 * because more metadata will reduce the number 5045 * of usable clusters. This can be addressed with 5046 * more complex math in the future. 5047 */ 5048 bs->md_len = bs->total_clusters; 5049 } else { 5050 bs->md_len = opts.num_md_pages; 5051 } 5052 rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); 5053 if (rc < 0) { 5054 spdk_free(ctx->super); 5055 free(ctx); 5056 bs_free(bs); 5057 cb_fn(cb_arg, NULL, -ENOMEM); 5058 return; 5059 } 5060 5061 rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); 5062 if (rc < 0) { 5063 spdk_free(ctx->super); 5064 free(ctx); 5065 bs_free(bs); 5066 cb_fn(cb_arg, NULL, -ENOMEM); 5067 return; 5068 } 5069 5070 rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); 5071 if (rc < 0) { 5072 spdk_free(ctx->super); 5073 free(ctx); 5074 bs_free(bs); 5075 cb_fn(cb_arg, NULL, -ENOMEM); 5076 return; 5077 } 5078 5079 memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 5080 sizeof(ctx->super->signature)); 5081 ctx->super->version = SPDK_BS_VERSION; 5082 ctx->super->length = sizeof(*ctx->super); 5083 ctx->super->super_blob = bs->super_blob; 5084 ctx->super->clean = 0; 5085 ctx->super->cluster_size = bs->cluster_sz; 5086 ctx->super->io_unit_size = bs->io_unit_size; 5087 memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); 5088 5089 /* Calculate how many pages the metadata consumes at the front 5090 * of the disk. 5091 */ 5092 5093 /* The super block uses 1 page */ 5094 num_md_pages = 1; 5095 5096 /* The used_md_pages mask requires 1 bit per metadata page, rounded 5097 * up to the nearest page, plus a header. 
5098 */ 5099 ctx->super->used_page_mask_start = num_md_pages; 5100 ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5101 spdk_divide_round_up(bs->md_len, 8), 5102 SPDK_BS_PAGE_SIZE); 5103 num_md_pages += ctx->super->used_page_mask_len; 5104 5105 /* The used_clusters mask requires 1 bit per cluster, rounded 5106 * up to the nearest page, plus a header. 5107 */ 5108 ctx->super->used_cluster_mask_start = num_md_pages; 5109 ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5110 spdk_divide_round_up(bs->total_clusters, 8), 5111 SPDK_BS_PAGE_SIZE); 5112 /* The blobstore might be extended, then the used_cluster bitmap will need more space. 5113 * Here we calculate the max clusters we can support according to the 5114 * num_md_pages (bs->md_len). 5115 */ 5116 max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5117 spdk_divide_round_up(bs->md_len, 8), 5118 SPDK_BS_PAGE_SIZE); 5119 max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len, 5120 ctx->super->used_cluster_mask_len); 5121 num_md_pages += max_used_cluster_mask_len; 5122 5123 /* The used_blobids mask requires 1 bit per metadata page, rounded 5124 * up to the nearest page, plus a header. 5125 */ 5126 ctx->super->used_blobid_mask_start = num_md_pages; 5127 ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5128 spdk_divide_round_up(bs->md_len, 8), 5129 SPDK_BS_PAGE_SIZE); 5130 num_md_pages += ctx->super->used_blobid_mask_len; 5131 5132 /* The metadata region size was chosen above */ 5133 ctx->super->md_start = bs->md_start = num_md_pages; 5134 ctx->super->md_len = bs->md_len; 5135 num_md_pages += bs->md_len; 5136 5137 num_md_lba = bs_page_to_lba(bs, num_md_pages); 5138 5139 ctx->super->size = dev->blockcnt * dev->blocklen; 5140 5141 ctx->super->crc = blob_md_page_calc_crc(ctx->super); 5142 5143 num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); 5144 if (num_md_clusters > bs->total_clusters) { 5145 SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " 5146 "please decrease number of pages reserved for metadata " 5147 "or increase cluster size.\n"); 5148 spdk_free(ctx->super); 5149 spdk_bit_array_free(&ctx->used_clusters); 5150 free(ctx); 5151 bs_free(bs); 5152 cb_fn(cb_arg, NULL, -ENOMEM); 5153 return; 5154 } 5155 /* Claim all of the clusters used by the metadata */ 5156 for (i = 0; i < num_md_clusters; i++) { 5157 spdk_bit_array_set(ctx->used_clusters, i); 5158 } 5159 5160 bs->num_free_clusters -= num_md_clusters; 5161 bs->total_data_clusters = bs->num_free_clusters; 5162 5163 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 5164 cpl.u.bs_handle.cb_fn = cb_fn; 5165 cpl.u.bs_handle.cb_arg = cb_arg; 5166 cpl.u.bs_handle.bs = bs; 5167 5168 seq = bs_sequence_start(bs->md_channel, &cpl); 5169 if (!seq) { 5170 spdk_free(ctx->super); 5171 free(ctx); 5172 bs_free(bs); 5173 cb_fn(cb_arg, NULL, -ENOMEM); 5174 return; 5175 } 5176 5177 batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); 5178 5179 /* Clear metadata space */ 5180 bs_batch_write_zeroes_dev(batch, 0, num_md_lba); 5181 5182 lba = num_md_lba; 5183 lba_count = ctx->bs->dev->blockcnt - lba; 5184 switch (opts.clear_method) { 5185 case BS_CLEAR_WITH_UNMAP: 5186 /* Trim data clusters */ 5187 bs_batch_unmap_dev(batch, lba, lba_count); 5188 break; 5189 case BS_CLEAR_WITH_WRITE_ZEROES: 5190 /* Write_zeroes to data clusters */ 5191 bs_batch_write_zeroes_dev(batch, lba, lba_count); 5192 break; 5193 case 
BS_CLEAR_WITH_NONE: 5194 default: 5195 break; 5196 } 5197 5198 bs_batch_close(batch); 5199 } 5200 5201 /* END spdk_bs_init */ 5202 5203 /* START spdk_bs_destroy */ 5204 5205 static void 5206 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5207 { 5208 struct spdk_bs_load_ctx *ctx = cb_arg; 5209 struct spdk_blob_store *bs = ctx->bs; 5210 5211 /* 5212 * We need to defer calling bs_call_cpl() until after 5213 * dev destruction, so tuck these away for later use. 5214 */ 5215 bs->unload_err = bserrno; 5216 memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5217 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5218 5219 bs_sequence_finish(seq, bserrno); 5220 5221 bs_free(bs); 5222 free(ctx); 5223 } 5224 5225 void 5226 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, 5227 void *cb_arg) 5228 { 5229 struct spdk_bs_cpl cpl; 5230 spdk_bs_sequence_t *seq; 5231 struct spdk_bs_load_ctx *ctx; 5232 5233 SPDK_DEBUGLOG(blob, "Destroying blobstore\n"); 5234 5235 if (!RB_EMPTY(&bs->open_blobs)) { 5236 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5237 cb_fn(cb_arg, -EBUSY); 5238 return; 5239 } 5240 5241 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5242 cpl.u.bs_basic.cb_fn = cb_fn; 5243 cpl.u.bs_basic.cb_arg = cb_arg; 5244 5245 ctx = calloc(1, sizeof(*ctx)); 5246 if (!ctx) { 5247 cb_fn(cb_arg, -ENOMEM); 5248 return; 5249 } 5250 5251 ctx->bs = bs; 5252 5253 seq = bs_sequence_start(bs->md_channel, &cpl); 5254 if (!seq) { 5255 free(ctx); 5256 cb_fn(cb_arg, -ENOMEM); 5257 return; 5258 } 5259 5260 /* Write zeroes to the super block */ 5261 bs_sequence_write_zeroes_dev(seq, 5262 bs_page_to_lba(bs, 0), 5263 bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), 5264 bs_destroy_trim_cpl, ctx); 5265 } 5266 5267 /* END spdk_bs_destroy */ 5268 5269 /* START spdk_bs_unload */ 5270 5271 static void 5272 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) 5273 { 5274 spdk_bs_sequence_t *seq = ctx->seq; 5275 5276 spdk_free(ctx->super); 5277 5278 /* 5279 * We need to defer calling bs_call_cpl() until after 5280 * dev destruction, so tuck these away for later use. 
5281 */ 5282 ctx->bs->unload_err = bserrno; 5283 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5284 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5285 5286 bs_sequence_finish(seq, bserrno); 5287 5288 bs_free(ctx->bs); 5289 free(ctx); 5290 } 5291 5292 static void 5293 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5294 { 5295 struct spdk_bs_load_ctx *ctx = cb_arg; 5296 5297 bs_unload_finish(ctx, bserrno); 5298 } 5299 5300 static void 5301 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5302 { 5303 struct spdk_bs_load_ctx *ctx = cb_arg; 5304 5305 spdk_free(ctx->mask); 5306 5307 if (bserrno != 0) { 5308 bs_unload_finish(ctx, bserrno); 5309 return; 5310 } 5311 5312 ctx->super->clean = 1; 5313 5314 bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); 5315 } 5316 5317 static void 5318 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5319 { 5320 struct spdk_bs_load_ctx *ctx = cb_arg; 5321 5322 spdk_free(ctx->mask); 5323 ctx->mask = NULL; 5324 5325 if (bserrno != 0) { 5326 bs_unload_finish(ctx, bserrno); 5327 return; 5328 } 5329 5330 bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); 5331 } 5332 5333 static void 5334 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5335 { 5336 struct spdk_bs_load_ctx *ctx = cb_arg; 5337 5338 spdk_free(ctx->mask); 5339 ctx->mask = NULL; 5340 5341 if (bserrno != 0) { 5342 bs_unload_finish(ctx, bserrno); 5343 return; 5344 } 5345 5346 bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); 5347 } 5348 5349 static void 5350 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5351 { 5352 struct spdk_bs_load_ctx *ctx = cb_arg; 5353 5354 if (bserrno != 0) { 5355 bs_unload_finish(ctx, bserrno); 5356 return; 5357 } 5358 5359 bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl); 5360 } 5361 5362 void 5363 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) 5364 { 5365 struct spdk_bs_cpl cpl; 5366 struct spdk_bs_load_ctx *ctx; 5367 5368 SPDK_DEBUGLOG(blob, "Syncing blobstore\n"); 5369 5370 if (!RB_EMPTY(&bs->open_blobs)) { 5371 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5372 cb_fn(cb_arg, -EBUSY); 5373 return; 5374 } 5375 5376 ctx = calloc(1, sizeof(*ctx)); 5377 if (!ctx) { 5378 cb_fn(cb_arg, -ENOMEM); 5379 return; 5380 } 5381 5382 ctx->bs = bs; 5383 5384 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5385 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5386 if (!ctx->super) { 5387 free(ctx); 5388 cb_fn(cb_arg, -ENOMEM); 5389 return; 5390 } 5391 5392 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5393 cpl.u.bs_basic.cb_fn = cb_fn; 5394 cpl.u.bs_basic.cb_arg = cb_arg; 5395 5396 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5397 if (!ctx->seq) { 5398 spdk_free(ctx->super); 5399 free(ctx); 5400 cb_fn(cb_arg, -ENOMEM); 5401 return; 5402 } 5403 5404 /* Read super block */ 5405 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5406 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5407 bs_unload_read_super_cpl, ctx); 5408 } 5409 5410 /* END spdk_bs_unload */ 5411 5412 /* START spdk_bs_set_super */ 5413 5414 struct spdk_bs_set_super_ctx { 5415 struct spdk_blob_store *bs; 5416 struct spdk_bs_super_block *super; 5417 }; 5418 5419 static void 5420 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5421 { 5422 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5423 5424 if (bserrno != 
0) { 5425 SPDK_ERRLOG("Unable to write to super block of blobstore\n"); 5426 } 5427 5428 spdk_free(ctx->super); 5429 5430 bs_sequence_finish(seq, bserrno); 5431 5432 free(ctx); 5433 } 5434 5435 static void 5436 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5437 { 5438 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5439 5440 if (bserrno != 0) { 5441 SPDK_ERRLOG("Unable to read super block of blobstore\n"); 5442 spdk_free(ctx->super); 5443 bs_sequence_finish(seq, bserrno); 5444 free(ctx); 5445 return; 5446 } 5447 5448 bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); 5449 } 5450 5451 void 5452 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, 5453 spdk_bs_op_complete cb_fn, void *cb_arg) 5454 { 5455 struct spdk_bs_cpl cpl; 5456 spdk_bs_sequence_t *seq; 5457 struct spdk_bs_set_super_ctx *ctx; 5458 5459 SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n"); 5460 5461 ctx = calloc(1, sizeof(*ctx)); 5462 if (!ctx) { 5463 cb_fn(cb_arg, -ENOMEM); 5464 return; 5465 } 5466 5467 ctx->bs = bs; 5468 5469 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5470 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5471 if (!ctx->super) { 5472 free(ctx); 5473 cb_fn(cb_arg, -ENOMEM); 5474 return; 5475 } 5476 5477 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5478 cpl.u.bs_basic.cb_fn = cb_fn; 5479 cpl.u.bs_basic.cb_arg = cb_arg; 5480 5481 seq = bs_sequence_start(bs->md_channel, &cpl); 5482 if (!seq) { 5483 spdk_free(ctx->super); 5484 free(ctx); 5485 cb_fn(cb_arg, -ENOMEM); 5486 return; 5487 } 5488 5489 bs->super_blob = blobid; 5490 5491 /* Read super block */ 5492 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 5493 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5494 bs_set_super_read_cpl, ctx); 5495 } 5496 5497 /* END spdk_bs_set_super */ 5498 5499 void 5500 spdk_bs_get_super(struct spdk_blob_store *bs, 5501 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5502 { 5503 if (bs->super_blob == SPDK_BLOBID_INVALID) { 5504 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); 5505 } else { 5506 cb_fn(cb_arg, bs->super_blob, 0); 5507 } 5508 } 5509 5510 uint64_t 5511 spdk_bs_get_cluster_size(struct spdk_blob_store *bs) 5512 { 5513 return bs->cluster_sz; 5514 } 5515 5516 uint64_t 5517 spdk_bs_get_page_size(struct spdk_blob_store *bs) 5518 { 5519 return SPDK_BS_PAGE_SIZE; 5520 } 5521 5522 uint64_t 5523 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) 5524 { 5525 return bs->io_unit_size; 5526 } 5527 5528 uint64_t 5529 spdk_bs_free_cluster_count(struct spdk_blob_store *bs) 5530 { 5531 return bs->num_free_clusters; 5532 } 5533 5534 uint64_t 5535 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) 5536 { 5537 return bs->total_data_clusters; 5538 } 5539 5540 static int 5541 bs_register_md_thread(struct spdk_blob_store *bs) 5542 { 5543 bs->md_channel = spdk_get_io_channel(bs); 5544 if (!bs->md_channel) { 5545 SPDK_ERRLOG("Failed to get IO channel.\n"); 5546 return -1; 5547 } 5548 5549 return 0; 5550 } 5551 5552 static int 5553 bs_unregister_md_thread(struct spdk_blob_store *bs) 5554 { 5555 spdk_put_io_channel(bs->md_channel); 5556 5557 return 0; 5558 } 5559 5560 spdk_blob_id 5561 spdk_blob_get_id(struct spdk_blob *blob) 5562 { 5563 assert(blob != NULL); 5564 5565 return blob->id; 5566 } 5567 5568 uint64_t 5569 spdk_blob_get_num_pages(struct spdk_blob *blob) 5570 { 5571 assert(blob != NULL); 5572 5573 return bs_cluster_to_page(blob->bs, blob->active.num_clusters); 5574 } 5575 5576 uint64_t 5577 spdk_blob_get_num_io_units(struct spdk_blob *blob) 5578 { 
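	/* Note: a blobstore page is SPDK_BS_PAGE_SIZE bytes, so the blob's size in io_units is
	 * its page count scaled by bs_io_unit_per_page(), i.e. by how many io_units of
	 * bs->io_unit_size bytes fit in one page. */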
5579 assert(blob != NULL); 5580 5581 return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); 5582 } 5583 5584 uint64_t 5585 spdk_blob_get_num_clusters(struct spdk_blob *blob) 5586 { 5587 assert(blob != NULL); 5588 5589 return blob->active.num_clusters; 5590 } 5591 5592 /* START spdk_bs_create_blob */ 5593 5594 static void 5595 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5596 { 5597 struct spdk_blob *blob = cb_arg; 5598 uint32_t page_idx = bs_blobid_to_page(blob->id); 5599 5600 if (bserrno != 0) { 5601 spdk_bit_array_clear(blob->bs->used_blobids, page_idx); 5602 bs_release_md_page(blob->bs, page_idx); 5603 } 5604 5605 blob_free(blob); 5606 5607 bs_sequence_finish(seq, bserrno); 5608 } 5609 5610 static int 5611 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, 5612 bool internal) 5613 { 5614 uint64_t i; 5615 size_t value_len = 0; 5616 int rc; 5617 const void *value = NULL; 5618 if (xattrs->count > 0 && xattrs->get_value == NULL) { 5619 return -EINVAL; 5620 } 5621 for (i = 0; i < xattrs->count; i++) { 5622 xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); 5623 if (value == NULL || value_len == 0) { 5624 return -EINVAL; 5625 } 5626 rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); 5627 if (rc < 0) { 5628 return rc; 5629 } 5630 } 5631 return 0; 5632 } 5633 5634 static void 5635 blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst) 5636 { 5637 #define FIELD_OK(field) \ 5638 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 5639 5640 #define SET_FIELD(field) \ 5641 if (FIELD_OK(field)) { \ 5642 dst->field = src->field; \ 5643 } \ 5644 5645 SET_FIELD(num_clusters); 5646 SET_FIELD(thin_provision); 5647 SET_FIELD(clear_method); 5648 5649 if (FIELD_OK(xattrs)) { 5650 memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs)); 5651 } 5652 5653 SET_FIELD(use_extent_table); 5654 5655 dst->opts_size = src->opts_size; 5656 5657 /* You should not remove this statement, but need to update the assert statement 5658 * if you add a new field, and also add a corresponding SET_FIELD statement */ 5659 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 64, "Incorrect size"); 5660 5661 #undef FIELD_OK 5662 #undef SET_FIELD 5663 } 5664 5665 static void 5666 bs_create_blob(struct spdk_blob_store *bs, 5667 const struct spdk_blob_opts *opts, 5668 const struct spdk_blob_xattr_opts *internal_xattrs, 5669 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5670 { 5671 struct spdk_blob *blob; 5672 uint32_t page_idx; 5673 struct spdk_bs_cpl cpl; 5674 struct spdk_blob_opts opts_local; 5675 struct spdk_blob_xattr_opts internal_xattrs_default; 5676 spdk_bs_sequence_t *seq; 5677 spdk_blob_id id; 5678 int rc; 5679 5680 assert(spdk_get_thread() == bs->md_thread); 5681 5682 page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); 5683 if (page_idx == UINT32_MAX) { 5684 cb_fn(cb_arg, 0, -ENOMEM); 5685 return; 5686 } 5687 spdk_bit_array_set(bs->used_blobids, page_idx); 5688 bs_claim_md_page(bs, page_idx); 5689 5690 id = bs_page_to_blobid(page_idx); 5691 5692 SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx); 5693 5694 blob = blob_alloc(bs, id); 5695 if (!blob) { 5696 spdk_bit_array_clear(bs->used_blobids, page_idx); 5697 bs_release_md_page(bs, page_idx); 5698 cb_fn(cb_arg, 0, -ENOMEM); 5699 return; 5700 } 5701 5702 spdk_blob_opts_init(&opts_local, sizeof(opts_local)); 5703 if (opts) { 5704 blob_opts_copy(opts, &opts_local); 5705 
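		/* blob_opts_copy() only copies the fields that fall within the caller-declared
		 * opts_size, so opts_local keeps the spdk_blob_opts_init() defaults for any
		 * newer fields an older caller does not know about. */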
} 5706 5707 blob->use_extent_table = opts_local.use_extent_table; 5708 if (blob->use_extent_table) { 5709 blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; 5710 } 5711 5712 if (!internal_xattrs) { 5713 blob_xattrs_init(&internal_xattrs_default); 5714 internal_xattrs = &internal_xattrs_default; 5715 } 5716 5717 rc = blob_set_xattrs(blob, &opts_local.xattrs, false); 5718 if (rc < 0) { 5719 blob_free(blob); 5720 spdk_bit_array_clear(bs->used_blobids, page_idx); 5721 bs_release_md_page(bs, page_idx); 5722 cb_fn(cb_arg, 0, rc); 5723 return; 5724 } 5725 5726 rc = blob_set_xattrs(blob, internal_xattrs, true); 5727 if (rc < 0) { 5728 blob_free(blob); 5729 spdk_bit_array_clear(bs->used_blobids, page_idx); 5730 bs_release_md_page(bs, page_idx); 5731 cb_fn(cb_arg, 0, rc); 5732 return; 5733 } 5734 5735 if (opts_local.thin_provision) { 5736 blob_set_thin_provision(blob); 5737 } 5738 5739 blob_set_clear_method(blob, opts_local.clear_method); 5740 5741 rc = blob_resize(blob, opts_local.num_clusters); 5742 if (rc < 0) { 5743 blob_free(blob); 5744 spdk_bit_array_clear(bs->used_blobids, page_idx); 5745 bs_release_md_page(bs, page_idx); 5746 cb_fn(cb_arg, 0, rc); 5747 return; 5748 } 5749 cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5750 cpl.u.blobid.cb_fn = cb_fn; 5751 cpl.u.blobid.cb_arg = cb_arg; 5752 cpl.u.blobid.blobid = blob->id; 5753 5754 seq = bs_sequence_start(bs->md_channel, &cpl); 5755 if (!seq) { 5756 blob_free(blob); 5757 spdk_bit_array_clear(bs->used_blobids, page_idx); 5758 bs_release_md_page(bs, page_idx); 5759 cb_fn(cb_arg, 0, -ENOMEM); 5760 return; 5761 } 5762 5763 blob_persist(seq, blob, bs_create_blob_cpl, blob); 5764 } 5765 5766 void 5767 spdk_bs_create_blob(struct spdk_blob_store *bs, 5768 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5769 { 5770 bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); 5771 } 5772 5773 void 5774 spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, 5775 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5776 { 5777 bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); 5778 } 5779 5780 /* END spdk_bs_create_blob */ 5781 5782 /* START blob_cleanup */ 5783 5784 struct spdk_clone_snapshot_ctx { 5785 struct spdk_bs_cpl cpl; 5786 int bserrno; 5787 bool frozen; 5788 5789 struct spdk_io_channel *channel; 5790 5791 /* Current cluster for inflate operation */ 5792 uint64_t cluster; 5793 5794 /* For inflation force allocation of all unallocated clusters and remove 5795 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ 5796 bool allocate_all; 5797 5798 struct { 5799 spdk_blob_id id; 5800 struct spdk_blob *blob; 5801 bool md_ro; 5802 } original; 5803 struct { 5804 spdk_blob_id id; 5805 struct spdk_blob *blob; 5806 } new; 5807 5808 /* xattrs specified for snapshot/clones only. They have no impact on 5809 * the original blobs xattrs. 
*/ 5810 const struct spdk_blob_xattr_opts *xattrs; 5811 }; 5812 5813 static void 5814 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) 5815 { 5816 struct spdk_clone_snapshot_ctx *ctx = cb_arg; 5817 struct spdk_bs_cpl *cpl = &ctx->cpl; 5818 5819 if (bserrno != 0) { 5820 if (ctx->bserrno != 0) { 5821 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5822 } else { 5823 ctx->bserrno = bserrno; 5824 } 5825 } 5826 5827 switch (cpl->type) { 5828 case SPDK_BS_CPL_TYPE_BLOBID: 5829 cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); 5830 break; 5831 case SPDK_BS_CPL_TYPE_BLOB_BASIC: 5832 cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); 5833 break; 5834 default: 5835 SPDK_UNREACHABLE(); 5836 break; 5837 } 5838 5839 free(ctx); 5840 } 5841 5842 static void 5843 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 5844 { 5845 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5846 struct spdk_blob *origblob = ctx->original.blob; 5847 5848 if (bserrno != 0) { 5849 if (ctx->bserrno != 0) { 5850 SPDK_ERRLOG("Unfreeze error %d\n", bserrno); 5851 } else { 5852 ctx->bserrno = bserrno; 5853 } 5854 } 5855 5856 ctx->original.id = origblob->id; 5857 origblob->locked_operation_in_progress = false; 5858 5859 /* Revert md_ro to original state */ 5860 origblob->md_ro = ctx->original.md_ro; 5861 5862 spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); 5863 } 5864 5865 static void 5866 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) 5867 { 5868 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5869 struct spdk_blob *origblob = ctx->original.blob; 5870 5871 if (bserrno != 0) { 5872 if (ctx->bserrno != 0) { 5873 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5874 } else { 5875 ctx->bserrno = bserrno; 5876 } 5877 } 5878 5879 if (ctx->frozen) { 5880 /* Unfreeze any outstanding I/O */ 5881 blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); 5882 } else { 5883 bs_snapshot_unfreeze_cpl(ctx, 0); 5884 } 5885 5886 } 5887 5888 static void 5889 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno) 5890 { 5891 struct spdk_blob *newblob = ctx->new.blob; 5892 5893 if (bserrno != 0) { 5894 if (ctx->bserrno != 0) { 5895 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5896 } else { 5897 ctx->bserrno = bserrno; 5898 } 5899 } 5900 5901 ctx->new.id = newblob->id; 5902 spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 5903 } 5904 5905 /* END blob_cleanup */ 5906 5907 /* START spdk_bs_create_snapshot */ 5908 5909 static void 5910 bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2) 5911 { 5912 uint64_t *cluster_temp; 5913 uint32_t *extent_page_temp; 5914 5915 cluster_temp = blob1->active.clusters; 5916 blob1->active.clusters = blob2->active.clusters; 5917 blob2->active.clusters = cluster_temp; 5918 5919 extent_page_temp = blob1->active.extent_pages; 5920 blob1->active.extent_pages = blob2->active.extent_pages; 5921 blob2->active.extent_pages = extent_page_temp; 5922 } 5923 5924 static void 5925 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno) 5926 { 5927 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5928 struct spdk_blob *origblob = ctx->original.blob; 5929 struct spdk_blob *newblob = ctx->new.blob; 5930 5931 if (bserrno != 0) { 5932 bs_snapshot_swap_cluster_maps(newblob, origblob); 5933 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5934 return; 5935 } 5936 5937 /* Remove metadata descriptor 
SNAPSHOT_IN_PROGRESS */ 5938 bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true); 5939 if (bserrno != 0) { 5940 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5941 return; 5942 } 5943 5944 bs_blob_list_add(ctx->original.blob); 5945 5946 spdk_blob_set_read_only(newblob); 5947 5948 /* sync snapshot metadata */ 5949 spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 5950 } 5951 5952 static void 5953 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno) 5954 { 5955 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5956 struct spdk_blob *origblob = ctx->original.blob; 5957 struct spdk_blob *newblob = ctx->new.blob; 5958 5959 if (bserrno != 0) { 5960 /* return cluster map back to original */ 5961 bs_snapshot_swap_cluster_maps(newblob, origblob); 5962 5963 /* Newblob md sync failed. Valid clusters are only present in origblob. 5964 * Since I/O is frozen on origblob, no changes to the zeroed out cluster map should have occurred. 5965 * Newblob needs to be reverted to the thin-provisioned state it had at creation in order to close properly. */ 5966 blob_set_thin_provision(newblob); 5967 assert(spdk_mem_all_zero(newblob->active.clusters, 5968 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 5969 assert(spdk_mem_all_zero(newblob->active.extent_pages, 5970 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 5971 5972 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5973 return; 5974 } 5975 5976 /* Set internal xattr for snapshot id */ 5977 bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); 5978 if (bserrno != 0) { 5979 /* return cluster map back to original */ 5980 bs_snapshot_swap_cluster_maps(newblob, origblob); 5981 blob_set_thin_provision(newblob); 5982 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5983 return; 5984 } 5985 5986 /* Create new back_bs_dev for snapshot */ 5987 origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); 5988 if (origblob->back_bs_dev == NULL) { 5989 /* return cluster map back to original */ 5990 bs_snapshot_swap_cluster_maps(newblob, origblob); 5991 blob_set_thin_provision(newblob); 5992 bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); 5993 return; 5994 } 5995 5996 bs_blob_list_remove(origblob); 5997 origblob->parent_id = newblob->id; 5998 /* set clone blob as thin provisioned */ 5999 blob_set_thin_provision(origblob); 6000 6001 bs_blob_list_add(newblob); 6002 6003 /* sync clone metadata */ 6004 spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); 6005 } 6006 6007 static void 6008 bs_snapshot_freeze_cpl(void *cb_arg, int rc) 6009 { 6010 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6011 struct spdk_blob *origblob = ctx->original.blob; 6012 struct spdk_blob *newblob = ctx->new.blob; 6013 int bserrno; 6014 6015 if (rc != 0) { 6016 bs_clone_snapshot_newblob_cleanup(ctx, rc); 6017 return; 6018 } 6019 6020 ctx->frozen = true; 6021 6022 if (newblob->back_bs_dev) { 6023 newblob->back_bs_dev->destroy(newblob->back_bs_dev); 6024 } 6025 /* set new back_bs_dev for snapshot */ 6026 newblob->back_bs_dev = origblob->back_bs_dev; 6027 /* Set invalid flags from origblob */ 6028 newblob->invalid_flags = origblob->invalid_flags; 6029 6030 /* inherit parent from original blob if set */ 6031 newblob->parent_id = origblob->parent_id; 6032 if (origblob->parent_id != SPDK_BLOBID_INVALID) { 6033 /* Set internal xattr for snapshot id */ 6034 bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, 6035 &origblob->parent_id,
sizeof(spdk_blob_id), true); 6036 if (bserrno != 0) { 6037 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6038 return; 6039 } 6040 } 6041 6042 /* swap cluster maps */ 6043 bs_snapshot_swap_cluster_maps(newblob, origblob); 6044 6045 /* Set the clear method on the new blob to match the original. */ 6046 blob_set_clear_method(newblob, origblob->clear_method); 6047 6048 /* sync snapshot metadata */ 6049 spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); 6050 } 6051 6052 static void 6053 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6054 { 6055 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6056 struct spdk_blob *origblob = ctx->original.blob; 6057 struct spdk_blob *newblob = _blob; 6058 6059 if (bserrno != 0) { 6060 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6061 return; 6062 } 6063 6064 ctx->new.blob = newblob; 6065 assert(spdk_blob_is_thin_provisioned(newblob)); 6066 assert(spdk_mem_all_zero(newblob->active.clusters, 6067 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 6068 assert(spdk_mem_all_zero(newblob->active.extent_pages, 6069 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 6070 6071 blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); 6072 } 6073 6074 static void 6075 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6076 { 6077 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6078 struct spdk_blob *origblob = ctx->original.blob; 6079 6080 if (bserrno != 0) { 6081 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6082 return; 6083 } 6084 6085 ctx->new.id = blobid; 6086 ctx->cpl.u.blobid.blobid = blobid; 6087 6088 spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); 6089 } 6090 6091 6092 static void 6093 bs_xattr_snapshot(void *arg, const char *name, 6094 const void **value, size_t *value_len) 6095 { 6096 assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); 6097 6098 struct spdk_blob *blob = (struct spdk_blob *)arg; 6099 *value = &blob->id; 6100 *value_len = sizeof(blob->id); 6101 } 6102 6103 static void 6104 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6105 { 6106 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6107 struct spdk_blob_opts opts; 6108 struct spdk_blob_xattr_opts internal_xattrs; 6109 char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; 6110 6111 if (bserrno != 0) { 6112 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6113 return; 6114 } 6115 6116 ctx->original.blob = _blob; 6117 6118 if (_blob->data_ro || _blob->md_ro) { 6119 SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n", 6120 _blob->id); 6121 ctx->bserrno = -EINVAL; 6122 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6123 return; 6124 } 6125 6126 if (_blob->locked_operation_in_progress) { 6127 SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n"); 6128 ctx->bserrno = -EBUSY; 6129 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6130 return; 6131 } 6132 6133 _blob->locked_operation_in_progress = true; 6134 6135 spdk_blob_opts_init(&opts, sizeof(opts)); 6136 blob_xattrs_init(&internal_xattrs); 6137 6138 /* Change the size of new blob to the same as in original blob, 6139 * but do not allocate clusters */ 6140 opts.thin_provision = true; 6141 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6142 
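/* The snapshot must use the same extent table format as the original blob so that the cluster maps swapped later in bs_snapshot_freeze_cpl remain consistent. */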
opts.use_extent_table = _blob->use_extent_table; 6143 6144 /* If there are any xattrs specified for snapshot, set them now */ 6145 if (ctx->xattrs) { 6146 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6147 } 6148 /* Set internal xattr SNAPSHOT_IN_PROGRESS */ 6149 internal_xattrs.count = 1; 6150 internal_xattrs.ctx = _blob; 6151 internal_xattrs.names = xattrs_names; 6152 internal_xattrs.get_value = bs_xattr_snapshot; 6153 6154 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6155 bs_snapshot_newblob_create_cpl, ctx); 6156 } 6157 6158 void 6159 spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, 6160 const struct spdk_blob_xattr_opts *snapshot_xattrs, 6161 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6162 { 6163 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6164 6165 if (!ctx) { 6166 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6167 return; 6168 } 6169 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6170 ctx->cpl.u.blobid.cb_fn = cb_fn; 6171 ctx->cpl.u.blobid.cb_arg = cb_arg; 6172 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6173 ctx->bserrno = 0; 6174 ctx->frozen = false; 6175 ctx->original.id = blobid; 6176 ctx->xattrs = snapshot_xattrs; 6177 6178 spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); 6179 } 6180 /* END spdk_bs_create_snapshot */ 6181 6182 /* START spdk_bs_create_clone */ 6183 6184 static void 6185 bs_xattr_clone(void *arg, const char *name, 6186 const void **value, size_t *value_len) 6187 { 6188 assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); 6189 6190 struct spdk_blob *blob = (struct spdk_blob *)arg; 6191 *value = &blob->id; 6192 *value_len = sizeof(blob->id); 6193 } 6194 6195 static void 6196 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6197 { 6198 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6199 struct spdk_blob *clone = _blob; 6200 6201 ctx->new.blob = clone; 6202 bs_blob_list_add(clone); 6203 6204 spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); 6205 } 6206 6207 static void 6208 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6209 { 6210 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6211 6212 ctx->cpl.u.blobid.blobid = blobid; 6213 spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); 6214 } 6215 6216 static void 6217 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6218 { 6219 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6220 struct spdk_blob_opts opts; 6221 struct spdk_blob_xattr_opts internal_xattrs; 6222 char *xattr_names[] = { BLOB_SNAPSHOT }; 6223 6224 if (bserrno != 0) { 6225 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6226 return; 6227 } 6228 6229 ctx->original.blob = _blob; 6230 ctx->original.md_ro = _blob->md_ro; 6231 6232 if (!_blob->data_ro || !_blob->md_ro) { 6233 SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n"); 6234 ctx->bserrno = -EINVAL; 6235 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6236 return; 6237 } 6238 6239 if (_blob->locked_operation_in_progress) { 6240 SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n"); 6241 ctx->bserrno = -EBUSY; 6242 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6243 return; 6244 } 6245 6246 _blob->locked_operation_in_progress = true; 6247 6248 spdk_blob_opts_init(&opts, sizeof(opts)); 6249 
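/* The clone is created thin provisioned with the same logical size as the snapshot; clusters it has not yet written are read from the snapshot through back_bs_dev. */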
blob_xattrs_init(&internal_xattrs); 6250 6251 opts.thin_provision = true; 6252 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6253 opts.use_extent_table = _blob->use_extent_table; 6254 if (ctx->xattrs) { 6255 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6256 } 6257 6258 /* Set internal xattr BLOB_SNAPSHOT */ 6259 internal_xattrs.count = 1; 6260 internal_xattrs.ctx = _blob; 6261 internal_xattrs.names = xattr_names; 6262 internal_xattrs.get_value = bs_xattr_clone; 6263 6264 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6265 bs_clone_newblob_create_cpl, ctx); 6266 } 6267 6268 void 6269 spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, 6270 const struct spdk_blob_xattr_opts *clone_xattrs, 6271 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6272 { 6273 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6274 6275 if (!ctx) { 6276 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6277 return; 6278 } 6279 6280 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6281 ctx->cpl.u.blobid.cb_fn = cb_fn; 6282 ctx->cpl.u.blobid.cb_arg = cb_arg; 6283 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6284 ctx->bserrno = 0; 6285 ctx->xattrs = clone_xattrs; 6286 ctx->original.id = blobid; 6287 6288 spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); 6289 } 6290 6291 /* END spdk_bs_create_clone */ 6292 6293 /* START spdk_bs_inflate_blob */ 6294 6295 static void 6296 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) 6297 { 6298 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6299 struct spdk_blob *_blob = ctx->original.blob; 6300 6301 if (bserrno != 0) { 6302 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6303 return; 6304 } 6305 6306 /* Temporarily override md_ro flag for MD modification */ 6307 _blob->md_ro = false; 6308 6309 bserrno = blob_set_xattr(_blob, BLOB_SNAPSHOT, &_parent->id, sizeof(spdk_blob_id), true); 6310 if (bserrno != 0) { 6311 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6312 return; 6313 } 6314 6315 assert(_parent != NULL); 6316 6317 bs_blob_list_remove(_blob); 6318 _blob->parent_id = _parent->id; 6319 6320 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6321 _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); 6322 bs_blob_list_add(_blob); 6323 6324 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6325 } 6326 6327 static void 6328 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx) 6329 { 6330 struct spdk_blob *_blob = ctx->original.blob; 6331 struct spdk_blob *_parent; 6332 6333 if (ctx->allocate_all) { 6334 /* remove thin provisioning */ 6335 bs_blob_list_remove(_blob); 6336 _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; 6337 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6338 _blob->back_bs_dev = NULL; 6339 _blob->parent_id = SPDK_BLOBID_INVALID; 6340 } else { 6341 _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; 6342 if (_parent->parent_id != SPDK_BLOBID_INVALID) { 6343 /* We must change the parent of the inflated blob */ 6344 spdk_bs_open_blob(_blob->bs, _parent->parent_id, 6345 bs_inflate_blob_set_parent_cpl, ctx); 6346 return; 6347 } 6348 6349 bs_blob_list_remove(_blob); 6350 _blob->parent_id = SPDK_BLOBID_INVALID; 6351 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6352 _blob->back_bs_dev = bs_create_zeroes_dev(); 6353 } 6354 6355 /* Temporarily override md_ro flag for MD modification */ 6356 _blob->md_ro = false; 6357 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6358 
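/* Mark the metadata dirty explicitly so the sync below persists the parent change even when there was no BLOB_SNAPSHOT xattr to remove. */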
_blob->state = SPDK_BLOB_STATE_DIRTY; 6359 6360 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6361 } 6362 6363 /* Check if cluster needs allocation */ 6364 static inline bool 6365 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) 6366 { 6367 struct spdk_blob_bs_dev *b; 6368 6369 assert(blob != NULL); 6370 6371 if (blob->active.clusters[cluster] != 0) { 6372 /* Cluster is already allocated */ 6373 return false; 6374 } 6375 6376 if (blob->parent_id == SPDK_BLOBID_INVALID) { 6377 /* Blob has no parent blob */ 6378 return allocate_all; 6379 } 6380 6381 b = (struct spdk_blob_bs_dev *)blob->back_bs_dev; 6382 return (allocate_all || b->blob->active.clusters[cluster] != 0); 6383 } 6384 6385 static void 6386 bs_inflate_blob_touch_next(void *cb_arg, int bserrno) 6387 { 6388 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6389 struct spdk_blob *_blob = ctx->original.blob; 6390 struct spdk_bs_cpl cpl; 6391 spdk_bs_user_op_t *op; 6392 uint64_t offset; 6393 6394 if (bserrno != 0) { 6395 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6396 return; 6397 } 6398 6399 for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { 6400 if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { 6401 break; 6402 } 6403 } 6404 6405 if (ctx->cluster < _blob->active.num_clusters) { 6406 offset = bs_cluster_to_lba(_blob->bs, ctx->cluster); 6407 6408 /* We may safely increment the cluster counter before copying */ 6409 ctx->cluster++; 6410 6411 /* Use a dummy 0B read as a context for cluster copy */ 6412 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6413 cpl.u.blob_basic.cb_fn = bs_inflate_blob_touch_next; 6414 cpl.u.blob_basic.cb_arg = ctx; 6415 6416 op = bs_user_op_alloc(ctx->channel, &cpl, SPDK_BLOB_READ, _blob, 6417 NULL, 0, offset, 0); 6418 if (!op) { 6419 bs_clone_snapshot_origblob_cleanup(ctx, -ENOMEM); 6420 return; 6421 } 6422 6423 bs_allocate_and_copy_cluster(_blob, ctx->channel, offset, op); 6424 } else { 6425 bs_inflate_blob_done(ctx); 6426 } 6427 } 6428 6429 static void 6430 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6431 { 6432 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6433 uint64_t clusters_needed; 6434 uint64_t i; 6435 6436 if (bserrno != 0) { 6437 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6438 return; 6439 } 6440 6441 ctx->original.blob = _blob; 6442 ctx->original.md_ro = _blob->md_ro; 6443 6444 if (_blob->locked_operation_in_progress) { 6445 SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n"); 6446 ctx->bserrno = -EBUSY; 6447 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6448 return; 6449 } 6450 6451 _blob->locked_operation_in_progress = true; 6452 6453 if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { 6454 /* This blob has no parent, so we cannot decouple it. */ 6455 SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); 6456 bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); 6457 return; 6458 } 6459 6460 if (spdk_blob_is_thin_provisioned(_blob) == false) { 6461 /* This is not a thin-provisioned blob. There is no need to inflate it. */ 6462 bs_clone_snapshot_origblob_cleanup(ctx, 0); 6463 return; 6464 } 6465 6466 /* Do two passes - one to verify that we can obtain enough clusters 6467 * and another to actually claim them.
6468 */ 6469 clusters_needed = 0; 6470 for (i = 0; i < _blob->active.num_clusters; i++) { 6471 if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { 6472 clusters_needed++; 6473 } 6474 } 6475 6476 if (clusters_needed > _blob->bs->num_free_clusters) { 6477 /* Not enough free clusters. Cannot satisfy the request. */ 6478 bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); 6479 return; 6480 } 6481 6482 ctx->cluster = 0; 6483 bs_inflate_blob_touch_next(ctx, 0); 6484 } 6485 6486 static void 6487 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6488 spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) 6489 { 6490 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6491 6492 if (!ctx) { 6493 cb_fn(cb_arg, -ENOMEM); 6494 return; 6495 } 6496 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6497 ctx->cpl.u.bs_basic.cb_fn = cb_fn; 6498 ctx->cpl.u.bs_basic.cb_arg = cb_arg; 6499 ctx->bserrno = 0; 6500 ctx->original.id = blobid; 6501 ctx->channel = channel; 6502 ctx->allocate_all = allocate_all; 6503 6504 spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); 6505 } 6506 6507 void 6508 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6509 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6510 { 6511 bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); 6512 } 6513 6514 void 6515 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6516 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6517 { 6518 bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); 6519 } 6520 /* END spdk_bs_inflate_blob */ 6521 6522 /* START spdk_blob_resize */ 6523 struct spdk_bs_resize_ctx { 6524 spdk_blob_op_complete cb_fn; 6525 void *cb_arg; 6526 struct spdk_blob *blob; 6527 uint64_t sz; 6528 int rc; 6529 }; 6530 6531 static void 6532 bs_resize_unfreeze_cpl(void *cb_arg, int rc) 6533 { 6534 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6535 6536 if (rc != 0) { 6537 SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); 6538 } 6539 6540 if (ctx->rc != 0) { 6541 SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); 6542 rc = ctx->rc; 6543 } 6544 6545 ctx->blob->locked_operation_in_progress = false; 6546 6547 ctx->cb_fn(ctx->cb_arg, rc); 6548 free(ctx); 6549 } 6550 6551 static void 6552 bs_resize_freeze_cpl(void *cb_arg, int rc) 6553 { 6554 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6555 6556 if (rc != 0) { 6557 ctx->blob->locked_operation_in_progress = false; 6558 ctx->cb_fn(ctx->cb_arg, rc); 6559 free(ctx); 6560 return; 6561 } 6562 6563 ctx->rc = blob_resize(ctx->blob, ctx->sz); 6564 6565 blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); 6566 } 6567 6568 void 6569 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) 6570 { 6571 struct spdk_bs_resize_ctx *ctx; 6572 6573 blob_verify_md_op(blob); 6574 6575 SPDK_DEBUGLOG(blob, "Resizing blob %" PRIu64 " to %" PRIu64 " clusters\n", blob->id, sz); 6576 6577 if (blob->md_ro) { 6578 cb_fn(cb_arg, -EPERM); 6579 return; 6580 } 6581 6582 if (sz == blob->active.num_clusters) { 6583 cb_fn(cb_arg, 0); 6584 return; 6585 } 6586 6587 if (blob->locked_operation_in_progress) { 6588 cb_fn(cb_arg, -EBUSY); 6589 return; 6590 } 6591 6592 ctx = calloc(1, sizeof(*ctx)); 6593 if (!ctx) { 6594 cb_fn(cb_arg, -ENOMEM); 6595 return; 6596 } 6597 6598 blob->locked_operation_in_progress = true; 6599 ctx->cb_fn = cb_fn; 6600 
ctx->cb_arg = cb_arg; 6601 ctx->blob = blob; 6602 ctx->sz = sz; 6603 blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); 6604 } 6605 6606 /* END spdk_blob_resize */ 6607 6608 6609 /* START spdk_bs_delete_blob */ 6610 6611 static void 6612 bs_delete_close_cpl(void *cb_arg, int bserrno) 6613 { 6614 spdk_bs_sequence_t *seq = cb_arg; 6615 6616 bs_sequence_finish(seq, bserrno); 6617 } 6618 6619 static void 6620 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6621 { 6622 struct spdk_blob *blob = cb_arg; 6623 6624 if (bserrno != 0) { 6625 /* 6626 * We already removed this blob from the blobstore tailq, so 6627 * we need to free it here since this is the last reference 6628 * to it. 6629 */ 6630 blob_free(blob); 6631 bs_delete_close_cpl(seq, bserrno); 6632 return; 6633 } 6634 6635 /* 6636 * This will immediately decrement the ref_count and call 6637 * the completion routine since the metadata state is clean. 6638 * By calling spdk_blob_close, we reduce the number of call 6639 * points into code that touches the blob->open_ref count 6640 * and the blobstore's blob list. 6641 */ 6642 spdk_blob_close(blob, bs_delete_close_cpl, seq); 6643 } 6644 6645 struct delete_snapshot_ctx { 6646 struct spdk_blob_list *parent_snapshot_entry; 6647 struct spdk_blob *snapshot; 6648 struct spdk_blob_md_page *page; 6649 bool snapshot_md_ro; 6650 struct spdk_blob *clone; 6651 bool clone_md_ro; 6652 spdk_blob_op_with_handle_complete cb_fn; 6653 void *cb_arg; 6654 int bserrno; 6655 uint32_t next_extent_page; 6656 }; 6657 6658 static void 6659 delete_blob_cleanup_finish(void *cb_arg, int bserrno) 6660 { 6661 struct delete_snapshot_ctx *ctx = cb_arg; 6662 6663 if (bserrno != 0) { 6664 SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); 6665 } 6666 6667 assert(ctx != NULL); 6668 6669 if (bserrno != 0 && ctx->bserrno == 0) { 6670 ctx->bserrno = bserrno; 6671 } 6672 6673 ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); 6674 spdk_free(ctx->page); 6675 free(ctx); 6676 } 6677 6678 static void 6679 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) 6680 { 6681 struct delete_snapshot_ctx *ctx = cb_arg; 6682 6683 if (bserrno != 0) { 6684 ctx->bserrno = bserrno; 6685 SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); 6686 } 6687 6688 if (ctx->bserrno != 0) { 6689 assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); 6690 RB_INSERT(spdk_blob_tree, &ctx->snapshot->bs->open_blobs, ctx->snapshot); 6691 spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); 6692 } 6693 6694 ctx->snapshot->locked_operation_in_progress = false; 6695 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6696 6697 spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); 6698 } 6699 6700 static void 6701 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) 6702 { 6703 struct delete_snapshot_ctx *ctx = cb_arg; 6704 6705 ctx->clone->locked_operation_in_progress = false; 6706 ctx->clone->md_ro = ctx->clone_md_ro; 6707 6708 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6709 } 6710 6711 static void 6712 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 6713 { 6714 struct delete_snapshot_ctx *ctx = cb_arg; 6715 6716 if (bserrno) { 6717 ctx->bserrno = bserrno; 6718 delete_snapshot_cleanup_clone(ctx, 0); 6719 return; 6720 } 6721 6722 ctx->clone->locked_operation_in_progress = false; 6723 spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); 6724 } 6725 6726 static void 6727 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) 6728 { 6729 struct 
delete_snapshot_ctx *ctx = cb_arg; 6730 struct spdk_blob_list *parent_snapshot_entry = NULL; 6731 struct spdk_blob_list *snapshot_entry = NULL; 6732 struct spdk_blob_list *clone_entry = NULL; 6733 struct spdk_blob_list *snapshot_clone_entry = NULL; 6734 6735 if (bserrno) { 6736 SPDK_ERRLOG("Failed to sync MD on blob\n"); 6737 ctx->bserrno = bserrno; 6738 delete_snapshot_cleanup_clone(ctx, 0); 6739 return; 6740 } 6741 6742 /* Get snapshot entry for the snapshot we want to remove */ 6743 snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); 6744 6745 assert(snapshot_entry != NULL); 6746 6747 /* Remove clone entry in this snapshot (at this point there can be only one clone) */ 6748 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6749 assert(clone_entry != NULL); 6750 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 6751 snapshot_entry->clone_count--; 6752 assert(TAILQ_EMPTY(&snapshot_entry->clones)); 6753 6754 if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { 6755 /* This snapshot is at the same time a clone of another snapshot - we need to 6756 * update parent snapshot (remove current clone, add new one inherited from 6757 * the snapshot that is being removed) */ 6758 6759 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6760 * snapshot that we are removing */ 6761 blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, 6762 &snapshot_clone_entry); 6763 6764 /* Switch clone entry in parent snapshot */ 6765 TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); 6766 TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); 6767 free(snapshot_clone_entry); 6768 } else { 6769 /* No parent snapshot - just remove clone entry */ 6770 free(clone_entry); 6771 } 6772 6773 /* Restore md_ro flags */ 6774 ctx->clone->md_ro = ctx->clone_md_ro; 6775 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6776 6777 blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); 6778 } 6779 6780 static void 6781 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) 6782 { 6783 struct delete_snapshot_ctx *ctx = cb_arg; 6784 uint64_t i; 6785 6786 ctx->snapshot->md_ro = false; 6787 6788 if (bserrno) { 6789 SPDK_ERRLOG("Failed to sync MD on clone\n"); 6790 ctx->bserrno = bserrno; 6791 6792 /* Restore snapshot to previous state */ 6793 bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); 6794 if (bserrno != 0) { 6795 delete_snapshot_cleanup_clone(ctx, bserrno); 6796 return; 6797 } 6798 6799 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); 6800 return; 6801 } 6802 6803 /* Clear cluster map entries for snapshot */ 6804 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6805 if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { 6806 ctx->snapshot->active.clusters[i] = 0; 6807 } 6808 } 6809 for (i = 0; i < ctx->snapshot->active.num_extent_pages && 6810 i < ctx->clone->active.num_extent_pages; i++) { 6811 if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { 6812 ctx->snapshot->active.extent_pages[i] = 0; 6813 } 6814 } 6815 6816 blob_set_thin_provision(ctx->snapshot); 6817 ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; 6818 6819 if (ctx->parent_snapshot_entry != NULL) { 6820 ctx->snapshot->back_bs_dev = NULL; 6821 } 6822 6823 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); 6824 } 6825 6826 static void 6827 
delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx) 6828 { 6829 /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ 6830 ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); 6831 6832 /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */ 6833 if (ctx->parent_snapshot_entry != NULL) { 6834 /* ...to parent snapshot */ 6835 ctx->clone->parent_id = ctx->parent_snapshot_entry->id; 6836 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; 6837 blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, 6838 sizeof(spdk_blob_id), 6839 true); 6840 } else { 6841 /* ...to blobid invalid and zeroes dev */ 6842 ctx->clone->parent_id = SPDK_BLOBID_INVALID; 6843 ctx->clone->back_bs_dev = bs_create_zeroes_dev(); 6844 blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); 6845 } 6846 6847 spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); 6848 } 6849 6850 static void 6851 delete_snapshot_update_extent_pages(void *cb_arg, int bserrno) 6852 { 6853 struct delete_snapshot_ctx *ctx = cb_arg; 6854 uint32_t *extent_page; 6855 uint64_t i; 6856 6857 for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages && 6858 i < ctx->clone->active.num_extent_pages; i++) { 6859 if (ctx->snapshot->active.extent_pages[i] == 0) { 6860 /* No extent page to use from snapshot */ 6861 continue; 6862 } 6863 6864 extent_page = &ctx->clone->active.extent_pages[i]; 6865 if (*extent_page == 0) { 6866 /* Copy extent page from snapshot when clone did not have a matching one */ 6867 *extent_page = ctx->snapshot->active.extent_pages[i]; 6868 continue; 6869 } 6870 6871 /* Clone and snapshot both contain partially filled matching extent pages. 6872 * Update the clone extent page in place with cluster map containing the mix of both. 
*/ 6873 ctx->next_extent_page = i + 1; 6874 memset(ctx->page, 0, SPDK_BS_PAGE_SIZE); 6875 6876 blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, ctx->page, 6877 delete_snapshot_update_extent_pages, ctx); 6878 return; 6879 } 6880 delete_snapshot_update_extent_pages_cpl(ctx); 6881 } 6882 6883 static void 6884 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) 6885 { 6886 struct delete_snapshot_ctx *ctx = cb_arg; 6887 uint64_t i; 6888 6889 /* Temporarily override md_ro flag for clone for MD modification */ 6890 ctx->clone_md_ro = ctx->clone->md_ro; 6891 ctx->clone->md_ro = false; 6892 6893 if (bserrno) { 6894 SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); 6895 ctx->bserrno = bserrno; 6896 delete_snapshot_cleanup_clone(ctx, 0); 6897 return; 6898 } 6899 6900 /* Copy snapshot map to clone map (only unallocated clusters in clone) */ 6901 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6902 if (ctx->clone->active.clusters[i] == 0) { 6903 ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; 6904 } 6905 } 6906 ctx->next_extent_page = 0; 6907 delete_snapshot_update_extent_pages(ctx, 0); 6908 } 6909 6910 static void 6911 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) 6912 { 6913 struct delete_snapshot_ctx *ctx = cb_arg; 6914 6915 if (bserrno) { 6916 SPDK_ERRLOG("Failed to freeze I/O on clone\n"); 6917 ctx->bserrno = bserrno; 6918 delete_snapshot_cleanup_clone(ctx, 0); 6919 return; 6920 } 6921 6922 /* Temporarily override md_ro flag for snapshot for MD modification */ 6923 ctx->snapshot_md_ro = ctx->snapshot->md_ro; 6924 ctx->snapshot->md_ro = false; 6925 6926 /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ 6927 ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, 6928 sizeof(spdk_blob_id), true); 6929 if (ctx->bserrno != 0) { 6930 delete_snapshot_cleanup_clone(ctx, 0); 6931 return; 6932 } 6933 6934 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx); 6935 } 6936 6937 static void 6938 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) 6939 { 6940 struct delete_snapshot_ctx *ctx = cb_arg; 6941 6942 if (bserrno) { 6943 SPDK_ERRLOG("Failed to open clone\n"); 6944 ctx->bserrno = bserrno; 6945 delete_snapshot_cleanup_snapshot(ctx, 0); 6946 return; 6947 } 6948 6949 ctx->clone = clone; 6950 6951 if (clone->locked_operation_in_progress) { 6952 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n"); 6953 ctx->bserrno = -EBUSY; 6954 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6955 return; 6956 } 6957 6958 clone->locked_operation_in_progress = true; 6959 6960 blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); 6961 } 6962 6963 static void 6964 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) 6965 { 6966 struct spdk_blob_list *snapshot_entry = NULL; 6967 struct spdk_blob_list *clone_entry = NULL; 6968 struct spdk_blob_list *snapshot_clone_entry = NULL; 6969 6970 /* Get snapshot entry for the snapshot we want to remove */ 6971 snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); 6972 6973 assert(snapshot_entry != NULL); 6974 6975 /* Get clone of the snapshot (at this point there can be only one clone) */ 6976 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6977 assert(snapshot_entry->clone_count == 1); 6978 assert(clone_entry != NULL); 
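/* Open the single clone so that its cluster map, extent pages and parent linkage can be updated before the snapshot itself is removed. */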
6979 6980 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6981 * snapshot that we are removing */ 6982 blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, 6983 &snapshot_clone_entry); 6984 6985 spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); 6986 } 6987 6988 static void 6989 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) 6990 { 6991 spdk_bs_sequence_t *seq = cb_arg; 6992 struct spdk_blob_list *snapshot_entry = NULL; 6993 uint32_t page_num; 6994 6995 if (bserrno) { 6996 SPDK_ERRLOG("Failed to remove blob\n"); 6997 bs_sequence_finish(seq, bserrno); 6998 return; 6999 } 7000 7001 /* Remove snapshot from the list */ 7002 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7003 if (snapshot_entry != NULL) { 7004 TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); 7005 free(snapshot_entry); 7006 } 7007 7008 page_num = bs_blobid_to_page(blob->id); 7009 spdk_bit_array_clear(blob->bs->used_blobids, page_num); 7010 blob->state = SPDK_BLOB_STATE_DIRTY; 7011 blob->active.num_pages = 0; 7012 blob_resize(blob, 0); 7013 7014 blob_persist(seq, blob, bs_delete_persist_cpl, blob); 7015 } 7016 7017 static int 7018 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) 7019 { 7020 struct spdk_blob_list *snapshot_entry = NULL; 7021 struct spdk_blob_list *clone_entry = NULL; 7022 struct spdk_blob *clone = NULL; 7023 bool has_one_clone = false; 7024 7025 /* Check if this is a snapshot with clones */ 7026 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7027 if (snapshot_entry != NULL) { 7028 if (snapshot_entry->clone_count > 1) { 7029 SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); 7030 return -EBUSY; 7031 } else if (snapshot_entry->clone_count == 1) { 7032 has_one_clone = true; 7033 } 7034 } 7035 7036 /* Check if someone has this blob open (besides this delete context): 7037 * - open_ref = 1 - only this context opened blob, so it is ok to remove it 7038 * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot 7039 * and that is ok, because we will update it accordingly */ 7040 if (blob->open_ref <= 2 && has_one_clone) { 7041 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 7042 assert(clone_entry != NULL); 7043 clone = blob_lookup(blob->bs, clone_entry->id); 7044 7045 if (blob->open_ref == 2 && clone == NULL) { 7046 /* Clone is closed and someone else opened this blob */ 7047 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7048 return -EBUSY; 7049 } 7050 7051 *update_clone = true; 7052 return 0; 7053 } 7054 7055 if (blob->open_ref > 1) { 7056 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7057 return -EBUSY; 7058 } 7059 7060 assert(has_one_clone == false); 7061 *update_clone = false; 7062 return 0; 7063 } 7064 7065 static void 7066 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) 7067 { 7068 spdk_bs_sequence_t *seq = cb_arg; 7069 7070 bs_sequence_finish(seq, -ENOMEM); 7071 } 7072 7073 static void 7074 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) 7075 { 7076 spdk_bs_sequence_t *seq = cb_arg; 7077 struct delete_snapshot_ctx *ctx; 7078 bool update_clone = false; 7079 7080 if (bserrno != 0) { 7081 bs_sequence_finish(seq, bserrno); 7082 return; 7083 } 7084 7085 blob_verify_md_op(blob); 7086 7087 ctx = calloc(1, sizeof(*ctx)); 7088 if (ctx == NULL) { 7089 spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); 7090 return; 7091 } 7092 7093 ctx->snapshot = blob; 7094 ctx->cb_fn 
= bs_delete_blob_finish; 7095 ctx->cb_arg = seq; 7096 7097 /* Check if blob can be removed and if it is a snapshot with clone on top of it */ 7098 ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); 7099 if (ctx->bserrno) { 7100 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7101 return; 7102 } 7103 7104 if (blob->locked_operation_in_progress) { 7105 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n"); 7106 ctx->bserrno = -EBUSY; 7107 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7108 return; 7109 } 7110 7111 blob->locked_operation_in_progress = true; 7112 7113 /* 7114 * Remove the blob from the blob_store list now, to ensure it does not 7115 * get returned after this point by blob_lookup(). 7116 */ 7117 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7118 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7119 7120 if (update_clone) { 7121 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 7122 if (!ctx->page) { 7123 ctx->bserrno = -ENOMEM; 7124 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7125 return; 7126 } 7127 /* This blob is a snapshot with active clone - update clone first */ 7128 update_clone_on_snapshot_deletion(blob, ctx); 7129 } else { 7130 /* This blob does not have any clones - just remove it */ 7131 bs_blob_list_remove(blob); 7132 bs_delete_blob_finish(seq, blob, 0); 7133 free(ctx); 7134 } 7135 } 7136 7137 void 7138 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7139 spdk_blob_op_complete cb_fn, void *cb_arg) 7140 { 7141 struct spdk_bs_cpl cpl; 7142 spdk_bs_sequence_t *seq; 7143 7144 SPDK_DEBUGLOG(blob, "Deleting blob %" PRIu64 "\n", blobid); 7145 7146 assert(spdk_get_thread() == bs->md_thread); 7147 7148 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7149 cpl.u.blob_basic.cb_fn = cb_fn; 7150 cpl.u.blob_basic.cb_arg = cb_arg; 7151 7152 seq = bs_sequence_start(bs->md_channel, &cpl); 7153 if (!seq) { 7154 cb_fn(cb_arg, -ENOMEM); 7155 return; 7156 } 7157 7158 spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); 7159 } 7160 7161 /* END spdk_bs_delete_blob */ 7162 7163 /* START spdk_bs_open_blob */ 7164 7165 static void 7166 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7167 { 7168 struct spdk_blob *blob = cb_arg; 7169 struct spdk_blob *existing; 7170 7171 if (bserrno != 0) { 7172 blob_free(blob); 7173 seq->cpl.u.blob_handle.blob = NULL; 7174 bs_sequence_finish(seq, bserrno); 7175 return; 7176 } 7177 7178 existing = blob_lookup(blob->bs, blob->id); 7179 if (existing) { 7180 blob_free(blob); 7181 existing->open_ref++; 7182 seq->cpl.u.blob_handle.blob = existing; 7183 bs_sequence_finish(seq, 0); 7184 return; 7185 } 7186 7187 blob->open_ref++; 7188 7189 spdk_bit_array_set(blob->bs->open_blobids, blob->id); 7190 RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob); 7191 7192 bs_sequence_finish(seq, bserrno); 7193 } 7194 7195 static inline void 7196 blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst) 7197 { 7198 #define FIELD_OK(field) \ 7199 offsetof(struct spdk_blob_open_opts, field) + sizeof(src->field) <= src->opts_size 7200 7201 #define SET_FIELD(field) \ 7202 if (FIELD_OK(field)) { \ 7203 dst->field = src->field; \ 7204 } \ 7205 7206 SET_FIELD(clear_method); 7207 7208 dst->opts_size = src->opts_size; 7209 7210 /* You should not remove this statement, but need to update the assert statement 7211 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7212
SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 16, "Incorrect size"); 7213 7214 #undef FIELD_OK 7215 #undef SET_FIELD 7216 } 7217 7218 static void 7219 bs_open_blob(struct spdk_blob_store *bs, 7220 spdk_blob_id blobid, 7221 struct spdk_blob_open_opts *opts, 7222 spdk_blob_op_with_handle_complete cb_fn, 7223 void *cb_arg) 7224 { 7225 struct spdk_blob *blob; 7226 struct spdk_bs_cpl cpl; 7227 struct spdk_blob_open_opts opts_local; 7228 spdk_bs_sequence_t *seq; 7229 uint32_t page_num; 7230 7231 SPDK_DEBUGLOG(blob, "Opening blob %" PRIu64 "\n", blobid); 7232 assert(spdk_get_thread() == bs->md_thread); 7233 7234 page_num = bs_blobid_to_page(blobid); 7235 if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { 7236 /* Invalid blobid */ 7237 cb_fn(cb_arg, NULL, -ENOENT); 7238 return; 7239 } 7240 7241 blob = blob_lookup(bs, blobid); 7242 if (blob) { 7243 blob->open_ref++; 7244 cb_fn(cb_arg, blob, 0); 7245 return; 7246 } 7247 7248 blob = blob_alloc(bs, blobid); 7249 if (!blob) { 7250 cb_fn(cb_arg, NULL, -ENOMEM); 7251 return; 7252 } 7253 7254 spdk_blob_open_opts_init(&opts_local, sizeof(opts_local)); 7255 if (opts) { 7256 blob_open_opts_copy(opts, &opts_local); 7257 } 7258 7259 blob->clear_method = opts_local.clear_method; 7260 7261 cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; 7262 cpl.u.blob_handle.cb_fn = cb_fn; 7263 cpl.u.blob_handle.cb_arg = cb_arg; 7264 cpl.u.blob_handle.blob = blob; 7265 7266 seq = bs_sequence_start(bs->md_channel, &cpl); 7267 if (!seq) { 7268 blob_free(blob); 7269 cb_fn(cb_arg, NULL, -ENOMEM); 7270 return; 7271 } 7272 7273 blob_load(seq, blob, bs_open_blob_cpl, blob); 7274 } 7275 7276 void 7277 spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7278 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7279 { 7280 bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); 7281 } 7282 7283 void 7284 spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, 7285 struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7286 { 7287 bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); 7288 } 7289 7290 /* END spdk_bs_open_blob */ 7291 7292 /* START spdk_blob_set_read_only */ 7293 int 7294 spdk_blob_set_read_only(struct spdk_blob *blob) 7295 { 7296 blob_verify_md_op(blob); 7297 7298 blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; 7299 7300 blob->state = SPDK_BLOB_STATE_DIRTY; 7301 return 0; 7302 } 7303 /* END spdk_blob_set_read_only */ 7304 7305 /* START spdk_blob_sync_md */ 7306 7307 static void 7308 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7309 { 7310 struct spdk_blob *blob = cb_arg; 7311 7312 if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 7313 blob->data_ro = true; 7314 blob->md_ro = true; 7315 } 7316 7317 bs_sequence_finish(seq, bserrno); 7318 } 7319 7320 static void 7321 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7322 { 7323 struct spdk_bs_cpl cpl; 7324 spdk_bs_sequence_t *seq; 7325 7326 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7327 cpl.u.blob_basic.cb_fn = cb_fn; 7328 cpl.u.blob_basic.cb_arg = cb_arg; 7329 7330 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7331 if (!seq) { 7332 cb_fn(cb_arg, -ENOMEM); 7333 return; 7334 } 7335 7336 blob_persist(seq, blob, blob_sync_md_cpl, blob); 7337 } 7338 7339 void 7340 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7341 { 7342 blob_verify_md_op(blob); 7343 7344 SPDK_DEBUGLOG(blob, "Syncing blob %" PRIu64 "\n", blob->id); 7345 7346 if (blob->md_ro) { 7347 
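/* Metadata of a read-only blob can never be dirty, so there is nothing to persist and the sync completes immediately. */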
assert(blob->state == SPDK_BLOB_STATE_CLEAN); 7348 cb_fn(cb_arg, 0); 7349 return; 7350 } 7351 7352 blob_sync_md(blob, cb_fn, cb_arg); 7353 } 7354 7355 /* END spdk_blob_sync_md */ 7356 7357 struct spdk_blob_insert_cluster_ctx { 7358 struct spdk_thread *thread; 7359 struct spdk_blob *blob; 7360 uint32_t cluster_num; /* cluster index in blob */ 7361 uint32_t cluster; /* cluster on disk */ 7362 uint32_t extent_page; /* extent page on disk */ 7363 struct spdk_blob_md_page *page; /* preallocated extent page */ 7364 int rc; 7365 spdk_blob_op_complete cb_fn; 7366 void *cb_arg; 7367 }; 7368 7369 static void 7370 blob_insert_cluster_msg_cpl(void *arg) 7371 { 7372 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7373 7374 ctx->cb_fn(ctx->cb_arg, ctx->rc); 7375 free(ctx); 7376 } 7377 7378 static void 7379 blob_insert_cluster_msg_cb(void *arg, int bserrno) 7380 { 7381 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7382 7383 ctx->rc = bserrno; 7384 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7385 } 7386 7387 static void 7388 blob_insert_new_ep_cb(void *arg, int bserrno) 7389 { 7390 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7391 uint32_t *extent_page; 7392 7393 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7394 *extent_page = ctx->extent_page; 7395 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7396 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7397 } 7398 7399 static void 7400 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7401 { 7402 bs_sequence_finish(seq, bserrno); 7403 } 7404 7405 static void 7406 blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 7407 struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg) 7408 { 7409 spdk_bs_sequence_t *seq; 7410 struct spdk_bs_cpl cpl; 7411 7412 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7413 cpl.u.blob_basic.cb_fn = cb_fn; 7414 cpl.u.blob_basic.cb_arg = cb_arg; 7415 7416 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7417 if (!seq) { 7418 cb_fn(cb_arg, -ENOMEM); 7419 return; 7420 } 7421 7422 assert(page); 7423 page->next = SPDK_INVALID_MD_PAGE; 7424 page->id = blob->id; 7425 page->sequence_num = 0; 7426 7427 blob_serialize_extent_page(blob, cluster_num, page); 7428 7429 page->crc = blob_md_page_calc_crc(page); 7430 7431 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); 7432 7433 bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), 7434 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 7435 blob_persist_extent_page_cpl, page); 7436 } 7437 7438 static void 7439 blob_insert_cluster_msg(void *arg) 7440 { 7441 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7442 uint32_t *extent_page; 7443 7444 ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); 7445 if (ctx->rc != 0) { 7446 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7447 return; 7448 } 7449 7450 if (ctx->blob->use_extent_table == false) { 7451 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */ 7452 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7453 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7454 return; 7455 } 7456 7457 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7458 if (*extent_page == 0) { 7459 /* Extent page requires allocation. 7460 * It was already claimed in the used_md_pages map and placed in ctx. 
*/ 7461 assert(ctx->extent_page != 0); 7462 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7463 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page, 7464 blob_insert_new_ep_cb, ctx); 7465 } else { 7466 /* It is possible for original thread to allocate extent page for 7467 * different cluster in the same extent page. In such case proceed with 7468 * updating the existing extent page, but release the additional one. */ 7469 if (ctx->extent_page != 0) { 7470 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7471 bs_release_md_page(ctx->blob->bs, ctx->extent_page); 7472 ctx->extent_page = 0; 7473 } 7474 /* Extent page already allocated. 7475 * Every cluster allocation, requires just an update of single extent page. */ 7476 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page, 7477 blob_insert_cluster_msg_cb, ctx); 7478 } 7479 } 7480 7481 static void 7482 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 7483 uint64_t cluster, uint32_t extent_page, struct spdk_blob_md_page *page, 7484 spdk_blob_op_complete cb_fn, void *cb_arg) 7485 { 7486 struct spdk_blob_insert_cluster_ctx *ctx; 7487 7488 ctx = calloc(1, sizeof(*ctx)); 7489 if (ctx == NULL) { 7490 cb_fn(cb_arg, -ENOMEM); 7491 return; 7492 } 7493 7494 ctx->thread = spdk_get_thread(); 7495 ctx->blob = blob; 7496 ctx->cluster_num = cluster_num; 7497 ctx->cluster = cluster; 7498 ctx->extent_page = extent_page; 7499 ctx->page = page; 7500 ctx->cb_fn = cb_fn; 7501 ctx->cb_arg = cb_arg; 7502 7503 spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); 7504 } 7505 7506 /* START spdk_blob_close */ 7507 7508 static void 7509 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7510 { 7511 struct spdk_blob *blob = cb_arg; 7512 7513 if (bserrno == 0) { 7514 blob->open_ref--; 7515 if (blob->open_ref == 0) { 7516 /* 7517 * Blobs with active.num_pages == 0 are deleted blobs. 7518 * these blobs are removed from the blob_store list 7519 * when the deletion process starts - so don't try to 7520 * remove them again. 
7521 */ 7522 if (blob->active.num_pages > 0) { 7523 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7524 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7525 } 7526 blob_free(blob); 7527 } 7528 } 7529 7530 bs_sequence_finish(seq, bserrno); 7531 } 7532 7533 void 7534 spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7535 { 7536 struct spdk_bs_cpl cpl; 7537 spdk_bs_sequence_t *seq; 7538 7539 blob_verify_md_op(blob); 7540 7541 SPDK_DEBUGLOG(blob, "Closing blob %" PRIu64 "\n", blob->id); 7542 7543 if (blob->open_ref == 0) { 7544 cb_fn(cb_arg, -EBADF); 7545 return; 7546 } 7547 7548 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7549 cpl.u.blob_basic.cb_fn = cb_fn; 7550 cpl.u.blob_basic.cb_arg = cb_arg; 7551 7552 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7553 if (!seq) { 7554 cb_fn(cb_arg, -ENOMEM); 7555 return; 7556 } 7557 7558 /* Sync metadata */ 7559 blob_persist(seq, blob, blob_close_cpl, blob); 7560 } 7561 7562 /* END spdk_blob_close */ 7563 7564 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) 7565 { 7566 return spdk_get_io_channel(bs); 7567 } 7568 7569 void 7570 spdk_bs_free_io_channel(struct spdk_io_channel *channel) 7571 { 7572 spdk_put_io_channel(channel); 7573 } 7574 7575 void 7576 spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, 7577 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7578 { 7579 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7580 SPDK_BLOB_UNMAP); 7581 } 7582 7583 void 7584 spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, 7585 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7586 { 7587 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7588 SPDK_BLOB_WRITE_ZEROES); 7589 } 7590 7591 void 7592 spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, 7593 void *payload, uint64_t offset, uint64_t length, 7594 spdk_blob_op_complete cb_fn, void *cb_arg) 7595 { 7596 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7597 SPDK_BLOB_WRITE); 7598 } 7599 7600 void 7601 spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, 7602 void *payload, uint64_t offset, uint64_t length, 7603 spdk_blob_op_complete cb_fn, void *cb_arg) 7604 { 7605 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7606 SPDK_BLOB_READ); 7607 } 7608 7609 void 7610 spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel, 7611 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7612 spdk_blob_op_complete cb_fn, void *cb_arg) 7613 { 7614 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, NULL); 7615 } 7616 7617 void 7618 spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, 7619 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7620 spdk_blob_op_complete cb_fn, void *cb_arg) 7621 { 7622 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true, NULL); 7623 } 7624 7625 void 7626 spdk_blob_io_writev_ext(struct spdk_blob *blob, struct spdk_io_channel *channel, 7627 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7628 spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts) 7629 { 7630 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, 7631 io_opts); 7632 
} 7633 7634 void 7635 spdk_blob_io_readv_ext(struct spdk_blob *blob, struct spdk_io_channel *channel, 7636 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7637 spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts) 7638 { 7639 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true, 7640 io_opts); 7641 } 7642 7643 struct spdk_bs_iter_ctx { 7644 int64_t page_num; 7645 struct spdk_blob_store *bs; 7646 7647 spdk_blob_op_with_handle_complete cb_fn; 7648 void *cb_arg; 7649 }; 7650 7651 static void 7652 bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 7653 { 7654 struct spdk_bs_iter_ctx *ctx = cb_arg; 7655 struct spdk_blob_store *bs = ctx->bs; 7656 spdk_blob_id id; 7657 7658 if (bserrno == 0) { 7659 ctx->cb_fn(ctx->cb_arg, _blob, bserrno); 7660 free(ctx); 7661 return; 7662 } 7663 7664 ctx->page_num++; 7665 ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num); 7666 if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) { 7667 ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT); 7668 free(ctx); 7669 return; 7670 } 7671 7672 id = bs_page_to_blobid(ctx->page_num); 7673 7674 spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx); 7675 } 7676 7677 void 7678 spdk_bs_iter_first(struct spdk_blob_store *bs, 7679 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7680 { 7681 struct spdk_bs_iter_ctx *ctx; 7682 7683 ctx = calloc(1, sizeof(*ctx)); 7684 if (!ctx) { 7685 cb_fn(cb_arg, NULL, -ENOMEM); 7686 return; 7687 } 7688 7689 ctx->page_num = -1; 7690 ctx->bs = bs; 7691 ctx->cb_fn = cb_fn; 7692 ctx->cb_arg = cb_arg; 7693 7694 bs_iter_cpl(ctx, NULL, -1); 7695 } 7696 7697 static void 7698 bs_iter_close_cpl(void *cb_arg, int bserrno) 7699 { 7700 struct spdk_bs_iter_ctx *ctx = cb_arg; 7701 7702 bs_iter_cpl(ctx, NULL, -1); 7703 } 7704 7705 void 7706 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob, 7707 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7708 { 7709 struct spdk_bs_iter_ctx *ctx; 7710 7711 assert(blob != NULL); 7712 7713 ctx = calloc(1, sizeof(*ctx)); 7714 if (!ctx) { 7715 cb_fn(cb_arg, NULL, -ENOMEM); 7716 return; 7717 } 7718 7719 ctx->page_num = bs_blobid_to_page(blob->id); 7720 ctx->bs = bs; 7721 ctx->cb_fn = cb_fn; 7722 ctx->cb_arg = cb_arg; 7723 7724 /* Close the existing blob */ 7725 spdk_blob_close(blob, bs_iter_close_cpl, ctx); 7726 } 7727 7728 static int 7729 blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 7730 uint16_t value_len, bool internal) 7731 { 7732 struct spdk_xattr_tailq *xattrs; 7733 struct spdk_xattr *xattr; 7734 size_t desc_size; 7735 void *tmp; 7736 7737 blob_verify_md_op(blob); 7738 7739 if (blob->md_ro) { 7740 return -EPERM; 7741 } 7742 7743 desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len; 7744 if (desc_size > SPDK_BS_MAX_DESC_SIZE) { 7745 SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fit into a single page %zu\n", name, 7746 desc_size, SPDK_BS_MAX_DESC_SIZE); 7747 return -ENOMEM; 7748 } 7749 7750 if (internal) { 7751 xattrs = &blob->xattrs_internal; 7752 blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR; 7753 } else { 7754 xattrs = &blob->xattrs; 7755 } 7756 7757 TAILQ_FOREACH(xattr, xattrs, link) { 7758 if (!strcmp(name, xattr->name)) { 7759 tmp = malloc(value_len); 7760 if (!tmp) { 7761 return -ENOMEM; 7762 } 7763 7764 free(xattr->value); 7765 xattr->value_len = value_len; 7766 xattr->value = tmp; 7767 memcpy(xattr->value, value, value_len); 7768 7769 blob->state =
SPDK_BLOB_STATE_DIRTY; 7770 7771 return 0; 7772 } 7773 } 7774 7775 xattr = calloc(1, sizeof(*xattr)); 7776 if (!xattr) { 7777 return -ENOMEM; 7778 } 7779 7780 xattr->name = strdup(name); 7781 if (!xattr->name) { 7782 free(xattr); 7783 return -ENOMEM; 7784 } 7785 7786 xattr->value_len = value_len; 7787 xattr->value = malloc(value_len); 7788 if (!xattr->value) { 7789 free(xattr->name); 7790 free(xattr); 7791 return -ENOMEM; 7792 } 7793 memcpy(xattr->value, value, value_len); 7794 TAILQ_INSERT_TAIL(xattrs, xattr, link); 7795 7796 blob->state = SPDK_BLOB_STATE_DIRTY; 7797 7798 return 0; 7799 } 7800 7801 int 7802 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 7803 uint16_t value_len) 7804 { 7805 return blob_set_xattr(blob, name, value, value_len, false); 7806 } 7807 7808 static int 7809 blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal) 7810 { 7811 struct spdk_xattr_tailq *xattrs; 7812 struct spdk_xattr *xattr; 7813 7814 blob_verify_md_op(blob); 7815 7816 if (blob->md_ro) { 7817 return -EPERM; 7818 } 7819 xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; 7820 7821 TAILQ_FOREACH(xattr, xattrs, link) { 7822 if (!strcmp(name, xattr->name)) { 7823 TAILQ_REMOVE(xattrs, xattr, link); 7824 free(xattr->value); 7825 free(xattr->name); 7826 free(xattr); 7827 7828 if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) { 7829 blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR; 7830 } 7831 blob->state = SPDK_BLOB_STATE_DIRTY; 7832 7833 return 0; 7834 } 7835 } 7836 7837 return -ENOENT; 7838 } 7839 7840 int 7841 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name) 7842 { 7843 return blob_remove_xattr(blob, name, false); 7844 } 7845 7846 static int 7847 blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7848 const void **value, size_t *value_len, bool internal) 7849 { 7850 struct spdk_xattr *xattr; 7851 struct spdk_xattr_tailq *xattrs; 7852 7853 xattrs = internal ? 
&blob->xattrs_internal : &blob->xattrs; 7854 7855 TAILQ_FOREACH(xattr, xattrs, link) { 7856 if (!strcmp(name, xattr->name)) { 7857 *value = xattr->value; 7858 *value_len = xattr->value_len; 7859 return 0; 7860 } 7861 } 7862 return -ENOENT; 7863 } 7864 7865 int 7866 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7867 const void **value, size_t *value_len) 7868 { 7869 blob_verify_md_op(blob); 7870 7871 return blob_get_xattr_value(blob, name, value, value_len, false); 7872 } 7873 7874 struct spdk_xattr_names { 7875 uint32_t count; 7876 const char *names[0]; 7877 }; 7878 7879 static int 7880 blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names) 7881 { 7882 struct spdk_xattr *xattr; 7883 int count = 0; 7884 7885 TAILQ_FOREACH(xattr, xattrs, link) { 7886 count++; 7887 } 7888 7889 *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *)); 7890 if (*names == NULL) { 7891 return -ENOMEM; 7892 } 7893 7894 TAILQ_FOREACH(xattr, xattrs, link) { 7895 (*names)->names[(*names)->count++] = xattr->name; 7896 } 7897 7898 return 0; 7899 } 7900 7901 int 7902 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names) 7903 { 7904 blob_verify_md_op(blob); 7905 7906 return blob_get_xattr_names(&blob->xattrs, names); 7907 } 7908 7909 uint32_t 7910 spdk_xattr_names_get_count(struct spdk_xattr_names *names) 7911 { 7912 assert(names != NULL); 7913 7914 return names->count; 7915 } 7916 7917 const char * 7918 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index) 7919 { 7920 if (index >= names->count) { 7921 return NULL; 7922 } 7923 7924 return names->names[index]; 7925 } 7926 7927 void 7928 spdk_xattr_names_free(struct spdk_xattr_names *names) 7929 { 7930 free(names); 7931 } 7932 7933 struct spdk_bs_type 7934 spdk_bs_get_bstype(struct spdk_blob_store *bs) 7935 { 7936 return bs->bstype; 7937 } 7938 7939 void 7940 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype) 7941 { 7942 memcpy(&bs->bstype, &bstype, sizeof(bstype)); 7943 } 7944 7945 bool 7946 spdk_blob_is_read_only(struct spdk_blob *blob) 7947 { 7948 assert(blob != NULL); 7949 return (blob->data_ro || blob->md_ro); 7950 } 7951 7952 bool 7953 spdk_blob_is_snapshot(struct spdk_blob *blob) 7954 { 7955 struct spdk_blob_list *snapshot_entry; 7956 7957 assert(blob != NULL); 7958 7959 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7960 if (snapshot_entry == NULL) { 7961 return false; 7962 } 7963 7964 return true; 7965 } 7966 7967 bool 7968 spdk_blob_is_clone(struct spdk_blob *blob) 7969 { 7970 assert(blob != NULL); 7971 7972 if (blob->parent_id != SPDK_BLOBID_INVALID) { 7973 assert(spdk_blob_is_thin_provisioned(blob)); 7974 return true; 7975 } 7976 7977 return false; 7978 } 7979 7980 bool 7981 spdk_blob_is_thin_provisioned(struct spdk_blob *blob) 7982 { 7983 assert(blob != NULL); 7984 return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV); 7985 } 7986 7987 static void 7988 blob_update_clear_method(struct spdk_blob *blob) 7989 { 7990 enum blob_clear_method stored_cm; 7991 7992 assert(blob != NULL); 7993 7994 /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored 7995 * in metadata previously. If something other than the default was 7996 * specified, ignore the stored value and use what was passed in. 
7997 */ 7998 stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT); 7999 8000 if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) { 8001 blob->clear_method = stored_cm; 8002 } else if (blob->clear_method != stored_cm) { 8003 SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n", 8004 blob->clear_method, stored_cm); 8005 } 8006 } 8007 8008 spdk_blob_id 8009 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id) 8010 { 8011 struct spdk_blob_list *snapshot_entry = NULL; 8012 struct spdk_blob_list *clone_entry = NULL; 8013 8014 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 8015 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 8016 if (clone_entry->id == blob_id) { 8017 return snapshot_entry->id; 8018 } 8019 } 8020 } 8021 8022 return SPDK_BLOBID_INVALID; 8023 } 8024 8025 int 8026 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids, 8027 size_t *count) 8028 { 8029 struct spdk_blob_list *snapshot_entry, *clone_entry; 8030 size_t n; 8031 8032 snapshot_entry = bs_get_snapshot_entry(bs, blobid); 8033 if (snapshot_entry == NULL) { 8034 *count = 0; 8035 return 0; 8036 } 8037 8038 if (ids == NULL || *count < snapshot_entry->clone_count) { 8039 *count = snapshot_entry->clone_count; 8040 return -ENOMEM; 8041 } 8042 *count = snapshot_entry->clone_count; 8043 8044 n = 0; 8045 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 8046 ids[n++] = clone_entry->id; 8047 } 8048 8049 return 0; 8050 } 8051 8052 static void 8053 bs_load_grow_continue(struct spdk_bs_load_ctx *ctx) 8054 { 8055 int rc; 8056 8057 if (ctx->super->size == 0) { 8058 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 8059 } 8060 8061 if (ctx->super->io_unit_size == 0) { 8062 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 8063 } 8064 8065 /* Parse the super block */ 8066 ctx->bs->clean = 1; 8067 ctx->bs->cluster_sz = ctx->super->cluster_size; 8068 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 8069 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 8070 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 8071 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 8072 } 8073 ctx->bs->io_unit_size = ctx->super->io_unit_size; 8074 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 8075 if (rc < 0) { 8076 bs_load_ctx_fail(ctx, -ENOMEM); 8077 return; 8078 } 8079 ctx->bs->md_start = ctx->super->md_start; 8080 ctx->bs->md_len = ctx->super->md_len; 8081 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len); 8082 if (rc < 0) { 8083 bs_load_ctx_fail(ctx, -ENOMEM); 8084 return; 8085 } 8086 8087 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 8088 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 8089 ctx->bs->super_blob = ctx->super->super_blob; 8090 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 8091 8092 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) { 8093 SPDK_ERRLOG("Can not grow an unclean blobstore, please load it normally to clean it.\n"); 8094 bs_load_ctx_fail(ctx, -EIO); 8095 return; 8096 } else { 8097 bs_load_read_used_pages(ctx); 8098 } 8099 } 8100 8101 static void 8102 bs_load_grow_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 8103 { 8104 struct spdk_bs_load_ctx *ctx = cb_arg; 8105 8106 if (bserrno != 0) { 8107 bs_load_ctx_fail(ctx, bserrno); 8108 return; 
8109 } 8110 bs_load_grow_continue(ctx); 8111 } 8112 8113 static void 8114 bs_load_grow_used_clusters_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 8115 { 8116 struct spdk_bs_load_ctx *ctx = cb_arg; 8117 8118 if (bserrno != 0) { 8119 bs_load_ctx_fail(ctx, bserrno); 8120 return; 8121 } 8122 8123 spdk_free(ctx->mask); 8124 8125 bs_sequence_write_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->bs, 0), 8126 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), 8127 bs_load_grow_super_write_cpl, ctx); 8128 } 8129 8130 static void 8131 bs_load_grow_used_clusters_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 8132 { 8133 struct spdk_bs_load_ctx *ctx = cb_arg; 8134 uint64_t lba, lba_count; 8135 uint64_t dev_size; 8136 uint64_t total_clusters; 8137 8138 if (bserrno != 0) { 8139 bs_load_ctx_fail(ctx, bserrno); 8140 return; 8141 } 8142 8143 /* The type must be correct */ 8144 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); 8145 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 8146 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( 8147 struct spdk_blob_md_page) * 8)); 8148 dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 8149 total_clusters = dev_size / ctx->super->cluster_size; 8150 ctx->mask->length = total_clusters; 8151 8152 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 8153 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 8154 bs_sequence_write_dev(ctx->seq, ctx->mask, lba, lba_count, 8155 bs_load_grow_used_clusters_write_cpl, ctx); 8156 } 8157 8158 static void 8159 bs_load_try_to_grow(struct spdk_bs_load_ctx *ctx) 8160 { 8161 uint64_t dev_size, total_clusters, used_cluster_mask_len, max_used_cluster_mask; 8162 uint64_t lba, lba_count, mask_size; 8163 8164 dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 8165 total_clusters = dev_size / ctx->super->cluster_size; 8166 used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 8167 spdk_divide_round_up(total_clusters, 8), 8168 SPDK_BS_PAGE_SIZE); 8169 max_used_cluster_mask = ctx->super->used_blobid_mask_start - ctx->super->used_cluster_mask_start; 8170 /* Not necessary to grow, or no space to grow */ 8171 if (ctx->super->size >= dev_size || used_cluster_mask_len > max_used_cluster_mask) { 8172 SPDK_DEBUGLOG(blob, "No grow\n"); 8173 bs_load_grow_continue(ctx); 8174 return; 8175 } 8176 8177 SPDK_DEBUGLOG(blob, "Resize blobstore\n"); 8178 8179 ctx->super->size = dev_size; 8180 ctx->super->used_cluster_mask_len = used_cluster_mask_len; 8181 ctx->super->crc = blob_md_page_calc_crc(ctx->super); 8182 8183 mask_size = used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 8184 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 8185 SPDK_MALLOC_DMA); 8186 if (!ctx->mask) { 8187 bs_load_ctx_fail(ctx, -ENOMEM); 8188 return; 8189 } 8190 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 8191 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 8192 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, 8193 bs_load_grow_used_clusters_read_cpl, ctx); 8194 } 8195 8196 static void 8197 bs_grow_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 8198 { 8199 struct spdk_bs_load_ctx *ctx = cb_arg; 8200 uint32_t crc; 8201 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; 8202 8203 if (ctx->super->version > SPDK_BS_VERSION || 8204 ctx->super->version < SPDK_BS_INITIAL_VERSION) { 8205 bs_load_ctx_fail(ctx, 
-EILSEQ); 8206 return; 8207 } 8208 8209 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 8210 sizeof(ctx->super->signature)) != 0) { 8211 bs_load_ctx_fail(ctx, -EILSEQ); 8212 return; 8213 } 8214 8215 crc = blob_md_page_calc_crc(ctx->super); 8216 if (crc != ctx->super->crc) { 8217 bs_load_ctx_fail(ctx, -EILSEQ); 8218 return; 8219 } 8220 8221 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 8222 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n"); 8223 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 8224 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless of bstype\n"); 8225 } else { 8226 SPDK_DEBUGLOG(blob, "Unexpected bstype\n"); 8227 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 8228 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 8229 bs_load_ctx_fail(ctx, -ENXIO); 8230 return; 8231 } 8232 8233 if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { 8234 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n", 8235 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); 8236 bs_load_ctx_fail(ctx, -EILSEQ); 8237 return; 8238 } 8239 8240 bs_load_try_to_grow(ctx); 8241 8242 } 8243 8244 void 8245 spdk_bs_grow(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 8246 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 8247 { 8248 struct spdk_blob_store *bs; 8249 struct spdk_bs_cpl cpl; 8250 struct spdk_bs_load_ctx *ctx; 8251 struct spdk_bs_opts opts = {}; 8252 int err; 8253 8254 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev); 8255 8256 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 8257 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen); 8258 dev->destroy(dev); 8259 cb_fn(cb_arg, NULL, -EINVAL); 8260 return; 8261 } 8262 8263 spdk_bs_opts_init(&opts, sizeof(opts)); 8264 if (o) { 8265 if (bs_opts_copy(o, &opts)) { 8266 return; 8267 } 8268 } 8269 8270 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { 8271 dev->destroy(dev); 8272 cb_fn(cb_arg, NULL, -EINVAL); 8273 return; 8274 } 8275 8276 err = bs_alloc(dev, &opts, &bs, &ctx); 8277 if (err) { 8278 dev->destroy(dev); 8279 cb_fn(cb_arg, NULL, err); 8280 return; 8281 } 8282 8283 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 8284 cpl.u.bs_handle.cb_fn = cb_fn; 8285 cpl.u.bs_handle.cb_arg = cb_arg; 8286 cpl.u.bs_handle.bs = bs; 8287 8288 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 8289 if (!ctx->seq) { 8290 spdk_free(ctx->super); 8291 free(ctx); 8292 bs_free(bs); 8293 cb_fn(cb_arg, NULL, -ENOMEM); 8294 return; 8295 } 8296 8297 /* Read the super block */ 8298 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 8299 bs_byte_to_lba(bs, sizeof(*ctx->super)), 8300 bs_grow_load_super_cpl, ctx); 8301 } 8302 8303 SPDK_LOG_REGISTER_COMPONENT(blob) 8304
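
/*
 * Illustrative usage sketch (editorial addition, not part of the blobstore
 * implementation): a minimal, hedged example of driving the public APIs
 * defined in this file. All names below (example_blob_ctx, example_use_blob,
 * the callbacks, and the xattr key "example") are hypothetical and exist
 * only for illustration; error handling is omitted for brevity. The sketch
 * assumes it runs on the blobstore's metadata thread, since spdk_blob_close()
 * and spdk_blob_set_xattr() are metadata operations, and it relies on the
 * headers already included at the top of this file.
 */
#if 0
struct example_blob_ctx {
	struct spdk_blob *blob;
	struct spdk_io_channel *channel;
	void *payload;
};

static void
example_close_done(void *cb_arg, int bserrno)
{
	struct example_blob_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("blob close failed: %d\n", bserrno);
	}

	/* All I/O on the channel has completed, so it can be released now. */
	spdk_bs_free_io_channel(ctx->channel);
	spdk_free(ctx->payload);
	free(ctx);
}

static void
example_write_done(void *cb_arg, int bserrno)
{
	struct example_blob_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("blob write failed: %d\n", bserrno);
	} else {
		/* Attach a user xattr; it is persisted when the blob is closed. */
		spdk_blob_set_xattr(ctx->blob, "example", "value", sizeof("value"));
	}

	/* Closing syncs the blob's metadata and drops the open reference. */
	spdk_blob_close(ctx->blob, example_close_done, ctx);
}

static void
example_use_blob(struct spdk_blob_store *bs, struct spdk_blob *blob)
{
	struct example_blob_ctx *ctx;
	uint64_t io_unit_size = spdk_bs_get_io_unit_size(bs);

	ctx = calloc(1, sizeof(*ctx));
	ctx->blob = blob;

	/* Each thread that submits blob I/O needs its own channel. */
	ctx->channel = spdk_bs_alloc_io_channel(bs);

	/* Blob I/O buffers must come from DMA-safe memory. */
	ctx->payload = spdk_malloc(io_unit_size, 0x1000, NULL,
				   SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	memset(ctx->payload, 0xa5, io_unit_size);

	/* Offset and length are expressed in io_units, not bytes. */
	spdk_blob_io_write(blob, ctx->channel, ctx->payload, 0, 1,
			   example_write_done, ctx);
}
#endif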