1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/blob.h" 10 #include "spdk/crc32.h" 11 #include "spdk/env.h" 12 #include "spdk/queue.h" 13 #include "spdk/thread.h" 14 #include "spdk/bit_array.h" 15 #include "spdk/bit_pool.h" 16 #include "spdk/likely.h" 17 #include "spdk/util.h" 18 #include "spdk/string.h" 19 20 #include "spdk_internal/assert.h" 21 #include "spdk/log.h" 22 23 #include "blobstore.h" 24 25 #define BLOB_CRC32C_INITIAL 0xffffffffUL 26 27 static int bs_register_md_thread(struct spdk_blob_store *bs); 28 static int bs_unregister_md_thread(struct spdk_blob_store *bs); 29 static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno); 30 static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 31 uint64_t cluster, uint32_t extent, struct spdk_blob_md_page *page, 32 spdk_blob_op_complete cb_fn, void *cb_arg); 33 34 static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 35 uint16_t value_len, bool internal); 36 static int blob_get_xattr_value(struct spdk_blob *blob, const char *name, 37 const void **value, size_t *value_len, bool internal); 38 static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal); 39 40 static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 41 struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg); 42 43 static int 44 blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2) 45 { 46 return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id); 47 } 48 49 RB_GENERATE_STATIC(spdk_blob_tree, spdk_blob, link, blob_id_cmp); 50 51 static void 52 blob_verify_md_op(struct spdk_blob *blob) 53 { 54 assert(blob != NULL); 55 assert(spdk_get_thread() == blob->bs->md_thread); 56 assert(blob->state != SPDK_BLOB_STATE_LOADING); 57 } 58 59 static struct spdk_blob_list * 60 bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid) 61 { 62 struct spdk_blob_list *snapshot_entry = NULL; 63 64 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 65 if (snapshot_entry->id == blobid) { 66 break; 67 } 68 } 69 70 return snapshot_entry; 71 } 72 73 static void 74 bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page) 75 { 76 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 77 assert(spdk_bit_array_get(bs->used_md_pages, page) == false); 78 79 spdk_bit_array_set(bs->used_md_pages, page); 80 } 81 82 static void 83 bs_release_md_page(struct spdk_blob_store *bs, uint32_t page) 84 { 85 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 86 assert(spdk_bit_array_get(bs->used_md_pages, page) == true); 87 88 spdk_bit_array_clear(bs->used_md_pages, page); 89 } 90 91 static uint32_t 92 bs_claim_cluster(struct spdk_blob_store *bs) 93 { 94 uint32_t cluster_num; 95 96 cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters); 97 if (cluster_num == UINT32_MAX) { 98 return UINT32_MAX; 99 } 100 101 SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num); 102 bs->num_free_clusters--; 103 104 return cluster_num; 105 } 106 107 static void 108 bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) 109 { 110 assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters)); 111 assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true); 112 assert(bs->num_free_clusters < 
bs->total_clusters); 113 114 SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num); 115 116 spdk_bit_pool_free_bit(bs->used_clusters, cluster_num); 117 bs->num_free_clusters++; 118 } 119 120 static int 121 blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster) 122 { 123 uint64_t *cluster_lba = &blob->active.clusters[cluster_num]; 124 125 blob_verify_md_op(blob); 126 127 if (*cluster_lba != 0) { 128 return -EEXIST; 129 } 130 131 *cluster_lba = bs_cluster_to_lba(blob->bs, cluster); 132 return 0; 133 } 134 135 static int 136 bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, 137 uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map) 138 { 139 uint32_t *extent_page = 0; 140 141 *cluster = bs_claim_cluster(blob->bs); 142 if (*cluster == UINT32_MAX) { 143 /* No more free clusters. Cannot satisfy the request */ 144 return -ENOSPC; 145 } 146 147 if (blob->use_extent_table) { 148 extent_page = bs_cluster_to_extent_page(blob, cluster_num); 149 if (*extent_page == 0) { 150 /* Extent page shall never occupy md_page so start the search from 1 */ 151 if (*lowest_free_md_page == 0) { 152 *lowest_free_md_page = 1; 153 } 154 /* No extent_page is allocated for the cluster */ 155 *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, 156 *lowest_free_md_page); 157 if (*lowest_free_md_page == UINT32_MAX) { 158 /* No more free md pages. Cannot satisfy the request */ 159 bs_release_cluster(blob->bs, *cluster); 160 return -ENOSPC; 161 } 162 bs_claim_md_page(blob->bs, *lowest_free_md_page); 163 } 164 } 165 166 SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob %" PRIu64 "\n", *cluster, blob->id); 167 168 if (update_map) { 169 blob_insert_cluster(blob, cluster_num, *cluster); 170 if (blob->use_extent_table && *extent_page == 0) { 171 *extent_page = *lowest_free_md_page; 172 } 173 } 174 175 return 0; 176 } 177 178 static void 179 blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) 180 { 181 xattrs->count = 0; 182 xattrs->names = NULL; 183 xattrs->ctx = NULL; 184 xattrs->get_value = NULL; 185 } 186 187 void 188 spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size) 189 { 190 if (!opts) { 191 SPDK_ERRLOG("opts should not be NULL\n"); 192 return; 193 } 194 195 if (!opts_size) { 196 SPDK_ERRLOG("opts_size should not be zero value\n"); 197 return; 198 } 199 200 memset(opts, 0, opts_size); 201 opts->opts_size = opts_size; 202 203 #define FIELD_OK(field) \ 204 offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size 205 206 #define SET_FIELD(field, value) \ 207 if (FIELD_OK(field)) { \ 208 opts->field = value; \ 209 } \ 210 211 SET_FIELD(num_clusters, 0); 212 SET_FIELD(thin_provision, false); 213 SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT); 214 215 if (FIELD_OK(xattrs)) { 216 blob_xattrs_init(&opts->xattrs); 217 } 218 219 SET_FIELD(use_extent_table, true); 220 221 #undef FIELD_OK 222 #undef SET_FIELD 223 } 224 225 void 226 spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size) 227 { 228 if (!opts) { 229 SPDK_ERRLOG("opts should not be NULL\n"); 230 return; 231 } 232 233 if (!opts_size) { 234 SPDK_ERRLOG("opts_size should not be zero value\n"); 235 return; 236 } 237 238 memset(opts, 0, opts_size); 239 opts->opts_size = opts_size; 240 241 #define FIELD_OK(field) \ 242 offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size 243 244 #define SET_FIELD(field, value) \ 245 if (FIELD_OK(field)) { \ 246 opts->field = value; \ 247 } \ 248 249 
	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);

#undef FIELD_OK
#undef SET_FIELD
}

static struct spdk_blob *
blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
{
	struct spdk_blob *blob;

	blob = calloc(1, sizeof(*blob));
	if (!blob) {
		return NULL;
	}

	blob->id = id;
	blob->bs = bs;

	blob->parent_id = SPDK_BLOBID_INVALID;

	blob->state = SPDK_BLOB_STATE_DIRTY;
	blob->extent_rle_found = false;
	blob->extent_table_found = false;
	blob->active.num_pages = 1;
	blob->active.pages = calloc(1, sizeof(*blob->active.pages));
	if (!blob->active.pages) {
		free(blob);
		return NULL;
	}

	blob->active.pages[0] = bs_blobid_to_page(id);

	TAILQ_INIT(&blob->xattrs);
	TAILQ_INIT(&blob->xattrs_internal);
	TAILQ_INIT(&blob->pending_persists);
	TAILQ_INIT(&blob->persists_to_complete);

	return blob;
}

static void
xattrs_free(struct spdk_xattr_tailq *xattrs)
{
	struct spdk_xattr *xattr, *xattr_tmp;

	TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
		TAILQ_REMOVE(xattrs, xattr, link);
		free(xattr->name);
		free(xattr->value);
		free(xattr);
	}
}

static void
blob_free(struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(TAILQ_EMPTY(&blob->pending_persists));
	assert(TAILQ_EMPTY(&blob->persists_to_complete));

	free(blob->active.extent_pages);
	free(blob->clean.extent_pages);
	free(blob->active.clusters);
	free(blob->clean.clusters);
	free(blob->active.pages);
	free(blob->clean.pages);

	xattrs_free(&blob->xattrs);
	xattrs_free(&blob->xattrs_internal);

	if (blob->back_bs_dev) {
		blob->back_bs_dev->destroy(blob->back_bs_dev);
	}

	free(blob);
}

struct freeze_io_ctx {
	struct spdk_bs_cpl cpl;
	struct spdk_blob *blob;
};

static void
blob_io_sync(struct spdk_io_channel_iter *i)
{
	spdk_for_each_channel_continue(i, 0);
}

static void
blob_execute_queued_io(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bs_request_set *set;
	struct spdk_bs_user_op_args *args;
	spdk_bs_user_op_t *op, *tmp;

	TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
		set = (struct spdk_bs_request_set *)op;
		args = &set->u.user_op;

		if (args->blob == ctx->blob) {
			TAILQ_REMOVE(&ch->queued_io, op, link);
			bs_user_op_execute(op);
		}
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
blob_io_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);

	free(ctx);
}

static void
blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct freeze_io_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
	ctx->blob = blob;

	/* Freeze I/O on blob */
	blob->frozen_refcnt++;

	if (blob->frozen_refcnt == 1) {
		spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl);
	} else {
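		/* The freeze was already initiated by an earlier caller (refcount > 1),
		 * so no additional channel iteration is needed; complete right away. */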
393 cb_fn(cb_arg, 0); 394 free(ctx); 395 } 396 } 397 398 static void 399 blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 400 { 401 struct freeze_io_ctx *ctx; 402 403 ctx = calloc(1, sizeof(*ctx)); 404 if (!ctx) { 405 cb_fn(cb_arg, -ENOMEM); 406 return; 407 } 408 409 ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 410 ctx->cpl.u.blob_basic.cb_fn = cb_fn; 411 ctx->cpl.u.blob_basic.cb_arg = cb_arg; 412 ctx->blob = blob; 413 414 assert(blob->frozen_refcnt > 0); 415 416 blob->frozen_refcnt--; 417 418 if (blob->frozen_refcnt == 0) { 419 spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl); 420 } else { 421 cb_fn(cb_arg, 0); 422 free(ctx); 423 } 424 } 425 426 static int 427 blob_mark_clean(struct spdk_blob *blob) 428 { 429 uint32_t *extent_pages = NULL; 430 uint64_t *clusters = NULL; 431 uint32_t *pages = NULL; 432 433 assert(blob != NULL); 434 435 if (blob->active.num_extent_pages) { 436 assert(blob->active.extent_pages); 437 extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages)); 438 if (!extent_pages) { 439 return -ENOMEM; 440 } 441 memcpy(extent_pages, blob->active.extent_pages, 442 blob->active.num_extent_pages * sizeof(*extent_pages)); 443 } 444 445 if (blob->active.num_clusters) { 446 assert(blob->active.clusters); 447 clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters)); 448 if (!clusters) { 449 free(extent_pages); 450 return -ENOMEM; 451 } 452 memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); 453 } 454 455 if (blob->active.num_pages) { 456 assert(blob->active.pages); 457 pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages)); 458 if (!pages) { 459 free(extent_pages); 460 free(clusters); 461 return -ENOMEM; 462 } 463 memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 464 } 465 466 free(blob->clean.extent_pages); 467 free(blob->clean.clusters); 468 free(blob->clean.pages); 469 470 blob->clean.num_extent_pages = blob->active.num_extent_pages; 471 blob->clean.extent_pages = blob->active.extent_pages; 472 blob->clean.num_clusters = blob->active.num_clusters; 473 blob->clean.clusters = blob->active.clusters; 474 blob->clean.num_pages = blob->active.num_pages; 475 blob->clean.pages = blob->active.pages; 476 477 blob->active.extent_pages = extent_pages; 478 blob->active.clusters = clusters; 479 blob->active.pages = pages; 480 481 /* If the metadata was dirtied again while the metadata was being written to disk, 482 * we do not want to revert the DIRTY state back to CLEAN here. 
483 */ 484 if (blob->state == SPDK_BLOB_STATE_LOADING) { 485 blob->state = SPDK_BLOB_STATE_CLEAN; 486 } 487 488 return 0; 489 } 490 491 static int 492 blob_deserialize_xattr(struct spdk_blob *blob, 493 struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal) 494 { 495 struct spdk_xattr *xattr; 496 497 if (desc_xattr->length != sizeof(desc_xattr->name_length) + 498 sizeof(desc_xattr->value_length) + 499 desc_xattr->name_length + desc_xattr->value_length) { 500 return -EINVAL; 501 } 502 503 xattr = calloc(1, sizeof(*xattr)); 504 if (xattr == NULL) { 505 return -ENOMEM; 506 } 507 508 xattr->name = malloc(desc_xattr->name_length + 1); 509 if (xattr->name == NULL) { 510 free(xattr); 511 return -ENOMEM; 512 } 513 514 xattr->value = malloc(desc_xattr->value_length); 515 if (xattr->value == NULL) { 516 free(xattr->name); 517 free(xattr); 518 return -ENOMEM; 519 } 520 521 memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length); 522 xattr->name[desc_xattr->name_length] = '\0'; 523 xattr->value_len = desc_xattr->value_length; 524 memcpy(xattr->value, 525 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 526 desc_xattr->value_length); 527 528 TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link); 529 530 return 0; 531 } 532 533 534 static int 535 blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) 536 { 537 struct spdk_blob_md_descriptor *desc; 538 size_t cur_desc = 0; 539 void *tmp; 540 541 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 542 while (cur_desc < sizeof(page->descriptors)) { 543 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 544 if (desc->length == 0) { 545 /* If padding and length are 0, this terminates the page */ 546 break; 547 } 548 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 549 struct spdk_blob_md_descriptor_flags *desc_flags; 550 551 desc_flags = (struct spdk_blob_md_descriptor_flags *)desc; 552 553 if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) { 554 return -EINVAL; 555 } 556 557 if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) != 558 SPDK_BLOB_INVALID_FLAGS_MASK) { 559 return -EINVAL; 560 } 561 562 if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) != 563 SPDK_BLOB_DATA_RO_FLAGS_MASK) { 564 blob->data_ro = true; 565 blob->md_ro = true; 566 } 567 568 if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) != 569 SPDK_BLOB_MD_RO_FLAGS_MASK) { 570 blob->md_ro = true; 571 } 572 573 if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 574 blob->data_ro = true; 575 blob->md_ro = true; 576 } 577 578 blob->invalid_flags = desc_flags->invalid_flags; 579 blob->data_ro_flags = desc_flags->data_ro_flags; 580 blob->md_ro_flags = desc_flags->md_ro_flags; 581 582 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 583 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 584 unsigned int i, j; 585 unsigned int cluster_count = blob->active.num_clusters; 586 587 if (blob->extent_table_found) { 588 /* Extent Table already present in the md, 589 * both descriptors should never be at the same time. 
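			 * A blob is serialized with either EXTENT_TABLE (use_extent_table == true)
			 * or EXTENT_RLE, never both, so seeing both here means the metadata is corrupt.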
*/ 590 return -EINVAL; 591 } 592 blob->extent_rle_found = true; 593 594 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 595 596 if (desc_extent_rle->length == 0 || 597 (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) { 598 return -EINVAL; 599 } 600 601 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 602 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 603 if (desc_extent_rle->extents[i].cluster_idx != 0) { 604 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, 605 desc_extent_rle->extents[i].cluster_idx + j)) { 606 return -EINVAL; 607 } 608 } 609 cluster_count++; 610 } 611 } 612 613 if (cluster_count == 0) { 614 return -EINVAL; 615 } 616 tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters)); 617 if (tmp == NULL) { 618 return -ENOMEM; 619 } 620 blob->active.clusters = tmp; 621 blob->active.cluster_array_size = cluster_count; 622 623 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 624 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 625 if (desc_extent_rle->extents[i].cluster_idx != 0) { 626 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 627 desc_extent_rle->extents[i].cluster_idx + j); 628 } else if (spdk_blob_is_thin_provisioned(blob)) { 629 blob->active.clusters[blob->active.num_clusters++] = 0; 630 } else { 631 return -EINVAL; 632 } 633 } 634 } 635 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 636 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 637 uint32_t num_extent_pages = blob->active.num_extent_pages; 638 uint32_t i, j; 639 size_t extent_pages_length; 640 641 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 642 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 643 644 if (blob->extent_rle_found) { 645 /* This means that Extent RLE is present in MD, 646 * both should never be at the same time. */ 647 return -EINVAL; 648 } else if (blob->extent_table_found && 649 desc_extent_table->num_clusters != blob->remaining_clusters_in_et) { 650 /* Number of clusters in this ET does not match number 651 * from previously read EXTENT_TABLE. */ 652 return -EINVAL; 653 } 654 655 if (desc_extent_table->length == 0 || 656 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 657 return -EINVAL; 658 } 659 660 blob->extent_table_found = true; 661 662 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 663 num_extent_pages += desc_extent_table->extent_page[i].num_pages; 664 } 665 666 if (num_extent_pages > 0) { 667 tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t)); 668 if (tmp == NULL) { 669 return -ENOMEM; 670 } 671 blob->active.extent_pages = tmp; 672 } 673 blob->active.extent_pages_array_size = num_extent_pages; 674 675 blob->remaining_clusters_in_et = desc_extent_table->num_clusters; 676 677 /* Extent table entries contain md page numbers for extent pages. 678 * Zeroes represent unallocated extent pages, those are run-length-encoded. 
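			 * For example, extent pages {5, 0, 0, 0, 9} are stored as the
			 * (page_idx, num_pages) pairs (5, 1), (0, 3), (9, 1). Allocated pages
			 * always carry num_pages == 1, which the assert below relies on.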
679 */ 680 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 681 if (desc_extent_table->extent_page[i].page_idx != 0) { 682 assert(desc_extent_table->extent_page[i].num_pages == 1); 683 blob->active.extent_pages[blob->active.num_extent_pages++] = 684 desc_extent_table->extent_page[i].page_idx; 685 } else if (spdk_blob_is_thin_provisioned(blob)) { 686 for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) { 687 blob->active.extent_pages[blob->active.num_extent_pages++] = 0; 688 } 689 } else { 690 return -EINVAL; 691 } 692 } 693 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 694 struct spdk_blob_md_descriptor_extent_page *desc_extent; 695 unsigned int i; 696 unsigned int cluster_count = 0; 697 size_t cluster_idx_length; 698 699 if (blob->extent_rle_found) { 700 /* This means that Extent RLE is present in MD, 701 * both should never be at the same time. */ 702 return -EINVAL; 703 } 704 705 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 706 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 707 708 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 709 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 710 return -EINVAL; 711 } 712 713 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 714 if (desc_extent->cluster_idx[i] != 0) { 715 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { 716 return -EINVAL; 717 } 718 } 719 cluster_count++; 720 } 721 722 if (cluster_count == 0) { 723 return -EINVAL; 724 } 725 726 /* When reading extent pages sequentially starting cluster idx should match 727 * current size of a blob. 728 * If changed to batch reading, this check shall be removed. */ 729 if (desc_extent->start_cluster_idx != blob->active.num_clusters) { 730 return -EINVAL; 731 } 732 733 tmp = realloc(blob->active.clusters, 734 (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters)); 735 if (tmp == NULL) { 736 return -ENOMEM; 737 } 738 blob->active.clusters = tmp; 739 blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters); 740 741 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 742 if (desc_extent->cluster_idx[i] != 0) { 743 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 744 desc_extent->cluster_idx[i]); 745 } else if (spdk_blob_is_thin_provisioned(blob)) { 746 blob->active.clusters[blob->active.num_clusters++] = 0; 747 } else { 748 return -EINVAL; 749 } 750 } 751 assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters); 752 assert(blob->remaining_clusters_in_et >= cluster_count); 753 blob->remaining_clusters_in_et -= cluster_count; 754 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 755 int rc; 756 757 rc = blob_deserialize_xattr(blob, 758 (struct spdk_blob_md_descriptor_xattr *) desc, false); 759 if (rc != 0) { 760 return rc; 761 } 762 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 763 int rc; 764 765 rc = blob_deserialize_xattr(blob, 766 (struct spdk_blob_md_descriptor_xattr *) desc, true); 767 if (rc != 0) { 768 return rc; 769 } 770 } else { 771 /* Unrecognized descriptor type. Do not fail - just continue to the 772 * next descriptor. 
If this descriptor is associated with some feature 773 * defined in a newer version of blobstore, that version of blobstore 774 * should create and set an associated feature flag to specify if this 775 * blob can be loaded or not. 776 */ 777 } 778 779 /* Advance to the next descriptor */ 780 cur_desc += sizeof(*desc) + desc->length; 781 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 782 break; 783 } 784 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 785 } 786 787 return 0; 788 } 789 790 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page); 791 792 static int 793 blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob) 794 { 795 assert(blob != NULL); 796 assert(blob->state == SPDK_BLOB_STATE_LOADING); 797 798 if (bs_load_cur_extent_page_valid(extent_page) == false) { 799 return -ENOENT; 800 } 801 802 return blob_parse_page(extent_page, blob); 803 } 804 805 static int 806 blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count, 807 struct spdk_blob *blob) 808 { 809 const struct spdk_blob_md_page *page; 810 uint32_t i; 811 int rc; 812 void *tmp; 813 814 assert(page_count > 0); 815 assert(pages[0].sequence_num == 0); 816 assert(blob != NULL); 817 assert(blob->state == SPDK_BLOB_STATE_LOADING); 818 assert(blob->active.clusters == NULL); 819 820 /* The blobid provided doesn't match what's in the MD, this can 821 * happen for example if a bogus blobid is passed in through open. 822 */ 823 if (blob->id != pages[0].id) { 824 SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n", 825 blob->id, pages[0].id); 826 return -ENOENT; 827 } 828 829 tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages)); 830 if (!tmp) { 831 return -ENOMEM; 832 } 833 blob->active.pages = tmp; 834 835 blob->active.pages[0] = pages[0].id; 836 837 for (i = 1; i < page_count; i++) { 838 assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next)); 839 blob->active.pages[i] = pages[i - 1].next; 840 } 841 blob->active.num_pages = page_count; 842 843 for (i = 0; i < page_count; i++) { 844 page = &pages[i]; 845 846 assert(page->id == blob->id); 847 assert(page->sequence_num == i); 848 849 rc = blob_parse_page(page, blob); 850 if (rc != 0) { 851 return rc; 852 } 853 } 854 855 return 0; 856 } 857 858 static int 859 blob_serialize_add_page(const struct spdk_blob *blob, 860 struct spdk_blob_md_page **pages, 861 uint32_t *page_count, 862 struct spdk_blob_md_page **last_page) 863 { 864 struct spdk_blob_md_page *page, *tmp_pages; 865 866 assert(pages != NULL); 867 assert(page_count != NULL); 868 869 *last_page = NULL; 870 if (*page_count == 0) { 871 assert(*pages == NULL); 872 *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0, 873 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 874 if (*pages == NULL) { 875 return -ENOMEM; 876 } 877 *page_count = 1; 878 } else { 879 assert(*pages != NULL); 880 tmp_pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count + 1), 0); 881 if (tmp_pages == NULL) { 882 return -ENOMEM; 883 } 884 (*page_count)++; 885 *pages = tmp_pages; 886 } 887 888 page = &(*pages)[*page_count - 1]; 889 memset(page, 0, sizeof(*page)); 890 page->id = blob->id; 891 page->sequence_num = *page_count - 1; 892 page->next = SPDK_INVALID_MD_PAGE; 893 *last_page = page; 894 895 return 0; 896 } 897 898 /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor. 899 * Update required_sz on both success and failure. 
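 * Returns 0 on success, or -1 when the descriptor does not fit in buf_sz; the
 * caller is then expected to allocate a fresh metadata page and retry.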
900 * 901 */ 902 static int 903 blob_serialize_xattr(const struct spdk_xattr *xattr, 904 uint8_t *buf, size_t buf_sz, 905 size_t *required_sz, bool internal) 906 { 907 struct spdk_blob_md_descriptor_xattr *desc; 908 909 *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) + 910 strlen(xattr->name) + 911 xattr->value_len; 912 913 if (buf_sz < *required_sz) { 914 return -1; 915 } 916 917 desc = (struct spdk_blob_md_descriptor_xattr *)buf; 918 919 desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR; 920 desc->length = sizeof(desc->name_length) + 921 sizeof(desc->value_length) + 922 strlen(xattr->name) + 923 xattr->value_len; 924 desc->name_length = strlen(xattr->name); 925 desc->value_length = xattr->value_len; 926 927 memcpy(desc->name, xattr->name, desc->name_length); 928 memcpy((void *)((uintptr_t)desc->name + desc->name_length), 929 xattr->value, 930 desc->value_length); 931 932 return 0; 933 } 934 935 static void 936 blob_serialize_extent_table_entry(const struct spdk_blob *blob, 937 uint64_t start_ep, uint64_t *next_ep, 938 uint8_t **buf, size_t *remaining_sz) 939 { 940 struct spdk_blob_md_descriptor_extent_table *desc; 941 size_t cur_sz; 942 uint64_t i, et_idx; 943 uint32_t extent_page, ep_len; 944 945 /* The buffer must have room for at least num_clusters entry */ 946 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters); 947 if (*remaining_sz < cur_sz) { 948 *next_ep = start_ep; 949 return; 950 } 951 952 desc = (struct spdk_blob_md_descriptor_extent_table *)*buf; 953 desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE; 954 955 desc->num_clusters = blob->active.num_clusters; 956 957 ep_len = 1; 958 et_idx = 0; 959 for (i = start_ep; i < blob->active.num_extent_pages; i++) { 960 if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) { 961 /* If we ran out of buffer space, return */ 962 break; 963 } 964 965 extent_page = blob->active.extent_pages[i]; 966 /* Verify that next extent_page is unallocated */ 967 if (extent_page == 0 && 968 (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) { 969 ep_len++; 970 continue; 971 } 972 desc->extent_page[et_idx].page_idx = extent_page; 973 desc->extent_page[et_idx].num_pages = ep_len; 974 et_idx++; 975 976 ep_len = 1; 977 cur_sz += sizeof(desc->extent_page[et_idx]); 978 } 979 *next_ep = i; 980 981 desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx; 982 *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length; 983 *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length; 984 } 985 986 static int 987 blob_serialize_extent_table(const struct spdk_blob *blob, 988 struct spdk_blob_md_page **pages, 989 struct spdk_blob_md_page *cur_page, 990 uint32_t *page_count, uint8_t **buf, 991 size_t *remaining_sz) 992 { 993 uint64_t last_extent_page; 994 int rc; 995 996 last_extent_page = 0; 997 /* At least single extent table entry has to be always persisted. 998 * Such case occurs with num_extent_pages == 0. 
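	 * That is why the loop below uses '<=': with no extent pages it still runs
	 * once and emits an EXTENT_TABLE descriptor carrying only num_clusters.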
*/ 999 while (last_extent_page <= blob->active.num_extent_pages) { 1000 blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf, 1001 remaining_sz); 1002 1003 if (last_extent_page == blob->active.num_extent_pages) { 1004 break; 1005 } 1006 1007 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1008 if (rc < 0) { 1009 return rc; 1010 } 1011 1012 *buf = (uint8_t *)cur_page->descriptors; 1013 *remaining_sz = sizeof(cur_page->descriptors); 1014 } 1015 1016 return 0; 1017 } 1018 1019 static void 1020 blob_serialize_extent_rle(const struct spdk_blob *blob, 1021 uint64_t start_cluster, uint64_t *next_cluster, 1022 uint8_t **buf, size_t *buf_sz) 1023 { 1024 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 1025 size_t cur_sz; 1026 uint64_t i, extent_idx; 1027 uint64_t lba, lba_per_cluster, lba_count; 1028 1029 /* The buffer must have room for at least one extent */ 1030 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]); 1031 if (*buf_sz < cur_sz) { 1032 *next_cluster = start_cluster; 1033 return; 1034 } 1035 1036 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf; 1037 desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE; 1038 1039 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1040 1041 lba = blob->active.clusters[start_cluster]; 1042 lba_count = lba_per_cluster; 1043 extent_idx = 0; 1044 for (i = start_cluster + 1; i < blob->active.num_clusters; i++) { 1045 if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) { 1046 /* Run-length encode sequential non-zero LBA */ 1047 lba_count += lba_per_cluster; 1048 continue; 1049 } else if (lba == 0 && blob->active.clusters[i] == 0) { 1050 /* Run-length encode unallocated clusters */ 1051 lba_count += lba_per_cluster; 1052 continue; 1053 } 1054 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1055 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1056 extent_idx++; 1057 1058 cur_sz += sizeof(desc_extent_rle->extents[extent_idx]); 1059 1060 if (*buf_sz < cur_sz) { 1061 /* If we ran out of buffer space, return */ 1062 *next_cluster = i; 1063 break; 1064 } 1065 1066 lba = blob->active.clusters[i]; 1067 lba_count = lba_per_cluster; 1068 } 1069 1070 if (*buf_sz >= cur_sz) { 1071 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1072 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1073 extent_idx++; 1074 1075 *next_cluster = blob->active.num_clusters; 1076 } 1077 1078 desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx; 1079 *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1080 *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1081 } 1082 1083 static int 1084 blob_serialize_extents_rle(const struct spdk_blob *blob, 1085 struct spdk_blob_md_page **pages, 1086 struct spdk_blob_md_page *cur_page, 1087 uint32_t *page_count, uint8_t **buf, 1088 size_t *remaining_sz) 1089 { 1090 uint64_t last_cluster; 1091 int rc; 1092 1093 last_cluster = 0; 1094 while (last_cluster < blob->active.num_clusters) { 1095 blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz); 1096 1097 if (last_cluster == blob->active.num_clusters) { 1098 break; 1099 } 1100 1101 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1102 if (rc < 0) { 1103 return rc; 1104 } 1105 1106 *buf = (uint8_t *)cur_page->descriptors; 1107 *remaining_sz = sizeof(cur_page->descriptors); 
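		/* A new metadata page was appended above; keep encoding extent runs into
		 * it until last_cluster reaches the end of the blob. */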
1108 } 1109 1110 return 0; 1111 } 1112 1113 static void 1114 blob_serialize_extent_page(const struct spdk_blob *blob, 1115 uint64_t cluster, struct spdk_blob_md_page *page) 1116 { 1117 struct spdk_blob_md_descriptor_extent_page *desc_extent; 1118 uint64_t i, extent_idx; 1119 uint64_t lba, lba_per_cluster; 1120 uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP; 1121 1122 desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors; 1123 desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE; 1124 1125 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1126 1127 desc_extent->start_cluster_idx = start_cluster_idx; 1128 extent_idx = 0; 1129 for (i = start_cluster_idx; i < blob->active.num_clusters; i++) { 1130 lba = blob->active.clusters[i]; 1131 desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster; 1132 if (extent_idx >= SPDK_EXTENTS_PER_EP) { 1133 break; 1134 } 1135 } 1136 desc_extent->length = sizeof(desc_extent->start_cluster_idx) + 1137 sizeof(desc_extent->cluster_idx[0]) * extent_idx; 1138 } 1139 1140 static void 1141 blob_serialize_flags(const struct spdk_blob *blob, 1142 uint8_t *buf, size_t *buf_sz) 1143 { 1144 struct spdk_blob_md_descriptor_flags *desc; 1145 1146 /* 1147 * Flags get serialized first, so we should always have room for the flags 1148 * descriptor. 1149 */ 1150 assert(*buf_sz >= sizeof(*desc)); 1151 1152 desc = (struct spdk_blob_md_descriptor_flags *)buf; 1153 desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS; 1154 desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor); 1155 desc->invalid_flags = blob->invalid_flags; 1156 desc->data_ro_flags = blob->data_ro_flags; 1157 desc->md_ro_flags = blob->md_ro_flags; 1158 1159 *buf_sz -= sizeof(*desc); 1160 } 1161 1162 static int 1163 blob_serialize_xattrs(const struct spdk_blob *blob, 1164 const struct spdk_xattr_tailq *xattrs, bool internal, 1165 struct spdk_blob_md_page **pages, 1166 struct spdk_blob_md_page *cur_page, 1167 uint32_t *page_count, uint8_t **buf, 1168 size_t *remaining_sz) 1169 { 1170 const struct spdk_xattr *xattr; 1171 int rc; 1172 1173 TAILQ_FOREACH(xattr, xattrs, link) { 1174 size_t required_sz = 0; 1175 1176 rc = blob_serialize_xattr(xattr, 1177 *buf, *remaining_sz, 1178 &required_sz, internal); 1179 if (rc < 0) { 1180 /* Need to add a new page to the chain */ 1181 rc = blob_serialize_add_page(blob, pages, page_count, 1182 &cur_page); 1183 if (rc < 0) { 1184 spdk_free(*pages); 1185 *pages = NULL; 1186 *page_count = 0; 1187 return rc; 1188 } 1189 1190 *buf = (uint8_t *)cur_page->descriptors; 1191 *remaining_sz = sizeof(cur_page->descriptors); 1192 1193 /* Try again */ 1194 required_sz = 0; 1195 rc = blob_serialize_xattr(xattr, 1196 *buf, *remaining_sz, 1197 &required_sz, internal); 1198 1199 if (rc < 0) { 1200 spdk_free(*pages); 1201 *pages = NULL; 1202 *page_count = 0; 1203 return rc; 1204 } 1205 } 1206 1207 *remaining_sz -= required_sz; 1208 *buf += required_sz; 1209 } 1210 1211 return 0; 1212 } 1213 1214 static int 1215 blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages, 1216 uint32_t *page_count) 1217 { 1218 struct spdk_blob_md_page *cur_page; 1219 int rc; 1220 uint8_t *buf; 1221 size_t remaining_sz; 1222 1223 assert(pages != NULL); 1224 assert(page_count != NULL); 1225 assert(blob != NULL); 1226 assert(blob->state == SPDK_BLOB_STATE_DIRTY); 1227 1228 *pages = NULL; 1229 *page_count = 0; 1230 1231 /* A blob always has at least 1 page, even if it has no descriptors */ 1232 rc = blob_serialize_add_page(blob, 
pages, page_count, &cur_page); 1233 if (rc < 0) { 1234 return rc; 1235 } 1236 1237 buf = (uint8_t *)cur_page->descriptors; 1238 remaining_sz = sizeof(cur_page->descriptors); 1239 1240 /* Serialize flags */ 1241 blob_serialize_flags(blob, buf, &remaining_sz); 1242 buf += sizeof(struct spdk_blob_md_descriptor_flags); 1243 1244 /* Serialize xattrs */ 1245 rc = blob_serialize_xattrs(blob, &blob->xattrs, false, 1246 pages, cur_page, page_count, &buf, &remaining_sz); 1247 if (rc < 0) { 1248 return rc; 1249 } 1250 1251 /* Serialize internal xattrs */ 1252 rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true, 1253 pages, cur_page, page_count, &buf, &remaining_sz); 1254 if (rc < 0) { 1255 return rc; 1256 } 1257 1258 if (blob->use_extent_table) { 1259 /* Serialize extent table */ 1260 rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1261 } else { 1262 /* Serialize extents */ 1263 rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1264 } 1265 1266 return rc; 1267 } 1268 1269 struct spdk_blob_load_ctx { 1270 struct spdk_blob *blob; 1271 1272 struct spdk_blob_md_page *pages; 1273 uint32_t num_pages; 1274 uint32_t next_extent_page; 1275 spdk_bs_sequence_t *seq; 1276 1277 spdk_bs_sequence_cpl cb_fn; 1278 void *cb_arg; 1279 }; 1280 1281 static uint32_t 1282 blob_md_page_calc_crc(void *page) 1283 { 1284 uint32_t crc; 1285 1286 crc = BLOB_CRC32C_INITIAL; 1287 crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc); 1288 crc ^= BLOB_CRC32C_INITIAL; 1289 1290 return crc; 1291 1292 } 1293 1294 static void 1295 blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno) 1296 { 1297 struct spdk_blob *blob = ctx->blob; 1298 1299 if (bserrno == 0) { 1300 blob_mark_clean(blob); 1301 } 1302 1303 ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno); 1304 1305 /* Free the memory */ 1306 spdk_free(ctx->pages); 1307 free(ctx); 1308 } 1309 1310 static void 1311 blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno) 1312 { 1313 struct spdk_blob_load_ctx *ctx = cb_arg; 1314 struct spdk_blob *blob = ctx->blob; 1315 1316 if (bserrno == 0) { 1317 blob->back_bs_dev = bs_create_blob_bs_dev(snapshot); 1318 if (blob->back_bs_dev == NULL) { 1319 bserrno = -ENOMEM; 1320 } 1321 } 1322 if (bserrno != 0) { 1323 SPDK_ERRLOG("Snapshot fail\n"); 1324 } 1325 1326 blob_load_final(ctx, bserrno); 1327 } 1328 1329 static void blob_update_clear_method(struct spdk_blob *blob); 1330 1331 static void 1332 blob_load_backing_dev(void *cb_arg) 1333 { 1334 struct spdk_blob_load_ctx *ctx = cb_arg; 1335 struct spdk_blob *blob = ctx->blob; 1336 const void *value; 1337 size_t len; 1338 int rc; 1339 1340 if (spdk_blob_is_thin_provisioned(blob)) { 1341 rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true); 1342 if (rc == 0) { 1343 if (len != sizeof(spdk_blob_id)) { 1344 blob_load_final(ctx, -EINVAL); 1345 return; 1346 } 1347 /* open snapshot blob and continue in the callback function */ 1348 blob->parent_id = *(spdk_blob_id *)value; 1349 spdk_bs_open_blob(blob->bs, blob->parent_id, 1350 blob_load_snapshot_cpl, ctx); 1351 return; 1352 } else { 1353 /* add zeroes_dev for thin provisioned blob */ 1354 blob->back_bs_dev = bs_create_zeroes_dev(); 1355 } 1356 } else { 1357 /* standard blob */ 1358 blob->back_bs_dev = NULL; 1359 } 1360 blob_load_final(ctx, 0); 1361 } 1362 1363 static void 1364 blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1365 { 1366 struct spdk_blob_load_ctx *ctx = cb_arg; 1367 struct spdk_blob 
*blob = ctx->blob; 1368 struct spdk_blob_md_page *page; 1369 uint64_t i; 1370 uint32_t crc; 1371 uint64_t lba; 1372 void *tmp; 1373 uint64_t sz; 1374 1375 if (bserrno) { 1376 SPDK_ERRLOG("Extent page read failed: %d\n", bserrno); 1377 blob_load_final(ctx, bserrno); 1378 return; 1379 } 1380 1381 if (ctx->pages == NULL) { 1382 /* First iteration of this function, allocate buffer for single EXTENT_PAGE */ 1383 ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 1384 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 1385 if (!ctx->pages) { 1386 blob_load_final(ctx, -ENOMEM); 1387 return; 1388 } 1389 ctx->num_pages = 1; 1390 ctx->next_extent_page = 0; 1391 } else { 1392 page = &ctx->pages[0]; 1393 crc = blob_md_page_calc_crc(page); 1394 if (crc != page->crc) { 1395 blob_load_final(ctx, -EINVAL); 1396 return; 1397 } 1398 1399 if (page->next != SPDK_INVALID_MD_PAGE) { 1400 blob_load_final(ctx, -EINVAL); 1401 return; 1402 } 1403 1404 bserrno = blob_parse_extent_page(page, blob); 1405 if (bserrno) { 1406 blob_load_final(ctx, bserrno); 1407 return; 1408 } 1409 } 1410 1411 for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { 1412 if (blob->active.extent_pages[i] != 0) { 1413 /* Extent page was allocated, read and parse it. */ 1414 lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]); 1415 ctx->next_extent_page = i + 1; 1416 1417 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1418 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 1419 blob_load_cpl_extents_cpl, ctx); 1420 return; 1421 } else { 1422 /* Thin provisioned blobs can point to unallocated extent pages. 1423 * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */ 1424 1425 sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP); 1426 blob->active.num_clusters += sz; 1427 blob->remaining_clusters_in_et -= sz; 1428 1429 assert(spdk_blob_is_thin_provisioned(blob)); 1430 assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0); 1431 1432 tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); 1433 if (tmp == NULL) { 1434 blob_load_final(ctx, -ENOMEM); 1435 return; 1436 } 1437 memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0, 1438 sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size)); 1439 blob->active.clusters = tmp; 1440 blob->active.cluster_array_size = blob->active.num_clusters; 1441 } 1442 } 1443 1444 blob_load_backing_dev(ctx); 1445 } 1446 1447 static void 1448 blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1449 { 1450 struct spdk_blob_load_ctx *ctx = cb_arg; 1451 struct spdk_blob *blob = ctx->blob; 1452 struct spdk_blob_md_page *page; 1453 int rc; 1454 uint32_t crc; 1455 uint32_t current_page; 1456 1457 if (ctx->num_pages == 1) { 1458 current_page = bs_blobid_to_page(blob->id); 1459 } else { 1460 assert(ctx->num_pages != 0); 1461 page = &ctx->pages[ctx->num_pages - 2]; 1462 current_page = page->next; 1463 } 1464 1465 if (bserrno) { 1466 SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n", 1467 current_page, blob->id, bserrno); 1468 blob_load_final(ctx, bserrno); 1469 return; 1470 } 1471 1472 page = &ctx->pages[ctx->num_pages - 1]; 1473 crc = blob_md_page_calc_crc(page); 1474 if (crc != page->crc) { 1475 SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n", 1476 current_page, blob->id); 1477 blob_load_final(ctx, -EINVAL); 1478 return; 1479 } 1480 1481 if (page->next != 
SPDK_INVALID_MD_PAGE) { 1482 struct spdk_blob_md_page *tmp_pages; 1483 uint32_t next_page = page->next; 1484 uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page); 1485 1486 /* Read the next page */ 1487 tmp_pages = spdk_realloc(ctx->pages, (sizeof(*page) * (ctx->num_pages + 1)), 0); 1488 if (tmp_pages == NULL) { 1489 blob_load_final(ctx, -ENOMEM); 1490 return; 1491 } 1492 ctx->num_pages++; 1493 ctx->pages = tmp_pages; 1494 1495 bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1], 1496 next_lba, 1497 bs_byte_to_lba(blob->bs, sizeof(*page)), 1498 blob_load_cpl, ctx); 1499 return; 1500 } 1501 1502 /* Parse the pages */ 1503 rc = blob_parse(ctx->pages, ctx->num_pages, blob); 1504 if (rc) { 1505 blob_load_final(ctx, rc); 1506 return; 1507 } 1508 1509 if (blob->extent_table_found == true) { 1510 /* If EXTENT_TABLE was found, that means support for it should be enabled. */ 1511 assert(blob->extent_rle_found == false); 1512 blob->use_extent_table = true; 1513 } else { 1514 /* If EXTENT_RLE or no extent_* descriptor was found disable support 1515 * for extent table. No extent_* descriptors means that blob has length of 0 1516 * and no extent_rle descriptors were persisted for it. 1517 * EXTENT_TABLE if used, is always present in metadata regardless of length. */ 1518 blob->use_extent_table = false; 1519 } 1520 1521 /* Check the clear_method stored in metadata vs what may have been passed 1522 * via spdk_bs_open_blob_ext() and update accordingly. 1523 */ 1524 blob_update_clear_method(blob); 1525 1526 spdk_free(ctx->pages); 1527 ctx->pages = NULL; 1528 1529 if (blob->extent_table_found) { 1530 blob_load_cpl_extents_cpl(seq, ctx, 0); 1531 } else { 1532 blob_load_backing_dev(ctx); 1533 } 1534 } 1535 1536 /* Load a blob from disk given a blobid */ 1537 static void 1538 blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 1539 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 1540 { 1541 struct spdk_blob_load_ctx *ctx; 1542 struct spdk_blob_store *bs; 1543 uint32_t page_num; 1544 uint64_t lba; 1545 1546 blob_verify_md_op(blob); 1547 1548 bs = blob->bs; 1549 1550 ctx = calloc(1, sizeof(*ctx)); 1551 if (!ctx) { 1552 cb_fn(seq, cb_arg, -ENOMEM); 1553 return; 1554 } 1555 1556 ctx->blob = blob; 1557 ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0); 1558 if (!ctx->pages) { 1559 free(ctx); 1560 cb_fn(seq, cb_arg, -ENOMEM); 1561 return; 1562 } 1563 ctx->num_pages = 1; 1564 ctx->cb_fn = cb_fn; 1565 ctx->cb_arg = cb_arg; 1566 ctx->seq = seq; 1567 1568 page_num = bs_blobid_to_page(blob->id); 1569 lba = bs_md_page_to_lba(blob->bs, page_num); 1570 1571 blob->state = SPDK_BLOB_STATE_LOADING; 1572 1573 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1574 bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), 1575 blob_load_cpl, ctx); 1576 } 1577 1578 struct spdk_blob_persist_ctx { 1579 struct spdk_blob *blob; 1580 1581 struct spdk_bs_super_block *super; 1582 1583 struct spdk_blob_md_page *pages; 1584 uint32_t next_extent_page; 1585 struct spdk_blob_md_page *extent_page; 1586 1587 spdk_bs_sequence_t *seq; 1588 spdk_bs_sequence_cpl cb_fn; 1589 void *cb_arg; 1590 TAILQ_ENTRY(spdk_blob_persist_ctx) link; 1591 }; 1592 1593 static void 1594 bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba, 1595 uint64_t lba_count) 1596 { 1597 switch (ctx->blob->clear_method) { 1598 case BLOB_CLEAR_WITH_DEFAULT: 1599 case BLOB_CLEAR_WITH_UNMAP: 1600 bs_batch_unmap_dev(batch, lba, lba_count); 1601 break; 1602 case BLOB_CLEAR_WITH_WRITE_ZEROES: 1603 bs_batch_write_zeroes_dev(batch, lba, lba_count); 
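		/* Write zeroes is used when the released range must read back as zeroes;
		 * unmap (above) may leave stale data depending on the backing device. */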
1604 break; 1605 case BLOB_CLEAR_WITH_NONE: 1606 default: 1607 break; 1608 } 1609 } 1610 1611 static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx); 1612 1613 static void 1614 blob_persist_complete_cb(void *arg) 1615 { 1616 struct spdk_blob_persist_ctx *ctx = arg; 1617 1618 /* Call user callback */ 1619 ctx->cb_fn(ctx->seq, ctx->cb_arg, 0); 1620 1621 /* Free the memory */ 1622 spdk_free(ctx->pages); 1623 free(ctx); 1624 } 1625 1626 static void 1627 blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno) 1628 { 1629 struct spdk_blob_persist_ctx *next_persist, *tmp; 1630 struct spdk_blob *blob = ctx->blob; 1631 1632 if (bserrno == 0) { 1633 blob_mark_clean(blob); 1634 } 1635 1636 assert(ctx == TAILQ_FIRST(&blob->persists_to_complete)); 1637 1638 /* Complete all persists that were pending when the current persist started */ 1639 TAILQ_FOREACH_SAFE(next_persist, &blob->persists_to_complete, link, tmp) { 1640 TAILQ_REMOVE(&blob->persists_to_complete, next_persist, link); 1641 spdk_thread_send_msg(spdk_get_thread(), blob_persist_complete_cb, next_persist); 1642 } 1643 1644 if (TAILQ_EMPTY(&blob->pending_persists)) { 1645 return; 1646 } 1647 1648 /* Queue up all pending persists for completion and start blob persist with first one */ 1649 TAILQ_SWAP(&blob->persists_to_complete, &blob->pending_persists, spdk_blob_persist_ctx, link); 1650 next_persist = TAILQ_FIRST(&blob->persists_to_complete); 1651 1652 blob->state = SPDK_BLOB_STATE_DIRTY; 1653 blob_persist_check_dirty(next_persist); 1654 } 1655 1656 static void 1657 blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1658 { 1659 struct spdk_blob_persist_ctx *ctx = cb_arg; 1660 struct spdk_blob *blob = ctx->blob; 1661 struct spdk_blob_store *bs = blob->bs; 1662 size_t i; 1663 1664 if (bserrno != 0) { 1665 blob_persist_complete(seq, ctx, bserrno); 1666 return; 1667 } 1668 1669 /* Release all extent_pages that were truncated */ 1670 for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1671 /* Nothing to release if it was not allocated */ 1672 if (blob->active.extent_pages[i] != 0) { 1673 bs_release_md_page(bs, blob->active.extent_pages[i]); 1674 } 1675 } 1676 1677 if (blob->active.num_extent_pages == 0) { 1678 free(blob->active.extent_pages); 1679 blob->active.extent_pages = NULL; 1680 blob->active.extent_pages_array_size = 0; 1681 } else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) { 1682 #ifndef __clang_analyzer__ 1683 void *tmp; 1684 1685 /* scan-build really can't figure reallocs, workaround it */ 1686 tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); 1687 assert(tmp != NULL); 1688 blob->active.extent_pages = tmp; 1689 #endif 1690 blob->active.extent_pages_array_size = blob->active.num_extent_pages; 1691 } 1692 1693 blob_persist_complete(seq, ctx, bserrno); 1694 } 1695 1696 static void 1697 blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1698 { 1699 struct spdk_blob *blob = ctx->blob; 1700 struct spdk_blob_store *bs = blob->bs; 1701 size_t i; 1702 uint64_t lba; 1703 uint64_t lba_count; 1704 spdk_bs_batch_t *batch; 1705 1706 batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx); 1707 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1708 1709 /* Clear all extent_pages that were truncated */ 1710 for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1711 /* Nothing 
to clear if it was not allocated */ 1712 if (blob->active.extent_pages[i] != 0) { 1713 lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]); 1714 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1715 } 1716 } 1717 1718 bs_batch_close(batch); 1719 } 1720 1721 static void 1722 blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1723 { 1724 struct spdk_blob_persist_ctx *ctx = cb_arg; 1725 struct spdk_blob *blob = ctx->blob; 1726 struct spdk_blob_store *bs = blob->bs; 1727 size_t i; 1728 1729 if (bserrno != 0) { 1730 blob_persist_complete(seq, ctx, bserrno); 1731 return; 1732 } 1733 1734 pthread_mutex_lock(&bs->used_clusters_mutex); 1735 /* Release all clusters that were truncated */ 1736 for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1737 uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]); 1738 1739 /* Nothing to release if it was not allocated */ 1740 if (blob->active.clusters[i] != 0) { 1741 bs_release_cluster(bs, cluster_num); 1742 } 1743 } 1744 pthread_mutex_unlock(&bs->used_clusters_mutex); 1745 1746 if (blob->active.num_clusters == 0) { 1747 free(blob->active.clusters); 1748 blob->active.clusters = NULL; 1749 blob->active.cluster_array_size = 0; 1750 } else if (blob->active.num_clusters != blob->active.cluster_array_size) { 1751 #ifndef __clang_analyzer__ 1752 void *tmp; 1753 1754 /* scan-build really can't figure reallocs, workaround it */ 1755 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters); 1756 assert(tmp != NULL); 1757 blob->active.clusters = tmp; 1758 1759 #endif 1760 blob->active.cluster_array_size = blob->active.num_clusters; 1761 } 1762 1763 /* Move on to clearing extent pages */ 1764 blob_persist_clear_extents(seq, ctx); 1765 } 1766 1767 static void 1768 blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1769 { 1770 struct spdk_blob *blob = ctx->blob; 1771 struct spdk_blob_store *bs = blob->bs; 1772 spdk_bs_batch_t *batch; 1773 size_t i; 1774 uint64_t lba; 1775 uint64_t lba_count; 1776 1777 /* Clusters don't move around in blobs. The list shrinks or grows 1778 * at the end, but no changes ever occur in the middle of the list. 1779 */ 1780 1781 batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx); 1782 1783 /* Clear all clusters that were truncated */ 1784 lba = 0; 1785 lba_count = 0; 1786 for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1787 uint64_t next_lba = blob->active.clusters[i]; 1788 uint64_t next_lba_count = bs_cluster_to_lba(bs, 1); 1789 1790 if (next_lba > 0 && (lba + lba_count) == next_lba) { 1791 /* This cluster is contiguous with the previous one. */ 1792 lba_count += next_lba_count; 1793 continue; 1794 } else if (next_lba == 0) { 1795 continue; 1796 } 1797 1798 /* This cluster is not contiguous with the previous one. 
*/ 1799 1800 /* If a run of LBAs previously existing, clear them now */ 1801 if (lba_count > 0) { 1802 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1803 } 1804 1805 /* Start building the next batch */ 1806 lba = next_lba; 1807 if (next_lba > 0) { 1808 lba_count = next_lba_count; 1809 } else { 1810 lba_count = 0; 1811 } 1812 } 1813 1814 /* If we ended with a contiguous set of LBAs, clear them now */ 1815 if (lba_count > 0) { 1816 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1817 } 1818 1819 bs_batch_close(batch); 1820 } 1821 1822 static void 1823 blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1824 { 1825 struct spdk_blob_persist_ctx *ctx = cb_arg; 1826 struct spdk_blob *blob = ctx->blob; 1827 struct spdk_blob_store *bs = blob->bs; 1828 size_t i; 1829 1830 if (bserrno != 0) { 1831 blob_persist_complete(seq, ctx, bserrno); 1832 return; 1833 } 1834 1835 /* This loop starts at 1 because the first page is special and handled 1836 * below. The pages (except the first) are never written in place, 1837 * so any pages in the clean list must be zeroed. 1838 */ 1839 for (i = 1; i < blob->clean.num_pages; i++) { 1840 bs_release_md_page(bs, blob->clean.pages[i]); 1841 } 1842 1843 if (blob->active.num_pages == 0) { 1844 uint32_t page_num; 1845 1846 page_num = bs_blobid_to_page(blob->id); 1847 bs_release_md_page(bs, page_num); 1848 } 1849 1850 /* Move on to clearing clusters */ 1851 blob_persist_clear_clusters(seq, ctx); 1852 } 1853 1854 static void 1855 blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1856 { 1857 struct spdk_blob_persist_ctx *ctx = cb_arg; 1858 struct spdk_blob *blob = ctx->blob; 1859 struct spdk_blob_store *bs = blob->bs; 1860 uint64_t lba; 1861 uint64_t lba_count; 1862 spdk_bs_batch_t *batch; 1863 size_t i; 1864 1865 if (bserrno != 0) { 1866 blob_persist_complete(seq, ctx, bserrno); 1867 return; 1868 } 1869 1870 batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx); 1871 1872 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1873 1874 /* This loop starts at 1 because the first page is special and handled 1875 * below. The pages (except the first) are never written in place, 1876 * so any pages in the clean list must be zeroed. 1877 */ 1878 for (i = 1; i < blob->clean.num_pages; i++) { 1879 lba = bs_md_page_to_lba(bs, blob->clean.pages[i]); 1880 1881 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1882 } 1883 1884 /* The first page will only be zeroed if this is a delete. 
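	 * (active.num_pages == 0 is the deletion signal; blob_persist_start checks
	 * the same condition before jumping straight to cleanup.)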
*/ 1885 if (blob->active.num_pages == 0) { 1886 uint32_t page_num; 1887 1888 /* The first page in the metadata goes where the blobid indicates */ 1889 page_num = bs_blobid_to_page(blob->id); 1890 lba = bs_md_page_to_lba(bs, page_num); 1891 1892 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1893 } 1894 1895 bs_batch_close(batch); 1896 } 1897 1898 static void 1899 blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1900 { 1901 struct spdk_blob_persist_ctx *ctx = cb_arg; 1902 struct spdk_blob *blob = ctx->blob; 1903 struct spdk_blob_store *bs = blob->bs; 1904 uint64_t lba; 1905 uint32_t lba_count; 1906 struct spdk_blob_md_page *page; 1907 1908 if (bserrno != 0) { 1909 blob_persist_complete(seq, ctx, bserrno); 1910 return; 1911 } 1912 1913 if (blob->active.num_pages == 0) { 1914 /* Move on to the next step */ 1915 blob_persist_zero_pages(seq, ctx, 0); 1916 return; 1917 } 1918 1919 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1920 1921 page = &ctx->pages[0]; 1922 /* The first page in the metadata goes where the blobid indicates */ 1923 lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id)); 1924 1925 bs_sequence_write_dev(seq, page, lba, lba_count, 1926 blob_persist_zero_pages, ctx); 1927 } 1928 1929 static void 1930 blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1931 { 1932 struct spdk_blob *blob = ctx->blob; 1933 struct spdk_blob_store *bs = blob->bs; 1934 uint64_t lba; 1935 uint32_t lba_count; 1936 struct spdk_blob_md_page *page; 1937 spdk_bs_batch_t *batch; 1938 size_t i; 1939 1940 /* Clusters don't move around in blobs. The list shrinks or grows 1941 * at the end, but no changes ever occur in the middle of the list. 1942 */ 1943 1944 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1945 1946 batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx); 1947 1948 /* This starts at 1. The root page is not written until 1949 * all of the others are finished 1950 */ 1951 for (i = 1; i < blob->active.num_pages; i++) { 1952 page = &ctx->pages[i]; 1953 assert(page->sequence_num == i); 1954 1955 lba = bs_md_page_to_lba(bs, blob->active.pages[i]); 1956 1957 bs_batch_write_dev(batch, page, lba, lba_count); 1958 } 1959 1960 bs_batch_close(batch); 1961 } 1962 1963 static int 1964 blob_resize(struct spdk_blob *blob, uint64_t sz) 1965 { 1966 uint64_t i; 1967 uint64_t *tmp; 1968 uint64_t cluster; 1969 uint32_t lfmd; /* lowest free md page */ 1970 uint64_t num_clusters; 1971 uint32_t *ep_tmp; 1972 uint64_t new_num_ep = 0, current_num_ep = 0; 1973 struct spdk_blob_store *bs; 1974 1975 bs = blob->bs; 1976 1977 blob_verify_md_op(blob); 1978 1979 if (blob->active.num_clusters == sz) { 1980 return 0; 1981 } 1982 1983 if (blob->active.num_clusters < blob->active.cluster_array_size) { 1984 /* If this blob was resized to be larger, then smaller, then 1985 * larger without syncing, then the cluster array already 1986 * contains spare assigned clusters we can use. 1987 */ 1988 num_clusters = spdk_min(blob->active.cluster_array_size, 1989 sz); 1990 } else { 1991 num_clusters = blob->active.num_clusters; 1992 } 1993 1994 if (blob->use_extent_table) { 1995 /* Round up since every cluster beyond current Extent Table size, 1996 * requires new extent page. */ 1997 new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP); 1998 current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); 1999 } 2000 2001 /* Check first that we have enough clusters and md pages before we start claiming them. 
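	 * Doing the capacity check up front means an -ENOSPC return leaves the blob
	 * unchanged, with no partially claimed clusters or md pages to roll back.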
*/ 2002 if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) { 2003 if ((sz - num_clusters) > bs->num_free_clusters) { 2004 return -ENOSPC; 2005 } 2006 lfmd = 0; 2007 for (i = current_num_ep; i < new_num_ep ; i++) { 2008 lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); 2009 if (lfmd == UINT32_MAX) { 2010 /* No more free md pages. Cannot satisfy the request */ 2011 return -ENOSPC; 2012 } 2013 } 2014 } 2015 2016 if (sz > num_clusters) { 2017 /* Expand the cluster array if necessary. 2018 * We only shrink the array when persisting. 2019 */ 2020 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); 2021 if (sz > 0 && tmp == NULL) { 2022 return -ENOMEM; 2023 } 2024 memset(tmp + blob->active.cluster_array_size, 0, 2025 sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); 2026 blob->active.clusters = tmp; 2027 blob->active.cluster_array_size = sz; 2028 2029 /* Expand the extents table, only if enough clusters were added */ 2030 if (new_num_ep > current_num_ep && blob->use_extent_table) { 2031 ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); 2032 if (new_num_ep > 0 && ep_tmp == NULL) { 2033 return -ENOMEM; 2034 } 2035 memset(ep_tmp + blob->active.extent_pages_array_size, 0, 2036 sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); 2037 blob->active.extent_pages = ep_tmp; 2038 blob->active.extent_pages_array_size = new_num_ep; 2039 } 2040 } 2041 2042 blob->state = SPDK_BLOB_STATE_DIRTY; 2043 2044 if (spdk_blob_is_thin_provisioned(blob) == false) { 2045 cluster = 0; 2046 lfmd = 0; 2047 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2048 for (i = num_clusters; i < sz; i++) { 2049 bs_allocate_cluster(blob, i, &cluster, &lfmd, true); 2050 lfmd++; 2051 } 2052 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2053 } 2054 2055 blob->active.num_clusters = sz; 2056 blob->active.num_extent_pages = new_num_ep; 2057 2058 return 0; 2059 } 2060 2061 static void 2062 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) 2063 { 2064 spdk_bs_sequence_t *seq = ctx->seq; 2065 struct spdk_blob *blob = ctx->blob; 2066 struct spdk_blob_store *bs = blob->bs; 2067 uint64_t i; 2068 uint32_t page_num; 2069 void *tmp; 2070 int rc; 2071 2072 /* Generate the new metadata */ 2073 rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); 2074 if (rc < 0) { 2075 blob_persist_complete(seq, ctx, rc); 2076 return; 2077 } 2078 2079 assert(blob->active.num_pages >= 1); 2080 2081 /* Resize the cache of page indices */ 2082 tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 2083 if (!tmp) { 2084 blob_persist_complete(seq, ctx, -ENOMEM); 2085 return; 2086 } 2087 blob->active.pages = tmp; 2088 2089 /* Assign this metadata to pages. This requires two passes - 2090 * one to verify that there are enough pages and a second 2091 * to actually claim them. */ 2092 page_num = 0; 2093 /* Note that this loop starts at one. The first page location is fixed by the blobid. 
*/ 2094 for (i = 1; i < blob->active.num_pages; i++) { 2095 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2096 if (page_num == UINT32_MAX) { 2097 blob_persist_complete(seq, ctx, -ENOMEM); 2098 return; 2099 } 2100 page_num++; 2101 } 2102 2103 page_num = 0; 2104 blob->active.pages[0] = bs_blobid_to_page(blob->id); 2105 for (i = 1; i < blob->active.num_pages; i++) { 2106 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2107 ctx->pages[i - 1].next = page_num; 2108 /* Now that previous metadata page is complete, calculate the crc for it. */ 2109 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2110 blob->active.pages[i] = page_num; 2111 bs_claim_md_page(bs, page_num); 2112 SPDK_DEBUGLOG(blob, "Claiming page %u for blob %" PRIu64 "\n", page_num, blob->id); 2113 page_num++; 2114 } 2115 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2116 /* Start writing the metadata from last page to first */ 2117 blob->state = SPDK_BLOB_STATE_CLEAN; 2118 blob_persist_write_page_chain(seq, ctx); 2119 } 2120 2121 static void 2122 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2123 { 2124 struct spdk_blob_persist_ctx *ctx = cb_arg; 2125 struct spdk_blob *blob = ctx->blob; 2126 size_t i; 2127 uint32_t extent_page_id; 2128 uint32_t page_count = 0; 2129 int rc; 2130 2131 if (ctx->extent_page != NULL) { 2132 spdk_free(ctx->extent_page); 2133 ctx->extent_page = NULL; 2134 } 2135 2136 if (bserrno != 0) { 2137 blob_persist_complete(seq, ctx, bserrno); 2138 return; 2139 } 2140 2141 /* Only write out Extent Pages when blob was resized. */ 2142 for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) { 2143 extent_page_id = blob->active.extent_pages[i]; 2144 if (extent_page_id == 0) { 2145 /* No Extent Page to persist */ 2146 assert(spdk_blob_is_thin_provisioned(blob)); 2147 continue; 2148 } 2149 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); 2150 ctx->next_extent_page = i + 1; 2151 rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); 2152 if (rc < 0) { 2153 blob_persist_complete(seq, ctx, rc); 2154 return; 2155 } 2156 2157 blob->state = SPDK_BLOB_STATE_DIRTY; 2158 blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); 2159 2160 ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); 2161 2162 bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), 2163 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 2164 blob_persist_write_extent_pages, ctx); 2165 return; 2166 } 2167 2168 blob_persist_generate_new_md(ctx); 2169 } 2170 2171 static void 2172 blob_persist_start(struct spdk_blob_persist_ctx *ctx) 2173 { 2174 spdk_bs_sequence_t *seq = ctx->seq; 2175 struct spdk_blob *blob = ctx->blob; 2176 2177 if (blob->active.num_pages == 0) { 2178 /* This is the signal that the blob should be deleted. 2179 * Immediately jump to the clean up routine. 
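* (blob_persist_zero_pages writes zeroes over the blob's on-disk metadata pages and, on completion, releases them from used_md_pages and moves on to clearing the blob's clusters.)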
*/ 2180 assert(blob->clean.num_pages > 0); 2181 blob->state = SPDK_BLOB_STATE_CLEAN; 2182 blob_persist_zero_pages(seq, ctx, 0); 2183 return; 2184 2185 } 2186 2187 if (blob->clean.num_clusters < blob->active.num_clusters) { 2188 /* Blob was resized up */ 2189 assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages); 2190 ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1; 2191 } else if (blob->active.num_clusters < blob->active.cluster_array_size) { 2192 /* Blob was resized down */ 2193 assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages); 2194 ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1; 2195 } else { 2196 /* No change in size occurred */ 2197 blob_persist_generate_new_md(ctx); 2198 return; 2199 } 2200 2201 blob_persist_write_extent_pages(seq, ctx, 0); 2202 } 2203 2204 static void 2205 blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2206 { 2207 struct spdk_blob_persist_ctx *ctx = cb_arg; 2208 2209 spdk_free(ctx->super); 2210 2211 if (bserrno != 0) { 2212 blob_persist_complete(seq, ctx, bserrno); 2213 return; 2214 } 2215 2216 ctx->blob->bs->clean = 0; 2217 2218 blob_persist_start(ctx); 2219 } 2220 2221 static void bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 2222 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); 2223 2224 2225 static void 2226 blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2227 { 2228 struct spdk_blob_persist_ctx *ctx = cb_arg; 2229 2230 if (bserrno != 0) { 2231 spdk_free(ctx->super); 2232 blob_persist_complete(seq, ctx, bserrno); 2233 return; 2234 } 2235 2236 ctx->super->clean = 0; 2237 if (ctx->super->size == 0) { 2238 ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; 2239 } 2240 2241 bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); 2242 } 2243 2244 static void 2245 blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) 2246 { 2247 if (ctx->blob->bs->clean) { 2248 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 2249 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2250 if (!ctx->super) { 2251 blob_persist_complete(ctx->seq, ctx, -ENOMEM); 2252 return; 2253 } 2254 2255 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), 2256 bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), 2257 blob_persist_dirty, ctx); 2258 } else { 2259 blob_persist_start(ctx); 2260 } 2261 } 2262 2263 /* Write a blob to disk */ 2264 static void 2265 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 2266 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 2267 { 2268 struct spdk_blob_persist_ctx *ctx; 2269 2270 blob_verify_md_op(blob); 2271 2272 if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->persists_to_complete)) { 2273 cb_fn(seq, cb_arg, 0); 2274 return; 2275 } 2276 2277 ctx = calloc(1, sizeof(*ctx)); 2278 if (!ctx) { 2279 cb_fn(seq, cb_arg, -ENOMEM); 2280 return; 2281 } 2282 ctx->blob = blob; 2283 ctx->seq = seq; 2284 ctx->cb_fn = cb_fn; 2285 ctx->cb_arg = cb_arg; 2286 2287 /* Multiple blob persists can affect one another, via blob->state or 2288 * blob mutable data changes. To prevent it, queue up the persists. 
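* A persist that arrives while another one is outstanding (persists_to_complete is not empty) is parked on pending_persists and only started once the outstanding persist finishes; otherwise it is placed at the head of persists_to_complete and started right away.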
*/ 2289 if (!TAILQ_EMPTY(&blob->persists_to_complete)) { 2290 TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); 2291 return; 2292 } 2293 TAILQ_INSERT_HEAD(&blob->persists_to_complete, ctx, link); 2294 2295 blob_persist_check_dirty(ctx); 2296 } 2297 2298 struct spdk_blob_copy_cluster_ctx { 2299 struct spdk_blob *blob; 2300 uint8_t *buf; 2301 uint64_t page; 2302 uint64_t new_cluster; 2303 uint32_t new_extent_page; 2304 spdk_bs_sequence_t *seq; 2305 struct spdk_blob_md_page *new_cluster_page; 2306 }; 2307 2308 static void 2309 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) 2310 { 2311 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2312 struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; 2313 TAILQ_HEAD(, spdk_bs_request_set) requests; 2314 spdk_bs_user_op_t *op; 2315 2316 TAILQ_INIT(&requests); 2317 TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); 2318 2319 while (!TAILQ_EMPTY(&requests)) { 2320 op = TAILQ_FIRST(&requests); 2321 TAILQ_REMOVE(&requests, op, link); 2322 if (bserrno == 0) { 2323 bs_user_op_execute(op); 2324 } else { 2325 bs_user_op_abort(op, bserrno); 2326 } 2327 } 2328 2329 spdk_free(ctx->buf); 2330 free(ctx); 2331 } 2332 2333 static void 2334 blob_insert_cluster_cpl(void *cb_arg, int bserrno) 2335 { 2336 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2337 2338 if (bserrno) { 2339 if (bserrno == -EEXIST) { 2340 /* The metadata insert failed because another thread 2341 * allocated the cluster first. Free our cluster 2342 * but continue without error. */ 2343 bserrno = 0; 2344 } 2345 pthread_mutex_lock(&ctx->blob->bs->used_clusters_mutex); 2346 bs_release_cluster(ctx->blob->bs, ctx->new_cluster); 2347 pthread_mutex_unlock(&ctx->blob->bs->used_clusters_mutex); 2348 if (ctx->new_extent_page != 0) { 2349 bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); 2350 } 2351 } 2352 2353 bs_sequence_finish(ctx->seq, bserrno); 2354 } 2355 2356 static void 2357 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2358 { 2359 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2360 uint32_t cluster_number; 2361 2362 if (bserrno) { 2363 /* The write failed, so jump to the final completion handler */ 2364 bs_sequence_finish(seq, bserrno); 2365 return; 2366 } 2367 2368 cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); 2369 2370 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2371 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2372 } 2373 2374 static void 2375 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2376 { 2377 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2378 2379 if (bserrno != 0) { 2380 /* The read failed, so jump to the final completion handler */ 2381 bs_sequence_finish(seq, bserrno); 2382 return; 2383 } 2384 2385 /* Write whole cluster */ 2386 bs_sequence_write_dev(seq, ctx->buf, 2387 bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), 2388 bs_cluster_to_lba(ctx->blob->bs, 1), 2389 blob_write_copy_cpl, ctx); 2390 } 2391 2392 static void 2393 bs_allocate_and_copy_cluster(struct spdk_blob *blob, 2394 struct spdk_io_channel *_ch, 2395 uint64_t io_unit, spdk_bs_user_op_t *op) 2396 { 2397 struct spdk_bs_cpl cpl; 2398 struct spdk_bs_channel *ch; 2399 struct spdk_blob_copy_cluster_ctx *ctx; 2400 uint32_t cluster_start_page; 2401 uint32_t cluster_number; 2402 int rc; 2403 2404 ch = spdk_io_channel_get_ctx(_ch); 2405 2406 if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { 2407 /* There are already 
operations pending. Queue this user op 2408 * and return because it will be re-executed when the outstanding 2409 * cluster allocation completes. */ 2410 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2411 return; 2412 } 2413 2414 /* Round the io_unit offset down to the first page in the cluster */ 2415 cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); 2416 2417 /* Calculate which index in the metadata cluster array the corresponding 2418 * cluster is supposed to be at. */ 2419 cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); 2420 2421 ctx = calloc(1, sizeof(*ctx)); 2422 if (!ctx) { 2423 bs_user_op_abort(op, -ENOMEM); 2424 return; 2425 } 2426 2427 assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); 2428 2429 ctx->blob = blob; 2430 ctx->page = cluster_start_page; 2431 ctx->new_cluster_page = ch->new_cluster_page; 2432 memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE); 2433 2434 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2435 ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, 2436 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2437 if (!ctx->buf) { 2438 SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", 2439 blob->bs->cluster_sz); 2440 free(ctx); 2441 bs_user_op_abort(op, -ENOMEM); 2442 return; 2443 } 2444 } 2445 2446 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2447 rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, 2448 false); 2449 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2450 if (rc != 0) { 2451 spdk_free(ctx->buf); 2452 free(ctx); 2453 bs_user_op_abort(op, rc); 2454 return; 2455 } 2456 2457 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2458 cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; 2459 cpl.u.blob_basic.cb_arg = ctx; 2460 2461 ctx->seq = bs_sequence_start(_ch, &cpl); 2462 if (!ctx->seq) { 2463 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2464 bs_release_cluster(blob->bs, ctx->new_cluster); 2465 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2466 spdk_free(ctx->buf); 2467 free(ctx); 2468 bs_user_op_abort(op, -ENOMEM); 2469 return; 2470 } 2471 2472 /* Queue the user op to block other incoming operations */ 2473 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2474 2475 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2476 /* Read cluster from backing device */ 2477 bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, 2478 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2479 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), 2480 blob_write_copy, ctx); 2481 } else { 2482 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2483 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2484 } 2485 } 2486 2487 static inline bool 2488 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, 2489 uint64_t *lba, uint64_t *lba_count) 2490 { 2491 *lba_count = length; 2492 2493 if (!bs_io_unit_is_allocated(blob, io_unit)) { 2494 assert(blob->back_bs_dev != NULL); 2495 *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); 2496 *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); 2497 return false; 2498 } else { 2499 *lba = bs_blob_io_unit_to_lba(blob, io_unit); 2500 return true; 2501 } 2502 } 2503 2504 struct op_split_ctx { 2505 struct spdk_blob *blob; 2506 struct spdk_io_channel *channel; 2507 uint64_t io_unit_offset; 2508 uint64_t io_units_remaining; 2509 void *curr_payload; 2510 enum spdk_blob_op_type 
op_type; 2511 spdk_bs_sequence_t *seq; 2512 bool in_submit_ctx; 2513 bool completed_in_submit_ctx; 2514 bool done; 2515 }; 2516 2517 static void 2518 blob_request_submit_op_split_next(void *cb_arg, int bserrno) 2519 { 2520 struct op_split_ctx *ctx = cb_arg; 2521 struct spdk_blob *blob = ctx->blob; 2522 struct spdk_io_channel *ch = ctx->channel; 2523 enum spdk_blob_op_type op_type = ctx->op_type; 2524 uint8_t *buf; 2525 uint64_t offset; 2526 uint64_t length; 2527 uint64_t op_length; 2528 2529 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2530 bs_sequence_finish(ctx->seq, bserrno); 2531 if (ctx->in_submit_ctx) { 2532 /* Defer freeing of the ctx object, since it will be 2533 * accessed when this unwinds back to the submission 2534 * context. 2535 */ 2536 ctx->done = true; 2537 } else { 2538 free(ctx); 2539 } 2540 return; 2541 } 2542 2543 if (ctx->in_submit_ctx) { 2544 /* If this split operation completed in the context 2545 * of its submission, mark the flag and return immediately 2546 * to avoid recursion. 2547 */ 2548 ctx->completed_in_submit_ctx = true; 2549 return; 2550 } 2551 2552 while (true) { 2553 ctx->completed_in_submit_ctx = false; 2554 2555 offset = ctx->io_unit_offset; 2556 length = ctx->io_units_remaining; 2557 buf = ctx->curr_payload; 2558 op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, 2559 offset)); 2560 2561 /* Update length and payload for next operation */ 2562 ctx->io_units_remaining -= op_length; 2563 ctx->io_unit_offset += op_length; 2564 if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { 2565 ctx->curr_payload += op_length * blob->bs->io_unit_size; 2566 } 2567 2568 assert(!ctx->in_submit_ctx); 2569 ctx->in_submit_ctx = true; 2570 2571 switch (op_type) { 2572 case SPDK_BLOB_READ: 2573 spdk_blob_io_read(blob, ch, buf, offset, op_length, 2574 blob_request_submit_op_split_next, ctx); 2575 break; 2576 case SPDK_BLOB_WRITE: 2577 spdk_blob_io_write(blob, ch, buf, offset, op_length, 2578 blob_request_submit_op_split_next, ctx); 2579 break; 2580 case SPDK_BLOB_UNMAP: 2581 spdk_blob_io_unmap(blob, ch, offset, op_length, 2582 blob_request_submit_op_split_next, ctx); 2583 break; 2584 case SPDK_BLOB_WRITE_ZEROES: 2585 spdk_blob_io_write_zeroes(blob, ch, offset, op_length, 2586 blob_request_submit_op_split_next, ctx); 2587 break; 2588 case SPDK_BLOB_READV: 2589 case SPDK_BLOB_WRITEV: 2590 SPDK_ERRLOG("readv/writev not valid\n"); 2591 bs_sequence_finish(ctx->seq, -EINVAL); 2592 free(ctx); 2593 return; 2594 } 2595 2596 #ifndef __clang_analyzer__ 2597 /* scan-build reports a false positive around accessing the ctx here. It 2598 * forms a path that recursively calls this function, but then says 2599 * "assuming ctx->in_submit_ctx is false", when that isn't possible. 2600 * This path does free(ctx), returns to here, and reports a use-after-free 2601 * bug. Wrapping this bit of code so that scan-build doesn't see it 2602 * works around the scan-build bug. 2603 */ 2604 assert(ctx->in_submit_ctx); 2605 ctx->in_submit_ctx = false; 2606 2607 /* If the operation completed immediately, loop back and submit the 2608 * next operation. Otherwise we can return and the next split 2609 * operation will get submitted when this current operation is 2610 * later completed asynchronously.
2611 */ 2612 if (ctx->completed_in_submit_ctx) { 2613 continue; 2614 } else if (ctx->done) { 2615 free(ctx); 2616 } 2617 #endif 2618 break; 2619 } 2620 } 2621 2622 static void 2623 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, 2624 void *payload, uint64_t offset, uint64_t length, 2625 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2626 { 2627 struct op_split_ctx *ctx; 2628 spdk_bs_sequence_t *seq; 2629 struct spdk_bs_cpl cpl; 2630 2631 assert(blob != NULL); 2632 2633 ctx = calloc(1, sizeof(struct op_split_ctx)); 2634 if (ctx == NULL) { 2635 cb_fn(cb_arg, -ENOMEM); 2636 return; 2637 } 2638 2639 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2640 cpl.u.blob_basic.cb_fn = cb_fn; 2641 cpl.u.blob_basic.cb_arg = cb_arg; 2642 2643 seq = bs_sequence_start(ch, &cpl); 2644 if (!seq) { 2645 free(ctx); 2646 cb_fn(cb_arg, -ENOMEM); 2647 return; 2648 } 2649 2650 ctx->blob = blob; 2651 ctx->channel = ch; 2652 ctx->curr_payload = payload; 2653 ctx->io_unit_offset = offset; 2654 ctx->io_units_remaining = length; 2655 ctx->op_type = op_type; 2656 ctx->seq = seq; 2657 2658 blob_request_submit_op_split_next(ctx, 0); 2659 } 2660 2661 static void 2662 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, 2663 void *payload, uint64_t offset, uint64_t length, 2664 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2665 { 2666 struct spdk_bs_cpl cpl; 2667 uint64_t lba; 2668 uint64_t lba_count; 2669 bool is_allocated; 2670 2671 assert(blob != NULL); 2672 2673 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2674 cpl.u.blob_basic.cb_fn = cb_fn; 2675 cpl.u.blob_basic.cb_arg = cb_arg; 2676 2677 if (blob->frozen_refcnt) { 2678 /* This blob I/O is frozen */ 2679 spdk_bs_user_op_t *op; 2680 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); 2681 2682 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2683 if (!op) { 2684 cb_fn(cb_arg, -ENOMEM); 2685 return; 2686 } 2687 2688 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2689 2690 return; 2691 } 2692 2693 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2694 2695 switch (op_type) { 2696 case SPDK_BLOB_READ: { 2697 spdk_bs_batch_t *batch; 2698 2699 batch = bs_batch_open(_ch, &cpl); 2700 if (!batch) { 2701 cb_fn(cb_arg, -ENOMEM); 2702 return; 2703 } 2704 2705 if (is_allocated) { 2706 /* Read from the blob */ 2707 bs_batch_read_dev(batch, payload, lba, lba_count); 2708 } else { 2709 /* Read from the backing block device */ 2710 bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); 2711 } 2712 2713 bs_batch_close(batch); 2714 break; 2715 } 2716 case SPDK_BLOB_WRITE: 2717 case SPDK_BLOB_WRITE_ZEROES: { 2718 if (is_allocated) { 2719 /* Write to the blob */ 2720 spdk_bs_batch_t *batch; 2721 2722 if (lba_count == 0) { 2723 cb_fn(cb_arg, 0); 2724 return; 2725 } 2726 2727 batch = bs_batch_open(_ch, &cpl); 2728 if (!batch) { 2729 cb_fn(cb_arg, -ENOMEM); 2730 return; 2731 } 2732 2733 if (op_type == SPDK_BLOB_WRITE) { 2734 bs_batch_write_dev(batch, payload, lba, lba_count); 2735 } else { 2736 bs_batch_write_zeroes_dev(batch, lba, lba_count); 2737 } 2738 2739 bs_batch_close(batch); 2740 } else { 2741 /* Queue this operation and allocate the cluster */ 2742 spdk_bs_user_op_t *op; 2743 2744 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2745 if (!op) { 2746 cb_fn(cb_arg, -ENOMEM); 2747 return; 2748 } 2749 2750 bs_allocate_and_copy_cluster(blob, _ch, 
offset, op); 2751 } 2752 break; 2753 } 2754 case SPDK_BLOB_UNMAP: { 2755 spdk_bs_batch_t *batch; 2756 2757 batch = bs_batch_open(_ch, &cpl); 2758 if (!batch) { 2759 cb_fn(cb_arg, -ENOMEM); 2760 return; 2761 } 2762 2763 if (is_allocated) { 2764 bs_batch_unmap_dev(batch, lba, lba_count); 2765 } 2766 2767 bs_batch_close(batch); 2768 break; 2769 } 2770 case SPDK_BLOB_READV: 2771 case SPDK_BLOB_WRITEV: 2772 SPDK_ERRLOG("readv/writev not valid\n"); 2773 cb_fn(cb_arg, -EINVAL); 2774 break; 2775 } 2776 } 2777 2778 static void 2779 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2780 void *payload, uint64_t offset, uint64_t length, 2781 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2782 { 2783 assert(blob != NULL); 2784 2785 if (blob->data_ro && op_type != SPDK_BLOB_READ) { 2786 cb_fn(cb_arg, -EPERM); 2787 return; 2788 } 2789 2790 if (length == 0) { 2791 cb_fn(cb_arg, 0); 2792 return; 2793 } 2794 2795 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2796 cb_fn(cb_arg, -EINVAL); 2797 return; 2798 } 2799 if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { 2800 blob_request_submit_op_single(_channel, blob, payload, offset, length, 2801 cb_fn, cb_arg, op_type); 2802 } else { 2803 blob_request_submit_op_split(_channel, blob, payload, offset, length, 2804 cb_fn, cb_arg, op_type); 2805 } 2806 } 2807 2808 struct rw_iov_ctx { 2809 struct spdk_blob *blob; 2810 struct spdk_io_channel *channel; 2811 spdk_blob_op_complete cb_fn; 2812 void *cb_arg; 2813 bool read; 2814 int iovcnt; 2815 struct iovec *orig_iov; 2816 uint64_t io_unit_offset; 2817 uint64_t io_units_remaining; 2818 uint64_t io_units_done; 2819 struct spdk_blob_ext_io_opts *ext_io_opts; 2820 struct iovec iov[0]; 2821 }; 2822 2823 static void 2824 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2825 { 2826 assert(cb_arg == NULL); 2827 bs_sequence_finish(seq, bserrno); 2828 } 2829 2830 static void 2831 rw_iov_split_next(void *cb_arg, int bserrno) 2832 { 2833 struct rw_iov_ctx *ctx = cb_arg; 2834 struct spdk_blob *blob = ctx->blob; 2835 struct iovec *iov, *orig_iov; 2836 int iovcnt; 2837 size_t orig_iovoff; 2838 uint64_t io_units_count, io_units_to_boundary, io_unit_offset; 2839 uint64_t byte_count; 2840 2841 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2842 ctx->cb_fn(ctx->cb_arg, bserrno); 2843 free(ctx); 2844 return; 2845 } 2846 2847 io_unit_offset = ctx->io_unit_offset; 2848 io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); 2849 io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); 2850 /* 2851 * Get index and offset into the original iov array for our current position in the I/O sequence. 2852 * byte_count will keep track of how many bytes remain until orig_iov and orig_iovoff 2853 * point to the current position in the I/O sequence. 2854 */ 2855 byte_count = ctx->io_units_done * blob->bs->io_unit_size; 2856 orig_iov = &ctx->orig_iov[0]; 2857 orig_iovoff = 0; 2858 while (byte_count > 0) { 2859 if (byte_count >= orig_iov->iov_len) { 2860 byte_count -= orig_iov->iov_len; 2861 orig_iov++; 2862 } else { 2863 orig_iovoff = byte_count; 2864 byte_count = 0; 2865 } 2866 } 2867 2868 /* 2869 * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many 2870 * bytes of this next I/O remain to be accounted for in the new iov array.
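* As a purely illustrative example with hypothetical sizes: with a 4 KiB io_unit_size, orig_iov[] = { 6 KiB, 10 KiB }, io_units_done = 1 and io_units_count = 2, the walk above leaves orig_iov at the first buffer with orig_iovoff = 4 KiB, and the loop below builds iov[] = { 2 KiB from the tail of the first buffer, 6 KiB from the start of the second }, 8 KiB in total.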
2871 */ 2872 byte_count = io_units_count * blob->bs->io_unit_size; 2873 iov = &ctx->iov[0]; 2874 iovcnt = 0; 2875 while (byte_count > 0) { 2876 assert(iovcnt < ctx->iovcnt); 2877 iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); 2878 iov->iov_base = orig_iov->iov_base + orig_iovoff; 2879 byte_count -= iov->iov_len; 2880 orig_iovoff = 0; 2881 orig_iov++; 2882 iov++; 2883 iovcnt++; 2884 } 2885 2886 ctx->io_unit_offset += io_units_count; 2887 ctx->io_units_remaining -= io_units_count; 2888 ctx->io_units_done += io_units_count; 2889 iov = &ctx->iov[0]; 2890 2891 if (ctx->read) { 2892 spdk_blob_io_readv_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2893 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2894 } else { 2895 spdk_blob_io_writev_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2896 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2897 } 2898 } 2899 2900 static void 2901 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2902 struct iovec *iov, int iovcnt, 2903 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg, bool read, 2904 struct spdk_blob_ext_io_opts *ext_io_opts) 2905 { 2906 struct spdk_bs_cpl cpl; 2907 2908 assert(blob != NULL); 2909 2910 if (!read && blob->data_ro) { 2911 cb_fn(cb_arg, -EPERM); 2912 return; 2913 } 2914 2915 if (length == 0) { 2916 cb_fn(cb_arg, 0); 2917 return; 2918 } 2919 2920 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2921 cb_fn(cb_arg, -EINVAL); 2922 return; 2923 } 2924 2925 /* 2926 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having 2927 * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, 2928 * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster 2929 * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need 2930 * to allocate a separate iov array and split the I/O such that none of the resulting 2931 * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) 2932 * but since this case happens very infrequently, any performance impact will be negligible. 2933 * 2934 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs 2935 * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them 2936 * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called 2937 * when the batch was completed, to allow for freeing the memory for the iov arrays. 2938 */ 2939 if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { 2940 uint64_t lba_count; 2941 uint64_t lba; 2942 bool is_allocated; 2943 2944 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2945 cpl.u.blob_basic.cb_fn = cb_fn; 2946 cpl.u.blob_basic.cb_arg = cb_arg; 2947 2948 if (blob->frozen_refcnt) { 2949 /* This blob I/O is frozen */ 2950 enum spdk_blob_op_type op_type; 2951 spdk_bs_user_op_t *op; 2952 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); 2953 2954 op_type = read ? 
SPDK_BLOB_READV : SPDK_BLOB_WRITEV; 2955 op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); 2956 if (!op) { 2957 cb_fn(cb_arg, -ENOMEM); 2958 return; 2959 } 2960 2961 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2962 2963 return; 2964 } 2965 2966 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2967 2968 if (read) { 2969 spdk_bs_sequence_t *seq; 2970 2971 seq = bs_sequence_start(_channel, &cpl); 2972 if (!seq) { 2973 cb_fn(cb_arg, -ENOMEM); 2974 return; 2975 } 2976 2977 seq->ext_io_opts = ext_io_opts; 2978 2979 if (is_allocated) { 2980 bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2981 } else { 2982 bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, 2983 rw_iov_done, NULL); 2984 } 2985 } else { 2986 if (is_allocated) { 2987 spdk_bs_sequence_t *seq; 2988 2989 seq = bs_sequence_start(_channel, &cpl); 2990 if (!seq) { 2991 cb_fn(cb_arg, -ENOMEM); 2992 return; 2993 } 2994 2995 seq->ext_io_opts = ext_io_opts; 2996 2997 bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2998 } else { 2999 /* Queue this operation and allocate the cluster */ 3000 spdk_bs_user_op_t *op; 3001 3002 op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, 3003 length); 3004 if (!op) { 3005 cb_fn(cb_arg, -ENOMEM); 3006 return; 3007 } 3008 3009 op->ext_io_opts = ext_io_opts; 3010 3011 bs_allocate_and_copy_cluster(blob, _channel, offset, op); 3012 } 3013 } 3014 } else { 3015 struct rw_iov_ctx *ctx; 3016 3017 ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); 3018 if (ctx == NULL) { 3019 cb_fn(cb_arg, -ENOMEM); 3020 return; 3021 } 3022 3023 ctx->blob = blob; 3024 ctx->channel = _channel; 3025 ctx->cb_fn = cb_fn; 3026 ctx->cb_arg = cb_arg; 3027 ctx->read = read; 3028 ctx->orig_iov = iov; 3029 ctx->iovcnt = iovcnt; 3030 ctx->io_unit_offset = offset; 3031 ctx->io_units_remaining = length; 3032 ctx->io_units_done = 0; 3033 ctx->ext_io_opts = ext_io_opts; 3034 3035 rw_iov_split_next(ctx, 0); 3036 } 3037 } 3038 3039 static struct spdk_blob * 3040 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) 3041 { 3042 struct spdk_blob find; 3043 3044 if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { 3045 return NULL; 3046 } 3047 3048 find.id = blobid; 3049 return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find); 3050 } 3051 3052 static void 3053 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, 3054 struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) 3055 { 3056 assert(blob != NULL); 3057 *snapshot_entry = NULL; 3058 *clone_entry = NULL; 3059 3060 if (blob->parent_id == SPDK_BLOBID_INVALID) { 3061 return; 3062 } 3063 3064 TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { 3065 if ((*snapshot_entry)->id == blob->parent_id) { 3066 break; 3067 } 3068 } 3069 3070 if (*snapshot_entry != NULL) { 3071 TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { 3072 if ((*clone_entry)->id == blob->id) { 3073 break; 3074 } 3075 } 3076 3077 assert(*clone_entry != NULL); 3078 } 3079 } 3080 3081 static int 3082 bs_channel_create(void *io_device, void *ctx_buf) 3083 { 3084 struct spdk_blob_store *bs = io_device; 3085 struct spdk_bs_channel *channel = ctx_buf; 3086 struct spdk_bs_dev *dev; 3087 uint32_t max_ops = bs->max_channel_ops; 3088 uint32_t i; 3089 3090 dev = bs->dev; 3091 3092 channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); 3093 if 
(!channel->req_mem) { 3094 return -1; 3095 } 3096 3097 TAILQ_INIT(&channel->reqs); 3098 3099 for (i = 0; i < max_ops; i++) { 3100 TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); 3101 } 3102 3103 channel->bs = bs; 3104 channel->dev = dev; 3105 channel->dev_channel = dev->create_channel(dev); 3106 3107 if (!channel->dev_channel) { 3108 SPDK_ERRLOG("Failed to create device channel.\n"); 3109 free(channel->req_mem); 3110 return -1; 3111 } 3112 3113 channel->new_cluster_page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, 3114 SPDK_MALLOC_DMA); 3115 if (!channel->new_cluster_page) { 3116 SPDK_ERRLOG("Failed to allocate new cluster page\n"); 3117 free(channel->req_mem); 3118 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3119 return -1; 3120 } 3121 3122 TAILQ_INIT(&channel->need_cluster_alloc); 3123 TAILQ_INIT(&channel->queued_io); 3124 3125 return 0; 3126 } 3127 3128 static void 3129 bs_channel_destroy(void *io_device, void *ctx_buf) 3130 { 3131 struct spdk_bs_channel *channel = ctx_buf; 3132 spdk_bs_user_op_t *op; 3133 3134 while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { 3135 op = TAILQ_FIRST(&channel->need_cluster_alloc); 3136 TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); 3137 bs_user_op_abort(op, -EIO); 3138 } 3139 3140 while (!TAILQ_EMPTY(&channel->queued_io)) { 3141 op = TAILQ_FIRST(&channel->queued_io); 3142 TAILQ_REMOVE(&channel->queued_io, op, link); 3143 bs_user_op_abort(op, -EIO); 3144 } 3145 3146 free(channel->req_mem); 3147 spdk_free(channel->new_cluster_page); 3148 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3149 } 3150 3151 static void 3152 bs_dev_destroy(void *io_device) 3153 { 3154 struct spdk_blob_store *bs = io_device; 3155 struct spdk_blob *blob, *blob_tmp; 3156 3157 bs->dev->destroy(bs->dev); 3158 3159 RB_FOREACH_SAFE(blob, spdk_blob_tree, &bs->open_blobs, blob_tmp) { 3160 RB_REMOVE(spdk_blob_tree, &bs->open_blobs, blob); 3161 spdk_bit_array_clear(bs->open_blobids, blob->id); 3162 blob_free(blob); 3163 } 3164 3165 pthread_mutex_destroy(&bs->used_clusters_mutex); 3166 3167 spdk_bit_array_free(&bs->open_blobids); 3168 spdk_bit_array_free(&bs->used_blobids); 3169 spdk_bit_array_free(&bs->used_md_pages); 3170 spdk_bit_pool_free(&bs->used_clusters); 3171 /* 3172 * If this function is called for any reason except a successful unload, 3173 * the unload_cpl type will be NONE and this will be a nop. 
3174 */ 3175 bs_call_cpl(&bs->unload_cpl, bs->unload_err); 3176 3177 free(bs); 3178 } 3179 3180 static int 3181 bs_blob_list_add(struct spdk_blob *blob) 3182 { 3183 spdk_blob_id snapshot_id; 3184 struct spdk_blob_list *snapshot_entry = NULL; 3185 struct spdk_blob_list *clone_entry = NULL; 3186 3187 assert(blob != NULL); 3188 3189 snapshot_id = blob->parent_id; 3190 if (snapshot_id == SPDK_BLOBID_INVALID) { 3191 return 0; 3192 } 3193 3194 snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); 3195 if (snapshot_entry == NULL) { 3196 /* Snapshot not found */ 3197 snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); 3198 if (snapshot_entry == NULL) { 3199 return -ENOMEM; 3200 } 3201 snapshot_entry->id = snapshot_id; 3202 TAILQ_INIT(&snapshot_entry->clones); 3203 TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); 3204 } else { 3205 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 3206 if (clone_entry->id == blob->id) { 3207 break; 3208 } 3209 } 3210 } 3211 3212 if (clone_entry == NULL) { 3213 /* Clone not found */ 3214 clone_entry = calloc(1, sizeof(struct spdk_blob_list)); 3215 if (clone_entry == NULL) { 3216 return -ENOMEM; 3217 } 3218 clone_entry->id = blob->id; 3219 TAILQ_INIT(&clone_entry->clones); 3220 TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); 3221 snapshot_entry->clone_count++; 3222 } 3223 3224 return 0; 3225 } 3226 3227 static void 3228 bs_blob_list_remove(struct spdk_blob *blob) 3229 { 3230 struct spdk_blob_list *snapshot_entry = NULL; 3231 struct spdk_blob_list *clone_entry = NULL; 3232 3233 blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); 3234 3235 if (snapshot_entry == NULL) { 3236 return; 3237 } 3238 3239 blob->parent_id = SPDK_BLOBID_INVALID; 3240 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3241 free(clone_entry); 3242 3243 snapshot_entry->clone_count--; 3244 } 3245 3246 static int 3247 bs_blob_list_free(struct spdk_blob_store *bs) 3248 { 3249 struct spdk_blob_list *snapshot_entry; 3250 struct spdk_blob_list *snapshot_entry_tmp; 3251 struct spdk_blob_list *clone_entry; 3252 struct spdk_blob_list *clone_entry_tmp; 3253 3254 TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { 3255 TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { 3256 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3257 free(clone_entry); 3258 } 3259 TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); 3260 free(snapshot_entry); 3261 } 3262 3263 return 0; 3264 } 3265 3266 static void 3267 bs_free(struct spdk_blob_store *bs) 3268 { 3269 bs_blob_list_free(bs); 3270 3271 bs_unregister_md_thread(bs); 3272 spdk_io_device_unregister(bs, bs_dev_destroy); 3273 } 3274 3275 void 3276 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size) 3277 { 3278 3279 if (!opts) { 3280 SPDK_ERRLOG("opts should not be NULL\n"); 3281 return; 3282 } 3283 3284 if (!opts_size) { 3285 SPDK_ERRLOG("opts_size should not be zero value\n"); 3286 return; 3287 } 3288 3289 memset(opts, 0, opts_size); 3290 opts->opts_size = opts_size; 3291 3292 #define FIELD_OK(field) \ 3293 offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size 3294 3295 #define SET_FIELD(field, value) \ 3296 if (FIELD_OK(field)) { \ 3297 opts->field = value; \ 3298 } \ 3299 3300 SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ); 3301 SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3302 SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3303 SET_FIELD(max_channel_ops, 
SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS); 3304 SET_FIELD(clear_method, BS_CLEAR_WITH_UNMAP); 3305 3306 if (FIELD_OK(bstype)) { 3307 memset(&opts->bstype, 0, sizeof(opts->bstype)); 3308 } 3309 3310 SET_FIELD(iter_cb_fn, NULL); 3311 SET_FIELD(iter_cb_arg, NULL); 3312 SET_FIELD(force_recover, false); 3313 3314 #undef FIELD_OK 3315 #undef SET_FIELD 3316 } 3317 3318 static int 3319 bs_opts_verify(struct spdk_bs_opts *opts) 3320 { 3321 if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || 3322 opts->max_channel_ops == 0) { 3323 SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); 3324 return -1; 3325 } 3326 3327 return 0; 3328 } 3329 3330 /* START spdk_bs_load */ 3331 3332 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */ 3333 3334 struct spdk_bs_load_ctx { 3335 struct spdk_blob_store *bs; 3336 struct spdk_bs_super_block *super; 3337 3338 struct spdk_bs_md_mask *mask; 3339 bool in_page_chain; 3340 uint32_t page_index; 3341 uint32_t cur_page; 3342 struct spdk_blob_md_page *page; 3343 3344 uint64_t num_extent_pages; 3345 uint32_t *extent_page_num; 3346 struct spdk_blob_md_page *extent_pages; 3347 struct spdk_bit_array *used_clusters; 3348 3349 spdk_bs_sequence_t *seq; 3350 spdk_blob_op_with_handle_complete iter_cb_fn; 3351 void *iter_cb_arg; 3352 struct spdk_blob *blob; 3353 spdk_blob_id blobid; 3354 3355 bool force_recover; 3356 3357 /* These fields are used in the spdk_bs_dump path. */ 3358 bool dumping; 3359 FILE *fp; 3360 spdk_bs_dump_print_xattr print_xattr_fn; 3361 char xattr_name[4096]; 3362 }; 3363 3364 static int 3365 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs, 3366 struct spdk_bs_load_ctx **_ctx) 3367 { 3368 struct spdk_blob_store *bs; 3369 struct spdk_bs_load_ctx *ctx; 3370 uint64_t dev_size; 3371 int rc; 3372 3373 dev_size = dev->blocklen * dev->blockcnt; 3374 if (dev_size < opts->cluster_sz) { 3375 /* Device size cannot be smaller than cluster size of blobstore */ 3376 SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", 3377 dev_size, opts->cluster_sz); 3378 return -ENOSPC; 3379 } 3380 if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { 3381 /* Cluster size cannot be smaller than page size */ 3382 SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", 3383 opts->cluster_sz, SPDK_BS_PAGE_SIZE); 3384 return -EINVAL; 3385 } 3386 bs = calloc(1, sizeof(struct spdk_blob_store)); 3387 if (!bs) { 3388 return -ENOMEM; 3389 } 3390 3391 ctx = calloc(1, sizeof(struct spdk_bs_load_ctx)); 3392 if (!ctx) { 3393 free(bs); 3394 return -ENOMEM; 3395 } 3396 3397 ctx->bs = bs; 3398 ctx->iter_cb_fn = opts->iter_cb_fn; 3399 ctx->iter_cb_arg = opts->iter_cb_arg; 3400 ctx->force_recover = opts->force_recover; 3401 3402 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 3403 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3404 if (!ctx->super) { 3405 free(ctx); 3406 free(bs); 3407 return -ENOMEM; 3408 } 3409 3410 RB_INIT(&bs->open_blobs); 3411 TAILQ_INIT(&bs->snapshots); 3412 bs->dev = dev; 3413 bs->md_thread = spdk_get_thread(); 3414 assert(bs->md_thread != NULL); 3415 3416 /* 3417 * Do not use bs_lba_to_cluster() here since blockcnt may not be an 3418 * even multiple of the cluster size. 
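* For example, with a hypothetical geometry of blocklen = 512 and cluster_sz = 1 MiB there are 2048 blocks per cluster, so a device with blockcnt = 4097 yields total_clusters = 4097 / 2048 = 2 and the single leftover block is simply left unused.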
3419 */ 3420 bs->cluster_sz = opts->cluster_sz; 3421 bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); 3422 ctx->used_clusters = spdk_bit_array_create(bs->total_clusters); 3423 if (!ctx->used_clusters) { 3424 spdk_free(ctx->super); 3425 free(ctx); 3426 free(bs); 3427 return -ENOMEM; 3428 } 3429 3430 bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; 3431 if (spdk_u32_is_pow2(bs->pages_per_cluster)) { 3432 bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); 3433 } 3434 bs->num_free_clusters = bs->total_clusters; 3435 bs->io_unit_size = dev->blocklen; 3436 3437 bs->max_channel_ops = opts->max_channel_ops; 3438 bs->super_blob = SPDK_BLOBID_INVALID; 3439 memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); 3440 3441 /* The metadata is assumed to be at least 1 page */ 3442 bs->used_md_pages = spdk_bit_array_create(1); 3443 bs->used_blobids = spdk_bit_array_create(0); 3444 bs->open_blobids = spdk_bit_array_create(0); 3445 3446 pthread_mutex_init(&bs->used_clusters_mutex, NULL); 3447 3448 spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, 3449 sizeof(struct spdk_bs_channel), "blobstore"); 3450 rc = bs_register_md_thread(bs); 3451 if (rc == -1) { 3452 spdk_io_device_unregister(bs, NULL); 3453 pthread_mutex_destroy(&bs->used_clusters_mutex); 3454 spdk_bit_array_free(&bs->open_blobids); 3455 spdk_bit_array_free(&bs->used_blobids); 3456 spdk_bit_array_free(&bs->used_md_pages); 3457 spdk_bit_array_free(&ctx->used_clusters); 3458 spdk_free(ctx->super); 3459 free(ctx); 3460 free(bs); 3461 /* FIXME: this is a lie but don't know how to get a proper error code here */ 3462 return -ENOMEM; 3463 } 3464 3465 *_ctx = ctx; 3466 *_bs = bs; 3467 return 0; 3468 } 3469 3470 static void 3471 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) 3472 { 3473 assert(bserrno != 0); 3474 3475 spdk_free(ctx->super); 3476 bs_sequence_finish(ctx->seq, bserrno); 3477 bs_free(ctx->bs); 3478 spdk_bit_array_free(&ctx->used_clusters); 3479 free(ctx); 3480 } 3481 3482 static void 3483 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 3484 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) 3485 { 3486 /* Update the values in the super block */ 3487 super->super_blob = bs->super_blob; 3488 memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); 3489 super->crc = blob_md_page_calc_crc(super); 3490 bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), 3491 bs_byte_to_lba(bs, sizeof(*super)), 3492 cb_fn, cb_arg); 3493 } 3494 3495 static void 3496 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3497 { 3498 struct spdk_bs_load_ctx *ctx = arg; 3499 uint64_t mask_size, lba, lba_count; 3500 3501 /* Write out the used clusters mask */ 3502 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3503 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3504 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3505 if (!ctx->mask) { 3506 bs_load_ctx_fail(ctx, -ENOMEM); 3507 return; 3508 } 3509 3510 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; 3511 ctx->mask->length = ctx->bs->total_clusters; 3512 /* We could get here through the normal unload path, or through dirty 3513 * shutdown recovery. For the normal unload path, we use the mask from 3514 * the bit pool. For dirty shutdown recovery, we don't have a bit pool yet - 3515 * only the bit array from the load ctx. 
3516 */ 3517 if (ctx->bs->used_clusters) { 3518 assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters)); 3519 spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask); 3520 } else { 3521 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters)); 3522 spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask); 3523 } 3524 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3525 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3526 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3527 } 3528 3529 static void 3530 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3531 { 3532 struct spdk_bs_load_ctx *ctx = arg; 3533 uint64_t mask_size, lba, lba_count; 3534 3535 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3536 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3537 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3538 if (!ctx->mask) { 3539 bs_load_ctx_fail(ctx, -ENOMEM); 3540 return; 3541 } 3542 3543 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; 3544 ctx->mask->length = ctx->super->md_len; 3545 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); 3546 3547 spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3548 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3549 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3550 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3551 } 3552 3553 static void 3554 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3555 { 3556 struct spdk_bs_load_ctx *ctx = arg; 3557 uint64_t mask_size, lba, lba_count; 3558 3559 if (ctx->super->used_blobid_mask_len == 0) { 3560 /* 3561 * This is a pre-v3 on-disk format where the blobid mask does not get 3562 * written to disk. 
3563 */ 3564 cb_fn(seq, arg, 0); 3565 return; 3566 } 3567 3568 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3569 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3570 SPDK_MALLOC_DMA); 3571 if (!ctx->mask) { 3572 bs_load_ctx_fail(ctx, -ENOMEM); 3573 return; 3574 } 3575 3576 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS; 3577 ctx->mask->length = ctx->super->md_len; 3578 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); 3579 3580 spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask); 3581 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3582 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3583 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3584 } 3585 3586 static void 3587 blob_set_thin_provision(struct spdk_blob *blob) 3588 { 3589 blob_verify_md_op(blob); 3590 blob->invalid_flags |= SPDK_BLOB_THIN_PROV; 3591 blob->state = SPDK_BLOB_STATE_DIRTY; 3592 } 3593 3594 static void 3595 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method) 3596 { 3597 blob_verify_md_op(blob); 3598 blob->clear_method = clear_method; 3599 blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT); 3600 blob->state = SPDK_BLOB_STATE_DIRTY; 3601 } 3602 3603 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno); 3604 3605 static void 3606 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno) 3607 { 3608 struct spdk_bs_load_ctx *ctx = cb_arg; 3609 spdk_blob_id id; 3610 int64_t page_num; 3611 3612 /* Iterate to next blob (we can't use spdk_bs_iter_next function as our 3613 * last blob has been removed */ 3614 page_num = bs_blobid_to_page(ctx->blobid); 3615 page_num++; 3616 page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num); 3617 if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) { 3618 bs_load_iter(ctx, NULL, -ENOENT); 3619 return; 3620 } 3621 3622 id = bs_page_to_blobid(page_num); 3623 3624 spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx); 3625 } 3626 3627 static void 3628 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno) 3629 { 3630 struct spdk_bs_load_ctx *ctx = cb_arg; 3631 3632 if (bserrno != 0) { 3633 SPDK_ERRLOG("Failed to close corrupted blob\n"); 3634 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3635 return; 3636 } 3637 3638 spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx); 3639 } 3640 3641 static void 3642 bs_delete_corrupted_blob(void *cb_arg, int bserrno) 3643 { 3644 struct spdk_bs_load_ctx *ctx = cb_arg; 3645 uint64_t i; 3646 3647 if (bserrno != 0) { 3648 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3649 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3650 return; 3651 } 3652 3653 /* Snapshot and clone have the same copy of cluster map and extent pages 3654 * at this point. Let's clear both for snapshot now, 3655 * so that it won't be cleared for clone later when we remove snapshot. 
3656 * Also set thin provision to pass data corruption check */ 3657 for (i = 0; i < ctx->blob->active.num_clusters; i++) { 3658 ctx->blob->active.clusters[i] = 0; 3659 } 3660 for (i = 0; i < ctx->blob->active.num_extent_pages; i++) { 3661 ctx->blob->active.extent_pages[i] = 0; 3662 } 3663 3664 ctx->blob->md_ro = false; 3665 3666 blob_set_thin_provision(ctx->blob); 3667 3668 ctx->blobid = ctx->blob->id; 3669 3670 spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx); 3671 } 3672 3673 static void 3674 bs_update_corrupted_blob(void *cb_arg, int bserrno) 3675 { 3676 struct spdk_bs_load_ctx *ctx = cb_arg; 3677 3678 if (bserrno != 0) { 3679 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3680 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3681 return; 3682 } 3683 3684 ctx->blob->md_ro = false; 3685 blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true); 3686 blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true); 3687 spdk_blob_set_read_only(ctx->blob); 3688 3689 if (ctx->iter_cb_fn) { 3690 ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0); 3691 } 3692 bs_blob_list_add(ctx->blob); 3693 3694 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3695 } 3696 3697 static void 3698 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno) 3699 { 3700 struct spdk_bs_load_ctx *ctx = cb_arg; 3701 3702 if (bserrno != 0) { 3703 SPDK_ERRLOG("Failed to open clone of a corrupted blob\n"); 3704 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3705 return; 3706 } 3707 3708 if (blob->parent_id == ctx->blob->id) { 3709 /* Power failure occurred before updating clone (snapshot delete case) 3710 * or after updating clone (creating snapshot case) - keep snapshot */ 3711 spdk_blob_close(blob, bs_update_corrupted_blob, ctx); 3712 } else { 3713 /* Power failure occurred after updating clone (snapshot delete case) 3714 * or before updating clone (creating snapshot case) - remove snapshot */ 3715 spdk_blob_close(blob, bs_delete_corrupted_blob, ctx); 3716 } 3717 } 3718 3719 static void 3720 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) 3721 { 3722 struct spdk_bs_load_ctx *ctx = arg; 3723 const void *value; 3724 size_t len; 3725 int rc = 0; 3726 3727 if (bserrno == 0) { 3728 /* Examine blob if it is corrupted after power failure. Fix 3729 * the ones that can be fixed and remove any other corrupted 3730 * ones. If it is not corrupted just process it */ 3731 rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true); 3732 if (rc != 0) { 3733 rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true); 3734 if (rc != 0) { 3735 /* Not corrupted - process it and continue with iterating through blobs */ 3736 if (ctx->iter_cb_fn) { 3737 ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0); 3738 } 3739 bs_blob_list_add(blob); 3740 spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx); 3741 return; 3742 } 3743 3744 } 3745 3746 assert(len == sizeof(spdk_blob_id)); 3747 3748 ctx->blob = blob; 3749 3750 /* Open clone to check if we are able to fix this blob or should we remove it */ 3751 spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx); 3752 return; 3753 } else if (bserrno == -ENOENT) { 3754 bserrno = 0; 3755 } else { 3756 /* 3757 * This case needs to be looked at further. Same problem 3758 * exists with applications that rely on explicit blob 3759 * iteration. We should just skip the blob that failed 3760 * to load and continue on to the next one. 
3761 */ 3762 SPDK_ERRLOG("Error in iterating blobs\n"); 3763 } 3764 3765 ctx->iter_cb_fn = NULL; 3766 3767 spdk_free(ctx->super); 3768 spdk_free(ctx->mask); 3769 bs_sequence_finish(ctx->seq, bserrno); 3770 free(ctx); 3771 } 3772 3773 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); 3774 3775 static void 3776 bs_load_complete(struct spdk_bs_load_ctx *ctx) 3777 { 3778 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 3779 if (ctx->dumping) { 3780 bs_dump_read_md_page(ctx->seq, ctx); 3781 return; 3782 } 3783 spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); 3784 } 3785 3786 static void 3787 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3788 { 3789 struct spdk_bs_load_ctx *ctx = cb_arg; 3790 int rc; 3791 3792 /* The type must be correct */ 3793 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); 3794 3795 /* The length of the mask (in bits) must not be greater than 3796 * the length of the buffer (converted to bits) */ 3797 assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); 3798 3799 /* The length of the mask must be exactly equal to the size 3800 * (in pages) of the metadata region */ 3801 assert(ctx->mask->length == ctx->super->md_len); 3802 3803 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length); 3804 if (rc < 0) { 3805 spdk_free(ctx->mask); 3806 bs_load_ctx_fail(ctx, rc); 3807 return; 3808 } 3809 3810 spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask); 3811 bs_load_complete(ctx); 3812 } 3813 3814 static void 3815 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3816 { 3817 struct spdk_bs_load_ctx *ctx = cb_arg; 3818 uint64_t lba, lba_count, mask_size; 3819 int rc; 3820 3821 if (bserrno != 0) { 3822 bs_load_ctx_fail(ctx, bserrno); 3823 return; 3824 } 3825 3826 /* The type must be correct */ 3827 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); 3828 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3829 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( 3830 struct spdk_blob_md_page) * 8)); 3831 /* 3832 * The length of the mask must be equal to or larger than the total number of clusters. It may be 3833 * larger than the total nubmer of clusters due to a failure spdk_bs_grow. 
3834 */ 3835 assert(ctx->mask->length >= ctx->bs->total_clusters); 3836 if (ctx->mask->length > ctx->bs->total_clusters) { 3837 SPDK_WARNLOG("Shrinking the used_clusters mask length to total_clusters\n"); 3838 ctx->mask->length = ctx->bs->total_clusters; 3839 } 3840 3841 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length); 3842 if (rc < 0) { 3843 spdk_free(ctx->mask); 3844 bs_load_ctx_fail(ctx, rc); 3845 return; 3846 } 3847 3848 spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask); 3849 ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters); 3850 assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); 3851 3852 spdk_free(ctx->mask); 3853 3854 /* Read the used blobids mask */ 3855 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3856 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3857 SPDK_MALLOC_DMA); 3858 if (!ctx->mask) { 3859 bs_load_ctx_fail(ctx, -ENOMEM); 3860 return; 3861 } 3862 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3863 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3864 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3865 bs_load_used_blobids_cpl, ctx); 3866 } 3867 3868 static void 3869 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3870 { 3871 struct spdk_bs_load_ctx *ctx = cb_arg; 3872 uint64_t lba, lba_count, mask_size; 3873 int rc; 3874 3875 if (bserrno != 0) { 3876 bs_load_ctx_fail(ctx, bserrno); 3877 return; 3878 } 3879 3880 /* The type must be correct */ 3881 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); 3882 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3883 assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * 3884 8)); 3885 /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ 3886 if (ctx->mask->length != ctx->super->md_len) { 3887 SPDK_ERRLOG("mismatched md_len in used_pages mask: " 3888 "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n", 3889 ctx->mask->length, ctx->super->md_len); 3890 assert(false); 3891 } 3892 3893 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length); 3894 if (rc < 0) { 3895 spdk_free(ctx->mask); 3896 bs_load_ctx_fail(ctx, rc); 3897 return; 3898 } 3899 3900 spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3901 spdk_free(ctx->mask); 3902 3903 /* Read the used clusters mask */ 3904 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3905 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3906 SPDK_MALLOC_DMA); 3907 if (!ctx->mask) { 3908 bs_load_ctx_fail(ctx, -ENOMEM); 3909 return; 3910 } 3911 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3912 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3913 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3914 bs_load_used_clusters_cpl, ctx); 3915 } 3916 3917 static void 3918 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) 3919 { 3920 uint64_t lba, lba_count, mask_size; 3921 3922 /* Read the used pages mask */ 3923 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3924 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3925 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3926 if (!ctx->mask) { 3927 bs_load_ctx_fail(ctx, -ENOMEM); 3928 return; 3929 } 3930 3931 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3932 lba_count = 
bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3933 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, 3934 bs_load_used_pages_cpl, ctx); 3935 } 3936 3937 static int 3938 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) 3939 { 3940 struct spdk_blob_store *bs = ctx->bs; 3941 struct spdk_blob_md_descriptor *desc; 3942 size_t cur_desc = 0; 3943 3944 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 3945 while (cur_desc < sizeof(page->descriptors)) { 3946 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 3947 if (desc->length == 0) { 3948 /* If padding and length are 0, this terminates the page */ 3949 break; 3950 } 3951 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 3952 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 3953 unsigned int i, j; 3954 unsigned int cluster_count = 0; 3955 uint32_t cluster_idx; 3956 3957 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 3958 3959 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 3960 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 3961 cluster_idx = desc_extent_rle->extents[i].cluster_idx; 3962 /* 3963 * cluster_idx = 0 means an unallocated cluster - don't mark that 3964 * in the used cluster map. 3965 */ 3966 if (cluster_idx != 0) { 3967 SPDK_NOTICELOG("Recover: cluster %" PRIu32 "\n", cluster_idx + j); 3968 spdk_bit_array_set(ctx->used_clusters, cluster_idx + j); 3969 if (bs->num_free_clusters == 0) { 3970 return -ENOSPC; 3971 } 3972 bs->num_free_clusters--; 3973 } 3974 cluster_count++; 3975 } 3976 } 3977 if (cluster_count == 0) { 3978 return -EINVAL; 3979 } 3980 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 3981 struct spdk_blob_md_descriptor_extent_page *desc_extent; 3982 uint32_t i; 3983 uint32_t cluster_count = 0; 3984 uint32_t cluster_idx; 3985 size_t cluster_idx_length; 3986 3987 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 3988 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 3989 3990 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 3991 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 3992 return -EINVAL; 3993 } 3994 3995 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 3996 cluster_idx = desc_extent->cluster_idx[i]; 3997 /* 3998 * cluster_idx = 0 means an unallocated cluster - don't mark that 3999 * in the used cluster map. 
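* Allocated entries are marked in the recovered used-cluster map and accounted against bs->num_free_clusters.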
4000 */ 4001 if (cluster_idx != 0) { 4002 if (cluster_idx < desc_extent->start_cluster_idx && 4003 cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { 4004 return -EINVAL; 4005 } 4006 spdk_bit_array_set(ctx->used_clusters, cluster_idx); 4007 if (bs->num_free_clusters == 0) { 4008 return -ENOSPC; 4009 } 4010 bs->num_free_clusters--; 4011 } 4012 cluster_count++; 4013 } 4014 4015 if (cluster_count == 0) { 4016 return -EINVAL; 4017 } 4018 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4019 /* Skip this item */ 4020 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4021 /* Skip this item */ 4022 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4023 /* Skip this item */ 4024 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4025 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 4026 uint32_t num_extent_pages = ctx->num_extent_pages; 4027 uint32_t i; 4028 size_t extent_pages_length; 4029 void *tmp; 4030 4031 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 4032 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 4033 4034 if (desc_extent_table->length == 0 || 4035 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 4036 return -EINVAL; 4037 } 4038 4039 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4040 if (desc_extent_table->extent_page[i].page_idx != 0) { 4041 if (desc_extent_table->extent_page[i].num_pages != 1) { 4042 return -EINVAL; 4043 } 4044 num_extent_pages += 1; 4045 } 4046 } 4047 4048 if (num_extent_pages > 0) { 4049 tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); 4050 if (tmp == NULL) { 4051 return -ENOMEM; 4052 } 4053 ctx->extent_page_num = tmp; 4054 4055 /* Extent table entries contain md page numbers for extent pages. 4056 * Zeroes represent unallocated extent pages, those are run-length-encoded. 4057 */ 4058 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4059 if (desc_extent_table->extent_page[i].page_idx != 0) { 4060 ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; 4061 ctx->num_extent_pages += 1; 4062 } 4063 } 4064 } 4065 } else { 4066 /* Error */ 4067 return -EINVAL; 4068 } 4069 /* Advance to the next descriptor */ 4070 cur_desc += sizeof(*desc) + desc->length; 4071 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4072 break; 4073 } 4074 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4075 } 4076 return 0; 4077 } 4078 4079 static bool 4080 bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) 4081 { 4082 uint32_t crc; 4083 struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4084 size_t desc_len; 4085 4086 crc = blob_md_page_calc_crc(page); 4087 if (crc != page->crc) { 4088 return false; 4089 } 4090 4091 /* Extent page should always be of sequence num 0. */ 4092 if (page->sequence_num != 0) { 4093 return false; 4094 } 4095 4096 /* Descriptor type must be EXTENT_PAGE. */ 4097 if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4098 return false; 4099 } 4100 4101 /* Descriptor length cannot exceed the page. */ 4102 desc_len = sizeof(*desc) + desc->length; 4103 if (desc_len > sizeof(page->descriptors)) { 4104 return false; 4105 } 4106 4107 /* It has to be the only descriptor in the page. 
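* If another descriptor header still fits in the page, it must have zero length, i.e. it terminates the page.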
*/ 4108 if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { 4109 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); 4110 if (desc->length != 0) { 4111 return false; 4112 } 4113 } 4114 4115 return true; 4116 } 4117 4118 static bool 4119 bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) 4120 { 4121 uint32_t crc; 4122 struct spdk_blob_md_page *page = ctx->page; 4123 4124 crc = blob_md_page_calc_crc(page); 4125 if (crc != page->crc) { 4126 return false; 4127 } 4128 4129 /* First page of a sequence should match the blobid. */ 4130 if (page->sequence_num == 0 && 4131 bs_page_to_blobid(ctx->cur_page) != page->id) { 4132 return false; 4133 } 4134 assert(bs_load_cur_extent_page_valid(page) == false); 4135 4136 return true; 4137 } 4138 4139 static void bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); 4140 4141 static void 4142 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4143 { 4144 struct spdk_bs_load_ctx *ctx = cb_arg; 4145 4146 if (bserrno != 0) { 4147 bs_load_ctx_fail(ctx, bserrno); 4148 return; 4149 } 4150 4151 bs_load_complete(ctx); 4152 } 4153 4154 static void 4155 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4156 { 4157 struct spdk_bs_load_ctx *ctx = cb_arg; 4158 4159 spdk_free(ctx->mask); 4160 ctx->mask = NULL; 4161 4162 if (bserrno != 0) { 4163 bs_load_ctx_fail(ctx, bserrno); 4164 return; 4165 } 4166 4167 bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); 4168 } 4169 4170 static void 4171 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4172 { 4173 struct spdk_bs_load_ctx *ctx = cb_arg; 4174 4175 spdk_free(ctx->mask); 4176 ctx->mask = NULL; 4177 4178 if (bserrno != 0) { 4179 bs_load_ctx_fail(ctx, bserrno); 4180 return; 4181 } 4182 4183 bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); 4184 } 4185 4186 static void 4187 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) 4188 { 4189 bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); 4190 } 4191 4192 static void 4193 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) 4194 { 4195 uint64_t num_md_clusters; 4196 uint64_t i; 4197 4198 ctx->in_page_chain = false; 4199 4200 do { 4201 ctx->page_index++; 4202 } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); 4203 4204 if (ctx->page_index < ctx->super->md_len) { 4205 ctx->cur_page = ctx->page_index; 4206 bs_load_replay_cur_md_page(ctx); 4207 } else { 4208 /* Claim all of the clusters used by the metadata */ 4209 num_md_clusters = spdk_divide_round_up( 4210 ctx->super->md_start + ctx->super->md_len, ctx->bs->pages_per_cluster); 4211 for (i = 0; i < num_md_clusters; i++) { 4212 spdk_bit_array_set(ctx->used_clusters, i); 4213 } 4214 ctx->bs->num_free_clusters -= num_md_clusters; 4215 spdk_free(ctx->page); 4216 bs_load_write_used_md(ctx); 4217 } 4218 } 4219 4220 static void 4221 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4222 { 4223 struct spdk_bs_load_ctx *ctx = cb_arg; 4224 uint32_t page_num; 4225 uint64_t i; 4226 4227 if (bserrno != 0) { 4228 spdk_free(ctx->extent_pages); 4229 bs_load_ctx_fail(ctx, bserrno); 4230 return; 4231 } 4232 4233 for (i = 0; i < ctx->num_extent_pages; i++) { 4234 /* Extent pages are only read when present within in chain md. 4235 * Integrity of md is not right if that page was not a valid extent page. 
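* In that case the whole load is failed with -EILSEQ instead of skipping the page.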
*/ 4236 if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { 4237 spdk_free(ctx->extent_pages); 4238 bs_load_ctx_fail(ctx, -EILSEQ); 4239 return; 4240 } 4241 4242 page_num = ctx->extent_page_num[i]; 4243 spdk_bit_array_set(ctx->bs->used_md_pages, page_num); 4244 if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { 4245 spdk_free(ctx->extent_pages); 4246 bs_load_ctx_fail(ctx, -EILSEQ); 4247 return; 4248 } 4249 } 4250 4251 spdk_free(ctx->extent_pages); 4252 free(ctx->extent_page_num); 4253 ctx->extent_page_num = NULL; 4254 ctx->num_extent_pages = 0; 4255 4256 bs_load_replay_md_chain_cpl(ctx); 4257 } 4258 4259 static void 4260 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) 4261 { 4262 spdk_bs_batch_t *batch; 4263 uint32_t page; 4264 uint64_t lba; 4265 uint64_t i; 4266 4267 ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0, 4268 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4269 if (!ctx->extent_pages) { 4270 bs_load_ctx_fail(ctx, -ENOMEM); 4271 return; 4272 } 4273 4274 batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); 4275 4276 for (i = 0; i < ctx->num_extent_pages; i++) { 4277 page = ctx->extent_page_num[i]; 4278 assert(page < ctx->super->md_len); 4279 lba = bs_md_page_to_lba(ctx->bs, page); 4280 bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, 4281 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); 4282 } 4283 4284 bs_batch_close(batch); 4285 } 4286 4287 static void 4288 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4289 { 4290 struct spdk_bs_load_ctx *ctx = cb_arg; 4291 uint32_t page_num; 4292 struct spdk_blob_md_page *page; 4293 4294 if (bserrno != 0) { 4295 bs_load_ctx_fail(ctx, bserrno); 4296 return; 4297 } 4298 4299 page_num = ctx->cur_page; 4300 page = ctx->page; 4301 if (bs_load_cur_md_page_valid(ctx) == true) { 4302 if (page->sequence_num == 0 || ctx->in_page_chain == true) { 4303 bs_claim_md_page(ctx->bs, page_num); 4304 if (page->sequence_num == 0) { 4305 SPDK_NOTICELOG("Recover: blob %" PRIu32 "\n", page_num); 4306 spdk_bit_array_set(ctx->bs->used_blobids, page_num); 4307 } 4308 if (bs_load_replay_md_parse_page(ctx, page)) { 4309 bs_load_ctx_fail(ctx, -EILSEQ); 4310 return; 4311 } 4312 if (page->next != SPDK_INVALID_MD_PAGE) { 4313 ctx->in_page_chain = true; 4314 ctx->cur_page = page->next; 4315 bs_load_replay_cur_md_page(ctx); 4316 return; 4317 } 4318 if (ctx->num_extent_pages != 0) { 4319 bs_load_replay_extent_pages(ctx); 4320 return; 4321 } 4322 } 4323 } 4324 bs_load_replay_md_chain_cpl(ctx); 4325 } 4326 4327 static void 4328 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) 4329 { 4330 uint64_t lba; 4331 4332 assert(ctx->cur_page < ctx->super->md_len); 4333 lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); 4334 bs_sequence_read_dev(ctx->seq, ctx->page, lba, 4335 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4336 bs_load_replay_md_cpl, ctx); 4337 } 4338 4339 static void 4340 bs_load_replay_md(struct spdk_bs_load_ctx *ctx) 4341 { 4342 ctx->page_index = 0; 4343 ctx->cur_page = 0; 4344 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4345 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4346 if (!ctx->page) { 4347 bs_load_ctx_fail(ctx, -ENOMEM); 4348 return; 4349 } 4350 bs_load_replay_cur_md_page(ctx); 4351 } 4352 4353 static void 4354 bs_recover(struct spdk_bs_load_ctx *ctx) 4355 { 4356 int rc; 4357 4358 SPDK_NOTICELOG("Performing recovery on blobstore\n"); 4359 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); 4360 if (rc < 0) { 4361 
bs_load_ctx_fail(ctx, -ENOMEM); 4362 return; 4363 } 4364 4365 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len); 4366 if (rc < 0) { 4367 bs_load_ctx_fail(ctx, -ENOMEM); 4368 return; 4369 } 4370 4371 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4372 if (rc < 0) { 4373 bs_load_ctx_fail(ctx, -ENOMEM); 4374 return; 4375 } 4376 4377 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); 4378 if (rc < 0) { 4379 bs_load_ctx_fail(ctx, -ENOMEM); 4380 return; 4381 } 4382 4383 ctx->bs->num_free_clusters = ctx->bs->total_clusters; 4384 bs_load_replay_md(ctx); 4385 } 4386 4387 static int 4388 bs_parse_super(struct spdk_bs_load_ctx *ctx) 4389 { 4390 int rc; 4391 4392 if (ctx->super->size == 0) { 4393 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 4394 } 4395 4396 if (ctx->super->io_unit_size == 0) { 4397 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 4398 } 4399 4400 ctx->bs->clean = 1; 4401 ctx->bs->cluster_sz = ctx->super->cluster_size; 4402 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 4403 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 4404 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 4405 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 4406 } 4407 ctx->bs->io_unit_size = ctx->super->io_unit_size; 4408 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4409 if (rc < 0) { 4410 return -ENOMEM; 4411 } 4412 ctx->bs->md_start = ctx->super->md_start; 4413 ctx->bs->md_len = ctx->super->md_len; 4414 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len); 4415 if (rc < 0) { 4416 return -ENOMEM; 4417 } 4418 4419 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 4420 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 4421 ctx->bs->super_blob = ctx->super->super_blob; 4422 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 4423 4424 return 0; 4425 } 4426 4427 static void 4428 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4429 { 4430 struct spdk_bs_load_ctx *ctx = cb_arg; 4431 uint32_t crc; 4432 int rc; 4433 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; 4434 4435 if (ctx->super->version > SPDK_BS_VERSION || 4436 ctx->super->version < SPDK_BS_INITIAL_VERSION) { 4437 bs_load_ctx_fail(ctx, -EILSEQ); 4438 return; 4439 } 4440 4441 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4442 sizeof(ctx->super->signature)) != 0) { 4443 bs_load_ctx_fail(ctx, -EILSEQ); 4444 return; 4445 } 4446 4447 crc = blob_md_page_calc_crc(ctx->super); 4448 if (crc != ctx->super->crc) { 4449 bs_load_ctx_fail(ctx, -EILSEQ); 4450 return; 4451 } 4452 4453 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4454 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n"); 4455 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4456 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n"); 4457 } else { 4458 SPDK_DEBUGLOG(blob, "Unexpected bstype\n"); 4459 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4460 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4461 bs_load_ctx_fail(ctx, -ENXIO); 4462 return; 4463 } 4464 4465 if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { 4466 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 
"\n", 4467 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); 4468 bs_load_ctx_fail(ctx, -EILSEQ); 4469 return; 4470 } 4471 4472 rc = bs_parse_super(ctx); 4473 if (rc < 0) { 4474 bs_load_ctx_fail(ctx, rc); 4475 return; 4476 } 4477 4478 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0 || ctx->force_recover) { 4479 bs_recover(ctx); 4480 } else { 4481 bs_load_read_used_pages(ctx); 4482 } 4483 } 4484 4485 static inline int 4486 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst) 4487 { 4488 4489 if (!src->opts_size) { 4490 SPDK_ERRLOG("opts_size should not be zero value\n"); 4491 return -1; 4492 } 4493 4494 #define FIELD_OK(field) \ 4495 offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size 4496 4497 #define SET_FIELD(field) \ 4498 if (FIELD_OK(field)) { \ 4499 dst->field = src->field; \ 4500 } \ 4501 4502 SET_FIELD(cluster_sz); 4503 SET_FIELD(num_md_pages); 4504 SET_FIELD(max_md_ops); 4505 SET_FIELD(max_channel_ops); 4506 SET_FIELD(clear_method); 4507 4508 if (FIELD_OK(bstype)) { 4509 memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype)); 4510 } 4511 SET_FIELD(iter_cb_fn); 4512 SET_FIELD(iter_cb_arg); 4513 SET_FIELD(force_recover); 4514 4515 dst->opts_size = src->opts_size; 4516 4517 /* You should not remove this statement, but need to update the assert statement 4518 * if you add a new field, and also add a corresponding SET_FIELD statement */ 4519 SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 72, "Incorrect size"); 4520 4521 #undef FIELD_OK 4522 #undef SET_FIELD 4523 4524 return 0; 4525 } 4526 4527 void 4528 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4529 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4530 { 4531 struct spdk_blob_store *bs; 4532 struct spdk_bs_cpl cpl; 4533 struct spdk_bs_load_ctx *ctx; 4534 struct spdk_bs_opts opts = {}; 4535 int err; 4536 4537 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev); 4538 4539 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4540 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen); 4541 dev->destroy(dev); 4542 cb_fn(cb_arg, NULL, -EINVAL); 4543 return; 4544 } 4545 4546 spdk_bs_opts_init(&opts, sizeof(opts)); 4547 if (o) { 4548 if (bs_opts_copy(o, &opts)) { 4549 return; 4550 } 4551 } 4552 4553 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { 4554 dev->destroy(dev); 4555 cb_fn(cb_arg, NULL, -EINVAL); 4556 return; 4557 } 4558 4559 err = bs_alloc(dev, &opts, &bs, &ctx); 4560 if (err) { 4561 dev->destroy(dev); 4562 cb_fn(cb_arg, NULL, err); 4563 return; 4564 } 4565 4566 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4567 cpl.u.bs_handle.cb_fn = cb_fn; 4568 cpl.u.bs_handle.cb_arg = cb_arg; 4569 cpl.u.bs_handle.bs = bs; 4570 4571 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4572 if (!ctx->seq) { 4573 spdk_free(ctx->super); 4574 free(ctx); 4575 bs_free(bs); 4576 cb_fn(cb_arg, NULL, -ENOMEM); 4577 return; 4578 } 4579 4580 /* Read the super block */ 4581 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4582 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4583 bs_load_super_cpl, ctx); 4584 } 4585 4586 /* END spdk_bs_load */ 4587 4588 /* START spdk_bs_dump */ 4589 4590 static void 4591 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) 4592 { 4593 spdk_free(ctx->super); 4594 4595 /* 4596 * We need to defer calling bs_call_cpl() until after 4597 * dev destruction, so tuck these away for later use. 
4598 */ 4599 ctx->bs->unload_err = bserrno; 4600 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4601 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4602 4603 bs_sequence_finish(seq, 0); 4604 bs_free(ctx->bs); 4605 free(ctx); 4606 } 4607 4608 static void 4609 bs_dump_print_xattr(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4610 { 4611 struct spdk_blob_md_descriptor_xattr *desc_xattr; 4612 uint32_t i; 4613 const char *type; 4614 4615 desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; 4616 4617 if (desc_xattr->length != 4618 sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + 4619 desc_xattr->name_length + desc_xattr->value_length) { 4620 } 4621 4622 memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); 4623 ctx->xattr_name[desc_xattr->name_length] = '\0'; 4624 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4625 type = "XATTR"; 4626 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4627 type = "XATTR_INTERNAL"; 4628 } else { 4629 assert(false); 4630 type = "XATTR_?"; 4631 } 4632 fprintf(ctx->fp, "%s: name = \"%s\"\n", type, ctx->xattr_name); 4633 fprintf(ctx->fp, " value = \""); 4634 ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, 4635 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 4636 desc_xattr->value_length); 4637 fprintf(ctx->fp, "\"\n"); 4638 for (i = 0; i < desc_xattr->value_length; i++) { 4639 if (i % 16 == 0) { 4640 fprintf(ctx->fp, " "); 4641 } 4642 fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); 4643 if ((i + 1) % 16 == 0) { 4644 fprintf(ctx->fp, "\n"); 4645 } 4646 } 4647 if (i % 16 != 0) { 4648 fprintf(ctx->fp, "\n"); 4649 } 4650 } 4651 4652 struct type_flag_desc { 4653 uint64_t mask; 4654 uint64_t val; 4655 const char *name; 4656 }; 4657 4658 static void 4659 bs_dump_print_type_bits(struct spdk_bs_load_ctx *ctx, uint64_t flags, 4660 struct type_flag_desc *desc, size_t numflags) 4661 { 4662 uint64_t covered = 0; 4663 size_t i; 4664 4665 for (i = 0; i < numflags; i++) { 4666 if ((desc[i].mask & flags) != desc[i].val) { 4667 continue; 4668 } 4669 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " %s", desc[i].val, desc[i].name); 4670 if (desc[i].mask != desc[i].val) { 4671 fprintf(ctx->fp, " (mask 0x%" PRIx64 " value 0x%" PRIx64 ")", 4672 desc[i].mask, desc[i].val); 4673 } 4674 fprintf(ctx->fp, "\n"); 4675 covered |= desc[i].mask; 4676 } 4677 if ((flags & ~covered) != 0) { 4678 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " Unknown\n", flags & ~covered); 4679 } 4680 } 4681 4682 static void 4683 bs_dump_print_type_flags(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4684 { 4685 struct spdk_blob_md_descriptor_flags *type_desc; 4686 #define ADD_FLAG(f) { f, f, #f } 4687 #define ADD_MASK_VAL(m, v) { m, v, #v } 4688 static struct type_flag_desc invalid[] = { 4689 ADD_FLAG(SPDK_BLOB_THIN_PROV), 4690 ADD_FLAG(SPDK_BLOB_INTERNAL_XATTR), 4691 ADD_FLAG(SPDK_BLOB_EXTENT_TABLE), 4692 }; 4693 static struct type_flag_desc data_ro[] = { 4694 ADD_FLAG(SPDK_BLOB_READ_ONLY), 4695 }; 4696 static struct type_flag_desc md_ro[] = { 4697 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_DEFAULT), 4698 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_NONE), 4699 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_UNMAP), 4700 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_WRITE_ZEROES), 4701 }; 4702 #undef ADD_FLAG 4703 #undef ADD_MASK_VAL 4704 4705 type_desc = (struct 
spdk_blob_md_descriptor_flags *)desc; 4706 fprintf(ctx->fp, "Flags:\n"); 4707 fprintf(ctx->fp, "\tinvalid: 0x%016" PRIx64 "\n", type_desc->invalid_flags); 4708 bs_dump_print_type_bits(ctx, type_desc->invalid_flags, invalid, 4709 SPDK_COUNTOF(invalid)); 4710 fprintf(ctx->fp, "\tdata_ro: 0x%016" PRIx64 "\n", type_desc->data_ro_flags); 4711 bs_dump_print_type_bits(ctx, type_desc->data_ro_flags, data_ro, 4712 SPDK_COUNTOF(data_ro)); 4713 fprintf(ctx->fp, "\t md_ro: 0x%016" PRIx64 "\n", type_desc->md_ro_flags); 4714 bs_dump_print_type_bits(ctx, type_desc->md_ro_flags, md_ro, 4715 SPDK_COUNTOF(md_ro)); 4716 } 4717 4718 static void 4719 bs_dump_print_extent_table(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4720 { 4721 struct spdk_blob_md_descriptor_extent_table *et_desc; 4722 uint64_t num_extent_pages; 4723 uint32_t et_idx; 4724 4725 et_desc = (struct spdk_blob_md_descriptor_extent_table *)desc; 4726 num_extent_pages = (et_desc->length - sizeof(et_desc->num_clusters)) / 4727 sizeof(et_desc->extent_page[0]); 4728 4729 fprintf(ctx->fp, "Extent table:\n"); 4730 for (et_idx = 0; et_idx < num_extent_pages; et_idx++) { 4731 if (et_desc->extent_page[et_idx].page_idx == 0) { 4732 /* Zeroes represent unallocated extent pages. */ 4733 continue; 4734 } 4735 fprintf(ctx->fp, "\tExtent page: %5" PRIu32 " length %3" PRIu32 4736 " at LBA %" PRIu64 "\n", et_desc->extent_page[et_idx].page_idx, 4737 et_desc->extent_page[et_idx].num_pages, 4738 bs_md_page_to_lba(ctx->bs, et_desc->extent_page[et_idx].page_idx)); 4739 } 4740 } 4741 4742 static void 4743 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx) 4744 { 4745 uint32_t page_idx = ctx->cur_page; 4746 struct spdk_blob_md_page *page = ctx->page; 4747 struct spdk_blob_md_descriptor *desc; 4748 size_t cur_desc = 0; 4749 uint32_t crc; 4750 4751 fprintf(ctx->fp, "=========\n"); 4752 fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); 4753 fprintf(ctx->fp, "Start LBA: %" PRIu64 "\n", bs_md_page_to_lba(ctx->bs, page_idx)); 4754 fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); 4755 fprintf(ctx->fp, "Sequence: %" PRIu32 "\n", page->sequence_num); 4756 if (page->next == SPDK_INVALID_MD_PAGE) { 4757 fprintf(ctx->fp, "Next: None\n"); 4758 } else { 4759 fprintf(ctx->fp, "Next: %" PRIu32 "\n", page->next); 4760 } 4761 fprintf(ctx->fp, "In used bit array%s:", ctx->super->clean ? "" : " (not clean: dubious)"); 4762 if (spdk_bit_array_get(ctx->bs->used_md_pages, page_idx)) { 4763 fprintf(ctx->fp, " md"); 4764 } 4765 if (spdk_bit_array_get(ctx->bs->used_blobids, page_idx)) { 4766 fprintf(ctx->fp, " blob"); 4767 } 4768 fprintf(ctx->fp, "\n"); 4769 4770 crc = blob_md_page_calc_crc(page); 4771 fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? 
"OK" : "Mismatch"); 4772 4773 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4774 while (cur_desc < sizeof(page->descriptors)) { 4775 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4776 if (desc->length == 0) { 4777 /* If padding and length are 0, this terminates the page */ 4778 break; 4779 } 4780 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4781 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4782 unsigned int i; 4783 4784 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4785 4786 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4787 if (desc_extent_rle->extents[i].cluster_idx != 0) { 4788 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4789 desc_extent_rle->extents[i].cluster_idx); 4790 } else { 4791 fprintf(ctx->fp, "Unallocated Extent - "); 4792 } 4793 fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); 4794 fprintf(ctx->fp, "\n"); 4795 } 4796 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4797 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4798 unsigned int i; 4799 4800 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4801 4802 for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { 4803 if (desc_extent->cluster_idx[i] != 0) { 4804 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4805 desc_extent->cluster_idx[i]); 4806 } else { 4807 fprintf(ctx->fp, "Unallocated Extent"); 4808 } 4809 fprintf(ctx->fp, "\n"); 4810 } 4811 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4812 bs_dump_print_xattr(ctx, desc); 4813 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4814 bs_dump_print_xattr(ctx, desc); 4815 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4816 bs_dump_print_type_flags(ctx, desc); 4817 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4818 bs_dump_print_extent_table(ctx, desc); 4819 } else { 4820 /* Error */ 4821 fprintf(ctx->fp, "Unknown descriptor type %" PRIu8 "\n", desc->type); 4822 } 4823 /* Advance to the next descriptor */ 4824 cur_desc += sizeof(*desc) + desc->length; 4825 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4826 break; 4827 } 4828 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4829 } 4830 } 4831 4832 static void 4833 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4834 { 4835 struct spdk_bs_load_ctx *ctx = cb_arg; 4836 4837 if (bserrno != 0) { 4838 bs_dump_finish(seq, ctx, bserrno); 4839 return; 4840 } 4841 4842 if (ctx->page->id != 0) { 4843 bs_dump_print_md_page(ctx); 4844 } 4845 4846 ctx->cur_page++; 4847 4848 if (ctx->cur_page < ctx->super->md_len) { 4849 bs_dump_read_md_page(seq, ctx); 4850 } else { 4851 spdk_free(ctx->page); 4852 bs_dump_finish(seq, ctx, 0); 4853 } 4854 } 4855 4856 static void 4857 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) 4858 { 4859 struct spdk_bs_load_ctx *ctx = cb_arg; 4860 uint64_t lba; 4861 4862 assert(ctx->cur_page < ctx->super->md_len); 4863 lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); 4864 bs_sequence_read_dev(seq, ctx->page, lba, 4865 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4866 bs_dump_read_md_page_cpl, ctx); 4867 } 4868 4869 static void 4870 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4871 { 4872 struct spdk_bs_load_ctx *ctx = cb_arg; 4873 int rc; 4874 4875 fprintf(ctx->fp, "Signature: \"%.8s\" ", 
ctx->super->signature); 4876 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4877 sizeof(ctx->super->signature)) != 0) { 4878 fprintf(ctx->fp, "(Mismatch)\n"); 4879 bs_dump_finish(seq, ctx, bserrno); 4880 return; 4881 } else { 4882 fprintf(ctx->fp, "(OK)\n"); 4883 } 4884 fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); 4885 fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, 4886 (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch"); 4887 fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); 4888 fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); 4889 fprintf(ctx->fp, "Super Blob ID: "); 4890 if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { 4891 fprintf(ctx->fp, "(None)\n"); 4892 } else { 4893 fprintf(ctx->fp, "0x%" PRIx64 "\n", ctx->super->super_blob); 4894 } 4895 fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); 4896 fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); 4897 fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); 4898 fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); 4899 fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); 4900 fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); 4901 fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); 4902 fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); 4903 fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); 4904 4905 ctx->cur_page = 0; 4906 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4907 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4908 if (!ctx->page) { 4909 bs_dump_finish(seq, ctx, -ENOMEM); 4910 return; 4911 } 4912 4913 rc = bs_parse_super(ctx); 4914 if (rc < 0) { 4915 bs_load_ctx_fail(ctx, rc); 4916 return; 4917 } 4918 4919 bs_load_read_used_pages(ctx); 4920 } 4921 4922 void 4923 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, 4924 spdk_bs_op_complete cb_fn, void *cb_arg) 4925 { 4926 struct spdk_blob_store *bs; 4927 struct spdk_bs_cpl cpl; 4928 struct spdk_bs_load_ctx *ctx; 4929 struct spdk_bs_opts opts = {}; 4930 int err; 4931 4932 SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev); 4933 4934 spdk_bs_opts_init(&opts, sizeof(opts)); 4935 4936 err = bs_alloc(dev, &opts, &bs, &ctx); 4937 if (err) { 4938 dev->destroy(dev); 4939 cb_fn(cb_arg, err); 4940 return; 4941 } 4942 4943 ctx->dumping = true; 4944 ctx->fp = fp; 4945 ctx->print_xattr_fn = print_xattr_fn; 4946 4947 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 4948 cpl.u.bs_basic.cb_fn = cb_fn; 4949 cpl.u.bs_basic.cb_arg = cb_arg; 4950 4951 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4952 if (!ctx->seq) { 4953 spdk_free(ctx->super); 4954 free(ctx); 4955 bs_free(bs); 4956 cb_fn(cb_arg, -ENOMEM); 4957 return; 4958 } 4959 4960 /* Read the super block */ 4961 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4962 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4963 bs_dump_super_cpl, ctx); 4964 } 4965 4966 /* END spdk_bs_dump */ 4967 4968 /* START spdk_bs_init */ 4969 4970 static void 4971 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4972 { 4973 struct spdk_bs_load_ctx *ctx = cb_arg; 4974 4975 ctx->bs->used_clusters = 
spdk_bit_pool_create_from_array(ctx->used_clusters); 4976 spdk_free(ctx->super); 4977 free(ctx); 4978 4979 bs_sequence_finish(seq, bserrno); 4980 } 4981 4982 static void 4983 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4984 { 4985 struct spdk_bs_load_ctx *ctx = cb_arg; 4986 4987 /* Write super block */ 4988 bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), 4989 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), 4990 bs_init_persist_super_cpl, ctx); 4991 } 4992 4993 void 4994 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4995 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4996 { 4997 struct spdk_bs_load_ctx *ctx; 4998 struct spdk_blob_store *bs; 4999 struct spdk_bs_cpl cpl; 5000 spdk_bs_sequence_t *seq; 5001 spdk_bs_batch_t *batch; 5002 uint64_t num_md_lba; 5003 uint64_t num_md_pages; 5004 uint64_t num_md_clusters; 5005 uint64_t max_used_cluster_mask_len; 5006 uint32_t i; 5007 struct spdk_bs_opts opts = {}; 5008 int rc; 5009 uint64_t lba, lba_count; 5010 5011 SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev); 5012 5013 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 5014 SPDK_ERRLOG("unsupported dev block length of %d\n", 5015 dev->blocklen); 5016 dev->destroy(dev); 5017 cb_fn(cb_arg, NULL, -EINVAL); 5018 return; 5019 } 5020 5021 spdk_bs_opts_init(&opts, sizeof(opts)); 5022 if (o) { 5023 if (bs_opts_copy(o, &opts)) { 5024 return; 5025 } 5026 } 5027 5028 if (bs_opts_verify(&opts) != 0) { 5029 dev->destroy(dev); 5030 cb_fn(cb_arg, NULL, -EINVAL); 5031 return; 5032 } 5033 5034 rc = bs_alloc(dev, &opts, &bs, &ctx); 5035 if (rc) { 5036 dev->destroy(dev); 5037 cb_fn(cb_arg, NULL, rc); 5038 return; 5039 } 5040 5041 if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { 5042 /* By default, allocate 1 page per cluster. 5043 * Technically, this over-allocates metadata 5044 * because more metadata will reduce the number 5045 * of usable clusters. This can be addressed with 5046 * more complex math in the future. 5047 */ 5048 bs->md_len = bs->total_clusters; 5049 } else { 5050 bs->md_len = opts.num_md_pages; 5051 } 5052 rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); 5053 if (rc < 0) { 5054 spdk_free(ctx->super); 5055 free(ctx); 5056 bs_free(bs); 5057 cb_fn(cb_arg, NULL, -ENOMEM); 5058 return; 5059 } 5060 5061 rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); 5062 if (rc < 0) { 5063 spdk_free(ctx->super); 5064 free(ctx); 5065 bs_free(bs); 5066 cb_fn(cb_arg, NULL, -ENOMEM); 5067 return; 5068 } 5069 5070 rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); 5071 if (rc < 0) { 5072 spdk_free(ctx->super); 5073 free(ctx); 5074 bs_free(bs); 5075 cb_fn(cb_arg, NULL, -ENOMEM); 5076 return; 5077 } 5078 5079 memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 5080 sizeof(ctx->super->signature)); 5081 ctx->super->version = SPDK_BS_VERSION; 5082 ctx->super->length = sizeof(*ctx->super); 5083 ctx->super->super_blob = bs->super_blob; 5084 ctx->super->clean = 0; 5085 ctx->super->cluster_size = bs->cluster_sz; 5086 ctx->super->io_unit_size = bs->io_unit_size; 5087 memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); 5088 5089 /* Calculate how many pages the metadata consumes at the front 5090 * of the disk. 5091 */ 5092 5093 /* The super block uses 1 page */ 5094 num_md_pages = 1; 5095 5096 /* The used_md_pages mask requires 1 bit per metadata page, rounded 5097 * up to the nearest page, plus a header. 
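* i.e. used_page_mask_len = ceil((sizeof(struct spdk_bs_md_mask) + ceil(md_len / 8)) / SPDK_BS_PAGE_SIZE) pages.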
5098 */ 5099 ctx->super->used_page_mask_start = num_md_pages; 5100 ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5101 spdk_divide_round_up(bs->md_len, 8), 5102 SPDK_BS_PAGE_SIZE); 5103 num_md_pages += ctx->super->used_page_mask_len; 5104 5105 /* The used_clusters mask requires 1 bit per cluster, rounded 5106 * up to the nearest page, plus a header. 5107 */ 5108 ctx->super->used_cluster_mask_start = num_md_pages; 5109 ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5110 spdk_divide_round_up(bs->total_clusters, 8), 5111 SPDK_BS_PAGE_SIZE); 5112 /* The blobstore might be extended, then the used_cluster bitmap will need more space. 5113 * Here we calculate the max clusters we can support according to the 5114 * num_md_pages (bs->md_len). 5115 */ 5116 max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5117 spdk_divide_round_up(bs->md_len, 8), 5118 SPDK_BS_PAGE_SIZE); 5119 max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len, 5120 ctx->super->used_cluster_mask_len); 5121 num_md_pages += max_used_cluster_mask_len; 5122 5123 /* The used_blobids mask requires 1 bit per metadata page, rounded 5124 * up to the nearest page, plus a header. 5125 */ 5126 ctx->super->used_blobid_mask_start = num_md_pages; 5127 ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5128 spdk_divide_round_up(bs->md_len, 8), 5129 SPDK_BS_PAGE_SIZE); 5130 num_md_pages += ctx->super->used_blobid_mask_len; 5131 5132 /* The metadata region size was chosen above */ 5133 ctx->super->md_start = bs->md_start = num_md_pages; 5134 ctx->super->md_len = bs->md_len; 5135 num_md_pages += bs->md_len; 5136 5137 num_md_lba = bs_page_to_lba(bs, num_md_pages); 5138 5139 ctx->super->size = dev->blockcnt * dev->blocklen; 5140 5141 ctx->super->crc = blob_md_page_calc_crc(ctx->super); 5142 5143 num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); 5144 if (num_md_clusters > bs->total_clusters) { 5145 SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " 5146 "please decrease number of pages reserved for metadata " 5147 "or increase cluster size.\n"); 5148 spdk_free(ctx->super); 5149 spdk_bit_array_free(&ctx->used_clusters); 5150 free(ctx); 5151 bs_free(bs); 5152 cb_fn(cb_arg, NULL, -ENOMEM); 5153 return; 5154 } 5155 /* Claim all of the clusters used by the metadata */ 5156 for (i = 0; i < num_md_clusters; i++) { 5157 spdk_bit_array_set(ctx->used_clusters, i); 5158 } 5159 5160 bs->num_free_clusters -= num_md_clusters; 5161 bs->total_data_clusters = bs->num_free_clusters; 5162 5163 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 5164 cpl.u.bs_handle.cb_fn = cb_fn; 5165 cpl.u.bs_handle.cb_arg = cb_arg; 5166 cpl.u.bs_handle.bs = bs; 5167 5168 seq = bs_sequence_start(bs->md_channel, &cpl); 5169 if (!seq) { 5170 spdk_free(ctx->super); 5171 free(ctx); 5172 bs_free(bs); 5173 cb_fn(cb_arg, NULL, -ENOMEM); 5174 return; 5175 } 5176 5177 batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); 5178 5179 /* Clear metadata space */ 5180 bs_batch_write_zeroes_dev(batch, 0, num_md_lba); 5181 5182 lba = num_md_lba; 5183 lba_count = ctx->bs->dev->blockcnt - lba; 5184 switch (opts.clear_method) { 5185 case BS_CLEAR_WITH_UNMAP: 5186 /* Trim data clusters */ 5187 bs_batch_unmap_dev(batch, lba, lba_count); 5188 break; 5189 case BS_CLEAR_WITH_WRITE_ZEROES: 5190 /* Write_zeroes to data clusters */ 5191 bs_batch_write_zeroes_dev(batch, lba, lba_count); 5192 break; 5193 case 
BS_CLEAR_WITH_NONE: 5194 default: 5195 break; 5196 } 5197 5198 bs_batch_close(batch); 5199 } 5200 5201 /* END spdk_bs_init */ 5202 5203 /* START spdk_bs_destroy */ 5204 5205 static void 5206 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5207 { 5208 struct spdk_bs_load_ctx *ctx = cb_arg; 5209 struct spdk_blob_store *bs = ctx->bs; 5210 5211 /* 5212 * We need to defer calling bs_call_cpl() until after 5213 * dev destruction, so tuck these away for later use. 5214 */ 5215 bs->unload_err = bserrno; 5216 memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5217 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5218 5219 bs_sequence_finish(seq, bserrno); 5220 5221 bs_free(bs); 5222 free(ctx); 5223 } 5224 5225 void 5226 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, 5227 void *cb_arg) 5228 { 5229 struct spdk_bs_cpl cpl; 5230 spdk_bs_sequence_t *seq; 5231 struct spdk_bs_load_ctx *ctx; 5232 5233 SPDK_DEBUGLOG(blob, "Destroying blobstore\n"); 5234 5235 if (!RB_EMPTY(&bs->open_blobs)) { 5236 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5237 cb_fn(cb_arg, -EBUSY); 5238 return; 5239 } 5240 5241 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5242 cpl.u.bs_basic.cb_fn = cb_fn; 5243 cpl.u.bs_basic.cb_arg = cb_arg; 5244 5245 ctx = calloc(1, sizeof(*ctx)); 5246 if (!ctx) { 5247 cb_fn(cb_arg, -ENOMEM); 5248 return; 5249 } 5250 5251 ctx->bs = bs; 5252 5253 seq = bs_sequence_start(bs->md_channel, &cpl); 5254 if (!seq) { 5255 free(ctx); 5256 cb_fn(cb_arg, -ENOMEM); 5257 return; 5258 } 5259 5260 /* Write zeroes to the super block */ 5261 bs_sequence_write_zeroes_dev(seq, 5262 bs_page_to_lba(bs, 0), 5263 bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), 5264 bs_destroy_trim_cpl, ctx); 5265 } 5266 5267 /* END spdk_bs_destroy */ 5268 5269 /* START spdk_bs_unload */ 5270 5271 static void 5272 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) 5273 { 5274 spdk_bs_sequence_t *seq = ctx->seq; 5275 5276 spdk_free(ctx->super); 5277 5278 /* 5279 * We need to defer calling bs_call_cpl() until after 5280 * dev destruction, so tuck these away for later use. 
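* The saved completion fires only after bs_free() below has torn down the blobstore and destroyed the dev.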
5281 */ 5282 ctx->bs->unload_err = bserrno; 5283 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5284 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5285 5286 bs_sequence_finish(seq, bserrno); 5287 5288 bs_free(ctx->bs); 5289 free(ctx); 5290 } 5291 5292 static void 5293 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5294 { 5295 struct spdk_bs_load_ctx *ctx = cb_arg; 5296 5297 bs_unload_finish(ctx, bserrno); 5298 } 5299 5300 static void 5301 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5302 { 5303 struct spdk_bs_load_ctx *ctx = cb_arg; 5304 5305 spdk_free(ctx->mask); 5306 5307 if (bserrno != 0) { 5308 bs_unload_finish(ctx, bserrno); 5309 return; 5310 } 5311 5312 ctx->super->clean = 1; 5313 5314 bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); 5315 } 5316 5317 static void 5318 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5319 { 5320 struct spdk_bs_load_ctx *ctx = cb_arg; 5321 5322 spdk_free(ctx->mask); 5323 ctx->mask = NULL; 5324 5325 if (bserrno != 0) { 5326 bs_unload_finish(ctx, bserrno); 5327 return; 5328 } 5329 5330 bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); 5331 } 5332 5333 static void 5334 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5335 { 5336 struct spdk_bs_load_ctx *ctx = cb_arg; 5337 5338 spdk_free(ctx->mask); 5339 ctx->mask = NULL; 5340 5341 if (bserrno != 0) { 5342 bs_unload_finish(ctx, bserrno); 5343 return; 5344 } 5345 5346 bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); 5347 } 5348 5349 static void 5350 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5351 { 5352 struct spdk_bs_load_ctx *ctx = cb_arg; 5353 5354 if (bserrno != 0) { 5355 bs_unload_finish(ctx, bserrno); 5356 return; 5357 } 5358 5359 bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl); 5360 } 5361 5362 void 5363 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) 5364 { 5365 struct spdk_bs_cpl cpl; 5366 struct spdk_bs_load_ctx *ctx; 5367 5368 SPDK_DEBUGLOG(blob, "Syncing blobstore\n"); 5369 5370 if (!RB_EMPTY(&bs->open_blobs)) { 5371 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5372 cb_fn(cb_arg, -EBUSY); 5373 return; 5374 } 5375 5376 ctx = calloc(1, sizeof(*ctx)); 5377 if (!ctx) { 5378 cb_fn(cb_arg, -ENOMEM); 5379 return; 5380 } 5381 5382 ctx->bs = bs; 5383 5384 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5385 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5386 if (!ctx->super) { 5387 free(ctx); 5388 cb_fn(cb_arg, -ENOMEM); 5389 return; 5390 } 5391 5392 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5393 cpl.u.bs_basic.cb_fn = cb_fn; 5394 cpl.u.bs_basic.cb_arg = cb_arg; 5395 5396 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5397 if (!ctx->seq) { 5398 spdk_free(ctx->super); 5399 free(ctx); 5400 cb_fn(cb_arg, -ENOMEM); 5401 return; 5402 } 5403 5404 /* Read super block */ 5405 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5406 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5407 bs_unload_read_super_cpl, ctx); 5408 } 5409 5410 /* END spdk_bs_unload */ 5411 5412 /* START spdk_bs_set_super */ 5413 5414 struct spdk_bs_set_super_ctx { 5415 struct spdk_blob_store *bs; 5416 struct spdk_bs_super_block *super; 5417 }; 5418 5419 static void 5420 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5421 { 5422 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5423 5424 if (bserrno != 
0) { 5425 SPDK_ERRLOG("Unable to write to super block of blobstore\n"); 5426 } 5427 5428 spdk_free(ctx->super); 5429 5430 bs_sequence_finish(seq, bserrno); 5431 5432 free(ctx); 5433 } 5434 5435 static void 5436 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5437 { 5438 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5439 5440 if (bserrno != 0) { 5441 SPDK_ERRLOG("Unable to read super block of blobstore\n"); 5442 spdk_free(ctx->super); 5443 bs_sequence_finish(seq, bserrno); 5444 free(ctx); 5445 return; 5446 } 5447 5448 bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); 5449 } 5450 5451 void 5452 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, 5453 spdk_bs_op_complete cb_fn, void *cb_arg) 5454 { 5455 struct spdk_bs_cpl cpl; 5456 spdk_bs_sequence_t *seq; 5457 struct spdk_bs_set_super_ctx *ctx; 5458 5459 SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n"); 5460 5461 ctx = calloc(1, sizeof(*ctx)); 5462 if (!ctx) { 5463 cb_fn(cb_arg, -ENOMEM); 5464 return; 5465 } 5466 5467 ctx->bs = bs; 5468 5469 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5470 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5471 if (!ctx->super) { 5472 free(ctx); 5473 cb_fn(cb_arg, -ENOMEM); 5474 return; 5475 } 5476 5477 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5478 cpl.u.bs_basic.cb_fn = cb_fn; 5479 cpl.u.bs_basic.cb_arg = cb_arg; 5480 5481 seq = bs_sequence_start(bs->md_channel, &cpl); 5482 if (!seq) { 5483 spdk_free(ctx->super); 5484 free(ctx); 5485 cb_fn(cb_arg, -ENOMEM); 5486 return; 5487 } 5488 5489 bs->super_blob = blobid; 5490 5491 /* Read super block */ 5492 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 5493 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5494 bs_set_super_read_cpl, ctx); 5495 } 5496 5497 /* END spdk_bs_set_super */ 5498 5499 void 5500 spdk_bs_get_super(struct spdk_blob_store *bs, 5501 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5502 { 5503 if (bs->super_blob == SPDK_BLOBID_INVALID) { 5504 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); 5505 } else { 5506 cb_fn(cb_arg, bs->super_blob, 0); 5507 } 5508 } 5509 5510 uint64_t 5511 spdk_bs_get_cluster_size(struct spdk_blob_store *bs) 5512 { 5513 return bs->cluster_sz; 5514 } 5515 5516 uint64_t 5517 spdk_bs_get_page_size(struct spdk_blob_store *bs) 5518 { 5519 return SPDK_BS_PAGE_SIZE; 5520 } 5521 5522 uint64_t 5523 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) 5524 { 5525 return bs->io_unit_size; 5526 } 5527 5528 uint64_t 5529 spdk_bs_free_cluster_count(struct spdk_blob_store *bs) 5530 { 5531 return bs->num_free_clusters; 5532 } 5533 5534 uint64_t 5535 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) 5536 { 5537 return bs->total_data_clusters; 5538 } 5539 5540 static int 5541 bs_register_md_thread(struct spdk_blob_store *bs) 5542 { 5543 bs->md_channel = spdk_get_io_channel(bs); 5544 if (!bs->md_channel) { 5545 SPDK_ERRLOG("Failed to get IO channel.\n"); 5546 return -1; 5547 } 5548 5549 return 0; 5550 } 5551 5552 static int 5553 bs_unregister_md_thread(struct spdk_blob_store *bs) 5554 { 5555 spdk_put_io_channel(bs->md_channel); 5556 5557 return 0; 5558 } 5559 5560 spdk_blob_id 5561 spdk_blob_get_id(struct spdk_blob *blob) 5562 { 5563 assert(blob != NULL); 5564 5565 return blob->id; 5566 } 5567 5568 uint64_t 5569 spdk_blob_get_num_pages(struct spdk_blob *blob) 5570 { 5571 assert(blob != NULL); 5572 5573 return bs_cluster_to_page(blob->bs, blob->active.num_clusters); 5574 } 5575 5576 uint64_t 5577 spdk_blob_get_num_io_units(struct spdk_blob *blob) 5578 { 
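/* The io_unit count is derived from the page count: each page holds SPDK_BS_PAGE_SIZE / io_unit_size io_units (see bs_io_unit_per_page()). */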
5579 assert(blob != NULL); 5580 5581 return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); 5582 } 5583 5584 uint64_t 5585 spdk_blob_get_num_clusters(struct spdk_blob *blob) 5586 { 5587 assert(blob != NULL); 5588 5589 return blob->active.num_clusters; 5590 } 5591 5592 /* START spdk_bs_create_blob */ 5593 5594 static void 5595 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5596 { 5597 struct spdk_blob *blob = cb_arg; 5598 uint32_t page_idx = bs_blobid_to_page(blob->id); 5599 5600 if (bserrno != 0) { 5601 spdk_bit_array_clear(blob->bs->used_blobids, page_idx); 5602 bs_release_md_page(blob->bs, page_idx); 5603 } 5604 5605 blob_free(blob); 5606 5607 bs_sequence_finish(seq, bserrno); 5608 } 5609 5610 static int 5611 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, 5612 bool internal) 5613 { 5614 uint64_t i; 5615 size_t value_len = 0; 5616 int rc; 5617 const void *value = NULL; 5618 if (xattrs->count > 0 && xattrs->get_value == NULL) { 5619 return -EINVAL; 5620 } 5621 for (i = 0; i < xattrs->count; i++) { 5622 xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); 5623 if (value == NULL || value_len == 0) { 5624 return -EINVAL; 5625 } 5626 rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); 5627 if (rc < 0) { 5628 return rc; 5629 } 5630 } 5631 return 0; 5632 } 5633 5634 static void 5635 blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst) 5636 { 5637 #define FIELD_OK(field) \ 5638 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 5639 5640 #define SET_FIELD(field) \ 5641 if (FIELD_OK(field)) { \ 5642 dst->field = src->field; \ 5643 } \ 5644 5645 SET_FIELD(num_clusters); 5646 SET_FIELD(thin_provision); 5647 SET_FIELD(clear_method); 5648 5649 if (FIELD_OK(xattrs)) { 5650 memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs)); 5651 } 5652 5653 SET_FIELD(use_extent_table); 5654 5655 dst->opts_size = src->opts_size; 5656 5657 /* You should not remove this statement, but need to update the assert statement 5658 * if you add a new field, and also add a corresponding SET_FIELD statement */ 5659 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 64, "Incorrect size"); 5660 5661 #undef FIELD_OK 5662 #undef SET_FIELD 5663 } 5664 5665 static void 5666 bs_create_blob(struct spdk_blob_store *bs, 5667 const struct spdk_blob_opts *opts, 5668 const struct spdk_blob_xattr_opts *internal_xattrs, 5669 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5670 { 5671 struct spdk_blob *blob; 5672 uint32_t page_idx; 5673 struct spdk_bs_cpl cpl; 5674 struct spdk_blob_opts opts_local; 5675 struct spdk_blob_xattr_opts internal_xattrs_default; 5676 spdk_bs_sequence_t *seq; 5677 spdk_blob_id id; 5678 int rc; 5679 5680 assert(spdk_get_thread() == bs->md_thread); 5681 5682 page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); 5683 if (page_idx == UINT32_MAX) { 5684 cb_fn(cb_arg, 0, -ENOMEM); 5685 return; 5686 } 5687 spdk_bit_array_set(bs->used_blobids, page_idx); 5688 bs_claim_md_page(bs, page_idx); 5689 5690 id = bs_page_to_blobid(page_idx); 5691 5692 SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx); 5693 5694 blob = blob_alloc(bs, id); 5695 if (!blob) { 5696 spdk_bit_array_clear(bs->used_blobids, page_idx); 5697 bs_release_md_page(bs, page_idx); 5698 cb_fn(cb_arg, 0, -ENOMEM); 5699 return; 5700 } 5701 5702 spdk_blob_opts_init(&opts_local, sizeof(opts_local)); 5703 if (opts) { 5704 blob_opts_copy(opts, &opts_local); 5705 
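/* blob_opts_copy() above copies only the fields that fit within the caller-provided opts_size; the rest keep the defaults from spdk_blob_opts_init(). */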
} 5706 5707 blob->use_extent_table = opts_local.use_extent_table; 5708 if (blob->use_extent_table) { 5709 blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; 5710 } 5711 5712 if (!internal_xattrs) { 5713 blob_xattrs_init(&internal_xattrs_default); 5714 internal_xattrs = &internal_xattrs_default; 5715 } 5716 5717 rc = blob_set_xattrs(blob, &opts_local.xattrs, false); 5718 if (rc < 0) { 5719 blob_free(blob); 5720 spdk_bit_array_clear(bs->used_blobids, page_idx); 5721 bs_release_md_page(bs, page_idx); 5722 cb_fn(cb_arg, 0, rc); 5723 return; 5724 } 5725 5726 rc = blob_set_xattrs(blob, internal_xattrs, true); 5727 if (rc < 0) { 5728 blob_free(blob); 5729 spdk_bit_array_clear(bs->used_blobids, page_idx); 5730 bs_release_md_page(bs, page_idx); 5731 cb_fn(cb_arg, 0, rc); 5732 return; 5733 } 5734 5735 if (opts_local.thin_provision) { 5736 blob_set_thin_provision(blob); 5737 } 5738 5739 blob_set_clear_method(blob, opts_local.clear_method); 5740 5741 rc = blob_resize(blob, opts_local.num_clusters); 5742 if (rc < 0) { 5743 blob_free(blob); 5744 spdk_bit_array_clear(bs->used_blobids, page_idx); 5745 bs_release_md_page(bs, page_idx); 5746 cb_fn(cb_arg, 0, rc); 5747 return; 5748 } 5749 cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5750 cpl.u.blobid.cb_fn = cb_fn; 5751 cpl.u.blobid.cb_arg = cb_arg; 5752 cpl.u.blobid.blobid = blob->id; 5753 5754 seq = bs_sequence_start(bs->md_channel, &cpl); 5755 if (!seq) { 5756 blob_free(blob); 5757 spdk_bit_array_clear(bs->used_blobids, page_idx); 5758 bs_release_md_page(bs, page_idx); 5759 cb_fn(cb_arg, 0, -ENOMEM); 5760 return; 5761 } 5762 5763 blob_persist(seq, blob, bs_create_blob_cpl, blob); 5764 } 5765 5766 void 5767 spdk_bs_create_blob(struct spdk_blob_store *bs, 5768 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5769 { 5770 bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); 5771 } 5772 5773 void 5774 spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, 5775 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5776 { 5777 bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); 5778 } 5779 5780 /* END spdk_bs_create_blob */ 5781 5782 /* START blob_cleanup */ 5783 5784 struct spdk_clone_snapshot_ctx { 5785 struct spdk_bs_cpl cpl; 5786 int bserrno; 5787 bool frozen; 5788 5789 struct spdk_io_channel *channel; 5790 5791 /* Current cluster for inflate operation */ 5792 uint64_t cluster; 5793 5794 /* For inflation force allocation of all unallocated clusters and remove 5795 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ 5796 bool allocate_all; 5797 5798 struct { 5799 spdk_blob_id id; 5800 struct spdk_blob *blob; 5801 bool md_ro; 5802 } original; 5803 struct { 5804 spdk_blob_id id; 5805 struct spdk_blob *blob; 5806 } new; 5807 5808 /* xattrs specified for snapshot/clones only. They have no impact on 5809 * the original blobs xattrs. 
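* If NULL, no extra xattrs are set on the newly created snapshot or clone.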
*/ 5810 const struct spdk_blob_xattr_opts *xattrs; 5811 }; 5812 5813 static void 5814 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) 5815 { 5816 struct spdk_clone_snapshot_ctx *ctx = cb_arg; 5817 struct spdk_bs_cpl *cpl = &ctx->cpl; 5818 5819 if (bserrno != 0) { 5820 if (ctx->bserrno != 0) { 5821 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5822 } else { 5823 ctx->bserrno = bserrno; 5824 } 5825 } 5826 5827 switch (cpl->type) { 5828 case SPDK_BS_CPL_TYPE_BLOBID: 5829 cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); 5830 break; 5831 case SPDK_BS_CPL_TYPE_BLOB_BASIC: 5832 cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); 5833 break; 5834 default: 5835 SPDK_UNREACHABLE(); 5836 break; 5837 } 5838 5839 free(ctx); 5840 } 5841 5842 static void 5843 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 5844 { 5845 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5846 struct spdk_blob *origblob = ctx->original.blob; 5847 5848 if (bserrno != 0) { 5849 if (ctx->bserrno != 0) { 5850 SPDK_ERRLOG("Unfreeze error %d\n", bserrno); 5851 } else { 5852 ctx->bserrno = bserrno; 5853 } 5854 } 5855 5856 ctx->original.id = origblob->id; 5857 origblob->locked_operation_in_progress = false; 5858 5859 /* Revert md_ro to original state */ 5860 origblob->md_ro = ctx->original.md_ro; 5861 5862 spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); 5863 } 5864 5865 static void 5866 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) 5867 { 5868 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5869 struct spdk_blob *origblob = ctx->original.blob; 5870 5871 if (bserrno != 0) { 5872 if (ctx->bserrno != 0) { 5873 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5874 } else { 5875 ctx->bserrno = bserrno; 5876 } 5877 } 5878 5879 if (ctx->frozen) { 5880 /* Unfreeze any outstanding I/O */ 5881 blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); 5882 } else { 5883 bs_snapshot_unfreeze_cpl(ctx, 0); 5884 } 5885 5886 } 5887 5888 static void 5889 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno) 5890 { 5891 struct spdk_blob *newblob = ctx->new.blob; 5892 5893 if (bserrno != 0) { 5894 if (ctx->bserrno != 0) { 5895 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5896 } else { 5897 ctx->bserrno = bserrno; 5898 } 5899 } 5900 5901 ctx->new.id = newblob->id; 5902 spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 5903 } 5904 5905 /* END blob_cleanup */ 5906 5907 /* START spdk_bs_create_snapshot */ 5908 5909 static void 5910 bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2) 5911 { 5912 uint64_t *cluster_temp; 5913 uint32_t *extent_page_temp; 5914 5915 cluster_temp = blob1->active.clusters; 5916 blob1->active.clusters = blob2->active.clusters; 5917 blob2->active.clusters = cluster_temp; 5918 5919 extent_page_temp = blob1->active.extent_pages; 5920 blob1->active.extent_pages = blob2->active.extent_pages; 5921 blob2->active.extent_pages = extent_page_temp; 5922 } 5923 5924 static void 5925 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno) 5926 { 5927 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5928 struct spdk_blob *origblob = ctx->original.blob; 5929 struct spdk_blob *newblob = ctx->new.blob; 5930 5931 if (bserrno != 0) { 5932 bs_snapshot_swap_cluster_maps(newblob, origblob); 5933 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5934 return; 5935 } 5936 5937 /* Remove metadata descriptor 
SNAPSHOT_IN_PROGRESS */ 5938 bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true); 5939 if (bserrno != 0) { 5940 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5941 return; 5942 } 5943 5944 bs_blob_list_add(ctx->original.blob); 5945 5946 spdk_blob_set_read_only(newblob); 5947 5948 /* sync snapshot metadata */ 5949 spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 5950 } 5951 5952 static void 5953 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno) 5954 { 5955 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5956 struct spdk_blob *origblob = ctx->original.blob; 5957 struct spdk_blob *newblob = ctx->new.blob; 5958 5959 if (bserrno != 0) { 5960 /* return cluster map back to original */ 5961 bs_snapshot_swap_cluster_maps(newblob, origblob); 5962 5963 /* Newblob md sync failed. Valid clusters are only present in origblob. 5964 * Since I/O is frozen on origblob, no changes to the zeroed out cluster map should have occurred. 5965 * Newblob needs to be reverted to the thin_provisioned state it had at creation in order to close properly. */ 5966 blob_set_thin_provision(newblob); 5967 assert(spdk_mem_all_zero(newblob->active.clusters, 5968 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 5969 assert(spdk_mem_all_zero(newblob->active.extent_pages, 5970 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 5971 5972 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5973 return; 5974 } 5975 5976 /* Set internal xattr for snapshot id */ 5977 bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); 5978 if (bserrno != 0) { 5979 /* return cluster map back to original */ 5980 bs_snapshot_swap_cluster_maps(newblob, origblob); 5981 blob_set_thin_provision(newblob); 5982 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5983 return; 5984 } 5985 5986 /* Create new back_bs_dev for snapshot */ 5987 origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); 5988 if (origblob->back_bs_dev == NULL) { 5989 /* return cluster map back to original */ 5990 bs_snapshot_swap_cluster_maps(newblob, origblob); 5991 blob_set_thin_provision(newblob); 5992 bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); 5993 return; 5994 } 5995 5996 bs_blob_list_remove(origblob); 5997 origblob->parent_id = newblob->id; 5998 /* set clone blob as thin provisioned */ 5999 blob_set_thin_provision(origblob); 6000 6001 bs_blob_list_add(newblob); 6002 6003 /* sync clone metadata */ 6004 spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); 6005 } 6006 6007 static void 6008 bs_snapshot_freeze_cpl(void *cb_arg, int rc) 6009 { 6010 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6011 struct spdk_blob *origblob = ctx->original.blob; 6012 struct spdk_blob *newblob = ctx->new.blob; 6013 int bserrno; 6014 6015 if (rc != 0) { 6016 bs_clone_snapshot_newblob_cleanup(ctx, rc); 6017 return; 6018 } 6019 6020 ctx->frozen = true; 6021 6022 if (newblob->back_bs_dev) { 6023 newblob->back_bs_dev->destroy(newblob->back_bs_dev); 6024 } 6025 /* set new back_bs_dev for snapshot */ 6026 newblob->back_bs_dev = origblob->back_bs_dev; 6027 /* Set invalid flags from origblob */ 6028 newblob->invalid_flags = origblob->invalid_flags; 6029 6030 /* inherit parent from original blob if set */ 6031 newblob->parent_id = origblob->parent_id; 6032 if (origblob->parent_id != SPDK_BLOBID_INVALID) { 6033 /* Set internal xattr for snapshot id */ 6034 bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, 6035 &origblob->parent_id,
sizeof(spdk_blob_id), true); 6036 if (bserrno != 0) { 6037 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6038 return; 6039 } 6040 } 6041 6042 /* swap cluster maps */ 6043 bs_snapshot_swap_cluster_maps(newblob, origblob); 6044 6045 /* Set the clear method on the new blob to match the original. */ 6046 blob_set_clear_method(newblob, origblob->clear_method); 6047 6048 /* sync snapshot metadata */ 6049 spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); 6050 } 6051 6052 static void 6053 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6054 { 6055 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6056 struct spdk_blob *origblob = ctx->original.blob; 6057 struct spdk_blob *newblob = _blob; 6058 6059 if (bserrno != 0) { 6060 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6061 return; 6062 } 6063 6064 ctx->new.blob = newblob; 6065 assert(spdk_blob_is_thin_provisioned(newblob)); 6066 assert(spdk_mem_all_zero(newblob->active.clusters, 6067 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 6068 assert(spdk_mem_all_zero(newblob->active.extent_pages, 6069 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 6070 6071 blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); 6072 } 6073 6074 static void 6075 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6076 { 6077 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6078 struct spdk_blob *origblob = ctx->original.blob; 6079 6080 if (bserrno != 0) { 6081 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6082 return; 6083 } 6084 6085 ctx->new.id = blobid; 6086 ctx->cpl.u.blobid.blobid = blobid; 6087 6088 spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); 6089 } 6090 6091 6092 static void 6093 bs_xattr_snapshot(void *arg, const char *name, 6094 const void **value, size_t *value_len) 6095 { 6096 assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); 6097 6098 struct spdk_blob *blob = (struct spdk_blob *)arg; 6099 *value = &blob->id; 6100 *value_len = sizeof(blob->id); 6101 } 6102 6103 static void 6104 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6105 { 6106 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6107 struct spdk_blob_opts opts; 6108 struct spdk_blob_xattr_opts internal_xattrs; 6109 char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; 6110 6111 if (bserrno != 0) { 6112 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6113 return; 6114 } 6115 6116 ctx->original.blob = _blob; 6117 6118 if (_blob->data_ro || _blob->md_ro) { 6119 SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n", 6120 _blob->id); 6121 ctx->bserrno = -EINVAL; 6122 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6123 return; 6124 } 6125 6126 if (_blob->locked_operation_in_progress) { 6127 SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n"); 6128 ctx->bserrno = -EBUSY; 6129 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6130 return; 6131 } 6132 6133 _blob->locked_operation_in_progress = true; 6134 6135 spdk_blob_opts_init(&opts, sizeof(opts)); 6136 blob_xattrs_init(&internal_xattrs); 6137 6138 /* Change the size of new blob to the same as in original blob, 6139 * but do not allocate clusters */ 6140 opts.thin_provision = true; 6141 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6142 
opts.use_extent_table = _blob->use_extent_table; 6143 6144 /* If there are any xattrs specified for snapshot, set them now */ 6145 if (ctx->xattrs) { 6146 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6147 } 6148 /* Set internal xattr SNAPSHOT_IN_PROGRESS */ 6149 internal_xattrs.count = 1; 6150 internal_xattrs.ctx = _blob; 6151 internal_xattrs.names = xattrs_names; 6152 internal_xattrs.get_value = bs_xattr_snapshot; 6153 6154 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6155 bs_snapshot_newblob_create_cpl, ctx); 6156 } 6157 6158 void 6159 spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, 6160 const struct spdk_blob_xattr_opts *snapshot_xattrs, 6161 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6162 { 6163 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6164 6165 if (!ctx) { 6166 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6167 return; 6168 } 6169 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6170 ctx->cpl.u.blobid.cb_fn = cb_fn; 6171 ctx->cpl.u.blobid.cb_arg = cb_arg; 6172 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6173 ctx->bserrno = 0; 6174 ctx->frozen = false; 6175 ctx->original.id = blobid; 6176 ctx->xattrs = snapshot_xattrs; 6177 6178 spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); 6179 } 6180 /* END spdk_bs_create_snapshot */ 6181 6182 /* START spdk_bs_create_clone */ 6183 6184 static void 6185 bs_xattr_clone(void *arg, const char *name, 6186 const void **value, size_t *value_len) 6187 { 6188 assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); 6189 6190 struct spdk_blob *blob = (struct spdk_blob *)arg; 6191 *value = &blob->id; 6192 *value_len = sizeof(blob->id); 6193 } 6194 6195 static void 6196 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6197 { 6198 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6199 struct spdk_blob *clone = _blob; 6200 6201 ctx->new.blob = clone; 6202 bs_blob_list_add(clone); 6203 6204 spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); 6205 } 6206 6207 static void 6208 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6209 { 6210 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6211 6212 ctx->cpl.u.blobid.blobid = blobid; 6213 spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); 6214 } 6215 6216 static void 6217 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6218 { 6219 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6220 struct spdk_blob_opts opts; 6221 struct spdk_blob_xattr_opts internal_xattrs; 6222 char *xattr_names[] = { BLOB_SNAPSHOT }; 6223 6224 if (bserrno != 0) { 6225 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6226 return; 6227 } 6228 6229 ctx->original.blob = _blob; 6230 ctx->original.md_ro = _blob->md_ro; 6231 6232 if (!_blob->data_ro || !_blob->md_ro) { 6233 SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n"); 6234 ctx->bserrno = -EINVAL; 6235 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6236 return; 6237 } 6238 6239 if (_blob->locked_operation_in_progress) { 6240 SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n"); 6241 ctx->bserrno = -EBUSY; 6242 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6243 return; 6244 } 6245 6246 _blob->locked_operation_in_progress = true; 6247 6248 spdk_blob_opts_init(&opts, sizeof(opts)); 6249 
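/*
 * Usage sketch (illustrative only; bs, blobid, snapshot_id and the example_*
 * callbacks are hypothetical): snapshots and clones are created from the
 * metadata thread. Creating a snapshot turns the original blob into a
 * thin-provisioned clone of the new, read-only snapshot; a clone can in turn
 * only be created on top of a read-only blob such as that snapshot.
 *
 *   static void
 *   example_snapshot_done(void *cb_arg, spdk_blob_id snapshot_id, int bserrno)
 *   {
 *           // On success, snapshot_id identifies the new read-only snapshot.
 *   }
 *
 *   spdk_bs_create_snapshot(bs, blobid, NULL, example_snapshot_done, NULL);
 *
 *   // Later, once snapshot_id is known:
 *   spdk_bs_create_clone(bs, snapshot_id, NULL, example_clone_done, NULL);
 */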
blob_xattrs_init(&internal_xattrs); 6250 6251 opts.thin_provision = true; 6252 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6253 opts.use_extent_table = _blob->use_extent_table; 6254 if (ctx->xattrs) { 6255 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6256 } 6257 6258 /* Set internal xattr BLOB_SNAPSHOT */ 6259 internal_xattrs.count = 1; 6260 internal_xattrs.ctx = _blob; 6261 internal_xattrs.names = xattr_names; 6262 internal_xattrs.get_value = bs_xattr_clone; 6263 6264 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6265 bs_clone_newblob_create_cpl, ctx); 6266 } 6267 6268 void 6269 spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, 6270 const struct spdk_blob_xattr_opts *clone_xattrs, 6271 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6272 { 6273 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6274 6275 if (!ctx) { 6276 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6277 return; 6278 } 6279 6280 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6281 ctx->cpl.u.blobid.cb_fn = cb_fn; 6282 ctx->cpl.u.blobid.cb_arg = cb_arg; 6283 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6284 ctx->bserrno = 0; 6285 ctx->xattrs = clone_xattrs; 6286 ctx->original.id = blobid; 6287 6288 spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); 6289 } 6290 6291 /* END spdk_bs_create_clone */ 6292 6293 /* START spdk_bs_inflate_blob */ 6294 6295 static void 6296 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) 6297 { 6298 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6299 struct spdk_blob *_blob = ctx->original.blob; 6300 6301 if (bserrno != 0) { 6302 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6303 return; 6304 } 6305 6306 /* Temporarily override md_ro flag for MD modification */ 6307 _blob->md_ro = false; 6308 6309 bserrno = blob_set_xattr(_blob, BLOB_SNAPSHOT, &_parent->id, sizeof(spdk_blob_id), true); 6310 if (bserrno != 0) { 6311 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6312 return; 6313 } 6314 6315 assert(_parent != NULL); 6316 6317 bs_blob_list_remove(_blob); 6318 _blob->parent_id = _parent->id; 6319 6320 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6321 _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); 6322 bs_blob_list_add(_blob); 6323 6324 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6325 } 6326 6327 static void 6328 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx) 6329 { 6330 struct spdk_blob *_blob = ctx->original.blob; 6331 struct spdk_blob *_parent; 6332 6333 if (ctx->allocate_all) { 6334 /* remove thin provisioning */ 6335 bs_blob_list_remove(_blob); 6336 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6337 _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; 6338 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6339 _blob->back_bs_dev = NULL; 6340 _blob->parent_id = SPDK_BLOBID_INVALID; 6341 } else { 6342 _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; 6343 if (_parent->parent_id != SPDK_BLOBID_INVALID) { 6344 /* We must change the parent of the inflated blob */ 6345 spdk_bs_open_blob(_blob->bs, _parent->parent_id, 6346 bs_inflate_blob_set_parent_cpl, ctx); 6347 return; 6348 } 6349 6350 bs_blob_list_remove(_blob); 6351 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6352 _blob->parent_id = SPDK_BLOBID_INVALID; 6353 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6354 _blob->back_bs_dev = bs_create_zeroes_dev(); 6355 } 6356 6357 /* Temporarily override md_ro flag for MD 
modification */ 6358 _blob->md_ro = false; 6359 _blob->state = SPDK_BLOB_STATE_DIRTY; 6360 6361 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6362 } 6363 6364 /* Check if cluster needs allocation */ 6365 static inline bool 6366 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) 6367 { 6368 struct spdk_blob_bs_dev *b; 6369 6370 assert(blob != NULL); 6371 6372 if (blob->active.clusters[cluster] != 0) { 6373 /* Cluster is already allocated */ 6374 return false; 6375 } 6376 6377 if (blob->parent_id == SPDK_BLOBID_INVALID) { 6378 /* Blob has no parent blob */ 6379 return allocate_all; 6380 } 6381 6382 b = (struct spdk_blob_bs_dev *)blob->back_bs_dev; 6383 return (allocate_all || b->blob->active.clusters[cluster] != 0); 6384 } 6385 6386 static void 6387 bs_inflate_blob_touch_next(void *cb_arg, int bserrno) 6388 { 6389 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6390 struct spdk_blob *_blob = ctx->original.blob; 6391 struct spdk_bs_cpl cpl; 6392 spdk_bs_user_op_t *op; 6393 uint64_t offset; 6394 6395 if (bserrno != 0) { 6396 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6397 return; 6398 } 6399 6400 for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { 6401 if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { 6402 break; 6403 } 6404 } 6405 6406 if (ctx->cluster < _blob->active.num_clusters) { 6407 offset = bs_cluster_to_lba(_blob->bs, ctx->cluster); 6408 6409 /* We may safely increment the cluster number before copying */ 6410 ctx->cluster++; 6411 6412 /* Use a dummy 0B read as a context for cluster copy */ 6413 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6414 cpl.u.blob_basic.cb_fn = bs_inflate_blob_touch_next; 6415 cpl.u.blob_basic.cb_arg = ctx; 6416 6417 op = bs_user_op_alloc(ctx->channel, &cpl, SPDK_BLOB_READ, _blob, 6418 NULL, 0, offset, 0); 6419 if (!op) { 6420 bs_clone_snapshot_origblob_cleanup(ctx, -ENOMEM); 6421 return; 6422 } 6423 6424 bs_allocate_and_copy_cluster(_blob, ctx->channel, offset, op); 6425 } else { 6426 bs_inflate_blob_done(ctx); 6427 } 6428 } 6429 6430 static void 6431 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6432 { 6433 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6434 uint64_t clusters_needed; 6435 uint64_t i; 6436 6437 if (bserrno != 0) { 6438 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6439 return; 6440 } 6441 6442 ctx->original.blob = _blob; 6443 ctx->original.md_ro = _blob->md_ro; 6444 6445 if (_blob->locked_operation_in_progress) { 6446 SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n"); 6447 ctx->bserrno = -EBUSY; 6448 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6449 return; 6450 } 6451 6452 _blob->locked_operation_in_progress = true; 6453 6454 if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { 6455 /* This blob has no parent, so we cannot decouple it. */ 6456 SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); 6457 bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); 6458 return; 6459 } 6460 6461 if (spdk_blob_is_thin_provisioned(_blob) == false) { 6462 /* This is not a thin provisioned blob. No need to inflate. */ 6463 bs_clone_snapshot_origblob_cleanup(ctx, 0); 6464 return; 6465 } 6466 6467 /* Do two passes - one to verify that we can obtain enough clusters 6468 * and another to actually claim them.
6469 */ 6470 clusters_needed = 0; 6471 for (i = 0; i < _blob->active.num_clusters; i++) { 6472 if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { 6473 clusters_needed++; 6474 } 6475 } 6476 6477 if (clusters_needed > _blob->bs->num_free_clusters) { 6478 /* Not enough free clusters. Cannot satisfy the request. */ 6479 bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); 6480 return; 6481 } 6482 6483 ctx->cluster = 0; 6484 bs_inflate_blob_touch_next(ctx, 0); 6485 } 6486 6487 static void 6488 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6489 spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) 6490 { 6491 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6492 6493 if (!ctx) { 6494 cb_fn(cb_arg, -ENOMEM); 6495 return; 6496 } 6497 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6498 ctx->cpl.u.bs_basic.cb_fn = cb_fn; 6499 ctx->cpl.u.bs_basic.cb_arg = cb_arg; 6500 ctx->bserrno = 0; 6501 ctx->original.id = blobid; 6502 ctx->channel = channel; 6503 ctx->allocate_all = allocate_all; 6504 6505 spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); 6506 } 6507 6508 void 6509 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6510 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6511 { 6512 bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); 6513 } 6514 6515 void 6516 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6517 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6518 { 6519 bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); 6520 } 6521 /* END spdk_bs_inflate_blob */ 6522 6523 /* START spdk_blob_resize */ 6524 struct spdk_bs_resize_ctx { 6525 spdk_blob_op_complete cb_fn; 6526 void *cb_arg; 6527 struct spdk_blob *blob; 6528 uint64_t sz; 6529 int rc; 6530 }; 6531 6532 static void 6533 bs_resize_unfreeze_cpl(void *cb_arg, int rc) 6534 { 6535 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6536 6537 if (rc != 0) { 6538 SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); 6539 } 6540 6541 if (ctx->rc != 0) { 6542 SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); 6543 rc = ctx->rc; 6544 } 6545 6546 ctx->blob->locked_operation_in_progress = false; 6547 6548 ctx->cb_fn(ctx->cb_arg, rc); 6549 free(ctx); 6550 } 6551 6552 static void 6553 bs_resize_freeze_cpl(void *cb_arg, int rc) 6554 { 6555 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6556 6557 if (rc != 0) { 6558 ctx->blob->locked_operation_in_progress = false; 6559 ctx->cb_fn(ctx->cb_arg, rc); 6560 free(ctx); 6561 return; 6562 } 6563 6564 ctx->rc = blob_resize(ctx->blob, ctx->sz); 6565 6566 blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); 6567 } 6568 6569 void 6570 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) 6571 { 6572 struct spdk_bs_resize_ctx *ctx; 6573 6574 blob_verify_md_op(blob); 6575 6576 SPDK_DEBUGLOG(blob, "Resizing blob %" PRIu64 " to %" PRIu64 " clusters\n", blob->id, sz); 6577 6578 if (blob->md_ro) { 6579 cb_fn(cb_arg, -EPERM); 6580 return; 6581 } 6582 6583 if (sz == blob->active.num_clusters) { 6584 cb_fn(cb_arg, 0); 6585 return; 6586 } 6587 6588 if (blob->locked_operation_in_progress) { 6589 cb_fn(cb_arg, -EBUSY); 6590 return; 6591 } 6592 6593 ctx = calloc(1, sizeof(*ctx)); 6594 if (!ctx) { 6595 cb_fn(cb_arg, -ENOMEM); 6596 return; 6597 } 6598 6599 blob->locked_operation_in_progress = true; 6600 ctx->cb_fn = cb_fn; 6601 
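/*
 * Usage sketch (illustrative only; blob and example_resize_done are
 * hypothetical; example_resize_done has the spdk_blob_op_complete signature):
 * resizing is asynchronous, must run on the metadata thread, and fails with
 * -EPERM when the blob's metadata is read-only. The new size is given in
 * clusters and only becomes persistent after spdk_blob_sync_md() or close.
 *
 *   spdk_blob_resize(blob, 64, example_resize_done, NULL);
 */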
ctx->cb_arg = cb_arg; 6602 ctx->blob = blob; 6603 ctx->sz = sz; 6604 blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); 6605 } 6606 6607 /* END spdk_blob_resize */ 6608 6609 6610 /* START spdk_bs_delete_blob */ 6611 6612 static void 6613 bs_delete_close_cpl(void *cb_arg, int bserrno) 6614 { 6615 spdk_bs_sequence_t *seq = cb_arg; 6616 6617 bs_sequence_finish(seq, bserrno); 6618 } 6619 6620 static void 6621 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6622 { 6623 struct spdk_blob *blob = cb_arg; 6624 6625 if (bserrno != 0) { 6626 /* 6627 * We already removed this blob from the blobstore tailq, so 6628 * we need to free it here since this is the last reference 6629 * to it. 6630 */ 6631 blob_free(blob); 6632 bs_delete_close_cpl(seq, bserrno); 6633 return; 6634 } 6635 6636 /* 6637 * This will immediately decrement the ref_count and call 6638 * the completion routine since the metadata state is clean. 6639 * By calling spdk_blob_close, we reduce the number of call 6640 * points into code that touches the blob->open_ref count 6641 * and the blobstore's blob list. 6642 */ 6643 spdk_blob_close(blob, bs_delete_close_cpl, seq); 6644 } 6645 6646 struct delete_snapshot_ctx { 6647 struct spdk_blob_list *parent_snapshot_entry; 6648 struct spdk_blob *snapshot; 6649 struct spdk_blob_md_page *page; 6650 bool snapshot_md_ro; 6651 struct spdk_blob *clone; 6652 bool clone_md_ro; 6653 spdk_blob_op_with_handle_complete cb_fn; 6654 void *cb_arg; 6655 int bserrno; 6656 uint32_t next_extent_page; 6657 }; 6658 6659 static void 6660 delete_blob_cleanup_finish(void *cb_arg, int bserrno) 6661 { 6662 struct delete_snapshot_ctx *ctx = cb_arg; 6663 6664 if (bserrno != 0) { 6665 SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); 6666 } 6667 6668 assert(ctx != NULL); 6669 6670 if (bserrno != 0 && ctx->bserrno == 0) { 6671 ctx->bserrno = bserrno; 6672 } 6673 6674 ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); 6675 spdk_free(ctx->page); 6676 free(ctx); 6677 } 6678 6679 static void 6680 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) 6681 { 6682 struct delete_snapshot_ctx *ctx = cb_arg; 6683 6684 if (bserrno != 0) { 6685 ctx->bserrno = bserrno; 6686 SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); 6687 } 6688 6689 if (ctx->bserrno != 0) { 6690 assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); 6691 RB_INSERT(spdk_blob_tree, &ctx->snapshot->bs->open_blobs, ctx->snapshot); 6692 spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); 6693 } 6694 6695 ctx->snapshot->locked_operation_in_progress = false; 6696 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6697 6698 spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); 6699 } 6700 6701 static void 6702 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) 6703 { 6704 struct delete_snapshot_ctx *ctx = cb_arg; 6705 6706 ctx->clone->locked_operation_in_progress = false; 6707 ctx->clone->md_ro = ctx->clone_md_ro; 6708 6709 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6710 } 6711 6712 static void 6713 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 6714 { 6715 struct delete_snapshot_ctx *ctx = cb_arg; 6716 6717 if (bserrno) { 6718 ctx->bserrno = bserrno; 6719 delete_snapshot_cleanup_clone(ctx, 0); 6720 return; 6721 } 6722 6723 ctx->clone->locked_operation_in_progress = false; 6724 spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); 6725 } 6726 6727 static void 6728 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) 6729 { 6730 struct 
delete_snapshot_ctx *ctx = cb_arg; 6731 struct spdk_blob_list *parent_snapshot_entry = NULL; 6732 struct spdk_blob_list *snapshot_entry = NULL; 6733 struct spdk_blob_list *clone_entry = NULL; 6734 struct spdk_blob_list *snapshot_clone_entry = NULL; 6735 6736 if (bserrno) { 6737 SPDK_ERRLOG("Failed to sync MD on blob\n"); 6738 ctx->bserrno = bserrno; 6739 delete_snapshot_cleanup_clone(ctx, 0); 6740 return; 6741 } 6742 6743 /* Get snapshot entry for the snapshot we want to remove */ 6744 snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); 6745 6746 assert(snapshot_entry != NULL); 6747 6748 /* Remove clone entry in this snapshot (at this point there can be only one clone) */ 6749 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6750 assert(clone_entry != NULL); 6751 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 6752 snapshot_entry->clone_count--; 6753 assert(TAILQ_EMPTY(&snapshot_entry->clones)); 6754 6755 if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { 6756 /* This snapshot is at the same time a clone of another snapshot - we need to 6757 * update parent snapshot (remove current clone, add new one inherited from 6758 * the snapshot that is being removed) */ 6759 6760 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6761 * snapshot that we are removing */ 6762 blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, 6763 &snapshot_clone_entry); 6764 6765 /* Switch clone entry in parent snapshot */ 6766 TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); 6767 TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); 6768 free(snapshot_clone_entry); 6769 } else { 6770 /* No parent snapshot - just remove clone entry */ 6771 free(clone_entry); 6772 } 6773 6774 /* Restore md_ro flags */ 6775 ctx->clone->md_ro = ctx->clone_md_ro; 6776 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6777 6778 blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); 6779 } 6780 6781 static void 6782 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) 6783 { 6784 struct delete_snapshot_ctx *ctx = cb_arg; 6785 uint64_t i; 6786 6787 ctx->snapshot->md_ro = false; 6788 6789 if (bserrno) { 6790 SPDK_ERRLOG("Failed to sync MD on clone\n"); 6791 ctx->bserrno = bserrno; 6792 6793 /* Restore snapshot to previous state */ 6794 bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); 6795 if (bserrno != 0) { 6796 delete_snapshot_cleanup_clone(ctx, bserrno); 6797 return; 6798 } 6799 6800 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); 6801 return; 6802 } 6803 6804 /* Clear cluster map entries for snapshot */ 6805 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6806 if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { 6807 ctx->snapshot->active.clusters[i] = 0; 6808 } 6809 } 6810 for (i = 0; i < ctx->snapshot->active.num_extent_pages && 6811 i < ctx->clone->active.num_extent_pages; i++) { 6812 if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { 6813 ctx->snapshot->active.extent_pages[i] = 0; 6814 } 6815 } 6816 6817 blob_set_thin_provision(ctx->snapshot); 6818 ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; 6819 6820 if (ctx->parent_snapshot_entry != NULL) { 6821 ctx->snapshot->back_bs_dev = NULL; 6822 } 6823 6824 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); 6825 } 6826 6827 static void 6828 
delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx) 6829 { 6830 /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ 6831 ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); 6832 6833 /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */ 6834 if (ctx->parent_snapshot_entry != NULL) { 6835 /* ...to parent snapshot */ 6836 ctx->clone->parent_id = ctx->parent_snapshot_entry->id; 6837 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; 6838 blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, 6839 sizeof(spdk_blob_id), 6840 true); 6841 } else { 6842 /* ...to blobid invalid and zeroes dev */ 6843 ctx->clone->parent_id = SPDK_BLOBID_INVALID; 6844 ctx->clone->back_bs_dev = bs_create_zeroes_dev(); 6845 blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); 6846 } 6847 6848 spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); 6849 } 6850 6851 static void 6852 delete_snapshot_update_extent_pages(void *cb_arg, int bserrno) 6853 { 6854 struct delete_snapshot_ctx *ctx = cb_arg; 6855 uint32_t *extent_page; 6856 uint64_t i; 6857 6858 for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages && 6859 i < ctx->clone->active.num_extent_pages; i++) { 6860 if (ctx->snapshot->active.extent_pages[i] == 0) { 6861 /* No extent page to use from snapshot */ 6862 continue; 6863 } 6864 6865 extent_page = &ctx->clone->active.extent_pages[i]; 6866 if (*extent_page == 0) { 6867 /* Copy extent page from snapshot when clone did not have a matching one */ 6868 *extent_page = ctx->snapshot->active.extent_pages[i]; 6869 continue; 6870 } 6871 6872 /* Clone and snapshot both contain partially filled matching extent pages. 6873 * Update the clone extent page in place with cluster map containing the mix of both. 
*/ 6874 ctx->next_extent_page = i + 1; 6875 memset(ctx->page, 0, SPDK_BS_PAGE_SIZE); 6876 6877 blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, ctx->page, 6878 delete_snapshot_update_extent_pages, ctx); 6879 return; 6880 } 6881 delete_snapshot_update_extent_pages_cpl(ctx); 6882 } 6883 6884 static void 6885 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) 6886 { 6887 struct delete_snapshot_ctx *ctx = cb_arg; 6888 uint64_t i; 6889 6890 /* Temporarily override md_ro flag for clone for MD modification */ 6891 ctx->clone_md_ro = ctx->clone->md_ro; 6892 ctx->clone->md_ro = false; 6893 6894 if (bserrno) { 6895 SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); 6896 ctx->bserrno = bserrno; 6897 delete_snapshot_cleanup_clone(ctx, 0); 6898 return; 6899 } 6900 6901 /* Copy snapshot map to clone map (only unallocated clusters in clone) */ 6902 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6903 if (ctx->clone->active.clusters[i] == 0) { 6904 ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; 6905 } 6906 } 6907 ctx->next_extent_page = 0; 6908 delete_snapshot_update_extent_pages(ctx, 0); 6909 } 6910 6911 static void 6912 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) 6913 { 6914 struct delete_snapshot_ctx *ctx = cb_arg; 6915 6916 if (bserrno) { 6917 SPDK_ERRLOG("Failed to freeze I/O on clone\n"); 6918 ctx->bserrno = bserrno; 6919 delete_snapshot_cleanup_clone(ctx, 0); 6920 return; 6921 } 6922 6923 /* Temporarily override md_ro flag for snapshot for MD modification */ 6924 ctx->snapshot_md_ro = ctx->snapshot->md_ro; 6925 ctx->snapshot->md_ro = false; 6926 6927 /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ 6928 ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, 6929 sizeof(spdk_blob_id), true); 6930 if (ctx->bserrno != 0) { 6931 delete_snapshot_cleanup_clone(ctx, 0); 6932 return; 6933 } 6934 6935 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx); 6936 } 6937 6938 static void 6939 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) 6940 { 6941 struct delete_snapshot_ctx *ctx = cb_arg; 6942 6943 if (bserrno) { 6944 SPDK_ERRLOG("Failed to open clone\n"); 6945 ctx->bserrno = bserrno; 6946 delete_snapshot_cleanup_snapshot(ctx, 0); 6947 return; 6948 } 6949 6950 ctx->clone = clone; 6951 6952 if (clone->locked_operation_in_progress) { 6953 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n"); 6954 ctx->bserrno = -EBUSY; 6955 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6956 return; 6957 } 6958 6959 clone->locked_operation_in_progress = true; 6960 6961 blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); 6962 } 6963 6964 static void 6965 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) 6966 { 6967 struct spdk_blob_list *snapshot_entry = NULL; 6968 struct spdk_blob_list *clone_entry = NULL; 6969 struct spdk_blob_list *snapshot_clone_entry = NULL; 6970 6971 /* Get snapshot entry for the snapshot we want to remove */ 6972 snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); 6973 6974 assert(snapshot_entry != NULL); 6975 6976 /* Get clone of the snapshot (at this point there can be only one clone) */ 6977 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6978 assert(snapshot_entry->clone_count == 1); 6979 assert(clone_entry != NULL); 
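/*
 * Usage sketch (illustrative only; bs, snapshot_id and example_delete_done are
 * hypothetical): deleting a snapshot that has exactly one clone is allowed;
 * the clone is reparented to the snapshot's parent (or to a zeroes device)
 * before the snapshot itself is removed. A snapshot with more than one clone
 * cannot be deleted and the callback receives -EBUSY.
 *
 *   static void
 *   example_delete_done(void *cb_arg, int bserrno)
 *   {
 *           // bserrno == 0 means the blob and its metadata pages were released
 *   }
 *
 *   spdk_bs_delete_blob(bs, snapshot_id, example_delete_done, NULL);
 */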
6980 6981 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6982 * snapshot that we are removing */ 6983 blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, 6984 &snapshot_clone_entry); 6985 6986 spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); 6987 } 6988 6989 static void 6990 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) 6991 { 6992 spdk_bs_sequence_t *seq = cb_arg; 6993 struct spdk_blob_list *snapshot_entry = NULL; 6994 uint32_t page_num; 6995 6996 if (bserrno) { 6997 SPDK_ERRLOG("Failed to remove blob\n"); 6998 bs_sequence_finish(seq, bserrno); 6999 return; 7000 } 7001 7002 /* Remove snapshot from the list */ 7003 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7004 if (snapshot_entry != NULL) { 7005 TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); 7006 free(snapshot_entry); 7007 } 7008 7009 page_num = bs_blobid_to_page(blob->id); 7010 spdk_bit_array_clear(blob->bs->used_blobids, page_num); 7011 blob->state = SPDK_BLOB_STATE_DIRTY; 7012 blob->active.num_pages = 0; 7013 blob_resize(blob, 0); 7014 7015 blob_persist(seq, blob, bs_delete_persist_cpl, blob); 7016 } 7017 7018 static int 7019 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) 7020 { 7021 struct spdk_blob_list *snapshot_entry = NULL; 7022 struct spdk_blob_list *clone_entry = NULL; 7023 struct spdk_blob *clone = NULL; 7024 bool has_one_clone = false; 7025 7026 /* Check if this is a snapshot with clones */ 7027 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7028 if (snapshot_entry != NULL) { 7029 if (snapshot_entry->clone_count > 1) { 7030 SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); 7031 return -EBUSY; 7032 } else if (snapshot_entry->clone_count == 1) { 7033 has_one_clone = true; 7034 } 7035 } 7036 7037 /* Check if someone has this blob open (besides this delete context): 7038 * - open_ref = 1 - only this context opened blob, so it is ok to remove it 7039 * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot 7040 * and that is ok, because we will update it accordingly */ 7041 if (blob->open_ref <= 2 && has_one_clone) { 7042 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 7043 assert(clone_entry != NULL); 7044 clone = blob_lookup(blob->bs, clone_entry->id); 7045 7046 if (blob->open_ref == 2 && clone == NULL) { 7047 /* Clone is closed and someone else opened this blob */ 7048 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7049 return -EBUSY; 7050 } 7051 7052 *update_clone = true; 7053 return 0; 7054 } 7055 7056 if (blob->open_ref > 1) { 7057 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7058 return -EBUSY; 7059 } 7060 7061 assert(has_one_clone == false); 7062 *update_clone = false; 7063 return 0; 7064 } 7065 7066 static void 7067 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) 7068 { 7069 spdk_bs_sequence_t *seq = cb_arg; 7070 7071 bs_sequence_finish(seq, -ENOMEM); 7072 } 7073 7074 static void 7075 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) 7076 { 7077 spdk_bs_sequence_t *seq = cb_arg; 7078 struct delete_snapshot_ctx *ctx; 7079 bool update_clone = false; 7080 7081 if (bserrno != 0) { 7082 bs_sequence_finish(seq, bserrno); 7083 return; 7084 } 7085 7086 blob_verify_md_op(blob); 7087 7088 ctx = calloc(1, sizeof(*ctx)); 7089 if (ctx == NULL) { 7090 spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); 7091 return; 7092 } 7093 7094 ctx->snapshot = blob; 7095 ctx->cb_fn 
= bs_delete_blob_finish; 7096 ctx->cb_arg = seq; 7097 7098 /* Check if blob can be removed and if it is a snapshot with clone on top of it */ 7099 ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); 7100 if (ctx->bserrno) { 7101 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7102 return; 7103 } 7104 7105 if (blob->locked_operation_in_progress) { 7106 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n"); 7107 ctx->bserrno = -EBUSY; 7108 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7109 return; 7110 } 7111 7112 blob->locked_operation_in_progress = true; 7113 7114 /* 7115 * Remove the blob from the blob_store list now, to ensure it does not 7116 * get returned after this point by blob_lookup(). 7117 */ 7118 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7119 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7120 7121 if (update_clone) { 7122 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 7123 if (!ctx->page) { 7124 ctx->bserrno = -ENOMEM; 7125 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7126 return; 7127 } 7128 /* This blob is a snapshot with active clone - update clone first */ 7129 update_clone_on_snapshot_deletion(blob, ctx); 7130 } else { 7131 /* This blob does not have any clones - just remove it */ 7132 bs_blob_list_remove(blob); 7133 bs_delete_blob_finish(seq, blob, 0); 7134 free(ctx); 7135 } 7136 } 7137 7138 void 7139 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7140 spdk_blob_op_complete cb_fn, void *cb_arg) 7141 { 7142 struct spdk_bs_cpl cpl; 7143 spdk_bs_sequence_t *seq; 7144 7145 SPDK_DEBUGLOG(blob, "Deleting blob %" PRIu64 "\n", blobid); 7146 7147 assert(spdk_get_thread() == bs->md_thread); 7148 7149 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7150 cpl.u.blob_basic.cb_fn = cb_fn; 7151 cpl.u.blob_basic.cb_arg = cb_arg; 7152 7153 seq = bs_sequence_start(bs->md_channel, &cpl); 7154 if (!seq) { 7155 cb_fn(cb_arg, -ENOMEM); 7156 return; 7157 } 7158 7159 spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); 7160 } 7161 7162 /* END spdk_bs_delete_blob */ 7163 7164 /* START spdk_bs_open_blob */ 7165 7166 static void 7167 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7168 { 7169 struct spdk_blob *blob = cb_arg; 7170 struct spdk_blob *existing; 7171 7172 if (bserrno != 0) { 7173 blob_free(blob); 7174 seq->cpl.u.blob_handle.blob = NULL; 7175 bs_sequence_finish(seq, bserrno); 7176 return; 7177 } 7178 7179 existing = blob_lookup(blob->bs, blob->id); 7180 if (existing) { 7181 blob_free(blob); 7182 existing->open_ref++; 7183 seq->cpl.u.blob_handle.blob = existing; 7184 bs_sequence_finish(seq, 0); 7185 return; 7186 } 7187 7188 blob->open_ref++; 7189 7190 spdk_bit_array_set(blob->bs->open_blobids, blob->id); 7191 RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob); 7192 7193 bs_sequence_finish(seq, bserrno); 7194 } 7195 7196 static inline void 7197 blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst) 7198 { 7199 #define FIELD_OK(field) \ 7200 offsetof(struct spdk_blob_open_opts, field) + sizeof(src->field) <= src->opts_size 7201 7202 #define SET_FIELD(field) \ 7203 if (FIELD_OK(field)) { \ 7204 dst->field = src->field; \ 7205 } \ 7206 7207 SET_FIELD(clear_method); 7208 7209 dst->opts_size = src->opts_size; 7210 7211 /* You should not remove this statement, but need to update the assert statement 7212 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7213
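/*
 * Usage sketch (illustrative only; bs, blobid and example_open_done are
 * hypothetical; example_open_done has the spdk_blob_op_with_handle_complete
 * signature): callers initialize spdk_blob_open_opts the same way as the
 * create opts, then pass them to spdk_bs_open_blob_ext(). Only clear_method
 * is copied here today.
 *
 *   struct spdk_blob_open_opts opts;
 *
 *   spdk_blob_open_opts_init(&opts, sizeof(opts));
 *   opts.clear_method = BLOB_CLEAR_WITH_NONE;
 *   spdk_bs_open_blob_ext(bs, blobid, &opts, example_open_done, NULL);
 */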
SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 16, "Incorrect size"); 7214 7215 #undef FIELD_OK 7216 #undef SET_FIELD 7217 } 7218 7219 static void 7220 bs_open_blob(struct spdk_blob_store *bs, 7221 spdk_blob_id blobid, 7222 struct spdk_blob_open_opts *opts, 7223 spdk_blob_op_with_handle_complete cb_fn, 7224 void *cb_arg) 7225 { 7226 struct spdk_blob *blob; 7227 struct spdk_bs_cpl cpl; 7228 struct spdk_blob_open_opts opts_local; 7229 spdk_bs_sequence_t *seq; 7230 uint32_t page_num; 7231 7232 SPDK_DEBUGLOG(blob, "Opening blob %" PRIu64 "\n", blobid); 7233 assert(spdk_get_thread() == bs->md_thread); 7234 7235 page_num = bs_blobid_to_page(blobid); 7236 if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { 7237 /* Invalid blobid */ 7238 cb_fn(cb_arg, NULL, -ENOENT); 7239 return; 7240 } 7241 7242 blob = blob_lookup(bs, blobid); 7243 if (blob) { 7244 blob->open_ref++; 7245 cb_fn(cb_arg, blob, 0); 7246 return; 7247 } 7248 7249 blob = blob_alloc(bs, blobid); 7250 if (!blob) { 7251 cb_fn(cb_arg, NULL, -ENOMEM); 7252 return; 7253 } 7254 7255 spdk_blob_open_opts_init(&opts_local, sizeof(opts_local)); 7256 if (opts) { 7257 blob_open_opts_copy(opts, &opts_local); 7258 } 7259 7260 blob->clear_method = opts_local.clear_method; 7261 7262 cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; 7263 cpl.u.blob_handle.cb_fn = cb_fn; 7264 cpl.u.blob_handle.cb_arg = cb_arg; 7265 cpl.u.blob_handle.blob = blob; 7266 7267 seq = bs_sequence_start(bs->md_channel, &cpl); 7268 if (!seq) { 7269 blob_free(blob); 7270 cb_fn(cb_arg, NULL, -ENOMEM); 7271 return; 7272 } 7273 7274 blob_load(seq, blob, bs_open_blob_cpl, blob); 7275 } 7276 7277 void 7278 spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7279 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7280 { 7281 bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); 7282 } 7283 7284 void 7285 spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, 7286 struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7287 { 7288 bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); 7289 } 7290 7291 /* END spdk_bs_open_blob */ 7292 7293 /* START spdk_blob_set_read_only */ 7294 int 7295 spdk_blob_set_read_only(struct spdk_blob *blob) 7296 { 7297 blob_verify_md_op(blob); 7298 7299 blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; 7300 7301 blob->state = SPDK_BLOB_STATE_DIRTY; 7302 return 0; 7303 } 7304 /* END spdk_blob_set_read_only */ 7305 7306 /* START spdk_blob_sync_md */ 7307 7308 static void 7309 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7310 { 7311 struct spdk_blob *blob = cb_arg; 7312 7313 if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 7314 blob->data_ro = true; 7315 blob->md_ro = true; 7316 } 7317 7318 bs_sequence_finish(seq, bserrno); 7319 } 7320 7321 static void 7322 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7323 { 7324 struct spdk_bs_cpl cpl; 7325 spdk_bs_sequence_t *seq; 7326 7327 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7328 cpl.u.blob_basic.cb_fn = cb_fn; 7329 cpl.u.blob_basic.cb_arg = cb_arg; 7330 7331 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7332 if (!seq) { 7333 cb_fn(cb_arg, -ENOMEM); 7334 return; 7335 } 7336 7337 blob_persist(seq, blob, blob_sync_md_cpl, blob); 7338 } 7339 7340 void 7341 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7342 { 7343 blob_verify_md_op(blob); 7344 7345 SPDK_DEBUGLOG(blob, "Syncing blob %" PRIu64 "\n", blob->id); 7346 7347 if (blob->md_ro) { 7348 
assert(blob->state == SPDK_BLOB_STATE_CLEAN); 7349 cb_fn(cb_arg, 0); 7350 return; 7351 } 7352 7353 blob_sync_md(blob, cb_fn, cb_arg); 7354 } 7355 7356 /* END spdk_blob_sync_md */ 7357 7358 struct spdk_blob_insert_cluster_ctx { 7359 struct spdk_thread *thread; 7360 struct spdk_blob *blob; 7361 uint32_t cluster_num; /* cluster index in blob */ 7362 uint32_t cluster; /* cluster on disk */ 7363 uint32_t extent_page; /* extent page on disk */ 7364 struct spdk_blob_md_page *page; /* preallocated extent page */ 7365 int rc; 7366 spdk_blob_op_complete cb_fn; 7367 void *cb_arg; 7368 }; 7369 7370 static void 7371 blob_insert_cluster_msg_cpl(void *arg) 7372 { 7373 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7374 7375 ctx->cb_fn(ctx->cb_arg, ctx->rc); 7376 free(ctx); 7377 } 7378 7379 static void 7380 blob_insert_cluster_msg_cb(void *arg, int bserrno) 7381 { 7382 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7383 7384 ctx->rc = bserrno; 7385 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7386 } 7387 7388 static void 7389 blob_insert_new_ep_cb(void *arg, int bserrno) 7390 { 7391 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7392 uint32_t *extent_page; 7393 7394 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7395 *extent_page = ctx->extent_page; 7396 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7397 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7398 } 7399 7400 static void 7401 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7402 { 7403 bs_sequence_finish(seq, bserrno); 7404 } 7405 7406 static void 7407 blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 7408 struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg) 7409 { 7410 spdk_bs_sequence_t *seq; 7411 struct spdk_bs_cpl cpl; 7412 7413 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7414 cpl.u.blob_basic.cb_fn = cb_fn; 7415 cpl.u.blob_basic.cb_arg = cb_arg; 7416 7417 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7418 if (!seq) { 7419 cb_fn(cb_arg, -ENOMEM); 7420 return; 7421 } 7422 7423 assert(page); 7424 page->next = SPDK_INVALID_MD_PAGE; 7425 page->id = blob->id; 7426 page->sequence_num = 0; 7427 7428 blob_serialize_extent_page(blob, cluster_num, page); 7429 7430 page->crc = blob_md_page_calc_crc(page); 7431 7432 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); 7433 7434 bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), 7435 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 7436 blob_persist_extent_page_cpl, page); 7437 } 7438 7439 static void 7440 blob_insert_cluster_msg(void *arg) 7441 { 7442 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7443 uint32_t *extent_page; 7444 7445 ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); 7446 if (ctx->rc != 0) { 7447 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7448 return; 7449 } 7450 7451 if (ctx->blob->use_extent_table == false) { 7452 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */ 7453 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7454 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7455 return; 7456 } 7457 7458 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7459 if (*extent_page == 0) { 7460 /* Extent page requires allocation. 7461 * It was already claimed in the used_md_pages map and placed in ctx. 
*/ 7462 assert(ctx->extent_page != 0); 7463 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7464 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page, 7465 blob_insert_new_ep_cb, ctx); 7466 } else { 7467 /* It is possible for the original thread to have allocated an extent page for 7468 * a different cluster in the same extent page. In that case proceed with 7469 * updating the existing extent page, but release the additional one. */ 7470 if (ctx->extent_page != 0) { 7471 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7472 bs_release_md_page(ctx->blob->bs, ctx->extent_page); 7473 ctx->extent_page = 0; 7474 } 7475 /* Extent page already allocated. 7476 * Every cluster allocation requires just an update of a single extent page. */ 7477 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page, 7478 blob_insert_cluster_msg_cb, ctx); 7479 } 7480 } 7481 7482 static void 7483 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 7484 uint64_t cluster, uint32_t extent_page, struct spdk_blob_md_page *page, 7485 spdk_blob_op_complete cb_fn, void *cb_arg) 7486 { 7487 struct spdk_blob_insert_cluster_ctx *ctx; 7488 7489 ctx = calloc(1, sizeof(*ctx)); 7490 if (ctx == NULL) { 7491 cb_fn(cb_arg, -ENOMEM); 7492 return; 7493 } 7494 7495 ctx->thread = spdk_get_thread(); 7496 ctx->blob = blob; 7497 ctx->cluster_num = cluster_num; 7498 ctx->cluster = cluster; 7499 ctx->extent_page = extent_page; 7500 ctx->page = page; 7501 ctx->cb_fn = cb_fn; 7502 ctx->cb_arg = cb_arg; 7503 7504 spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); 7505 } 7506 7507 /* START spdk_blob_close */ 7508 7509 static void 7510 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7511 { 7512 struct spdk_blob *blob = cb_arg; 7513 7514 if (bserrno == 0) { 7515 blob->open_ref--; 7516 if (blob->open_ref == 0) { 7517 /* 7518 * Blobs with active.num_pages == 0 are deleted blobs. 7519 * These blobs are removed from the blob_store list 7520 * when the deletion process starts - so don't try to 7521 * remove them again.
7522 */ 7523 if (blob->active.num_pages > 0) { 7524 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7525 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7526 } 7527 blob_free(blob); 7528 } 7529 } 7530 7531 bs_sequence_finish(seq, bserrno); 7532 } 7533 7534 void 7535 spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7536 { 7537 struct spdk_bs_cpl cpl; 7538 spdk_bs_sequence_t *seq; 7539 7540 blob_verify_md_op(blob); 7541 7542 SPDK_DEBUGLOG(blob, "Closing blob %" PRIu64 "\n", blob->id); 7543 7544 if (blob->open_ref == 0) { 7545 cb_fn(cb_arg, -EBADF); 7546 return; 7547 } 7548 7549 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7550 cpl.u.blob_basic.cb_fn = cb_fn; 7551 cpl.u.blob_basic.cb_arg = cb_arg; 7552 7553 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7554 if (!seq) { 7555 cb_fn(cb_arg, -ENOMEM); 7556 return; 7557 } 7558 7559 /* Sync metadata */ 7560 blob_persist(seq, blob, blob_close_cpl, blob); 7561 } 7562 7563 /* END spdk_blob_close */ 7564 7565 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) 7566 { 7567 return spdk_get_io_channel(bs); 7568 } 7569 7570 void 7571 spdk_bs_free_io_channel(struct spdk_io_channel *channel) 7572 { 7573 spdk_put_io_channel(channel); 7574 } 7575 7576 void 7577 spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, 7578 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7579 { 7580 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7581 SPDK_BLOB_UNMAP); 7582 } 7583 7584 void 7585 spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, 7586 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7587 { 7588 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7589 SPDK_BLOB_WRITE_ZEROES); 7590 } 7591 7592 void 7593 spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, 7594 void *payload, uint64_t offset, uint64_t length, 7595 spdk_blob_op_complete cb_fn, void *cb_arg) 7596 { 7597 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7598 SPDK_BLOB_WRITE); 7599 } 7600 7601 void 7602 spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, 7603 void *payload, uint64_t offset, uint64_t length, 7604 spdk_blob_op_complete cb_fn, void *cb_arg) 7605 { 7606 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7607 SPDK_BLOB_READ); 7608 } 7609 7610 void 7611 spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel, 7612 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7613 spdk_blob_op_complete cb_fn, void *cb_arg) 7614 { 7615 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, NULL); 7616 } 7617 7618 void 7619 spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, 7620 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7621 spdk_blob_op_complete cb_fn, void *cb_arg) 7622 { 7623 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true, NULL); 7624 } 7625 7626 void 7627 spdk_blob_io_writev_ext(struct spdk_blob *blob, struct spdk_io_channel *channel, 7628 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7629 spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts) 7630 { 7631 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, 7632 io_opts); 7633 
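/*
 * Usage sketch (illustrative only; bs, blob, buf and example_write_done are
 * hypothetical; example_write_done has the spdk_blob_op_complete signature):
 * blob I/O runs on a per-thread channel and offsets/lengths are expressed in
 * io_units, not bytes.
 *
 *   struct spdk_io_channel *channel = spdk_bs_alloc_io_channel(bs);
 *   struct iovec iov = { .iov_base = buf, .iov_len = spdk_bs_get_io_unit_size(bs) };
 *
 *   // write one io_unit at offset 0 of the blob
 *   spdk_blob_io_writev(blob, channel, &iov, 1, 0, 1, example_write_done, NULL);
 *
 *   // release the channel with spdk_bs_free_io_channel(channel) when done
 */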

struct spdk_bs_iter_ctx {
	int64_t page_num;
	struct spdk_blob_store *bs;

	spdk_blob_op_with_handle_complete cb_fn;
	void *cb_arg;
};

static void
bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
{
	struct spdk_bs_iter_ctx *ctx = cb_arg;
	struct spdk_blob_store *bs = ctx->bs;
	spdk_blob_id id;

	if (bserrno == 0) {
		ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
		free(ctx);
		return;
	}

	ctx->page_num++;
	ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
	if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
		ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
		free(ctx);
		return;
	}

	id = bs_page_to_blobid(ctx->page_num);

	spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx);
}

void
spdk_bs_iter_first(struct spdk_blob_store *bs,
		   spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
{
	struct spdk_bs_iter_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	ctx->page_num = -1;
	ctx->bs = bs;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	bs_iter_cpl(ctx, NULL, -1);
}

static void
bs_iter_close_cpl(void *cb_arg, int bserrno)
{
	struct spdk_bs_iter_ctx *ctx = cb_arg;

	bs_iter_cpl(ctx, NULL, -1);
}

void
spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
		  spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
{
	struct spdk_bs_iter_ctx *ctx;

	assert(blob != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	ctx->page_num = bs_blobid_to_page(blob->id);
	ctx->bs = bs;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	/* Close the existing blob */
	spdk_blob_close(blob, bs_iter_close_cpl, ctx);
}
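
/*
 * Illustrative iteration pattern (sketch only): spdk_bs_iter_first() opens the first
 * allocated blob and invokes the callback with it; inside the callback the caller
 * processes the blob and then hands it back to spdk_bs_iter_next(), which closes it
 * and opens the following one. The walk ends when the callback receives -ENOENT.
 * my_iter_cb is a placeholder name.
 *
 *	static void
 *	my_iter_cb(void *cb_arg, struct spdk_blob *blob, int bserrno)
 *	{
 *		struct spdk_blob_store *bs = cb_arg;
 *
 *		if (bserrno == -ENOENT) {
 *			// No more blobs.
 *			return;
 *		} else if (bserrno != 0) {
 *			// Handle the error.
 *			return;
 *		}
 *
 *		// ... inspect the open blob here ...
 *		spdk_bs_iter_next(bs, blob, my_iter_cb, bs);
 *	}
 *
 *	spdk_bs_iter_first(bs, my_iter_cb, bs);
 */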

static int
blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
	       uint16_t value_len, bool internal)
{
	struct spdk_xattr_tailq *xattrs;
	struct spdk_xattr *xattr;
	size_t desc_size;
	void *tmp;

	blob_verify_md_op(blob);

	if (blob->md_ro) {
		return -EPERM;
	}

	desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
	if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
		SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fit into single page %zu\n", name,
			      desc_size, SPDK_BS_MAX_DESC_SIZE);
		return -ENOMEM;
	}

	if (internal) {
		xattrs = &blob->xattrs_internal;
		blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
	} else {
		xattrs = &blob->xattrs;
	}

	TAILQ_FOREACH(xattr, xattrs, link) {
		if (!strcmp(name, xattr->name)) {
			tmp = malloc(value_len);
			if (!tmp) {
				return -ENOMEM;
			}

			free(xattr->value);
			xattr->value_len = value_len;
			xattr->value = tmp;
			memcpy(xattr->value, value, value_len);

			blob->state = SPDK_BLOB_STATE_DIRTY;

			return 0;
		}
	}

	xattr = calloc(1, sizeof(*xattr));
	if (!xattr) {
		return -ENOMEM;
	}

	xattr->name = strdup(name);
	if (!xattr->name) {
		free(xattr);
		return -ENOMEM;
	}

	xattr->value_len = value_len;
	xattr->value = malloc(value_len);
	if (!xattr->value) {
		free(xattr->name);
		free(xattr);
		return -ENOMEM;
	}
	memcpy(xattr->value, value, value_len);
	TAILQ_INSERT_TAIL(xattrs, xattr, link);

	blob->state = SPDK_BLOB_STATE_DIRTY;

	return 0;
}

int
spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
		    uint16_t value_len)
{
	return blob_set_xattr(blob, name, value, value_len, false);
}

static int
blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal)
{
	struct spdk_xattr_tailq *xattrs;
	struct spdk_xattr *xattr;

	blob_verify_md_op(blob);

	if (blob->md_ro) {
		return -EPERM;
	}
	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;

	TAILQ_FOREACH(xattr, xattrs, link) {
		if (!strcmp(name, xattr->name)) {
			TAILQ_REMOVE(xattrs, xattr, link);
			free(xattr->value);
			free(xattr->name);
			free(xattr);

			if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) {
				blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR;
			}
			blob->state = SPDK_BLOB_STATE_DIRTY;

			return 0;
		}
	}

	return -ENOENT;
}

int
spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name)
{
	return blob_remove_xattr(blob, name, false);
}

static int
blob_get_xattr_value(struct spdk_blob *blob, const char *name,
		     const void **value, size_t *value_len, bool internal)
{
	struct spdk_xattr *xattr;
	struct spdk_xattr_tailq *xattrs;

	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;

	TAILQ_FOREACH(xattr, xattrs, link) {
		if (!strcmp(name, xattr->name)) {
			*value = xattr->value;
			*value_len = xattr->value_len;
			return 0;
		}
	}
	return -ENOENT;
}

int
spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
			  const void **value, size_t *value_len)
{
	blob_verify_md_op(blob);

	return blob_get_xattr_value(blob, name, value, value_len, false);
}
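
/*
 * Illustrative xattr usage (sketch only): xattrs may only be manipulated from the
 * blobstore metadata thread while the blob's metadata is writable, and the change
 * only reaches disk once the metadata is synced (spdk_blob_sync_md()) or the blob is
 * closed. The key string, callback, and context below are placeholders.
 *
 *	const char *uuid_str = "...";
 *	int rc = spdk_blob_set_xattr(blob, "uuid", uuid_str, strlen(uuid_str) + 1);
 *	if (rc == 0) {
 *		spdk_blob_sync_md(blob, my_sync_done, my_ctx);
 *	}
 *
 *	const void *value;
 *	size_t value_len;
 *	rc = spdk_blob_get_xattr_value(blob, "uuid", &value, &value_len);
 */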

struct spdk_xattr_names {
	uint32_t count;
	const char *names[0];
};

static int
blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names)
{
	struct spdk_xattr *xattr;
	int count = 0;

	TAILQ_FOREACH(xattr, xattrs, link) {
		count++;
	}

	*names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *));
	if (*names == NULL) {
		return -ENOMEM;
	}

	TAILQ_FOREACH(xattr, xattrs, link) {
		(*names)->names[(*names)->count++] = xattr->name;
	}

	return 0;
}

int
spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names)
{
	blob_verify_md_op(blob);

	return blob_get_xattr_names(&blob->xattrs, names);
}

uint32_t
spdk_xattr_names_get_count(struct spdk_xattr_names *names)
{
	assert(names != NULL);

	return names->count;
}

const char *
spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index)
{
	if (index >= names->count) {
		return NULL;
	}

	return names->names[index];
}

void
spdk_xattr_names_free(struct spdk_xattr_names *names)
{
	free(names);
}

struct spdk_bs_type
spdk_bs_get_bstype(struct spdk_blob_store *bs)
{
	return bs->bstype;
}

void
spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype)
{
	memcpy(&bs->bstype, &bstype, sizeof(bstype));
}

bool
spdk_blob_is_read_only(struct spdk_blob *blob)
{
	assert(blob != NULL);
	return (blob->data_ro || blob->md_ro);
}

bool
spdk_blob_is_snapshot(struct spdk_blob *blob)
{
	struct spdk_blob_list *snapshot_entry;

	assert(blob != NULL);

	snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
	if (snapshot_entry == NULL) {
		return false;
	}

	return true;
}

bool
spdk_blob_is_clone(struct spdk_blob *blob)
{
	assert(blob != NULL);

	if (blob->parent_id != SPDK_BLOBID_INVALID) {
		assert(spdk_blob_is_thin_provisioned(blob));
		return true;
	}

	return false;
}

bool
spdk_blob_is_thin_provisioned(struct spdk_blob *blob)
{
	assert(blob != NULL);
	return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV);
}
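
/*
 * Illustrative sketch (not part of the implementation): the predicates above can be
 * combined with spdk_blob_get_parent_snapshot() (defined below) to inspect a blob's
 * place in a snapshot/clone chain.
 *
 *	if (spdk_blob_is_clone(blob)) {
 *		spdk_blob_id parent = spdk_blob_get_parent_snapshot(bs, spdk_blob_get_id(blob));
 *		// For a tracked clone, parent holds the snapshot's blob ID;
 *		// otherwise SPDK_BLOBID_INVALID is returned.
 *	}
 */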

static void
blob_update_clear_method(struct spdk_blob *blob)
{
	enum blob_clear_method stored_cm;

	assert(blob != NULL);

	/* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored
	 * in metadata previously. If something other than the default was
	 * specified, ignore the stored value and use what was passed in.
	 */
	stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT);

	if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) {
		blob->clear_method = stored_cm;
	} else if (blob->clear_method != stored_cm) {
		SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n",
			     blob->clear_method, stored_cm);
	}
}

spdk_blob_id
spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id)
{
	struct spdk_blob_list *snapshot_entry = NULL;
	struct spdk_blob_list *clone_entry = NULL;

	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
		TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
			if (clone_entry->id == blob_id) {
				return snapshot_entry->id;
			}
		}
	}

	return SPDK_BLOBID_INVALID;
}

int
spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids,
		     size_t *count)
{
	struct spdk_blob_list *snapshot_entry, *clone_entry;
	size_t n;

	snapshot_entry = bs_get_snapshot_entry(bs, blobid);
	if (snapshot_entry == NULL) {
		*count = 0;
		return 0;
	}

	if (ids == NULL || *count < snapshot_entry->clone_count) {
		*count = snapshot_entry->clone_count;
		return -ENOMEM;
	}
	*count = snapshot_entry->clone_count;

	n = 0;
	TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
		ids[n++] = clone_entry->id;
	}

	return 0;
}
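
/*
 * Illustrative sketch (not part of the implementation) of the sizing handshake used
 * by spdk_blob_get_clones() above: a first call with ids == NULL (or a too-small
 * count) returns -ENOMEM and reports the required number of entries in *count, and a
 * second call with a large enough array fills it in. snapshot_id is a placeholder.
 *
 *	size_t count = 0;
 *	spdk_blob_id *ids = NULL;
 *
 *	if (spdk_blob_get_clones(bs, snapshot_id, NULL, &count) == -ENOMEM && count > 0) {
 *		ids = calloc(count, sizeof(*ids));
 *		if (ids != NULL && spdk_blob_get_clones(bs, snapshot_id, ids, &count) == 0) {
 *			// ids[0..count-1] now hold the clone blob IDs.
 *		}
 *		free(ids);
 *	}
 */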

static void
bs_load_grow_continue(struct spdk_bs_load_ctx *ctx)
{
	int rc;

	if (ctx->super->size == 0) {
		ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
	}

	if (ctx->super->io_unit_size == 0) {
		ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
	}

	/* Parse the super block */
	ctx->bs->clean = 1;
	ctx->bs->cluster_sz = ctx->super->cluster_size;
	ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
	ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE;
	if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) {
		ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster);
	}
	ctx->bs->io_unit_size = ctx->super->io_unit_size;
	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
	if (rc < 0) {
		bs_load_ctx_fail(ctx, -ENOMEM);
		return;
	}
	ctx->bs->md_start = ctx->super->md_start;
	ctx->bs->md_len = ctx->super->md_len;
	rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len);
	if (rc < 0) {
		bs_load_ctx_fail(ctx, -ENOMEM);
		return;
	}

	ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
					       ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
	ctx->bs->super_blob = ctx->super->super_blob;
	memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));

	if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) {
		SPDK_ERRLOG("Cannot grow an unclean blobstore, please load it normally to clean it.\n");
		bs_load_ctx_fail(ctx, -EIO);
		return;
	} else {
		bs_load_read_used_pages(ctx);
	}
}

static void
bs_load_grow_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		bs_load_ctx_fail(ctx, bserrno);
		return;
	}
	bs_load_grow_continue(ctx);
}

static void
bs_load_grow_used_clusters_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		bs_load_ctx_fail(ctx, bserrno);
		return;
	}

	spdk_free(ctx->mask);

	bs_sequence_write_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
			      bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
			      bs_load_grow_super_write_cpl, ctx);
}

static void
bs_load_grow_used_clusters_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;
	uint64_t lba, lba_count;
	uint64_t dev_size;
	uint64_t total_clusters;

	if (bserrno != 0) {
		bs_load_ctx_fail(ctx, bserrno);
		return;
	}

	/* The type must be correct */
	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
	assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
					     struct spdk_blob_md_page) * 8));
	dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
	total_clusters = dev_size / ctx->super->cluster_size;
	ctx->mask->length = total_clusters;

	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
	bs_sequence_write_dev(ctx->seq, ctx->mask, lba, lba_count,
			      bs_load_grow_used_clusters_write_cpl, ctx);
}

static void
bs_load_try_to_grow(struct spdk_bs_load_ctx *ctx)
{
	uint64_t dev_size, total_clusters, used_cluster_mask_len, max_used_cluster_mask;
	uint64_t lba, lba_count, mask_size;

	dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
	total_clusters = dev_size / ctx->super->cluster_size;
	used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
				spdk_divide_round_up(total_clusters, 8),
				SPDK_BS_PAGE_SIZE);
	max_used_cluster_mask = ctx->super->used_blobid_mask_start - ctx->super->used_cluster_mask_start;
	/* Not necessary to grow, or no space to grow */
	if (ctx->super->size >= dev_size || used_cluster_mask_len > max_used_cluster_mask) {
		SPDK_DEBUGLOG(blob, "No grow\n");
		bs_load_grow_continue(ctx);
		return;
	}

	SPDK_DEBUGLOG(blob, "Resize blobstore\n");

	ctx->super->size = dev_size;
	ctx->super->used_cluster_mask_len = used_cluster_mask_len;
	ctx->super->crc = blob_md_page_calc_crc(ctx->super);

	mask_size = used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
				 SPDK_MALLOC_DMA);
	if (!ctx->mask) {
		bs_load_ctx_fail(ctx, -ENOMEM);
		return;
	}
	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
	bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
			     bs_load_grow_used_clusters_read_cpl, ctx);
}
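
/*
 * Blobstore grow path, in order of execution: spdk_bs_grow() below reads the super
 * block; bs_grow_load_super_cpl() validates its version, signature, CRC, and bstype
 * and then calls bs_load_try_to_grow() above, which compares the size recorded in
 * the super block with the current size of the underlying device. If the device has
 * grown and the on-disk used_cluster mask region has room for the larger mask, the
 * mask is read, its length is extended to cover the new clusters, the mask and the
 * updated super block are written back, and the regular load path continues in
 * bs_load_grow_continue().
 */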

static void
bs_grow_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;
	uint32_t crc;
	static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];

	if (ctx->super->version > SPDK_BS_VERSION ||
	    ctx->super->version < SPDK_BS_INITIAL_VERSION) {
		bs_load_ctx_fail(ctx, -EILSEQ);
		return;
	}

	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
		   sizeof(ctx->super->signature)) != 0) {
		bs_load_ctx_fail(ctx, -EILSEQ);
		return;
	}

	crc = blob_md_page_calc_crc(ctx->super);
	if (crc != ctx->super->crc) {
		bs_load_ctx_fail(ctx, -EILSEQ);
		return;
	}

	if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
		SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n");
	} else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
		SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless of bstype\n");
	} else {
		SPDK_DEBUGLOG(blob, "Unexpected bstype\n");
		SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
		SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
		bs_load_ctx_fail(ctx, -ENXIO);
		return;
	}

	if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
		SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
			       ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
		bs_load_ctx_fail(ctx, -EILSEQ);
		return;
	}

	bs_load_try_to_grow(ctx);
}

void
spdk_bs_grow(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
{
	struct spdk_blob_store *bs;
	struct spdk_bs_cpl cpl;
	struct spdk_bs_load_ctx *ctx;
	struct spdk_bs_opts opts = {};
	int err;

	SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);

	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
		SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, -EINVAL);
		return;
	}

	spdk_bs_opts_init(&opts, sizeof(opts));
	if (o) {
		if (bs_opts_copy(o, &opts)) {
			return;
		}
	}

	if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, -EINVAL);
		return;
	}

	err = bs_alloc(dev, &opts, &bs, &ctx);
	if (err) {
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, err);
		return;
	}

	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
	cpl.u.bs_handle.cb_fn = cb_fn;
	cpl.u.bs_handle.cb_arg = cb_arg;
	cpl.u.bs_handle.bs = bs;

	ctx->seq = bs_sequence_start(bs->md_channel, &cpl);
	if (!ctx->seq) {
		spdk_free(ctx->super);
		free(ctx);
		bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	/* Read the super block */
	bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
			     bs_grow_load_super_cpl, ctx);
}

SPDK_LOG_REGISTER_COMPONENT(blob)