1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2017 Intel Corporation. 3 * All rights reserved. 4 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/blob.h" 10 #include "spdk/crc32.h" 11 #include "spdk/env.h" 12 #include "spdk/queue.h" 13 #include "spdk/thread.h" 14 #include "spdk/bit_array.h" 15 #include "spdk/bit_pool.h" 16 #include "spdk/likely.h" 17 #include "spdk/util.h" 18 #include "spdk/string.h" 19 20 #include "spdk_internal/assert.h" 21 #include "spdk/log.h" 22 23 #include "blobstore.h" 24 25 #define BLOB_CRC32C_INITIAL 0xffffffffUL 26 27 static int bs_register_md_thread(struct spdk_blob_store *bs); 28 static int bs_unregister_md_thread(struct spdk_blob_store *bs); 29 static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno); 30 static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 31 uint64_t cluster, uint32_t extent, struct spdk_blob_md_page *page, 32 spdk_blob_op_complete cb_fn, void *cb_arg); 33 34 static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 35 uint16_t value_len, bool internal); 36 static int blob_get_xattr_value(struct spdk_blob *blob, const char *name, 37 const void **value, size_t *value_len, bool internal); 38 static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal); 39 40 static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 41 struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg); 42 43 static int 44 blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2) 45 { 46 return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id); 47 } 48 49 RB_GENERATE_STATIC(spdk_blob_tree, spdk_blob, link, blob_id_cmp); 50 51 static void 52 blob_verify_md_op(struct spdk_blob *blob) 53 { 54 assert(blob != NULL); 55 assert(spdk_get_thread() == blob->bs->md_thread); 56 assert(blob->state != SPDK_BLOB_STATE_LOADING); 57 } 58 59 static struct spdk_blob_list * 60 bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid) 61 { 62 struct spdk_blob_list *snapshot_entry = NULL; 63 64 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 65 if (snapshot_entry->id == blobid) { 66 break; 67 } 68 } 69 70 return snapshot_entry; 71 } 72 73 static void 74 bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page) 75 { 76 assert(spdk_spin_held(&bs->used_lock)); 77 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 78 assert(spdk_bit_array_get(bs->used_md_pages, page) == false); 79 80 spdk_bit_array_set(bs->used_md_pages, page); 81 } 82 83 static void 84 bs_release_md_page(struct spdk_blob_store *bs, uint32_t page) 85 { 86 assert(spdk_spin_held(&bs->used_lock)); 87 assert(page < spdk_bit_array_capacity(bs->used_md_pages)); 88 assert(spdk_bit_array_get(bs->used_md_pages, page) == true); 89 90 spdk_bit_array_clear(bs->used_md_pages, page); 91 } 92 93 static uint32_t 94 bs_claim_cluster(struct spdk_blob_store *bs) 95 { 96 uint32_t cluster_num; 97 98 assert(spdk_spin_held(&bs->used_lock)); 99 100 cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters); 101 if (cluster_num == UINT32_MAX) { 102 return UINT32_MAX; 103 } 104 105 SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num); 106 bs->num_free_clusters--; 107 108 return cluster_num; 109 } 110 111 static void 112 bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) 113 { 114 assert(spdk_spin_held(&bs->used_lock)); 115 
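/* A cluster can only be released if it lies within the pool, is currently
 * allocated, and at least one cluster in the blobstore is in use; the
 * asserts below check exactly that. */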
assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters)); 116 assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true); 117 assert(bs->num_free_clusters < bs->total_clusters); 118 119 SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num); 120 121 spdk_bit_pool_free_bit(bs->used_clusters, cluster_num); 122 bs->num_free_clusters++; 123 } 124 125 static int 126 blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster) 127 { 128 uint64_t *cluster_lba = &blob->active.clusters[cluster_num]; 129 130 blob_verify_md_op(blob); 131 132 if (*cluster_lba != 0) { 133 return -EEXIST; 134 } 135 136 *cluster_lba = bs_cluster_to_lba(blob->bs, cluster); 137 return 0; 138 } 139 140 static int 141 bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, 142 uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map) 143 { 144 uint32_t *extent_page = 0; 145 146 assert(spdk_spin_held(&blob->bs->used_lock)); 147 148 *cluster = bs_claim_cluster(blob->bs); 149 if (*cluster == UINT32_MAX) { 150 /* No more free clusters. Cannot satisfy the request */ 151 return -ENOSPC; 152 } 153 154 if (blob->use_extent_table) { 155 extent_page = bs_cluster_to_extent_page(blob, cluster_num); 156 if (*extent_page == 0) { 157 /* Extent page shall never occupy md_page so start the search from 1 */ 158 if (*lowest_free_md_page == 0) { 159 *lowest_free_md_page = 1; 160 } 161 /* No extent_page is allocated for the cluster */ 162 *lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, 163 *lowest_free_md_page); 164 if (*lowest_free_md_page == UINT32_MAX) { 165 /* No more free md pages. Cannot satisfy the request */ 166 bs_release_cluster(blob->bs, *cluster); 167 return -ENOSPC; 168 } 169 bs_claim_md_page(blob->bs, *lowest_free_md_page); 170 } 171 } 172 173 SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob %" PRIu64 "\n", *cluster, blob->id); 174 175 if (update_map) { 176 blob_insert_cluster(blob, cluster_num, *cluster); 177 if (blob->use_extent_table && *extent_page == 0) { 178 *extent_page = *lowest_free_md_page; 179 } 180 } 181 182 return 0; 183 } 184 185 static void 186 blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) 187 { 188 xattrs->count = 0; 189 xattrs->names = NULL; 190 xattrs->ctx = NULL; 191 xattrs->get_value = NULL; 192 } 193 194 void 195 spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size) 196 { 197 if (!opts) { 198 SPDK_ERRLOG("opts should not be NULL\n"); 199 return; 200 } 201 202 if (!opts_size) { 203 SPDK_ERRLOG("opts_size should not be zero value\n"); 204 return; 205 } 206 207 memset(opts, 0, opts_size); 208 opts->opts_size = opts_size; 209 210 #define FIELD_OK(field) \ 211 offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size 212 213 #define SET_FIELD(field, value) \ 214 if (FIELD_OK(field)) { \ 215 opts->field = value; \ 216 } \ 217 218 SET_FIELD(num_clusters, 0); 219 SET_FIELD(thin_provision, false); 220 SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT); 221 222 if (FIELD_OK(xattrs)) { 223 blob_xattrs_init(&opts->xattrs); 224 } 225 226 SET_FIELD(use_extent_table, true); 227 228 #undef FIELD_OK 229 #undef SET_FIELD 230 } 231 232 void 233 spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size) 234 { 235 if (!opts) { 236 SPDK_ERRLOG("opts should not be NULL\n"); 237 return; 238 } 239 240 if (!opts_size) { 241 SPDK_ERRLOG("opts_size should not be zero value\n"); 242 return; 243 } 244 245 memset(opts, 0, opts_size); 246 opts->opts_size = opts_size; 247 248 
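/*
 * FIELD_OK()/SET_FIELD() only touch fields that fit within the caller-provided
 * opts_size, so applications compiled against an older (smaller) definition of
 * struct spdk_blob_open_opts still receive safe defaults.
 *
 * Typical caller usage (sketch only; 'bs', 'blobid', 'open_blob_cb' and 'cb_arg'
 * are caller-supplied names, not part of this file):
 *
 *   struct spdk_blob_open_opts opts;
 *
 *   spdk_blob_open_opts_init(&opts, sizeof(opts));
 *   opts.clear_method = BLOB_CLEAR_WITH_NONE;
 *   spdk_bs_open_blob_ext(bs, blobid, &opts, open_blob_cb, cb_arg);
 */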
#define FIELD_OK(field) \ 249 offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size 250 251 #define SET_FIELD(field, value) \ 252 if (FIELD_OK(field)) { \ 253 opts->field = value; \ 254 } \ 255 256 SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT); 257 258 #undef FIELD_OK 259 #undef SET_FIELD 260 } 261 262 static struct spdk_blob * 263 blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id) 264 { 265 struct spdk_blob *blob; 266 267 blob = calloc(1, sizeof(*blob)); 268 if (!blob) { 269 return NULL; 270 } 271 272 blob->id = id; 273 blob->bs = bs; 274 275 blob->parent_id = SPDK_BLOBID_INVALID; 276 277 blob->state = SPDK_BLOB_STATE_DIRTY; 278 blob->extent_rle_found = false; 279 blob->extent_table_found = false; 280 blob->active.num_pages = 1; 281 blob->active.pages = calloc(1, sizeof(*blob->active.pages)); 282 if (!blob->active.pages) { 283 free(blob); 284 return NULL; 285 } 286 287 blob->active.pages[0] = bs_blobid_to_page(id); 288 289 TAILQ_INIT(&blob->xattrs); 290 TAILQ_INIT(&blob->xattrs_internal); 291 TAILQ_INIT(&blob->pending_persists); 292 TAILQ_INIT(&blob->persists_to_complete); 293 294 return blob; 295 } 296 297 static void 298 xattrs_free(struct spdk_xattr_tailq *xattrs) 299 { 300 struct spdk_xattr *xattr, *xattr_tmp; 301 302 TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) { 303 TAILQ_REMOVE(xattrs, xattr, link); 304 free(xattr->name); 305 free(xattr->value); 306 free(xattr); 307 } 308 } 309 310 static void 311 blob_free(struct spdk_blob *blob) 312 { 313 assert(blob != NULL); 314 assert(TAILQ_EMPTY(&blob->pending_persists)); 315 assert(TAILQ_EMPTY(&blob->persists_to_complete)); 316 317 free(blob->active.extent_pages); 318 free(blob->clean.extent_pages); 319 free(blob->active.clusters); 320 free(blob->clean.clusters); 321 free(blob->active.pages); 322 free(blob->clean.pages); 323 324 xattrs_free(&blob->xattrs); 325 xattrs_free(&blob->xattrs_internal); 326 327 if (blob->back_bs_dev) { 328 blob->back_bs_dev->destroy(blob->back_bs_dev); 329 } 330 331 free(blob); 332 } 333 334 struct freeze_io_ctx { 335 struct spdk_bs_cpl cpl; 336 struct spdk_blob *blob; 337 }; 338 339 static void 340 blob_io_sync(struct spdk_io_channel_iter *i) 341 { 342 spdk_for_each_channel_continue(i, 0); 343 } 344 345 static void 346 blob_execute_queued_io(struct spdk_io_channel_iter *i) 347 { 348 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 349 struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch); 350 struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 351 struct spdk_bs_request_set *set; 352 struct spdk_bs_user_op_args *args; 353 spdk_bs_user_op_t *op, *tmp; 354 355 TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) { 356 set = (struct spdk_bs_request_set *)op; 357 args = &set->u.user_op; 358 359 if (args->blob == ctx->blob) { 360 TAILQ_REMOVE(&ch->queued_io, op, link); 361 bs_user_op_execute(op); 362 } 363 } 364 365 spdk_for_each_channel_continue(i, 0); 366 } 367 368 static void 369 blob_io_cpl(struct spdk_io_channel_iter *i, int status) 370 { 371 struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 372 373 ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0); 374 375 free(ctx); 376 } 377 378 static void 379 blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 380 { 381 struct freeze_io_ctx *ctx; 382 383 ctx = calloc(1, sizeof(*ctx)); 384 if (!ctx) { 385 cb_fn(cb_arg, -ENOMEM); 386 return; 387 } 388 389 ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 390 ctx->cpl.u.blob_basic.cb_fn = cb_fn; 391
ctx->cpl.u.blob_basic.cb_arg = cb_arg; 392 ctx->blob = blob; 393 394 /* Freeze I/O on blob */ 395 blob->frozen_refcnt++; 396 397 if (blob->frozen_refcnt == 1) { 398 spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl); 399 } else { 400 cb_fn(cb_arg, 0); 401 free(ctx); 402 } 403 } 404 405 static void 406 blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 407 { 408 struct freeze_io_ctx *ctx; 409 410 ctx = calloc(1, sizeof(*ctx)); 411 if (!ctx) { 412 cb_fn(cb_arg, -ENOMEM); 413 return; 414 } 415 416 ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 417 ctx->cpl.u.blob_basic.cb_fn = cb_fn; 418 ctx->cpl.u.blob_basic.cb_arg = cb_arg; 419 ctx->blob = blob; 420 421 assert(blob->frozen_refcnt > 0); 422 423 blob->frozen_refcnt--; 424 425 if (blob->frozen_refcnt == 0) { 426 spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl); 427 } else { 428 cb_fn(cb_arg, 0); 429 free(ctx); 430 } 431 } 432 433 static int 434 blob_mark_clean(struct spdk_blob *blob) 435 { 436 uint32_t *extent_pages = NULL; 437 uint64_t *clusters = NULL; 438 uint32_t *pages = NULL; 439 440 assert(blob != NULL); 441 442 if (blob->active.num_extent_pages) { 443 assert(blob->active.extent_pages); 444 extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages)); 445 if (!extent_pages) { 446 return -ENOMEM; 447 } 448 memcpy(extent_pages, blob->active.extent_pages, 449 blob->active.num_extent_pages * sizeof(*extent_pages)); 450 } 451 452 if (blob->active.num_clusters) { 453 assert(blob->active.clusters); 454 clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters)); 455 if (!clusters) { 456 free(extent_pages); 457 return -ENOMEM; 458 } 459 memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); 460 } 461 462 if (blob->active.num_pages) { 463 assert(blob->active.pages); 464 pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages)); 465 if (!pages) { 466 free(extent_pages); 467 free(clusters); 468 return -ENOMEM; 469 } 470 memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 471 } 472 473 free(blob->clean.extent_pages); 474 free(blob->clean.clusters); 475 free(blob->clean.pages); 476 477 blob->clean.num_extent_pages = blob->active.num_extent_pages; 478 blob->clean.extent_pages = blob->active.extent_pages; 479 blob->clean.num_clusters = blob->active.num_clusters; 480 blob->clean.clusters = blob->active.clusters; 481 blob->clean.num_pages = blob->active.num_pages; 482 blob->clean.pages = blob->active.pages; 483 484 blob->active.extent_pages = extent_pages; 485 blob->active.clusters = clusters; 486 blob->active.pages = pages; 487 488 /* If the metadata was dirtied again while the metadata was being written to disk, 489 * we do not want to revert the DIRTY state back to CLEAN here. 
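 * Only the load path (state LOADING) is moved to CLEAN here; a persist that
 * raced with new modifications keeps the blob DIRTY.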
490 */ 491 if (blob->state == SPDK_BLOB_STATE_LOADING) { 492 blob->state = SPDK_BLOB_STATE_CLEAN; 493 } 494 495 return 0; 496 } 497 498 static int 499 blob_deserialize_xattr(struct spdk_blob *blob, 500 struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal) 501 { 502 struct spdk_xattr *xattr; 503 504 if (desc_xattr->length != sizeof(desc_xattr->name_length) + 505 sizeof(desc_xattr->value_length) + 506 desc_xattr->name_length + desc_xattr->value_length) { 507 return -EINVAL; 508 } 509 510 xattr = calloc(1, sizeof(*xattr)); 511 if (xattr == NULL) { 512 return -ENOMEM; 513 } 514 515 xattr->name = malloc(desc_xattr->name_length + 1); 516 if (xattr->name == NULL) { 517 free(xattr); 518 return -ENOMEM; 519 } 520 521 xattr->value = malloc(desc_xattr->value_length); 522 if (xattr->value == NULL) { 523 free(xattr->name); 524 free(xattr); 525 return -ENOMEM; 526 } 527 528 memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length); 529 xattr->name[desc_xattr->name_length] = '\0'; 530 xattr->value_len = desc_xattr->value_length; 531 memcpy(xattr->value, 532 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 533 desc_xattr->value_length); 534 535 TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link); 536 537 return 0; 538 } 539 540 541 static int 542 blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) 543 { 544 struct spdk_blob_md_descriptor *desc; 545 size_t cur_desc = 0; 546 void *tmp; 547 548 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 549 while (cur_desc < sizeof(page->descriptors)) { 550 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 551 if (desc->length == 0) { 552 /* If padding and length are 0, this terminates the page */ 553 break; 554 } 555 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 556 struct spdk_blob_md_descriptor_flags *desc_flags; 557 558 desc_flags = (struct spdk_blob_md_descriptor_flags *)desc; 559 560 if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) { 561 return -EINVAL; 562 } 563 564 if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) != 565 SPDK_BLOB_INVALID_FLAGS_MASK) { 566 return -EINVAL; 567 } 568 569 if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) != 570 SPDK_BLOB_DATA_RO_FLAGS_MASK) { 571 blob->data_ro = true; 572 blob->md_ro = true; 573 } 574 575 if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) != 576 SPDK_BLOB_MD_RO_FLAGS_MASK) { 577 blob->md_ro = true; 578 } 579 580 if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 581 blob->data_ro = true; 582 blob->md_ro = true; 583 } 584 585 blob->invalid_flags = desc_flags->invalid_flags; 586 blob->data_ro_flags = desc_flags->data_ro_flags; 587 blob->md_ro_flags = desc_flags->md_ro_flags; 588 589 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 590 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 591 unsigned int i, j; 592 unsigned int cluster_count = blob->active.num_clusters; 593 594 if (blob->extent_table_found) { 595 /* Extent Table already present in the md, 596 * both descriptors should never be at the same time. 
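 * A blob's metadata uses either EXTENT_RLE descriptors or an EXTENT_TABLE
 * (with EXTENT_PAGEs), never a mix of the two.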
*/ 597 return -EINVAL; 598 } 599 blob->extent_rle_found = true; 600 601 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 602 603 if (desc_extent_rle->length == 0 || 604 (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) { 605 return -EINVAL; 606 } 607 608 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 609 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 610 if (desc_extent_rle->extents[i].cluster_idx != 0) { 611 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, 612 desc_extent_rle->extents[i].cluster_idx + j)) { 613 return -EINVAL; 614 } 615 } 616 cluster_count++; 617 } 618 } 619 620 if (cluster_count == 0) { 621 return -EINVAL; 622 } 623 tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters)); 624 if (tmp == NULL) { 625 return -ENOMEM; 626 } 627 blob->active.clusters = tmp; 628 blob->active.cluster_array_size = cluster_count; 629 630 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 631 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 632 if (desc_extent_rle->extents[i].cluster_idx != 0) { 633 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 634 desc_extent_rle->extents[i].cluster_idx + j); 635 } else if (spdk_blob_is_thin_provisioned(blob)) { 636 blob->active.clusters[blob->active.num_clusters++] = 0; 637 } else { 638 return -EINVAL; 639 } 640 } 641 } 642 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 643 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 644 uint32_t num_extent_pages = blob->active.num_extent_pages; 645 uint32_t i, j; 646 size_t extent_pages_length; 647 648 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 649 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 650 651 if (blob->extent_rle_found) { 652 /* This means that Extent RLE is present in MD, 653 * both should never be at the same time. */ 654 return -EINVAL; 655 } else if (blob->extent_table_found && 656 desc_extent_table->num_clusters != blob->remaining_clusters_in_et) { 657 /* Number of clusters in this ET does not match number 658 * from previously read EXTENT_TABLE. */ 659 return -EINVAL; 660 } 661 662 if (desc_extent_table->length == 0 || 663 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 664 return -EINVAL; 665 } 666 667 blob->extent_table_found = true; 668 669 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 670 num_extent_pages += desc_extent_table->extent_page[i].num_pages; 671 } 672 673 if (num_extent_pages > 0) { 674 tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t)); 675 if (tmp == NULL) { 676 return -ENOMEM; 677 } 678 blob->active.extent_pages = tmp; 679 } 680 blob->active.extent_pages_array_size = num_extent_pages; 681 682 blob->remaining_clusters_in_et = desc_extent_table->num_clusters; 683 684 /* Extent table entries contain md page numbers for extent pages. 685 * Zeroes represent unallocated extent pages, those are run-length-encoded. 
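 * For example, an entry with page_idx == 0 and num_pages == 3 represents three
 * consecutive unallocated extent pages, while allocated entries always carry
 * num_pages == 1.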
686 */ 687 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 688 if (desc_extent_table->extent_page[i].page_idx != 0) { 689 assert(desc_extent_table->extent_page[i].num_pages == 1); 690 blob->active.extent_pages[blob->active.num_extent_pages++] = 691 desc_extent_table->extent_page[i].page_idx; 692 } else if (spdk_blob_is_thin_provisioned(blob)) { 693 for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) { 694 blob->active.extent_pages[blob->active.num_extent_pages++] = 0; 695 } 696 } else { 697 return -EINVAL; 698 } 699 } 700 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 701 struct spdk_blob_md_descriptor_extent_page *desc_extent; 702 unsigned int i; 703 unsigned int cluster_count = 0; 704 size_t cluster_idx_length; 705 706 if (blob->extent_rle_found) { 707 /* This means that Extent RLE is present in MD, 708 * both should never be at the same time. */ 709 return -EINVAL; 710 } 711 712 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 713 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 714 715 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 716 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 717 return -EINVAL; 718 } 719 720 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 721 if (desc_extent->cluster_idx[i] != 0) { 722 if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) { 723 return -EINVAL; 724 } 725 } 726 cluster_count++; 727 } 728 729 if (cluster_count == 0) { 730 return -EINVAL; 731 } 732 733 /* When reading extent pages sequentially starting cluster idx should match 734 * current size of a blob. 735 * If changed to batch reading, this check shall be removed. */ 736 if (desc_extent->start_cluster_idx != blob->active.num_clusters) { 737 return -EINVAL; 738 } 739 740 tmp = realloc(blob->active.clusters, 741 (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters)); 742 if (tmp == NULL) { 743 return -ENOMEM; 744 } 745 blob->active.clusters = tmp; 746 blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters); 747 748 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 749 if (desc_extent->cluster_idx[i] != 0) { 750 blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs, 751 desc_extent->cluster_idx[i]); 752 } else if (spdk_blob_is_thin_provisioned(blob)) { 753 blob->active.clusters[blob->active.num_clusters++] = 0; 754 } else { 755 return -EINVAL; 756 } 757 } 758 assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters); 759 assert(blob->remaining_clusters_in_et >= cluster_count); 760 blob->remaining_clusters_in_et -= cluster_count; 761 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 762 int rc; 763 764 rc = blob_deserialize_xattr(blob, 765 (struct spdk_blob_md_descriptor_xattr *) desc, false); 766 if (rc != 0) { 767 return rc; 768 } 769 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 770 int rc; 771 772 rc = blob_deserialize_xattr(blob, 773 (struct spdk_blob_md_descriptor_xattr *) desc, true); 774 if (rc != 0) { 775 return rc; 776 } 777 } else { 778 /* Unrecognized descriptor type. Do not fail - just continue to the 779 * next descriptor. 
If this descriptor is associated with some feature 780 * defined in a newer version of blobstore, that version of blobstore 781 * should create and set an associated feature flag to specify if this 782 * blob can be loaded or not. 783 */ 784 } 785 786 /* Advance to the next descriptor */ 787 cur_desc += sizeof(*desc) + desc->length; 788 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 789 break; 790 } 791 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 792 } 793 794 return 0; 795 } 796 797 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page); 798 799 static int 800 blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob) 801 { 802 assert(blob != NULL); 803 assert(blob->state == SPDK_BLOB_STATE_LOADING); 804 805 if (bs_load_cur_extent_page_valid(extent_page) == false) { 806 return -ENOENT; 807 } 808 809 return blob_parse_page(extent_page, blob); 810 } 811 812 static int 813 blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count, 814 struct spdk_blob *blob) 815 { 816 const struct spdk_blob_md_page *page; 817 uint32_t i; 818 int rc; 819 void *tmp; 820 821 assert(page_count > 0); 822 assert(pages[0].sequence_num == 0); 823 assert(blob != NULL); 824 assert(blob->state == SPDK_BLOB_STATE_LOADING); 825 assert(blob->active.clusters == NULL); 826 827 /* The blobid provided doesn't match what's in the MD, this can 828 * happen for example if a bogus blobid is passed in through open. 829 */ 830 if (blob->id != pages[0].id) { 831 SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n", 832 blob->id, pages[0].id); 833 return -ENOENT; 834 } 835 836 tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages)); 837 if (!tmp) { 838 return -ENOMEM; 839 } 840 blob->active.pages = tmp; 841 842 blob->active.pages[0] = pages[0].id; 843 844 for (i = 1; i < page_count; i++) { 845 assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next)); 846 blob->active.pages[i] = pages[i - 1].next; 847 } 848 blob->active.num_pages = page_count; 849 850 for (i = 0; i < page_count; i++) { 851 page = &pages[i]; 852 853 assert(page->id == blob->id); 854 assert(page->sequence_num == i); 855 856 rc = blob_parse_page(page, blob); 857 if (rc != 0) { 858 return rc; 859 } 860 } 861 862 return 0; 863 } 864 865 static int 866 blob_serialize_add_page(const struct spdk_blob *blob, 867 struct spdk_blob_md_page **pages, 868 uint32_t *page_count, 869 struct spdk_blob_md_page **last_page) 870 { 871 struct spdk_blob_md_page *page, *tmp_pages; 872 873 assert(pages != NULL); 874 assert(page_count != NULL); 875 876 *last_page = NULL; 877 if (*page_count == 0) { 878 assert(*pages == NULL); 879 *pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0, 880 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 881 if (*pages == NULL) { 882 return -ENOMEM; 883 } 884 *page_count = 1; 885 } else { 886 assert(*pages != NULL); 887 tmp_pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count + 1), 0); 888 if (tmp_pages == NULL) { 889 return -ENOMEM; 890 } 891 (*page_count)++; 892 *pages = tmp_pages; 893 } 894 895 page = &(*pages)[*page_count - 1]; 896 memset(page, 0, sizeof(*page)); 897 page->id = blob->id; 898 page->sequence_num = *page_count - 1; 899 page->next = SPDK_INVALID_MD_PAGE; 900 *last_page = page; 901 902 return 0; 903 } 904 905 /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor. 906 * Update required_sz on both success and failure. 
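 * Returns -1 without writing anything when buf_sz is smaller than required_sz;
 * the caller is expected to add a fresh metadata page and retry.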
907 * 908 */ 909 static int 910 blob_serialize_xattr(const struct spdk_xattr *xattr, 911 uint8_t *buf, size_t buf_sz, 912 size_t *required_sz, bool internal) 913 { 914 struct spdk_blob_md_descriptor_xattr *desc; 915 916 *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) + 917 strlen(xattr->name) + 918 xattr->value_len; 919 920 if (buf_sz < *required_sz) { 921 return -1; 922 } 923 924 desc = (struct spdk_blob_md_descriptor_xattr *)buf; 925 926 desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR; 927 desc->length = sizeof(desc->name_length) + 928 sizeof(desc->value_length) + 929 strlen(xattr->name) + 930 xattr->value_len; 931 desc->name_length = strlen(xattr->name); 932 desc->value_length = xattr->value_len; 933 934 memcpy(desc->name, xattr->name, desc->name_length); 935 memcpy((void *)((uintptr_t)desc->name + desc->name_length), 936 xattr->value, 937 desc->value_length); 938 939 return 0; 940 } 941 942 static void 943 blob_serialize_extent_table_entry(const struct spdk_blob *blob, 944 uint64_t start_ep, uint64_t *next_ep, 945 uint8_t **buf, size_t *remaining_sz) 946 { 947 struct spdk_blob_md_descriptor_extent_table *desc; 948 size_t cur_sz; 949 uint64_t i, et_idx; 950 uint32_t extent_page, ep_len; 951 952 /* The buffer must have room for at least num_clusters entry */ 953 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters); 954 if (*remaining_sz < cur_sz) { 955 *next_ep = start_ep; 956 return; 957 } 958 959 desc = (struct spdk_blob_md_descriptor_extent_table *)*buf; 960 desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE; 961 962 desc->num_clusters = blob->active.num_clusters; 963 964 ep_len = 1; 965 et_idx = 0; 966 for (i = start_ep; i < blob->active.num_extent_pages; i++) { 967 if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) { 968 /* If we ran out of buffer space, return */ 969 break; 970 } 971 972 extent_page = blob->active.extent_pages[i]; 973 /* Verify that next extent_page is unallocated */ 974 if (extent_page == 0 && 975 (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) { 976 ep_len++; 977 continue; 978 } 979 desc->extent_page[et_idx].page_idx = extent_page; 980 desc->extent_page[et_idx].num_pages = ep_len; 981 et_idx++; 982 983 ep_len = 1; 984 cur_sz += sizeof(desc->extent_page[et_idx]); 985 } 986 *next_ep = i; 987 988 desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx; 989 *remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length; 990 *buf += sizeof(struct spdk_blob_md_descriptor) + desc->length; 991 } 992 993 static int 994 blob_serialize_extent_table(const struct spdk_blob *blob, 995 struct spdk_blob_md_page **pages, 996 struct spdk_blob_md_page *cur_page, 997 uint32_t *page_count, uint8_t **buf, 998 size_t *remaining_sz) 999 { 1000 uint64_t last_extent_page; 1001 int rc; 1002 1003 last_extent_page = 0; 1004 /* At least single extent table entry has to be always persisted. 1005 * Such case occurs with num_extent_pages == 0. 
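 * That is why the loop below uses '<=': a zero-length blob still emits one
 * (empty) extent table descriptor carrying only num_clusters.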
*/ 1006 while (last_extent_page <= blob->active.num_extent_pages) { 1007 blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf, 1008 remaining_sz); 1009 1010 if (last_extent_page == blob->active.num_extent_pages) { 1011 break; 1012 } 1013 1014 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1015 if (rc < 0) { 1016 return rc; 1017 } 1018 1019 *buf = (uint8_t *)cur_page->descriptors; 1020 *remaining_sz = sizeof(cur_page->descriptors); 1021 } 1022 1023 return 0; 1024 } 1025 1026 static void 1027 blob_serialize_extent_rle(const struct spdk_blob *blob, 1028 uint64_t start_cluster, uint64_t *next_cluster, 1029 uint8_t **buf, size_t *buf_sz) 1030 { 1031 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 1032 size_t cur_sz; 1033 uint64_t i, extent_idx; 1034 uint64_t lba, lba_per_cluster, lba_count; 1035 1036 /* The buffer must have room for at least one extent */ 1037 cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]); 1038 if (*buf_sz < cur_sz) { 1039 *next_cluster = start_cluster; 1040 return; 1041 } 1042 1043 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf; 1044 desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE; 1045 1046 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1047 1048 lba = blob->active.clusters[start_cluster]; 1049 lba_count = lba_per_cluster; 1050 extent_idx = 0; 1051 for (i = start_cluster + 1; i < blob->active.num_clusters; i++) { 1052 if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) { 1053 /* Run-length encode sequential non-zero LBA */ 1054 lba_count += lba_per_cluster; 1055 continue; 1056 } else if (lba == 0 && blob->active.clusters[i] == 0) { 1057 /* Run-length encode unallocated clusters */ 1058 lba_count += lba_per_cluster; 1059 continue; 1060 } 1061 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1062 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1063 extent_idx++; 1064 1065 cur_sz += sizeof(desc_extent_rle->extents[extent_idx]); 1066 1067 if (*buf_sz < cur_sz) { 1068 /* If we ran out of buffer space, return */ 1069 *next_cluster = i; 1070 break; 1071 } 1072 1073 lba = blob->active.clusters[i]; 1074 lba_count = lba_per_cluster; 1075 } 1076 1077 if (*buf_sz >= cur_sz) { 1078 desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster; 1079 desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster; 1080 extent_idx++; 1081 1082 *next_cluster = blob->active.num_clusters; 1083 } 1084 1085 desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx; 1086 *buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1087 *buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length; 1088 } 1089 1090 static int 1091 blob_serialize_extents_rle(const struct spdk_blob *blob, 1092 struct spdk_blob_md_page **pages, 1093 struct spdk_blob_md_page *cur_page, 1094 uint32_t *page_count, uint8_t **buf, 1095 size_t *remaining_sz) 1096 { 1097 uint64_t last_cluster; 1098 int rc; 1099 1100 last_cluster = 0; 1101 while (last_cluster < blob->active.num_clusters) { 1102 blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz); 1103 1104 if (last_cluster == blob->active.num_clusters) { 1105 break; 1106 } 1107 1108 rc = blob_serialize_add_page(blob, pages, page_count, &cur_page); 1109 if (rc < 0) { 1110 return rc; 1111 } 1112 1113 *buf = (uint8_t *)cur_page->descriptors; 1114 *remaining_sz = sizeof(cur_page->descriptors); 
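/* The next loop iteration resumes run-length encoding at 'last_cluster'
 * within the freshly added page. */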
1115 } 1116 1117 return 0; 1118 } 1119 1120 static void 1121 blob_serialize_extent_page(const struct spdk_blob *blob, 1122 uint64_t cluster, struct spdk_blob_md_page *page) 1123 { 1124 struct spdk_blob_md_descriptor_extent_page *desc_extent; 1125 uint64_t i, extent_idx; 1126 uint64_t lba, lba_per_cluster; 1127 uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP; 1128 1129 desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors; 1130 desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE; 1131 1132 lba_per_cluster = bs_cluster_to_lba(blob->bs, 1); 1133 1134 desc_extent->start_cluster_idx = start_cluster_idx; 1135 extent_idx = 0; 1136 for (i = start_cluster_idx; i < blob->active.num_clusters; i++) { 1137 lba = blob->active.clusters[i]; 1138 desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster; 1139 if (extent_idx >= SPDK_EXTENTS_PER_EP) { 1140 break; 1141 } 1142 } 1143 desc_extent->length = sizeof(desc_extent->start_cluster_idx) + 1144 sizeof(desc_extent->cluster_idx[0]) * extent_idx; 1145 } 1146 1147 static void 1148 blob_serialize_flags(const struct spdk_blob *blob, 1149 uint8_t *buf, size_t *buf_sz) 1150 { 1151 struct spdk_blob_md_descriptor_flags *desc; 1152 1153 /* 1154 * Flags get serialized first, so we should always have room for the flags 1155 * descriptor. 1156 */ 1157 assert(*buf_sz >= sizeof(*desc)); 1158 1159 desc = (struct spdk_blob_md_descriptor_flags *)buf; 1160 desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS; 1161 desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor); 1162 desc->invalid_flags = blob->invalid_flags; 1163 desc->data_ro_flags = blob->data_ro_flags; 1164 desc->md_ro_flags = blob->md_ro_flags; 1165 1166 *buf_sz -= sizeof(*desc); 1167 } 1168 1169 static int 1170 blob_serialize_xattrs(const struct spdk_blob *blob, 1171 const struct spdk_xattr_tailq *xattrs, bool internal, 1172 struct spdk_blob_md_page **pages, 1173 struct spdk_blob_md_page *cur_page, 1174 uint32_t *page_count, uint8_t **buf, 1175 size_t *remaining_sz) 1176 { 1177 const struct spdk_xattr *xattr; 1178 int rc; 1179 1180 TAILQ_FOREACH(xattr, xattrs, link) { 1181 size_t required_sz = 0; 1182 1183 rc = blob_serialize_xattr(xattr, 1184 *buf, *remaining_sz, 1185 &required_sz, internal); 1186 if (rc < 0) { 1187 /* Need to add a new page to the chain */ 1188 rc = blob_serialize_add_page(blob, pages, page_count, 1189 &cur_page); 1190 if (rc < 0) { 1191 spdk_free(*pages); 1192 *pages = NULL; 1193 *page_count = 0; 1194 return rc; 1195 } 1196 1197 *buf = (uint8_t *)cur_page->descriptors; 1198 *remaining_sz = sizeof(cur_page->descriptors); 1199 1200 /* Try again */ 1201 required_sz = 0; 1202 rc = blob_serialize_xattr(xattr, 1203 *buf, *remaining_sz, 1204 &required_sz, internal); 1205 1206 if (rc < 0) { 1207 spdk_free(*pages); 1208 *pages = NULL; 1209 *page_count = 0; 1210 return rc; 1211 } 1212 } 1213 1214 *remaining_sz -= required_sz; 1215 *buf += required_sz; 1216 } 1217 1218 return 0; 1219 } 1220 1221 static int 1222 blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages, 1223 uint32_t *page_count) 1224 { 1225 struct spdk_blob_md_page *cur_page; 1226 int rc; 1227 uint8_t *buf; 1228 size_t remaining_sz; 1229 1230 assert(pages != NULL); 1231 assert(page_count != NULL); 1232 assert(blob != NULL); 1233 assert(blob->state == SPDK_BLOB_STATE_DIRTY); 1234 1235 *pages = NULL; 1236 *page_count = 0; 1237 1238 /* A blob always has at least 1 page, even if it has no descriptors */ 1239 rc = blob_serialize_add_page(blob, 
pages, page_count, &cur_page); 1240 if (rc < 0) { 1241 return rc; 1242 } 1243 1244 buf = (uint8_t *)cur_page->descriptors; 1245 remaining_sz = sizeof(cur_page->descriptors); 1246 1247 /* Serialize flags */ 1248 blob_serialize_flags(blob, buf, &remaining_sz); 1249 buf += sizeof(struct spdk_blob_md_descriptor_flags); 1250 1251 /* Serialize xattrs */ 1252 rc = blob_serialize_xattrs(blob, &blob->xattrs, false, 1253 pages, cur_page, page_count, &buf, &remaining_sz); 1254 if (rc < 0) { 1255 return rc; 1256 } 1257 1258 /* Serialize internal xattrs */ 1259 rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true, 1260 pages, cur_page, page_count, &buf, &remaining_sz); 1261 if (rc < 0) { 1262 return rc; 1263 } 1264 1265 if (blob->use_extent_table) { 1266 /* Serialize extent table */ 1267 rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1268 } else { 1269 /* Serialize extents */ 1270 rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz); 1271 } 1272 1273 return rc; 1274 } 1275 1276 struct spdk_blob_load_ctx { 1277 struct spdk_blob *blob; 1278 1279 struct spdk_blob_md_page *pages; 1280 uint32_t num_pages; 1281 uint32_t next_extent_page; 1282 spdk_bs_sequence_t *seq; 1283 1284 spdk_bs_sequence_cpl cb_fn; 1285 void *cb_arg; 1286 }; 1287 1288 static uint32_t 1289 blob_md_page_calc_crc(void *page) 1290 { 1291 uint32_t crc; 1292 1293 crc = BLOB_CRC32C_INITIAL; 1294 crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc); 1295 crc ^= BLOB_CRC32C_INITIAL; 1296 1297 return crc; 1298 1299 } 1300 1301 static void 1302 blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno) 1303 { 1304 struct spdk_blob *blob = ctx->blob; 1305 1306 if (bserrno == 0) { 1307 blob_mark_clean(blob); 1308 } 1309 1310 ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno); 1311 1312 /* Free the memory */ 1313 spdk_free(ctx->pages); 1314 free(ctx); 1315 } 1316 1317 static void 1318 blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno) 1319 { 1320 struct spdk_blob_load_ctx *ctx = cb_arg; 1321 struct spdk_blob *blob = ctx->blob; 1322 1323 if (bserrno == 0) { 1324 blob->back_bs_dev = bs_create_blob_bs_dev(snapshot); 1325 if (blob->back_bs_dev == NULL) { 1326 bserrno = -ENOMEM; 1327 } 1328 } 1329 if (bserrno != 0) { 1330 SPDK_ERRLOG("Snapshot fail\n"); 1331 } 1332 1333 blob_load_final(ctx, bserrno); 1334 } 1335 1336 static void blob_update_clear_method(struct spdk_blob *blob); 1337 1338 static void 1339 blob_load_backing_dev(void *cb_arg) 1340 { 1341 struct spdk_blob_load_ctx *ctx = cb_arg; 1342 struct spdk_blob *blob = ctx->blob; 1343 const void *value; 1344 size_t len; 1345 int rc; 1346 1347 if (spdk_blob_is_thin_provisioned(blob)) { 1348 rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true); 1349 if (rc == 0) { 1350 if (len != sizeof(spdk_blob_id)) { 1351 blob_load_final(ctx, -EINVAL); 1352 return; 1353 } 1354 /* open snapshot blob and continue in the callback function */ 1355 blob->parent_id = *(spdk_blob_id *)value; 1356 spdk_bs_open_blob(blob->bs, blob->parent_id, 1357 blob_load_snapshot_cpl, ctx); 1358 return; 1359 } else { 1360 /* add zeroes_dev for thin provisioned blob */ 1361 blob->back_bs_dev = bs_create_zeroes_dev(); 1362 } 1363 } else { 1364 /* standard blob */ 1365 blob->back_bs_dev = NULL; 1366 } 1367 blob_load_final(ctx, 0); 1368 } 1369 1370 static void 1371 blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1372 { 1373 struct spdk_blob_load_ctx *ctx = cb_arg; 1374 struct spdk_blob 
*blob = ctx->blob; 1375 struct spdk_blob_md_page *page; 1376 uint64_t i; 1377 uint32_t crc; 1378 uint64_t lba; 1379 void *tmp; 1380 uint64_t sz; 1381 1382 if (bserrno) { 1383 SPDK_ERRLOG("Extent page read failed: %d\n", bserrno); 1384 blob_load_final(ctx, bserrno); 1385 return; 1386 } 1387 1388 if (ctx->pages == NULL) { 1389 /* First iteration of this function, allocate buffer for single EXTENT_PAGE */ 1390 ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 1391 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 1392 if (!ctx->pages) { 1393 blob_load_final(ctx, -ENOMEM); 1394 return; 1395 } 1396 ctx->num_pages = 1; 1397 ctx->next_extent_page = 0; 1398 } else { 1399 page = &ctx->pages[0]; 1400 crc = blob_md_page_calc_crc(page); 1401 if (crc != page->crc) { 1402 blob_load_final(ctx, -EINVAL); 1403 return; 1404 } 1405 1406 if (page->next != SPDK_INVALID_MD_PAGE) { 1407 blob_load_final(ctx, -EINVAL); 1408 return; 1409 } 1410 1411 bserrno = blob_parse_extent_page(page, blob); 1412 if (bserrno) { 1413 blob_load_final(ctx, bserrno); 1414 return; 1415 } 1416 } 1417 1418 for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) { 1419 if (blob->active.extent_pages[i] != 0) { 1420 /* Extent page was allocated, read and parse it. */ 1421 lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]); 1422 ctx->next_extent_page = i + 1; 1423 1424 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1425 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 1426 blob_load_cpl_extents_cpl, ctx); 1427 return; 1428 } else { 1429 /* Thin provisioned blobs can point to unallocated extent pages. 1430 * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */ 1431 1432 sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP); 1433 blob->active.num_clusters += sz; 1434 blob->remaining_clusters_in_et -= sz; 1435 1436 assert(spdk_blob_is_thin_provisioned(blob)); 1437 assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0); 1438 1439 tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters)); 1440 if (tmp == NULL) { 1441 blob_load_final(ctx, -ENOMEM); 1442 return; 1443 } 1444 memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0, 1445 sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size)); 1446 blob->active.clusters = tmp; 1447 blob->active.cluster_array_size = blob->active.num_clusters; 1448 } 1449 } 1450 1451 blob_load_backing_dev(ctx); 1452 } 1453 1454 static void 1455 blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1456 { 1457 struct spdk_blob_load_ctx *ctx = cb_arg; 1458 struct spdk_blob *blob = ctx->blob; 1459 struct spdk_blob_md_page *page; 1460 int rc; 1461 uint32_t crc; 1462 uint32_t current_page; 1463 1464 if (ctx->num_pages == 1) { 1465 current_page = bs_blobid_to_page(blob->id); 1466 } else { 1467 assert(ctx->num_pages != 0); 1468 page = &ctx->pages[ctx->num_pages - 2]; 1469 current_page = page->next; 1470 } 1471 1472 if (bserrno) { 1473 SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n", 1474 current_page, blob->id, bserrno); 1475 blob_load_final(ctx, bserrno); 1476 return; 1477 } 1478 1479 page = &ctx->pages[ctx->num_pages - 1]; 1480 crc = blob_md_page_calc_crc(page); 1481 if (crc != page->crc) { 1482 SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n", 1483 current_page, blob->id); 1484 blob_load_final(ctx, -EINVAL); 1485 return; 1486 } 1487 1488 if (page->next != 
SPDK_INVALID_MD_PAGE) { 1489 struct spdk_blob_md_page *tmp_pages; 1490 uint32_t next_page = page->next; 1491 uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page); 1492 1493 /* Read the next page */ 1494 tmp_pages = spdk_realloc(ctx->pages, (sizeof(*page) * (ctx->num_pages + 1)), 0); 1495 if (tmp_pages == NULL) { 1496 blob_load_final(ctx, -ENOMEM); 1497 return; 1498 } 1499 ctx->num_pages++; 1500 ctx->pages = tmp_pages; 1501 1502 bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1], 1503 next_lba, 1504 bs_byte_to_lba(blob->bs, sizeof(*page)), 1505 blob_load_cpl, ctx); 1506 return; 1507 } 1508 1509 /* Parse the pages */ 1510 rc = blob_parse(ctx->pages, ctx->num_pages, blob); 1511 if (rc) { 1512 blob_load_final(ctx, rc); 1513 return; 1514 } 1515 1516 if (blob->extent_table_found == true) { 1517 /* If EXTENT_TABLE was found, that means support for it should be enabled. */ 1518 assert(blob->extent_rle_found == false); 1519 blob->use_extent_table = true; 1520 } else { 1521 /* If EXTENT_RLE or no extent_* descriptor was found disable support 1522 * for extent table. No extent_* descriptors means that blob has length of 0 1523 * and no extent_rle descriptors were persisted for it. 1524 * EXTENT_TABLE if used, is always present in metadata regardless of length. */ 1525 blob->use_extent_table = false; 1526 } 1527 1528 /* Check the clear_method stored in metadata vs what may have been passed 1529 * via spdk_bs_open_blob_ext() and update accordingly. 1530 */ 1531 blob_update_clear_method(blob); 1532 1533 spdk_free(ctx->pages); 1534 ctx->pages = NULL; 1535 1536 if (blob->extent_table_found) { 1537 blob_load_cpl_extents_cpl(seq, ctx, 0); 1538 } else { 1539 blob_load_backing_dev(ctx); 1540 } 1541 } 1542 1543 /* Load a blob from disk given a blobid */ 1544 static void 1545 blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 1546 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 1547 { 1548 struct spdk_blob_load_ctx *ctx; 1549 struct spdk_blob_store *bs; 1550 uint32_t page_num; 1551 uint64_t lba; 1552 1553 blob_verify_md_op(blob); 1554 1555 bs = blob->bs; 1556 1557 ctx = calloc(1, sizeof(*ctx)); 1558 if (!ctx) { 1559 cb_fn(seq, cb_arg, -ENOMEM); 1560 return; 1561 } 1562 1563 ctx->blob = blob; 1564 ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0); 1565 if (!ctx->pages) { 1566 free(ctx); 1567 cb_fn(seq, cb_arg, -ENOMEM); 1568 return; 1569 } 1570 ctx->num_pages = 1; 1571 ctx->cb_fn = cb_fn; 1572 ctx->cb_arg = cb_arg; 1573 ctx->seq = seq; 1574 1575 page_num = bs_blobid_to_page(blob->id); 1576 lba = bs_md_page_to_lba(blob->bs, page_num); 1577 1578 blob->state = SPDK_BLOB_STATE_LOADING; 1579 1580 bs_sequence_read_dev(seq, &ctx->pages[0], lba, 1581 bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), 1582 blob_load_cpl, ctx); 1583 } 1584 1585 struct spdk_blob_persist_ctx { 1586 struct spdk_blob *blob; 1587 1588 struct spdk_bs_super_block *super; 1589 1590 struct spdk_blob_md_page *pages; 1591 uint32_t next_extent_page; 1592 struct spdk_blob_md_page *extent_page; 1593 1594 spdk_bs_sequence_t *seq; 1595 spdk_bs_sequence_cpl cb_fn; 1596 void *cb_arg; 1597 TAILQ_ENTRY(spdk_blob_persist_ctx) link; 1598 }; 1599 1600 static void 1601 bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba, 1602 uint64_t lba_count) 1603 { 1604 switch (ctx->blob->clear_method) { 1605 case BLOB_CLEAR_WITH_DEFAULT: 1606 case BLOB_CLEAR_WITH_UNMAP: 1607 bs_batch_unmap_dev(batch, lba, lba_count); 1608 break; 1609 case BLOB_CLEAR_WITH_WRITE_ZEROES: 1610 bs_batch_write_zeroes_dev(batch, lba, lba_count); 
1611 break; 1612 case BLOB_CLEAR_WITH_NONE: 1613 default: 1614 break; 1615 } 1616 } 1617 1618 static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx); 1619 1620 static void 1621 blob_persist_complete_cb(void *arg) 1622 { 1623 struct spdk_blob_persist_ctx *ctx = arg; 1624 1625 /* Call user callback */ 1626 ctx->cb_fn(ctx->seq, ctx->cb_arg, 0); 1627 1628 /* Free the memory */ 1629 spdk_free(ctx->pages); 1630 free(ctx); 1631 } 1632 1633 static void 1634 blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno) 1635 { 1636 struct spdk_blob_persist_ctx *next_persist, *tmp; 1637 struct spdk_blob *blob = ctx->blob; 1638 1639 if (bserrno == 0) { 1640 blob_mark_clean(blob); 1641 } 1642 1643 assert(ctx == TAILQ_FIRST(&blob->persists_to_complete)); 1644 1645 /* Complete all persists that were pending when the current persist started */ 1646 TAILQ_FOREACH_SAFE(next_persist, &blob->persists_to_complete, link, tmp) { 1647 TAILQ_REMOVE(&blob->persists_to_complete, next_persist, link); 1648 spdk_thread_send_msg(spdk_get_thread(), blob_persist_complete_cb, next_persist); 1649 } 1650 1651 if (TAILQ_EMPTY(&blob->pending_persists)) { 1652 return; 1653 } 1654 1655 /* Queue up all pending persists for completion and start blob persist with first one */ 1656 TAILQ_SWAP(&blob->persists_to_complete, &blob->pending_persists, spdk_blob_persist_ctx, link); 1657 next_persist = TAILQ_FIRST(&blob->persists_to_complete); 1658 1659 blob->state = SPDK_BLOB_STATE_DIRTY; 1660 blob_persist_check_dirty(next_persist); 1661 } 1662 1663 static void 1664 blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1665 { 1666 struct spdk_blob_persist_ctx *ctx = cb_arg; 1667 struct spdk_blob *blob = ctx->blob; 1668 struct spdk_blob_store *bs = blob->bs; 1669 size_t i; 1670 1671 if (bserrno != 0) { 1672 blob_persist_complete(seq, ctx, bserrno); 1673 return; 1674 } 1675 1676 spdk_spin_lock(&bs->used_lock); 1677 1678 /* Release all extent_pages that were truncated */ 1679 for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1680 /* Nothing to release if it was not allocated */ 1681 if (blob->active.extent_pages[i] != 0) { 1682 bs_release_md_page(bs, blob->active.extent_pages[i]); 1683 } 1684 } 1685 1686 spdk_spin_unlock(&bs->used_lock); 1687 1688 if (blob->active.num_extent_pages == 0) { 1689 free(blob->active.extent_pages); 1690 blob->active.extent_pages = NULL; 1691 blob->active.extent_pages_array_size = 0; 1692 } else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) { 1693 #ifndef __clang_analyzer__ 1694 void *tmp; 1695 1696 /* scan-build really can't figure reallocs, workaround it */ 1697 tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages); 1698 assert(tmp != NULL); 1699 blob->active.extent_pages = tmp; 1700 #endif 1701 blob->active.extent_pages_array_size = blob->active.num_extent_pages; 1702 } 1703 1704 blob_persist_complete(seq, ctx, bserrno); 1705 } 1706 1707 static void 1708 blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1709 { 1710 struct spdk_blob *blob = ctx->blob; 1711 struct spdk_blob_store *bs = blob->bs; 1712 size_t i; 1713 uint64_t lba; 1714 uint64_t lba_count; 1715 spdk_bs_batch_t *batch; 1716 1717 batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx); 1718 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1719 1720 /* Clear all extent_pages that were truncated */ 1721 for (i = 
blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) { 1722 /* Nothing to clear if it was not allocated */ 1723 if (blob->active.extent_pages[i] != 0) { 1724 lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]); 1725 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1726 } 1727 } 1728 1729 bs_batch_close(batch); 1730 } 1731 1732 static void 1733 blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1734 { 1735 struct spdk_blob_persist_ctx *ctx = cb_arg; 1736 struct spdk_blob *blob = ctx->blob; 1737 struct spdk_blob_store *bs = blob->bs; 1738 size_t i; 1739 1740 if (bserrno != 0) { 1741 blob_persist_complete(seq, ctx, bserrno); 1742 return; 1743 } 1744 1745 spdk_spin_lock(&bs->used_lock); 1746 /* Release all clusters that were truncated */ 1747 for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1748 uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]); 1749 1750 /* Nothing to release if it was not allocated */ 1751 if (blob->active.clusters[i] != 0) { 1752 bs_release_cluster(bs, cluster_num); 1753 } 1754 } 1755 spdk_spin_unlock(&bs->used_lock); 1756 1757 if (blob->active.num_clusters == 0) { 1758 free(blob->active.clusters); 1759 blob->active.clusters = NULL; 1760 blob->active.cluster_array_size = 0; 1761 } else if (blob->active.num_clusters != blob->active.cluster_array_size) { 1762 #ifndef __clang_analyzer__ 1763 void *tmp; 1764 1765 /* scan-build really can't figure reallocs, workaround it */ 1766 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters); 1767 assert(tmp != NULL); 1768 blob->active.clusters = tmp; 1769 1770 #endif 1771 blob->active.cluster_array_size = blob->active.num_clusters; 1772 } 1773 1774 /* Move on to clearing extent pages */ 1775 blob_persist_clear_extents(seq, ctx); 1776 } 1777 1778 static void 1779 blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1780 { 1781 struct spdk_blob *blob = ctx->blob; 1782 struct spdk_blob_store *bs = blob->bs; 1783 spdk_bs_batch_t *batch; 1784 size_t i; 1785 uint64_t lba; 1786 uint64_t lba_count; 1787 1788 /* Clusters don't move around in blobs. The list shrinks or grows 1789 * at the end, but no changes ever occur in the middle of the list. 1790 */ 1791 1792 batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx); 1793 1794 /* Clear all clusters that were truncated */ 1795 lba = 0; 1796 lba_count = 0; 1797 for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { 1798 uint64_t next_lba = blob->active.clusters[i]; 1799 uint64_t next_lba_count = bs_cluster_to_lba(bs, 1); 1800 1801 if (next_lba > 0 && (lba + lba_count) == next_lba) { 1802 /* This cluster is contiguous with the previous one. */ 1803 lba_count += next_lba_count; 1804 continue; 1805 } else if (next_lba == 0) { 1806 continue; 1807 } 1808 1809 /* This cluster is not contiguous with the previous one. 
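 * Emit the run accumulated so far (if any) and start a new one from this
 * cluster.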
*/ 1810 1811 /* If a run of LBAs previously existing, clear them now */ 1812 if (lba_count > 0) { 1813 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1814 } 1815 1816 /* Start building the next batch */ 1817 lba = next_lba; 1818 if (next_lba > 0) { 1819 lba_count = next_lba_count; 1820 } else { 1821 lba_count = 0; 1822 } 1823 } 1824 1825 /* If we ended with a contiguous set of LBAs, clear them now */ 1826 if (lba_count > 0) { 1827 bs_batch_clear_dev(ctx, batch, lba, lba_count); 1828 } 1829 1830 bs_batch_close(batch); 1831 } 1832 1833 static void 1834 blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1835 { 1836 struct spdk_blob_persist_ctx *ctx = cb_arg; 1837 struct spdk_blob *blob = ctx->blob; 1838 struct spdk_blob_store *bs = blob->bs; 1839 size_t i; 1840 1841 if (bserrno != 0) { 1842 blob_persist_complete(seq, ctx, bserrno); 1843 return; 1844 } 1845 1846 spdk_spin_lock(&bs->used_lock); 1847 1848 /* This loop starts at 1 because the first page is special and handled 1849 * below. The pages (except the first) are never written in place, 1850 * so any pages in the clean list must be zeroed. 1851 */ 1852 for (i = 1; i < blob->clean.num_pages; i++) { 1853 bs_release_md_page(bs, blob->clean.pages[i]); 1854 } 1855 1856 if (blob->active.num_pages == 0) { 1857 uint32_t page_num; 1858 1859 page_num = bs_blobid_to_page(blob->id); 1860 bs_release_md_page(bs, page_num); 1861 } 1862 1863 spdk_spin_unlock(&bs->used_lock); 1864 1865 /* Move on to clearing clusters */ 1866 blob_persist_clear_clusters(seq, ctx); 1867 } 1868 1869 static void 1870 blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1871 { 1872 struct spdk_blob_persist_ctx *ctx = cb_arg; 1873 struct spdk_blob *blob = ctx->blob; 1874 struct spdk_blob_store *bs = blob->bs; 1875 uint64_t lba; 1876 uint64_t lba_count; 1877 spdk_bs_batch_t *batch; 1878 size_t i; 1879 1880 if (bserrno != 0) { 1881 blob_persist_complete(seq, ctx, bserrno); 1882 return; 1883 } 1884 1885 batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx); 1886 1887 lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); 1888 1889 /* This loop starts at 1 because the first page is special and handled 1890 * below. The pages (except the first) are never written in place, 1891 * so any pages in the clean list must be zeroed. 1892 */ 1893 for (i = 1; i < blob->clean.num_pages; i++) { 1894 lba = bs_md_page_to_lba(bs, blob->clean.pages[i]); 1895 1896 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1897 } 1898 1899 /* The first page will only be zeroed if this is a delete. 
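 * active.num_pages == 0 only happens when the blob is being deleted, so zeroing
 * this page removes the blob's root metadata from disk.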
*/ 1900 if (blob->active.num_pages == 0) { 1901 uint32_t page_num; 1902 1903 /* The first page in the metadata goes where the blobid indicates */ 1904 page_num = bs_blobid_to_page(blob->id); 1905 lba = bs_md_page_to_lba(bs, page_num); 1906 1907 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1908 } 1909 1910 bs_batch_close(batch); 1911 } 1912 1913 static void 1914 blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1915 { 1916 struct spdk_blob_persist_ctx *ctx = cb_arg; 1917 struct spdk_blob *blob = ctx->blob; 1918 struct spdk_blob_store *bs = blob->bs; 1919 uint64_t lba; 1920 uint32_t lba_count; 1921 struct spdk_blob_md_page *page; 1922 1923 if (bserrno != 0) { 1924 blob_persist_complete(seq, ctx, bserrno); 1925 return; 1926 } 1927 1928 if (blob->active.num_pages == 0) { 1929 /* Move on to the next step */ 1930 blob_persist_zero_pages(seq, ctx, 0); 1931 return; 1932 } 1933 1934 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1935 1936 page = &ctx->pages[0]; 1937 /* The first page in the metadata goes where the blobid indicates */ 1938 lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id)); 1939 1940 bs_sequence_write_dev(seq, page, lba, lba_count, 1941 blob_persist_zero_pages, ctx); 1942 } 1943 1944 static void 1945 blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1946 { 1947 struct spdk_blob *blob = ctx->blob; 1948 struct spdk_blob_store *bs = blob->bs; 1949 uint64_t lba; 1950 uint32_t lba_count; 1951 struct spdk_blob_md_page *page; 1952 spdk_bs_batch_t *batch; 1953 size_t i; 1954 1955 /* Clusters don't move around in blobs. The list shrinks or grows 1956 * at the end, but no changes ever occur in the middle of the list. 1957 */ 1958 1959 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1960 1961 batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx); 1962 1963 /* This starts at 1. The root page is not written until 1964 * all of the others are finished 1965 */ 1966 for (i = 1; i < blob->active.num_pages; i++) { 1967 page = &ctx->pages[i]; 1968 assert(page->sequence_num == i); 1969 1970 lba = bs_md_page_to_lba(bs, blob->active.pages[i]); 1971 1972 bs_batch_write_dev(batch, page, lba, lba_count); 1973 } 1974 1975 bs_batch_close(batch); 1976 } 1977 1978 static int 1979 blob_resize(struct spdk_blob *blob, uint64_t sz) 1980 { 1981 uint64_t i; 1982 uint64_t *tmp; 1983 uint64_t cluster; 1984 uint32_t lfmd; /* lowest free md page */ 1985 uint64_t num_clusters; 1986 uint32_t *ep_tmp; 1987 uint64_t new_num_ep = 0, current_num_ep = 0; 1988 struct spdk_blob_store *bs; 1989 int rc; 1990 1991 bs = blob->bs; 1992 1993 blob_verify_md_op(blob); 1994 1995 if (blob->active.num_clusters == sz) { 1996 return 0; 1997 } 1998 1999 if (blob->active.num_clusters < blob->active.cluster_array_size) { 2000 /* If this blob was resized to be larger, then smaller, then 2001 * larger without syncing, then the cluster array already 2002 * contains spare assigned clusters we can use. 2003 */ 2004 num_clusters = spdk_min(blob->active.cluster_array_size, 2005 sz); 2006 } else { 2007 num_clusters = blob->active.num_clusters; 2008 } 2009 2010 if (blob->use_extent_table) { 2011 /* Round up since every cluster beyond current Extent Table size, 2012 * requires new extent page. 
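 * e.g. growing by a single cluster past a multiple of SPDK_EXTENTS_PER_EP
 * requires allocating one additional extent page.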
*/ 2013 new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP); 2014 current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); 2015 } 2016 2017 assert(!spdk_spin_held(&bs->used_lock)); 2018 2019 /* Check first that we have enough clusters and md pages before we start claiming them. 2020 * bs->used_lock is held to ensure that clusters we think are free are still free when we go 2021 * to claim them later in this function. 2022 */ 2023 if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) { 2024 spdk_spin_lock(&bs->used_lock); 2025 if ((sz - num_clusters) > bs->num_free_clusters) { 2026 rc = -ENOSPC; 2027 goto out; 2028 } 2029 lfmd = 0; 2030 for (i = current_num_ep; i < new_num_ep ; i++) { 2031 lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); 2032 if (lfmd == UINT32_MAX) { 2033 /* No more free md pages. Cannot satisfy the request */ 2034 rc = -ENOSPC; 2035 goto out; 2036 } 2037 } 2038 } 2039 2040 if (sz > num_clusters) { 2041 /* Expand the cluster array if necessary. 2042 * We only shrink the array when persisting. 2043 */ 2044 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); 2045 if (sz > 0 && tmp == NULL) { 2046 rc = -ENOMEM; 2047 goto out; 2048 } 2049 memset(tmp + blob->active.cluster_array_size, 0, 2050 sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); 2051 blob->active.clusters = tmp; 2052 blob->active.cluster_array_size = sz; 2053 2054 /* Expand the extents table, only if enough clusters were added */ 2055 if (new_num_ep > current_num_ep && blob->use_extent_table) { 2056 ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); 2057 if (new_num_ep > 0 && ep_tmp == NULL) { 2058 rc = -ENOMEM; 2059 goto out; 2060 } 2061 memset(ep_tmp + blob->active.extent_pages_array_size, 0, 2062 sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); 2063 blob->active.extent_pages = ep_tmp; 2064 blob->active.extent_pages_array_size = new_num_ep; 2065 } 2066 } 2067 2068 blob->state = SPDK_BLOB_STATE_DIRTY; 2069 2070 if (spdk_blob_is_thin_provisioned(blob) == false) { 2071 cluster = 0; 2072 lfmd = 0; 2073 for (i = num_clusters; i < sz; i++) { 2074 bs_allocate_cluster(blob, i, &cluster, &lfmd, true); 2075 lfmd++; 2076 } 2077 } 2078 2079 blob->active.num_clusters = sz; 2080 blob->active.num_extent_pages = new_num_ep; 2081 2082 rc = 0; 2083 out: 2084 if (spdk_spin_held(&bs->used_lock)) { 2085 spdk_spin_unlock(&bs->used_lock); 2086 } 2087 2088 return rc; 2089 } 2090 2091 static void 2092 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) 2093 { 2094 spdk_bs_sequence_t *seq = ctx->seq; 2095 struct spdk_blob *blob = ctx->blob; 2096 struct spdk_blob_store *bs = blob->bs; 2097 uint64_t i; 2098 uint32_t page_num; 2099 void *tmp; 2100 int rc; 2101 2102 /* Generate the new metadata */ 2103 rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); 2104 if (rc < 0) { 2105 blob_persist_complete(seq, ctx, rc); 2106 return; 2107 } 2108 2109 assert(blob->active.num_pages >= 1); 2110 2111 /* Resize the cache of page indices */ 2112 tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 2113 if (!tmp) { 2114 blob_persist_complete(seq, ctx, -ENOMEM); 2115 return; 2116 } 2117 blob->active.pages = tmp; 2118 2119 /* Assign this metadata to pages. This requires two passes - one to verify that there are 2120 * enough pages and a second to actually claim them. 
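Nothing is claimed during the first pass, so running out of free pages there requires no rollback.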
The used_lock is held across 2121 * both passes to ensure things don't change in the middle. 2122 */ 2123 spdk_spin_lock(&bs->used_lock); 2124 page_num = 0; 2125 /* Note that this loop starts at one. The first page location is fixed by the blobid. */ 2126 for (i = 1; i < blob->active.num_pages; i++) { 2127 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2128 if (page_num == UINT32_MAX) { 2129 spdk_spin_unlock(&bs->used_lock); 2130 blob_persist_complete(seq, ctx, -ENOMEM); 2131 return; 2132 } 2133 page_num++; 2134 } 2135 2136 page_num = 0; 2137 blob->active.pages[0] = bs_blobid_to_page(blob->id); 2138 for (i = 1; i < blob->active.num_pages; i++) { 2139 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2140 ctx->pages[i - 1].next = page_num; 2141 /* Now that previous metadata page is complete, calculate the crc for it. */ 2142 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2143 blob->active.pages[i] = page_num; 2144 bs_claim_md_page(bs, page_num); 2145 SPDK_DEBUGLOG(blob, "Claiming page %u for blob %" PRIu64 "\n", page_num, blob->id); 2146 page_num++; 2147 } 2148 spdk_spin_unlock(&bs->used_lock); 2149 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2150 /* Start writing the metadata from last page to first */ 2151 blob->state = SPDK_BLOB_STATE_CLEAN; 2152 blob_persist_write_page_chain(seq, ctx); 2153 } 2154 2155 static void 2156 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2157 { 2158 struct spdk_blob_persist_ctx *ctx = cb_arg; 2159 struct spdk_blob *blob = ctx->blob; 2160 size_t i; 2161 uint32_t extent_page_id; 2162 uint32_t page_count = 0; 2163 int rc; 2164 2165 if (ctx->extent_page != NULL) { 2166 spdk_free(ctx->extent_page); 2167 ctx->extent_page = NULL; 2168 } 2169 2170 if (bserrno != 0) { 2171 blob_persist_complete(seq, ctx, bserrno); 2172 return; 2173 } 2174 2175 /* Only write out Extent Pages when blob was resized. */ 2176 for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) { 2177 extent_page_id = blob->active.extent_pages[i]; 2178 if (extent_page_id == 0) { 2179 /* No Extent Page to persist */ 2180 assert(spdk_blob_is_thin_provisioned(blob)); 2181 continue; 2182 } 2183 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); 2184 ctx->next_extent_page = i + 1; 2185 rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); 2186 if (rc < 0) { 2187 blob_persist_complete(seq, ctx, rc); 2188 return; 2189 } 2190 2191 blob->state = SPDK_BLOB_STATE_DIRTY; 2192 blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); 2193 2194 ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); 2195 2196 bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), 2197 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 2198 blob_persist_write_extent_pages, ctx); 2199 return; 2200 } 2201 2202 blob_persist_generate_new_md(ctx); 2203 } 2204 2205 static void 2206 blob_persist_start(struct spdk_blob_persist_ctx *ctx) 2207 { 2208 spdk_bs_sequence_t *seq = ctx->seq; 2209 struct spdk_blob *blob = ctx->blob; 2210 2211 if (blob->active.num_pages == 0) { 2212 /* This is the signal that the blob should be deleted. 2213 * Immediately jump to the clean up routine. 
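* (zero the stale metadata pages on disk, then release them and the blob's clusters).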
*/ 2214 assert(blob->clean.num_pages > 0); 2215 blob->state = SPDK_BLOB_STATE_CLEAN; 2216 blob_persist_zero_pages(seq, ctx, 0); 2217 return; 2218 2219 } 2220 2221 if (blob->clean.num_clusters < blob->active.num_clusters) { 2222 /* Blob was resized up */ 2223 assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages); 2224 ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1; 2225 } else if (blob->active.num_clusters < blob->active.cluster_array_size) { 2226 /* Blob was resized down */ 2227 assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages); 2228 ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1; 2229 } else { 2230 /* No change in size occurred */ 2231 blob_persist_generate_new_md(ctx); 2232 return; 2233 } 2234 2235 blob_persist_write_extent_pages(seq, ctx, 0); 2236 } 2237 2238 static void 2239 blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2240 { 2241 struct spdk_blob_persist_ctx *ctx = cb_arg; 2242 2243 spdk_free(ctx->super); 2244 2245 if (bserrno != 0) { 2246 blob_persist_complete(seq, ctx, bserrno); 2247 return; 2248 } 2249 2250 ctx->blob->bs->clean = 0; 2251 2252 blob_persist_start(ctx); 2253 } 2254 2255 static void bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 2256 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); 2257 2258 2259 static void 2260 blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2261 { 2262 struct spdk_blob_persist_ctx *ctx = cb_arg; 2263 2264 if (bserrno != 0) { 2265 spdk_free(ctx->super); 2266 blob_persist_complete(seq, ctx, bserrno); 2267 return; 2268 } 2269 2270 ctx->super->clean = 0; 2271 if (ctx->super->size == 0) { 2272 ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; 2273 } 2274 2275 bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); 2276 } 2277 2278 static void 2279 blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) 2280 { 2281 if (ctx->blob->bs->clean) { 2282 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 2283 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2284 if (!ctx->super) { 2285 blob_persist_complete(ctx->seq, ctx, -ENOMEM); 2286 return; 2287 } 2288 2289 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), 2290 bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), 2291 blob_persist_dirty, ctx); 2292 } else { 2293 blob_persist_start(ctx); 2294 } 2295 } 2296 2297 /* Write a blob to disk */ 2298 static void 2299 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 2300 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 2301 { 2302 struct spdk_blob_persist_ctx *ctx; 2303 2304 blob_verify_md_op(blob); 2305 2306 if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->persists_to_complete)) { 2307 cb_fn(seq, cb_arg, 0); 2308 return; 2309 } 2310 2311 ctx = calloc(1, sizeof(*ctx)); 2312 if (!ctx) { 2313 cb_fn(seq, cb_arg, -ENOMEM); 2314 return; 2315 } 2316 ctx->blob = blob; 2317 ctx->seq = seq; 2318 ctx->cb_fn = cb_fn; 2319 ctx->cb_arg = cb_arg; 2320 2321 /* Multiple blob persists can affect one another, via blob->state or 2322 * blob mutable data changes. To prevent it, queue up the persists. 
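* While one persist is in flight on persists_to_complete, later requests are parked on pending_persists until it finishes.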
*/ 2323 if (!TAILQ_EMPTY(&blob->persists_to_complete)) { 2324 TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); 2325 return; 2326 } 2327 TAILQ_INSERT_HEAD(&blob->persists_to_complete, ctx, link); 2328 2329 blob_persist_check_dirty(ctx); 2330 } 2331 2332 struct spdk_blob_copy_cluster_ctx { 2333 struct spdk_blob *blob; 2334 uint8_t *buf; 2335 uint64_t page; 2336 uint64_t new_cluster; 2337 uint32_t new_extent_page; 2338 spdk_bs_sequence_t *seq; 2339 struct spdk_blob_md_page *new_cluster_page; 2340 }; 2341 2342 static void 2343 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) 2344 { 2345 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2346 struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; 2347 TAILQ_HEAD(, spdk_bs_request_set) requests; 2348 spdk_bs_user_op_t *op; 2349 2350 TAILQ_INIT(&requests); 2351 TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); 2352 2353 while (!TAILQ_EMPTY(&requests)) { 2354 op = TAILQ_FIRST(&requests); 2355 TAILQ_REMOVE(&requests, op, link); 2356 if (bserrno == 0) { 2357 bs_user_op_execute(op); 2358 } else { 2359 bs_user_op_abort(op, bserrno); 2360 } 2361 } 2362 2363 spdk_free(ctx->buf); 2364 free(ctx); 2365 } 2366 2367 static void 2368 blob_insert_cluster_cpl(void *cb_arg, int bserrno) 2369 { 2370 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2371 2372 if (bserrno) { 2373 if (bserrno == -EEXIST) { 2374 /* The metadata insert failed because another thread 2375 * allocated the cluster first. Free our cluster 2376 * but continue without error. */ 2377 bserrno = 0; 2378 } 2379 spdk_spin_lock(&ctx->blob->bs->used_lock); 2380 bs_release_cluster(ctx->blob->bs, ctx->new_cluster); 2381 if (ctx->new_extent_page != 0) { 2382 bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); 2383 } 2384 spdk_spin_unlock(&ctx->blob->bs->used_lock); 2385 } 2386 2387 bs_sequence_finish(ctx->seq, bserrno); 2388 } 2389 2390 static void 2391 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2392 { 2393 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2394 uint32_t cluster_number; 2395 2396 if (bserrno) { 2397 /* The write failed, so jump to the final completion handler */ 2398 bs_sequence_finish(seq, bserrno); 2399 return; 2400 } 2401 2402 cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); 2403 2404 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2405 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2406 } 2407 2408 static void 2409 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2410 { 2411 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2412 2413 if (bserrno != 0) { 2414 /* The read failed, so jump to the final completion handler */ 2415 bs_sequence_finish(seq, bserrno); 2416 return; 2417 } 2418 2419 /* Write whole cluster */ 2420 bs_sequence_write_dev(seq, ctx->buf, 2421 bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), 2422 bs_cluster_to_lba(ctx->blob->bs, 1), 2423 blob_write_copy_cpl, ctx); 2424 } 2425 2426 static bool 2427 blob_can_copy(struct spdk_blob *blob, uint32_t cluster_start_page, uint64_t *base_lba) 2428 { 2429 uint64_t lba = bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page); 2430 2431 return (blob->bs->dev->copy != NULL) && 2432 blob->back_bs_dev->translate_lba(blob->back_bs_dev, lba, base_lba); 2433 } 2434 2435 static void 2436 blob_copy(struct spdk_blob_copy_cluster_ctx *ctx, spdk_bs_user_op_t *op, uint64_t src_lba) 2437 { 2438 struct spdk_blob *blob = ctx->blob; 2439 uint64_t 
lba_count = bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz); 2440 2441 bs_sequence_copy_dev(ctx->seq, 2442 bs_cluster_to_lba(blob->bs, ctx->new_cluster), 2443 src_lba, 2444 lba_count, 2445 blob_write_copy_cpl, ctx); 2446 } 2447 2448 static void 2449 bs_allocate_and_copy_cluster(struct spdk_blob *blob, 2450 struct spdk_io_channel *_ch, 2451 uint64_t io_unit, spdk_bs_user_op_t *op) 2452 { 2453 struct spdk_bs_cpl cpl; 2454 struct spdk_bs_channel *ch; 2455 struct spdk_blob_copy_cluster_ctx *ctx; 2456 uint32_t cluster_start_page; 2457 uint32_t cluster_number; 2458 bool is_zeroes; 2459 bool can_copy; 2460 uint64_t copy_src_lba; 2461 int rc; 2462 2463 ch = spdk_io_channel_get_ctx(_ch); 2464 2465 if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { 2466 /* There are already operations pending. Queue this user op 2467 * and return because it will be re-executed when the outstanding 2468 * cluster allocation completes. */ 2469 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2470 return; 2471 } 2472 2473 /* Round the io_unit offset down to the first page in the cluster */ 2474 cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); 2475 2476 /* Calculate which index in the metadata cluster array the corresponding 2477 * cluster is supposed to be at. */ 2478 cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); 2479 2480 ctx = calloc(1, sizeof(*ctx)); 2481 if (!ctx) { 2482 bs_user_op_abort(op, -ENOMEM); 2483 return; 2484 } 2485 2486 assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); 2487 2488 ctx->blob = blob; 2489 ctx->page = cluster_start_page; 2490 ctx->new_cluster_page = ch->new_cluster_page; 2491 memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE); 2492 can_copy = blob_can_copy(blob, cluster_start_page, &copy_src_lba); 2493 2494 is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev, 2495 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2496 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz)); 2497 if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) { 2498 ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, 2499 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2500 if (!ctx->buf) { 2501 SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", 2502 blob->bs->cluster_sz); 2503 free(ctx); 2504 bs_user_op_abort(op, -ENOMEM); 2505 return; 2506 } 2507 } 2508 2509 spdk_spin_lock(&blob->bs->used_lock); 2510 rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, 2511 false); 2512 spdk_spin_unlock(&blob->bs->used_lock); 2513 if (rc != 0) { 2514 spdk_free(ctx->buf); 2515 free(ctx); 2516 bs_user_op_abort(op, rc); 2517 return; 2518 } 2519 2520 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2521 cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; 2522 cpl.u.blob_basic.cb_arg = ctx; 2523 2524 ctx->seq = bs_sequence_start(_ch, &cpl); 2525 if (!ctx->seq) { 2526 spdk_spin_lock(&blob->bs->used_lock); 2527 bs_release_cluster(blob->bs, ctx->new_cluster); 2528 spdk_spin_unlock(&blob->bs->used_lock); 2529 spdk_free(ctx->buf); 2530 free(ctx); 2531 bs_user_op_abort(op, -ENOMEM); 2532 return; 2533 } 2534 2535 /* Queue the user op to block other incoming operations */ 2536 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2537 2538 if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) { 2539 if (can_copy) { 2540 blob_copy(ctx, op, copy_src_lba); 2541 } else { 2542 /* Read cluster from backing device */ 2543 bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev,
ctx->buf, 2544 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2545 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), 2546 blob_write_copy, ctx); 2547 } 2548 2549 } else { 2550 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2551 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx); 2552 } 2553 } 2554 2555 static inline bool 2556 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, 2557 uint64_t *lba, uint64_t *lba_count) 2558 { 2559 *lba_count = length; 2560 2561 if (!bs_io_unit_is_allocated(blob, io_unit)) { 2562 assert(blob->back_bs_dev != NULL); 2563 *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); 2564 *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); 2565 return false; 2566 } else { 2567 *lba = bs_blob_io_unit_to_lba(blob, io_unit); 2568 return true; 2569 } 2570 } 2571 2572 struct op_split_ctx { 2573 struct spdk_blob *blob; 2574 struct spdk_io_channel *channel; 2575 uint64_t io_unit_offset; 2576 uint64_t io_units_remaining; 2577 void *curr_payload; 2578 enum spdk_blob_op_type op_type; 2579 spdk_bs_sequence_t *seq; 2580 bool in_submit_ctx; 2581 bool completed_in_submit_ctx; 2582 bool done; 2583 }; 2584 2585 static void 2586 blob_request_submit_op_split_next(void *cb_arg, int bserrno) 2587 { 2588 struct op_split_ctx *ctx = cb_arg; 2589 struct spdk_blob *blob = ctx->blob; 2590 struct spdk_io_channel *ch = ctx->channel; 2591 enum spdk_blob_op_type op_type = ctx->op_type; 2592 uint8_t *buf; 2593 uint64_t offset; 2594 uint64_t length; 2595 uint64_t op_length; 2596 2597 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2598 bs_sequence_finish(ctx->seq, bserrno); 2599 if (ctx->in_submit_ctx) { 2600 /* Defer freeing of the ctx object, since it will be 2601 * accessed when this unwinds back to the submission 2602 * context. 2603 */ 2604 ctx->done = true; 2605 } else { 2606 free(ctx); 2607 } 2608 return; 2609 } 2610 2611 if (ctx->in_submit_ctx) { 2612 /* If this split operation completed in the context 2613 * of its submission, mark the flag and return immediately 2614 * to avoid recursion.
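* The submission loop below checks completed_in_submit_ctx and issues the next chunk itself.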
2615 */ 2616 ctx->completed_in_submit_ctx = true; 2617 return; 2618 } 2619 2620 while (true) { 2621 ctx->completed_in_submit_ctx = false; 2622 2623 offset = ctx->io_unit_offset; 2624 length = ctx->io_units_remaining; 2625 buf = ctx->curr_payload; 2626 op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, 2627 offset)); 2628 2629 /* Update length and payload for next operation */ 2630 ctx->io_units_remaining -= op_length; 2631 ctx->io_unit_offset += op_length; 2632 if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { 2633 ctx->curr_payload += op_length * blob->bs->io_unit_size; 2634 } 2635 2636 assert(!ctx->in_submit_ctx); 2637 ctx->in_submit_ctx = true; 2638 2639 switch (op_type) { 2640 case SPDK_BLOB_READ: 2641 spdk_blob_io_read(blob, ch, buf, offset, op_length, 2642 blob_request_submit_op_split_next, ctx); 2643 break; 2644 case SPDK_BLOB_WRITE: 2645 spdk_blob_io_write(blob, ch, buf, offset, op_length, 2646 blob_request_submit_op_split_next, ctx); 2647 break; 2648 case SPDK_BLOB_UNMAP: 2649 spdk_blob_io_unmap(blob, ch, offset, op_length, 2650 blob_request_submit_op_split_next, ctx); 2651 break; 2652 case SPDK_BLOB_WRITE_ZEROES: 2653 spdk_blob_io_write_zeroes(blob, ch, offset, op_length, 2654 blob_request_submit_op_split_next, ctx); 2655 break; 2656 case SPDK_BLOB_READV: 2657 case SPDK_BLOB_WRITEV: 2658 SPDK_ERRLOG("readv/write not valid\n"); 2659 bs_sequence_finish(ctx->seq, -EINVAL); 2660 free(ctx); 2661 return; 2662 } 2663 2664 #ifndef __clang_analyzer__ 2665 /* scan-build reports a false positive around accessing the ctx here. It 2666 * forms a path that recursively calls this function, but then says 2667 * "assuming ctx->in_submit_ctx is false", when that isn't possible. 2668 * This path does free(ctx), returns to here, and reports a use-after-free 2669 * bug. Wrapping this bit of code so that scan-build doesn't see it 2670 * works around the scan-build bug. 2671 */ 2672 assert(ctx->in_submit_ctx); 2673 ctx->in_submit_ctx = false; 2674 2675 /* If the operation completed immediately, loop back and submit the 2676 * next operation. Otherwise we can return and the next split 2677 * operation will get submitted when this current operation is 2678 * later completed asynchronously. 
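* (this same callback then runs again, outside the submission context).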
2679 */ 2680 if (ctx->completed_in_submit_ctx) { 2681 continue; 2682 } else if (ctx->done) { 2683 free(ctx); 2684 } 2685 #endif 2686 break; 2687 } 2688 } 2689 2690 static void 2691 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, 2692 void *payload, uint64_t offset, uint64_t length, 2693 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2694 { 2695 struct op_split_ctx *ctx; 2696 spdk_bs_sequence_t *seq; 2697 struct spdk_bs_cpl cpl; 2698 2699 assert(blob != NULL); 2700 2701 ctx = calloc(1, sizeof(struct op_split_ctx)); 2702 if (ctx == NULL) { 2703 cb_fn(cb_arg, -ENOMEM); 2704 return; 2705 } 2706 2707 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2708 cpl.u.blob_basic.cb_fn = cb_fn; 2709 cpl.u.blob_basic.cb_arg = cb_arg; 2710 2711 seq = bs_sequence_start(ch, &cpl); 2712 if (!seq) { 2713 free(ctx); 2714 cb_fn(cb_arg, -ENOMEM); 2715 return; 2716 } 2717 2718 ctx->blob = blob; 2719 ctx->channel = ch; 2720 ctx->curr_payload = payload; 2721 ctx->io_unit_offset = offset; 2722 ctx->io_units_remaining = length; 2723 ctx->op_type = op_type; 2724 ctx->seq = seq; 2725 2726 blob_request_submit_op_split_next(ctx, 0); 2727 } 2728 2729 static void 2730 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, 2731 void *payload, uint64_t offset, uint64_t length, 2732 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2733 { 2734 struct spdk_bs_cpl cpl; 2735 uint64_t lba; 2736 uint64_t lba_count; 2737 bool is_allocated; 2738 2739 assert(blob != NULL); 2740 2741 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2742 cpl.u.blob_basic.cb_fn = cb_fn; 2743 cpl.u.blob_basic.cb_arg = cb_arg; 2744 2745 if (blob->frozen_refcnt) { 2746 /* This blob I/O is frozen */ 2747 spdk_bs_user_op_t *op; 2748 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); 2749 2750 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2751 if (!op) { 2752 cb_fn(cb_arg, -ENOMEM); 2753 return; 2754 } 2755 2756 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2757 2758 return; 2759 } 2760 2761 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2762 2763 switch (op_type) { 2764 case SPDK_BLOB_READ: { 2765 spdk_bs_batch_t *batch; 2766 2767 batch = bs_batch_open(_ch, &cpl); 2768 if (!batch) { 2769 cb_fn(cb_arg, -ENOMEM); 2770 return; 2771 } 2772 2773 if (is_allocated) { 2774 /* Read from the blob */ 2775 bs_batch_read_dev(batch, payload, lba, lba_count); 2776 } else { 2777 /* Read from the backing block device */ 2778 bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); 2779 } 2780 2781 bs_batch_close(batch); 2782 break; 2783 } 2784 case SPDK_BLOB_WRITE: 2785 case SPDK_BLOB_WRITE_ZEROES: { 2786 if (is_allocated) { 2787 /* Write to the blob */ 2788 spdk_bs_batch_t *batch; 2789 2790 if (lba_count == 0) { 2791 cb_fn(cb_arg, 0); 2792 return; 2793 } 2794 2795 batch = bs_batch_open(_ch, &cpl); 2796 if (!batch) { 2797 cb_fn(cb_arg, -ENOMEM); 2798 return; 2799 } 2800 2801 if (op_type == SPDK_BLOB_WRITE) { 2802 bs_batch_write_dev(batch, payload, lba, lba_count); 2803 } else { 2804 bs_batch_write_zeroes_dev(batch, lba, lba_count); 2805 } 2806 2807 bs_batch_close(batch); 2808 } else { 2809 /* Queue this operation and allocate the cluster */ 2810 spdk_bs_user_op_t *op; 2811 2812 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2813 if (!op) { 2814 cb_fn(cb_arg, -ENOMEM); 2815 return; 2816 } 2817 2818 bs_allocate_and_copy_cluster(blob, _ch, 
offset, op); 2819 } 2820 break; 2821 } 2822 case SPDK_BLOB_UNMAP: { 2823 spdk_bs_batch_t *batch; 2824 2825 batch = bs_batch_open(_ch, &cpl); 2826 if (!batch) { 2827 cb_fn(cb_arg, -ENOMEM); 2828 return; 2829 } 2830 2831 if (is_allocated) { 2832 bs_batch_unmap_dev(batch, lba, lba_count); 2833 } 2834 2835 bs_batch_close(batch); 2836 break; 2837 } 2838 case SPDK_BLOB_READV: 2839 case SPDK_BLOB_WRITEV: 2840 SPDK_ERRLOG("readv/write not valid\n"); 2841 cb_fn(cb_arg, -EINVAL); 2842 break; 2843 } 2844 } 2845 2846 static void 2847 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2848 void *payload, uint64_t offset, uint64_t length, 2849 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2850 { 2851 assert(blob != NULL); 2852 2853 if (blob->data_ro && op_type != SPDK_BLOB_READ) { 2854 cb_fn(cb_arg, -EPERM); 2855 return; 2856 } 2857 2858 if (length == 0) { 2859 cb_fn(cb_arg, 0); 2860 return; 2861 } 2862 2863 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2864 cb_fn(cb_arg, -EINVAL); 2865 return; 2866 } 2867 if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { 2868 blob_request_submit_op_single(_channel, blob, payload, offset, length, 2869 cb_fn, cb_arg, op_type); 2870 } else { 2871 blob_request_submit_op_split(_channel, blob, payload, offset, length, 2872 cb_fn, cb_arg, op_type); 2873 } 2874 } 2875 2876 struct rw_iov_ctx { 2877 struct spdk_blob *blob; 2878 struct spdk_io_channel *channel; 2879 spdk_blob_op_complete cb_fn; 2880 void *cb_arg; 2881 bool read; 2882 int iovcnt; 2883 struct iovec *orig_iov; 2884 uint64_t io_unit_offset; 2885 uint64_t io_units_remaining; 2886 uint64_t io_units_done; 2887 struct spdk_blob_ext_io_opts *ext_io_opts; 2888 struct iovec iov[0]; 2889 }; 2890 2891 static void 2892 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2893 { 2894 assert(cb_arg == NULL); 2895 bs_sequence_finish(seq, bserrno); 2896 } 2897 2898 static void 2899 rw_iov_split_next(void *cb_arg, int bserrno) 2900 { 2901 struct rw_iov_ctx *ctx = cb_arg; 2902 struct spdk_blob *blob = ctx->blob; 2903 struct iovec *iov, *orig_iov; 2904 int iovcnt; 2905 size_t orig_iovoff; 2906 uint64_t io_units_count, io_units_to_boundary, io_unit_offset; 2907 uint64_t byte_count; 2908 2909 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2910 ctx->cb_fn(ctx->cb_arg, bserrno); 2911 free(ctx); 2912 return; 2913 } 2914 2915 io_unit_offset = ctx->io_unit_offset; 2916 io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); 2917 io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); 2918 /* 2919 * Get index and offset into the original iov array for our current position in the I/O sequence. 2920 * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will 2921 * point to the current position in the I/O sequence. 2922 */ 2923 byte_count = ctx->io_units_done * blob->bs->io_unit_size; 2924 orig_iov = &ctx->orig_iov[0]; 2925 orig_iovoff = 0; 2926 while (byte_count > 0) { 2927 if (byte_count >= orig_iov->iov_len) { 2928 byte_count -= orig_iov->iov_len; 2929 orig_iov++; 2930 } else { 2931 orig_iovoff = byte_count; 2932 byte_count = 0; 2933 } 2934 } 2935 2936 /* 2937 * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many 2938 * bytes of this next I/O remain to be accounted for in the new iov array. 
2939 */ 2940 byte_count = io_units_count * blob->bs->io_unit_size; 2941 iov = &ctx->iov[0]; 2942 iovcnt = 0; 2943 while (byte_count > 0) { 2944 assert(iovcnt < ctx->iovcnt); 2945 iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); 2946 iov->iov_base = orig_iov->iov_base + orig_iovoff; 2947 byte_count -= iov->iov_len; 2948 orig_iovoff = 0; 2949 orig_iov++; 2950 iov++; 2951 iovcnt++; 2952 } 2953 2954 ctx->io_unit_offset += io_units_count; 2955 ctx->io_units_remaining -= io_units_count; 2956 ctx->io_units_done += io_units_count; 2957 iov = &ctx->iov[0]; 2958 2959 if (ctx->read) { 2960 spdk_blob_io_readv_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2961 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2962 } else { 2963 spdk_blob_io_writev_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2964 io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts); 2965 } 2966 } 2967 2968 static void 2969 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2970 struct iovec *iov, int iovcnt, 2971 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg, bool read, 2972 struct spdk_blob_ext_io_opts *ext_io_opts) 2973 { 2974 struct spdk_bs_cpl cpl; 2975 2976 assert(blob != NULL); 2977 2978 if (!read && blob->data_ro) { 2979 cb_fn(cb_arg, -EPERM); 2980 return; 2981 } 2982 2983 if (length == 0) { 2984 cb_fn(cb_arg, 0); 2985 return; 2986 } 2987 2988 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2989 cb_fn(cb_arg, -EINVAL); 2990 return; 2991 } 2992 2993 /* 2994 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having 2995 * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, 2996 * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster 2997 * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need 2998 * to allocate a separate iov array and split the I/O such that none of the resulting 2999 * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) 3000 * but since this case happens very infrequently, any performance impact will be negligible. 3001 * 3002 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs 3003 * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them 3004 * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called 3005 * when the batch was completed, to allow for freeing the memory for the iov arrays. 3006 */ 3007 if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { 3008 uint64_t lba_count; 3009 uint64_t lba; 3010 bool is_allocated; 3011 3012 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 3013 cpl.u.blob_basic.cb_fn = cb_fn; 3014 cpl.u.blob_basic.cb_arg = cb_arg; 3015 3016 if (blob->frozen_refcnt) { 3017 /* This blob I/O is frozen */ 3018 enum spdk_blob_op_type op_type; 3019 spdk_bs_user_op_t *op; 3020 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); 3021 3022 op_type = read ? 
SPDK_BLOB_READV : SPDK_BLOB_WRITEV; 3023 op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); 3024 if (!op) { 3025 cb_fn(cb_arg, -ENOMEM); 3026 return; 3027 } 3028 3029 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 3030 3031 return; 3032 } 3033 3034 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 3035 3036 if (read) { 3037 spdk_bs_sequence_t *seq; 3038 3039 seq = bs_sequence_start(_channel, &cpl); 3040 if (!seq) { 3041 cb_fn(cb_arg, -ENOMEM); 3042 return; 3043 } 3044 3045 seq->ext_io_opts = ext_io_opts; 3046 3047 if (is_allocated) { 3048 bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 3049 } else { 3050 bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, 3051 rw_iov_done, NULL); 3052 } 3053 } else { 3054 if (is_allocated) { 3055 spdk_bs_sequence_t *seq; 3056 3057 seq = bs_sequence_start(_channel, &cpl); 3058 if (!seq) { 3059 cb_fn(cb_arg, -ENOMEM); 3060 return; 3061 } 3062 3063 seq->ext_io_opts = ext_io_opts; 3064 3065 bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 3066 } else { 3067 /* Queue this operation and allocate the cluster */ 3068 spdk_bs_user_op_t *op; 3069 3070 op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, 3071 length); 3072 if (!op) { 3073 cb_fn(cb_arg, -ENOMEM); 3074 return; 3075 } 3076 3077 op->ext_io_opts = ext_io_opts; 3078 3079 bs_allocate_and_copy_cluster(blob, _channel, offset, op); 3080 } 3081 } 3082 } else { 3083 struct rw_iov_ctx *ctx; 3084 3085 ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); 3086 if (ctx == NULL) { 3087 cb_fn(cb_arg, -ENOMEM); 3088 return; 3089 } 3090 3091 ctx->blob = blob; 3092 ctx->channel = _channel; 3093 ctx->cb_fn = cb_fn; 3094 ctx->cb_arg = cb_arg; 3095 ctx->read = read; 3096 ctx->orig_iov = iov; 3097 ctx->iovcnt = iovcnt; 3098 ctx->io_unit_offset = offset; 3099 ctx->io_units_remaining = length; 3100 ctx->io_units_done = 0; 3101 ctx->ext_io_opts = ext_io_opts; 3102 3103 rw_iov_split_next(ctx, 0); 3104 } 3105 } 3106 3107 static struct spdk_blob * 3108 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) 3109 { 3110 struct spdk_blob find; 3111 3112 if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { 3113 return NULL; 3114 } 3115 3116 find.id = blobid; 3117 return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find); 3118 } 3119 3120 static void 3121 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, 3122 struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) 3123 { 3124 assert(blob != NULL); 3125 *snapshot_entry = NULL; 3126 *clone_entry = NULL; 3127 3128 if (blob->parent_id == SPDK_BLOBID_INVALID) { 3129 return; 3130 } 3131 3132 TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { 3133 if ((*snapshot_entry)->id == blob->parent_id) { 3134 break; 3135 } 3136 } 3137 3138 if (*snapshot_entry != NULL) { 3139 TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { 3140 if ((*clone_entry)->id == blob->id) { 3141 break; 3142 } 3143 } 3144 3145 assert(*clone_entry != NULL); 3146 } 3147 } 3148 3149 static int 3150 bs_channel_create(void *io_device, void *ctx_buf) 3151 { 3152 struct spdk_blob_store *bs = io_device; 3153 struct spdk_bs_channel *channel = ctx_buf; 3154 struct spdk_bs_dev *dev; 3155 uint32_t max_ops = bs->max_channel_ops; 3156 uint32_t i; 3157 3158 dev = bs->dev; 3159 3160 channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); 3161 if 
(!channel->req_mem) { 3162 return -1; 3163 } 3164 3165 TAILQ_INIT(&channel->reqs); 3166 3167 for (i = 0; i < max_ops; i++) { 3168 TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); 3169 } 3170 3171 channel->bs = bs; 3172 channel->dev = dev; 3173 channel->dev_channel = dev->create_channel(dev); 3174 3175 if (!channel->dev_channel) { 3176 SPDK_ERRLOG("Failed to create device channel.\n"); 3177 free(channel->req_mem); 3178 return -1; 3179 } 3180 3181 channel->new_cluster_page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, 3182 SPDK_MALLOC_DMA); 3183 if (!channel->new_cluster_page) { 3184 SPDK_ERRLOG("Failed to allocate new cluster page\n"); 3185 free(channel->req_mem); 3186 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3187 return -1; 3188 } 3189 3190 TAILQ_INIT(&channel->need_cluster_alloc); 3191 TAILQ_INIT(&channel->queued_io); 3192 3193 return 0; 3194 } 3195 3196 static void 3197 bs_channel_destroy(void *io_device, void *ctx_buf) 3198 { 3199 struct spdk_bs_channel *channel = ctx_buf; 3200 spdk_bs_user_op_t *op; 3201 3202 while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { 3203 op = TAILQ_FIRST(&channel->need_cluster_alloc); 3204 TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); 3205 bs_user_op_abort(op, -EIO); 3206 } 3207 3208 while (!TAILQ_EMPTY(&channel->queued_io)) { 3209 op = TAILQ_FIRST(&channel->queued_io); 3210 TAILQ_REMOVE(&channel->queued_io, op, link); 3211 bs_user_op_abort(op, -EIO); 3212 } 3213 3214 free(channel->req_mem); 3215 spdk_free(channel->new_cluster_page); 3216 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3217 } 3218 3219 static void 3220 bs_dev_destroy(void *io_device) 3221 { 3222 struct spdk_blob_store *bs = io_device; 3223 struct spdk_blob *blob, *blob_tmp; 3224 3225 bs->dev->destroy(bs->dev); 3226 3227 RB_FOREACH_SAFE(blob, spdk_blob_tree, &bs->open_blobs, blob_tmp) { 3228 RB_REMOVE(spdk_blob_tree, &bs->open_blobs, blob); 3229 spdk_bit_array_clear(bs->open_blobids, blob->id); 3230 blob_free(blob); 3231 } 3232 3233 spdk_spin_destroy(&bs->used_lock); 3234 3235 spdk_bit_array_free(&bs->open_blobids); 3236 spdk_bit_array_free(&bs->used_blobids); 3237 spdk_bit_array_free(&bs->used_md_pages); 3238 spdk_bit_pool_free(&bs->used_clusters); 3239 /* 3240 * If this function is called for any reason except a successful unload, 3241 * the unload_cpl type will be NONE and this will be a nop. 
3242 */ 3243 bs_call_cpl(&bs->unload_cpl, bs->unload_err); 3244 3245 free(bs); 3246 } 3247 3248 static int 3249 bs_blob_list_add(struct spdk_blob *blob) 3250 { 3251 spdk_blob_id snapshot_id; 3252 struct spdk_blob_list *snapshot_entry = NULL; 3253 struct spdk_blob_list *clone_entry = NULL; 3254 3255 assert(blob != NULL); 3256 3257 snapshot_id = blob->parent_id; 3258 if (snapshot_id == SPDK_BLOBID_INVALID) { 3259 return 0; 3260 } 3261 3262 snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); 3263 if (snapshot_entry == NULL) { 3264 /* Snapshot not found */ 3265 snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); 3266 if (snapshot_entry == NULL) { 3267 return -ENOMEM; 3268 } 3269 snapshot_entry->id = snapshot_id; 3270 TAILQ_INIT(&snapshot_entry->clones); 3271 TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); 3272 } else { 3273 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 3274 if (clone_entry->id == blob->id) { 3275 break; 3276 } 3277 } 3278 } 3279 3280 if (clone_entry == NULL) { 3281 /* Clone not found */ 3282 clone_entry = calloc(1, sizeof(struct spdk_blob_list)); 3283 if (clone_entry == NULL) { 3284 return -ENOMEM; 3285 } 3286 clone_entry->id = blob->id; 3287 TAILQ_INIT(&clone_entry->clones); 3288 TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); 3289 snapshot_entry->clone_count++; 3290 } 3291 3292 return 0; 3293 } 3294 3295 static void 3296 bs_blob_list_remove(struct spdk_blob *blob) 3297 { 3298 struct spdk_blob_list *snapshot_entry = NULL; 3299 struct spdk_blob_list *clone_entry = NULL; 3300 3301 blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); 3302 3303 if (snapshot_entry == NULL) { 3304 return; 3305 } 3306 3307 blob->parent_id = SPDK_BLOBID_INVALID; 3308 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3309 free(clone_entry); 3310 3311 snapshot_entry->clone_count--; 3312 } 3313 3314 static int 3315 bs_blob_list_free(struct spdk_blob_store *bs) 3316 { 3317 struct spdk_blob_list *snapshot_entry; 3318 struct spdk_blob_list *snapshot_entry_tmp; 3319 struct spdk_blob_list *clone_entry; 3320 struct spdk_blob_list *clone_entry_tmp; 3321 3322 TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { 3323 TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { 3324 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3325 free(clone_entry); 3326 } 3327 TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); 3328 free(snapshot_entry); 3329 } 3330 3331 return 0; 3332 } 3333 3334 static void 3335 bs_free(struct spdk_blob_store *bs) 3336 { 3337 bs_blob_list_free(bs); 3338 3339 bs_unregister_md_thread(bs); 3340 spdk_io_device_unregister(bs, bs_dev_destroy); 3341 } 3342 3343 void 3344 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size) 3345 { 3346 3347 if (!opts) { 3348 SPDK_ERRLOG("opts should not be NULL\n"); 3349 return; 3350 } 3351 3352 if (!opts_size) { 3353 SPDK_ERRLOG("opts_size should not be zero value\n"); 3354 return; 3355 } 3356 3357 memset(opts, 0, opts_size); 3358 opts->opts_size = opts_size; 3359 3360 #define FIELD_OK(field) \ 3361 offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size 3362 3363 #define SET_FIELD(field, value) \ 3364 if (FIELD_OK(field)) { \ 3365 opts->field = value; \ 3366 } \ 3367 3368 SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ); 3369 SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3370 SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3371 SET_FIELD(max_channel_ops, 
SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS); 3372 SET_FIELD(clear_method, BS_CLEAR_WITH_UNMAP); 3373 3374 if (FIELD_OK(bstype)) { 3375 memset(&opts->bstype, 0, sizeof(opts->bstype)); 3376 } 3377 3378 SET_FIELD(iter_cb_fn, NULL); 3379 SET_FIELD(iter_cb_arg, NULL); 3380 SET_FIELD(force_recover, false); 3381 3382 #undef FIELD_OK 3383 #undef SET_FIELD 3384 } 3385 3386 static int 3387 bs_opts_verify(struct spdk_bs_opts *opts) 3388 { 3389 if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || 3390 opts->max_channel_ops == 0) { 3391 SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); 3392 return -1; 3393 } 3394 3395 return 0; 3396 } 3397 3398 /* START spdk_bs_load */ 3399 3400 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */ 3401 3402 struct spdk_bs_load_ctx { 3403 struct spdk_blob_store *bs; 3404 struct spdk_bs_super_block *super; 3405 3406 struct spdk_bs_md_mask *mask; 3407 bool in_page_chain; 3408 uint32_t page_index; 3409 uint32_t cur_page; 3410 struct spdk_blob_md_page *page; 3411 3412 uint64_t num_extent_pages; 3413 uint32_t *extent_page_num; 3414 struct spdk_blob_md_page *extent_pages; 3415 struct spdk_bit_array *used_clusters; 3416 3417 spdk_bs_sequence_t *seq; 3418 spdk_blob_op_with_handle_complete iter_cb_fn; 3419 void *iter_cb_arg; 3420 struct spdk_blob *blob; 3421 spdk_blob_id blobid; 3422 3423 bool force_recover; 3424 3425 /* These fields are used in the spdk_bs_dump path. */ 3426 bool dumping; 3427 FILE *fp; 3428 spdk_bs_dump_print_xattr print_xattr_fn; 3429 char xattr_name[4096]; 3430 }; 3431 3432 static int 3433 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs, 3434 struct spdk_bs_load_ctx **_ctx) 3435 { 3436 struct spdk_blob_store *bs; 3437 struct spdk_bs_load_ctx *ctx; 3438 uint64_t dev_size; 3439 int rc; 3440 3441 dev_size = dev->blocklen * dev->blockcnt; 3442 if (dev_size < opts->cluster_sz) { 3443 /* Device size cannot be smaller than cluster size of blobstore */ 3444 SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", 3445 dev_size, opts->cluster_sz); 3446 return -ENOSPC; 3447 } 3448 if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { 3449 /* Cluster size cannot be smaller than page size */ 3450 SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", 3451 opts->cluster_sz, SPDK_BS_PAGE_SIZE); 3452 return -EINVAL; 3453 } 3454 bs = calloc(1, sizeof(struct spdk_blob_store)); 3455 if (!bs) { 3456 return -ENOMEM; 3457 } 3458 3459 ctx = calloc(1, sizeof(struct spdk_bs_load_ctx)); 3460 if (!ctx) { 3461 free(bs); 3462 return -ENOMEM; 3463 } 3464 3465 ctx->bs = bs; 3466 ctx->iter_cb_fn = opts->iter_cb_fn; 3467 ctx->iter_cb_arg = opts->iter_cb_arg; 3468 ctx->force_recover = opts->force_recover; 3469 3470 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 3471 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3472 if (!ctx->super) { 3473 free(ctx); 3474 free(bs); 3475 return -ENOMEM; 3476 } 3477 3478 RB_INIT(&bs->open_blobs); 3479 TAILQ_INIT(&bs->snapshots); 3480 bs->dev = dev; 3481 bs->md_thread = spdk_get_thread(); 3482 assert(bs->md_thread != NULL); 3483 3484 /* 3485 * Do not use bs_lba_to_cluster() here since blockcnt may not be an 3486 * even multiple of the cluster size. 
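* The integer division below simply drops any partial cluster at the end of the device.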
3487 */ 3488 bs->cluster_sz = opts->cluster_sz; 3489 bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); 3490 ctx->used_clusters = spdk_bit_array_create(bs->total_clusters); 3491 if (!ctx->used_clusters) { 3492 spdk_free(ctx->super); 3493 free(ctx); 3494 free(bs); 3495 return -ENOMEM; 3496 } 3497 3498 bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; 3499 if (spdk_u32_is_pow2(bs->pages_per_cluster)) { 3500 bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); 3501 } 3502 bs->num_free_clusters = bs->total_clusters; 3503 bs->io_unit_size = dev->blocklen; 3504 3505 bs->max_channel_ops = opts->max_channel_ops; 3506 bs->super_blob = SPDK_BLOBID_INVALID; 3507 memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); 3508 3509 /* The metadata is assumed to be at least 1 page */ 3510 bs->used_md_pages = spdk_bit_array_create(1); 3511 bs->used_blobids = spdk_bit_array_create(0); 3512 bs->open_blobids = spdk_bit_array_create(0); 3513 3514 spdk_spin_init(&bs->used_lock); 3515 3516 spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, 3517 sizeof(struct spdk_bs_channel), "blobstore"); 3518 rc = bs_register_md_thread(bs); 3519 if (rc == -1) { 3520 spdk_io_device_unregister(bs, NULL); 3521 spdk_spin_destroy(&bs->used_lock); 3522 spdk_bit_array_free(&bs->open_blobids); 3523 spdk_bit_array_free(&bs->used_blobids); 3524 spdk_bit_array_free(&bs->used_md_pages); 3525 spdk_bit_array_free(&ctx->used_clusters); 3526 spdk_free(ctx->super); 3527 free(ctx); 3528 free(bs); 3529 /* FIXME: this is a lie but don't know how to get a proper error code here */ 3530 return -ENOMEM; 3531 } 3532 3533 *_ctx = ctx; 3534 *_bs = bs; 3535 return 0; 3536 } 3537 3538 static void 3539 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) 3540 { 3541 assert(bserrno != 0); 3542 3543 spdk_free(ctx->super); 3544 bs_sequence_finish(ctx->seq, bserrno); 3545 bs_free(ctx->bs); 3546 spdk_bit_array_free(&ctx->used_clusters); 3547 free(ctx); 3548 } 3549 3550 static void 3551 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 3552 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) 3553 { 3554 /* Update the values in the super block */ 3555 super->super_blob = bs->super_blob; 3556 memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); 3557 super->crc = blob_md_page_calc_crc(super); 3558 bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), 3559 bs_byte_to_lba(bs, sizeof(*super)), 3560 cb_fn, cb_arg); 3561 } 3562 3563 static void 3564 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3565 { 3566 struct spdk_bs_load_ctx *ctx = arg; 3567 uint64_t mask_size, lba, lba_count; 3568 3569 /* Write out the used clusters mask */ 3570 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3571 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3572 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3573 if (!ctx->mask) { 3574 bs_load_ctx_fail(ctx, -ENOMEM); 3575 return; 3576 } 3577 3578 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; 3579 ctx->mask->length = ctx->bs->total_clusters; 3580 /* We could get here through the normal unload path, or through dirty 3581 * shutdown recovery. For the normal unload path, we use the mask from 3582 * the bit pool. For dirty shutdown recovery, we don't have a bit pool yet - 3583 * only the bit array from the load ctx. 
3584 */ 3585 if (ctx->bs->used_clusters) { 3586 assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters)); 3587 spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask); 3588 } else { 3589 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters)); 3590 spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask); 3591 } 3592 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3593 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3594 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3595 } 3596 3597 static void 3598 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3599 { 3600 struct spdk_bs_load_ctx *ctx = arg; 3601 uint64_t mask_size, lba, lba_count; 3602 3603 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3604 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3605 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3606 if (!ctx->mask) { 3607 bs_load_ctx_fail(ctx, -ENOMEM); 3608 return; 3609 } 3610 3611 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; 3612 ctx->mask->length = ctx->super->md_len; 3613 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); 3614 3615 spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3616 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3617 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3618 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3619 } 3620 3621 static void 3622 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3623 { 3624 struct spdk_bs_load_ctx *ctx = arg; 3625 uint64_t mask_size, lba, lba_count; 3626 3627 if (ctx->super->used_blobid_mask_len == 0) { 3628 /* 3629 * This is a pre-v3 on-disk format where the blobid mask does not get 3630 * written to disk. 
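* There is nothing to write in that case, so just invoke the callback and move on.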
3631 */ 3632 cb_fn(seq, arg, 0); 3633 return; 3634 } 3635 3636 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3637 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3638 SPDK_MALLOC_DMA); 3639 if (!ctx->mask) { 3640 bs_load_ctx_fail(ctx, -ENOMEM); 3641 return; 3642 } 3643 3644 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS; 3645 ctx->mask->length = ctx->super->md_len; 3646 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); 3647 3648 spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask); 3649 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3650 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3651 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3652 } 3653 3654 static void 3655 blob_set_thin_provision(struct spdk_blob *blob) 3656 { 3657 blob_verify_md_op(blob); 3658 blob->invalid_flags |= SPDK_BLOB_THIN_PROV; 3659 blob->state = SPDK_BLOB_STATE_DIRTY; 3660 } 3661 3662 static void 3663 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method) 3664 { 3665 blob_verify_md_op(blob); 3666 blob->clear_method = clear_method; 3667 blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT); 3668 blob->state = SPDK_BLOB_STATE_DIRTY; 3669 } 3670 3671 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno); 3672 3673 static void 3674 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno) 3675 { 3676 struct spdk_bs_load_ctx *ctx = cb_arg; 3677 spdk_blob_id id; 3678 int64_t page_num; 3679 3680 /* Iterate to next blob (we can't use spdk_bs_iter_next function as our 3681 * last blob has been removed */ 3682 page_num = bs_blobid_to_page(ctx->blobid); 3683 page_num++; 3684 page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num); 3685 if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) { 3686 bs_load_iter(ctx, NULL, -ENOENT); 3687 return; 3688 } 3689 3690 id = bs_page_to_blobid(page_num); 3691 3692 spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx); 3693 } 3694 3695 static void 3696 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno) 3697 { 3698 struct spdk_bs_load_ctx *ctx = cb_arg; 3699 3700 if (bserrno != 0) { 3701 SPDK_ERRLOG("Failed to close corrupted blob\n"); 3702 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3703 return; 3704 } 3705 3706 spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx); 3707 } 3708 3709 static void 3710 bs_delete_corrupted_blob(void *cb_arg, int bserrno) 3711 { 3712 struct spdk_bs_load_ctx *ctx = cb_arg; 3713 uint64_t i; 3714 3715 if (bserrno != 0) { 3716 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3717 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3718 return; 3719 } 3720 3721 /* Snapshot and clone have the same copy of cluster map and extent pages 3722 * at this point. Let's clear both for snapshot now, 3723 * so that it won't be cleared for clone later when we remove snapshot. 
3724 * Also set thin provision to pass data corruption check */ 3725 for (i = 0; i < ctx->blob->active.num_clusters; i++) { 3726 ctx->blob->active.clusters[i] = 0; 3727 } 3728 for (i = 0; i < ctx->blob->active.num_extent_pages; i++) { 3729 ctx->blob->active.extent_pages[i] = 0; 3730 } 3731 3732 ctx->blob->md_ro = false; 3733 3734 blob_set_thin_provision(ctx->blob); 3735 3736 ctx->blobid = ctx->blob->id; 3737 3738 spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx); 3739 } 3740 3741 static void 3742 bs_update_corrupted_blob(void *cb_arg, int bserrno) 3743 { 3744 struct spdk_bs_load_ctx *ctx = cb_arg; 3745 3746 if (bserrno != 0) { 3747 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3748 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3749 return; 3750 } 3751 3752 ctx->blob->md_ro = false; 3753 blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true); 3754 blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true); 3755 spdk_blob_set_read_only(ctx->blob); 3756 3757 if (ctx->iter_cb_fn) { 3758 ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0); 3759 } 3760 bs_blob_list_add(ctx->blob); 3761 3762 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3763 } 3764 3765 static void 3766 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno) 3767 { 3768 struct spdk_bs_load_ctx *ctx = cb_arg; 3769 3770 if (bserrno != 0) { 3771 SPDK_ERRLOG("Failed to open clone of a corrupted blob\n"); 3772 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3773 return; 3774 } 3775 3776 if (blob->parent_id == ctx->blob->id) { 3777 /* Power failure occurred before updating clone (snapshot delete case) 3778 * or after updating clone (creating snapshot case) - keep snapshot */ 3779 spdk_blob_close(blob, bs_update_corrupted_blob, ctx); 3780 } else { 3781 /* Power failure occurred after updating clone (snapshot delete case) 3782 * or before updating clone (creating snapshot case) - remove snapshot */ 3783 spdk_blob_close(blob, bs_delete_corrupted_blob, ctx); 3784 } 3785 } 3786 3787 static void 3788 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) 3789 { 3790 struct spdk_bs_load_ctx *ctx = arg; 3791 const void *value; 3792 size_t len; 3793 int rc = 0; 3794 3795 if (bserrno == 0) { 3796 /* Examine blob if it is corrupted after power failure. Fix 3797 * the ones that can be fixed and remove any other corrupted 3798 * ones. If it is not corrupted just process it */ 3799 rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true); 3800 if (rc != 0) { 3801 rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true); 3802 if (rc != 0) { 3803 /* Not corrupted - process it and continue with iterating through blobs */ 3804 if (ctx->iter_cb_fn) { 3805 ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0); 3806 } 3807 bs_blob_list_add(blob); 3808 spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx); 3809 return; 3810 } 3811 3812 } 3813 3814 assert(len == sizeof(spdk_blob_id)); 3815 3816 ctx->blob = blob; 3817 3818 /* Open clone to check if we are able to fix this blob or should we remove it */ 3819 spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx); 3820 return; 3821 } else if (bserrno == -ENOENT) { 3822 bserrno = 0; 3823 } else { 3824 /* 3825 * This case needs to be looked at further. Same problem 3826 * exists with applications that rely on explicit blob 3827 * iteration. We should just skip the blob that failed 3828 * to load and continue on to the next one. 
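* For now, however, iteration stops and the load completes with this error.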
3829 */ 3830 SPDK_ERRLOG("Error in iterating blobs\n"); 3831 } 3832 3833 ctx->iter_cb_fn = NULL; 3834 3835 spdk_free(ctx->super); 3836 spdk_free(ctx->mask); 3837 bs_sequence_finish(ctx->seq, bserrno); 3838 free(ctx); 3839 } 3840 3841 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); 3842 3843 static void 3844 bs_load_complete(struct spdk_bs_load_ctx *ctx) 3845 { 3846 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 3847 if (ctx->dumping) { 3848 bs_dump_read_md_page(ctx->seq, ctx); 3849 return; 3850 } 3851 spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); 3852 } 3853 3854 static void 3855 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3856 { 3857 struct spdk_bs_load_ctx *ctx = cb_arg; 3858 int rc; 3859 3860 /* The type must be correct */ 3861 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); 3862 3863 /* The length of the mask (in bits) must not be greater than 3864 * the length of the buffer (converted to bits) */ 3865 assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); 3866 3867 /* The length of the mask must be exactly equal to the size 3868 * (in pages) of the metadata region */ 3869 assert(ctx->mask->length == ctx->super->md_len); 3870 3871 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length); 3872 if (rc < 0) { 3873 spdk_free(ctx->mask); 3874 bs_load_ctx_fail(ctx, rc); 3875 return; 3876 } 3877 3878 spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask); 3879 bs_load_complete(ctx); 3880 } 3881 3882 static void 3883 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3884 { 3885 struct spdk_bs_load_ctx *ctx = cb_arg; 3886 uint64_t lba, lba_count, mask_size; 3887 int rc; 3888 3889 if (bserrno != 0) { 3890 bs_load_ctx_fail(ctx, bserrno); 3891 return; 3892 } 3893 3894 /* The type must be correct */ 3895 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); 3896 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3897 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( 3898 struct spdk_blob_md_page) * 8)); 3899 /* 3900 * The length of the mask must be equal to or larger than the total number of clusters. It may be 3901 * larger than the total number of clusters due to a failure spdk_bs_grow. 
3902 */ 3903 assert(ctx->mask->length >= ctx->bs->total_clusters); 3904 if (ctx->mask->length > ctx->bs->total_clusters) { 3905 SPDK_WARNLOG("Shrinking the used_clusters mask length to total_clusters\n"); 3906 ctx->mask->length = ctx->bs->total_clusters; 3907 } 3908 3909 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length); 3910 if (rc < 0) { 3911 spdk_free(ctx->mask); 3912 bs_load_ctx_fail(ctx, rc); 3913 return; 3914 } 3915 3916 spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask); 3917 ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters); 3918 assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); 3919 3920 spdk_free(ctx->mask); 3921 3922 /* Read the used blobids mask */ 3923 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3924 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3925 SPDK_MALLOC_DMA); 3926 if (!ctx->mask) { 3927 bs_load_ctx_fail(ctx, -ENOMEM); 3928 return; 3929 } 3930 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3931 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3932 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3933 bs_load_used_blobids_cpl, ctx); 3934 } 3935 3936 static void 3937 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3938 { 3939 struct spdk_bs_load_ctx *ctx = cb_arg; 3940 uint64_t lba, lba_count, mask_size; 3941 int rc; 3942 3943 if (bserrno != 0) { 3944 bs_load_ctx_fail(ctx, bserrno); 3945 return; 3946 } 3947 3948 /* The type must be correct */ 3949 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); 3950 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3951 assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * 3952 8)); 3953 /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ 3954 if (ctx->mask->length != ctx->super->md_len) { 3955 SPDK_ERRLOG("mismatched md_len in used_pages mask: " 3956 "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n", 3957 ctx->mask->length, ctx->super->md_len); 3958 assert(false); 3959 } 3960 3961 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length); 3962 if (rc < 0) { 3963 spdk_free(ctx->mask); 3964 bs_load_ctx_fail(ctx, rc); 3965 return; 3966 } 3967 3968 spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3969 spdk_free(ctx->mask); 3970 3971 /* Read the used clusters mask */ 3972 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3973 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3974 SPDK_MALLOC_DMA); 3975 if (!ctx->mask) { 3976 bs_load_ctx_fail(ctx, -ENOMEM); 3977 return; 3978 } 3979 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3980 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3981 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3982 bs_load_used_clusters_cpl, ctx); 3983 } 3984 3985 static void 3986 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) 3987 { 3988 uint64_t lba, lba_count, mask_size; 3989 3990 /* Read the used pages mask */ 3991 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3992 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3993 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3994 if (!ctx->mask) { 3995 bs_load_ctx_fail(ctx, -ENOMEM); 3996 return; 3997 } 3998 3999 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 4000 lba_count =
bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 4001 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, 4002 bs_load_used_pages_cpl, ctx); 4003 } 4004 4005 static int 4006 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) 4007 { 4008 struct spdk_blob_store *bs = ctx->bs; 4009 struct spdk_blob_md_descriptor *desc; 4010 size_t cur_desc = 0; 4011 4012 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4013 while (cur_desc < sizeof(page->descriptors)) { 4014 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4015 if (desc->length == 0) { 4016 /* If padding and length are 0, this terminates the page */ 4017 break; 4018 } 4019 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4020 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4021 unsigned int i, j; 4022 unsigned int cluster_count = 0; 4023 uint32_t cluster_idx; 4024 4025 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4026 4027 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4028 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 4029 cluster_idx = desc_extent_rle->extents[i].cluster_idx; 4030 /* 4031 * cluster_idx = 0 means an unallocated cluster - don't mark that 4032 * in the used cluster map. 4033 */ 4034 if (cluster_idx != 0) { 4035 SPDK_NOTICELOG("Recover: cluster %" PRIu32 "\n", cluster_idx + j); 4036 spdk_bit_array_set(ctx->used_clusters, cluster_idx + j); 4037 if (bs->num_free_clusters == 0) { 4038 return -ENOSPC; 4039 } 4040 bs->num_free_clusters--; 4041 } 4042 cluster_count++; 4043 } 4044 } 4045 if (cluster_count == 0) { 4046 return -EINVAL; 4047 } 4048 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4049 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4050 uint32_t i; 4051 uint32_t cluster_count = 0; 4052 uint32_t cluster_idx; 4053 size_t cluster_idx_length; 4054 4055 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4056 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 4057 4058 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 4059 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 4060 return -EINVAL; 4061 } 4062 4063 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 4064 cluster_idx = desc_extent->cluster_idx[i]; 4065 /* 4066 * cluster_idx = 0 means an unallocated cluster - don't mark that 4067 * in the used cluster map. 
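 * Non-zero indices are recorded in ctx->used_clusters and bs->num_free_clusters is decremented just below.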
4068 */ 4069 if (cluster_idx != 0) { 4070 if (cluster_idx < desc_extent->start_cluster_idx && 4071 cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { 4072 return -EINVAL; 4073 } 4074 spdk_bit_array_set(ctx->used_clusters, cluster_idx); 4075 if (bs->num_free_clusters == 0) { 4076 return -ENOSPC; 4077 } 4078 bs->num_free_clusters--; 4079 } 4080 cluster_count++; 4081 } 4082 4083 if (cluster_count == 0) { 4084 return -EINVAL; 4085 } 4086 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4087 /* Skip this item */ 4088 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4089 /* Skip this item */ 4090 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4091 /* Skip this item */ 4092 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4093 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 4094 uint32_t num_extent_pages = ctx->num_extent_pages; 4095 uint32_t i; 4096 size_t extent_pages_length; 4097 void *tmp; 4098 4099 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 4100 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 4101 4102 if (desc_extent_table->length == 0 || 4103 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 4104 return -EINVAL; 4105 } 4106 4107 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4108 if (desc_extent_table->extent_page[i].page_idx != 0) { 4109 if (desc_extent_table->extent_page[i].num_pages != 1) { 4110 return -EINVAL; 4111 } 4112 num_extent_pages += 1; 4113 } 4114 } 4115 4116 if (num_extent_pages > 0) { 4117 tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); 4118 if (tmp == NULL) { 4119 return -ENOMEM; 4120 } 4121 ctx->extent_page_num = tmp; 4122 4123 /* Extent table entries contain md page numbers for extent pages. 4124 * Zeroes represent unallocated extent pages, those are run-length-encoded. 4125 */ 4126 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 4127 if (desc_extent_table->extent_page[i].page_idx != 0) { 4128 ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; 4129 ctx->num_extent_pages += 1; 4130 } 4131 } 4132 } 4133 } else { 4134 /* Error */ 4135 return -EINVAL; 4136 } 4137 /* Advance to the next descriptor */ 4138 cur_desc += sizeof(*desc) + desc->length; 4139 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4140 break; 4141 } 4142 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4143 } 4144 return 0; 4145 } 4146 4147 static bool 4148 bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) 4149 { 4150 uint32_t crc; 4151 struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4152 size_t desc_len; 4153 4154 crc = blob_md_page_calc_crc(page); 4155 if (crc != page->crc) { 4156 return false; 4157 } 4158 4159 /* Extent page should always be of sequence num 0. */ 4160 if (page->sequence_num != 0) { 4161 return false; 4162 } 4163 4164 /* Descriptor type must be EXTENT_PAGE. */ 4165 if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4166 return false; 4167 } 4168 4169 /* Descriptor length cannot exceed the page. */ 4170 desc_len = sizeof(*desc) + desc->length; 4171 if (desc_len > sizeof(page->descriptors)) { 4172 return false; 4173 } 4174 4175 /* It has to be the only descriptor in the page. 
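 * If anything follows, it must be zero-length padding; the check below rejects any non-empty second descriptor.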
*/ 4176 if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { 4177 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); 4178 if (desc->length != 0) { 4179 return false; 4180 } 4181 } 4182 4183 return true; 4184 } 4185 4186 static bool 4187 bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) 4188 { 4189 uint32_t crc; 4190 struct spdk_blob_md_page *page = ctx->page; 4191 4192 crc = blob_md_page_calc_crc(page); 4193 if (crc != page->crc) { 4194 return false; 4195 } 4196 4197 /* First page of a sequence should match the blobid. */ 4198 if (page->sequence_num == 0 && 4199 bs_page_to_blobid(ctx->cur_page) != page->id) { 4200 return false; 4201 } 4202 assert(bs_load_cur_extent_page_valid(page) == false); 4203 4204 return true; 4205 } 4206 4207 static void bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); 4208 4209 static void 4210 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4211 { 4212 struct spdk_bs_load_ctx *ctx = cb_arg; 4213 4214 if (bserrno != 0) { 4215 bs_load_ctx_fail(ctx, bserrno); 4216 return; 4217 } 4218 4219 bs_load_complete(ctx); 4220 } 4221 4222 static void 4223 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4224 { 4225 struct spdk_bs_load_ctx *ctx = cb_arg; 4226 4227 spdk_free(ctx->mask); 4228 ctx->mask = NULL; 4229 4230 if (bserrno != 0) { 4231 bs_load_ctx_fail(ctx, bserrno); 4232 return; 4233 } 4234 4235 bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); 4236 } 4237 4238 static void 4239 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4240 { 4241 struct spdk_bs_load_ctx *ctx = cb_arg; 4242 4243 spdk_free(ctx->mask); 4244 ctx->mask = NULL; 4245 4246 if (bserrno != 0) { 4247 bs_load_ctx_fail(ctx, bserrno); 4248 return; 4249 } 4250 4251 bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); 4252 } 4253 4254 static void 4255 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) 4256 { 4257 bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); 4258 } 4259 4260 static void 4261 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) 4262 { 4263 uint64_t num_md_clusters; 4264 uint64_t i; 4265 4266 ctx->in_page_chain = false; 4267 4268 do { 4269 ctx->page_index++; 4270 } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); 4271 4272 if (ctx->page_index < ctx->super->md_len) { 4273 ctx->cur_page = ctx->page_index; 4274 bs_load_replay_cur_md_page(ctx); 4275 } else { 4276 /* Claim all of the clusters used by the metadata */ 4277 num_md_clusters = spdk_divide_round_up( 4278 ctx->super->md_start + ctx->super->md_len, ctx->bs->pages_per_cluster); 4279 for (i = 0; i < num_md_clusters; i++) { 4280 spdk_bit_array_set(ctx->used_clusters, i); 4281 } 4282 ctx->bs->num_free_clusters -= num_md_clusters; 4283 spdk_free(ctx->page); 4284 bs_load_write_used_md(ctx); 4285 } 4286 } 4287 4288 static void 4289 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4290 { 4291 struct spdk_bs_load_ctx *ctx = cb_arg; 4292 uint32_t page_num; 4293 uint64_t i; 4294 4295 if (bserrno != 0) { 4296 spdk_free(ctx->extent_pages); 4297 bs_load_ctx_fail(ctx, bserrno); 4298 return; 4299 } 4300 4301 for (i = 0; i < ctx->num_extent_pages; i++) { 4302 /* Extent pages are only read when present within in chain md. 4303 * Integrity of md is not right if that page was not a valid extent page. 
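 * An invalid extent page therefore fails the whole load with -EILSEQ rather than being skipped.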
*/ 4304 if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { 4305 spdk_free(ctx->extent_pages); 4306 bs_load_ctx_fail(ctx, -EILSEQ); 4307 return; 4308 } 4309 4310 page_num = ctx->extent_page_num[i]; 4311 spdk_bit_array_set(ctx->bs->used_md_pages, page_num); 4312 if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { 4313 spdk_free(ctx->extent_pages); 4314 bs_load_ctx_fail(ctx, -EILSEQ); 4315 return; 4316 } 4317 } 4318 4319 spdk_free(ctx->extent_pages); 4320 free(ctx->extent_page_num); 4321 ctx->extent_page_num = NULL; 4322 ctx->num_extent_pages = 0; 4323 4324 bs_load_replay_md_chain_cpl(ctx); 4325 } 4326 4327 static void 4328 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) 4329 { 4330 spdk_bs_batch_t *batch; 4331 uint32_t page; 4332 uint64_t lba; 4333 uint64_t i; 4334 4335 ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0, 4336 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4337 if (!ctx->extent_pages) { 4338 bs_load_ctx_fail(ctx, -ENOMEM); 4339 return; 4340 } 4341 4342 batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); 4343 4344 for (i = 0; i < ctx->num_extent_pages; i++) { 4345 page = ctx->extent_page_num[i]; 4346 assert(page < ctx->super->md_len); 4347 lba = bs_md_page_to_lba(ctx->bs, page); 4348 bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, 4349 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); 4350 } 4351 4352 bs_batch_close(batch); 4353 } 4354 4355 static void 4356 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4357 { 4358 struct spdk_bs_load_ctx *ctx = cb_arg; 4359 uint32_t page_num; 4360 struct spdk_blob_md_page *page; 4361 4362 if (bserrno != 0) { 4363 bs_load_ctx_fail(ctx, bserrno); 4364 return; 4365 } 4366 4367 page_num = ctx->cur_page; 4368 page = ctx->page; 4369 if (bs_load_cur_md_page_valid(ctx) == true) { 4370 if (page->sequence_num == 0 || ctx->in_page_chain == true) { 4371 spdk_spin_lock(&ctx->bs->used_lock); 4372 bs_claim_md_page(ctx->bs, page_num); 4373 spdk_spin_unlock(&ctx->bs->used_lock); 4374 if (page->sequence_num == 0) { 4375 SPDK_NOTICELOG("Recover: blob %" PRIu32 "\n", page_num); 4376 spdk_bit_array_set(ctx->bs->used_blobids, page_num); 4377 } 4378 if (bs_load_replay_md_parse_page(ctx, page)) { 4379 bs_load_ctx_fail(ctx, -EILSEQ); 4380 return; 4381 } 4382 if (page->next != SPDK_INVALID_MD_PAGE) { 4383 ctx->in_page_chain = true; 4384 ctx->cur_page = page->next; 4385 bs_load_replay_cur_md_page(ctx); 4386 return; 4387 } 4388 if (ctx->num_extent_pages != 0) { 4389 bs_load_replay_extent_pages(ctx); 4390 return; 4391 } 4392 } 4393 } 4394 bs_load_replay_md_chain_cpl(ctx); 4395 } 4396 4397 static void 4398 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) 4399 { 4400 uint64_t lba; 4401 4402 assert(ctx->cur_page < ctx->super->md_len); 4403 lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); 4404 bs_sequence_read_dev(ctx->seq, ctx->page, lba, 4405 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4406 bs_load_replay_md_cpl, ctx); 4407 } 4408 4409 static void 4410 bs_load_replay_md(struct spdk_bs_load_ctx *ctx) 4411 { 4412 ctx->page_index = 0; 4413 ctx->cur_page = 0; 4414 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4415 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4416 if (!ctx->page) { 4417 bs_load_ctx_fail(ctx, -ENOMEM); 4418 return; 4419 } 4420 bs_load_replay_cur_md_page(ctx); 4421 } 4422 4423 static void 4424 bs_recover(struct spdk_bs_load_ctx *ctx) 4425 { 4426 int rc; 4427 4428 SPDK_NOTICELOG("Performing recovery on blobstore\n"); 4429 rc = 
spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); 4430 if (rc < 0) { 4431 bs_load_ctx_fail(ctx, -ENOMEM); 4432 return; 4433 } 4434 4435 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len); 4436 if (rc < 0) { 4437 bs_load_ctx_fail(ctx, -ENOMEM); 4438 return; 4439 } 4440 4441 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4442 if (rc < 0) { 4443 bs_load_ctx_fail(ctx, -ENOMEM); 4444 return; 4445 } 4446 4447 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); 4448 if (rc < 0) { 4449 bs_load_ctx_fail(ctx, -ENOMEM); 4450 return; 4451 } 4452 4453 ctx->bs->num_free_clusters = ctx->bs->total_clusters; 4454 bs_load_replay_md(ctx); 4455 } 4456 4457 static int 4458 bs_parse_super(struct spdk_bs_load_ctx *ctx) 4459 { 4460 int rc; 4461 4462 if (ctx->super->size == 0) { 4463 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 4464 } 4465 4466 if (ctx->super->io_unit_size == 0) { 4467 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 4468 } 4469 4470 ctx->bs->clean = 1; 4471 ctx->bs->cluster_sz = ctx->super->cluster_size; 4472 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 4473 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 4474 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 4475 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 4476 } 4477 ctx->bs->io_unit_size = ctx->super->io_unit_size; 4478 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4479 if (rc < 0) { 4480 return -ENOMEM; 4481 } 4482 ctx->bs->md_start = ctx->super->md_start; 4483 ctx->bs->md_len = ctx->super->md_len; 4484 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len); 4485 if (rc < 0) { 4486 return -ENOMEM; 4487 } 4488 4489 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 4490 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 4491 ctx->bs->super_blob = ctx->super->super_blob; 4492 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 4493 4494 return 0; 4495 } 4496 4497 static void 4498 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4499 { 4500 struct spdk_bs_load_ctx *ctx = cb_arg; 4501 uint32_t crc; 4502 int rc; 4503 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; 4504 4505 if (ctx->super->version > SPDK_BS_VERSION || 4506 ctx->super->version < SPDK_BS_INITIAL_VERSION) { 4507 bs_load_ctx_fail(ctx, -EILSEQ); 4508 return; 4509 } 4510 4511 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4512 sizeof(ctx->super->signature)) != 0) { 4513 bs_load_ctx_fail(ctx, -EILSEQ); 4514 return; 4515 } 4516 4517 crc = blob_md_page_calc_crc(ctx->super); 4518 if (crc != ctx->super->crc) { 4519 bs_load_ctx_fail(ctx, -EILSEQ); 4520 return; 4521 } 4522 4523 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4524 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n"); 4525 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4526 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n"); 4527 } else { 4528 SPDK_DEBUGLOG(blob, "Unexpected bstype\n"); 4529 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4530 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4531 bs_load_ctx_fail(ctx, -ENXIO); 4532 return; 4533 } 4534 4535 if (ctx->super->size > ctx->bs->dev->blockcnt * 
ctx->bs->dev->blocklen) { 4536 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n", 4537 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); 4538 bs_load_ctx_fail(ctx, -EILSEQ); 4539 return; 4540 } 4541 4542 rc = bs_parse_super(ctx); 4543 if (rc < 0) { 4544 bs_load_ctx_fail(ctx, rc); 4545 return; 4546 } 4547 4548 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0 || ctx->force_recover) { 4549 bs_recover(ctx); 4550 } else { 4551 bs_load_read_used_pages(ctx); 4552 } 4553 } 4554 4555 static inline int 4556 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst) 4557 { 4558 4559 if (!src->opts_size) { 4560 SPDK_ERRLOG("opts_size should not be zero value\n"); 4561 return -1; 4562 } 4563 4564 #define FIELD_OK(field) \ 4565 offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size 4566 4567 #define SET_FIELD(field) \ 4568 if (FIELD_OK(field)) { \ 4569 dst->field = src->field; \ 4570 } \ 4571 4572 SET_FIELD(cluster_sz); 4573 SET_FIELD(num_md_pages); 4574 SET_FIELD(max_md_ops); 4575 SET_FIELD(max_channel_ops); 4576 SET_FIELD(clear_method); 4577 4578 if (FIELD_OK(bstype)) { 4579 memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype)); 4580 } 4581 SET_FIELD(iter_cb_fn); 4582 SET_FIELD(iter_cb_arg); 4583 SET_FIELD(force_recover); 4584 4585 dst->opts_size = src->opts_size; 4586 4587 /* You should not remove this statement, but need to update the assert statement 4588 * if you add a new field, and also add a corresponding SET_FIELD statement */ 4589 SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 72, "Incorrect size"); 4590 4591 #undef FIELD_OK 4592 #undef SET_FIELD 4593 4594 return 0; 4595 } 4596 4597 void 4598 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4599 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4600 { 4601 struct spdk_blob_store *bs; 4602 struct spdk_bs_cpl cpl; 4603 struct spdk_bs_load_ctx *ctx; 4604 struct spdk_bs_opts opts = {}; 4605 int err; 4606 4607 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev); 4608 4609 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4610 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen); 4611 dev->destroy(dev); 4612 cb_fn(cb_arg, NULL, -EINVAL); 4613 return; 4614 } 4615 4616 spdk_bs_opts_init(&opts, sizeof(opts)); 4617 if (o) { 4618 if (bs_opts_copy(o, &opts)) { 4619 return; 4620 } 4621 } 4622 4623 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { 4624 dev->destroy(dev); 4625 cb_fn(cb_arg, NULL, -EINVAL); 4626 return; 4627 } 4628 4629 err = bs_alloc(dev, &opts, &bs, &ctx); 4630 if (err) { 4631 dev->destroy(dev); 4632 cb_fn(cb_arg, NULL, err); 4633 return; 4634 } 4635 4636 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4637 cpl.u.bs_handle.cb_fn = cb_fn; 4638 cpl.u.bs_handle.cb_arg = cb_arg; 4639 cpl.u.bs_handle.bs = bs; 4640 4641 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4642 if (!ctx->seq) { 4643 spdk_free(ctx->super); 4644 free(ctx); 4645 bs_free(bs); 4646 cb_fn(cb_arg, NULL, -ENOMEM); 4647 return; 4648 } 4649 4650 /* Read the super block */ 4651 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4652 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4653 bs_load_super_cpl, ctx); 4654 } 4655 4656 /* END spdk_bs_load */ 4657 4658 /* START spdk_bs_dump */ 4659 4660 static void 4661 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) 4662 { 4663 spdk_free(ctx->super); 4664 4665 /* 4666 * We need to defer calling bs_call_cpl() until after 4667 * dev destruction, so 
tuck these away for later use. 4668 */ 4669 ctx->bs->unload_err = bserrno; 4670 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4671 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4672 4673 bs_sequence_finish(seq, 0); 4674 bs_free(ctx->bs); 4675 free(ctx); 4676 } 4677 4678 static void 4679 bs_dump_print_xattr(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4680 { 4681 struct spdk_blob_md_descriptor_xattr *desc_xattr; 4682 uint32_t i; 4683 const char *type; 4684 4685 desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; 4686 4687 if (desc_xattr->length != 4688 sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + 4689 desc_xattr->name_length + desc_xattr->value_length) { 4690 } 4691 4692 memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); 4693 ctx->xattr_name[desc_xattr->name_length] = '\0'; 4694 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4695 type = "XATTR"; 4696 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4697 type = "XATTR_INTERNAL"; 4698 } else { 4699 assert(false); 4700 type = "XATTR_?"; 4701 } 4702 fprintf(ctx->fp, "%s: name = \"%s\"\n", type, ctx->xattr_name); 4703 fprintf(ctx->fp, " value = \""); 4704 ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, 4705 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 4706 desc_xattr->value_length); 4707 fprintf(ctx->fp, "\"\n"); 4708 for (i = 0; i < desc_xattr->value_length; i++) { 4709 if (i % 16 == 0) { 4710 fprintf(ctx->fp, " "); 4711 } 4712 fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); 4713 if ((i + 1) % 16 == 0) { 4714 fprintf(ctx->fp, "\n"); 4715 } 4716 } 4717 if (i % 16 != 0) { 4718 fprintf(ctx->fp, "\n"); 4719 } 4720 } 4721 4722 struct type_flag_desc { 4723 uint64_t mask; 4724 uint64_t val; 4725 const char *name; 4726 }; 4727 4728 static void 4729 bs_dump_print_type_bits(struct spdk_bs_load_ctx *ctx, uint64_t flags, 4730 struct type_flag_desc *desc, size_t numflags) 4731 { 4732 uint64_t covered = 0; 4733 size_t i; 4734 4735 for (i = 0; i < numflags; i++) { 4736 if ((desc[i].mask & flags) != desc[i].val) { 4737 continue; 4738 } 4739 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " %s", desc[i].val, desc[i].name); 4740 if (desc[i].mask != desc[i].val) { 4741 fprintf(ctx->fp, " (mask 0x%" PRIx64 " value 0x%" PRIx64 ")", 4742 desc[i].mask, desc[i].val); 4743 } 4744 fprintf(ctx->fp, "\n"); 4745 covered |= desc[i].mask; 4746 } 4747 if ((flags & ~covered) != 0) { 4748 fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " Unknown\n", flags & ~covered); 4749 } 4750 } 4751 4752 static void 4753 bs_dump_print_type_flags(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4754 { 4755 struct spdk_blob_md_descriptor_flags *type_desc; 4756 #define ADD_FLAG(f) { f, f, #f } 4757 #define ADD_MASK_VAL(m, v) { m, v, #v } 4758 static struct type_flag_desc invalid[] = { 4759 ADD_FLAG(SPDK_BLOB_THIN_PROV), 4760 ADD_FLAG(SPDK_BLOB_INTERNAL_XATTR), 4761 ADD_FLAG(SPDK_BLOB_EXTENT_TABLE), 4762 }; 4763 static struct type_flag_desc data_ro[] = { 4764 ADD_FLAG(SPDK_BLOB_READ_ONLY), 4765 }; 4766 static struct type_flag_desc md_ro[] = { 4767 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_DEFAULT), 4768 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_NONE), 4769 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_UNMAP), 4770 ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_WRITE_ZEROES), 4771 }; 4772 #undef ADD_FLAG 4773 #undef ADD_MASK_VAL 4774 4775 
type_desc = (struct spdk_blob_md_descriptor_flags *)desc; 4776 fprintf(ctx->fp, "Flags:\n"); 4777 fprintf(ctx->fp, "\tinvalid: 0x%016" PRIx64 "\n", type_desc->invalid_flags); 4778 bs_dump_print_type_bits(ctx, type_desc->invalid_flags, invalid, 4779 SPDK_COUNTOF(invalid)); 4780 fprintf(ctx->fp, "\tdata_ro: 0x%016" PRIx64 "\n", type_desc->data_ro_flags); 4781 bs_dump_print_type_bits(ctx, type_desc->data_ro_flags, data_ro, 4782 SPDK_COUNTOF(data_ro)); 4783 fprintf(ctx->fp, "\t md_ro: 0x%016" PRIx64 "\n", type_desc->md_ro_flags); 4784 bs_dump_print_type_bits(ctx, type_desc->md_ro_flags, md_ro, 4785 SPDK_COUNTOF(md_ro)); 4786 } 4787 4788 static void 4789 bs_dump_print_extent_table(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc) 4790 { 4791 struct spdk_blob_md_descriptor_extent_table *et_desc; 4792 uint64_t num_extent_pages; 4793 uint32_t et_idx; 4794 4795 et_desc = (struct spdk_blob_md_descriptor_extent_table *)desc; 4796 num_extent_pages = (et_desc->length - sizeof(et_desc->num_clusters)) / 4797 sizeof(et_desc->extent_page[0]); 4798 4799 fprintf(ctx->fp, "Extent table:\n"); 4800 for (et_idx = 0; et_idx < num_extent_pages; et_idx++) { 4801 if (et_desc->extent_page[et_idx].page_idx == 0) { 4802 /* Zeroes represent unallocated extent pages. */ 4803 continue; 4804 } 4805 fprintf(ctx->fp, "\tExtent page: %5" PRIu32 " length %3" PRIu32 4806 " at LBA %" PRIu64 "\n", et_desc->extent_page[et_idx].page_idx, 4807 et_desc->extent_page[et_idx].num_pages, 4808 bs_md_page_to_lba(ctx->bs, et_desc->extent_page[et_idx].page_idx)); 4809 } 4810 } 4811 4812 static void 4813 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx) 4814 { 4815 uint32_t page_idx = ctx->cur_page; 4816 struct spdk_blob_md_page *page = ctx->page; 4817 struct spdk_blob_md_descriptor *desc; 4818 size_t cur_desc = 0; 4819 uint32_t crc; 4820 4821 fprintf(ctx->fp, "=========\n"); 4822 fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); 4823 fprintf(ctx->fp, "Start LBA: %" PRIu64 "\n", bs_md_page_to_lba(ctx->bs, page_idx)); 4824 fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); 4825 fprintf(ctx->fp, "Sequence: %" PRIu32 "\n", page->sequence_num); 4826 if (page->next == SPDK_INVALID_MD_PAGE) { 4827 fprintf(ctx->fp, "Next: None\n"); 4828 } else { 4829 fprintf(ctx->fp, "Next: %" PRIu32 "\n", page->next); 4830 } 4831 fprintf(ctx->fp, "In used bit array%s:", ctx->super->clean ? "" : " (not clean: dubious)"); 4832 if (spdk_bit_array_get(ctx->bs->used_md_pages, page_idx)) { 4833 fprintf(ctx->fp, " md"); 4834 } 4835 if (spdk_bit_array_get(ctx->bs->used_blobids, page_idx)) { 4836 fprintf(ctx->fp, " blob"); 4837 } 4838 fprintf(ctx->fp, "\n"); 4839 4840 crc = blob_md_page_calc_crc(page); 4841 fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? 
"OK" : "Mismatch"); 4842 4843 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4844 while (cur_desc < sizeof(page->descriptors)) { 4845 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4846 if (desc->length == 0) { 4847 /* If padding and length are 0, this terminates the page */ 4848 break; 4849 } 4850 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4851 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4852 unsigned int i; 4853 4854 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4855 4856 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4857 if (desc_extent_rle->extents[i].cluster_idx != 0) { 4858 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4859 desc_extent_rle->extents[i].cluster_idx); 4860 } else { 4861 fprintf(ctx->fp, "Unallocated Extent - "); 4862 } 4863 fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); 4864 fprintf(ctx->fp, "\n"); 4865 } 4866 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4867 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4868 unsigned int i; 4869 4870 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4871 4872 for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { 4873 if (desc_extent->cluster_idx[i] != 0) { 4874 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4875 desc_extent->cluster_idx[i]); 4876 } else { 4877 fprintf(ctx->fp, "Unallocated Extent"); 4878 } 4879 fprintf(ctx->fp, "\n"); 4880 } 4881 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4882 bs_dump_print_xattr(ctx, desc); 4883 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4884 bs_dump_print_xattr(ctx, desc); 4885 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4886 bs_dump_print_type_flags(ctx, desc); 4887 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 4888 bs_dump_print_extent_table(ctx, desc); 4889 } else { 4890 /* Error */ 4891 fprintf(ctx->fp, "Unknown descriptor type %" PRIu8 "\n", desc->type); 4892 } 4893 /* Advance to the next descriptor */ 4894 cur_desc += sizeof(*desc) + desc->length; 4895 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4896 break; 4897 } 4898 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4899 } 4900 } 4901 4902 static void 4903 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4904 { 4905 struct spdk_bs_load_ctx *ctx = cb_arg; 4906 4907 if (bserrno != 0) { 4908 bs_dump_finish(seq, ctx, bserrno); 4909 return; 4910 } 4911 4912 if (ctx->page->id != 0) { 4913 bs_dump_print_md_page(ctx); 4914 } 4915 4916 ctx->cur_page++; 4917 4918 if (ctx->cur_page < ctx->super->md_len) { 4919 bs_dump_read_md_page(seq, ctx); 4920 } else { 4921 spdk_free(ctx->page); 4922 bs_dump_finish(seq, ctx, 0); 4923 } 4924 } 4925 4926 static void 4927 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) 4928 { 4929 struct spdk_bs_load_ctx *ctx = cb_arg; 4930 uint64_t lba; 4931 4932 assert(ctx->cur_page < ctx->super->md_len); 4933 lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); 4934 bs_sequence_read_dev(seq, ctx->page, lba, 4935 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4936 bs_dump_read_md_page_cpl, ctx); 4937 } 4938 4939 static void 4940 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4941 { 4942 struct spdk_bs_load_ctx *ctx = cb_arg; 4943 int rc; 4944 4945 fprintf(ctx->fp, "Signature: \"%.8s\" ", 
ctx->super->signature); 4946 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4947 sizeof(ctx->super->signature)) != 0) { 4948 fprintf(ctx->fp, "(Mismatch)\n"); 4949 bs_dump_finish(seq, ctx, bserrno); 4950 return; 4951 } else { 4952 fprintf(ctx->fp, "(OK)\n"); 4953 } 4954 fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); 4955 fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, 4956 (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch"); 4957 fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); 4958 fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); 4959 fprintf(ctx->fp, "Super Blob ID: "); 4960 if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { 4961 fprintf(ctx->fp, "(None)\n"); 4962 } else { 4963 fprintf(ctx->fp, "0x%" PRIx64 "\n", ctx->super->super_blob); 4964 } 4965 fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); 4966 fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); 4967 fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); 4968 fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); 4969 fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); 4970 fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); 4971 fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); 4972 fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); 4973 fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); 4974 4975 ctx->cur_page = 0; 4976 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4977 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4978 if (!ctx->page) { 4979 bs_dump_finish(seq, ctx, -ENOMEM); 4980 return; 4981 } 4982 4983 rc = bs_parse_super(ctx); 4984 if (rc < 0) { 4985 bs_load_ctx_fail(ctx, rc); 4986 return; 4987 } 4988 4989 bs_load_read_used_pages(ctx); 4990 } 4991 4992 void 4993 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, 4994 spdk_bs_op_complete cb_fn, void *cb_arg) 4995 { 4996 struct spdk_blob_store *bs; 4997 struct spdk_bs_cpl cpl; 4998 struct spdk_bs_load_ctx *ctx; 4999 struct spdk_bs_opts opts = {}; 5000 int err; 5001 5002 SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev); 5003 5004 spdk_bs_opts_init(&opts, sizeof(opts)); 5005 5006 err = bs_alloc(dev, &opts, &bs, &ctx); 5007 if (err) { 5008 dev->destroy(dev); 5009 cb_fn(cb_arg, err); 5010 return; 5011 } 5012 5013 ctx->dumping = true; 5014 ctx->fp = fp; 5015 ctx->print_xattr_fn = print_xattr_fn; 5016 5017 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5018 cpl.u.bs_basic.cb_fn = cb_fn; 5019 cpl.u.bs_basic.cb_arg = cb_arg; 5020 5021 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5022 if (!ctx->seq) { 5023 spdk_free(ctx->super); 5024 free(ctx); 5025 bs_free(bs); 5026 cb_fn(cb_arg, -ENOMEM); 5027 return; 5028 } 5029 5030 /* Read the super block */ 5031 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5032 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5033 bs_dump_super_cpl, ctx); 5034 } 5035 5036 /* END spdk_bs_dump */ 5037 5038 /* START spdk_bs_init */ 5039 5040 static void 5041 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5042 { 5043 struct spdk_bs_load_ctx *ctx = cb_arg; 5044 5045 ctx->bs->used_clusters = 
spdk_bit_pool_create_from_array(ctx->used_clusters); 5046 spdk_free(ctx->super); 5047 free(ctx); 5048 5049 bs_sequence_finish(seq, bserrno); 5050 } 5051 5052 static void 5053 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5054 { 5055 struct spdk_bs_load_ctx *ctx = cb_arg; 5056 5057 /* Write super block */ 5058 bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), 5059 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), 5060 bs_init_persist_super_cpl, ctx); 5061 } 5062 5063 void 5064 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 5065 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 5066 { 5067 struct spdk_bs_load_ctx *ctx; 5068 struct spdk_blob_store *bs; 5069 struct spdk_bs_cpl cpl; 5070 spdk_bs_sequence_t *seq; 5071 spdk_bs_batch_t *batch; 5072 uint64_t num_md_lba; 5073 uint64_t num_md_pages; 5074 uint64_t num_md_clusters; 5075 uint64_t max_used_cluster_mask_len; 5076 uint32_t i; 5077 struct spdk_bs_opts opts = {}; 5078 int rc; 5079 uint64_t lba, lba_count; 5080 5081 SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev); 5082 5083 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 5084 SPDK_ERRLOG("unsupported dev block length of %d\n", 5085 dev->blocklen); 5086 dev->destroy(dev); 5087 cb_fn(cb_arg, NULL, -EINVAL); 5088 return; 5089 } 5090 5091 spdk_bs_opts_init(&opts, sizeof(opts)); 5092 if (o) { 5093 if (bs_opts_copy(o, &opts)) { 5094 return; 5095 } 5096 } 5097 5098 if (bs_opts_verify(&opts) != 0) { 5099 dev->destroy(dev); 5100 cb_fn(cb_arg, NULL, -EINVAL); 5101 return; 5102 } 5103 5104 rc = bs_alloc(dev, &opts, &bs, &ctx); 5105 if (rc) { 5106 dev->destroy(dev); 5107 cb_fn(cb_arg, NULL, rc); 5108 return; 5109 } 5110 5111 if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { 5112 /* By default, allocate 1 page per cluster. 5113 * Technically, this over-allocates metadata 5114 * because more metadata will reduce the number 5115 * of usable clusters. This can be addressed with 5116 * more complex math in the future. 5117 */ 5118 bs->md_len = bs->total_clusters; 5119 } else { 5120 bs->md_len = opts.num_md_pages; 5121 } 5122 rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); 5123 if (rc < 0) { 5124 spdk_free(ctx->super); 5125 free(ctx); 5126 bs_free(bs); 5127 cb_fn(cb_arg, NULL, -ENOMEM); 5128 return; 5129 } 5130 5131 rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); 5132 if (rc < 0) { 5133 spdk_free(ctx->super); 5134 free(ctx); 5135 bs_free(bs); 5136 cb_fn(cb_arg, NULL, -ENOMEM); 5137 return; 5138 } 5139 5140 rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); 5141 if (rc < 0) { 5142 spdk_free(ctx->super); 5143 free(ctx); 5144 bs_free(bs); 5145 cb_fn(cb_arg, NULL, -ENOMEM); 5146 return; 5147 } 5148 5149 memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 5150 sizeof(ctx->super->signature)); 5151 ctx->super->version = SPDK_BS_VERSION; 5152 ctx->super->length = sizeof(*ctx->super); 5153 ctx->super->super_blob = bs->super_blob; 5154 ctx->super->clean = 0; 5155 ctx->super->cluster_size = bs->cluster_sz; 5156 ctx->super->io_unit_size = bs->io_unit_size; 5157 memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); 5158 5159 /* Calculate how many pages the metadata consumes at the front 5160 * of the disk. 5161 */ 5162 5163 /* The super block uses 1 page */ 5164 num_md_pages = 1; 5165 5166 /* The used_md_pages mask requires 1 bit per metadata page, rounded 5167 * up to the nearest page, plus a header. 
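 * The header is a struct spdk_bs_md_mask; the used_clusters and used_blobids masks below are laid out the same way.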
5168 */ 5169 ctx->super->used_page_mask_start = num_md_pages; 5170 ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5171 spdk_divide_round_up(bs->md_len, 8), 5172 SPDK_BS_PAGE_SIZE); 5173 num_md_pages += ctx->super->used_page_mask_len; 5174 5175 /* The used_clusters mask requires 1 bit per cluster, rounded 5176 * up to the nearest page, plus a header. 5177 */ 5178 ctx->super->used_cluster_mask_start = num_md_pages; 5179 ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5180 spdk_divide_round_up(bs->total_clusters, 8), 5181 SPDK_BS_PAGE_SIZE); 5182 /* The blobstore might be extended, then the used_cluster bitmap will need more space. 5183 * Here we calculate the max clusters we can support according to the 5184 * num_md_pages (bs->md_len). 5185 */ 5186 max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5187 spdk_divide_round_up(bs->md_len, 8), 5188 SPDK_BS_PAGE_SIZE); 5189 max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len, 5190 ctx->super->used_cluster_mask_len); 5191 num_md_pages += max_used_cluster_mask_len; 5192 5193 /* The used_blobids mask requires 1 bit per metadata page, rounded 5194 * up to the nearest page, plus a header. 5195 */ 5196 ctx->super->used_blobid_mask_start = num_md_pages; 5197 ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 5198 spdk_divide_round_up(bs->md_len, 8), 5199 SPDK_BS_PAGE_SIZE); 5200 num_md_pages += ctx->super->used_blobid_mask_len; 5201 5202 /* The metadata region size was chosen above */ 5203 ctx->super->md_start = bs->md_start = num_md_pages; 5204 ctx->super->md_len = bs->md_len; 5205 num_md_pages += bs->md_len; 5206 5207 num_md_lba = bs_page_to_lba(bs, num_md_pages); 5208 5209 ctx->super->size = dev->blockcnt * dev->blocklen; 5210 5211 ctx->super->crc = blob_md_page_calc_crc(ctx->super); 5212 5213 num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); 5214 if (num_md_clusters > bs->total_clusters) { 5215 SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " 5216 "please decrease number of pages reserved for metadata " 5217 "or increase cluster size.\n"); 5218 spdk_free(ctx->super); 5219 spdk_bit_array_free(&ctx->used_clusters); 5220 free(ctx); 5221 bs_free(bs); 5222 cb_fn(cb_arg, NULL, -ENOMEM); 5223 return; 5224 } 5225 /* Claim all of the clusters used by the metadata */ 5226 for (i = 0; i < num_md_clusters; i++) { 5227 spdk_bit_array_set(ctx->used_clusters, i); 5228 } 5229 5230 bs->num_free_clusters -= num_md_clusters; 5231 bs->total_data_clusters = bs->num_free_clusters; 5232 5233 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 5234 cpl.u.bs_handle.cb_fn = cb_fn; 5235 cpl.u.bs_handle.cb_arg = cb_arg; 5236 cpl.u.bs_handle.bs = bs; 5237 5238 seq = bs_sequence_start(bs->md_channel, &cpl); 5239 if (!seq) { 5240 spdk_free(ctx->super); 5241 free(ctx); 5242 bs_free(bs); 5243 cb_fn(cb_arg, NULL, -ENOMEM); 5244 return; 5245 } 5246 5247 batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); 5248 5249 /* Clear metadata space */ 5250 bs_batch_write_zeroes_dev(batch, 0, num_md_lba); 5251 5252 lba = num_md_lba; 5253 lba_count = ctx->bs->dev->blockcnt - lba; 5254 switch (opts.clear_method) { 5255 case BS_CLEAR_WITH_UNMAP: 5256 /* Trim data clusters */ 5257 bs_batch_unmap_dev(batch, lba, lba_count); 5258 break; 5259 case BS_CLEAR_WITH_WRITE_ZEROES: 5260 /* Write_zeroes to data clusters */ 5261 bs_batch_write_zeroes_dev(batch, lba, lba_count); 5262 break; 5263 case 
BS_CLEAR_WITH_NONE: 5264 default: 5265 break; 5266 } 5267 5268 bs_batch_close(batch); 5269 } 5270 5271 /* END spdk_bs_init */ 5272 5273 /* START spdk_bs_destroy */ 5274 5275 static void 5276 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5277 { 5278 struct spdk_bs_load_ctx *ctx = cb_arg; 5279 struct spdk_blob_store *bs = ctx->bs; 5280 5281 /* 5282 * We need to defer calling bs_call_cpl() until after 5283 * dev destruction, so tuck these away for later use. 5284 */ 5285 bs->unload_err = bserrno; 5286 memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5287 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5288 5289 bs_sequence_finish(seq, bserrno); 5290 5291 bs_free(bs); 5292 free(ctx); 5293 } 5294 5295 void 5296 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, 5297 void *cb_arg) 5298 { 5299 struct spdk_bs_cpl cpl; 5300 spdk_bs_sequence_t *seq; 5301 struct spdk_bs_load_ctx *ctx; 5302 5303 SPDK_DEBUGLOG(blob, "Destroying blobstore\n"); 5304 5305 if (!RB_EMPTY(&bs->open_blobs)) { 5306 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5307 cb_fn(cb_arg, -EBUSY); 5308 return; 5309 } 5310 5311 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5312 cpl.u.bs_basic.cb_fn = cb_fn; 5313 cpl.u.bs_basic.cb_arg = cb_arg; 5314 5315 ctx = calloc(1, sizeof(*ctx)); 5316 if (!ctx) { 5317 cb_fn(cb_arg, -ENOMEM); 5318 return; 5319 } 5320 5321 ctx->bs = bs; 5322 5323 seq = bs_sequence_start(bs->md_channel, &cpl); 5324 if (!seq) { 5325 free(ctx); 5326 cb_fn(cb_arg, -ENOMEM); 5327 return; 5328 } 5329 5330 /* Write zeroes to the super block */ 5331 bs_sequence_write_zeroes_dev(seq, 5332 bs_page_to_lba(bs, 0), 5333 bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), 5334 bs_destroy_trim_cpl, ctx); 5335 } 5336 5337 /* END spdk_bs_destroy */ 5338 5339 /* START spdk_bs_unload */ 5340 5341 static void 5342 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) 5343 { 5344 spdk_bs_sequence_t *seq = ctx->seq; 5345 5346 spdk_free(ctx->super); 5347 5348 /* 5349 * We need to defer calling bs_call_cpl() until after 5350 * dev destruction, so tuck these away for later use. 
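 * Overwriting seq->cpl.type with SPDK_BS_CPL_TYPE_NONE keeps bs_sequence_finish() from running the user completion here; the stashed unload_cpl is invoked once the device has been destroyed.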
5351 */ 5352 ctx->bs->unload_err = bserrno; 5353 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5354 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5355 5356 bs_sequence_finish(seq, bserrno); 5357 5358 bs_free(ctx->bs); 5359 free(ctx); 5360 } 5361 5362 static void 5363 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5364 { 5365 struct spdk_bs_load_ctx *ctx = cb_arg; 5366 5367 bs_unload_finish(ctx, bserrno); 5368 } 5369 5370 static void 5371 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5372 { 5373 struct spdk_bs_load_ctx *ctx = cb_arg; 5374 5375 spdk_free(ctx->mask); 5376 5377 if (bserrno != 0) { 5378 bs_unload_finish(ctx, bserrno); 5379 return; 5380 } 5381 5382 ctx->super->clean = 1; 5383 5384 bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); 5385 } 5386 5387 static void 5388 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5389 { 5390 struct spdk_bs_load_ctx *ctx = cb_arg; 5391 5392 spdk_free(ctx->mask); 5393 ctx->mask = NULL; 5394 5395 if (bserrno != 0) { 5396 bs_unload_finish(ctx, bserrno); 5397 return; 5398 } 5399 5400 bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); 5401 } 5402 5403 static void 5404 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5405 { 5406 struct spdk_bs_load_ctx *ctx = cb_arg; 5407 5408 spdk_free(ctx->mask); 5409 ctx->mask = NULL; 5410 5411 if (bserrno != 0) { 5412 bs_unload_finish(ctx, bserrno); 5413 return; 5414 } 5415 5416 bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); 5417 } 5418 5419 static void 5420 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5421 { 5422 struct spdk_bs_load_ctx *ctx = cb_arg; 5423 5424 if (bserrno != 0) { 5425 bs_unload_finish(ctx, bserrno); 5426 return; 5427 } 5428 5429 bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl); 5430 } 5431 5432 void 5433 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) 5434 { 5435 struct spdk_bs_cpl cpl; 5436 struct spdk_bs_load_ctx *ctx; 5437 5438 SPDK_DEBUGLOG(blob, "Syncing blobstore\n"); 5439 5440 if (!RB_EMPTY(&bs->open_blobs)) { 5441 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5442 cb_fn(cb_arg, -EBUSY); 5443 return; 5444 } 5445 5446 ctx = calloc(1, sizeof(*ctx)); 5447 if (!ctx) { 5448 cb_fn(cb_arg, -ENOMEM); 5449 return; 5450 } 5451 5452 ctx->bs = bs; 5453 5454 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5455 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5456 if (!ctx->super) { 5457 free(ctx); 5458 cb_fn(cb_arg, -ENOMEM); 5459 return; 5460 } 5461 5462 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5463 cpl.u.bs_basic.cb_fn = cb_fn; 5464 cpl.u.bs_basic.cb_arg = cb_arg; 5465 5466 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5467 if (!ctx->seq) { 5468 spdk_free(ctx->super); 5469 free(ctx); 5470 cb_fn(cb_arg, -ENOMEM); 5471 return; 5472 } 5473 5474 /* Read super block */ 5475 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5476 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5477 bs_unload_read_super_cpl, ctx); 5478 } 5479 5480 /* END spdk_bs_unload */ 5481 5482 /* START spdk_bs_set_super */ 5483 5484 struct spdk_bs_set_super_ctx { 5485 struct spdk_blob_store *bs; 5486 struct spdk_bs_super_block *super; 5487 }; 5488 5489 static void 5490 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5491 { 5492 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5493 5494 if (bserrno != 
0) { 5495 SPDK_ERRLOG("Unable to write to super block of blobstore\n"); 5496 } 5497 5498 spdk_free(ctx->super); 5499 5500 bs_sequence_finish(seq, bserrno); 5501 5502 free(ctx); 5503 } 5504 5505 static void 5506 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5507 { 5508 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5509 5510 if (bserrno != 0) { 5511 SPDK_ERRLOG("Unable to read super block of blobstore\n"); 5512 spdk_free(ctx->super); 5513 bs_sequence_finish(seq, bserrno); 5514 free(ctx); 5515 return; 5516 } 5517 5518 bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); 5519 } 5520 5521 void 5522 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, 5523 spdk_bs_op_complete cb_fn, void *cb_arg) 5524 { 5525 struct spdk_bs_cpl cpl; 5526 spdk_bs_sequence_t *seq; 5527 struct spdk_bs_set_super_ctx *ctx; 5528 5529 SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n"); 5530 5531 ctx = calloc(1, sizeof(*ctx)); 5532 if (!ctx) { 5533 cb_fn(cb_arg, -ENOMEM); 5534 return; 5535 } 5536 5537 ctx->bs = bs; 5538 5539 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5540 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5541 if (!ctx->super) { 5542 free(ctx); 5543 cb_fn(cb_arg, -ENOMEM); 5544 return; 5545 } 5546 5547 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5548 cpl.u.bs_basic.cb_fn = cb_fn; 5549 cpl.u.bs_basic.cb_arg = cb_arg; 5550 5551 seq = bs_sequence_start(bs->md_channel, &cpl); 5552 if (!seq) { 5553 spdk_free(ctx->super); 5554 free(ctx); 5555 cb_fn(cb_arg, -ENOMEM); 5556 return; 5557 } 5558 5559 bs->super_blob = blobid; 5560 5561 /* Read super block */ 5562 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 5563 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5564 bs_set_super_read_cpl, ctx); 5565 } 5566 5567 /* END spdk_bs_set_super */ 5568 5569 void 5570 spdk_bs_get_super(struct spdk_blob_store *bs, 5571 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5572 { 5573 if (bs->super_blob == SPDK_BLOBID_INVALID) { 5574 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); 5575 } else { 5576 cb_fn(cb_arg, bs->super_blob, 0); 5577 } 5578 } 5579 5580 uint64_t 5581 spdk_bs_get_cluster_size(struct spdk_blob_store *bs) 5582 { 5583 return bs->cluster_sz; 5584 } 5585 5586 uint64_t 5587 spdk_bs_get_page_size(struct spdk_blob_store *bs) 5588 { 5589 return SPDK_BS_PAGE_SIZE; 5590 } 5591 5592 uint64_t 5593 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) 5594 { 5595 return bs->io_unit_size; 5596 } 5597 5598 uint64_t 5599 spdk_bs_free_cluster_count(struct spdk_blob_store *bs) 5600 { 5601 return bs->num_free_clusters; 5602 } 5603 5604 uint64_t 5605 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) 5606 { 5607 return bs->total_data_clusters; 5608 } 5609 5610 static int 5611 bs_register_md_thread(struct spdk_blob_store *bs) 5612 { 5613 bs->md_channel = spdk_get_io_channel(bs); 5614 if (!bs->md_channel) { 5615 SPDK_ERRLOG("Failed to get IO channel.\n"); 5616 return -1; 5617 } 5618 5619 return 0; 5620 } 5621 5622 static int 5623 bs_unregister_md_thread(struct spdk_blob_store *bs) 5624 { 5625 spdk_put_io_channel(bs->md_channel); 5626 5627 return 0; 5628 } 5629 5630 spdk_blob_id 5631 spdk_blob_get_id(struct spdk_blob *blob) 5632 { 5633 assert(blob != NULL); 5634 5635 return blob->id; 5636 } 5637 5638 uint64_t 5639 spdk_blob_get_num_pages(struct spdk_blob *blob) 5640 { 5641 assert(blob != NULL); 5642 5643 return bs_cluster_to_page(blob->bs, blob->active.num_clusters); 5644 } 5645 5646 uint64_t 5647 spdk_blob_get_num_io_units(struct spdk_blob *blob) 5648 { 
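/* io unit count = number of blob pages multiplied by io units per page */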
5649 assert(blob != NULL); 5650 5651 return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); 5652 } 5653 5654 uint64_t 5655 spdk_blob_get_num_clusters(struct spdk_blob *blob) 5656 { 5657 assert(blob != NULL); 5658 5659 return blob->active.num_clusters; 5660 } 5661 5662 static uint64_t 5663 blob_find_io_unit(struct spdk_blob *blob, uint64_t offset, bool is_allocated) 5664 { 5665 uint64_t blob_io_unit_num = spdk_blob_get_num_io_units(blob); 5666 5667 while (offset < blob_io_unit_num) { 5668 if (bs_io_unit_is_allocated(blob, offset) == is_allocated) { 5669 return offset; 5670 } 5671 5672 offset += bs_num_io_units_to_cluster_boundary(blob, offset); 5673 } 5674 5675 return UINT64_MAX; 5676 } 5677 5678 uint64_t 5679 spdk_blob_get_next_allocated_io_unit(struct spdk_blob *blob, uint64_t offset) 5680 { 5681 return blob_find_io_unit(blob, offset, true); 5682 } 5683 5684 uint64_t 5685 spdk_blob_get_next_unallocated_io_unit(struct spdk_blob *blob, uint64_t offset) 5686 { 5687 return blob_find_io_unit(blob, offset, false); 5688 } 5689 5690 /* START spdk_bs_create_blob */ 5691 5692 static void 5693 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5694 { 5695 struct spdk_blob *blob = cb_arg; 5696 uint32_t page_idx = bs_blobid_to_page(blob->id); 5697 5698 if (bserrno != 0) { 5699 spdk_spin_lock(&blob->bs->used_lock); 5700 spdk_bit_array_clear(blob->bs->used_blobids, page_idx); 5701 bs_release_md_page(blob->bs, page_idx); 5702 spdk_spin_unlock(&blob->bs->used_lock); 5703 } 5704 5705 blob_free(blob); 5706 5707 bs_sequence_finish(seq, bserrno); 5708 } 5709 5710 static int 5711 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, 5712 bool internal) 5713 { 5714 uint64_t i; 5715 size_t value_len = 0; 5716 int rc; 5717 const void *value = NULL; 5718 if (xattrs->count > 0 && xattrs->get_value == NULL) { 5719 return -EINVAL; 5720 } 5721 for (i = 0; i < xattrs->count; i++) { 5722 xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); 5723 if (value == NULL || value_len == 0) { 5724 return -EINVAL; 5725 } 5726 rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); 5727 if (rc < 0) { 5728 return rc; 5729 } 5730 } 5731 return 0; 5732 } 5733 5734 static void 5735 blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst) 5736 { 5737 #define FIELD_OK(field) \ 5738 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 5739 5740 #define SET_FIELD(field) \ 5741 if (FIELD_OK(field)) { \ 5742 dst->field = src->field; \ 5743 } \ 5744 5745 SET_FIELD(num_clusters); 5746 SET_FIELD(thin_provision); 5747 SET_FIELD(clear_method); 5748 5749 if (FIELD_OK(xattrs)) { 5750 memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs)); 5751 } 5752 5753 SET_FIELD(use_extent_table); 5754 5755 dst->opts_size = src->opts_size; 5756 5757 /* You should not remove this statement, but need to update the assert statement 5758 * if you add a new field, and also add a corresponding SET_FIELD statement */ 5759 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 64, "Incorrect size"); 5760 5761 #undef FIELD_OK 5762 #undef SET_FIELD 5763 } 5764 5765 static void 5766 bs_create_blob(struct spdk_blob_store *bs, 5767 const struct spdk_blob_opts *opts, 5768 const struct spdk_blob_xattr_opts *internal_xattrs, 5769 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5770 { 5771 struct spdk_blob *blob; 5772 uint32_t page_idx; 5773 struct spdk_bs_cpl cpl; 5774 struct spdk_blob_opts opts_local; 5775 struct spdk_blob_xattr_opts 
internal_xattrs_default; 5776 spdk_bs_sequence_t *seq; 5777 spdk_blob_id id; 5778 int rc; 5779 5780 assert(spdk_get_thread() == bs->md_thread); 5781 5782 spdk_spin_lock(&bs->used_lock); 5783 page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); 5784 if (page_idx == UINT32_MAX) { 5785 spdk_spin_unlock(&bs->used_lock); 5786 cb_fn(cb_arg, 0, -ENOMEM); 5787 return; 5788 } 5789 spdk_bit_array_set(bs->used_blobids, page_idx); 5790 bs_claim_md_page(bs, page_idx); 5791 spdk_spin_unlock(&bs->used_lock); 5792 5793 id = bs_page_to_blobid(page_idx); 5794 5795 SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx); 5796 5797 blob = blob_alloc(bs, id); 5798 if (!blob) { 5799 rc = -ENOMEM; 5800 goto error; 5801 } 5802 5803 spdk_blob_opts_init(&opts_local, sizeof(opts_local)); 5804 if (opts) { 5805 blob_opts_copy(opts, &opts_local); 5806 } 5807 5808 blob->use_extent_table = opts_local.use_extent_table; 5809 if (blob->use_extent_table) { 5810 blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; 5811 } 5812 5813 if (!internal_xattrs) { 5814 blob_xattrs_init(&internal_xattrs_default); 5815 internal_xattrs = &internal_xattrs_default; 5816 } 5817 5818 rc = blob_set_xattrs(blob, &opts_local.xattrs, false); 5819 if (rc < 0) { 5820 goto error; 5821 } 5822 5823 rc = blob_set_xattrs(blob, internal_xattrs, true); 5824 if (rc < 0) { 5825 goto error; 5826 } 5827 5828 if (opts_local.thin_provision) { 5829 blob_set_thin_provision(blob); 5830 } 5831 5832 blob_set_clear_method(blob, opts_local.clear_method); 5833 5834 rc = blob_resize(blob, opts_local.num_clusters); 5835 if (rc < 0) { 5836 goto error; 5837 } 5838 cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5839 cpl.u.blobid.cb_fn = cb_fn; 5840 cpl.u.blobid.cb_arg = cb_arg; 5841 cpl.u.blobid.blobid = blob->id; 5842 5843 seq = bs_sequence_start(bs->md_channel, &cpl); 5844 if (!seq) { 5845 rc = -ENOMEM; 5846 goto error; 5847 } 5848 5849 blob_persist(seq, blob, bs_create_blob_cpl, blob); 5850 return; 5851 5852 error: 5853 if (blob != NULL) { 5854 blob_free(blob); 5855 } 5856 spdk_spin_lock(&bs->used_lock); 5857 spdk_bit_array_clear(bs->used_blobids, page_idx); 5858 bs_release_md_page(bs, page_idx); 5859 spdk_spin_unlock(&bs->used_lock); 5860 cb_fn(cb_arg, 0, rc); 5861 } 5862 5863 void 5864 spdk_bs_create_blob(struct spdk_blob_store *bs, 5865 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5866 { 5867 bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); 5868 } 5869 5870 void 5871 spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, 5872 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5873 { 5874 bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); 5875 } 5876 5877 /* END spdk_bs_create_blob */ 5878 5879 /* START blob_cleanup */ 5880 5881 struct spdk_clone_snapshot_ctx { 5882 struct spdk_bs_cpl cpl; 5883 int bserrno; 5884 bool frozen; 5885 5886 struct spdk_io_channel *channel; 5887 5888 /* Current cluster for inflate operation */ 5889 uint64_t cluster; 5890 5891 /* For inflation force allocation of all unallocated clusters and remove 5892 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ 5893 bool allocate_all; 5894 5895 struct { 5896 spdk_blob_id id; 5897 struct spdk_blob *blob; 5898 bool md_ro; 5899 } original; 5900 struct { 5901 spdk_blob_id id; 5902 struct spdk_blob *blob; 5903 } new; 5904 5905 /* xattrs specified for snapshot/clones only. They have no impact on 5906 * the original blobs xattrs. 
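 * They are applied only to the newly created snapshot or clone blob.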
*/ 5907 const struct spdk_blob_xattr_opts *xattrs; 5908 }; 5909 5910 static void 5911 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) 5912 { 5913 struct spdk_clone_snapshot_ctx *ctx = cb_arg; 5914 struct spdk_bs_cpl *cpl = &ctx->cpl; 5915 5916 if (bserrno != 0) { 5917 if (ctx->bserrno != 0) { 5918 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5919 } else { 5920 ctx->bserrno = bserrno; 5921 } 5922 } 5923 5924 switch (cpl->type) { 5925 case SPDK_BS_CPL_TYPE_BLOBID: 5926 cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); 5927 break; 5928 case SPDK_BS_CPL_TYPE_BLOB_BASIC: 5929 cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); 5930 break; 5931 default: 5932 SPDK_UNREACHABLE(); 5933 break; 5934 } 5935 5936 free(ctx); 5937 } 5938 5939 static void 5940 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 5941 { 5942 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5943 struct spdk_blob *origblob = ctx->original.blob; 5944 5945 if (bserrno != 0) { 5946 if (ctx->bserrno != 0) { 5947 SPDK_ERRLOG("Unfreeze error %d\n", bserrno); 5948 } else { 5949 ctx->bserrno = bserrno; 5950 } 5951 } 5952 5953 ctx->original.id = origblob->id; 5954 origblob->locked_operation_in_progress = false; 5955 5956 /* Revert md_ro to original state */ 5957 origblob->md_ro = ctx->original.md_ro; 5958 5959 spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); 5960 } 5961 5962 static void 5963 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) 5964 { 5965 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5966 struct spdk_blob *origblob = ctx->original.blob; 5967 5968 if (bserrno != 0) { 5969 if (ctx->bserrno != 0) { 5970 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5971 } else { 5972 ctx->bserrno = bserrno; 5973 } 5974 } 5975 5976 if (ctx->frozen) { 5977 /* Unfreeze any outstanding I/O */ 5978 blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); 5979 } else { 5980 bs_snapshot_unfreeze_cpl(ctx, 0); 5981 } 5982 5983 } 5984 5985 static void 5986 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno) 5987 { 5988 struct spdk_blob *newblob = ctx->new.blob; 5989 5990 if (bserrno != 0) { 5991 if (ctx->bserrno != 0) { 5992 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5993 } else { 5994 ctx->bserrno = bserrno; 5995 } 5996 } 5997 5998 ctx->new.id = newblob->id; 5999 spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 6000 } 6001 6002 /* END blob_cleanup */ 6003 6004 /* START spdk_bs_create_snapshot */ 6005 6006 static void 6007 bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2) 6008 { 6009 uint64_t *cluster_temp; 6010 uint32_t *extent_page_temp; 6011 6012 cluster_temp = blob1->active.clusters; 6013 blob1->active.clusters = blob2->active.clusters; 6014 blob2->active.clusters = cluster_temp; 6015 6016 extent_page_temp = blob1->active.extent_pages; 6017 blob1->active.extent_pages = blob2->active.extent_pages; 6018 blob2->active.extent_pages = extent_page_temp; 6019 } 6020 6021 static void 6022 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno) 6023 { 6024 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6025 struct spdk_blob *origblob = ctx->original.blob; 6026 struct spdk_blob *newblob = ctx->new.blob; 6027 6028 if (bserrno != 0) { 6029 bs_snapshot_swap_cluster_maps(newblob, origblob); 6030 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6031 return; 6032 } 6033 6034 /* Remove metadata descriptor 
SNAPSHOT_IN_PROGRESS */ 6035 bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true); 6036 if (bserrno != 0) { 6037 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6038 return; 6039 } 6040 6041 bs_blob_list_add(ctx->original.blob); 6042 6043 spdk_blob_set_read_only(newblob); 6044 6045 /* sync snapshot metadata */ 6046 spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 6047 } 6048 6049 static void 6050 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno) 6051 { 6052 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6053 struct spdk_blob *origblob = ctx->original.blob; 6054 struct spdk_blob *newblob = ctx->new.blob; 6055 6056 if (bserrno != 0) { 6057 /* return cluster map back to original */ 6058 bs_snapshot_swap_cluster_maps(newblob, origblob); 6059 6060 /* Newblob md sync failed. Valid clusters are only present in origblob. 6061 * Since I/O is frozen on origblob, not changes to zeroed out cluster map should have occurred. 6062 * Newblob needs to be reverted to thin_provisioned state at creation to properly close. */ 6063 blob_set_thin_provision(newblob); 6064 assert(spdk_mem_all_zero(newblob->active.clusters, 6065 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 6066 assert(spdk_mem_all_zero(newblob->active.extent_pages, 6067 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 6068 6069 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6070 return; 6071 } 6072 6073 /* Set internal xattr for snapshot id */ 6074 bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); 6075 if (bserrno != 0) { 6076 /* return cluster map back to original */ 6077 bs_snapshot_swap_cluster_maps(newblob, origblob); 6078 blob_set_thin_provision(newblob); 6079 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6080 return; 6081 } 6082 6083 /* Create new back_bs_dev for snapshot */ 6084 origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); 6085 if (origblob->back_bs_dev == NULL) { 6086 /* return cluster map back to original */ 6087 bs_snapshot_swap_cluster_maps(newblob, origblob); 6088 blob_set_thin_provision(newblob); 6089 bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); 6090 return; 6091 } 6092 6093 bs_blob_list_remove(origblob); 6094 origblob->parent_id = newblob->id; 6095 /* set clone blob as thin provisioned */ 6096 blob_set_thin_provision(origblob); 6097 6098 bs_blob_list_add(newblob); 6099 6100 /* sync clone metadata */ 6101 spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); 6102 } 6103 6104 static void 6105 bs_snapshot_freeze_cpl(void *cb_arg, int rc) 6106 { 6107 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6108 struct spdk_blob *origblob = ctx->original.blob; 6109 struct spdk_blob *newblob = ctx->new.blob; 6110 int bserrno; 6111 6112 if (rc != 0) { 6113 bs_clone_snapshot_newblob_cleanup(ctx, rc); 6114 return; 6115 } 6116 6117 ctx->frozen = true; 6118 6119 if (newblob->back_bs_dev) { 6120 newblob->back_bs_dev->destroy(newblob->back_bs_dev); 6121 } 6122 /* set new back_bs_dev for snapshot */ 6123 newblob->back_bs_dev = origblob->back_bs_dev; 6124 /* Set invalid flags from origblob */ 6125 newblob->invalid_flags = origblob->invalid_flags; 6126 6127 /* inherit parent from original blob if set */ 6128 newblob->parent_id = origblob->parent_id; 6129 if (origblob->parent_id != SPDK_BLOBID_INVALID) { 6130 /* Set internal xattr for snapshot id */ 6131 bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, 6132 &origblob->parent_id, 
sizeof(spdk_blob_id), true); 6133 if (bserrno != 0) { 6134 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 6135 return; 6136 } 6137 } 6138 6139 /* swap cluster maps */ 6140 bs_snapshot_swap_cluster_maps(newblob, origblob); 6141 6142 /* Set the clear method on the new blob to match the original. */ 6143 blob_set_clear_method(newblob, origblob->clear_method); 6144 6145 /* sync snapshot metadata */ 6146 spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); 6147 } 6148 6149 static void 6150 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6151 { 6152 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6153 struct spdk_blob *origblob = ctx->original.blob; 6154 struct spdk_blob *newblob = _blob; 6155 6156 if (bserrno != 0) { 6157 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6158 return; 6159 } 6160 6161 ctx->new.blob = newblob; 6162 assert(spdk_blob_is_thin_provisioned(newblob)); 6163 assert(spdk_mem_all_zero(newblob->active.clusters, 6164 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 6165 assert(spdk_mem_all_zero(newblob->active.extent_pages, 6166 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 6167 6168 blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); 6169 } 6170 6171 static void 6172 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6173 { 6174 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6175 struct spdk_blob *origblob = ctx->original.blob; 6176 6177 if (bserrno != 0) { 6178 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6179 return; 6180 } 6181 6182 ctx->new.id = blobid; 6183 ctx->cpl.u.blobid.blobid = blobid; 6184 6185 spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); 6186 } 6187 6188 6189 static void 6190 bs_xattr_snapshot(void *arg, const char *name, 6191 const void **value, size_t *value_len) 6192 { 6193 assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); 6194 6195 struct spdk_blob *blob = (struct spdk_blob *)arg; 6196 *value = &blob->id; 6197 *value_len = sizeof(blob->id); 6198 } 6199 6200 static void 6201 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6202 { 6203 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6204 struct spdk_blob_opts opts; 6205 struct spdk_blob_xattr_opts internal_xattrs; 6206 char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; 6207 6208 if (bserrno != 0) { 6209 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6210 return; 6211 } 6212 6213 ctx->original.blob = _blob; 6214 6215 if (_blob->data_ro || _blob->md_ro) { 6216 SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n", 6217 _blob->id); 6218 ctx->bserrno = -EINVAL; 6219 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6220 return; 6221 } 6222 6223 if (_blob->locked_operation_in_progress) { 6224 SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n"); 6225 ctx->bserrno = -EBUSY; 6226 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6227 return; 6228 } 6229 6230 _blob->locked_operation_in_progress = true; 6231 6232 spdk_blob_opts_init(&opts, sizeof(opts)); 6233 blob_xattrs_init(&internal_xattrs); 6234 6235 /* Change the size of new blob to the same as in original blob, 6236 * but do not allocate clusters */ 6237 opts.thin_provision = true; 6238 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6239 
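/* Note: the snapshot blob is created thin provisioned with the same logical size as the
 * original blob; no clusters are allocated or copied at this point. The original blob's
 * cluster map is handed over to the snapshot later, in bs_snapshot_freeze_cpl(), via
 * bs_snapshot_swap_cluster_maps(), so snapshot creation never moves user data. */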
opts.use_extent_table = _blob->use_extent_table; 6240 6241 /* If there are any xattrs specified for snapshot, set them now */ 6242 if (ctx->xattrs) { 6243 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6244 } 6245 /* Set internal xattr SNAPSHOT_IN_PROGRESS */ 6246 internal_xattrs.count = 1; 6247 internal_xattrs.ctx = _blob; 6248 internal_xattrs.names = xattrs_names; 6249 internal_xattrs.get_value = bs_xattr_snapshot; 6250 6251 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6252 bs_snapshot_newblob_create_cpl, ctx); 6253 } 6254 6255 void 6256 spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, 6257 const struct spdk_blob_xattr_opts *snapshot_xattrs, 6258 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6259 { 6260 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6261 6262 if (!ctx) { 6263 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6264 return; 6265 } 6266 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6267 ctx->cpl.u.blobid.cb_fn = cb_fn; 6268 ctx->cpl.u.blobid.cb_arg = cb_arg; 6269 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6270 ctx->bserrno = 0; 6271 ctx->frozen = false; 6272 ctx->original.id = blobid; 6273 ctx->xattrs = snapshot_xattrs; 6274 6275 spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); 6276 } 6277 /* END spdk_bs_create_snapshot */ 6278 6279 /* START spdk_bs_create_clone */ 6280 6281 static void 6282 bs_xattr_clone(void *arg, const char *name, 6283 const void **value, size_t *value_len) 6284 { 6285 assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); 6286 6287 struct spdk_blob *blob = (struct spdk_blob *)arg; 6288 *value = &blob->id; 6289 *value_len = sizeof(blob->id); 6290 } 6291 6292 static void 6293 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6294 { 6295 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6296 struct spdk_blob *clone = _blob; 6297 6298 ctx->new.blob = clone; 6299 bs_blob_list_add(clone); 6300 6301 spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); 6302 } 6303 6304 static void 6305 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 6306 { 6307 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6308 6309 ctx->cpl.u.blobid.blobid = blobid; 6310 spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); 6311 } 6312 6313 static void 6314 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6315 { 6316 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6317 struct spdk_blob_opts opts; 6318 struct spdk_blob_xattr_opts internal_xattrs; 6319 char *xattr_names[] = { BLOB_SNAPSHOT }; 6320 6321 if (bserrno != 0) { 6322 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6323 return; 6324 } 6325 6326 ctx->original.blob = _blob; 6327 ctx->original.md_ro = _blob->md_ro; 6328 6329 if (!_blob->data_ro || !_blob->md_ro) { 6330 SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n"); 6331 ctx->bserrno = -EINVAL; 6332 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6333 return; 6334 } 6335 6336 if (_blob->locked_operation_in_progress) { 6337 SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n"); 6338 ctx->bserrno = -EBUSY; 6339 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6340 return; 6341 } 6342 6343 _blob->locked_operation_in_progress = true; 6344 6345 spdk_blob_opts_init(&opts, sizeof(opts)); 6346 
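/* Typical usage of the public clone API (illustrative sketch only, not part of this file;
 * callback names and the ctx pointer are placeholders, error handling omitted). A clone can
 * only be created from a blob that is read-only for both data and metadata, which in practice
 * means from a snapshot created first:
 *
 *   spdk_bs_create_snapshot(bs, blob_id, NULL, snapshot_done_cb, my_ctx);
 *   // later, from snapshot_done_cb() once snapshot_id is known:
 *   spdk_bs_create_clone(bs, snapshot_id, NULL, clone_done_cb, my_ctx);
 */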
blob_xattrs_init(&internal_xattrs); 6347 6348 opts.thin_provision = true; 6349 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6350 opts.use_extent_table = _blob->use_extent_table; 6351 if (ctx->xattrs) { 6352 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6353 } 6354 6355 /* Set internal xattr BLOB_SNAPSHOT */ 6356 internal_xattrs.count = 1; 6357 internal_xattrs.ctx = _blob; 6358 internal_xattrs.names = xattr_names; 6359 internal_xattrs.get_value = bs_xattr_clone; 6360 6361 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6362 bs_clone_newblob_create_cpl, ctx); 6363 } 6364 6365 void 6366 spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, 6367 const struct spdk_blob_xattr_opts *clone_xattrs, 6368 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6369 { 6370 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6371 6372 if (!ctx) { 6373 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6374 return; 6375 } 6376 6377 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6378 ctx->cpl.u.blobid.cb_fn = cb_fn; 6379 ctx->cpl.u.blobid.cb_arg = cb_arg; 6380 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6381 ctx->bserrno = 0; 6382 ctx->xattrs = clone_xattrs; 6383 ctx->original.id = blobid; 6384 6385 spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); 6386 } 6387 6388 /* END spdk_bs_create_clone */ 6389 6390 /* START spdk_bs_inflate_blob */ 6391 6392 static void 6393 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) 6394 { 6395 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6396 struct spdk_blob *_blob = ctx->original.blob; 6397 6398 if (bserrno != 0) { 6399 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6400 return; 6401 } 6402 6403 /* Temporarily override md_ro flag for MD modification */ 6404 _blob->md_ro = false; 6405 6406 bserrno = blob_set_xattr(_blob, BLOB_SNAPSHOT, &_parent->id, sizeof(spdk_blob_id), true); 6407 if (bserrno != 0) { 6408 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6409 return; 6410 } 6411 6412 assert(_parent != NULL); 6413 6414 bs_blob_list_remove(_blob); 6415 _blob->parent_id = _parent->id; 6416 6417 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6418 _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); 6419 bs_blob_list_add(_blob); 6420 6421 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6422 } 6423 6424 static void 6425 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx) 6426 { 6427 struct spdk_blob *_blob = ctx->original.blob; 6428 struct spdk_blob *_parent; 6429 6430 if (ctx->allocate_all) { 6431 /* remove thin provisioning */ 6432 bs_blob_list_remove(_blob); 6433 _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; 6434 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6435 _blob->back_bs_dev = NULL; 6436 _blob->parent_id = SPDK_BLOBID_INVALID; 6437 } else { 6438 _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; 6439 if (_parent->parent_id != SPDK_BLOBID_INVALID) { 6440 /* We must change the parent of the inflated blob */ 6441 spdk_bs_open_blob(_blob->bs, _parent->parent_id, 6442 bs_inflate_blob_set_parent_cpl, ctx); 6443 return; 6444 } 6445 6446 bs_blob_list_remove(_blob); 6447 _blob->parent_id = SPDK_BLOBID_INVALID; 6448 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6449 _blob->back_bs_dev = bs_create_zeroes_dev(); 6450 } 6451 6452 /* Temporarily override md_ro flag for MD modification */ 6453 _blob->md_ro = false; 6454 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6455 
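/* Only the fully inflated case (back_bs_dev == NULL) and the decouple-to-no-parent case
 * (zeroes dev) reach this point; re-parenting to a grandparent returned earlier through
 * bs_inflate_blob_set_parent_cpl(). Dropping the BLOB_SNAPSHOT xattr here removes the
 * parent reference from the on-disk metadata before the final sync below. */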
_blob->state = SPDK_BLOB_STATE_DIRTY; 6456
6457 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
6458 }
6459
6460 /* Check if cluster needs allocation */
6461 static inline bool
6462 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all)
6463 {
6464 struct spdk_blob_bs_dev *b;
6465
6466 assert(blob != NULL);
6467
6468 if (blob->active.clusters[cluster] != 0) {
6469 /* Cluster is already allocated */
6470 return false;
6471 }
6472
6473 if (blob->parent_id == SPDK_BLOBID_INVALID) {
6474 /* Blob has no parent blob */
6475 return allocate_all;
6476 }
6477
6478 b = (struct spdk_blob_bs_dev *)blob->back_bs_dev;
6479 return (allocate_all || b->blob->active.clusters[cluster] != 0);
6480 }
6481
6482 static void
6483 bs_inflate_blob_touch_next(void *cb_arg, int bserrno)
6484 {
6485 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6486 struct spdk_blob *_blob = ctx->original.blob;
6487 struct spdk_bs_cpl cpl;
6488 spdk_bs_user_op_t *op;
6489 uint64_t offset;
6490
6491 if (bserrno != 0) {
6492 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6493 return;
6494 }
6495
6496 for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) {
6497 if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) {
6498 break;
6499 }
6500 }
6501
6502 if (ctx->cluster < _blob->active.num_clusters) {
6503 offset = bs_cluster_to_lba(_blob->bs, ctx->cluster);
6504
6505 /* We may safely increment a cluster before copying */
6506 ctx->cluster++;
6507
6508 /* Use a dummy 0B read as a context for cluster copy */
6509 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6510 cpl.u.blob_basic.cb_fn = bs_inflate_blob_touch_next;
6511 cpl.u.blob_basic.cb_arg = ctx;
6512
6513 op = bs_user_op_alloc(ctx->channel, &cpl, SPDK_BLOB_READ, _blob,
6514 NULL, 0, offset, 0);
6515 if (!op) {
6516 bs_clone_snapshot_origblob_cleanup(ctx, -ENOMEM);
6517 return;
6518 }
6519
6520 bs_allocate_and_copy_cluster(_blob, ctx->channel, offset, op);
6521 } else {
6522 bs_inflate_blob_done(ctx);
6523 }
6524 }
6525
6526 static void
6527 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
6528 {
6529 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6530 uint64_t clusters_needed;
6531 uint64_t i;
6532
6533 if (bserrno != 0) {
6534 bs_clone_snapshot_cleanup_finish(ctx, bserrno);
6535 return;
6536 }
6537
6538 ctx->original.blob = _blob;
6539 ctx->original.md_ro = _blob->md_ro;
6540
6541 if (_blob->locked_operation_in_progress) {
6542 SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n");
6543 ctx->bserrno = -EBUSY;
6544 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
6545 return;
6546 }
6547
6548 _blob->locked_operation_in_progress = true;
6549
6550 if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) {
6551 /* This blob has no parent, so we cannot decouple it. */
6552 SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n");
6553 bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL);
6554 return;
6555 }
6556
6557 if (spdk_blob_is_thin_provisioned(_blob) == false) {
6558 /* This is not a thin provisioned blob. No need to inflate. */
6559 bs_clone_snapshot_origblob_cleanup(ctx, 0);
6560 return;
6561 }
6562
6563 /* Do two passes - one to verify that we can obtain enough clusters
6564 * and another to actually claim them.
6565 */ 6566 clusters_needed = 0; 6567 for (i = 0; i < _blob->active.num_clusters; i++) { 6568 if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { 6569 clusters_needed++; 6570 } 6571 } 6572 6573 if (clusters_needed > _blob->bs->num_free_clusters) { 6574 /* Not enough free clusters. Cannot satisfy the request. */ 6575 bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); 6576 return; 6577 } 6578 6579 ctx->cluster = 0; 6580 bs_inflate_blob_touch_next(ctx, 0); 6581 } 6582 6583 static void 6584 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6585 spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) 6586 { 6587 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6588 6589 if (!ctx) { 6590 cb_fn(cb_arg, -ENOMEM); 6591 return; 6592 } 6593 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6594 ctx->cpl.u.bs_basic.cb_fn = cb_fn; 6595 ctx->cpl.u.bs_basic.cb_arg = cb_arg; 6596 ctx->bserrno = 0; 6597 ctx->original.id = blobid; 6598 ctx->channel = channel; 6599 ctx->allocate_all = allocate_all; 6600 6601 spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); 6602 } 6603 6604 void 6605 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6606 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6607 { 6608 bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); 6609 } 6610 6611 void 6612 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6613 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6614 { 6615 bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); 6616 } 6617 /* END spdk_bs_inflate_blob */ 6618 6619 /* START spdk_blob_resize */ 6620 struct spdk_bs_resize_ctx { 6621 spdk_blob_op_complete cb_fn; 6622 void *cb_arg; 6623 struct spdk_blob *blob; 6624 uint64_t sz; 6625 int rc; 6626 }; 6627 6628 static void 6629 bs_resize_unfreeze_cpl(void *cb_arg, int rc) 6630 { 6631 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6632 6633 if (rc != 0) { 6634 SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); 6635 } 6636 6637 if (ctx->rc != 0) { 6638 SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); 6639 rc = ctx->rc; 6640 } 6641 6642 ctx->blob->locked_operation_in_progress = false; 6643 6644 ctx->cb_fn(ctx->cb_arg, rc); 6645 free(ctx); 6646 } 6647 6648 static void 6649 bs_resize_freeze_cpl(void *cb_arg, int rc) 6650 { 6651 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6652 6653 if (rc != 0) { 6654 ctx->blob->locked_operation_in_progress = false; 6655 ctx->cb_fn(ctx->cb_arg, rc); 6656 free(ctx); 6657 return; 6658 } 6659 6660 ctx->rc = blob_resize(ctx->blob, ctx->sz); 6661 6662 blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); 6663 } 6664 6665 void 6666 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) 6667 { 6668 struct spdk_bs_resize_ctx *ctx; 6669 6670 blob_verify_md_op(blob); 6671 6672 SPDK_DEBUGLOG(blob, "Resizing blob %" PRIu64 " to %" PRIu64 " clusters\n", blob->id, sz); 6673 6674 if (blob->md_ro) { 6675 cb_fn(cb_arg, -EPERM); 6676 return; 6677 } 6678 6679 if (sz == blob->active.num_clusters) { 6680 cb_fn(cb_arg, 0); 6681 return; 6682 } 6683 6684 if (blob->locked_operation_in_progress) { 6685 cb_fn(cb_arg, -EBUSY); 6686 return; 6687 } 6688 6689 ctx = calloc(1, sizeof(*ctx)); 6690 if (!ctx) { 6691 cb_fn(cb_arg, -ENOMEM); 6692 return; 6693 } 6694 6695 blob->locked_operation_in_progress = true; 6696 ctx->cb_fn = cb_fn; 6697 
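/* The resize itself runs in bs_resize_freeze_cpl() while I/O to the blob is frozen;
 * bs_resize_unfreeze_cpl() then resumes I/O, clears the operation lock and reports
 * the result through cb_fn. */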
ctx->cb_arg = cb_arg; 6698 ctx->blob = blob; 6699 ctx->sz = sz; 6700 blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); 6701 } 6702 6703 /* END spdk_blob_resize */ 6704 6705 6706 /* START spdk_bs_delete_blob */ 6707 6708 static void 6709 bs_delete_close_cpl(void *cb_arg, int bserrno) 6710 { 6711 spdk_bs_sequence_t *seq = cb_arg; 6712 6713 bs_sequence_finish(seq, bserrno); 6714 } 6715 6716 static void 6717 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6718 { 6719 struct spdk_blob *blob = cb_arg; 6720 6721 if (bserrno != 0) { 6722 /* 6723 * We already removed this blob from the blobstore tailq, so 6724 * we need to free it here since this is the last reference 6725 * to it. 6726 */ 6727 blob_free(blob); 6728 bs_delete_close_cpl(seq, bserrno); 6729 return; 6730 } 6731 6732 /* 6733 * This will immediately decrement the ref_count and call 6734 * the completion routine since the metadata state is clean. 6735 * By calling spdk_blob_close, we reduce the number of call 6736 * points into code that touches the blob->open_ref count 6737 * and the blobstore's blob list. 6738 */ 6739 spdk_blob_close(blob, bs_delete_close_cpl, seq); 6740 } 6741 6742 struct delete_snapshot_ctx { 6743 struct spdk_blob_list *parent_snapshot_entry; 6744 struct spdk_blob *snapshot; 6745 struct spdk_blob_md_page *page; 6746 bool snapshot_md_ro; 6747 struct spdk_blob *clone; 6748 bool clone_md_ro; 6749 spdk_blob_op_with_handle_complete cb_fn; 6750 void *cb_arg; 6751 int bserrno; 6752 uint32_t next_extent_page; 6753 }; 6754 6755 static void 6756 delete_blob_cleanup_finish(void *cb_arg, int bserrno) 6757 { 6758 struct delete_snapshot_ctx *ctx = cb_arg; 6759 6760 if (bserrno != 0) { 6761 SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); 6762 } 6763 6764 assert(ctx != NULL); 6765 6766 if (bserrno != 0 && ctx->bserrno == 0) { 6767 ctx->bserrno = bserrno; 6768 } 6769 6770 ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); 6771 spdk_free(ctx->page); 6772 free(ctx); 6773 } 6774 6775 static void 6776 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) 6777 { 6778 struct delete_snapshot_ctx *ctx = cb_arg; 6779 6780 if (bserrno != 0) { 6781 ctx->bserrno = bserrno; 6782 SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); 6783 } 6784 6785 if (ctx->bserrno != 0) { 6786 assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); 6787 RB_INSERT(spdk_blob_tree, &ctx->snapshot->bs->open_blobs, ctx->snapshot); 6788 spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); 6789 } 6790 6791 ctx->snapshot->locked_operation_in_progress = false; 6792 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6793 6794 spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); 6795 } 6796 6797 static void 6798 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) 6799 { 6800 struct delete_snapshot_ctx *ctx = cb_arg; 6801 6802 ctx->clone->locked_operation_in_progress = false; 6803 ctx->clone->md_ro = ctx->clone_md_ro; 6804 6805 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6806 } 6807 6808 static void 6809 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 6810 { 6811 struct delete_snapshot_ctx *ctx = cb_arg; 6812 6813 if (bserrno) { 6814 ctx->bserrno = bserrno; 6815 delete_snapshot_cleanup_clone(ctx, 0); 6816 return; 6817 } 6818 6819 ctx->clone->locked_operation_in_progress = false; 6820 spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); 6821 } 6822 6823 static void 6824 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) 6825 { 6826 struct 
delete_snapshot_ctx *ctx = cb_arg; 6827 struct spdk_blob_list *parent_snapshot_entry = NULL; 6828 struct spdk_blob_list *snapshot_entry = NULL; 6829 struct spdk_blob_list *clone_entry = NULL; 6830 struct spdk_blob_list *snapshot_clone_entry = NULL; 6831 6832 if (bserrno) { 6833 SPDK_ERRLOG("Failed to sync MD on blob\n"); 6834 ctx->bserrno = bserrno; 6835 delete_snapshot_cleanup_clone(ctx, 0); 6836 return; 6837 } 6838 6839 /* Get snapshot entry for the snapshot we want to remove */ 6840 snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); 6841 6842 assert(snapshot_entry != NULL); 6843 6844 /* Remove clone entry in this snapshot (at this point there can be only one clone) */ 6845 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6846 assert(clone_entry != NULL); 6847 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 6848 snapshot_entry->clone_count--; 6849 assert(TAILQ_EMPTY(&snapshot_entry->clones)); 6850 6851 if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { 6852 /* This snapshot is at the same time a clone of another snapshot - we need to 6853 * update parent snapshot (remove current clone, add new one inherited from 6854 * the snapshot that is being removed) */ 6855 6856 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6857 * snapshot that we are removing */ 6858 blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, 6859 &snapshot_clone_entry); 6860 6861 /* Switch clone entry in parent snapshot */ 6862 TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); 6863 TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); 6864 free(snapshot_clone_entry); 6865 } else { 6866 /* No parent snapshot - just remove clone entry */ 6867 free(clone_entry); 6868 } 6869 6870 /* Restore md_ro flags */ 6871 ctx->clone->md_ro = ctx->clone_md_ro; 6872 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6873 6874 blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); 6875 } 6876 6877 static void 6878 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) 6879 { 6880 struct delete_snapshot_ctx *ctx = cb_arg; 6881 uint64_t i; 6882 6883 ctx->snapshot->md_ro = false; 6884 6885 if (bserrno) { 6886 SPDK_ERRLOG("Failed to sync MD on clone\n"); 6887 ctx->bserrno = bserrno; 6888 6889 /* Restore snapshot to previous state */ 6890 bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); 6891 if (bserrno != 0) { 6892 delete_snapshot_cleanup_clone(ctx, bserrno); 6893 return; 6894 } 6895 6896 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); 6897 return; 6898 } 6899 6900 /* Clear cluster map entries for snapshot */ 6901 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6902 if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { 6903 ctx->snapshot->active.clusters[i] = 0; 6904 } 6905 } 6906 for (i = 0; i < ctx->snapshot->active.num_extent_pages && 6907 i < ctx->clone->active.num_extent_pages; i++) { 6908 if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { 6909 ctx->snapshot->active.extent_pages[i] = 0; 6910 } 6911 } 6912 6913 blob_set_thin_provision(ctx->snapshot); 6914 ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; 6915 6916 if (ctx->parent_snapshot_entry != NULL) { 6917 ctx->snapshot->back_bs_dev = NULL; 6918 } 6919 6920 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); 6921 } 6922 6923 static void 6924 
delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx) 6925 { 6926 /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ 6927 ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); 6928 6929 /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */ 6930 if (ctx->parent_snapshot_entry != NULL) { 6931 /* ...to parent snapshot */ 6932 ctx->clone->parent_id = ctx->parent_snapshot_entry->id; 6933 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; 6934 blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, 6935 sizeof(spdk_blob_id), 6936 true); 6937 } else { 6938 /* ...to blobid invalid and zeroes dev */ 6939 ctx->clone->parent_id = SPDK_BLOBID_INVALID; 6940 ctx->clone->back_bs_dev = bs_create_zeroes_dev(); 6941 blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); 6942 } 6943 6944 spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); 6945 } 6946 6947 static void 6948 delete_snapshot_update_extent_pages(void *cb_arg, int bserrno) 6949 { 6950 struct delete_snapshot_ctx *ctx = cb_arg; 6951 uint32_t *extent_page; 6952 uint64_t i; 6953 6954 for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages && 6955 i < ctx->clone->active.num_extent_pages; i++) { 6956 if (ctx->snapshot->active.extent_pages[i] == 0) { 6957 /* No extent page to use from snapshot */ 6958 continue; 6959 } 6960 6961 extent_page = &ctx->clone->active.extent_pages[i]; 6962 if (*extent_page == 0) { 6963 /* Copy extent page from snapshot when clone did not have a matching one */ 6964 *extent_page = ctx->snapshot->active.extent_pages[i]; 6965 continue; 6966 } 6967 6968 /* Clone and snapshot both contain partially filled matching extent pages. 6969 * Update the clone extent page in place with cluster map containing the mix of both. 
*/ 6970 ctx->next_extent_page = i + 1; 6971 memset(ctx->page, 0, SPDK_BS_PAGE_SIZE); 6972 6973 blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, ctx->page, 6974 delete_snapshot_update_extent_pages, ctx); 6975 return; 6976 } 6977 delete_snapshot_update_extent_pages_cpl(ctx); 6978 } 6979 6980 static void 6981 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) 6982 { 6983 struct delete_snapshot_ctx *ctx = cb_arg; 6984 uint64_t i; 6985 6986 /* Temporarily override md_ro flag for clone for MD modification */ 6987 ctx->clone_md_ro = ctx->clone->md_ro; 6988 ctx->clone->md_ro = false; 6989 6990 if (bserrno) { 6991 SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); 6992 ctx->bserrno = bserrno; 6993 delete_snapshot_cleanup_clone(ctx, 0); 6994 return; 6995 } 6996 6997 /* Copy snapshot map to clone map (only unallocated clusters in clone) */ 6998 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6999 if (ctx->clone->active.clusters[i] == 0) { 7000 ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; 7001 } 7002 } 7003 ctx->next_extent_page = 0; 7004 delete_snapshot_update_extent_pages(ctx, 0); 7005 } 7006 7007 static void 7008 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) 7009 { 7010 struct delete_snapshot_ctx *ctx = cb_arg; 7011 7012 if (bserrno) { 7013 SPDK_ERRLOG("Failed to freeze I/O on clone\n"); 7014 ctx->bserrno = bserrno; 7015 delete_snapshot_cleanup_clone(ctx, 0); 7016 return; 7017 } 7018 7019 /* Temporarily override md_ro flag for snapshot for MD modification */ 7020 ctx->snapshot_md_ro = ctx->snapshot->md_ro; 7021 ctx->snapshot->md_ro = false; 7022 7023 /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ 7024 ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, 7025 sizeof(spdk_blob_id), true); 7026 if (ctx->bserrno != 0) { 7027 delete_snapshot_cleanup_clone(ctx, 0); 7028 return; 7029 } 7030 7031 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx); 7032 } 7033 7034 static void 7035 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) 7036 { 7037 struct delete_snapshot_ctx *ctx = cb_arg; 7038 7039 if (bserrno) { 7040 SPDK_ERRLOG("Failed to open clone\n"); 7041 ctx->bserrno = bserrno; 7042 delete_snapshot_cleanup_snapshot(ctx, 0); 7043 return; 7044 } 7045 7046 ctx->clone = clone; 7047 7048 if (clone->locked_operation_in_progress) { 7049 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n"); 7050 ctx->bserrno = -EBUSY; 7051 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 7052 return; 7053 } 7054 7055 clone->locked_operation_in_progress = true; 7056 7057 blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); 7058 } 7059 7060 static void 7061 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) 7062 { 7063 struct spdk_blob_list *snapshot_entry = NULL; 7064 struct spdk_blob_list *clone_entry = NULL; 7065 struct spdk_blob_list *snapshot_clone_entry = NULL; 7066 7067 /* Get snapshot entry for the snapshot we want to remove */ 7068 snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); 7069 7070 assert(snapshot_entry != NULL); 7071 7072 /* Get clone of the snapshot (at this point there can be only one clone) */ 7073 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 7074 assert(snapshot_entry->clone_count == 1); 7075 assert(clone_entry != NULL); 
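/* A snapshot may only be deleted while it has at most one clone. That clone is opened
 * below and, in the callbacks that follow, takes over the snapshot's cluster map entries
 * and extent pages before being re-parented to the snapshot's own parent (or left with
 * no parent at all). */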
7076 7077 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 7078 * snapshot that we are removing */ 7079 blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, 7080 &snapshot_clone_entry); 7081 7082 spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); 7083 } 7084 7085 static void 7086 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) 7087 { 7088 spdk_bs_sequence_t *seq = cb_arg; 7089 struct spdk_blob_list *snapshot_entry = NULL; 7090 uint32_t page_num; 7091 7092 if (bserrno) { 7093 SPDK_ERRLOG("Failed to remove blob\n"); 7094 bs_sequence_finish(seq, bserrno); 7095 return; 7096 } 7097 7098 /* Remove snapshot from the list */ 7099 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7100 if (snapshot_entry != NULL) { 7101 TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); 7102 free(snapshot_entry); 7103 } 7104 7105 page_num = bs_blobid_to_page(blob->id); 7106 spdk_bit_array_clear(blob->bs->used_blobids, page_num); 7107 blob->state = SPDK_BLOB_STATE_DIRTY; 7108 blob->active.num_pages = 0; 7109 blob_resize(blob, 0); 7110 7111 blob_persist(seq, blob, bs_delete_persist_cpl, blob); 7112 } 7113 7114 static int 7115 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) 7116 { 7117 struct spdk_blob_list *snapshot_entry = NULL; 7118 struct spdk_blob_list *clone_entry = NULL; 7119 struct spdk_blob *clone = NULL; 7120 bool has_one_clone = false; 7121 7122 /* Check if this is a snapshot with clones */ 7123 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7124 if (snapshot_entry != NULL) { 7125 if (snapshot_entry->clone_count > 1) { 7126 SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); 7127 return -EBUSY; 7128 } else if (snapshot_entry->clone_count == 1) { 7129 has_one_clone = true; 7130 } 7131 } 7132 7133 /* Check if someone has this blob open (besides this delete context): 7134 * - open_ref = 1 - only this context opened blob, so it is ok to remove it 7135 * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot 7136 * and that is ok, because we will update it accordingly */ 7137 if (blob->open_ref <= 2 && has_one_clone) { 7138 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 7139 assert(clone_entry != NULL); 7140 clone = blob_lookup(blob->bs, clone_entry->id); 7141 7142 if (blob->open_ref == 2 && clone == NULL) { 7143 /* Clone is closed and someone else opened this blob */ 7144 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7145 return -EBUSY; 7146 } 7147 7148 *update_clone = true; 7149 return 0; 7150 } 7151 7152 if (blob->open_ref > 1) { 7153 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 7154 return -EBUSY; 7155 } 7156 7157 assert(has_one_clone == false); 7158 *update_clone = false; 7159 return 0; 7160 } 7161 7162 static void 7163 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) 7164 { 7165 spdk_bs_sequence_t *seq = cb_arg; 7166 7167 bs_sequence_finish(seq, -ENOMEM); 7168 } 7169 7170 static void 7171 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) 7172 { 7173 spdk_bs_sequence_t *seq = cb_arg; 7174 struct delete_snapshot_ctx *ctx; 7175 bool update_clone = false; 7176 7177 if (bserrno != 0) { 7178 bs_sequence_finish(seq, bserrno); 7179 return; 7180 } 7181 7182 blob_verify_md_op(blob); 7183 7184 ctx = calloc(1, sizeof(*ctx)); 7185 if (ctx == NULL) { 7186 spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); 7187 return; 7188 } 7189 7190 ctx->snapshot = blob; 7191 ctx->cb_fn 
= bs_delete_blob_finish; 7192 ctx->cb_arg = seq; 7193 7194 /* Check if blob can be removed and if it is a snapshot with clone on top of it */ 7195 ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); 7196 if (ctx->bserrno) { 7197 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7198 return; 7199 } 7200 7201 if (blob->locked_operation_in_progress) { 7202 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n"); 7203 ctx->bserrno = -EBUSY; 7204 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7205 return; 7206 } 7207 7208 blob->locked_operation_in_progress = true; 7209 7210 /* 7211 * Remove the blob from the blob_store list now, to ensure it does not 7212 * get returned after this point by blob_lookup(). 7213 */ 7214 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7215 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7216 7217 if (update_clone) { 7218 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 7219 if (!ctx->page) { 7220 ctx->bserrno = -ENOMEM; 7221 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 7222 return; 7223 } 7224 /* This blob is a snapshot with active clone - update clone first */ 7225 update_clone_on_snapshot_deletion(blob, ctx); 7226 } else { 7227 /* This blob does not have any clones - just remove it */ 7228 bs_blob_list_remove(blob); 7229 bs_delete_blob_finish(seq, blob, 0); 7230 free(ctx); 7231 } 7232 } 7233 7234 void 7235 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7236 spdk_blob_op_complete cb_fn, void *cb_arg) 7237 { 7238 struct spdk_bs_cpl cpl; 7239 spdk_bs_sequence_t *seq; 7240 7241 SPDK_DEBUGLOG(blob, "Deleting blob %" PRIu64 "\n", blobid); 7242 7243 assert(spdk_get_thread() == bs->md_thread); 7244 7245 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7246 cpl.u.blob_basic.cb_fn = cb_fn; 7247 cpl.u.blob_basic.cb_arg = cb_arg; 7248 7249 seq = bs_sequence_start(bs->md_channel, &cpl); 7250 if (!seq) { 7251 cb_fn(cb_arg, -ENOMEM); 7252 return; 7253 } 7254 7255 spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); 7256 } 7257 7258 /* END spdk_bs_delete_blob */ 7259 7260 /* START spdk_bs_open_blob */ 7261 7262 static void 7263 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7264 { 7265 struct spdk_blob *blob = cb_arg; 7266 struct spdk_blob *existing; 7267 7268 if (bserrno != 0) { 7269 blob_free(blob); 7270 seq->cpl.u.blob_handle.blob = NULL; 7271 bs_sequence_finish(seq, bserrno); 7272 return; 7273 } 7274 7275 existing = blob_lookup(blob->bs, blob->id); 7276 if (existing) { 7277 blob_free(blob); 7278 existing->open_ref++; 7279 seq->cpl.u.blob_handle.blob = existing; 7280 bs_sequence_finish(seq, 0); 7281 return; 7282 } 7283 7284 blob->open_ref++; 7285 7286 spdk_bit_array_set(blob->bs->open_blobids, blob->id); 7287 RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob); 7288 7289 bs_sequence_finish(seq, bserrno); 7290 } 7291 7292 static inline void 7293 blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst) 7294 { 7295 #define FIELD_OK(field) \ 7296 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 7297 7298 #define SET_FIELD(field) \ 7299 if (FIELD_OK(field)) { \ 7300 dst->field = src->field; \ 7301 } \ 7302 7303 SET_FIELD(clear_method); 7304 7305 dst->opts_size = src->opts_size; 7306 7307 /* You should not remove this statement, but need to update the assert statement 7308 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7309 
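/* opts_size reflects the struct size the caller was compiled against; only fields that
 * fit inside it are copied, so any newer fields keep the defaults already set by
 * spdk_blob_open_opts_init(). */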
SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 16, "Incorrect size"); 7310 7311 #undef FIELD_OK 7312 #undef SET_FIELD 7313 } 7314 7315 static void 7316 bs_open_blob(struct spdk_blob_store *bs, 7317 spdk_blob_id blobid, 7318 struct spdk_blob_open_opts *opts, 7319 spdk_blob_op_with_handle_complete cb_fn, 7320 void *cb_arg) 7321 { 7322 struct spdk_blob *blob; 7323 struct spdk_bs_cpl cpl; 7324 struct spdk_blob_open_opts opts_local; 7325 spdk_bs_sequence_t *seq; 7326 uint32_t page_num; 7327 7328 SPDK_DEBUGLOG(blob, "Opening blob %" PRIu64 "\n", blobid); 7329 assert(spdk_get_thread() == bs->md_thread); 7330 7331 page_num = bs_blobid_to_page(blobid); 7332 if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { 7333 /* Invalid blobid */ 7334 cb_fn(cb_arg, NULL, -ENOENT); 7335 return; 7336 } 7337 7338 blob = blob_lookup(bs, blobid); 7339 if (blob) { 7340 blob->open_ref++; 7341 cb_fn(cb_arg, blob, 0); 7342 return; 7343 } 7344 7345 blob = blob_alloc(bs, blobid); 7346 if (!blob) { 7347 cb_fn(cb_arg, NULL, -ENOMEM); 7348 return; 7349 } 7350 7351 spdk_blob_open_opts_init(&opts_local, sizeof(opts_local)); 7352 if (opts) { 7353 blob_open_opts_copy(opts, &opts_local); 7354 } 7355 7356 blob->clear_method = opts_local.clear_method; 7357 7358 cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; 7359 cpl.u.blob_handle.cb_fn = cb_fn; 7360 cpl.u.blob_handle.cb_arg = cb_arg; 7361 cpl.u.blob_handle.blob = blob; 7362 7363 seq = bs_sequence_start(bs->md_channel, &cpl); 7364 if (!seq) { 7365 blob_free(blob); 7366 cb_fn(cb_arg, NULL, -ENOMEM); 7367 return; 7368 } 7369 7370 blob_load(seq, blob, bs_open_blob_cpl, blob); 7371 } 7372 7373 void 7374 spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7375 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7376 { 7377 bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); 7378 } 7379 7380 void 7381 spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, 7382 struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7383 { 7384 bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); 7385 } 7386 7387 /* END spdk_bs_open_blob */ 7388 7389 /* START spdk_blob_set_read_only */ 7390 int 7391 spdk_blob_set_read_only(struct spdk_blob *blob) 7392 { 7393 blob_verify_md_op(blob); 7394 7395 blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; 7396 7397 blob->state = SPDK_BLOB_STATE_DIRTY; 7398 return 0; 7399 } 7400 /* END spdk_blob_set_read_only */ 7401 7402 /* START spdk_blob_sync_md */ 7403 7404 static void 7405 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7406 { 7407 struct spdk_blob *blob = cb_arg; 7408 7409 if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 7410 blob->data_ro = true; 7411 blob->md_ro = true; 7412 } 7413 7414 bs_sequence_finish(seq, bserrno); 7415 } 7416 7417 static void 7418 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7419 { 7420 struct spdk_bs_cpl cpl; 7421 spdk_bs_sequence_t *seq; 7422 7423 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7424 cpl.u.blob_basic.cb_fn = cb_fn; 7425 cpl.u.blob_basic.cb_arg = cb_arg; 7426 7427 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7428 if (!seq) { 7429 cb_fn(cb_arg, -ENOMEM); 7430 return; 7431 } 7432 7433 blob_persist(seq, blob, blob_sync_md_cpl, blob); 7434 } 7435 7436 void 7437 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7438 { 7439 blob_verify_md_op(blob); 7440 7441 SPDK_DEBUGLOG(blob, "Syncing blob %" PRIu64 "\n", blob->id); 7442 7443 if (blob->md_ro) { 7444 
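/* A blob with read-only metadata cannot have accumulated metadata changes, so its
 * state must still be CLEAN and the sync can complete immediately. */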
assert(blob->state == SPDK_BLOB_STATE_CLEAN); 7445 cb_fn(cb_arg, 0); 7446 return; 7447 } 7448 7449 blob_sync_md(blob, cb_fn, cb_arg); 7450 } 7451 7452 /* END spdk_blob_sync_md */ 7453 7454 struct spdk_blob_insert_cluster_ctx { 7455 struct spdk_thread *thread; 7456 struct spdk_blob *blob; 7457 uint32_t cluster_num; /* cluster index in blob */ 7458 uint32_t cluster; /* cluster on disk */ 7459 uint32_t extent_page; /* extent page on disk */ 7460 struct spdk_blob_md_page *page; /* preallocated extent page */ 7461 int rc; 7462 spdk_blob_op_complete cb_fn; 7463 void *cb_arg; 7464 }; 7465 7466 static void 7467 blob_insert_cluster_msg_cpl(void *arg) 7468 { 7469 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7470 7471 ctx->cb_fn(ctx->cb_arg, ctx->rc); 7472 free(ctx); 7473 } 7474 7475 static void 7476 blob_insert_cluster_msg_cb(void *arg, int bserrno) 7477 { 7478 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7479 7480 ctx->rc = bserrno; 7481 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7482 } 7483 7484 static void 7485 blob_insert_new_ep_cb(void *arg, int bserrno) 7486 { 7487 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7488 uint32_t *extent_page; 7489 7490 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7491 *extent_page = ctx->extent_page; 7492 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7493 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7494 } 7495 7496 static void 7497 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7498 { 7499 bs_sequence_finish(seq, bserrno); 7500 } 7501 7502 static void 7503 blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 7504 struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg) 7505 { 7506 spdk_bs_sequence_t *seq; 7507 struct spdk_bs_cpl cpl; 7508 7509 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7510 cpl.u.blob_basic.cb_fn = cb_fn; 7511 cpl.u.blob_basic.cb_arg = cb_arg; 7512 7513 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7514 if (!seq) { 7515 cb_fn(cb_arg, -ENOMEM); 7516 return; 7517 } 7518 7519 assert(page); 7520 page->next = SPDK_INVALID_MD_PAGE; 7521 page->id = blob->id; 7522 page->sequence_num = 0; 7523 7524 blob_serialize_extent_page(blob, cluster_num, page); 7525 7526 page->crc = blob_md_page_calc_crc(page); 7527 7528 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); 7529 7530 bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), 7531 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 7532 blob_persist_extent_page_cpl, page); 7533 } 7534 7535 static void 7536 blob_insert_cluster_msg(void *arg) 7537 { 7538 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7539 uint32_t *extent_page; 7540 7541 ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); 7542 if (ctx->rc != 0) { 7543 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7544 return; 7545 } 7546 7547 if (ctx->blob->use_extent_table == false) { 7548 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */ 7549 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7550 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7551 return; 7552 } 7553 7554 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7555 if (*extent_page == 0) { 7556 /* Extent page requires allocation. 7557 * It was already claimed in the used_md_pages map and placed in ctx. 
*/ 7558 assert(ctx->extent_page != 0); 7559 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7560 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page, 7561 blob_insert_new_ep_cb, ctx); 7562 } else { 7563 /* It is possible for original thread to allocate extent page for 7564 * different cluster in the same extent page. In such case proceed with 7565 * updating the existing extent page, but release the additional one. */ 7566 if (ctx->extent_page != 0) { 7567 spdk_spin_lock(&ctx->blob->bs->used_lock); 7568 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7569 bs_release_md_page(ctx->blob->bs, ctx->extent_page); 7570 spdk_spin_unlock(&ctx->blob->bs->used_lock); 7571 ctx->extent_page = 0; 7572 } 7573 /* Extent page already allocated. 7574 * Every cluster allocation, requires just an update of single extent page. */ 7575 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page, 7576 blob_insert_cluster_msg_cb, ctx); 7577 } 7578 } 7579 7580 static void 7581 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 7582 uint64_t cluster, uint32_t extent_page, struct spdk_blob_md_page *page, 7583 spdk_blob_op_complete cb_fn, void *cb_arg) 7584 { 7585 struct spdk_blob_insert_cluster_ctx *ctx; 7586 7587 ctx = calloc(1, sizeof(*ctx)); 7588 if (ctx == NULL) { 7589 cb_fn(cb_arg, -ENOMEM); 7590 return; 7591 } 7592 7593 ctx->thread = spdk_get_thread(); 7594 ctx->blob = blob; 7595 ctx->cluster_num = cluster_num; 7596 ctx->cluster = cluster; 7597 ctx->extent_page = extent_page; 7598 ctx->page = page; 7599 ctx->cb_fn = cb_fn; 7600 ctx->cb_arg = cb_arg; 7601 7602 spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); 7603 } 7604 7605 /* START spdk_blob_close */ 7606 7607 static void 7608 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7609 { 7610 struct spdk_blob *blob = cb_arg; 7611 7612 if (bserrno == 0) { 7613 blob->open_ref--; 7614 if (blob->open_ref == 0) { 7615 /* 7616 * Blobs with active.num_pages == 0 are deleted blobs. 7617 * these blobs are removed from the blob_store list 7618 * when the deletion process starts - so don't try to 7619 * remove them again. 
7620 */
7621 if (blob->active.num_pages > 0) {
7622 spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
7623 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob);
7624 }
7625 blob_free(blob);
7626 }
7627 }
7628
7629 bs_sequence_finish(seq, bserrno);
7630 }
7631
7632 void
7633 spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
7634 {
7635 struct spdk_bs_cpl cpl;
7636 spdk_bs_sequence_t *seq;
7637
7638 blob_verify_md_op(blob);
7639
7640 SPDK_DEBUGLOG(blob, "Closing blob %" PRIu64 "\n", blob->id);
7641
7642 if (blob->open_ref == 0) {
7643 cb_fn(cb_arg, -EBADF);
7644 return;
7645 }
7646
7647 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
7648 cpl.u.blob_basic.cb_fn = cb_fn;
7649 cpl.u.blob_basic.cb_arg = cb_arg;
7650
7651 seq = bs_sequence_start(blob->bs->md_channel, &cpl);
7652 if (!seq) {
7653 cb_fn(cb_arg, -ENOMEM);
7654 return;
7655 }
7656
7657 /* Sync metadata */
7658 blob_persist(seq, blob, blob_close_cpl, blob);
7659 }
7660
7661 /* END spdk_blob_close */
7662
7663 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs)
7664 {
7665 return spdk_get_io_channel(bs);
7666 }
7667
7668 void
7669 spdk_bs_free_io_channel(struct spdk_io_channel *channel)
7670 {
7671 spdk_put_io_channel(channel);
7672 }
7673
7674 void
7675 spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel,
7676 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
7677 {
7678 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
7679 SPDK_BLOB_UNMAP);
7680 }
7681
7682 void
7683 spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel,
7684 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
7685 {
7686 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
7687 SPDK_BLOB_WRITE_ZEROES);
7688 }
7689
7690 void
7691 spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel,
7692 void *payload, uint64_t offset, uint64_t length,
7693 spdk_blob_op_complete cb_fn, void *cb_arg)
7694 {
7695 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
7696 SPDK_BLOB_WRITE);
7697 }
7698
7699 void
7700 spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel,
7701 void *payload, uint64_t offset, uint64_t length,
7702 spdk_blob_op_complete cb_fn, void *cb_arg)
7703 {
7704 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
7705 SPDK_BLOB_READ);
7706 }
7707
7708 void
7709 spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel,
7710 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
7711 spdk_blob_op_complete cb_fn, void *cb_arg)
7712 {
7713 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, NULL);
7714 }
7715
7716 void
7717 spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel,
7718 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
7719 spdk_blob_op_complete cb_fn, void *cb_arg)
7720 {
7721 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true, NULL);
7722 }
7723
7724 void
7725 spdk_blob_io_writev_ext(struct spdk_blob *blob, struct spdk_io_channel *channel,
7726 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
7727 spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts)
7728 {
7729 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false,
7730 io_opts);
7731 }
7732
7733 void
7734 spdk_blob_io_readv_ext(struct spdk_blob *blob, struct spdk_io_channel *channel,
7735 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
7736 spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts)
7737 {
7738 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true,
7739 io_opts);
7740 }
7741
7742 struct spdk_bs_iter_ctx {
7743 int64_t page_num;
7744 struct spdk_blob_store *bs;
7745
7746 spdk_blob_op_with_handle_complete cb_fn;
7747 void *cb_arg;
7748 };
7749
7750 static void
7751 bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
7752 {
7753 struct spdk_bs_iter_ctx *ctx = cb_arg;
7754 struct spdk_blob_store *bs = ctx->bs;
7755 spdk_blob_id id;
7756
7757 if (bserrno == 0) {
7758 ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
7759 free(ctx);
7760 return;
7761 }
7762
7763 ctx->page_num++;
7764 ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
7765 if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
7766 ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
7767 free(ctx);
7768 return;
7769 }
7770
7771 id = bs_page_to_blobid(ctx->page_num);
7772
7773 spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx);
7774 }
7775
7776 void
7777 spdk_bs_iter_first(struct spdk_blob_store *bs,
7778 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7779 {
7780 struct spdk_bs_iter_ctx *ctx;
7781
7782 ctx = calloc(1, sizeof(*ctx));
7783 if (!ctx) {
7784 cb_fn(cb_arg, NULL, -ENOMEM);
7785 return;
7786 }
7787
7788 ctx->page_num = -1;
7789 ctx->bs = bs;
7790 ctx->cb_fn = cb_fn;
7791 ctx->cb_arg = cb_arg;
7792
7793 bs_iter_cpl(ctx, NULL, -1);
7794 }
7795
7796 static void
7797 bs_iter_close_cpl(void *cb_arg, int bserrno)
7798 {
7799 struct spdk_bs_iter_ctx *ctx = cb_arg;
7800
7801 bs_iter_cpl(ctx, NULL, -1);
7802 }
7803
7804 void
7805 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
7806 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7807 {
7808 struct spdk_bs_iter_ctx *ctx;
7809
7810 assert(blob != NULL);
7811
7812 ctx = calloc(1, sizeof(*ctx));
7813 if (!ctx) {
7814 cb_fn(cb_arg, NULL, -ENOMEM);
7815 return;
7816 }
7817
7818 ctx->page_num = bs_blobid_to_page(blob->id);
7819 ctx->bs = bs;
7820 ctx->cb_fn = cb_fn;
7821 ctx->cb_arg = cb_arg;
7822
7823 /* Close the existing blob */
7824 spdk_blob_close(blob, bs_iter_close_cpl, ctx);
7825 }
7826
7827 static int
7828 blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
7829 uint16_t value_len, bool internal)
7830 {
7831 struct spdk_xattr_tailq *xattrs;
7832 struct spdk_xattr *xattr;
7833 size_t desc_size;
7834 void *tmp;
7835
7836 blob_verify_md_op(blob);
7837
7838 if (blob->md_ro) {
7839 return -EPERM;
7840 }
7841
7842 desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
7843 if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
7844 SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fit into a single page %zu\n", name,
7845 desc_size, SPDK_BS_MAX_DESC_SIZE);
7846 return -ENOMEM;
7847 }
7848
7849 if (internal) {
7850 xattrs = &blob->xattrs_internal;
7851 blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
7852 } else {
7853 xattrs = &blob->xattrs;
7854 }
7855
7856 TAILQ_FOREACH(xattr, xattrs, link) {
7857 if (!strcmp(name, xattr->name)) {
7858 tmp = malloc(value_len);
7859 if (!tmp) {
7860 return -ENOMEM;
7861 }
7862
7863 free(xattr->value);
7864 xattr->value_len = value_len;
7865 xattr->value = tmp;
7866 memcpy(xattr->value, value, value_len);
7867
7868 blob->state = SPDK_BLOB_STATE_DIRTY;
7869
7870 return 0;
7871 }
7872 }
7873
7874 xattr = calloc(1, sizeof(*xattr));
7875 if (!xattr) {
7876 return -ENOMEM;
7877 }
7878
7879 xattr->name = strdup(name);
7880 if (!xattr->name) {
7881 free(xattr);
7882 return -ENOMEM;
7883 }
7884
7885 xattr->value_len = value_len;
7886 xattr->value = malloc(value_len);
7887 if (!xattr->value) {
7888 free(xattr->name);
7889 free(xattr);
7890 return -ENOMEM;
7891 }
7892 memcpy(xattr->value, value, value_len);
7893 TAILQ_INSERT_TAIL(xattrs, xattr, link);
7894
7895 blob->state = SPDK_BLOB_STATE_DIRTY;
7896
7897 return 0;
7898 }
7899
7900 int
7901 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
7902 uint16_t value_len)
7903 {
7904 return blob_set_xattr(blob, name, value, value_len, false);
7905 }
7906
7907 static int
7908 blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal)
7909 {
7910 struct spdk_xattr_tailq *xattrs;
7911 struct spdk_xattr *xattr;
7912
7913 blob_verify_md_op(blob);
7914
7915 if (blob->md_ro) {
7916 return -EPERM;
7917 }
7918 xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
7919
7920 TAILQ_FOREACH(xattr, xattrs, link) {
7921 if (!strcmp(name, xattr->name)) {
7922 TAILQ_REMOVE(xattrs, xattr, link);
7923 free(xattr->value);
7924 free(xattr->name);
7925 free(xattr);
7926
7927 if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) {
7928 blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR;
7929 }
7930 blob->state = SPDK_BLOB_STATE_DIRTY;
7931
7932 return 0;
7933 }
7934 }
7935
7936 return -ENOENT;
7937 }
7938
7939 int
7940 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name)
7941 {
7942 return blob_remove_xattr(blob, name, false);
7943 }
7944
7945 static int
7946 blob_get_xattr_value(struct spdk_blob *blob, const char *name,
7947 const void **value, size_t *value_len, bool internal)
7948 {
7949 struct spdk_xattr *xattr;
7950 struct spdk_xattr_tailq *xattrs;
7951
7952 xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
7953
7954 TAILQ_FOREACH(xattr, xattrs, link) {
7955 if (!strcmp(name, xattr->name)) {
7956 *value = xattr->value;
7957 *value_len = xattr->value_len;
7958 return 0;
7959 }
7960 }
7961 return -ENOENT;
7962 }
7963
7964 int
7965 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
7966 const void **value, size_t *value_len)
7967 {
7968 blob_verify_md_op(blob);
7969
7970 return blob_get_xattr_value(blob, name, value, value_len, false);
7971 }
7972
7973 struct spdk_xattr_names {
7974 uint32_t count;
7975 const char *names[0];
7976 };
7977
7978 static int
7979 blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names)
7980 {
7981 struct spdk_xattr *xattr;
7982 int count = 0;
7983
7984 TAILQ_FOREACH(xattr, xattrs, link) {
7985 count++;
7986 }
7987
7988 *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *));
7989 if (*names == NULL) {
7990 return -ENOMEM;
7991 }
7992
7993 TAILQ_FOREACH(xattr, xattrs, link) {
7994 (*names)->names[(*names)->count++] = xattr->name;
7995 }
7996
7997 return 0;
7998 }
7999
8000 int
8001 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names)
8002 {
8003 blob_verify_md_op(blob);
8004
8005 return blob_get_xattr_names(&blob->xattrs, names);
8006 }
8007
8008 uint32_t
8009 spdk_xattr_names_get_count(struct spdk_xattr_names *names)
8010 {
8011 assert(names != NULL);
8012
8013 return names->count;
8014 }
8015
8016 const char *
8017 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index)
8018 {
8019 if (index >= names->count) {
8020 return NULL;
8021 }
8022
8023 return names->names[index];
8024 }
8025
8026 void
8027 spdk_xattr_names_free(struct spdk_xattr_names *names)
8028 {
8029 free(names);
8030 }
8031
8032 struct spdk_bs_type
8033 spdk_bs_get_bstype(struct spdk_blob_store *bs)
8034 {
8035 return bs->bstype;
8036 }
8037
8038 void
8039 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype)
8040 {
8041 memcpy(&bs->bstype, &bstype, sizeof(bstype));
8042 }
8043
8044 bool
8045 spdk_blob_is_read_only(struct spdk_blob *blob)
8046 {
8047 assert(blob != NULL);
8048 return (blob->data_ro || blob->md_ro);
8049 }
8050
8051 bool
8052 spdk_blob_is_snapshot(struct spdk_blob *blob)
8053 {
8054 struct spdk_blob_list *snapshot_entry;
8055
8056 assert(blob != NULL);
8057
8058 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
8059 if (snapshot_entry == NULL) {
8060 return false;
8061 }
8062
8063 return true;
8064 }
8065
8066 bool
8067 spdk_blob_is_clone(struct spdk_blob *blob)
8068 {
8069 assert(blob != NULL);
8070
8071 if (blob->parent_id != SPDK_BLOBID_INVALID) {
8072 assert(spdk_blob_is_thin_provisioned(blob));
8073 return true;
8074 }
8075
8076 return false;
8077 }
8078
8079 bool
8080 spdk_blob_is_thin_provisioned(struct spdk_blob *blob)
8081 {
8082 assert(blob != NULL);
8083 return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV);
8084 }
8085
8086 static void
8087 blob_update_clear_method(struct spdk_blob *blob)
8088 {
8089 enum blob_clear_method stored_cm;
8090
8091 assert(blob != NULL);
8092
8093 /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored
8094 * in metadata previously. If something other than the default was
8095 * specified, ignore the stored value and use what was passed in.
8096 */
8097 stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT);
8098
8099 if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) {
8100 blob->clear_method = stored_cm;
8101 } else if (blob->clear_method != stored_cm) {
8102 SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n",
8103 blob->clear_method, stored_cm);
8104 }
8105 }
8106
8107 spdk_blob_id
8108 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id)
8109 {
8110 struct spdk_blob_list *snapshot_entry = NULL;
8111 struct spdk_blob_list *clone_entry = NULL;
8112
8113 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
8114 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
8115 if (clone_entry->id == blob_id) {
8116 return snapshot_entry->id;
8117 }
8118 }
8119 }
8120
8121 return SPDK_BLOBID_INVALID;
8122 }
8123
8124 int
8125 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids,
8126 size_t *count)
8127 {
8128 struct spdk_blob_list *snapshot_entry, *clone_entry;
8129 size_t n;
8130
8131 snapshot_entry = bs_get_snapshot_entry(bs, blobid);
8132 if (snapshot_entry == NULL) {
8133 *count = 0;
8134 return 0;
8135 }
8136
8137 if (ids == NULL || *count < snapshot_entry->clone_count) {
8138 *count = snapshot_entry->clone_count;
8139 return -ENOMEM;
8140 }
8141 *count = snapshot_entry->clone_count;
8142
8143 n = 0;
8144 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
8145 ids[n++] = clone_entry->id;
8146 }
8147
8148 return 0;
8149 }
8150
8151 static void
8152 bs_load_grow_continue(struct spdk_bs_load_ctx *ctx)
8153 {
8154 int rc;
8155
8156 if (ctx->super->size == 0) {
8157 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8158 }
8159
8160 if (ctx->super->io_unit_size == 0) {
8161 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
8162 }
8163
8164 /* Parse the super block */
8165 ctx->bs->clean = 1;
8166 ctx->bs->cluster_sz = ctx->super->cluster_size;
8167 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
8168 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE;
8169 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) {
8170 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster);
8171 }
8172 ctx->bs->io_unit_size = ctx->super->io_unit_size;
8173 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
8174 if (rc < 0) {
8175 bs_load_ctx_fail(ctx, -ENOMEM);
8176 return;
8177 }
8178 ctx->bs->md_start = ctx->super->md_start;
8179 ctx->bs->md_len = ctx->super->md_len;
8180 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len);
8181 if (rc < 0) {
8182 bs_load_ctx_fail(ctx, -ENOMEM);
8183 return;
8184 }
8185
8186 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
8187 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
8188 ctx->bs->super_blob = ctx->super->super_blob;
8189 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
8190
8191 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) {
8192 SPDK_ERRLOG("Cannot grow an unclean blobstore, please load it normally to clean it.\n");
8193 bs_load_ctx_fail(ctx, -EIO);
8194 return;
8195 } else {
8196 bs_load_read_used_pages(ctx);
8197 }
8198 }
8199
8200 static void
8201 bs_load_grow_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8202 {
8203 struct spdk_bs_load_ctx *ctx = cb_arg;
8204
8205 if (bserrno != 0) {
8206 bs_load_ctx_fail(ctx, bserrno);
8207 return;
8208 }
8209 bs_load_grow_continue(ctx);
8210 }
8211
8212 static void
8213 bs_load_grow_used_clusters_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8214 {
8215 struct spdk_bs_load_ctx *ctx = cb_arg;
8216
8217 if (bserrno != 0) {
8218 bs_load_ctx_fail(ctx, bserrno);
8219 return;
8220 }
8221
8222 spdk_free(ctx->mask);
8223
8224 bs_sequence_write_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
8225 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
8226 bs_load_grow_super_write_cpl, ctx);
8227 }
8228
8229 static void
8230 bs_load_grow_used_clusters_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8231 {
8232 struct spdk_bs_load_ctx *ctx = cb_arg;
8233 uint64_t lba, lba_count;
8234 uint64_t dev_size;
8235 uint64_t total_clusters;
8236
8237 if (bserrno != 0) {
8238 bs_load_ctx_fail(ctx, bserrno);
8239 return;
8240 }
8241
8242 /* The type must be correct */
8243 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
8244 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
8245 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
8246 struct spdk_blob_md_page) * 8));
8247 dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8248 total_clusters = dev_size / ctx->super->cluster_size;
8249 ctx->mask->length = total_clusters;
8250
8251 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
8252 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
8253 bs_sequence_write_dev(ctx->seq, ctx->mask, lba, lba_count,
8254 bs_load_grow_used_clusters_write_cpl, ctx);
8255 }
8256
8257 static void
8258 bs_load_try_to_grow(struct spdk_bs_load_ctx *ctx)
8259 {
8260 uint64_t dev_size, total_clusters, used_cluster_mask_len, max_used_cluster_mask;
8261 uint64_t lba, lba_count, mask_size;
8262
8263 dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8264 total_clusters = dev_size / ctx->super->cluster_size;
8265 used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
8266 spdk_divide_round_up(total_clusters, 8),
8267 SPDK_BS_PAGE_SIZE);
8268 max_used_cluster_mask = ctx->super->used_blobid_mask_start - ctx->super->used_cluster_mask_start;
8269 /* Not necessary to grow, or no space to grow */
8270 if (ctx->super->size >= dev_size || used_cluster_mask_len > max_used_cluster_mask) {
8271 SPDK_DEBUGLOG(blob, "No grow\n");
8272 bs_load_grow_continue(ctx);
8273 return;
8274 }
8275
8276 SPDK_DEBUGLOG(blob, "Resize blobstore\n");
8277
8278 ctx->super->size = dev_size;
8279 ctx->super->used_cluster_mask_len = used_cluster_mask_len;
8280 ctx->super->crc = blob_md_page_calc_crc(ctx->super);
8281
8282 mask_size = used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
8283 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
8284 SPDK_MALLOC_DMA);
8285 if (!ctx->mask) {
8286 bs_load_ctx_fail(ctx, -ENOMEM);
8287 return;
8288 }
8289 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
8290 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
8291 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
8292 bs_load_grow_used_clusters_read_cpl, ctx);
8293 }
8294
8295 static void
8296 bs_grow_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8297 {
8298 struct spdk_bs_load_ctx *ctx = cb_arg;
8299 uint32_t crc;
8300 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
8301
8302 if (ctx->super->version > SPDK_BS_VERSION ||
8303 ctx->super->version < SPDK_BS_INITIAL_VERSION) {
8304 bs_load_ctx_fail(ctx, -EILSEQ);
8305 return;
8306 }
8307
8308 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
8309 sizeof(ctx->super->signature)) != 0) {
8310 bs_load_ctx_fail(ctx, -EILSEQ);
8311 return;
8312 }
8313
8314 crc = blob_md_page_calc_crc(ctx->super);
8315 if (crc != ctx->super->crc) {
8316 bs_load_ctx_fail(ctx, -EILSEQ);
8317 return;
8318 }
8319
8320 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
8321 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n");
8322 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
8323 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless of bstype\n");
8324 } else {
8325 SPDK_DEBUGLOG(blob, "Unexpected bstype\n");
8326 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
8327 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
8328 bs_load_ctx_fail(ctx, -ENXIO);
8329 return;
8330 }
8331
8332 if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
8333 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
8334 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
8335 bs_load_ctx_fail(ctx, -EILSEQ);
8336 return;
8337 }
8338
8339 bs_load_try_to_grow(ctx);
8340
8341 }
8342
8343 void
8344 spdk_bs_grow(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
8345 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
8346 {
8347 struct spdk_blob_store *bs;
8348 struct spdk_bs_cpl cpl;
8349 struct spdk_bs_load_ctx *ctx;
8350 struct spdk_bs_opts opts = {};
8351 int err;
8352
8353 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);
8354
8355 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
8356 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
8357 dev->destroy(dev);
8358 cb_fn(cb_arg, NULL, -EINVAL);
8359 return;
8360 }
8361
8362 spdk_bs_opts_init(&opts, sizeof(opts));
8363 if (o) {
8364 if (bs_opts_copy(o, &opts)) {
8365 return;
8366 }
8367 }
8368
8369 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
8370 dev->destroy(dev);
8371 cb_fn(cb_arg, NULL, -EINVAL);
8372 return;
8373 }
8374
8375 err = bs_alloc(dev, &opts, &bs, &ctx);
8376 if (err) {
8377 dev->destroy(dev);
8378 cb_fn(cb_arg, NULL, err);
8379 return;
8380 }
8381
8382 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
8383 cpl.u.bs_handle.cb_fn = cb_fn;
8384 cpl.u.bs_handle.cb_arg = cb_arg;
8385 cpl.u.bs_handle.bs = bs;
8386
8387 ctx->seq = bs_sequence_start(bs->md_channel, &cpl);
8388 if (!ctx->seq) {
8389 spdk_free(ctx->super);
8390 free(ctx);
8391 bs_free(bs);
8392 cb_fn(cb_arg, NULL, -ENOMEM);
8393 return;
8394 }
8395
8396 /* Read the super block */
8397 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
8398 bs_byte_to_lba(bs, sizeof(*ctx->super)),
8399 bs_grow_load_super_cpl, ctx);
8400 }
8401
8402 SPDK_LOG_REGISTER_COMPONENT(blob)
8403
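/*
 * Editor's usage sketch (not part of upstream blob.c): how a caller is expected to drive the
 * public xattr API implemented above. The blob must already be open and the calls must run on
 * the blobstore's metadata thread. The function name "example_tag_blob" and the literal
 * key/value strings are illustrative assumptions, and the whole block is guarded by a
 * hypothetical macro so it is never compiled into the library.
 */
#ifdef BLOB_XATTR_USAGE_EXAMPLE
static void
example_tag_blob(struct spdk_blob *blob)
{
	const char *key = "name";
	const char *val = "my-volume";
	const void *stored;
	size_t stored_len;
	struct spdk_xattr_names *names;
	uint32_t i;

	/* Store (or overwrite) a key/value pair. The blob is only marked dirty here;
	 * the change reaches the device on the next spdk_blob_sync_md()/spdk_blob_close(). */
	if (spdk_blob_set_xattr(blob, key, val, strlen(val) + 1) != 0) {
		return;
	}

	/* Read the value back; the returned pointer aliases the blob's in-memory copy,
	 * so it must not be freed or used after the xattr is removed. */
	if (spdk_blob_get_xattr_value(blob, key, &stored, &stored_len) == 0) {
		SPDK_NOTICELOG("xattr %s=%s (%zu bytes)\n", key, (const char *)stored, stored_len);
	}

	/* Enumerate all user (non-internal) xattr names and release the name list. */
	if (spdk_blob_get_xattr_names(blob, &names) == 0) {
		for (i = 0; i < spdk_xattr_names_get_count(names); i++) {
			SPDK_NOTICELOG("xattr name: %s\n", spdk_xattr_names_get_name(names, i));
		}
		spdk_xattr_names_free(names);
	}
}
#endif /* BLOB_XATTR_USAGE_EXAMPLE */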
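/*
 * Editor's usage sketch (not part of upstream blob.c): iterating over every blob in a
 * blobstore with spdk_bs_iter_first()/spdk_bs_iter_next(). Each callback invocation receives
 * an open blob; spdk_bs_iter_next() closes it internally before opening the next one, so the
 * callback must not close the blob itself. -ENOENT signals the end of iteration. The function
 * names are illustrative assumptions and the block is guarded by a hypothetical macro.
 */
#ifdef BLOB_ITER_USAGE_EXAMPLE
static void
example_iter_cb(void *cb_arg, struct spdk_blob *blob, int bserrno)
{
	struct spdk_blob_store *bs = cb_arg;

	if (bserrno == -ENOENT) {
		/* No more blobs in the blobstore. */
		return;
	} else if (bserrno != 0) {
		SPDK_ERRLOG("blob iteration failed: %d\n", bserrno);
		return;
	}

	SPDK_NOTICELOG("visited blob 0x%" PRIx64 "\n", spdk_blob_get_id(blob));

	/* Hand the open blob back to the iterator; it is closed before the next open. */
	spdk_bs_iter_next(bs, blob, example_iter_cb, bs);
}

static void
example_iterate_all_blobs(struct spdk_blob_store *bs)
{
	spdk_bs_iter_first(bs, example_iter_cb, bs);
}
#endif /* BLOB_ITER_USAGE_EXAMPLE */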
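/*
 * Editor's usage sketch (not part of upstream blob.c): growing a clean blobstore onto a
 * resized device with spdk_bs_grow(). The bs_dev passed in must already reflect the new,
 * larger device size; on success the callback receives the loaded (and possibly grown)
 * blobstore handle. The function names are illustrative assumptions and the block is guarded
 * by a hypothetical macro.
 */
#ifdef BS_GROW_USAGE_EXAMPLE
static void
example_grow_done(void *cb_arg, struct spdk_blob_store *bs, int bserrno)
{
	if (bserrno != 0) {
		SPDK_ERRLOG("blobstore grow/load failed: %d\n", bserrno);
		return;
	}

	SPDK_NOTICELOG("blobstore loaded, %" PRIu64 " free clusters\n",
		       spdk_bs_free_cluster_count(bs));
}

static void
example_grow_blobstore(struct spdk_bs_dev *dev)
{
	/* NULL opts means the defaults from spdk_bs_opts_init() are used. */
	spdk_bs_grow(dev, NULL, example_grow_done, NULL);
}
#endif /* BS_GROW_USAGE_EXAMPLE */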