/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "spdk/blob.h"
#include "spdk/crc32.h"
#include "spdk/env.h"
#include "spdk/queue.h"
#include "spdk/thread.h"
#include "spdk/bit_array.h"
#include "spdk/bit_pool.h"
#include "spdk/likely.h"
#include "spdk/util.h"
#include "spdk/string.h"

#include "spdk_internal/assert.h"
#include "spdk/log.h"

#include "blobstore.h"

#define BLOB_CRC32C_INITIAL	0xffffffffUL

static int bs_register_md_thread(struct spdk_blob_store *bs);
static int bs_unregister_md_thread(struct spdk_blob_store *bs);
static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
		uint64_t cluster, uint32_t extent, spdk_blob_op_complete cb_fn, void *cb_arg);

static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
			  uint16_t value_len, bool internal);
static int blob_get_xattr_value(struct spdk_blob *blob, const char *name,
				const void **value, size_t *value_len, bool internal);
static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal);

static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
				   spdk_blob_op_complete cb_fn, void *cb_arg);
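/*
 * Illustrative note (not part of the blobstore sources): the prototypes above follow the
 * blobstore's asynchronous convention - an operation takes an spdk_blob_op_complete callback
 * plus a caller context pointer and reports its result through that callback. A minimal
 * caller-side sketch, with hypothetical names:
 *
 *	static void
 *	my_insert_done(void *cb_arg, int bserrno)
 *	{
 *		// bserrno is 0 on success, a negative errno otherwise
 *	}
 *	// blob_insert_cluster_on_md_thread(blob, cluster_num, cluster, extent, my_insert_done, my_ctx);
 */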
static int
blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2)
{
	return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id);
}

RB_GENERATE_STATIC(spdk_blob_tree, spdk_blob, link, blob_id_cmp);

static void
blob_verify_md_op(struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(spdk_get_thread() == blob->bs->md_thread);
	assert(blob->state != SPDK_BLOB_STATE_LOADING);
}

static struct spdk_blob_list *
bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid)
{
	struct spdk_blob_list *snapshot_entry = NULL;

	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
		if (snapshot_entry->id == blobid) {
			break;
		}
	}

	return snapshot_entry;
}

static void
bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page)
{
	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
	assert(spdk_bit_array_get(bs->used_md_pages, page) == false);

	spdk_bit_array_set(bs->used_md_pages, page);
}

static void
bs_release_md_page(struct spdk_blob_store *bs, uint32_t page)
{
	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
	assert(spdk_bit_array_get(bs->used_md_pages, page) == true);

	spdk_bit_array_clear(bs->used_md_pages, page);
}

static uint32_t
bs_claim_cluster(struct spdk_blob_store *bs)
{
	uint32_t cluster_num;

	cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters);
	if (cluster_num == UINT32_MAX) {
		return UINT32_MAX;
	}

	SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num);
	bs->num_free_clusters--;

	return cluster_num;
}

static void
bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
{
	assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters));
	assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true);
	assert(bs->num_free_clusters < bs->total_clusters);

	SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num);

	spdk_bit_pool_free_bit(bs->used_clusters, cluster_num);
	bs->num_free_clusters++;
}

static int
blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster)
{
	uint64_t *cluster_lba = &blob->active.clusters[cluster_num];

	blob_verify_md_op(blob);

	if (*cluster_lba != 0) {
		return -EEXIST;
	}

	*cluster_lba = bs_cluster_to_lba(blob->bs, cluster);
	return 0;
}
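/*
 * Illustrative note (not part of the blobstore sources): bs_claim_cluster() and
 * bs_release_cluster() must stay paired so that num_free_clusters always mirrors the bit pool.
 * A minimal sketch of the expected usage on the metadata thread:
 *
 *	uint32_t c = bs_claim_cluster(bs);
 *	if (c == UINT32_MAX) {
 *		return -ENOSPC;	// pool exhausted
 *	}
 *	// ... on any later failure path, give the cluster back:
 *	bs_release_cluster(bs, c);
 */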
static int
bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num,
		    uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map)
{
	uint32_t *extent_page = 0;

	*cluster = bs_claim_cluster(blob->bs);
	if (*cluster == UINT32_MAX) {
		/* No more free clusters. Cannot satisfy the request */
		return -ENOSPC;
	}

	if (blob->use_extent_table) {
		extent_page = bs_cluster_to_extent_page(blob, cluster_num);
		if (*extent_page == 0) {
			/* Extent page shall never occupy md_page so start the search from 1 */
			if (*lowest_free_md_page == 0) {
				*lowest_free_md_page = 1;
			}
			/* No extent_page is allocated for the cluster */
			*lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages,
					       *lowest_free_md_page);
			if (*lowest_free_md_page == UINT32_MAX) {
				/* No more free md pages. Cannot satisfy the request */
				bs_release_cluster(blob->bs, *cluster);
				return -ENOSPC;
			}
			bs_claim_md_page(blob->bs, *lowest_free_md_page);
		}
	}

	SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob %" PRIu64 "\n", *cluster, blob->id);

	if (update_map) {
		blob_insert_cluster(blob, cluster_num, *cluster);
		if (blob->use_extent_table && *extent_page == 0) {
			*extent_page = *lowest_free_md_page;
		}
	}

	return 0;
}

static void
blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs)
{
	xattrs->count = 0;
	xattrs->names = NULL;
	xattrs->ctx = NULL;
	xattrs->get_value = NULL;
}

void
spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	memset(opts, 0, opts_size);
	opts->opts_size = opts_size;

#define FIELD_OK(field) \
	offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size

#define SET_FIELD(field, value) \
	if (FIELD_OK(field)) { \
		opts->field = value; \
	} \

	SET_FIELD(num_clusters, 0);
	SET_FIELD(thin_provision, false);
	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);

	if (FIELD_OK(xattrs)) {
		blob_xattrs_init(&opts->xattrs);
	}

	SET_FIELD(use_extent_table, true);

#undef FIELD_OK
#undef SET_FIELD
}

void
spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	memset(opts, 0, opts_size);
	opts->opts_size = opts_size;

#define FIELD_OK(field) \
	offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size

#define SET_FIELD(field, value) \
	if (FIELD_OK(field)) { \
		opts->field = value; \
	} \

	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);

#undef FIELD_OK
#undef SET_FIELD
}

static struct spdk_blob *
blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
{
	struct spdk_blob *blob;

	blob = calloc(1, sizeof(*blob));
	if (!blob) {
		return NULL;
	}

	blob->id = id;
	blob->bs = bs;

	blob->parent_id = SPDK_BLOBID_INVALID;

	blob->state = SPDK_BLOB_STATE_DIRTY;
	blob->extent_rle_found = false;
	blob->extent_table_found = false;
	blob->active.num_pages = 1;
	blob->active.pages = calloc(1, sizeof(*blob->active.pages));
	if (!blob->active.pages) {
		free(blob);
		return NULL;
	}

	blob->active.pages[0] = bs_blobid_to_page(id);

	TAILQ_INIT(&blob->xattrs);
	TAILQ_INIT(&blob->xattrs_internal);
	TAILQ_INIT(&blob->pending_persists);
	TAILQ_INIT(&blob->persists_to_complete);

	return blob;
}
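/*
 * Illustrative note (not part of the blobstore sources): the FIELD_OK()/SET_FIELD() pattern
 * above only touches fields that fit inside the opts_size the caller passed in, which is what
 * keeps older callers compatible when new fields are appended to the opts structures. A
 * minimal caller-side sketch:
 *
 *	struct spdk_blob_opts opts;
 *
 *	spdk_blob_opts_init(&opts, sizeof(opts));
 *	opts.num_clusters = 10;
 *	opts.thin_provision = true;
 *	// spdk_bs_create_blob_ext(bs, &opts, create_done_cb, cb_arg);
 */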
static void
xattrs_free(struct spdk_xattr_tailq *xattrs)
{
	struct spdk_xattr *xattr, *xattr_tmp;

	TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
		TAILQ_REMOVE(xattrs, xattr, link);
		free(xattr->name);
		free(xattr->value);
		free(xattr);
	}
}

static void
blob_free(struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(TAILQ_EMPTY(&blob->pending_persists));
	assert(TAILQ_EMPTY(&blob->persists_to_complete));

	free(blob->active.extent_pages);
	free(blob->clean.extent_pages);
	free(blob->active.clusters);
	free(blob->clean.clusters);
	free(blob->active.pages);
	free(blob->clean.pages);

	xattrs_free(&blob->xattrs);
	xattrs_free(&blob->xattrs_internal);

	if (blob->back_bs_dev) {
		blob->back_bs_dev->destroy(blob->back_bs_dev);
	}

	free(blob);
}

struct freeze_io_ctx {
	struct spdk_bs_cpl cpl;
	struct spdk_blob *blob;
};

static void
blob_io_sync(struct spdk_io_channel_iter *i)
{
	spdk_for_each_channel_continue(i, 0);
}

static void
blob_execute_queued_io(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bs_request_set *set;
	struct spdk_bs_user_op_args *args;
	spdk_bs_user_op_t *op, *tmp;

	TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
		set = (struct spdk_bs_request_set *)op;
		args = &set->u.user_op;

		if (args->blob == ctx->blob) {
			TAILQ_REMOVE(&ch->queued_io, op, link);
			bs_user_op_execute(op);
		}
	}

	spdk_for_each_channel_continue(i, 0);
}

static void
blob_io_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);

	free(ctx);
}

static void
blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct freeze_io_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
	ctx->blob = blob;

	/* Freeze I/O on blob */
	blob->frozen_refcnt++;

	if (blob->frozen_refcnt == 1) {
		spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl);
	} else {
		cb_fn(cb_arg, 0);
		free(ctx);
	}
}

static void
blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct freeze_io_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
	ctx->blob = blob;

	assert(blob->frozen_refcnt > 0);

	blob->frozen_refcnt--;

	if (blob->frozen_refcnt == 0) {
		spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl);
	} else {
		cb_fn(cb_arg, 0);
		free(ctx);
	}
}
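/*
 * Illustrative note (not part of the blobstore sources): blob_freeze_io()/blob_unfreeze_io()
 * are reference counted, so nested freezes are cheap - only the 0 -> 1 transition walks the
 * I/O channels, and only the 1 -> 0 transition replays queued I/O. A minimal sketch of the
 * intended pairing, with hypothetical callback names:
 *
 *	blob_freeze_io(blob, start_mutation_cb, ctx);	// first freeze syncs all channels
 *	// ... mutate blob state while user I/O is queued per channel ...
 *	blob_unfreeze_io(blob, mutation_done_cb, ctx);	// last unfreeze re-executes queued I/O
 */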
static int
blob_mark_clean(struct spdk_blob *blob)
{
	uint32_t *extent_pages = NULL;
	uint64_t *clusters = NULL;
	uint32_t *pages = NULL;

	assert(blob != NULL);

	if (blob->active.num_extent_pages) {
		assert(blob->active.extent_pages);
		extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages));
		if (!extent_pages) {
			return -ENOMEM;
		}
		memcpy(extent_pages, blob->active.extent_pages,
		       blob->active.num_extent_pages * sizeof(*extent_pages));
	}

	if (blob->active.num_clusters) {
		assert(blob->active.clusters);
		clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters));
		if (!clusters) {
			free(extent_pages);
			return -ENOMEM;
		}
		memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
	}

	if (blob->active.num_pages) {
		assert(blob->active.pages);
		pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages));
		if (!pages) {
			free(extent_pages);
			free(clusters);
			return -ENOMEM;
		}
		memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
	}

	free(blob->clean.extent_pages);
	free(blob->clean.clusters);
	free(blob->clean.pages);

	blob->clean.num_extent_pages = blob->active.num_extent_pages;
	blob->clean.extent_pages = blob->active.extent_pages;
	blob->clean.num_clusters = blob->active.num_clusters;
	blob->clean.clusters = blob->active.clusters;
	blob->clean.num_pages = blob->active.num_pages;
	blob->clean.pages = blob->active.pages;

	blob->active.extent_pages = extent_pages;
	blob->active.clusters = clusters;
	blob->active.pages = pages;

	/* If the metadata was dirtied again while the metadata was being written to disk,
	 * we do not want to revert the DIRTY state back to CLEAN here.
	 */
	if (blob->state == SPDK_BLOB_STATE_LOADING) {
		blob->state = SPDK_BLOB_STATE_CLEAN;
	}

	return 0;
}
static int
blob_deserialize_xattr(struct spdk_blob *blob,
		       struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal)
{
	struct spdk_xattr *xattr;

	if (desc_xattr->length != sizeof(desc_xattr->name_length) +
	    sizeof(desc_xattr->value_length) +
	    desc_xattr->name_length + desc_xattr->value_length) {
		return -EINVAL;
	}

	xattr = calloc(1, sizeof(*xattr));
	if (xattr == NULL) {
		return -ENOMEM;
	}

	xattr->name = malloc(desc_xattr->name_length + 1);
	if (xattr->name == NULL) {
		free(xattr);
		return -ENOMEM;
	}
	memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length);
	xattr->name[desc_xattr->name_length] = '\0';

	xattr->value = malloc(desc_xattr->value_length);
	if (xattr->value == NULL) {
		free(xattr->name);
		free(xattr);
		return -ENOMEM;
	}
	xattr->value_len = desc_xattr->value_length;
	memcpy(xattr->value,
	       (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
	       desc_xattr->value_length);

	TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link);

	return 0;
}
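/*
 * Illustrative note (not part of the blobstore sources): an xattr descriptor is laid out as a
 * fixed header followed by the name bytes and then the value bytes, which is exactly what the
 * length check above validates. Roughly:
 *
 *	| name_length | value_length | name[name_length] | value[value_length] |
 *
 * so desc->length must equal sizeof(name_length) + sizeof(value_length) + name_length + value_length.
 */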
static int
blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob)
{
	struct spdk_blob_md_descriptor *desc;
	size_t cur_desc = 0;
	void *tmp;

	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
	while (cur_desc < sizeof(page->descriptors)) {
		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
			if (desc->length == 0) {
				/* If padding and length are 0, this terminates the page */
				break;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
			struct spdk_blob_md_descriptor_flags *desc_flags;

			desc_flags = (struct spdk_blob_md_descriptor_flags *)desc;

			if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) {
				return -EINVAL;
			}

			if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) !=
			    SPDK_BLOB_INVALID_FLAGS_MASK) {
				return -EINVAL;
			}

			if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) !=
			    SPDK_BLOB_DATA_RO_FLAGS_MASK) {
				blob->data_ro = true;
				blob->md_ro = true;
			}

			if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) !=
			    SPDK_BLOB_MD_RO_FLAGS_MASK) {
				blob->md_ro = true;
			}

			if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
				blob->data_ro = true;
				blob->md_ro = true;
			}

			blob->invalid_flags = desc_flags->invalid_flags;
			blob->data_ro_flags = desc_flags->data_ro_flags;
			blob->md_ro_flags = desc_flags->md_ro_flags;

		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
			struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
			unsigned int i, j;
			unsigned int cluster_count = blob->active.num_clusters;

			if (blob->extent_table_found) {
				/* Extent Table is already present in the md;
				 * both descriptors should never be present at the same time.
				 */
				return -EINVAL;
			}
			blob->extent_rle_found = true;

			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;

			if (desc_extent_rle->length == 0 ||
			    (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) {
				return -EINVAL;
			}

			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
					if (desc_extent_rle->extents[i].cluster_idx != 0) {
						if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters,
										desc_extent_rle->extents[i].cluster_idx + j)) {
							return -EINVAL;
						}
					}
					cluster_count++;
				}
			}

			if (cluster_count == 0) {
				return -EINVAL;
			}
			tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				return -ENOMEM;
			}
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = cluster_count;

			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
					if (desc_extent_rle->extents[i].cluster_idx != 0) {
						blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
								desc_extent_rle->extents[i].cluster_idx + j);
					} else if (spdk_blob_is_thin_provisioned(blob)) {
						blob->active.clusters[blob->active.num_clusters++] = 0;
					} else {
						return -EINVAL;
					}
				}
			}
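			/*
			 * Illustrative note (not part of the blobstore sources): each RLE extent
			 * above expands into `length` consecutive clusters. For example, assuming
			 * a descriptor with extents {cluster_idx = 5, length = 3},
			 * {cluster_idx = 0, length = 2}, the loop appends the LBAs of clusters
			 * 5, 6 and 7, followed by two zero (unallocated) entries for a
			 * thin-provisioned blob.
			 */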
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
			struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
			uint32_t num_extent_pages = blob->active.num_extent_pages;
			uint32_t i, j;
			size_t extent_pages_length;

			desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
			extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);

			if (blob->extent_rle_found) {
				/* This means that Extent RLE is present in the md;
				 * both descriptors should never be present at the same time. */
				return -EINVAL;
			} else if (blob->extent_table_found &&
				   desc_extent_table->num_clusters != blob->remaining_clusters_in_et) {
				/* Number of clusters in this ET does not match the number
				 * from the previously read EXTENT_TABLE. */
				return -EINVAL;
			}

			if (desc_extent_table->length == 0 ||
			    (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
				return -EINVAL;
			}

			blob->extent_table_found = true;

			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
				num_extent_pages += desc_extent_table->extent_page[i].num_pages;
			}

			if (num_extent_pages > 0) {
				tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t));
				if (tmp == NULL) {
					return -ENOMEM;
				}
				blob->active.extent_pages = tmp;
			}
			blob->active.extent_pages_array_size = num_extent_pages;

			blob->remaining_clusters_in_et = desc_extent_table->num_clusters;

			/* Extent table entries contain md page numbers for extent pages.
			 * Zeroes represent unallocated extent pages, those are run-length-encoded.
			 */
			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
				if (desc_extent_table->extent_page[i].page_idx != 0) {
					assert(desc_extent_table->extent_page[i].num_pages == 1);
					blob->active.extent_pages[blob->active.num_extent_pages++] =
						desc_extent_table->extent_page[i].page_idx;
				} else if (spdk_blob_is_thin_provisioned(blob)) {
					for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) {
						blob->active.extent_pages[blob->active.num_extent_pages++] = 0;
					}
				} else {
					return -EINVAL;
				}
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
			struct spdk_blob_md_descriptor_extent_page *desc_extent;
			unsigned int i;
			unsigned int cluster_count = 0;
			size_t cluster_idx_length;

			if (blob->extent_rle_found) {
				/* This means that Extent RLE is present in the md;
				 * both descriptors should never be present at the same time. */
				return -EINVAL;
			}

			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
			cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);

			if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
			    (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
				return -EINVAL;
			}

			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
				if (desc_extent->cluster_idx[i] != 0) {
					if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) {
						return -EINVAL;
					}
				}
				cluster_count++;
			}

			if (cluster_count == 0) {
				return -EINVAL;
			}

			/* When reading extent pages sequentially, the starting cluster idx should match
			 * the current size of the blob.
			 * If changed to batch reading, this check shall be removed. */
			if (desc_extent->start_cluster_idx != blob->active.num_clusters) {
				return -EINVAL;
			}

			tmp = realloc(blob->active.clusters,
				      (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				return -ENOMEM;
			}
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters);

			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
				if (desc_extent->cluster_idx[i] != 0) {
					blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
							desc_extent->cluster_idx[i]);
				} else if (spdk_blob_is_thin_provisioned(blob)) {
					blob->active.clusters[blob->active.num_clusters++] = 0;
				} else {
					return -EINVAL;
				}
			}
			assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters);
			assert(blob->remaining_clusters_in_et >= cluster_count);
			blob->remaining_clusters_in_et -= cluster_count;
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
			int rc;

			rc = blob_deserialize_xattr(blob,
						    (struct spdk_blob_md_descriptor_xattr *) desc, false);
			if (rc != 0) {
				return rc;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
			int rc;

			rc = blob_deserialize_xattr(blob,
						    (struct spdk_blob_md_descriptor_xattr *) desc, true);
			if (rc != 0) {
				return rc;
			}
		} else {
			/* Unrecognized descriptor type. Do not fail - just continue to the
			 * next descriptor. If this descriptor is associated with some feature
			 * defined in a newer version of blobstore, that version of blobstore
			 * should create and set an associated feature flag to specify if this
			 * blob can be loaded or not.
			 */
		}

		/* Advance to the next descriptor */
		cur_desc += sizeof(*desc) + desc->length;
		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
			break;
		}
		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
	}

	return 0;
}
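/*
 * Illustrative note (not part of the blobstore sources): blob_parse_page() walks a metadata
 * page as a sequence of variable-length descriptors, each starting with a type and a length,
 * and stops at a zero-length PADDING descriptor. Conceptually a page body looks like:
 *
 *	[FLAGS][XATTR "name"][EXTENT_TABLE ...][PADDING length=0 ... unused space ...]
 *
 * and the cursor always advances by sizeof(struct spdk_blob_md_descriptor) + desc->length.
 */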
static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page);

static int
blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob)
{
	assert(blob != NULL);
	assert(blob->state == SPDK_BLOB_STATE_LOADING);

	if (bs_load_cur_extent_page_valid(extent_page) == false) {
		return -ENOENT;
	}

	return blob_parse_page(extent_page, blob);
}

static int
blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
	   struct spdk_blob *blob)
{
	const struct spdk_blob_md_page *page;
	uint32_t i;
	int rc;
	void *tmp;

	assert(page_count > 0);
	assert(pages[0].sequence_num == 0);
	assert(blob != NULL);
	assert(blob->state == SPDK_BLOB_STATE_LOADING);
	assert(blob->active.clusters == NULL);

	/* The blobid provided doesn't match what's in the MD, this can
	 * happen for example if a bogus blobid is passed in through open.
	 */
	if (blob->id != pages[0].id) {
		SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n",
			    blob->id, pages[0].id);
		return -ENOENT;
	}

	tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages));
	if (!tmp) {
		return -ENOMEM;
	}
	blob->active.pages = tmp;

	blob->active.pages[0] = pages[0].id;

	for (i = 1; i < page_count; i++) {
		assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next));
		blob->active.pages[i] = pages[i - 1].next;
	}
	blob->active.num_pages = page_count;

	for (i = 0; i < page_count; i++) {
		page = &pages[i];

		assert(page->id == blob->id);
		assert(page->sequence_num == i);

		rc = blob_parse_page(page, blob);
		if (rc != 0) {
			return rc;
		}
	}

	return 0;
}

static int
blob_serialize_add_page(const struct spdk_blob *blob,
			struct spdk_blob_md_page **pages,
			uint32_t *page_count,
			struct spdk_blob_md_page **last_page)
{
	struct spdk_blob_md_page *page, *tmp_pages;

	assert(pages != NULL);
	assert(page_count != NULL);

	*last_page = NULL;
	if (*page_count == 0) {
		assert(*pages == NULL);
		*pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0,
				     NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		if (*pages == NULL) {
			return -ENOMEM;
		}
		*page_count = 1;
	} else {
		assert(*pages != NULL);
		tmp_pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count + 1), 0);
		if (tmp_pages == NULL) {
			return -ENOMEM;
		}
		(*page_count)++;
		*pages = tmp_pages;
	}

	page = &(*pages)[*page_count - 1];
	memset(page, 0, sizeof(*page));
	page->id = blob->id;
	page->sequence_num = *page_count - 1;
	page->next = SPDK_INVALID_MD_PAGE;
	*last_page = page;

	return 0;
}
/* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor.
 * Update required_sz on both success and failure.
 */
static int
blob_serialize_xattr(const struct spdk_xattr *xattr,
		     uint8_t *buf, size_t buf_sz,
		     size_t *required_sz, bool internal)
{
	struct spdk_blob_md_descriptor_xattr *desc;

	*required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) +
		       strlen(xattr->name) +
		       xattr->value_len;

	if (buf_sz < *required_sz) {
		return -1;
	}

	desc = (struct spdk_blob_md_descriptor_xattr *)buf;

	desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR;
	desc->length = sizeof(desc->name_length) +
		       sizeof(desc->value_length) +
		       strlen(xattr->name) +
		       xattr->value_len;
	desc->name_length = strlen(xattr->name);
	desc->value_length = xattr->value_len;

	memcpy(desc->name, xattr->name, desc->name_length);
	memcpy((void *)((uintptr_t)desc->name + desc->name_length),
	       xattr->value,
	       desc->value_length);

	return 0;
}

static void
blob_serialize_extent_table_entry(const struct spdk_blob *blob,
				  uint64_t start_ep, uint64_t *next_ep,
				  uint8_t **buf, size_t *remaining_sz)
{
	struct spdk_blob_md_descriptor_extent_table *desc;
	size_t cur_sz;
	uint64_t i, et_idx;
	uint32_t extent_page, ep_len;

	/* The buffer must have room for at least the num_clusters entry */
	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters);
	if (*remaining_sz < cur_sz) {
		*next_ep = start_ep;
		return;
	}

	desc = (struct spdk_blob_md_descriptor_extent_table *)*buf;
	desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE;

	desc->num_clusters = blob->active.num_clusters;

	ep_len = 1;
	et_idx = 0;
	for (i = start_ep; i < blob->active.num_extent_pages; i++) {
		if (*remaining_sz < cur_sz + sizeof(desc->extent_page[0])) {
			/* If we ran out of buffer space, return */
			break;
		}

		extent_page = blob->active.extent_pages[i];
		/* Verify that next extent_page is unallocated */
		if (extent_page == 0 &&
		    (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) {
			ep_len++;
			continue;
		}
		desc->extent_page[et_idx].page_idx = extent_page;
		desc->extent_page[et_idx].num_pages = ep_len;
		et_idx++;

		ep_len = 1;
		cur_sz += sizeof(desc->extent_page[et_idx]);
	}
	*next_ep = i;

	desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx;
	*remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length;
	*buf += sizeof(struct spdk_blob_md_descriptor) + desc->length;
}
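/*
 * Illustrative note (not part of the blobstore sources): only runs of unallocated (zero)
 * extent pages are run-length encoded by the loop above; allocated extent pages are always
 * emitted individually with num_pages == 1. For example, assuming an extent page array of
 * { 17, 0, 0, 0, 42 }, the descriptor would carry the entries
 * { page_idx = 17, num_pages = 1 }, { page_idx = 0, num_pages = 3 }, { page_idx = 42, num_pages = 1 }.
 */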
static int
blob_serialize_extent_table(const struct spdk_blob *blob,
			    struct spdk_blob_md_page **pages,
			    struct spdk_blob_md_page *cur_page,
			    uint32_t *page_count, uint8_t **buf,
			    size_t *remaining_sz)
{
	uint64_t last_extent_page;
	int rc;

	last_extent_page = 0;
	/* At least one extent table entry always has to be persisted.
	 * Such a case occurs when num_extent_pages == 0. */
	while (last_extent_page <= blob->active.num_extent_pages) {
		blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf,
						  remaining_sz);

		if (last_extent_page == blob->active.num_extent_pages) {
			break;
		}

		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
		if (rc < 0) {
			return rc;
		}

		*buf = (uint8_t *)cur_page->descriptors;
		*remaining_sz = sizeof(cur_page->descriptors);
	}

	return 0;
}

static void
blob_serialize_extent_rle(const struct spdk_blob *blob,
			  uint64_t start_cluster, uint64_t *next_cluster,
			  uint8_t **buf, size_t *buf_sz)
{
	struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
	size_t cur_sz;
	uint64_t i, extent_idx;
	uint64_t lba, lba_per_cluster, lba_count;

	/* The buffer must have room for at least one extent */
	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]);
	if (*buf_sz < cur_sz) {
		*next_cluster = start_cluster;
		return;
	}

	desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf;
	desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE;

	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);

	lba = blob->active.clusters[start_cluster];
	lba_count = lba_per_cluster;
	extent_idx = 0;
	for (i = start_cluster + 1; i < blob->active.num_clusters; i++) {
		if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) {
			/* Run-length encode sequential non-zero LBA */
			lba_count += lba_per_cluster;
			continue;
		} else if (lba == 0 && blob->active.clusters[i] == 0) {
			/* Run-length encode unallocated clusters */
			lba_count += lba_per_cluster;
			continue;
		}
		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
		extent_idx++;

		cur_sz += sizeof(desc_extent_rle->extents[extent_idx]);

		if (*buf_sz < cur_sz) {
			/* If we ran out of buffer space, return */
			*next_cluster = i;
			break;
		}

		lba = blob->active.clusters[i];
		lba_count = lba_per_cluster;
	}

	if (*buf_sz >= cur_sz) {
		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
		extent_idx++;

		*next_cluster = blob->active.num_clusters;
	}

	desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx;
	*buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
	*buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
}
static int
blob_serialize_extents_rle(const struct spdk_blob *blob,
			   struct spdk_blob_md_page **pages,
			   struct spdk_blob_md_page *cur_page,
			   uint32_t *page_count, uint8_t **buf,
			   size_t *remaining_sz)
{
	uint64_t last_cluster;
	int rc;

	last_cluster = 0;
	while (last_cluster < blob->active.num_clusters) {
		blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz);

		if (last_cluster == blob->active.num_clusters) {
			break;
		}

		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
		if (rc < 0) {
			return rc;
		}

		*buf = (uint8_t *)cur_page->descriptors;
		*remaining_sz = sizeof(cur_page->descriptors);
	}

	return 0;
}

static void
blob_serialize_extent_page(const struct spdk_blob *blob,
			   uint64_t cluster, struct spdk_blob_md_page *page)
{
	struct spdk_blob_md_descriptor_extent_page *desc_extent;
	uint64_t i, extent_idx;
	uint64_t lba, lba_per_cluster;
	uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;

	desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors;
	desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE;

	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);

	desc_extent->start_cluster_idx = start_cluster_idx;
	extent_idx = 0;
	for (i = start_cluster_idx; i < blob->active.num_clusters; i++) {
		lba = blob->active.clusters[i];
		desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster;
		if (extent_idx >= SPDK_EXTENTS_PER_EP) {
			break;
		}
	}
	desc_extent->length = sizeof(desc_extent->start_cluster_idx) +
			      sizeof(desc_extent->cluster_idx[0]) * extent_idx;
}

static void
blob_serialize_flags(const struct spdk_blob *blob,
		     uint8_t *buf, size_t *buf_sz)
{
	struct spdk_blob_md_descriptor_flags *desc;

	/*
	 * Flags get serialized first, so we should always have room for the flags
	 * descriptor.
	 */
	assert(*buf_sz >= sizeof(*desc));

	desc = (struct spdk_blob_md_descriptor_flags *)buf;
	desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS;
	desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor);
	desc->invalid_flags = blob->invalid_flags;
	desc->data_ro_flags = blob->data_ro_flags;
	desc->md_ro_flags = blob->md_ro_flags;

	*buf_sz -= sizeof(*desc);
}

static int
blob_serialize_xattrs(const struct spdk_blob *blob,
		      const struct spdk_xattr_tailq *xattrs, bool internal,
		      struct spdk_blob_md_page **pages,
		      struct spdk_blob_md_page *cur_page,
		      uint32_t *page_count, uint8_t **buf,
		      size_t *remaining_sz)
{
	const struct spdk_xattr *xattr;
	int rc;

	TAILQ_FOREACH(xattr, xattrs, link) {
		size_t required_sz = 0;

		rc = blob_serialize_xattr(xattr,
					  *buf, *remaining_sz,
					  &required_sz, internal);
		if (rc < 0) {
			/* Need to add a new page to the chain */
			rc = blob_serialize_add_page(blob, pages, page_count,
						     &cur_page);
			if (rc < 0) {
				spdk_free(*pages);
				*pages = NULL;
				*page_count = 0;
				return rc;
			}

			*buf = (uint8_t *)cur_page->descriptors;
			*remaining_sz = sizeof(cur_page->descriptors);

			/* Try again */
			required_sz = 0;
			rc = blob_serialize_xattr(xattr,
						  *buf, *remaining_sz,
						  &required_sz, internal);

			if (rc < 0) {
				spdk_free(*pages);
				*pages = NULL;
				*page_count = 0;
				return rc;
			}
		}

		*remaining_sz -= required_sz;
		*buf += required_sz;
	}

	return 0;
}
static int
blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages,
	       uint32_t *page_count)
{
	struct spdk_blob_md_page *cur_page;
	int rc;
	uint8_t *buf;
	size_t remaining_sz;

	assert(pages != NULL);
	assert(page_count != NULL);
	assert(blob != NULL);
	assert(blob->state == SPDK_BLOB_STATE_DIRTY);

	*pages = NULL;
	*page_count = 0;

	/* A blob always has at least 1 page, even if it has no descriptors */
	rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
	if (rc < 0) {
		return rc;
	}

	buf = (uint8_t *)cur_page->descriptors;
	remaining_sz = sizeof(cur_page->descriptors);

	/* Serialize flags */
	blob_serialize_flags(blob, buf, &remaining_sz);
	buf += sizeof(struct spdk_blob_md_descriptor_flags);

	/* Serialize xattrs */
	rc = blob_serialize_xattrs(blob, &blob->xattrs, false,
				   pages, cur_page, page_count, &buf, &remaining_sz);
	if (rc < 0) {
		return rc;
	}

	/* Serialize internal xattrs */
	rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true,
				   pages, cur_page, page_count, &buf, &remaining_sz);
	if (rc < 0) {
		return rc;
	}

	if (blob->use_extent_table) {
		/* Serialize extent table */
		rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz);
	} else {
		/* Serialize extents */
		rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz);
	}

	return rc;
}

struct spdk_blob_load_ctx {
	struct spdk_blob *blob;

	struct spdk_blob_md_page *pages;
	uint32_t num_pages;
	uint32_t next_extent_page;
	spdk_bs_sequence_t *seq;

	spdk_bs_sequence_cpl cb_fn;
	void *cb_arg;
};

static uint32_t
blob_md_page_calc_crc(void *page)
{
	uint32_t crc;

	crc = BLOB_CRC32C_INITIAL;
	crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc);
	crc ^= BLOB_CRC32C_INITIAL;

	return crc;
}

static void
blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno)
{
	struct spdk_blob *blob = ctx->blob;

	if (bserrno == 0) {
		blob_mark_clean(blob);
	}

	ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno);

	/* Free the memory */
	spdk_free(ctx->pages);
	free(ctx);
}

static void
blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;

	if (bserrno == 0) {
		blob->back_bs_dev = bs_create_blob_bs_dev(snapshot);
		if (blob->back_bs_dev == NULL) {
			bserrno = -ENOMEM;
		}
	}
	if (bserrno != 0) {
		SPDK_ERRLOG("Snapshot fail\n");
	}

	blob_load_final(ctx, bserrno);
}

static void blob_update_clear_method(struct spdk_blob *blob);

static void
blob_load_backing_dev(void *cb_arg)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	const void *value;
	size_t len;
	int rc;

	if (spdk_blob_is_thin_provisioned(blob)) {
		rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true);
		if (rc == 0) {
			if (len != sizeof(spdk_blob_id)) {
				blob_load_final(ctx, -EINVAL);
				return;
			}
			/* open snapshot blob and continue in the callback function */
			blob->parent_id = *(spdk_blob_id *)value;
			spdk_bs_open_blob(blob->bs, blob->parent_id,
					  blob_load_snapshot_cpl, ctx);
			return;
		} else {
			/* add zeroes_dev for thin provisioned blob */
			blob->back_bs_dev = bs_create_zeroes_dev();
		}
	} else {
		/* standard blob */
		blob->back_bs_dev = NULL;
	}
	blob_load_final(ctx, 0);
}
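/*
 * Illustrative note (not part of the blobstore sources): blob_md_page_calc_crc() above
 * checksums everything in the 4KiB metadata page except the trailing 4-byte crc field itself,
 * so a loaded page is validated with a comparison along the lines of:
 *
 *	if (blob_md_page_calc_crc(page) != page->crc) {
 *		return -EINVAL;	// torn or corrupted metadata page
 *	}
 */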
static void
blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_md_page *page;
	uint64_t i;
	uint32_t crc;
	uint64_t lba;
	void *tmp;
	uint64_t sz;

	if (bserrno) {
		SPDK_ERRLOG("Extent page read failed: %d\n", bserrno);
		blob_load_final(ctx, bserrno);
		return;
	}

	if (ctx->pages == NULL) {
		/* First iteration of this function, allocate buffer for single EXTENT_PAGE */
		ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
					  NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		if (!ctx->pages) {
			blob_load_final(ctx, -ENOMEM);
			return;
		}
		ctx->num_pages = 1;
		ctx->next_extent_page = 0;
	} else {
		page = &ctx->pages[0];
		crc = blob_md_page_calc_crc(page);
		if (crc != page->crc) {
			blob_load_final(ctx, -EINVAL);
			return;
		}

		if (page->next != SPDK_INVALID_MD_PAGE) {
			blob_load_final(ctx, -EINVAL);
			return;
		}

		bserrno = blob_parse_extent_page(page, blob);
		if (bserrno) {
			blob_load_final(ctx, bserrno);
			return;
		}
	}

	for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
		if (blob->active.extent_pages[i] != 0) {
			/* Extent page was allocated, read and parse it. */
			lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]);
			ctx->next_extent_page = i + 1;

			bs_sequence_read_dev(seq, &ctx->pages[0], lba,
					     bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
					     blob_load_cpl_extents_cpl, ctx);
			return;
		} else {
			/* Thin provisioned blobs can point to unallocated extent pages.
			 * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */

			sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP);
			blob->active.num_clusters += sz;
			blob->remaining_clusters_in_et -= sz;

			assert(spdk_blob_is_thin_provisioned(blob));
			assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0);

			tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				blob_load_final(ctx, -ENOMEM);
				return;
			}
			memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0,
			       sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size));
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = blob->active.num_clusters;
		}
	}

	blob_load_backing_dev(ctx);
}
static void
blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_load_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_md_page *page;
	int rc;
	uint32_t crc;
	uint32_t current_page;

	if (ctx->num_pages == 1) {
		current_page = bs_blobid_to_page(blob->id);
	} else {
		assert(ctx->num_pages != 0);
		page = &ctx->pages[ctx->num_pages - 2];
		current_page = page->next;
	}

	if (bserrno) {
		SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n",
			    current_page, blob->id, bserrno);
		blob_load_final(ctx, bserrno);
		return;
	}

	page = &ctx->pages[ctx->num_pages - 1];
	crc = blob_md_page_calc_crc(page);
	if (crc != page->crc) {
		SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n",
			    current_page, blob->id);
		blob_load_final(ctx, -EINVAL);
		return;
	}

	if (page->next != SPDK_INVALID_MD_PAGE) {
		struct spdk_blob_md_page *tmp_pages;
		uint32_t next_page = page->next;
		uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page);

		/* Read the next page */
		tmp_pages = spdk_realloc(ctx->pages, (sizeof(*page) * (ctx->num_pages + 1)), 0);
		if (tmp_pages == NULL) {
			blob_load_final(ctx, -ENOMEM);
			return;
		}
		ctx->num_pages++;
		ctx->pages = tmp_pages;

		bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1],
				     next_lba,
				     bs_byte_to_lba(blob->bs, sizeof(*page)),
				     blob_load_cpl, ctx);
		return;
	}

	/* Parse the pages */
	rc = blob_parse(ctx->pages, ctx->num_pages, blob);
	if (rc) {
		blob_load_final(ctx, rc);
		return;
	}

	if (blob->extent_table_found == true) {
		/* If EXTENT_TABLE was found, that means support for it should be enabled. */
		assert(blob->extent_rle_found == false);
		blob->use_extent_table = true;
	} else {
		/* If EXTENT_RLE or no extent_* descriptor was found, disable support
		 * for the extent table. No extent_* descriptors means that the blob has a length of 0
		 * and no extent_rle descriptors were persisted for it.
		 * EXTENT_TABLE, if used, is always present in metadata regardless of length. */
		blob->use_extent_table = false;
	}

	/* Check the clear_method stored in metadata vs what may have been passed
	 * via spdk_bs_open_blob_ext() and update accordingly.
	 */
	blob_update_clear_method(blob);

	spdk_free(ctx->pages);
	ctx->pages = NULL;

	if (blob->extent_table_found) {
		blob_load_cpl_extents_cpl(seq, ctx, 0);
	} else {
		blob_load_backing_dev(ctx);
	}
}

/* Load a blob from disk given a blobid */
static void
blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
	  spdk_bs_sequence_cpl cb_fn, void *cb_arg)
{
	struct spdk_blob_load_ctx *ctx;
	struct spdk_blob_store *bs;
	uint32_t page_num;
	uint64_t lba;

	blob_verify_md_op(blob);

	bs = blob->bs;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(seq, cb_arg, -ENOMEM);
		return;
	}

	ctx->blob = blob;
	ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0);
	if (!ctx->pages) {
		free(ctx);
		cb_fn(seq, cb_arg, -ENOMEM);
		return;
	}
	ctx->num_pages = 1;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->seq = seq;

	page_num = bs_blobid_to_page(blob->id);
	lba = bs_md_page_to_lba(blob->bs, page_num);

	blob->state = SPDK_BLOB_STATE_LOADING;

	bs_sequence_read_dev(seq, &ctx->pages[0], lba,
			     bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE),
			     blob_load_cpl, ctx);
}
struct spdk_blob_persist_ctx {
	struct spdk_blob *blob;

	struct spdk_bs_super_block *super;

	struct spdk_blob_md_page *pages;
	uint32_t next_extent_page;
	struct spdk_blob_md_page *extent_page;

	spdk_bs_sequence_t *seq;
	spdk_bs_sequence_cpl cb_fn;
	void *cb_arg;
	TAILQ_ENTRY(spdk_blob_persist_ctx) link;
};

static void
bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba,
		   uint64_t lba_count)
{
	switch (ctx->blob->clear_method) {
	case BLOB_CLEAR_WITH_DEFAULT:
	case BLOB_CLEAR_WITH_UNMAP:
		bs_batch_unmap_dev(batch, lba, lba_count);
		break;
	case BLOB_CLEAR_WITH_WRITE_ZEROES:
		bs_batch_write_zeroes_dev(batch, lba, lba_count);
		break;
	case BLOB_CLEAR_WITH_NONE:
	default:
		break;
	}
}

static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx);

static void
blob_persist_complete_cb(void *arg)
{
	struct spdk_blob_persist_ctx *ctx = arg;

	/* Call user callback */
	ctx->cb_fn(ctx->seq, ctx->cb_arg, 0);

	/* Free the memory */
	spdk_free(ctx->pages);
	free(ctx);
}

static void
blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno)
{
	struct spdk_blob_persist_ctx *next_persist, *tmp;
	struct spdk_blob *blob = ctx->blob;

	if (bserrno == 0) {
		blob_mark_clean(blob);
	}

	assert(ctx == TAILQ_FIRST(&blob->persists_to_complete));

	/* Complete all persists that were pending when the current persist started */
	TAILQ_FOREACH_SAFE(next_persist, &blob->persists_to_complete, link, tmp) {
		TAILQ_REMOVE(&blob->persists_to_complete, next_persist, link);
		spdk_thread_send_msg(spdk_get_thread(), blob_persist_complete_cb, next_persist);
	}

	if (TAILQ_EMPTY(&blob->pending_persists)) {
		return;
	}

	/* Queue up all pending persists for completion and start blob persist with first one */
	TAILQ_SWAP(&blob->persists_to_complete, &blob->pending_persists, spdk_blob_persist_ctx, link);
	next_persist = TAILQ_FIRST(&blob->persists_to_complete);

	blob->state = SPDK_BLOB_STATE_DIRTY;
	blob_persist_check_dirty(next_persist);
}
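/*
 * Illustrative note (not part of the blobstore sources): persists are batched with two queues.
 * Requests that arrive while a persist is in flight sit on pending_persists; when the in-flight
 * persist finishes, everything on persists_to_complete is completed together, the queues are
 * swapped, and one new persist is started for the swapped batch. Roughly:
 *
 *	in flight:  persists_to_complete = {A, B}   pending_persists = {C, D}
 *	A's write lands -> complete A and B, swap -> persists_to_complete = {C, D}, start persist for C
 */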
static void
blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	/* Release all extent_pages that were truncated */
	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
		/* Nothing to release if it was not allocated */
		if (blob->active.extent_pages[i] != 0) {
			bs_release_md_page(bs, blob->active.extent_pages[i]);
		}
	}

	if (blob->active.num_extent_pages == 0) {
		free(blob->active.extent_pages);
		blob->active.extent_pages = NULL;
		blob->active.extent_pages_array_size = 0;
	} else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) {
#ifndef __clang_analyzer__
		void *tmp;

		/* scan-build really can't figure reallocs, workaround it */
		tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages);
		assert(tmp != NULL);
		blob->active.extent_pages = tmp;
#endif
		blob->active.extent_pages_array_size = blob->active.num_extent_pages;
	}

	blob_persist_complete(seq, ctx, bserrno);
}

static void
blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
{
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;
	uint64_t lba;
	uint64_t lba_count;
	spdk_bs_batch_t *batch;

	batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx);
	lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);

	/* Clear all extent_pages that were truncated */
	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
		/* Nothing to clear if it was not allocated */
		if (blob->active.extent_pages[i] != 0) {
			lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]);
			bs_batch_write_zeroes_dev(batch, lba, lba_count);
		}
	}

	bs_batch_close(batch);
}

static void
blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	pthread_mutex_lock(&bs->used_clusters_mutex);
	/* Release all clusters that were truncated */
	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
		uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]);

		/* Nothing to release if it was not allocated */
		if (blob->active.clusters[i] != 0) {
			bs_release_cluster(bs, cluster_num);
		}
	}
	pthread_mutex_unlock(&bs->used_clusters_mutex);

	if (blob->active.num_clusters == 0) {
		free(blob->active.clusters);
		blob->active.clusters = NULL;
		blob->active.cluster_array_size = 0;
	} else if (blob->active.num_clusters != blob->active.cluster_array_size) {
#ifndef __clang_analyzer__
		void *tmp;

		/* scan-build really can't figure reallocs, workaround it */
		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters);
		assert(tmp != NULL);
		blob->active.clusters = tmp;
#endif
		blob->active.cluster_array_size = blob->active.num_clusters;
	}

	/* Move on to clearing extent pages */
	blob_persist_clear_extents(seq, ctx);
}
static void
blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
{
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	spdk_bs_batch_t *batch;
	size_t i;
	uint64_t lba;
	uint64_t lba_count;

	/* Clusters don't move around in blobs. The list shrinks or grows
	 * at the end, but no changes ever occur in the middle of the list.
	 */

	batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx);

	/* Clear all clusters that were truncated */
	lba = 0;
	lba_count = 0;
	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
		uint64_t next_lba = blob->active.clusters[i];
		uint64_t next_lba_count = bs_cluster_to_lba(bs, 1);

		if (next_lba > 0 && (lba + lba_count) == next_lba) {
			/* This cluster is contiguous with the previous one. */
			lba_count += next_lba_count;
			continue;
		} else if (next_lba == 0) {
			continue;
		}

		/* This cluster is not contiguous with the previous one. */

		/* If a run of LBAs previously existed, clear it now */
		if (lba_count > 0) {
			bs_batch_clear_dev(ctx, batch, lba, lba_count);
		}

		/* Start building the next batch */
		lba = next_lba;
		if (next_lba > 0) {
			lba_count = next_lba_count;
		} else {
			lba_count = 0;
		}
	}

	/* If we ended with a contiguous set of LBAs, clear them now */
	if (lba_count > 0) {
		bs_batch_clear_dev(ctx, batch, lba, lba_count);
	}

	bs_batch_close(batch);
}

static void
blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	size_t i;

	if (bserrno != 0) {
		blob_persist_complete(seq, ctx, bserrno);
		return;
	}

	/* This loop starts at 1 because the first page is special and handled
	 * below. The pages (except the first) are never written in place,
	 * so any pages in the clean list must be zeroed.
	 */
	for (i = 1; i < blob->clean.num_pages; i++) {
		bs_release_md_page(bs, blob->clean.pages[i]);
	}

	if (blob->active.num_pages == 0) {
		uint32_t page_num;

		page_num = bs_blobid_to_page(blob->id);
		bs_release_md_page(bs, page_num);
	}

	/* Move on to clearing clusters */
	blob_persist_clear_clusters(seq, ctx);
}
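/*
 * Illustrative note (not part of the blobstore sources): the persist path chains its cleanup
 * stages through completion callbacks rather than looping in place:
 *
 *	blob_persist_zero_pages -> blob_persist_zero_pages_cpl
 *	  -> blob_persist_clear_clusters -> blob_persist_clear_clusters_cpl
 *	  -> blob_persist_clear_extents -> blob_persist_clear_extents_cpl
 *	  -> blob_persist_complete
 *
 * Each *_cpl step releases the in-memory bookkeeping (md pages, clusters, extent pages) only
 * after the corresponding on-disk clear has completed.
 */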
*/ 1910 if (blob->active.num_pages == 0) { 1911 uint32_t page_num; 1912 1913 /* The first page in the metadata goes where the blobid indicates */ 1914 page_num = bs_blobid_to_page(blob->id); 1915 lba = bs_md_page_to_lba(bs, page_num); 1916 1917 bs_batch_write_zeroes_dev(batch, lba, lba_count); 1918 } 1919 1920 bs_batch_close(batch); 1921 } 1922 1923 static void 1924 blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 1925 { 1926 struct spdk_blob_persist_ctx *ctx = cb_arg; 1927 struct spdk_blob *blob = ctx->blob; 1928 struct spdk_blob_store *bs = blob->bs; 1929 uint64_t lba; 1930 uint32_t lba_count; 1931 struct spdk_blob_md_page *page; 1932 1933 if (bserrno != 0) { 1934 blob_persist_complete(seq, ctx, bserrno); 1935 return; 1936 } 1937 1938 if (blob->active.num_pages == 0) { 1939 /* Move on to the next step */ 1940 blob_persist_zero_pages(seq, ctx, 0); 1941 return; 1942 } 1943 1944 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1945 1946 page = &ctx->pages[0]; 1947 /* The first page in the metadata goes where the blobid indicates */ 1948 lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id)); 1949 1950 bs_sequence_write_dev(seq, page, lba, lba_count, 1951 blob_persist_zero_pages, ctx); 1952 } 1953 1954 static void 1955 blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx) 1956 { 1957 struct spdk_blob *blob = ctx->blob; 1958 struct spdk_blob_store *bs = blob->bs; 1959 uint64_t lba; 1960 uint32_t lba_count; 1961 struct spdk_blob_md_page *page; 1962 spdk_bs_batch_t *batch; 1963 size_t i; 1964 1965 /* Clusters don't move around in blobs. The list shrinks or grows 1966 * at the end, but no changes ever occur in the middle of the list. 1967 */ 1968 1969 lba_count = bs_byte_to_lba(bs, sizeof(*page)); 1970 1971 batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx); 1972 1973 /* This starts at 1. The root page is not written until 1974 * all of the others are finished 1975 */ 1976 for (i = 1; i < blob->active.num_pages; i++) { 1977 page = &ctx->pages[i]; 1978 assert(page->sequence_num == i); 1979 1980 lba = bs_md_page_to_lba(bs, blob->active.pages[i]); 1981 1982 bs_batch_write_dev(batch, page, lba, lba_count); 1983 } 1984 1985 bs_batch_close(batch); 1986 } 1987 1988 static int 1989 blob_resize(struct spdk_blob *blob, uint64_t sz) 1990 { 1991 uint64_t i; 1992 uint64_t *tmp; 1993 uint64_t cluster; 1994 uint32_t lfmd; /* lowest free md page */ 1995 uint64_t num_clusters; 1996 uint32_t *ep_tmp; 1997 uint64_t new_num_ep = 0, current_num_ep = 0; 1998 struct spdk_blob_store *bs; 1999 2000 bs = blob->bs; 2001 2002 blob_verify_md_op(blob); 2003 2004 if (blob->active.num_clusters == sz) { 2005 return 0; 2006 } 2007 2008 if (blob->active.num_clusters < blob->active.cluster_array_size) { 2009 /* If this blob was resized to be larger, then smaller, then 2010 * larger without syncing, then the cluster array already 2011 * contains spare assigned clusters we can use. 2012 */ 2013 num_clusters = spdk_min(blob->active.cluster_array_size, 2014 sz); 2015 } else { 2016 num_clusters = blob->active.num_clusters; 2017 } 2018 2019 if (blob->use_extent_table) { 2020 /* Round up since every cluster beyond current Extent Table size, 2021 * requires new extent page. */ 2022 new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP); 2023 current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP); 2024 } 2025 2026 /* Check first that we have enough clusters and md pages before we start claiming them. 
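 * In short: growing a thick-provisioned blob to sz clusters needs
 * (sz - num_clusters) free clusters plus, when the extent table is in use,
 * one free md page for every extent page added in the range
 * [current_num_ep, new_num_ep); nothing is claimed until both checks pass.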
*/ 2027 if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) { 2028 if ((sz - num_clusters) > bs->num_free_clusters) { 2029 return -ENOSPC; 2030 } 2031 lfmd = 0; 2032 for (i = current_num_ep; i < new_num_ep ; i++) { 2033 lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd); 2034 if (lfmd == UINT32_MAX) { 2035 /* No more free md pages. Cannot satisfy the request */ 2036 return -ENOSPC; 2037 } 2038 } 2039 } 2040 2041 if (sz > num_clusters) { 2042 /* Expand the cluster array if necessary. 2043 * We only shrink the array when persisting. 2044 */ 2045 tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz); 2046 if (sz > 0 && tmp == NULL) { 2047 return -ENOMEM; 2048 } 2049 memset(tmp + blob->active.cluster_array_size, 0, 2050 sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size)); 2051 blob->active.clusters = tmp; 2052 blob->active.cluster_array_size = sz; 2053 2054 /* Expand the extents table, only if enough clusters were added */ 2055 if (new_num_ep > current_num_ep && blob->use_extent_table) { 2056 ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep); 2057 if (new_num_ep > 0 && ep_tmp == NULL) { 2058 return -ENOMEM; 2059 } 2060 memset(ep_tmp + blob->active.extent_pages_array_size, 0, 2061 sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size)); 2062 blob->active.extent_pages = ep_tmp; 2063 blob->active.extent_pages_array_size = new_num_ep; 2064 } 2065 } 2066 2067 blob->state = SPDK_BLOB_STATE_DIRTY; 2068 2069 if (spdk_blob_is_thin_provisioned(blob) == false) { 2070 cluster = 0; 2071 lfmd = 0; 2072 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2073 for (i = num_clusters; i < sz; i++) { 2074 bs_allocate_cluster(blob, i, &cluster, &lfmd, true); 2075 lfmd++; 2076 } 2077 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2078 } 2079 2080 blob->active.num_clusters = sz; 2081 blob->active.num_extent_pages = new_num_ep; 2082 2083 return 0; 2084 } 2085 2086 static void 2087 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx) 2088 { 2089 spdk_bs_sequence_t *seq = ctx->seq; 2090 struct spdk_blob *blob = ctx->blob; 2091 struct spdk_blob_store *bs = blob->bs; 2092 uint64_t i; 2093 uint32_t page_num; 2094 void *tmp; 2095 int rc; 2096 2097 /* Generate the new metadata */ 2098 rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages); 2099 if (rc < 0) { 2100 blob_persist_complete(seq, ctx, rc); 2101 return; 2102 } 2103 2104 assert(blob->active.num_pages >= 1); 2105 2106 /* Resize the cache of page indices */ 2107 tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); 2108 if (!tmp) { 2109 blob_persist_complete(seq, ctx, -ENOMEM); 2110 return; 2111 } 2112 blob->active.pages = tmp; 2113 2114 /* Assign this metadata to pages. This requires two passes - 2115 * one to verify that there are enough pages and a second 2116 * to actually claim them. */ 2117 page_num = 0; 2118 /* Note that this loop starts at one. The first page location is fixed by the blobid. 
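 * The second pass below links the chain as it claims pages: page i - 1 gets
 * its next pointer set to the newly claimed page number and its CRC is
 * computed only once that page is complete; the last page's CRC is filled in
 * after the loop.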
*/ 2119 for (i = 1; i < blob->active.num_pages; i++) { 2120 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2121 if (page_num == UINT32_MAX) { 2122 blob_persist_complete(seq, ctx, -ENOMEM); 2123 return; 2124 } 2125 page_num++; 2126 } 2127 2128 page_num = 0; 2129 blob->active.pages[0] = bs_blobid_to_page(blob->id); 2130 for (i = 1; i < blob->active.num_pages; i++) { 2131 page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); 2132 ctx->pages[i - 1].next = page_num; 2133 /* Now that previous metadata page is complete, calculate the crc for it. */ 2134 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2135 blob->active.pages[i] = page_num; 2136 bs_claim_md_page(bs, page_num); 2137 SPDK_DEBUGLOG(blob, "Claiming page %u for blob %" PRIu64 "\n", page_num, blob->id); 2138 page_num++; 2139 } 2140 ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]); 2141 /* Start writing the metadata from last page to first */ 2142 blob->state = SPDK_BLOB_STATE_CLEAN; 2143 blob_persist_write_page_chain(seq, ctx); 2144 } 2145 2146 static void 2147 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2148 { 2149 struct spdk_blob_persist_ctx *ctx = cb_arg; 2150 struct spdk_blob *blob = ctx->blob; 2151 size_t i; 2152 uint32_t extent_page_id; 2153 uint32_t page_count = 0; 2154 int rc; 2155 2156 if (ctx->extent_page != NULL) { 2157 spdk_free(ctx->extent_page); 2158 ctx->extent_page = NULL; 2159 } 2160 2161 if (bserrno != 0) { 2162 blob_persist_complete(seq, ctx, bserrno); 2163 return; 2164 } 2165 2166 /* Only write out Extent Pages when blob was resized. */ 2167 for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) { 2168 extent_page_id = blob->active.extent_pages[i]; 2169 if (extent_page_id == 0) { 2170 /* No Extent Page to persist */ 2171 assert(spdk_blob_is_thin_provisioned(blob)); 2172 continue; 2173 } 2174 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id)); 2175 ctx->next_extent_page = i + 1; 2176 rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page); 2177 if (rc < 0) { 2178 blob_persist_complete(seq, ctx, rc); 2179 return; 2180 } 2181 2182 blob->state = SPDK_BLOB_STATE_DIRTY; 2183 blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page); 2184 2185 ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page); 2186 2187 bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id), 2188 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 2189 blob_persist_write_extent_pages, ctx); 2190 return; 2191 } 2192 2193 blob_persist_generate_new_md(ctx); 2194 } 2195 2196 static void 2197 blob_persist_start(struct spdk_blob_persist_ctx *ctx) 2198 { 2199 spdk_bs_sequence_t *seq = ctx->seq; 2200 struct spdk_blob *blob = ctx->blob; 2201 2202 if (blob->active.num_pages == 0) { 2203 /* This is the signal that the blob should be deleted. 2204 * Immediately jump to the clean up routine. 
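 * The delete path walks the same chain as a shrink: the metadata pages are
 * zeroed on disk and released, the truncated data clusters are cleared and
 * returned to the pool, and finally any allocated extent pages are zeroed.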
*/ 2205 assert(blob->clean.num_pages > 0); 2206 blob->state = SPDK_BLOB_STATE_CLEAN; 2207 blob_persist_zero_pages(seq, ctx, 0); 2208 return; 2209 2210 } 2211 2212 if (blob->clean.num_clusters < blob->active.num_clusters) { 2213 /* Blob was resized up */ 2214 assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages); 2215 ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1; 2216 } else if (blob->active.num_clusters < blob->active.cluster_array_size) { 2217 /* Blob was resized down */ 2218 assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages); 2219 ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1; 2220 } else { 2221 /* No change in size occurred */ 2222 blob_persist_generate_new_md(ctx); 2223 return; 2224 } 2225 2226 blob_persist_write_extent_pages(seq, ctx, 0); 2227 } 2228 2229 static void 2230 blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2231 { 2232 struct spdk_blob_persist_ctx *ctx = cb_arg; 2233 2234 spdk_free(ctx->super); 2235 2236 if (bserrno != 0) { 2237 blob_persist_complete(seq, ctx, bserrno); 2238 return; 2239 } 2240 2241 ctx->blob->bs->clean = 0; 2242 2243 blob_persist_start(ctx); 2244 } 2245 2246 static void 2247 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 2248 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); 2249 2250 2251 static void 2252 blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2253 { 2254 struct spdk_blob_persist_ctx *ctx = cb_arg; 2255 2256 if (bserrno != 0) { 2257 spdk_free(ctx->super); 2258 blob_persist_complete(seq, ctx, bserrno); 2259 return; 2260 } 2261 2262 ctx->super->clean = 0; 2263 if (ctx->super->size == 0) { 2264 ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; 2265 } 2266 2267 bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx); 2268 } 2269 2270 static void 2271 blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx) 2272 { 2273 if (ctx->blob->bs->clean) { 2274 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 2275 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2276 if (!ctx->super) { 2277 blob_persist_complete(ctx->seq, ctx, -ENOMEM); 2278 return; 2279 } 2280 2281 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0), 2282 bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)), 2283 blob_persist_dirty, ctx); 2284 } else { 2285 blob_persist_start(ctx); 2286 } 2287 } 2288 2289 /* Write a blob to disk */ 2290 static void 2291 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, 2292 spdk_bs_sequence_cpl cb_fn, void *cb_arg) 2293 { 2294 struct spdk_blob_persist_ctx *ctx; 2295 2296 blob_verify_md_op(blob); 2297 2298 if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->persists_to_complete)) { 2299 cb_fn(seq, cb_arg, 0); 2300 return; 2301 } 2302 2303 ctx = calloc(1, sizeof(*ctx)); 2304 if (!ctx) { 2305 cb_fn(seq, cb_arg, -ENOMEM); 2306 return; 2307 } 2308 ctx->blob = blob; 2309 ctx->seq = seq; 2310 ctx->cb_fn = cb_fn; 2311 ctx->cb_arg = cb_arg; 2312 2313 /* Multiple blob persists can affect one another, via blob->state or 2314 * blob mutable data changes. To prevent it, queue up the persists. 
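 * Only the request at the head of persists_to_complete is driven; a persist
 * issued while another is outstanding is parked on pending_persists and is
 * expected to be picked up once the in-flight persist completes.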
*/ 2315 if (!TAILQ_EMPTY(&blob->persists_to_complete)) { 2316 TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link); 2317 return; 2318 } 2319 TAILQ_INSERT_HEAD(&blob->persists_to_complete, ctx, link); 2320 2321 blob_persist_check_dirty(ctx); 2322 } 2323 2324 struct spdk_blob_copy_cluster_ctx { 2325 struct spdk_blob *blob; 2326 uint8_t *buf; 2327 uint64_t page; 2328 uint64_t new_cluster; 2329 uint32_t new_extent_page; 2330 spdk_bs_sequence_t *seq; 2331 }; 2332 2333 static void 2334 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) 2335 { 2336 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2337 struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; 2338 TAILQ_HEAD(, spdk_bs_request_set) requests; 2339 spdk_bs_user_op_t *op; 2340 2341 TAILQ_INIT(&requests); 2342 TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); 2343 2344 while (!TAILQ_EMPTY(&requests)) { 2345 op = TAILQ_FIRST(&requests); 2346 TAILQ_REMOVE(&requests, op, link); 2347 if (bserrno == 0) { 2348 bs_user_op_execute(op); 2349 } else { 2350 bs_user_op_abort(op); 2351 } 2352 } 2353 2354 spdk_free(ctx->buf); 2355 free(ctx); 2356 } 2357 2358 static void 2359 blob_insert_cluster_cpl(void *cb_arg, int bserrno) 2360 { 2361 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2362 2363 if (bserrno) { 2364 if (bserrno == -EEXIST) { 2365 /* The metadata insert failed because another thread 2366 * allocated the cluster first. Free our cluster 2367 * but continue without error. */ 2368 bserrno = 0; 2369 } 2370 pthread_mutex_lock(&ctx->blob->bs->used_clusters_mutex); 2371 bs_release_cluster(ctx->blob->bs, ctx->new_cluster); 2372 pthread_mutex_unlock(&ctx->blob->bs->used_clusters_mutex); 2373 if (ctx->new_extent_page != 0) { 2374 bs_release_md_page(ctx->blob->bs, ctx->new_extent_page); 2375 } 2376 } 2377 2378 bs_sequence_finish(ctx->seq, bserrno); 2379 } 2380 2381 static void 2382 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2383 { 2384 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2385 uint32_t cluster_number; 2386 2387 if (bserrno) { 2388 /* The write failed, so jump to the final completion handler */ 2389 bs_sequence_finish(seq, bserrno); 2390 return; 2391 } 2392 2393 cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page); 2394 2395 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2396 ctx->new_extent_page, blob_insert_cluster_cpl, ctx); 2397 } 2398 2399 static void 2400 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2401 { 2402 struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; 2403 2404 if (bserrno != 0) { 2405 /* The read failed, so jump to the final completion handler */ 2406 bs_sequence_finish(seq, bserrno); 2407 return; 2408 } 2409 2410 /* Write whole cluster */ 2411 bs_sequence_write_dev(seq, ctx->buf, 2412 bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), 2413 bs_cluster_to_lba(ctx->blob->bs, 1), 2414 blob_write_copy_cpl, ctx); 2415 } 2416 2417 static void 2418 bs_allocate_and_copy_cluster(struct spdk_blob *blob, 2419 struct spdk_io_channel *_ch, 2420 uint64_t io_unit, spdk_bs_user_op_t *op) 2421 { 2422 struct spdk_bs_cpl cpl; 2423 struct spdk_bs_channel *ch; 2424 struct spdk_blob_copy_cluster_ctx *ctx; 2425 uint32_t cluster_start_page; 2426 uint32_t cluster_number; 2427 int rc; 2428 2429 ch = spdk_io_channel_get_ctx(_ch); 2430 2431 if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { 2432 /* There are already operations pending. 
Queue this user op 2433 * and return because it will be re-executed when the outstanding 2434 * cluster allocation completes. */ 2435 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2436 return; 2437 } 2438 2439 /* Round the io_unit offset down to the first page in the cluster */ 2440 cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit); 2441 2442 /* Calculate which index in the metadata cluster array the corresponding 2443 * cluster is supposed to be at. */ 2444 cluster_number = bs_io_unit_to_cluster_number(blob, io_unit); 2445 2446 ctx = calloc(1, sizeof(*ctx)); 2447 if (!ctx) { 2448 bs_user_op_abort(op); 2449 return; 2450 } 2451 2452 assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); 2453 2454 ctx->blob = blob; 2455 ctx->page = cluster_start_page; 2456 2457 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2458 ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, 2459 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2460 if (!ctx->buf) { 2461 SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", 2462 blob->bs->cluster_sz); 2463 free(ctx); 2464 bs_user_op_abort(op); 2465 return; 2466 } 2467 } 2468 2469 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2470 rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page, 2471 false); 2472 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2473 if (rc != 0) { 2474 spdk_free(ctx->buf); 2475 free(ctx); 2476 bs_user_op_abort(op); 2477 return; 2478 } 2479 2480 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2481 cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl; 2482 cpl.u.blob_basic.cb_arg = ctx; 2483 2484 ctx->seq = bs_sequence_start(_ch, &cpl); 2485 if (!ctx->seq) { 2486 pthread_mutex_lock(&blob->bs->used_clusters_mutex); 2487 bs_release_cluster(blob->bs, ctx->new_cluster); 2488 pthread_mutex_unlock(&blob->bs->used_clusters_mutex); 2489 spdk_free(ctx->buf); 2490 free(ctx); 2491 bs_user_op_abort(op); 2492 return; 2493 } 2494 2495 /* Queue the user op to block other incoming operations */ 2496 TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); 2497 2498 if (blob->parent_id != SPDK_BLOBID_INVALID) { 2499 /* Read cluster from backing device */ 2500 bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, 2501 bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), 2502 bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), 2503 blob_write_copy, ctx); 2504 } else { 2505 blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, 2506 ctx->new_extent_page, blob_insert_cluster_cpl, ctx); 2507 } 2508 } 2509 2510 static inline bool 2511 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, 2512 uint64_t *lba, uint64_t *lba_count) 2513 { 2514 *lba_count = length; 2515 2516 if (!bs_io_unit_is_allocated(blob, io_unit)) { 2517 assert(blob->back_bs_dev != NULL); 2518 *lba = bs_io_unit_to_back_dev_lba(blob, io_unit); 2519 *lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count); 2520 return false; 2521 } else { 2522 *lba = bs_blob_io_unit_to_lba(blob, io_unit); 2523 return true; 2524 } 2525 } 2526 2527 struct op_split_ctx { 2528 struct spdk_blob *blob; 2529 struct spdk_io_channel *channel; 2530 uint64_t io_unit_offset; 2531 uint64_t io_units_remaining; 2532 void *curr_payload; 2533 enum spdk_blob_op_type op_type; 2534 spdk_bs_sequence_t *seq; 2535 }; 2536 2537 static void 2538 blob_request_submit_op_split_next(void *cb_arg, int bserrno) 2539 { 2540 struct op_split_ctx *ctx = cb_arg; 
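/* Re-entrant completion callback for split I/O: each pass issues at most one
 * cluster's worth of the request (capped at the next cluster boundary) and
 * installs itself as the completion; once io_units_remaining reaches zero or
 * an error is reported, the sequence is finished and the context freed.
 */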
2541 struct spdk_blob *blob = ctx->blob; 2542 struct spdk_io_channel *ch = ctx->channel; 2543 enum spdk_blob_op_type op_type = ctx->op_type; 2544 uint8_t *buf = ctx->curr_payload; 2545 uint64_t offset = ctx->io_unit_offset; 2546 uint64_t length = ctx->io_units_remaining; 2547 uint64_t op_length; 2548 2549 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2550 bs_sequence_finish(ctx->seq, bserrno); 2551 free(ctx); 2552 return; 2553 } 2554 2555 op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob, 2556 offset)); 2557 2558 /* Update length and payload for next operation */ 2559 ctx->io_units_remaining -= op_length; 2560 ctx->io_unit_offset += op_length; 2561 if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { 2562 ctx->curr_payload += op_length * blob->bs->io_unit_size; 2563 } 2564 2565 switch (op_type) { 2566 case SPDK_BLOB_READ: 2567 spdk_blob_io_read(blob, ch, buf, offset, op_length, 2568 blob_request_submit_op_split_next, ctx); 2569 break; 2570 case SPDK_BLOB_WRITE: 2571 spdk_blob_io_write(blob, ch, buf, offset, op_length, 2572 blob_request_submit_op_split_next, ctx); 2573 break; 2574 case SPDK_BLOB_UNMAP: 2575 spdk_blob_io_unmap(blob, ch, offset, op_length, 2576 blob_request_submit_op_split_next, ctx); 2577 break; 2578 case SPDK_BLOB_WRITE_ZEROES: 2579 spdk_blob_io_write_zeroes(blob, ch, offset, op_length, 2580 blob_request_submit_op_split_next, ctx); 2581 break; 2582 case SPDK_BLOB_READV: 2583 case SPDK_BLOB_WRITEV: 2584 SPDK_ERRLOG("readv/write not valid\n"); 2585 bs_sequence_finish(ctx->seq, -EINVAL); 2586 free(ctx); 2587 break; 2588 } 2589 } 2590 2591 static void 2592 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, 2593 void *payload, uint64_t offset, uint64_t length, 2594 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2595 { 2596 struct op_split_ctx *ctx; 2597 spdk_bs_sequence_t *seq; 2598 struct spdk_bs_cpl cpl; 2599 2600 assert(blob != NULL); 2601 2602 ctx = calloc(1, sizeof(struct op_split_ctx)); 2603 if (ctx == NULL) { 2604 cb_fn(cb_arg, -ENOMEM); 2605 return; 2606 } 2607 2608 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2609 cpl.u.blob_basic.cb_fn = cb_fn; 2610 cpl.u.blob_basic.cb_arg = cb_arg; 2611 2612 seq = bs_sequence_start(ch, &cpl); 2613 if (!seq) { 2614 free(ctx); 2615 cb_fn(cb_arg, -ENOMEM); 2616 return; 2617 } 2618 2619 ctx->blob = blob; 2620 ctx->channel = ch; 2621 ctx->curr_payload = payload; 2622 ctx->io_unit_offset = offset; 2623 ctx->io_units_remaining = length; 2624 ctx->op_type = op_type; 2625 ctx->seq = seq; 2626 2627 blob_request_submit_op_split_next(ctx, 0); 2628 } 2629 2630 static void 2631 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, 2632 void *payload, uint64_t offset, uint64_t length, 2633 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2634 { 2635 struct spdk_bs_cpl cpl; 2636 uint64_t lba; 2637 uint64_t lba_count; 2638 bool is_allocated; 2639 2640 assert(blob != NULL); 2641 2642 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2643 cpl.u.blob_basic.cb_fn = cb_fn; 2644 cpl.u.blob_basic.cb_arg = cb_arg; 2645 2646 if (blob->frozen_refcnt) { 2647 /* This blob I/O is frozen */ 2648 spdk_bs_user_op_t *op; 2649 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); 2650 2651 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2652 if (!op) { 2653 cb_fn(cb_arg, -ENOMEM); 2654 return; 2655 } 2656 2657 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2658 2659 return; 2660 } 2661 2662 
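/* Translate the blob-relative io_unit range into a device LBA range: for an
 * allocated cluster this returns true with an LBA on the blobstore device;
 * otherwise it returns false with the offset and length converted to
 * back_bs_dev units (reads are then served from the backing device and
 * writes fall through to cluster allocation below).
 */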
is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2663 2664 switch (op_type) { 2665 case SPDK_BLOB_READ: { 2666 spdk_bs_batch_t *batch; 2667 2668 batch = bs_batch_open(_ch, &cpl); 2669 if (!batch) { 2670 cb_fn(cb_arg, -ENOMEM); 2671 return; 2672 } 2673 2674 if (is_allocated) { 2675 /* Read from the blob */ 2676 bs_batch_read_dev(batch, payload, lba, lba_count); 2677 } else { 2678 /* Read from the backing block device */ 2679 bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); 2680 } 2681 2682 bs_batch_close(batch); 2683 break; 2684 } 2685 case SPDK_BLOB_WRITE: 2686 case SPDK_BLOB_WRITE_ZEROES: { 2687 if (is_allocated) { 2688 /* Write to the blob */ 2689 spdk_bs_batch_t *batch; 2690 2691 if (lba_count == 0) { 2692 cb_fn(cb_arg, 0); 2693 return; 2694 } 2695 2696 batch = bs_batch_open(_ch, &cpl); 2697 if (!batch) { 2698 cb_fn(cb_arg, -ENOMEM); 2699 return; 2700 } 2701 2702 if (op_type == SPDK_BLOB_WRITE) { 2703 bs_batch_write_dev(batch, payload, lba, lba_count); 2704 } else { 2705 bs_batch_write_zeroes_dev(batch, lba, lba_count); 2706 } 2707 2708 bs_batch_close(batch); 2709 } else { 2710 /* Queue this operation and allocate the cluster */ 2711 spdk_bs_user_op_t *op; 2712 2713 op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); 2714 if (!op) { 2715 cb_fn(cb_arg, -ENOMEM); 2716 return; 2717 } 2718 2719 bs_allocate_and_copy_cluster(blob, _ch, offset, op); 2720 } 2721 break; 2722 } 2723 case SPDK_BLOB_UNMAP: { 2724 spdk_bs_batch_t *batch; 2725 2726 batch = bs_batch_open(_ch, &cpl); 2727 if (!batch) { 2728 cb_fn(cb_arg, -ENOMEM); 2729 return; 2730 } 2731 2732 if (is_allocated) { 2733 bs_batch_unmap_dev(batch, lba, lba_count); 2734 } 2735 2736 bs_batch_close(batch); 2737 break; 2738 } 2739 case SPDK_BLOB_READV: 2740 case SPDK_BLOB_WRITEV: 2741 SPDK_ERRLOG("readv/write not valid\n"); 2742 cb_fn(cb_arg, -EINVAL); 2743 break; 2744 } 2745 } 2746 2747 static void 2748 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2749 void *payload, uint64_t offset, uint64_t length, 2750 spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) 2751 { 2752 assert(blob != NULL); 2753 2754 if (blob->data_ro && op_type != SPDK_BLOB_READ) { 2755 cb_fn(cb_arg, -EPERM); 2756 return; 2757 } 2758 2759 if (length == 0) { 2760 cb_fn(cb_arg, 0); 2761 return; 2762 } 2763 2764 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2765 cb_fn(cb_arg, -EINVAL); 2766 return; 2767 } 2768 if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) { 2769 blob_request_submit_op_single(_channel, blob, payload, offset, length, 2770 cb_fn, cb_arg, op_type); 2771 } else { 2772 blob_request_submit_op_split(_channel, blob, payload, offset, length, 2773 cb_fn, cb_arg, op_type); 2774 } 2775 } 2776 2777 struct rw_iov_ctx { 2778 struct spdk_blob *blob; 2779 struct spdk_io_channel *channel; 2780 spdk_blob_op_complete cb_fn; 2781 void *cb_arg; 2782 bool read; 2783 int iovcnt; 2784 struct iovec *orig_iov; 2785 uint64_t io_unit_offset; 2786 uint64_t io_units_remaining; 2787 uint64_t io_units_done; 2788 struct iovec iov[0]; 2789 }; 2790 2791 static void 2792 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 2793 { 2794 assert(cb_arg == NULL); 2795 bs_sequence_finish(seq, bserrno); 2796 } 2797 2798 static void 2799 rw_iov_split_next(void *cb_arg, int bserrno) 2800 { 2801 struct rw_iov_ctx *ctx = cb_arg; 2802 struct spdk_blob *blob = ctx->blob; 2803 struct iovec *iov, 
*orig_iov; 2804 int iovcnt; 2805 size_t orig_iovoff; 2806 uint64_t io_units_count, io_units_to_boundary, io_unit_offset; 2807 uint64_t byte_count; 2808 2809 if (bserrno != 0 || ctx->io_units_remaining == 0) { 2810 ctx->cb_fn(ctx->cb_arg, bserrno); 2811 free(ctx); 2812 return; 2813 } 2814 2815 io_unit_offset = ctx->io_unit_offset; 2816 io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); 2817 io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); 2818 /* 2819 * Get index and offset into the original iov array for our current position in the I/O sequence. 2820 * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will 2821 * point to the current position in the I/O sequence. 2822 */ 2823 byte_count = ctx->io_units_done * blob->bs->io_unit_size; 2824 orig_iov = &ctx->orig_iov[0]; 2825 orig_iovoff = 0; 2826 while (byte_count > 0) { 2827 if (byte_count >= orig_iov->iov_len) { 2828 byte_count -= orig_iov->iov_len; 2829 orig_iov++; 2830 } else { 2831 orig_iovoff = byte_count; 2832 byte_count = 0; 2833 } 2834 } 2835 2836 /* 2837 * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many 2838 * bytes of this next I/O remain to be accounted for in the new iov array. 2839 */ 2840 byte_count = io_units_count * blob->bs->io_unit_size; 2841 iov = &ctx->iov[0]; 2842 iovcnt = 0; 2843 while (byte_count > 0) { 2844 assert(iovcnt < ctx->iovcnt); 2845 iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); 2846 iov->iov_base = orig_iov->iov_base + orig_iovoff; 2847 byte_count -= iov->iov_len; 2848 orig_iovoff = 0; 2849 orig_iov++; 2850 iov++; 2851 iovcnt++; 2852 } 2853 2854 ctx->io_unit_offset += io_units_count; 2855 ctx->io_units_remaining -= io_units_count; 2856 ctx->io_units_done += io_units_count; 2857 iov = &ctx->iov[0]; 2858 2859 if (ctx->read) { 2860 spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2861 io_units_count, rw_iov_split_next, ctx); 2862 } else { 2863 spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, 2864 io_units_count, rw_iov_split_next, ctx); 2865 } 2866 } 2867 2868 static void 2869 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, 2870 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 2871 spdk_blob_op_complete cb_fn, void *cb_arg, bool read) 2872 { 2873 struct spdk_bs_cpl cpl; 2874 2875 assert(blob != NULL); 2876 2877 if (!read && blob->data_ro) { 2878 cb_fn(cb_arg, -EPERM); 2879 return; 2880 } 2881 2882 if (length == 0) { 2883 cb_fn(cb_arg, 0); 2884 return; 2885 } 2886 2887 if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { 2888 cb_fn(cb_arg, -EINVAL); 2889 return; 2890 } 2891 2892 /* 2893 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having 2894 * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, 2895 * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster 2896 * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need 2897 * to allocate a separate iov array and split the I/O such that none of the resulting 2898 * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) 2899 * but since this case happens very infrequently, any performance impact will be negligible. 
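 * As an illustrative example: with 1 MiB clusters, a 3 MiB writev starting on
 * a cluster boundary is re-issued as three sequential 1 MiB writev calls,
 * each built from its own slice of the caller's iovec array.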
2900 * 2901 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs 2902 * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them 2903 * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called 2904 * when the batch was completed, to allow for freeing the memory for the iov arrays. 2905 */ 2906 if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) { 2907 uint64_t lba_count; 2908 uint64_t lba; 2909 bool is_allocated; 2910 2911 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 2912 cpl.u.blob_basic.cb_fn = cb_fn; 2913 cpl.u.blob_basic.cb_arg = cb_arg; 2914 2915 if (blob->frozen_refcnt) { 2916 /* This blob I/O is frozen */ 2917 enum spdk_blob_op_type op_type; 2918 spdk_bs_user_op_t *op; 2919 struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); 2920 2921 op_type = read ? SPDK_BLOB_READV : SPDK_BLOB_WRITEV; 2922 op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length); 2923 if (!op) { 2924 cb_fn(cb_arg, -ENOMEM); 2925 return; 2926 } 2927 2928 TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); 2929 2930 return; 2931 } 2932 2933 is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); 2934 2935 if (read) { 2936 spdk_bs_sequence_t *seq; 2937 2938 seq = bs_sequence_start(_channel, &cpl); 2939 if (!seq) { 2940 cb_fn(cb_arg, -ENOMEM); 2941 return; 2942 } 2943 2944 if (is_allocated) { 2945 bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2946 } else { 2947 bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, 2948 rw_iov_done, NULL); 2949 } 2950 } else { 2951 if (is_allocated) { 2952 spdk_bs_sequence_t *seq; 2953 2954 seq = bs_sequence_start(_channel, &cpl); 2955 if (!seq) { 2956 cb_fn(cb_arg, -ENOMEM); 2957 return; 2958 } 2959 2960 bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL); 2961 } else { 2962 /* Queue this operation and allocate the cluster */ 2963 spdk_bs_user_op_t *op; 2964 2965 op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, 2966 length); 2967 if (!op) { 2968 cb_fn(cb_arg, -ENOMEM); 2969 return; 2970 } 2971 2972 bs_allocate_and_copy_cluster(blob, _channel, offset, op); 2973 } 2974 } 2975 } else { 2976 struct rw_iov_ctx *ctx; 2977 2978 ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); 2979 if (ctx == NULL) { 2980 cb_fn(cb_arg, -ENOMEM); 2981 return; 2982 } 2983 2984 ctx->blob = blob; 2985 ctx->channel = _channel; 2986 ctx->cb_fn = cb_fn; 2987 ctx->cb_arg = cb_arg; 2988 ctx->read = read; 2989 ctx->orig_iov = iov; 2990 ctx->iovcnt = iovcnt; 2991 ctx->io_unit_offset = offset; 2992 ctx->io_units_remaining = length; 2993 ctx->io_units_done = 0; 2994 2995 rw_iov_split_next(ctx, 0); 2996 } 2997 } 2998 2999 static struct spdk_blob * 3000 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) 3001 { 3002 struct spdk_blob find; 3003 3004 if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) { 3005 return NULL; 3006 } 3007 3008 find.id = blobid; 3009 return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find); 3010 } 3011 3012 static void 3013 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob, 3014 struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry) 3015 { 3016 assert(blob != NULL); 3017 *snapshot_entry = NULL; 3018 *clone_entry = NULL; 3019 3020 if (blob->parent_id == SPDK_BLOBID_INVALID) { 3021 return; 3022 } 
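/* Purely in-memory bookkeeping: find the snapshot list entry whose id matches
 * this blob's parent, then (if present) this blob's own clone entry on that
 * snapshot's clone list.
 */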
3023 3024 TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) { 3025 if ((*snapshot_entry)->id == blob->parent_id) { 3026 break; 3027 } 3028 } 3029 3030 if (*snapshot_entry != NULL) { 3031 TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) { 3032 if ((*clone_entry)->id == blob->id) { 3033 break; 3034 } 3035 } 3036 3037 assert(*clone_entry != NULL); 3038 } 3039 } 3040 3041 static int 3042 bs_channel_create(void *io_device, void *ctx_buf) 3043 { 3044 struct spdk_blob_store *bs = io_device; 3045 struct spdk_bs_channel *channel = ctx_buf; 3046 struct spdk_bs_dev *dev; 3047 uint32_t max_ops = bs->max_channel_ops; 3048 uint32_t i; 3049 3050 dev = bs->dev; 3051 3052 channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); 3053 if (!channel->req_mem) { 3054 return -1; 3055 } 3056 3057 TAILQ_INIT(&channel->reqs); 3058 3059 for (i = 0; i < max_ops; i++) { 3060 TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); 3061 } 3062 3063 channel->bs = bs; 3064 channel->dev = dev; 3065 channel->dev_channel = dev->create_channel(dev); 3066 3067 if (!channel->dev_channel) { 3068 SPDK_ERRLOG("Failed to create device channel.\n"); 3069 free(channel->req_mem); 3070 return -1; 3071 } 3072 3073 TAILQ_INIT(&channel->need_cluster_alloc); 3074 TAILQ_INIT(&channel->queued_io); 3075 3076 return 0; 3077 } 3078 3079 static void 3080 bs_channel_destroy(void *io_device, void *ctx_buf) 3081 { 3082 struct spdk_bs_channel *channel = ctx_buf; 3083 spdk_bs_user_op_t *op; 3084 3085 while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { 3086 op = TAILQ_FIRST(&channel->need_cluster_alloc); 3087 TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); 3088 bs_user_op_abort(op); 3089 } 3090 3091 while (!TAILQ_EMPTY(&channel->queued_io)) { 3092 op = TAILQ_FIRST(&channel->queued_io); 3093 TAILQ_REMOVE(&channel->queued_io, op, link); 3094 bs_user_op_abort(op); 3095 } 3096 3097 free(channel->req_mem); 3098 channel->dev->destroy_channel(channel->dev, channel->dev_channel); 3099 } 3100 3101 static void 3102 bs_dev_destroy(void *io_device) 3103 { 3104 struct spdk_blob_store *bs = io_device; 3105 struct spdk_blob *blob, *blob_tmp; 3106 3107 bs->dev->destroy(bs->dev); 3108 3109 RB_FOREACH_SAFE(blob, spdk_blob_tree, &bs->open_blobs, blob_tmp) { 3110 RB_REMOVE(spdk_blob_tree, &bs->open_blobs, blob); 3111 spdk_bit_array_clear(bs->open_blobids, blob->id); 3112 blob_free(blob); 3113 } 3114 3115 pthread_mutex_destroy(&bs->used_clusters_mutex); 3116 3117 spdk_bit_array_free(&bs->open_blobids); 3118 spdk_bit_array_free(&bs->used_blobids); 3119 spdk_bit_array_free(&bs->used_md_pages); 3120 spdk_bit_pool_free(&bs->used_clusters); 3121 /* 3122 * If this function is called for any reason except a successful unload, 3123 * the unload_cpl type will be NONE and this will be a nop. 
3124 */ 3125 bs_call_cpl(&bs->unload_cpl, bs->unload_err); 3126 3127 free(bs); 3128 } 3129 3130 static int 3131 bs_blob_list_add(struct spdk_blob *blob) 3132 { 3133 spdk_blob_id snapshot_id; 3134 struct spdk_blob_list *snapshot_entry = NULL; 3135 struct spdk_blob_list *clone_entry = NULL; 3136 3137 assert(blob != NULL); 3138 3139 snapshot_id = blob->parent_id; 3140 if (snapshot_id == SPDK_BLOBID_INVALID) { 3141 return 0; 3142 } 3143 3144 snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id); 3145 if (snapshot_entry == NULL) { 3146 /* Snapshot not found */ 3147 snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); 3148 if (snapshot_entry == NULL) { 3149 return -ENOMEM; 3150 } 3151 snapshot_entry->id = snapshot_id; 3152 TAILQ_INIT(&snapshot_entry->clones); 3153 TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); 3154 } else { 3155 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 3156 if (clone_entry->id == blob->id) { 3157 break; 3158 } 3159 } 3160 } 3161 3162 if (clone_entry == NULL) { 3163 /* Clone not found */ 3164 clone_entry = calloc(1, sizeof(struct spdk_blob_list)); 3165 if (clone_entry == NULL) { 3166 return -ENOMEM; 3167 } 3168 clone_entry->id = blob->id; 3169 TAILQ_INIT(&clone_entry->clones); 3170 TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); 3171 snapshot_entry->clone_count++; 3172 } 3173 3174 return 0; 3175 } 3176 3177 static void 3178 bs_blob_list_remove(struct spdk_blob *blob) 3179 { 3180 struct spdk_blob_list *snapshot_entry = NULL; 3181 struct spdk_blob_list *clone_entry = NULL; 3182 3183 blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry); 3184 3185 if (snapshot_entry == NULL) { 3186 return; 3187 } 3188 3189 blob->parent_id = SPDK_BLOBID_INVALID; 3190 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3191 free(clone_entry); 3192 3193 snapshot_entry->clone_count--; 3194 } 3195 3196 static int 3197 bs_blob_list_free(struct spdk_blob_store *bs) 3198 { 3199 struct spdk_blob_list *snapshot_entry; 3200 struct spdk_blob_list *snapshot_entry_tmp; 3201 struct spdk_blob_list *clone_entry; 3202 struct spdk_blob_list *clone_entry_tmp; 3203 3204 TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) { 3205 TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { 3206 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 3207 free(clone_entry); 3208 } 3209 TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); 3210 free(snapshot_entry); 3211 } 3212 3213 return 0; 3214 } 3215 3216 static void 3217 bs_free(struct spdk_blob_store *bs) 3218 { 3219 bs_blob_list_free(bs); 3220 3221 bs_unregister_md_thread(bs); 3222 spdk_io_device_unregister(bs, bs_dev_destroy); 3223 } 3224 3225 void 3226 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size) 3227 { 3228 3229 if (!opts) { 3230 SPDK_ERRLOG("opts should not be NULL\n"); 3231 return; 3232 } 3233 3234 if (!opts_size) { 3235 SPDK_ERRLOG("opts_size should not be zero value\n"); 3236 return; 3237 } 3238 3239 memset(opts, 0, opts_size); 3240 opts->opts_size = opts_size; 3241 3242 #define FIELD_OK(field) \ 3243 offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size 3244 3245 #define SET_FIELD(field, value) \ 3246 if (FIELD_OK(field)) { \ 3247 opts->field = value; \ 3248 } \ 3249 3250 SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ); 3251 SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3252 SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES); 3253 SET_FIELD(max_channel_ops, 
SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS); 3254 SET_FIELD(clear_method, BS_CLEAR_WITH_UNMAP); 3255 3256 if (FIELD_OK(bstype)) { 3257 memset(&opts->bstype, 0, sizeof(opts->bstype)); 3258 } 3259 3260 SET_FIELD(iter_cb_fn, NULL); 3261 SET_FIELD(iter_cb_arg, NULL); 3262 3263 #undef FIELD_OK 3264 #undef SET_FIELD 3265 } 3266 3267 static int 3268 bs_opts_verify(struct spdk_bs_opts *opts) 3269 { 3270 if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || 3271 opts->max_channel_ops == 0) { 3272 SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); 3273 return -1; 3274 } 3275 3276 return 0; 3277 } 3278 3279 /* START spdk_bs_load */ 3280 3281 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */ 3282 3283 struct spdk_bs_load_ctx { 3284 struct spdk_blob_store *bs; 3285 struct spdk_bs_super_block *super; 3286 3287 struct spdk_bs_md_mask *mask; 3288 bool in_page_chain; 3289 uint32_t page_index; 3290 uint32_t cur_page; 3291 struct spdk_blob_md_page *page; 3292 3293 uint64_t num_extent_pages; 3294 uint32_t *extent_page_num; 3295 struct spdk_blob_md_page *extent_pages; 3296 struct spdk_bit_array *used_clusters; 3297 3298 spdk_bs_sequence_t *seq; 3299 spdk_blob_op_with_handle_complete iter_cb_fn; 3300 void *iter_cb_arg; 3301 struct spdk_blob *blob; 3302 spdk_blob_id blobid; 3303 3304 /* These fields are used in the spdk_bs_dump path. */ 3305 FILE *fp; 3306 spdk_bs_dump_print_xattr print_xattr_fn; 3307 char xattr_name[4096]; 3308 }; 3309 3310 static int 3311 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs, 3312 struct spdk_bs_load_ctx **_ctx) 3313 { 3314 struct spdk_blob_store *bs; 3315 struct spdk_bs_load_ctx *ctx; 3316 uint64_t dev_size; 3317 int rc; 3318 3319 dev_size = dev->blocklen * dev->blockcnt; 3320 if (dev_size < opts->cluster_sz) { 3321 /* Device size cannot be smaller than cluster size of blobstore */ 3322 SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", 3323 dev_size, opts->cluster_sz); 3324 return -ENOSPC; 3325 } 3326 if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { 3327 /* Cluster size cannot be smaller than page size */ 3328 SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", 3329 opts->cluster_sz, SPDK_BS_PAGE_SIZE); 3330 return -EINVAL; 3331 } 3332 bs = calloc(1, sizeof(struct spdk_blob_store)); 3333 if (!bs) { 3334 return -ENOMEM; 3335 } 3336 3337 ctx = calloc(1, sizeof(struct spdk_bs_load_ctx)); 3338 if (!ctx) { 3339 free(bs); 3340 return -ENOMEM; 3341 } 3342 3343 ctx->bs = bs; 3344 ctx->iter_cb_fn = opts->iter_cb_fn; 3345 ctx->iter_cb_arg = opts->iter_cb_arg; 3346 3347 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 3348 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3349 if (!ctx->super) { 3350 free(ctx); 3351 free(bs); 3352 return -ENOMEM; 3353 } 3354 3355 RB_INIT(&bs->open_blobs); 3356 TAILQ_INIT(&bs->snapshots); 3357 bs->dev = dev; 3358 bs->md_thread = spdk_get_thread(); 3359 assert(bs->md_thread != NULL); 3360 3361 /* 3362 * Do not use bs_lba_to_cluster() here since blockcnt may not be an 3363 * even multiple of the cluster size. 
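 * Illustrative numbers only: with 4096-byte blocks and a 1 MiB cluster_sz,
 * total_clusters works out to blockcnt / 256, so any trailing partial
 * cluster at the end of the device is simply not counted.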
3364 */ 3365 bs->cluster_sz = opts->cluster_sz; 3366 bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); 3367 ctx->used_clusters = spdk_bit_array_create(bs->total_clusters); 3368 if (!ctx->used_clusters) { 3369 spdk_free(ctx->super); 3370 free(ctx); 3371 free(bs); 3372 return -ENOMEM; 3373 } 3374 3375 bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; 3376 if (spdk_u32_is_pow2(bs->pages_per_cluster)) { 3377 bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster); 3378 } 3379 bs->num_free_clusters = bs->total_clusters; 3380 bs->io_unit_size = dev->blocklen; 3381 3382 bs->max_channel_ops = opts->max_channel_ops; 3383 bs->super_blob = SPDK_BLOBID_INVALID; 3384 memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); 3385 3386 /* The metadata is assumed to be at least 1 page */ 3387 bs->used_md_pages = spdk_bit_array_create(1); 3388 bs->used_blobids = spdk_bit_array_create(0); 3389 bs->open_blobids = spdk_bit_array_create(0); 3390 3391 pthread_mutex_init(&bs->used_clusters_mutex, NULL); 3392 3393 spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy, 3394 sizeof(struct spdk_bs_channel), "blobstore"); 3395 rc = bs_register_md_thread(bs); 3396 if (rc == -1) { 3397 spdk_io_device_unregister(bs, NULL); 3398 pthread_mutex_destroy(&bs->used_clusters_mutex); 3399 spdk_bit_array_free(&bs->open_blobids); 3400 spdk_bit_array_free(&bs->used_blobids); 3401 spdk_bit_array_free(&bs->used_md_pages); 3402 spdk_bit_array_free(&ctx->used_clusters); 3403 spdk_free(ctx->super); 3404 free(ctx); 3405 free(bs); 3406 /* FIXME: this is a lie but don't know how to get a proper error code here */ 3407 return -ENOMEM; 3408 } 3409 3410 *_ctx = ctx; 3411 *_bs = bs; 3412 return 0; 3413 } 3414 3415 static void 3416 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno) 3417 { 3418 assert(bserrno != 0); 3419 3420 spdk_free(ctx->super); 3421 bs_sequence_finish(ctx->seq, bserrno); 3422 bs_free(ctx->bs); 3423 spdk_bit_array_free(&ctx->used_clusters); 3424 free(ctx); 3425 } 3426 3427 static void 3428 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, 3429 struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) 3430 { 3431 /* Update the values in the super block */ 3432 super->super_blob = bs->super_blob; 3433 memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); 3434 super->crc = blob_md_page_calc_crc(super); 3435 bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0), 3436 bs_byte_to_lba(bs, sizeof(*super)), 3437 cb_fn, cb_arg); 3438 } 3439 3440 static void 3441 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3442 { 3443 struct spdk_bs_load_ctx *ctx = arg; 3444 uint64_t mask_size, lba, lba_count; 3445 3446 /* Write out the used clusters mask */ 3447 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3448 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3449 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3450 if (!ctx->mask) { 3451 bs_load_ctx_fail(ctx, -ENOMEM); 3452 return; 3453 } 3454 3455 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; 3456 ctx->mask->length = ctx->bs->total_clusters; 3457 /* We could get here through the normal unload path, or through dirty 3458 * shutdown recovery. For the normal unload path, we use the mask from 3459 * the bit pool. For dirty shutdown recovery, we don't have a bit pool yet - 3460 * only the bit array from the load ctx. 
3461 */ 3462 if (ctx->bs->used_clusters) { 3463 assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters)); 3464 spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask); 3465 } else { 3466 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters)); 3467 spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask); 3468 } 3469 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3470 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3471 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3472 } 3473 3474 static void 3475 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3476 { 3477 struct spdk_bs_load_ctx *ctx = arg; 3478 uint64_t mask_size, lba, lba_count; 3479 3480 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3481 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3482 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3483 if (!ctx->mask) { 3484 bs_load_ctx_fail(ctx, -ENOMEM); 3485 return; 3486 } 3487 3488 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; 3489 ctx->mask->length = ctx->super->md_len; 3490 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); 3491 3492 spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3493 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3494 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3495 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3496 } 3497 3498 static void 3499 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) 3500 { 3501 struct spdk_bs_load_ctx *ctx = arg; 3502 uint64_t mask_size, lba, lba_count; 3503 3504 if (ctx->super->used_blobid_mask_len == 0) { 3505 /* 3506 * This is a pre-v3 on-disk format where the blobid mask does not get 3507 * written to disk. 
3508 */ 3509 cb_fn(seq, arg, 0); 3510 return; 3511 } 3512 3513 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3514 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3515 SPDK_MALLOC_DMA); 3516 if (!ctx->mask) { 3517 bs_load_ctx_fail(ctx, -ENOMEM); 3518 return; 3519 } 3520 3521 ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS; 3522 ctx->mask->length = ctx->super->md_len; 3523 assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); 3524 3525 spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask); 3526 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3527 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3528 bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); 3529 } 3530 3531 static void 3532 blob_set_thin_provision(struct spdk_blob *blob) 3533 { 3534 blob_verify_md_op(blob); 3535 blob->invalid_flags |= SPDK_BLOB_THIN_PROV; 3536 blob->state = SPDK_BLOB_STATE_DIRTY; 3537 } 3538 3539 static void 3540 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method) 3541 { 3542 blob_verify_md_op(blob); 3543 blob->clear_method = clear_method; 3544 blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT); 3545 blob->state = SPDK_BLOB_STATE_DIRTY; 3546 } 3547 3548 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno); 3549 3550 static void 3551 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno) 3552 { 3553 struct spdk_bs_load_ctx *ctx = cb_arg; 3554 spdk_blob_id id; 3555 int64_t page_num; 3556 3557 /* Iterate to next blob (we can't use spdk_bs_iter_next function as our 3558 * last blob has been removed */ 3559 page_num = bs_blobid_to_page(ctx->blobid); 3560 page_num++; 3561 page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num); 3562 if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) { 3563 bs_load_iter(ctx, NULL, -ENOENT); 3564 return; 3565 } 3566 3567 id = bs_page_to_blobid(page_num); 3568 3569 spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx); 3570 } 3571 3572 static void 3573 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno) 3574 { 3575 struct spdk_bs_load_ctx *ctx = cb_arg; 3576 3577 if (bserrno != 0) { 3578 SPDK_ERRLOG("Failed to close corrupted blob\n"); 3579 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3580 return; 3581 } 3582 3583 spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx); 3584 } 3585 3586 static void 3587 bs_delete_corrupted_blob(void *cb_arg, int bserrno) 3588 { 3589 struct spdk_bs_load_ctx *ctx = cb_arg; 3590 uint64_t i; 3591 3592 if (bserrno != 0) { 3593 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3594 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3595 return; 3596 } 3597 3598 /* Snapshot and clone have the same copy of cluster map and extent pages 3599 * at this point. Let's clear both for snapshot now, 3600 * so that it won't be cleared for clone later when we remove snapshot. 
3601 * Also set thin provision to pass data corruption check */ 3602 for (i = 0; i < ctx->blob->active.num_clusters; i++) { 3603 ctx->blob->active.clusters[i] = 0; 3604 } 3605 for (i = 0; i < ctx->blob->active.num_extent_pages; i++) { 3606 ctx->blob->active.extent_pages[i] = 0; 3607 } 3608 3609 ctx->blob->md_ro = false; 3610 3611 blob_set_thin_provision(ctx->blob); 3612 3613 ctx->blobid = ctx->blob->id; 3614 3615 spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx); 3616 } 3617 3618 static void 3619 bs_update_corrupted_blob(void *cb_arg, int bserrno) 3620 { 3621 struct spdk_bs_load_ctx *ctx = cb_arg; 3622 3623 if (bserrno != 0) { 3624 SPDK_ERRLOG("Failed to close clone of a corrupted blob\n"); 3625 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3626 return; 3627 } 3628 3629 ctx->blob->md_ro = false; 3630 blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true); 3631 blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true); 3632 spdk_blob_set_read_only(ctx->blob); 3633 3634 if (ctx->iter_cb_fn) { 3635 ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0); 3636 } 3637 bs_blob_list_add(ctx->blob); 3638 3639 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3640 } 3641 3642 static void 3643 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno) 3644 { 3645 struct spdk_bs_load_ctx *ctx = cb_arg; 3646 3647 if (bserrno != 0) { 3648 SPDK_ERRLOG("Failed to open clone of a corrupted blob\n"); 3649 spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx); 3650 return; 3651 } 3652 3653 if (blob->parent_id == ctx->blob->id) { 3654 /* Power failure occurred before updating clone (snapshot delete case) 3655 * or after updating clone (creating snapshot case) - keep snapshot */ 3656 spdk_blob_close(blob, bs_update_corrupted_blob, ctx); 3657 } else { 3658 /* Power failure occurred after updating clone (snapshot delete case) 3659 * or before updating clone (creating snapshot case) - remove snapshot */ 3660 spdk_blob_close(blob, bs_delete_corrupted_blob, ctx); 3661 } 3662 } 3663 3664 static void 3665 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) 3666 { 3667 struct spdk_bs_load_ctx *ctx = arg; 3668 const void *value; 3669 size_t len; 3670 int rc = 0; 3671 3672 if (bserrno == 0) { 3673 /* Examine blob if it is corrupted after power failure. Fix 3674 * the ones that can be fixed and remove any other corrupted 3675 * ones. If it is not corrupted just process it */ 3676 rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true); 3677 if (rc != 0) { 3678 rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true); 3679 if (rc != 0) { 3680 /* Not corrupted - process it and continue with iterating through blobs */ 3681 if (ctx->iter_cb_fn) { 3682 ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0); 3683 } 3684 bs_blob_list_add(blob); 3685 spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx); 3686 return; 3687 } 3688 3689 } 3690 3691 assert(len == sizeof(spdk_blob_id)); 3692 3693 ctx->blob = blob; 3694 3695 /* Open clone to check if we are able to fix this blob or should we remove it */ 3696 spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx); 3697 return; 3698 } else if (bserrno == -ENOENT) { 3699 bserrno = 0; 3700 } else { 3701 /* 3702 * This case needs to be looked at further. Same problem 3703 * exists with applications that rely on explicit blob 3704 * iteration. We should just skip the blob that failed 3705 * to load and continue on to the next one. 
3706 */ 3707 SPDK_ERRLOG("Error in iterating blobs\n"); 3708 } 3709 3710 ctx->iter_cb_fn = NULL; 3711 3712 spdk_free(ctx->super); 3713 spdk_free(ctx->mask); 3714 bs_sequence_finish(ctx->seq, bserrno); 3715 free(ctx); 3716 } 3717 3718 static void 3719 bs_load_complete(struct spdk_bs_load_ctx *ctx) 3720 { 3721 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 3722 spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx); 3723 } 3724 3725 static void 3726 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3727 { 3728 struct spdk_bs_load_ctx *ctx = cb_arg; 3729 int rc; 3730 3731 /* The type must be correct */ 3732 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); 3733 3734 /* The length of the mask (in bits) must not be greater than 3735 * the length of the buffer (converted to bits) */ 3736 assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); 3737 3738 /* The length of the mask must be exactly equal to the size 3739 * (in pages) of the metadata region */ 3740 assert(ctx->mask->length == ctx->super->md_len); 3741 3742 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length); 3743 if (rc < 0) { 3744 spdk_free(ctx->mask); 3745 bs_load_ctx_fail(ctx, rc); 3746 return; 3747 } 3748 3749 spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask); 3750 bs_load_complete(ctx); 3751 } 3752 3753 static void 3754 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3755 { 3756 struct spdk_bs_load_ctx *ctx = cb_arg; 3757 uint64_t lba, lba_count, mask_size; 3758 int rc; 3759 3760 if (bserrno != 0) { 3761 bs_load_ctx_fail(ctx, bserrno); 3762 return; 3763 } 3764 3765 /* The type must be correct */ 3766 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); 3767 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3768 assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( 3769 struct spdk_blob_md_page) * 8)); 3770 /* The length of the mask must be exactly equal to the total number of clusters */ 3771 assert(ctx->mask->length == ctx->bs->total_clusters); 3772 3773 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length); 3774 if (rc < 0) { 3775 spdk_free(ctx->mask); 3776 bs_load_ctx_fail(ctx, rc); 3777 return; 3778 } 3779 3780 spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask); 3781 ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters); 3782 assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); 3783 3784 spdk_free(ctx->mask); 3785 3786 /* Read the used blobids mask */ 3787 mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; 3788 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3789 SPDK_MALLOC_DMA); 3790 if (!ctx->mask) { 3791 bs_load_ctx_fail(ctx, -ENOMEM); 3792 return; 3793 } 3794 lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); 3795 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); 3796 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3797 bs_load_used_blobids_cpl, ctx); 3798 } 3799 3800 static void 3801 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 3802 { 3803 struct spdk_bs_load_ctx *ctx = cb_arg; 3804 uint64_t lba, lba_count, mask_size; 3805 int rc; 3806 3807 if (bserrno != 0) { 3808 bs_load_ctx_fail(ctx, bserrno); 3809 return; 3810 } 3811 3812 /* The type must be correct */ 3813 assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); 
3814 /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ 3815 assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * 3816 8)); 3817 /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ 3818 if (ctx->mask->length != ctx->super->md_len) { 3819 SPDK_ERRLOG("mismatched md_len in used_pages mask: " 3820 "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n", 3821 ctx->mask->length, ctx->super->md_len); 3822 assert(false); 3823 } 3824 3825 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length); 3826 if (rc < 0) { 3827 spdk_free(ctx->mask); 3828 bs_load_ctx_fail(ctx, rc); 3829 return; 3830 } 3831 3832 spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask); 3833 spdk_free(ctx->mask); 3834 3835 /* Read the used clusters mask */ 3836 mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; 3837 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, 3838 SPDK_MALLOC_DMA); 3839 if (!ctx->mask) { 3840 bs_load_ctx_fail(ctx, -ENOMEM); 3841 return; 3842 } 3843 lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); 3844 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); 3845 bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, 3846 bs_load_used_clusters_cpl, ctx); 3847 } 3848 3849 static void 3850 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx) 3851 { 3852 uint64_t lba, lba_count, mask_size; 3853 3854 /* Read the used pages mask */ 3855 mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; 3856 ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, 3857 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 3858 if (!ctx->mask) { 3859 bs_load_ctx_fail(ctx, -ENOMEM); 3860 return; 3861 } 3862 3863 lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); 3864 lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); 3865 bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count, 3866 bs_load_used_pages_cpl, ctx); 3867 } 3868 3869 static int 3870 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page) 3871 { 3872 struct spdk_blob_store *bs = ctx->bs; 3873 struct spdk_blob_md_descriptor *desc; 3874 size_t cur_desc = 0; 3875 3876 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 3877 while (cur_desc < sizeof(page->descriptors)) { 3878 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 3879 if (desc->length == 0) { 3880 /* If padding and length are 0, this terminates the page */ 3881 break; 3882 } 3883 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 3884 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 3885 unsigned int i, j; 3886 unsigned int cluster_count = 0; 3887 uint32_t cluster_idx; 3888 3889 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 3890 3891 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 3892 for (j = 0; j < desc_extent_rle->extents[i].length; j++) { 3893 cluster_idx = desc_extent_rle->extents[i].cluster_idx; 3894 /* 3895 * cluster_idx = 0 means an unallocated cluster - don't mark that 3896 * in the used cluster map. 
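 * Thin-provisioned blobs persist a zero for every cluster that has not
 * been allocated yet, so only nonzero entries are charged against the
 * free cluster count below.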
3897 */ 3898 if (cluster_idx != 0) { 3899 spdk_bit_array_set(ctx->used_clusters, cluster_idx + j); 3900 if (bs->num_free_clusters == 0) { 3901 return -ENOSPC; 3902 } 3903 bs->num_free_clusters--; 3904 } 3905 cluster_count++; 3906 } 3907 } 3908 if (cluster_count == 0) { 3909 return -EINVAL; 3910 } 3911 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 3912 struct spdk_blob_md_descriptor_extent_page *desc_extent; 3913 uint32_t i; 3914 uint32_t cluster_count = 0; 3915 uint32_t cluster_idx; 3916 size_t cluster_idx_length; 3917 3918 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 3919 cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx); 3920 3921 if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) || 3922 (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) { 3923 return -EINVAL; 3924 } 3925 3926 for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) { 3927 cluster_idx = desc_extent->cluster_idx[i]; 3928 /* 3929 * cluster_idx = 0 means an unallocated cluster - don't mark that 3930 * in the used cluster map. 3931 */ 3932 if (cluster_idx != 0) { 3933 if (cluster_idx < desc_extent->start_cluster_idx && 3934 cluster_idx >= desc_extent->start_cluster_idx + cluster_count) { 3935 return -EINVAL; 3936 } 3937 spdk_bit_array_set(ctx->used_clusters, cluster_idx); 3938 if (bs->num_free_clusters == 0) { 3939 return -ENOSPC; 3940 } 3941 bs->num_free_clusters--; 3942 } 3943 cluster_count++; 3944 } 3945 3946 if (cluster_count == 0) { 3947 return -EINVAL; 3948 } 3949 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 3950 /* Skip this item */ 3951 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 3952 /* Skip this item */ 3953 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 3954 /* Skip this item */ 3955 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) { 3956 struct spdk_blob_md_descriptor_extent_table *desc_extent_table; 3957 uint32_t num_extent_pages = ctx->num_extent_pages; 3958 uint32_t i; 3959 size_t extent_pages_length; 3960 void *tmp; 3961 3962 desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc; 3963 extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters); 3964 3965 if (desc_extent_table->length == 0 || 3966 (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) { 3967 return -EINVAL; 3968 } 3969 3970 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 3971 if (desc_extent_table->extent_page[i].page_idx != 0) { 3972 if (desc_extent_table->extent_page[i].num_pages != 1) { 3973 return -EINVAL; 3974 } 3975 num_extent_pages += 1; 3976 } 3977 } 3978 3979 if (num_extent_pages > 0) { 3980 tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t)); 3981 if (tmp == NULL) { 3982 return -ENOMEM; 3983 } 3984 ctx->extent_page_num = tmp; 3985 3986 /* Extent table entries contain md page numbers for extent pages. 3987 * Zeroes represent unallocated extent pages, those are run-length-encoded. 
3988 */ 3989 for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) { 3990 if (desc_extent_table->extent_page[i].page_idx != 0) { 3991 ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx; 3992 ctx->num_extent_pages += 1; 3993 } 3994 } 3995 } 3996 } else { 3997 /* Error */ 3998 return -EINVAL; 3999 } 4000 /* Advance to the next descriptor */ 4001 cur_desc += sizeof(*desc) + desc->length; 4002 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4003 break; 4004 } 4005 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4006 } 4007 return 0; 4008 } 4009 4010 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page) 4011 { 4012 uint32_t crc; 4013 struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4014 size_t desc_len; 4015 4016 crc = blob_md_page_calc_crc(page); 4017 if (crc != page->crc) { 4018 return false; 4019 } 4020 4021 /* Extent page should always be of sequence num 0. */ 4022 if (page->sequence_num != 0) { 4023 return false; 4024 } 4025 4026 /* Descriptor type must be EXTENT_PAGE. */ 4027 if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4028 return false; 4029 } 4030 4031 /* Descriptor length cannot exceed the page. */ 4032 desc_len = sizeof(*desc) + desc->length; 4033 if (desc_len > sizeof(page->descriptors)) { 4034 return false; 4035 } 4036 4037 /* It has to be the only descriptor in the page. */ 4038 if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) { 4039 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len); 4040 if (desc->length != 0) { 4041 return false; 4042 } 4043 } 4044 4045 return true; 4046 } 4047 4048 static bool bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) 4049 { 4050 uint32_t crc; 4051 struct spdk_blob_md_page *page = ctx->page; 4052 4053 crc = blob_md_page_calc_crc(page); 4054 if (crc != page->crc) { 4055 return false; 4056 } 4057 4058 /* First page of a sequence should match the blobid. 
*/ 4059 if (page->sequence_num == 0 && 4060 bs_page_to_blobid(ctx->cur_page) != page->id) { 4061 return false; 4062 } 4063 assert(bs_load_cur_extent_page_valid(page) == false); 4064 4065 return true; 4066 } 4067 4068 static void 4069 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx); 4070 4071 static void 4072 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4073 { 4074 struct spdk_bs_load_ctx *ctx = cb_arg; 4075 4076 if (bserrno != 0) { 4077 bs_load_ctx_fail(ctx, bserrno); 4078 return; 4079 } 4080 4081 bs_load_complete(ctx); 4082 } 4083 4084 static void 4085 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4086 { 4087 struct spdk_bs_load_ctx *ctx = cb_arg; 4088 4089 spdk_free(ctx->mask); 4090 ctx->mask = NULL; 4091 4092 if (bserrno != 0) { 4093 bs_load_ctx_fail(ctx, bserrno); 4094 return; 4095 } 4096 4097 bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl); 4098 } 4099 4100 static void 4101 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4102 { 4103 struct spdk_bs_load_ctx *ctx = cb_arg; 4104 4105 spdk_free(ctx->mask); 4106 ctx->mask = NULL; 4107 4108 if (bserrno != 0) { 4109 bs_load_ctx_fail(ctx, bserrno); 4110 return; 4111 } 4112 4113 bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl); 4114 } 4115 4116 static void 4117 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx) 4118 { 4119 bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl); 4120 } 4121 4122 static void 4123 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx) 4124 { 4125 uint64_t num_md_clusters; 4126 uint64_t i; 4127 4128 ctx->in_page_chain = false; 4129 4130 do { 4131 ctx->page_index++; 4132 } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); 4133 4134 if (ctx->page_index < ctx->super->md_len) { 4135 ctx->cur_page = ctx->page_index; 4136 bs_load_replay_cur_md_page(ctx); 4137 } else { 4138 /* Claim all of the clusters used by the metadata */ 4139 num_md_clusters = spdk_divide_round_up( 4140 ctx->super->md_start + ctx->super->md_len, ctx->bs->pages_per_cluster); 4141 for (i = 0; i < num_md_clusters; i++) { 4142 spdk_bit_array_set(ctx->used_clusters, i); 4143 } 4144 ctx->bs->num_free_clusters -= num_md_clusters; 4145 spdk_free(ctx->page); 4146 bs_load_write_used_md(ctx); 4147 } 4148 } 4149 4150 static void 4151 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4152 { 4153 struct spdk_bs_load_ctx *ctx = cb_arg; 4154 uint32_t page_num; 4155 uint64_t i; 4156 4157 if (bserrno != 0) { 4158 spdk_free(ctx->extent_pages); 4159 bs_load_ctx_fail(ctx, bserrno); 4160 return; 4161 } 4162 4163 for (i = 0; i < ctx->num_extent_pages; i++) { 4164 /* Extent pages are only read when present within in chain md. 4165 * Integrity of md is not right if that page was not a valid extent page. 
*/ 4166 if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) { 4167 spdk_free(ctx->extent_pages); 4168 bs_load_ctx_fail(ctx, -EILSEQ); 4169 return; 4170 } 4171 4172 page_num = ctx->extent_page_num[i]; 4173 spdk_bit_array_set(ctx->bs->used_md_pages, page_num); 4174 if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) { 4175 spdk_free(ctx->extent_pages); 4176 bs_load_ctx_fail(ctx, -EILSEQ); 4177 return; 4178 } 4179 } 4180 4181 spdk_free(ctx->extent_pages); 4182 free(ctx->extent_page_num); 4183 ctx->extent_page_num = NULL; 4184 ctx->num_extent_pages = 0; 4185 4186 bs_load_replay_md_chain_cpl(ctx); 4187 } 4188 4189 static void 4190 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx) 4191 { 4192 spdk_bs_batch_t *batch; 4193 uint32_t page; 4194 uint64_t lba; 4195 uint64_t i; 4196 4197 ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0, 4198 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4199 if (!ctx->extent_pages) { 4200 bs_load_ctx_fail(ctx, -ENOMEM); 4201 return; 4202 } 4203 4204 batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx); 4205 4206 for (i = 0; i < ctx->num_extent_pages; i++) { 4207 page = ctx->extent_page_num[i]; 4208 assert(page < ctx->super->md_len); 4209 lba = bs_md_page_to_lba(ctx->bs, page); 4210 bs_batch_read_dev(batch, &ctx->extent_pages[i], lba, 4211 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE)); 4212 } 4213 4214 bs_batch_close(batch); 4215 } 4216 4217 static void 4218 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4219 { 4220 struct spdk_bs_load_ctx *ctx = cb_arg; 4221 uint32_t page_num; 4222 struct spdk_blob_md_page *page; 4223 4224 if (bserrno != 0) { 4225 bs_load_ctx_fail(ctx, bserrno); 4226 return; 4227 } 4228 4229 page_num = ctx->cur_page; 4230 page = ctx->page; 4231 if (bs_load_cur_md_page_valid(ctx) == true) { 4232 if (page->sequence_num == 0 || ctx->in_page_chain == true) { 4233 bs_claim_md_page(ctx->bs, page_num); 4234 if (page->sequence_num == 0) { 4235 spdk_bit_array_set(ctx->bs->used_blobids, page_num); 4236 } 4237 if (bs_load_replay_md_parse_page(ctx, page)) { 4238 bs_load_ctx_fail(ctx, -EILSEQ); 4239 return; 4240 } 4241 if (page->next != SPDK_INVALID_MD_PAGE) { 4242 ctx->in_page_chain = true; 4243 ctx->cur_page = page->next; 4244 bs_load_replay_cur_md_page(ctx); 4245 return; 4246 } 4247 if (ctx->num_extent_pages != 0) { 4248 bs_load_replay_extent_pages(ctx); 4249 return; 4250 } 4251 } 4252 } 4253 bs_load_replay_md_chain_cpl(ctx); 4254 } 4255 4256 static void 4257 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx) 4258 { 4259 uint64_t lba; 4260 4261 assert(ctx->cur_page < ctx->super->md_len); 4262 lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page); 4263 bs_sequence_read_dev(ctx->seq, ctx->page, lba, 4264 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4265 bs_load_replay_md_cpl, ctx); 4266 } 4267 4268 static void 4269 bs_load_replay_md(struct spdk_bs_load_ctx *ctx) 4270 { 4271 ctx->page_index = 0; 4272 ctx->cur_page = 0; 4273 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4274 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4275 if (!ctx->page) { 4276 bs_load_ctx_fail(ctx, -ENOMEM); 4277 return; 4278 } 4279 bs_load_replay_cur_md_page(ctx); 4280 } 4281 4282 static void 4283 bs_recover(struct spdk_bs_load_ctx *ctx) 4284 { 4285 int rc; 4286 4287 rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); 4288 if (rc < 0) { 4289 bs_load_ctx_fail(ctx, -ENOMEM); 4290 return; 4291 } 4292 4293 rc = spdk_bit_array_resize(&ctx->bs->used_blobids, 
ctx->super->md_len); 4294 if (rc < 0) { 4295 bs_load_ctx_fail(ctx, -ENOMEM); 4296 return; 4297 } 4298 4299 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4300 if (rc < 0) { 4301 bs_load_ctx_fail(ctx, -ENOMEM); 4302 return; 4303 } 4304 4305 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len); 4306 if (rc < 0) { 4307 bs_load_ctx_fail(ctx, -ENOMEM); 4308 return; 4309 } 4310 4311 ctx->bs->num_free_clusters = ctx->bs->total_clusters; 4312 bs_load_replay_md(ctx); 4313 } 4314 4315 static void 4316 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4317 { 4318 struct spdk_bs_load_ctx *ctx = cb_arg; 4319 uint32_t crc; 4320 int rc; 4321 static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; 4322 4323 if (ctx->super->version > SPDK_BS_VERSION || 4324 ctx->super->version < SPDK_BS_INITIAL_VERSION) { 4325 bs_load_ctx_fail(ctx, -EILSEQ); 4326 return; 4327 } 4328 4329 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4330 sizeof(ctx->super->signature)) != 0) { 4331 bs_load_ctx_fail(ctx, -EILSEQ); 4332 return; 4333 } 4334 4335 crc = blob_md_page_calc_crc(ctx->super); 4336 if (crc != ctx->super->crc) { 4337 bs_load_ctx_fail(ctx, -EILSEQ); 4338 return; 4339 } 4340 4341 if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4342 SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n"); 4343 } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { 4344 SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n"); 4345 } else { 4346 SPDK_DEBUGLOG(blob, "Unexpected bstype\n"); 4347 SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4348 SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); 4349 bs_load_ctx_fail(ctx, -ENXIO); 4350 return; 4351 } 4352 4353 if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { 4354 SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n", 4355 ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); 4356 bs_load_ctx_fail(ctx, -EILSEQ); 4357 return; 4358 } 4359 4360 if (ctx->super->size == 0) { 4361 ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; 4362 } 4363 4364 if (ctx->super->io_unit_size == 0) { 4365 ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; 4366 } 4367 4368 /* Parse the super block */ 4369 ctx->bs->clean = 1; 4370 ctx->bs->cluster_sz = ctx->super->cluster_size; 4371 ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size; 4372 ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; 4373 if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) { 4374 ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster); 4375 } 4376 ctx->bs->io_unit_size = ctx->super->io_unit_size; 4377 rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters); 4378 if (rc < 0) { 4379 bs_load_ctx_fail(ctx, -ENOMEM); 4380 return; 4381 } 4382 ctx->bs->md_start = ctx->super->md_start; 4383 ctx->bs->md_len = ctx->super->md_len; 4384 rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len); 4385 if (rc < 0) { 4386 bs_load_ctx_fail(ctx, -ENOMEM); 4387 return; 4388 } 4389 4390 ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up( 4391 ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); 4392 ctx->bs->super_blob = ctx->super->super_blob; 4393 memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); 4394 
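/* If the blobstore was not shut down cleanly (clean == 0), or the super block
 * predates the used_blobids mask, the persisted allocation masks cannot be
 * trusted. In that case replay the metadata region to rebuild them; otherwise
 * read the masks that were written at the last clean unload. */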
4395 if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) { 4396 bs_recover(ctx); 4397 } else { 4398 bs_load_read_used_pages(ctx); 4399 } 4400 } 4401 4402 static inline int 4403 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst) 4404 { 4405 4406 if (!src->opts_size) { 4407 SPDK_ERRLOG("opts_size should not be zero value\n"); 4408 return -1; 4409 } 4410 4411 #define FIELD_OK(field) \ 4412 offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size 4413 4414 #define SET_FIELD(field) \ 4415 if (FIELD_OK(field)) { \ 4416 dst->field = src->field; \ 4417 } \ 4418 4419 SET_FIELD(cluster_sz); 4420 SET_FIELD(num_md_pages); 4421 SET_FIELD(max_md_ops); 4422 SET_FIELD(max_channel_ops); 4423 SET_FIELD(clear_method); 4424 4425 if (FIELD_OK(bstype)) { 4426 memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype)); 4427 } 4428 SET_FIELD(iter_cb_fn); 4429 SET_FIELD(iter_cb_arg); 4430 4431 dst->opts_size = src->opts_size; 4432 4433 /* You should not remove this statement, but need to update the assert statement 4434 * if you add a new field, and also add a corresponding SET_FIELD statement */ 4435 SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 64, "Incorrect size"); 4436 4437 #undef FIELD_OK 4438 #undef SET_FIELD 4439 4440 return 0; 4441 } 4442 4443 void 4444 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4445 spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) 4446 { 4447 struct spdk_blob_store *bs; 4448 struct spdk_bs_cpl cpl; 4449 struct spdk_bs_load_ctx *ctx; 4450 struct spdk_bs_opts opts = {}; 4451 int err; 4452 4453 SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev); 4454 4455 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4456 SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen); 4457 dev->destroy(dev); 4458 cb_fn(cb_arg, NULL, -EINVAL); 4459 return; 4460 } 4461 4462 spdk_bs_opts_init(&opts, sizeof(opts)); 4463 if (o) { 4464 if (bs_opts_copy(o, &opts)) { 4465 return; 4466 } 4467 } 4468 4469 if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { 4470 dev->destroy(dev); 4471 cb_fn(cb_arg, NULL, -EINVAL); 4472 return; 4473 } 4474 4475 err = bs_alloc(dev, &opts, &bs, &ctx); 4476 if (err) { 4477 dev->destroy(dev); 4478 cb_fn(cb_arg, NULL, err); 4479 return; 4480 } 4481 4482 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4483 cpl.u.bs_handle.cb_fn = cb_fn; 4484 cpl.u.bs_handle.cb_arg = cb_arg; 4485 cpl.u.bs_handle.bs = bs; 4486 4487 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 4488 if (!ctx->seq) { 4489 spdk_free(ctx->super); 4490 free(ctx); 4491 bs_free(bs); 4492 cb_fn(cb_arg, NULL, -ENOMEM); 4493 return; 4494 } 4495 4496 /* Read the super block */ 4497 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 4498 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4499 bs_load_super_cpl, ctx); 4500 } 4501 4502 /* END spdk_bs_load */ 4503 4504 /* START spdk_bs_dump */ 4505 4506 static void 4507 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) 4508 { 4509 spdk_free(ctx->super); 4510 4511 /* 4512 * We need to defer calling bs_call_cpl() until after 4513 * dev destruction, so tuck these away for later use. 
4514 */ 4515 ctx->bs->unload_err = bserrno; 4516 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4517 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4518 4519 bs_sequence_finish(seq, 0); 4520 bs_free(ctx->bs); 4521 free(ctx); 4522 } 4523 4524 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); 4525 4526 static void 4527 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx) 4528 { 4529 uint32_t page_idx = ctx->cur_page; 4530 struct spdk_blob_md_page *page = ctx->page; 4531 struct spdk_blob_md_descriptor *desc; 4532 size_t cur_desc = 0; 4533 uint32_t crc; 4534 4535 fprintf(ctx->fp, "=========\n"); 4536 fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); 4537 fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); 4538 4539 crc = blob_md_page_calc_crc(page); 4540 fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch"); 4541 4542 desc = (struct spdk_blob_md_descriptor *)page->descriptors; 4543 while (cur_desc < sizeof(page->descriptors)) { 4544 if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { 4545 if (desc->length == 0) { 4546 /* If padding and length are 0, this terminates the page */ 4547 break; 4548 } 4549 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) { 4550 struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle; 4551 unsigned int i; 4552 4553 desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc; 4554 4555 for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) { 4556 if (desc_extent_rle->extents[i].cluster_idx != 0) { 4557 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4558 desc_extent_rle->extents[i].cluster_idx); 4559 } else { 4560 fprintf(ctx->fp, "Unallocated Extent - "); 4561 } 4562 fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length); 4563 fprintf(ctx->fp, "\n"); 4564 } 4565 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) { 4566 struct spdk_blob_md_descriptor_extent_page *desc_extent; 4567 unsigned int i; 4568 4569 desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc; 4570 4571 for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) { 4572 if (desc_extent->cluster_idx[i] != 0) { 4573 fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, 4574 desc_extent->cluster_idx[i]); 4575 } else { 4576 fprintf(ctx->fp, "Unallocated Extent"); 4577 } 4578 fprintf(ctx->fp, "\n"); 4579 } 4580 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { 4581 struct spdk_blob_md_descriptor_xattr *desc_xattr; 4582 uint32_t i; 4583 4584 desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; 4585 4586 if (desc_xattr->length != 4587 sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + 4588 desc_xattr->name_length + desc_xattr->value_length) { 4589 } 4590 4591 memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); 4592 ctx->xattr_name[desc_xattr->name_length] = '\0'; 4593 fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name); 4594 fprintf(ctx->fp, " value = \""); 4595 ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, 4596 (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), 4597 desc_xattr->value_length); 4598 fprintf(ctx->fp, "\"\n"); 4599 for (i = 0; i < desc_xattr->value_length; i++) { 4600 if (i % 16 == 0) { 4601 fprintf(ctx->fp, " "); 4602 } 4603 fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); 4604 if ((i + 1) % 16 == 0) { 4605 
fprintf(ctx->fp, "\n"); 4606 } 4607 } 4608 if (i % 16 != 0) { 4609 fprintf(ctx->fp, "\n"); 4610 } 4611 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { 4612 /* TODO */ 4613 } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { 4614 /* TODO */ 4615 } else { 4616 /* Error */ 4617 } 4618 /* Advance to the next descriptor */ 4619 cur_desc += sizeof(*desc) + desc->length; 4620 if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { 4621 break; 4622 } 4623 desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); 4624 } 4625 } 4626 4627 static void 4628 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4629 { 4630 struct spdk_bs_load_ctx *ctx = cb_arg; 4631 4632 if (bserrno != 0) { 4633 bs_dump_finish(seq, ctx, bserrno); 4634 return; 4635 } 4636 4637 if (ctx->page->id != 0) { 4638 bs_dump_print_md_page(ctx); 4639 } 4640 4641 ctx->cur_page++; 4642 4643 if (ctx->cur_page < ctx->super->md_len) { 4644 bs_dump_read_md_page(seq, ctx); 4645 } else { 4646 spdk_free(ctx->page); 4647 bs_dump_finish(seq, ctx, 0); 4648 } 4649 } 4650 4651 static void 4652 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) 4653 { 4654 struct spdk_bs_load_ctx *ctx = cb_arg; 4655 uint64_t lba; 4656 4657 assert(ctx->cur_page < ctx->super->md_len); 4658 lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); 4659 bs_sequence_read_dev(seq, ctx->page, lba, 4660 bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), 4661 bs_dump_read_md_page_cpl, ctx); 4662 } 4663 4664 static void 4665 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4666 { 4667 struct spdk_bs_load_ctx *ctx = cb_arg; 4668 4669 fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature); 4670 if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4671 sizeof(ctx->super->signature)) != 0) { 4672 fprintf(ctx->fp, "(Mismatch)\n"); 4673 bs_dump_finish(seq, ctx, bserrno); 4674 return; 4675 } else { 4676 fprintf(ctx->fp, "(OK)\n"); 4677 } 4678 fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); 4679 fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, 4680 (ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? 
"OK" : "Mismatch"); 4681 fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); 4682 fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); 4683 fprintf(ctx->fp, "Super Blob ID: "); 4684 if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { 4685 fprintf(ctx->fp, "(None)\n"); 4686 } else { 4687 fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob); 4688 } 4689 fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); 4690 fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); 4691 fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); 4692 fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); 4693 fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); 4694 fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); 4695 fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); 4696 fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); 4697 fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); 4698 4699 ctx->cur_page = 0; 4700 ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, 4701 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 4702 if (!ctx->page) { 4703 bs_dump_finish(seq, ctx, -ENOMEM); 4704 return; 4705 } 4706 bs_dump_read_md_page(seq, ctx); 4707 } 4708 4709 void 4710 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, 4711 spdk_bs_op_complete cb_fn, void *cb_arg) 4712 { 4713 struct spdk_blob_store *bs; 4714 struct spdk_bs_cpl cpl; 4715 spdk_bs_sequence_t *seq; 4716 struct spdk_bs_load_ctx *ctx; 4717 struct spdk_bs_opts opts = {}; 4718 int err; 4719 4720 SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev); 4721 4722 spdk_bs_opts_init(&opts, sizeof(opts)); 4723 4724 err = bs_alloc(dev, &opts, &bs, &ctx); 4725 if (err) { 4726 dev->destroy(dev); 4727 cb_fn(cb_arg, err); 4728 return; 4729 } 4730 4731 ctx->fp = fp; 4732 ctx->print_xattr_fn = print_xattr_fn; 4733 4734 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 4735 cpl.u.bs_basic.cb_fn = cb_fn; 4736 cpl.u.bs_basic.cb_arg = cb_arg; 4737 4738 seq = bs_sequence_start(bs->md_channel, &cpl); 4739 if (!seq) { 4740 spdk_free(ctx->super); 4741 free(ctx); 4742 bs_free(bs); 4743 cb_fn(cb_arg, -ENOMEM); 4744 return; 4745 } 4746 4747 /* Read the super block */ 4748 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 4749 bs_byte_to_lba(bs, sizeof(*ctx->super)), 4750 bs_dump_super_cpl, ctx); 4751 } 4752 4753 /* END spdk_bs_dump */ 4754 4755 /* START spdk_bs_init */ 4756 4757 static void 4758 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4759 { 4760 struct spdk_bs_load_ctx *ctx = cb_arg; 4761 4762 ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters); 4763 spdk_free(ctx->super); 4764 free(ctx); 4765 4766 bs_sequence_finish(seq, bserrno); 4767 } 4768 4769 static void 4770 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4771 { 4772 struct spdk_bs_load_ctx *ctx = cb_arg; 4773 4774 /* Write super block */ 4775 bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0), 4776 bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), 4777 bs_init_persist_super_cpl, ctx); 4778 } 4779 4780 void 4781 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, 4782 spdk_bs_op_with_handle_complete cb_fn, 
void *cb_arg) 4783 { 4784 struct spdk_bs_load_ctx *ctx; 4785 struct spdk_blob_store *bs; 4786 struct spdk_bs_cpl cpl; 4787 spdk_bs_sequence_t *seq; 4788 spdk_bs_batch_t *batch; 4789 uint64_t num_md_lba; 4790 uint64_t num_md_pages; 4791 uint64_t num_md_clusters; 4792 uint32_t i; 4793 struct spdk_bs_opts opts = {}; 4794 int rc; 4795 uint64_t lba, lba_count; 4796 4797 SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev); 4798 4799 if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { 4800 SPDK_ERRLOG("unsupported dev block length of %d\n", 4801 dev->blocklen); 4802 dev->destroy(dev); 4803 cb_fn(cb_arg, NULL, -EINVAL); 4804 return; 4805 } 4806 4807 spdk_bs_opts_init(&opts, sizeof(opts)); 4808 if (o) { 4809 if (bs_opts_copy(o, &opts)) { 4810 return; 4811 } 4812 } 4813 4814 if (bs_opts_verify(&opts) != 0) { 4815 dev->destroy(dev); 4816 cb_fn(cb_arg, NULL, -EINVAL); 4817 return; 4818 } 4819 4820 rc = bs_alloc(dev, &opts, &bs, &ctx); 4821 if (rc) { 4822 dev->destroy(dev); 4823 cb_fn(cb_arg, NULL, rc); 4824 return; 4825 } 4826 4827 if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { 4828 /* By default, allocate 1 page per cluster. 4829 * Technically, this over-allocates metadata 4830 * because more metadata will reduce the number 4831 * of usable clusters. This can be addressed with 4832 * more complex math in the future. 4833 */ 4834 bs->md_len = bs->total_clusters; 4835 } else { 4836 bs->md_len = opts.num_md_pages; 4837 } 4838 rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); 4839 if (rc < 0) { 4840 spdk_free(ctx->super); 4841 free(ctx); 4842 bs_free(bs); 4843 cb_fn(cb_arg, NULL, -ENOMEM); 4844 return; 4845 } 4846 4847 rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); 4848 if (rc < 0) { 4849 spdk_free(ctx->super); 4850 free(ctx); 4851 bs_free(bs); 4852 cb_fn(cb_arg, NULL, -ENOMEM); 4853 return; 4854 } 4855 4856 rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len); 4857 if (rc < 0) { 4858 spdk_free(ctx->super); 4859 free(ctx); 4860 bs_free(bs); 4861 cb_fn(cb_arg, NULL, -ENOMEM); 4862 return; 4863 } 4864 4865 memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, 4866 sizeof(ctx->super->signature)); 4867 ctx->super->version = SPDK_BS_VERSION; 4868 ctx->super->length = sizeof(*ctx->super); 4869 ctx->super->super_blob = bs->super_blob; 4870 ctx->super->clean = 0; 4871 ctx->super->cluster_size = bs->cluster_sz; 4872 ctx->super->io_unit_size = bs->io_unit_size; 4873 memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); 4874 4875 /* Calculate how many pages the metadata consumes at the front 4876 * of the disk. 4877 */ 4878 4879 /* The super block uses 1 page */ 4880 num_md_pages = 1; 4881 4882 /* The used_md_pages mask requires 1 bit per metadata page, rounded 4883 * up to the nearest page, plus a header. 4884 */ 4885 ctx->super->used_page_mask_start = num_md_pages; 4886 ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 4887 spdk_divide_round_up(bs->md_len, 8), 4888 SPDK_BS_PAGE_SIZE); 4889 num_md_pages += ctx->super->used_page_mask_len; 4890 4891 /* The used_clusters mask requires 1 bit per cluster, rounded 4892 * up to the nearest page, plus a header. 
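 * (Illustrative example, not taken from this code: with 4 KiB metadata pages
 * and 262144 clusters, the bitmap alone is 32 KiB, and the spdk_bs_md_mask
 * header rounds the allocation up to 9 pages.)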
4893 */ 4894 ctx->super->used_cluster_mask_start = num_md_pages; 4895 ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 4896 spdk_divide_round_up(bs->total_clusters, 8), 4897 SPDK_BS_PAGE_SIZE); 4898 num_md_pages += ctx->super->used_cluster_mask_len; 4899 4900 /* The used_blobids mask requires 1 bit per metadata page, rounded 4901 * up to the nearest page, plus a header. 4902 */ 4903 ctx->super->used_blobid_mask_start = num_md_pages; 4904 ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) + 4905 spdk_divide_round_up(bs->md_len, 8), 4906 SPDK_BS_PAGE_SIZE); 4907 num_md_pages += ctx->super->used_blobid_mask_len; 4908 4909 /* The metadata region size was chosen above */ 4910 ctx->super->md_start = bs->md_start = num_md_pages; 4911 ctx->super->md_len = bs->md_len; 4912 num_md_pages += bs->md_len; 4913 4914 num_md_lba = bs_page_to_lba(bs, num_md_pages); 4915 4916 ctx->super->size = dev->blockcnt * dev->blocklen; 4917 4918 ctx->super->crc = blob_md_page_calc_crc(ctx->super); 4919 4920 num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster); 4921 if (num_md_clusters > bs->total_clusters) { 4922 SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " 4923 "please decrease number of pages reserved for metadata " 4924 "or increase cluster size.\n"); 4925 spdk_free(ctx->super); 4926 spdk_bit_array_free(&ctx->used_clusters); 4927 free(ctx); 4928 bs_free(bs); 4929 cb_fn(cb_arg, NULL, -ENOMEM); 4930 return; 4931 } 4932 /* Claim all of the clusters used by the metadata */ 4933 for (i = 0; i < num_md_clusters; i++) { 4934 spdk_bit_array_set(ctx->used_clusters, i); 4935 } 4936 4937 bs->num_free_clusters -= num_md_clusters; 4938 bs->total_data_clusters = bs->num_free_clusters; 4939 4940 cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; 4941 cpl.u.bs_handle.cb_fn = cb_fn; 4942 cpl.u.bs_handle.cb_arg = cb_arg; 4943 cpl.u.bs_handle.bs = bs; 4944 4945 seq = bs_sequence_start(bs->md_channel, &cpl); 4946 if (!seq) { 4947 spdk_free(ctx->super); 4948 free(ctx); 4949 bs_free(bs); 4950 cb_fn(cb_arg, NULL, -ENOMEM); 4951 return; 4952 } 4953 4954 batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx); 4955 4956 /* Clear metadata space */ 4957 bs_batch_write_zeroes_dev(batch, 0, num_md_lba); 4958 4959 lba = num_md_lba; 4960 lba_count = ctx->bs->dev->blockcnt - lba; 4961 switch (opts.clear_method) { 4962 case BS_CLEAR_WITH_UNMAP: 4963 /* Trim data clusters */ 4964 bs_batch_unmap_dev(batch, lba, lba_count); 4965 break; 4966 case BS_CLEAR_WITH_WRITE_ZEROES: 4967 /* Write_zeroes to data clusters */ 4968 bs_batch_write_zeroes_dev(batch, lba, lba_count); 4969 break; 4970 case BS_CLEAR_WITH_NONE: 4971 default: 4972 break; 4973 } 4974 4975 bs_batch_close(batch); 4976 } 4977 4978 /* END spdk_bs_init */ 4979 4980 /* START spdk_bs_destroy */ 4981 4982 static void 4983 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 4984 { 4985 struct spdk_bs_load_ctx *ctx = cb_arg; 4986 struct spdk_blob_store *bs = ctx->bs; 4987 4988 /* 4989 * We need to defer calling bs_call_cpl() until after 4990 * dev destruction, so tuck these away for later use. 
4991 */ 4992 bs->unload_err = bserrno; 4993 memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 4994 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 4995 4996 bs_sequence_finish(seq, bserrno); 4997 4998 bs_free(bs); 4999 free(ctx); 5000 } 5001 5002 void 5003 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, 5004 void *cb_arg) 5005 { 5006 struct spdk_bs_cpl cpl; 5007 spdk_bs_sequence_t *seq; 5008 struct spdk_bs_load_ctx *ctx; 5009 5010 SPDK_DEBUGLOG(blob, "Destroying blobstore\n"); 5011 5012 if (!RB_EMPTY(&bs->open_blobs)) { 5013 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5014 cb_fn(cb_arg, -EBUSY); 5015 return; 5016 } 5017 5018 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5019 cpl.u.bs_basic.cb_fn = cb_fn; 5020 cpl.u.bs_basic.cb_arg = cb_arg; 5021 5022 ctx = calloc(1, sizeof(*ctx)); 5023 if (!ctx) { 5024 cb_fn(cb_arg, -ENOMEM); 5025 return; 5026 } 5027 5028 ctx->bs = bs; 5029 5030 seq = bs_sequence_start(bs->md_channel, &cpl); 5031 if (!seq) { 5032 free(ctx); 5033 cb_fn(cb_arg, -ENOMEM); 5034 return; 5035 } 5036 5037 /* Write zeroes to the super block */ 5038 bs_sequence_write_zeroes_dev(seq, 5039 bs_page_to_lba(bs, 0), 5040 bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), 5041 bs_destroy_trim_cpl, ctx); 5042 } 5043 5044 /* END spdk_bs_destroy */ 5045 5046 /* START spdk_bs_unload */ 5047 5048 static void 5049 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno) 5050 { 5051 spdk_bs_sequence_t *seq = ctx->seq; 5052 5053 spdk_free(ctx->super); 5054 5055 /* 5056 * We need to defer calling bs_call_cpl() until after 5057 * dev destruction, so tuck these away for later use. 5058 */ 5059 ctx->bs->unload_err = bserrno; 5060 memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); 5061 seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; 5062 5063 bs_sequence_finish(seq, bserrno); 5064 5065 bs_free(ctx->bs); 5066 free(ctx); 5067 } 5068 5069 static void 5070 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5071 { 5072 struct spdk_bs_load_ctx *ctx = cb_arg; 5073 5074 bs_unload_finish(ctx, bserrno); 5075 } 5076 5077 static void 5078 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5079 { 5080 struct spdk_bs_load_ctx *ctx = cb_arg; 5081 5082 spdk_free(ctx->mask); 5083 5084 if (bserrno != 0) { 5085 bs_unload_finish(ctx, bserrno); 5086 return; 5087 } 5088 5089 ctx->super->clean = 1; 5090 5091 bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx); 5092 } 5093 5094 static void 5095 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5096 { 5097 struct spdk_bs_load_ctx *ctx = cb_arg; 5098 5099 spdk_free(ctx->mask); 5100 ctx->mask = NULL; 5101 5102 if (bserrno != 0) { 5103 bs_unload_finish(ctx, bserrno); 5104 return; 5105 } 5106 5107 bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl); 5108 } 5109 5110 static void 5111 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5112 { 5113 struct spdk_bs_load_ctx *ctx = cb_arg; 5114 5115 spdk_free(ctx->mask); 5116 ctx->mask = NULL; 5117 5118 if (bserrno != 0) { 5119 bs_unload_finish(ctx, bserrno); 5120 return; 5121 } 5122 5123 bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl); 5124 } 5125 5126 static void 5127 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5128 { 5129 struct spdk_bs_load_ctx *ctx = cb_arg; 5130 5131 if (bserrno != 0) { 5132 bs_unload_finish(ctx, bserrno); 5133 return; 5134 } 5135 5136 bs_write_used_md(seq, 
cb_arg, bs_unload_write_used_pages_cpl); 5137 } 5138 5139 void 5140 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) 5141 { 5142 struct spdk_bs_cpl cpl; 5143 struct spdk_bs_load_ctx *ctx; 5144 5145 SPDK_DEBUGLOG(blob, "Syncing blobstore\n"); 5146 5147 if (!RB_EMPTY(&bs->open_blobs)) { 5148 SPDK_ERRLOG("Blobstore still has open blobs\n"); 5149 cb_fn(cb_arg, -EBUSY); 5150 return; 5151 } 5152 5153 ctx = calloc(1, sizeof(*ctx)); 5154 if (!ctx) { 5155 cb_fn(cb_arg, -ENOMEM); 5156 return; 5157 } 5158 5159 ctx->bs = bs; 5160 5161 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5162 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5163 if (!ctx->super) { 5164 free(ctx); 5165 cb_fn(cb_arg, -ENOMEM); 5166 return; 5167 } 5168 5169 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5170 cpl.u.bs_basic.cb_fn = cb_fn; 5171 cpl.u.bs_basic.cb_arg = cb_arg; 5172 5173 ctx->seq = bs_sequence_start(bs->md_channel, &cpl); 5174 if (!ctx->seq) { 5175 spdk_free(ctx->super); 5176 free(ctx); 5177 cb_fn(cb_arg, -ENOMEM); 5178 return; 5179 } 5180 5181 /* Read super block */ 5182 bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0), 5183 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5184 bs_unload_read_super_cpl, ctx); 5185 } 5186 5187 /* END spdk_bs_unload */ 5188 5189 /* START spdk_bs_set_super */ 5190 5191 struct spdk_bs_set_super_ctx { 5192 struct spdk_blob_store *bs; 5193 struct spdk_bs_super_block *super; 5194 }; 5195 5196 static void 5197 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5198 { 5199 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5200 5201 if (bserrno != 0) { 5202 SPDK_ERRLOG("Unable to write to super block of blobstore\n"); 5203 } 5204 5205 spdk_free(ctx->super); 5206 5207 bs_sequence_finish(seq, bserrno); 5208 5209 free(ctx); 5210 } 5211 5212 static void 5213 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5214 { 5215 struct spdk_bs_set_super_ctx *ctx = cb_arg; 5216 5217 if (bserrno != 0) { 5218 SPDK_ERRLOG("Unable to read super block of blobstore\n"); 5219 spdk_free(ctx->super); 5220 bs_sequence_finish(seq, bserrno); 5221 free(ctx); 5222 return; 5223 } 5224 5225 bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx); 5226 } 5227 5228 void 5229 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, 5230 spdk_bs_op_complete cb_fn, void *cb_arg) 5231 { 5232 struct spdk_bs_cpl cpl; 5233 spdk_bs_sequence_t *seq; 5234 struct spdk_bs_set_super_ctx *ctx; 5235 5236 SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n"); 5237 5238 ctx = calloc(1, sizeof(*ctx)); 5239 if (!ctx) { 5240 cb_fn(cb_arg, -ENOMEM); 5241 return; 5242 } 5243 5244 ctx->bs = bs; 5245 5246 ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL, 5247 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5248 if (!ctx->super) { 5249 free(ctx); 5250 cb_fn(cb_arg, -ENOMEM); 5251 return; 5252 } 5253 5254 cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; 5255 cpl.u.bs_basic.cb_fn = cb_fn; 5256 cpl.u.bs_basic.cb_arg = cb_arg; 5257 5258 seq = bs_sequence_start(bs->md_channel, &cpl); 5259 if (!seq) { 5260 spdk_free(ctx->super); 5261 free(ctx); 5262 cb_fn(cb_arg, -ENOMEM); 5263 return; 5264 } 5265 5266 bs->super_blob = blobid; 5267 5268 /* Read super block */ 5269 bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0), 5270 bs_byte_to_lba(bs, sizeof(*ctx->super)), 5271 bs_set_super_read_cpl, ctx); 5272 } 5273 5274 /* END spdk_bs_set_super */ 5275 5276 void 5277 spdk_bs_get_super(struct spdk_blob_store *bs, 5278 spdk_blob_op_with_id_complete 
cb_fn, void *cb_arg) 5279 { 5280 if (bs->super_blob == SPDK_BLOBID_INVALID) { 5281 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); 5282 } else { 5283 cb_fn(cb_arg, bs->super_blob, 0); 5284 } 5285 } 5286 5287 uint64_t 5288 spdk_bs_get_cluster_size(struct spdk_blob_store *bs) 5289 { 5290 return bs->cluster_sz; 5291 } 5292 5293 uint64_t 5294 spdk_bs_get_page_size(struct spdk_blob_store *bs) 5295 { 5296 return SPDK_BS_PAGE_SIZE; 5297 } 5298 5299 uint64_t 5300 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) 5301 { 5302 return bs->io_unit_size; 5303 } 5304 5305 uint64_t 5306 spdk_bs_free_cluster_count(struct spdk_blob_store *bs) 5307 { 5308 return bs->num_free_clusters; 5309 } 5310 5311 uint64_t 5312 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) 5313 { 5314 return bs->total_data_clusters; 5315 } 5316 5317 static int 5318 bs_register_md_thread(struct spdk_blob_store *bs) 5319 { 5320 bs->md_channel = spdk_get_io_channel(bs); 5321 if (!bs->md_channel) { 5322 SPDK_ERRLOG("Failed to get IO channel.\n"); 5323 return -1; 5324 } 5325 5326 return 0; 5327 } 5328 5329 static int 5330 bs_unregister_md_thread(struct spdk_blob_store *bs) 5331 { 5332 spdk_put_io_channel(bs->md_channel); 5333 5334 return 0; 5335 } 5336 5337 spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob) 5338 { 5339 assert(blob != NULL); 5340 5341 return blob->id; 5342 } 5343 5344 uint64_t spdk_blob_get_num_pages(struct spdk_blob *blob) 5345 { 5346 assert(blob != NULL); 5347 5348 return bs_cluster_to_page(blob->bs, blob->active.num_clusters); 5349 } 5350 5351 uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob) 5352 { 5353 assert(blob != NULL); 5354 5355 return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs); 5356 } 5357 5358 uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob) 5359 { 5360 assert(blob != NULL); 5361 5362 return blob->active.num_clusters; 5363 } 5364 5365 /* START spdk_bs_create_blob */ 5366 5367 static void 5368 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 5369 { 5370 struct spdk_blob *blob = cb_arg; 5371 uint32_t page_idx = bs_blobid_to_page(blob->id); 5372 5373 if (bserrno != 0) { 5374 spdk_bit_array_clear(blob->bs->used_blobids, page_idx); 5375 bs_release_md_page(blob->bs, page_idx); 5376 } 5377 5378 blob_free(blob); 5379 5380 bs_sequence_finish(seq, bserrno); 5381 } 5382 5383 static int 5384 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, 5385 bool internal) 5386 { 5387 uint64_t i; 5388 size_t value_len = 0; 5389 int rc; 5390 const void *value = NULL; 5391 if (xattrs->count > 0 && xattrs->get_value == NULL) { 5392 return -EINVAL; 5393 } 5394 for (i = 0; i < xattrs->count; i++) { 5395 xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); 5396 if (value == NULL || value_len == 0) { 5397 return -EINVAL; 5398 } 5399 rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); 5400 if (rc < 0) { 5401 return rc; 5402 } 5403 } 5404 return 0; 5405 } 5406 5407 static void 5408 blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst) 5409 { 5410 #define FIELD_OK(field) \ 5411 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 5412 5413 #define SET_FIELD(field) \ 5414 if (FIELD_OK(field)) { \ 5415 dst->field = src->field; \ 5416 } \ 5417 5418 SET_FIELD(num_clusters); 5419 SET_FIELD(thin_provision); 5420 SET_FIELD(clear_method); 5421 5422 if (FIELD_OK(xattrs)) { 5423 memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs)); 5424 } 5425 5426 
SET_FIELD(use_extent_table); 5427 5428 dst->opts_size = src->opts_size; 5429 5430 /* You should not remove this statement, but need to update the assert statement 5431 * if you add a new field, and also add a corresponding SET_FIELD statement */ 5432 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 64, "Incorrect size"); 5433 5434 #undef FIELD_OK 5435 #undef SET_FIELD 5436 } 5437 5438 static void 5439 bs_create_blob(struct spdk_blob_store *bs, 5440 const struct spdk_blob_opts *opts, 5441 const struct spdk_blob_xattr_opts *internal_xattrs, 5442 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5443 { 5444 struct spdk_blob *blob; 5445 uint32_t page_idx; 5446 struct spdk_bs_cpl cpl; 5447 struct spdk_blob_opts opts_local; 5448 struct spdk_blob_xattr_opts internal_xattrs_default; 5449 spdk_bs_sequence_t *seq; 5450 spdk_blob_id id; 5451 int rc; 5452 5453 assert(spdk_get_thread() == bs->md_thread); 5454 5455 page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); 5456 if (page_idx == UINT32_MAX) { 5457 cb_fn(cb_arg, 0, -ENOMEM); 5458 return; 5459 } 5460 spdk_bit_array_set(bs->used_blobids, page_idx); 5461 bs_claim_md_page(bs, page_idx); 5462 5463 id = bs_page_to_blobid(page_idx); 5464 5465 SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx); 5466 5467 blob = blob_alloc(bs, id); 5468 if (!blob) { 5469 spdk_bit_array_clear(bs->used_blobids, page_idx); 5470 bs_release_md_page(bs, page_idx); 5471 cb_fn(cb_arg, 0, -ENOMEM); 5472 return; 5473 } 5474 5475 spdk_blob_opts_init(&opts_local, sizeof(opts_local)); 5476 if (opts) { 5477 blob_opts_copy(opts, &opts_local); 5478 } 5479 5480 blob->use_extent_table = opts_local.use_extent_table; 5481 if (blob->use_extent_table) { 5482 blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE; 5483 } 5484 5485 if (!internal_xattrs) { 5486 blob_xattrs_init(&internal_xattrs_default); 5487 internal_xattrs = &internal_xattrs_default; 5488 } 5489 5490 rc = blob_set_xattrs(blob, &opts_local.xattrs, false); 5491 if (rc < 0) { 5492 blob_free(blob); 5493 spdk_bit_array_clear(bs->used_blobids, page_idx); 5494 bs_release_md_page(bs, page_idx); 5495 cb_fn(cb_arg, 0, rc); 5496 return; 5497 } 5498 5499 rc = blob_set_xattrs(blob, internal_xattrs, true); 5500 if (rc < 0) { 5501 blob_free(blob); 5502 spdk_bit_array_clear(bs->used_blobids, page_idx); 5503 bs_release_md_page(bs, page_idx); 5504 cb_fn(cb_arg, 0, rc); 5505 return; 5506 } 5507 5508 if (opts_local.thin_provision) { 5509 blob_set_thin_provision(blob); 5510 } 5511 5512 blob_set_clear_method(blob, opts_local.clear_method); 5513 5514 rc = blob_resize(blob, opts_local.num_clusters); 5515 if (rc < 0) { 5516 blob_free(blob); 5517 spdk_bit_array_clear(bs->used_blobids, page_idx); 5518 bs_release_md_page(bs, page_idx); 5519 cb_fn(cb_arg, 0, rc); 5520 return; 5521 } 5522 cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5523 cpl.u.blobid.cb_fn = cb_fn; 5524 cpl.u.blobid.cb_arg = cb_arg; 5525 cpl.u.blobid.blobid = blob->id; 5526 5527 seq = bs_sequence_start(bs->md_channel, &cpl); 5528 if (!seq) { 5529 blob_free(blob); 5530 spdk_bit_array_clear(bs->used_blobids, page_idx); 5531 bs_release_md_page(bs, page_idx); 5532 cb_fn(cb_arg, 0, -ENOMEM); 5533 return; 5534 } 5535 5536 blob_persist(seq, blob, bs_create_blob_cpl, blob); 5537 } 5538 5539 void spdk_bs_create_blob(struct spdk_blob_store *bs, 5540 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5541 { 5542 bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); 5543 } 5544 5545 void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, 
5546 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5547 { 5548 bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); 5549 } 5550 5551 /* END spdk_bs_create_blob */ 5552 5553 /* START blob_cleanup */ 5554 5555 struct spdk_clone_snapshot_ctx { 5556 struct spdk_bs_cpl cpl; 5557 int bserrno; 5558 bool frozen; 5559 5560 struct spdk_io_channel *channel; 5561 5562 /* Current cluster for inflate operation */ 5563 uint64_t cluster; 5564 5565 /* For inflation force allocation of all unallocated clusters and remove 5566 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ 5567 bool allocate_all; 5568 5569 struct { 5570 spdk_blob_id id; 5571 struct spdk_blob *blob; 5572 bool md_ro; 5573 } original; 5574 struct { 5575 spdk_blob_id id; 5576 struct spdk_blob *blob; 5577 } new; 5578 5579 /* xattrs specified for snapshot/clones only. They have no impact on 5580 * the original blobs xattrs. */ 5581 const struct spdk_blob_xattr_opts *xattrs; 5582 }; 5583 5584 static void 5585 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) 5586 { 5587 struct spdk_clone_snapshot_ctx *ctx = cb_arg; 5588 struct spdk_bs_cpl *cpl = &ctx->cpl; 5589 5590 if (bserrno != 0) { 5591 if (ctx->bserrno != 0) { 5592 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5593 } else { 5594 ctx->bserrno = bserrno; 5595 } 5596 } 5597 5598 switch (cpl->type) { 5599 case SPDK_BS_CPL_TYPE_BLOBID: 5600 cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); 5601 break; 5602 case SPDK_BS_CPL_TYPE_BLOB_BASIC: 5603 cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); 5604 break; 5605 default: 5606 SPDK_UNREACHABLE(); 5607 break; 5608 } 5609 5610 free(ctx); 5611 } 5612 5613 static void 5614 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 5615 { 5616 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5617 struct spdk_blob *origblob = ctx->original.blob; 5618 5619 if (bserrno != 0) { 5620 if (ctx->bserrno != 0) { 5621 SPDK_ERRLOG("Unfreeze error %d\n", bserrno); 5622 } else { 5623 ctx->bserrno = bserrno; 5624 } 5625 } 5626 5627 ctx->original.id = origblob->id; 5628 origblob->locked_operation_in_progress = false; 5629 5630 /* Revert md_ro to original state */ 5631 origblob->md_ro = ctx->original.md_ro; 5632 5633 spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx); 5634 } 5635 5636 static void 5637 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) 5638 { 5639 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5640 struct spdk_blob *origblob = ctx->original.blob; 5641 5642 if (bserrno != 0) { 5643 if (ctx->bserrno != 0) { 5644 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5645 } else { 5646 ctx->bserrno = bserrno; 5647 } 5648 } 5649 5650 if (ctx->frozen) { 5651 /* Unfreeze any outstanding I/O */ 5652 blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx); 5653 } else { 5654 bs_snapshot_unfreeze_cpl(ctx, 0); 5655 } 5656 5657 } 5658 5659 static void 5660 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno) 5661 { 5662 struct spdk_blob *newblob = ctx->new.blob; 5663 5664 if (bserrno != 0) { 5665 if (ctx->bserrno != 0) { 5666 SPDK_ERRLOG("Cleanup error %d\n", bserrno); 5667 } else { 5668 ctx->bserrno = bserrno; 5669 } 5670 } 5671 5672 ctx->new.id = newblob->id; 5673 spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx); 5674 } 5675 5676 /* END blob_cleanup */ 5677 5678 /* START spdk_bs_create_snapshot */ 5679 5680 static void 5681 bs_snapshot_swap_cluster_maps(struct 
spdk_blob *blob1, struct spdk_blob *blob2)
5682 {
5683 uint64_t *cluster_temp;
5684 uint32_t *extent_page_temp;
5685
5686 cluster_temp = blob1->active.clusters;
5687 blob1->active.clusters = blob2->active.clusters;
5688 blob2->active.clusters = cluster_temp;
5689
5690 extent_page_temp = blob1->active.extent_pages;
5691 blob1->active.extent_pages = blob2->active.extent_pages;
5692 blob2->active.extent_pages = extent_page_temp;
5693 }
5694
5695 static void
5696 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
5697 {
5698 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5699 struct spdk_blob *origblob = ctx->original.blob;
5700 struct spdk_blob *newblob = ctx->new.blob;
5701
5702 if (bserrno != 0) {
5703 bs_snapshot_swap_cluster_maps(newblob, origblob);
5704 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5705 return;
5706 }
5707
5708 /* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
5709 bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
5710 if (bserrno != 0) {
5711 bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5712 return;
5713 }
5714
5715 bs_blob_list_add(ctx->original.blob);
5716
5717 spdk_blob_set_read_only(newblob);
5718
5719 /* sync snapshot metadata */
5720 spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
5721 }
5722
5723 static void
5724 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
5725 {
5726 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5727 struct spdk_blob *origblob = ctx->original.blob;
5728 struct spdk_blob *newblob = ctx->new.blob;
5729
5730 if (bserrno != 0) {
5731 /* return cluster map back to original */
5732 bs_snapshot_swap_cluster_maps(newblob, origblob);
5733
5734 /* Newblob md sync failed. Valid clusters are only present in origblob.
5735 * Since I/O is frozen on origblob, no changes to the zeroed out cluster map should have occurred.
5736 * Newblob needs to be reverted to thin_provisioned state at creation to properly close.
*/ 5737 blob_set_thin_provision(newblob); 5738 assert(spdk_mem_all_zero(newblob->active.clusters, 5739 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 5740 assert(spdk_mem_all_zero(newblob->active.extent_pages, 5741 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 5742 5743 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5744 return; 5745 } 5746 5747 /* Set internal xattr for snapshot id */ 5748 bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); 5749 if (bserrno != 0) { 5750 /* return cluster map back to original */ 5751 bs_snapshot_swap_cluster_maps(newblob, origblob); 5752 blob_set_thin_provision(newblob); 5753 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5754 return; 5755 } 5756 5757 /* Create new back_bs_dev for snapshot */ 5758 origblob->back_bs_dev = bs_create_blob_bs_dev(newblob); 5759 if (origblob->back_bs_dev == NULL) { 5760 /* return cluster map back to original */ 5761 bs_snapshot_swap_cluster_maps(newblob, origblob); 5762 blob_set_thin_provision(newblob); 5763 bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); 5764 return; 5765 } 5766 5767 bs_blob_list_remove(origblob); 5768 origblob->parent_id = newblob->id; 5769 /* set clone blob as thin provisioned */ 5770 blob_set_thin_provision(origblob); 5771 5772 bs_blob_list_add(newblob); 5773 5774 /* sync clone metadata */ 5775 spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx); 5776 } 5777 5778 static void 5779 bs_snapshot_freeze_cpl(void *cb_arg, int rc) 5780 { 5781 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5782 struct spdk_blob *origblob = ctx->original.blob; 5783 struct spdk_blob *newblob = ctx->new.blob; 5784 int bserrno; 5785 5786 if (rc != 0) { 5787 bs_clone_snapshot_newblob_cleanup(ctx, rc); 5788 return; 5789 } 5790 5791 ctx->frozen = true; 5792 5793 /* set new back_bs_dev for snapshot */ 5794 newblob->back_bs_dev = origblob->back_bs_dev; 5795 /* Set invalid flags from origblob */ 5796 newblob->invalid_flags = origblob->invalid_flags; 5797 5798 /* inherit parent from original blob if set */ 5799 newblob->parent_id = origblob->parent_id; 5800 if (origblob->parent_id != SPDK_BLOBID_INVALID) { 5801 /* Set internal xattr for snapshot id */ 5802 bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT, 5803 &origblob->parent_id, sizeof(spdk_blob_id), true); 5804 if (bserrno != 0) { 5805 bs_clone_snapshot_newblob_cleanup(ctx, bserrno); 5806 return; 5807 } 5808 } 5809 5810 /* swap cluster maps */ 5811 bs_snapshot_swap_cluster_maps(newblob, origblob); 5812 5813 /* Set the clear method on the new blob to match the original. 
*/ 5814 blob_set_clear_method(newblob, origblob->clear_method); 5815 5816 /* sync snapshot metadata */ 5817 spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx); 5818 } 5819 5820 static void 5821 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5822 { 5823 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5824 struct spdk_blob *origblob = ctx->original.blob; 5825 struct spdk_blob *newblob = _blob; 5826 5827 if (bserrno != 0) { 5828 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5829 return; 5830 } 5831 5832 ctx->new.blob = newblob; 5833 assert(spdk_blob_is_thin_provisioned(newblob)); 5834 assert(spdk_mem_all_zero(newblob->active.clusters, 5835 newblob->active.num_clusters * sizeof(*newblob->active.clusters))); 5836 assert(spdk_mem_all_zero(newblob->active.extent_pages, 5837 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages))); 5838 5839 blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx); 5840 } 5841 5842 static void 5843 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 5844 { 5845 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5846 struct spdk_blob *origblob = ctx->original.blob; 5847 5848 if (bserrno != 0) { 5849 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 5850 return; 5851 } 5852 5853 ctx->new.id = blobid; 5854 ctx->cpl.u.blobid.blobid = blobid; 5855 5856 spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx); 5857 } 5858 5859 5860 static void 5861 bs_xattr_snapshot(void *arg, const char *name, 5862 const void **value, size_t *value_len) 5863 { 5864 assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); 5865 5866 struct spdk_blob *blob = (struct spdk_blob *)arg; 5867 *value = &blob->id; 5868 *value_len = sizeof(blob->id); 5869 } 5870 5871 static void 5872 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5873 { 5874 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5875 struct spdk_blob_opts opts; 5876 struct spdk_blob_xattr_opts internal_xattrs; 5877 char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; 5878 5879 if (bserrno != 0) { 5880 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 5881 return; 5882 } 5883 5884 ctx->original.blob = _blob; 5885 5886 if (_blob->data_ro || _blob->md_ro) { 5887 SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n", 5888 _blob->id); 5889 ctx->bserrno = -EINVAL; 5890 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 5891 return; 5892 } 5893 5894 if (_blob->locked_operation_in_progress) { 5895 SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n"); 5896 ctx->bserrno = -EBUSY; 5897 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 5898 return; 5899 } 5900 5901 _blob->locked_operation_in_progress = true; 5902 5903 spdk_blob_opts_init(&opts, sizeof(opts)); 5904 blob_xattrs_init(&internal_xattrs); 5905 5906 /* Change the size of new blob to the same as in original blob, 5907 * but do not allocate clusters */ 5908 opts.thin_provision = true; 5909 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 5910 opts.use_extent_table = _blob->use_extent_table; 5911 5912 /* If there are any xattrs specified for snapshot, set them now */ 5913 if (ctx->xattrs) { 5914 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 5915 } 5916 /* Set internal xattr SNAPSHOT_IN_PROGRESS */ 5917 internal_xattrs.count = 1; 5918 
internal_xattrs.ctx = _blob; 5919 internal_xattrs.names = xattrs_names; 5920 internal_xattrs.get_value = bs_xattr_snapshot; 5921 5922 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 5923 bs_snapshot_newblob_create_cpl, ctx); 5924 } 5925 5926 void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, 5927 const struct spdk_blob_xattr_opts *snapshot_xattrs, 5928 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 5929 { 5930 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 5931 5932 if (!ctx) { 5933 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 5934 return; 5935 } 5936 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 5937 ctx->cpl.u.blobid.cb_fn = cb_fn; 5938 ctx->cpl.u.blobid.cb_arg = cb_arg; 5939 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 5940 ctx->bserrno = 0; 5941 ctx->frozen = false; 5942 ctx->original.id = blobid; 5943 ctx->xattrs = snapshot_xattrs; 5944 5945 spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx); 5946 } 5947 /* END spdk_bs_create_snapshot */ 5948 5949 /* START spdk_bs_create_clone */ 5950 5951 static void 5952 bs_xattr_clone(void *arg, const char *name, 5953 const void **value, size_t *value_len) 5954 { 5955 assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); 5956 5957 struct spdk_blob *blob = (struct spdk_blob *)arg; 5958 *value = &blob->id; 5959 *value_len = sizeof(blob->id); 5960 } 5961 5962 static void 5963 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5964 { 5965 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5966 struct spdk_blob *clone = _blob; 5967 5968 ctx->new.blob = clone; 5969 bs_blob_list_add(clone); 5970 5971 spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx); 5972 } 5973 5974 static void 5975 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 5976 { 5977 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5978 5979 ctx->cpl.u.blobid.blobid = blobid; 5980 spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx); 5981 } 5982 5983 static void 5984 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 5985 { 5986 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 5987 struct spdk_blob_opts opts; 5988 struct spdk_blob_xattr_opts internal_xattrs; 5989 char *xattr_names[] = { BLOB_SNAPSHOT }; 5990 5991 if (bserrno != 0) { 5992 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 5993 return; 5994 } 5995 5996 ctx->original.blob = _blob; 5997 ctx->original.md_ro = _blob->md_ro; 5998 5999 if (!_blob->data_ro || !_blob->md_ro) { 6000 SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n"); 6001 ctx->bserrno = -EINVAL; 6002 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6003 return; 6004 } 6005 6006 if (_blob->locked_operation_in_progress) { 6007 SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n"); 6008 ctx->bserrno = -EBUSY; 6009 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6010 return; 6011 } 6012 6013 _blob->locked_operation_in_progress = true; 6014 6015 spdk_blob_opts_init(&opts, sizeof(opts)); 6016 blob_xattrs_init(&internal_xattrs); 6017 6018 opts.thin_provision = true; 6019 opts.num_clusters = spdk_blob_get_num_clusters(_blob); 6020 opts.use_extent_table = _blob->use_extent_table; 6021 if (ctx->xattrs) { 6022 memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); 6023 } 6024 6025 /* Set internal xattr BLOB_SNAPSHOT */ 6026 
internal_xattrs.count = 1; 6027 internal_xattrs.ctx = _blob; 6028 internal_xattrs.names = xattr_names; 6029 internal_xattrs.get_value = bs_xattr_clone; 6030 6031 bs_create_blob(_blob->bs, &opts, &internal_xattrs, 6032 bs_clone_newblob_create_cpl, ctx); 6033 } 6034 6035 void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, 6036 const struct spdk_blob_xattr_opts *clone_xattrs, 6037 spdk_blob_op_with_id_complete cb_fn, void *cb_arg) 6038 { 6039 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6040 6041 if (!ctx) { 6042 cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); 6043 return; 6044 } 6045 6046 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; 6047 ctx->cpl.u.blobid.cb_fn = cb_fn; 6048 ctx->cpl.u.blobid.cb_arg = cb_arg; 6049 ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; 6050 ctx->bserrno = 0; 6051 ctx->xattrs = clone_xattrs; 6052 ctx->original.id = blobid; 6053 6054 spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx); 6055 } 6056 6057 /* END spdk_bs_create_clone */ 6058 6059 /* START spdk_bs_inflate_blob */ 6060 6061 static void 6062 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) 6063 { 6064 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6065 struct spdk_blob *_blob = ctx->original.blob; 6066 6067 if (bserrno != 0) { 6068 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6069 return; 6070 } 6071 6072 /* Temporarily override md_ro flag for MD modification */ 6073 _blob->md_ro = false; 6074 6075 bserrno = blob_set_xattr(_blob, BLOB_SNAPSHOT, &_parent->id, sizeof(spdk_blob_id), true); 6076 if (bserrno != 0) { 6077 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6078 return; 6079 } 6080 6081 assert(_parent != NULL); 6082 6083 bs_blob_list_remove(_blob); 6084 _blob->parent_id = _parent->id; 6085 6086 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6087 _blob->back_bs_dev = bs_create_blob_bs_dev(_parent); 6088 bs_blob_list_add(_blob); 6089 6090 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6091 } 6092 6093 static void 6094 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx) 6095 { 6096 struct spdk_blob *_blob = ctx->original.blob; 6097 struct spdk_blob *_parent; 6098 6099 if (ctx->allocate_all) { 6100 /* remove thin provisioning */ 6101 bs_blob_list_remove(_blob); 6102 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6103 _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; 6104 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6105 _blob->back_bs_dev = NULL; 6106 _blob->parent_id = SPDK_BLOBID_INVALID; 6107 } else { 6108 _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; 6109 if (_parent->parent_id != SPDK_BLOBID_INVALID) { 6110 /* We must change the parent of the inflated blob */ 6111 spdk_bs_open_blob(_blob->bs, _parent->parent_id, 6112 bs_inflate_blob_set_parent_cpl, ctx); 6113 return; 6114 } 6115 6116 bs_blob_list_remove(_blob); 6117 blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); 6118 _blob->parent_id = SPDK_BLOBID_INVALID; 6119 _blob->back_bs_dev->destroy(_blob->back_bs_dev); 6120 _blob->back_bs_dev = bs_create_zeroes_dev(); 6121 } 6122 6123 /* Temporarily override md_ro flag for MD modification */ 6124 _blob->md_ro = false; 6125 _blob->state = SPDK_BLOB_STATE_DIRTY; 6126 6127 spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx); 6128 } 6129 6130 /* Check if cluster needs allocation */ 6131 static inline bool 6132 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) 6133 
{ 6134 struct spdk_blob_bs_dev *b; 6135 6136 assert(blob != NULL); 6137 6138 if (blob->active.clusters[cluster] != 0) { 6139 /* Cluster is already allocated */ 6140 return false; 6141 } 6142 6143 if (blob->parent_id == SPDK_BLOBID_INVALID) { 6144 /* Blob has no parent blob */ 6145 return allocate_all; 6146 } 6147 6148 b = (struct spdk_blob_bs_dev *)blob->back_bs_dev; 6149 return (allocate_all || b->blob->active.clusters[cluster] != 0); 6150 } 6151 6152 static void 6153 bs_inflate_blob_touch_next(void *cb_arg, int bserrno) 6154 { 6155 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6156 struct spdk_blob *_blob = ctx->original.blob; 6157 struct spdk_bs_cpl cpl; 6158 spdk_bs_user_op_t *op; 6159 uint64_t offset; 6160 6161 if (bserrno != 0) { 6162 bs_clone_snapshot_origblob_cleanup(ctx, bserrno); 6163 return; 6164 } 6165 6166 for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { 6167 if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { 6168 break; 6169 } 6170 } 6171 6172 if (ctx->cluster < _blob->active.num_clusters) { 6173 offset = bs_cluster_to_lba(_blob->bs, ctx->cluster); 6174 6175 /* We may safely increment a cluster before copying */ 6176 ctx->cluster++; 6177 6178 /* Use a dummy 0B read as a context for cluster copy */ 6179 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6180 cpl.u.blob_basic.cb_fn = bs_inflate_blob_touch_next; 6181 cpl.u.blob_basic.cb_arg = ctx; 6182 6183 op = bs_user_op_alloc(ctx->channel, &cpl, SPDK_BLOB_READ, _blob, 6184 NULL, 0, offset, 0); 6185 if (!op) { 6186 bs_clone_snapshot_origblob_cleanup(ctx, -ENOMEM); 6187 return; 6188 } 6189 6190 bs_allocate_and_copy_cluster(_blob, ctx->channel, offset, op); 6191 } else { 6192 bs_inflate_blob_done(ctx); 6193 } 6194 } 6195 6196 static void 6197 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 6198 { 6199 struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; 6200 uint64_t clusters_needed; 6201 uint64_t i; 6202 6203 if (bserrno != 0) { 6204 bs_clone_snapshot_cleanup_finish(ctx, bserrno); 6205 return; 6206 } 6207 6208 ctx->original.blob = _blob; 6209 ctx->original.md_ro = _blob->md_ro; 6210 6211 if (_blob->locked_operation_in_progress) { 6212 SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n"); 6213 ctx->bserrno = -EBUSY; 6214 spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx); 6215 return; 6216 } 6217 6218 _blob->locked_operation_in_progress = true; 6219 6220 if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { 6221 /* This blob has no parent, so we cannot decouple it. */ 6222 SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); 6223 bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); 6224 return; 6225 } 6226 6227 if (spdk_blob_is_thin_provisioned(_blob) == false) { 6228 /* This is not a thin provisioned blob. No need to inflate. */ 6229 bs_clone_snapshot_origblob_cleanup(ctx, 0); 6230 return; 6231 } 6232 6233 /* Do two passes - one to verify that we can obtain enough clusters 6234 * and another to actually claim them. 6235 */ 6236 clusters_needed = 0; 6237 for (i = 0; i < _blob->active.num_clusters; i++) { 6238 if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { 6239 clusters_needed++; 6240 } 6241 } 6242 6243 if (clusters_needed > _blob->bs->num_free_clusters) { 6244 /* Not enough free clusters. Cannot satisfy the request.
*/ 6245 bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); 6246 return; 6247 } 6248 6249 ctx->cluster = 0; 6250 bs_inflate_blob_touch_next(ctx, 0); 6251 } 6252 6253 static void 6254 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6255 spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) 6256 { 6257 struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); 6258 6259 if (!ctx) { 6260 cb_fn(cb_arg, -ENOMEM); 6261 return; 6262 } 6263 ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6264 ctx->cpl.u.bs_basic.cb_fn = cb_fn; 6265 ctx->cpl.u.bs_basic.cb_arg = cb_arg; 6266 ctx->bserrno = 0; 6267 ctx->original.id = blobid; 6268 ctx->channel = channel; 6269 ctx->allocate_all = allocate_all; 6270 6271 spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx); 6272 } 6273 6274 void 6275 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6276 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6277 { 6278 bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); 6279 } 6280 6281 void 6282 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, 6283 spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) 6284 { 6285 bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); 6286 } 6287 /* END spdk_bs_inflate_blob */ 6288 6289 /* START spdk_blob_resize */ 6290 struct spdk_bs_resize_ctx { 6291 spdk_blob_op_complete cb_fn; 6292 void *cb_arg; 6293 struct spdk_blob *blob; 6294 uint64_t sz; 6295 int rc; 6296 }; 6297 6298 static void 6299 bs_resize_unfreeze_cpl(void *cb_arg, int rc) 6300 { 6301 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6302 6303 if (rc != 0) { 6304 SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); 6305 } 6306 6307 if (ctx->rc != 0) { 6308 SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); 6309 rc = ctx->rc; 6310 } 6311 6312 ctx->blob->locked_operation_in_progress = false; 6313 6314 ctx->cb_fn(ctx->cb_arg, rc); 6315 free(ctx); 6316 } 6317 6318 static void 6319 bs_resize_freeze_cpl(void *cb_arg, int rc) 6320 { 6321 struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; 6322 6323 if (rc != 0) { 6324 ctx->blob->locked_operation_in_progress = false; 6325 ctx->cb_fn(ctx->cb_arg, rc); 6326 free(ctx); 6327 return; 6328 } 6329 6330 ctx->rc = blob_resize(ctx->blob, ctx->sz); 6331 6332 blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx); 6333 } 6334 6335 void 6336 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) 6337 { 6338 struct spdk_bs_resize_ctx *ctx; 6339 6340 blob_verify_md_op(blob); 6341 6342 SPDK_DEBUGLOG(blob, "Resizing blob %" PRIu64 " to %" PRIu64 " clusters\n", blob->id, sz); 6343 6344 if (blob->md_ro) { 6345 cb_fn(cb_arg, -EPERM); 6346 return; 6347 } 6348 6349 if (sz == blob->active.num_clusters) { 6350 cb_fn(cb_arg, 0); 6351 return; 6352 } 6353 6354 if (blob->locked_operation_in_progress) { 6355 cb_fn(cb_arg, -EBUSY); 6356 return; 6357 } 6358 6359 ctx = calloc(1, sizeof(*ctx)); 6360 if (!ctx) { 6361 cb_fn(cb_arg, -ENOMEM); 6362 return; 6363 } 6364 6365 blob->locked_operation_in_progress = true; 6366 ctx->cb_fn = cb_fn; 6367 ctx->cb_arg = cb_arg; 6368 ctx->blob = blob; 6369 ctx->sz = sz; 6370 blob_freeze_io(blob, bs_resize_freeze_cpl, ctx); 6371 } 6372 6373 /* END spdk_blob_resize */ 6374 6375 6376 /* START spdk_bs_delete_blob */ 6377 6378 static void 6379 bs_delete_close_cpl(void *cb_arg, int bserrno) 6380 { 6381 spdk_bs_sequence_t *seq = 
cb_arg; 6382 6383 bs_sequence_finish(seq, bserrno); 6384 } 6385 6386 static void 6387 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6388 { 6389 struct spdk_blob *blob = cb_arg; 6390 6391 if (bserrno != 0) { 6392 /* 6393 * We already removed this blob from the blobstore tailq, so 6394 * we need to free it here since this is the last reference 6395 * to it. 6396 */ 6397 blob_free(blob); 6398 bs_delete_close_cpl(seq, bserrno); 6399 return; 6400 } 6401 6402 /* 6403 * This will immediately decrement the ref_count and call 6404 * the completion routine since the metadata state is clean. 6405 * By calling spdk_blob_close, we reduce the number of call 6406 * points into code that touches the blob->open_ref count 6407 * and the blobstore's blob list. 6408 */ 6409 spdk_blob_close(blob, bs_delete_close_cpl, seq); 6410 } 6411 6412 struct delete_snapshot_ctx { 6413 struct spdk_blob_list *parent_snapshot_entry; 6414 struct spdk_blob *snapshot; 6415 bool snapshot_md_ro; 6416 struct spdk_blob *clone; 6417 bool clone_md_ro; 6418 spdk_blob_op_with_handle_complete cb_fn; 6419 void *cb_arg; 6420 int bserrno; 6421 uint32_t next_extent_page; 6422 }; 6423 6424 static void 6425 delete_blob_cleanup_finish(void *cb_arg, int bserrno) 6426 { 6427 struct delete_snapshot_ctx *ctx = cb_arg; 6428 6429 if (bserrno != 0) { 6430 SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno); 6431 } 6432 6433 assert(ctx != NULL); 6434 6435 if (bserrno != 0 && ctx->bserrno == 0) { 6436 ctx->bserrno = bserrno; 6437 } 6438 6439 ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno); 6440 free(ctx); 6441 } 6442 6443 static void 6444 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno) 6445 { 6446 struct delete_snapshot_ctx *ctx = cb_arg; 6447 6448 if (bserrno != 0) { 6449 ctx->bserrno = bserrno; 6450 SPDK_ERRLOG("Clone cleanup error %d\n", bserrno); 6451 } 6452 6453 if (ctx->bserrno != 0) { 6454 assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL); 6455 RB_INSERT(spdk_blob_tree, &ctx->snapshot->bs->open_blobs, ctx->snapshot); 6456 spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id); 6457 } 6458 6459 ctx->snapshot->locked_operation_in_progress = false; 6460 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6461 6462 spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx); 6463 } 6464 6465 static void 6466 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno) 6467 { 6468 struct delete_snapshot_ctx *ctx = cb_arg; 6469 6470 ctx->clone->locked_operation_in_progress = false; 6471 ctx->clone->md_ro = ctx->clone_md_ro; 6472 6473 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6474 } 6475 6476 static void 6477 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) 6478 { 6479 struct delete_snapshot_ctx *ctx = cb_arg; 6480 6481 if (bserrno) { 6482 ctx->bserrno = bserrno; 6483 delete_snapshot_cleanup_clone(ctx, 0); 6484 return; 6485 } 6486 6487 ctx->clone->locked_operation_in_progress = false; 6488 spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx); 6489 } 6490 6491 static void 6492 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno) 6493 { 6494 struct delete_snapshot_ctx *ctx = cb_arg; 6495 struct spdk_blob_list *parent_snapshot_entry = NULL; 6496 struct spdk_blob_list *snapshot_entry = NULL; 6497 struct spdk_blob_list *clone_entry = NULL; 6498 struct spdk_blob_list *snapshot_clone_entry = NULL; 6499 6500 if (bserrno) { 6501 SPDK_ERRLOG("Failed to sync MD on blob\n"); 6502 ctx->bserrno = bserrno; 6503 delete_snapshot_cleanup_clone(ctx, 0); 
6504 return; 6505 } 6506 6507 /* Get snapshot entry for the snapshot we want to remove */ 6508 snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id); 6509 6510 assert(snapshot_entry != NULL); 6511 6512 /* Remove clone entry in this snapshot (at this point there can be only one clone) */ 6513 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6514 assert(clone_entry != NULL); 6515 TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); 6516 snapshot_entry->clone_count--; 6517 assert(TAILQ_EMPTY(&snapshot_entry->clones)); 6518 6519 if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) { 6520 /* This snapshot is at the same time a clone of another snapshot - we need to 6521 * update parent snapshot (remove current clone, add new one inherited from 6522 * the snapshot that is being removed) */ 6523 6524 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6525 * snapshot that we are removing */ 6526 blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry, 6527 &snapshot_clone_entry); 6528 6529 /* Switch clone entry in parent snapshot */ 6530 TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link); 6531 TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link); 6532 free(snapshot_clone_entry); 6533 } else { 6534 /* No parent snapshot - just remove clone entry */ 6535 free(clone_entry); 6536 } 6537 6538 /* Restore md_ro flags */ 6539 ctx->clone->md_ro = ctx->clone_md_ro; 6540 ctx->snapshot->md_ro = ctx->snapshot_md_ro; 6541 6542 blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx); 6543 } 6544 6545 static void 6546 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno) 6547 { 6548 struct delete_snapshot_ctx *ctx = cb_arg; 6549 uint64_t i; 6550 6551 ctx->snapshot->md_ro = false; 6552 6553 if (bserrno) { 6554 SPDK_ERRLOG("Failed to sync MD on clone\n"); 6555 ctx->bserrno = bserrno; 6556 6557 /* Restore snapshot to previous state */ 6558 bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true); 6559 if (bserrno != 0) { 6560 delete_snapshot_cleanup_clone(ctx, bserrno); 6561 return; 6562 } 6563 6564 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx); 6565 return; 6566 } 6567 6568 /* Clear cluster map entries for snapshot */ 6569 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6570 if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) { 6571 ctx->snapshot->active.clusters[i] = 0; 6572 } 6573 } 6574 for (i = 0; i < ctx->snapshot->active.num_extent_pages && 6575 i < ctx->clone->active.num_extent_pages; i++) { 6576 if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) { 6577 ctx->snapshot->active.extent_pages[i] = 0; 6578 } 6579 } 6580 6581 blob_set_thin_provision(ctx->snapshot); 6582 ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY; 6583 6584 if (ctx->parent_snapshot_entry != NULL) { 6585 ctx->snapshot->back_bs_dev = NULL; 6586 } 6587 6588 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx); 6589 } 6590 6591 static void 6592 delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx) 6593 { 6594 /* Delete old backing bs_dev from clone (related to snapshot that will be removed) */ 6595 ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev); 6596 6597 /* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... 
*/ 6598 if (ctx->parent_snapshot_entry != NULL) { 6599 /* ...to parent snapshot */ 6600 ctx->clone->parent_id = ctx->parent_snapshot_entry->id; 6601 ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev; 6602 blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id, 6603 sizeof(spdk_blob_id), 6604 true); 6605 } else { 6606 /* ...to blobid invalid and zeroes dev */ 6607 ctx->clone->parent_id = SPDK_BLOBID_INVALID; 6608 ctx->clone->back_bs_dev = bs_create_zeroes_dev(); 6609 blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true); 6610 } 6611 6612 spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx); 6613 } 6614 6615 static void 6616 delete_snapshot_update_extent_pages(void *cb_arg, int bserrno) 6617 { 6618 struct delete_snapshot_ctx *ctx = cb_arg; 6619 uint32_t *extent_page; 6620 uint64_t i; 6621 6622 for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages && 6623 i < ctx->clone->active.num_extent_pages; i++) { 6624 if (ctx->snapshot->active.extent_pages[i] == 0) { 6625 /* No extent page to use from snapshot */ 6626 continue; 6627 } 6628 6629 extent_page = &ctx->clone->active.extent_pages[i]; 6630 if (*extent_page == 0) { 6631 /* Copy extent page from snapshot when clone did not have a matching one */ 6632 *extent_page = ctx->snapshot->active.extent_pages[i]; 6633 continue; 6634 } 6635 6636 /* Clone and snapshot both contain partially filled matching extent pages. 6637 * Update the clone extent page in place with cluster map containing the mix of both. */ 6638 ctx->next_extent_page = i + 1; 6639 6640 blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, 6641 delete_snapshot_update_extent_pages, ctx); 6642 return; 6643 } 6644 delete_snapshot_update_extent_pages_cpl(ctx); 6645 } 6646 6647 static void 6648 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno) 6649 { 6650 struct delete_snapshot_ctx *ctx = cb_arg; 6651 uint64_t i; 6652 6653 /* Temporarily override md_ro flag for clone for MD modification */ 6654 ctx->clone_md_ro = ctx->clone->md_ro; 6655 ctx->clone->md_ro = false; 6656 6657 if (bserrno) { 6658 SPDK_ERRLOG("Failed to sync MD with xattr on blob\n"); 6659 ctx->bserrno = bserrno; 6660 delete_snapshot_cleanup_clone(ctx, 0); 6661 return; 6662 } 6663 6664 /* Copy snapshot map to clone map (only unallocated clusters in clone) */ 6665 for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) { 6666 if (ctx->clone->active.clusters[i] == 0) { 6667 ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i]; 6668 } 6669 } 6670 ctx->next_extent_page = 0; 6671 delete_snapshot_update_extent_pages(ctx, 0); 6672 } 6673 6674 static void 6675 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno) 6676 { 6677 struct delete_snapshot_ctx *ctx = cb_arg; 6678 6679 if (bserrno) { 6680 SPDK_ERRLOG("Failed to freeze I/O on clone\n"); 6681 ctx->bserrno = bserrno; 6682 delete_snapshot_cleanup_clone(ctx, 0); 6683 return; 6684 } 6685 6686 /* Temporarily override md_ro flag for snapshot for MD modification */ 6687 ctx->snapshot_md_ro = ctx->snapshot->md_ro; 6688 ctx->snapshot->md_ro = false; 6689 6690 /* Mark blob as pending for removal for power failure safety, use clone id for recovery */ 6691 ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id, 6692 sizeof(spdk_blob_id), true); 6693 if (ctx->bserrno != 0) { 6694 delete_snapshot_cleanup_clone(ctx, 0); 6695 return; 6696 } 6697 6698 spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, 
ctx); 6699 } 6700 6701 static void 6702 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno) 6703 { 6704 struct delete_snapshot_ctx *ctx = cb_arg; 6705 6706 if (bserrno) { 6707 SPDK_ERRLOG("Failed to open clone\n"); 6708 ctx->bserrno = bserrno; 6709 delete_snapshot_cleanup_snapshot(ctx, 0); 6710 return; 6711 } 6712 6713 ctx->clone = clone; 6714 6715 if (clone->locked_operation_in_progress) { 6716 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n"); 6717 ctx->bserrno = -EBUSY; 6718 spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx); 6719 return; 6720 } 6721 6722 clone->locked_operation_in_progress = true; 6723 6724 blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx); 6725 } 6726 6727 static void 6728 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx) 6729 { 6730 struct spdk_blob_list *snapshot_entry = NULL; 6731 struct spdk_blob_list *clone_entry = NULL; 6732 struct spdk_blob_list *snapshot_clone_entry = NULL; 6733 6734 /* Get snapshot entry for the snapshot we want to remove */ 6735 snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id); 6736 6737 assert(snapshot_entry != NULL); 6738 6739 /* Get clone of the snapshot (at this point there can be only one clone) */ 6740 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6741 assert(snapshot_entry->clone_count == 1); 6742 assert(clone_entry != NULL); 6743 6744 /* Get snapshot entry for parent snapshot and clone entry within that snapshot for 6745 * snapshot that we are removing */ 6746 blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry, 6747 &snapshot_clone_entry); 6748 6749 spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx); 6750 } 6751 6752 static void 6753 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno) 6754 { 6755 spdk_bs_sequence_t *seq = cb_arg; 6756 struct spdk_blob_list *snapshot_entry = NULL; 6757 uint32_t page_num; 6758 6759 if (bserrno) { 6760 SPDK_ERRLOG("Failed to remove blob\n"); 6761 bs_sequence_finish(seq, bserrno); 6762 return; 6763 } 6764 6765 /* Remove snapshot from the list */ 6766 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 6767 if (snapshot_entry != NULL) { 6768 TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); 6769 free(snapshot_entry); 6770 } 6771 6772 page_num = bs_blobid_to_page(blob->id); 6773 spdk_bit_array_clear(blob->bs->used_blobids, page_num); 6774 blob->state = SPDK_BLOB_STATE_DIRTY; 6775 blob->active.num_pages = 0; 6776 blob_resize(blob, 0); 6777 6778 blob_persist(seq, blob, bs_delete_persist_cpl, blob); 6779 } 6780 6781 static int 6782 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone) 6783 { 6784 struct spdk_blob_list *snapshot_entry = NULL; 6785 struct spdk_blob_list *clone_entry = NULL; 6786 struct spdk_blob *clone = NULL; 6787 bool has_one_clone = false; 6788 6789 /* Check if this is a snapshot with clones */ 6790 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 6791 if (snapshot_entry != NULL) { 6792 if (snapshot_entry->clone_count > 1) { 6793 SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n"); 6794 return -EBUSY; 6795 } else if (snapshot_entry->clone_count == 1) { 6796 has_one_clone = true; 6797 } 6798 } 6799 6800 /* Check if someone has this blob open (besides this delete context): 6801 * - open_ref = 1 - only this context opened blob, so it is ok to remove it 6802 * - open_ref <= 2 && has_one_clone = true - clone is 
holding snapshot 6803 * and that is ok, because we will update it accordingly */ 6804 if (blob->open_ref <= 2 && has_one_clone) { 6805 clone_entry = TAILQ_FIRST(&snapshot_entry->clones); 6806 assert(clone_entry != NULL); 6807 clone = blob_lookup(blob->bs, clone_entry->id); 6808 6809 if (blob->open_ref == 2 && clone == NULL) { 6810 /* Clone is closed and someone else opened this blob */ 6811 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 6812 return -EBUSY; 6813 } 6814 6815 *update_clone = true; 6816 return 0; 6817 } 6818 6819 if (blob->open_ref > 1) { 6820 SPDK_ERRLOG("Cannot remove snapshot because it is open\n"); 6821 return -EBUSY; 6822 } 6823 6824 assert(has_one_clone == false); 6825 *update_clone = false; 6826 return 0; 6827 } 6828 6829 static void 6830 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno) 6831 { 6832 spdk_bs_sequence_t *seq = cb_arg; 6833 6834 bs_sequence_finish(seq, -ENOMEM); 6835 } 6836 6837 static void 6838 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) 6839 { 6840 spdk_bs_sequence_t *seq = cb_arg; 6841 struct delete_snapshot_ctx *ctx; 6842 bool update_clone = false; 6843 6844 if (bserrno != 0) { 6845 bs_sequence_finish(seq, bserrno); 6846 return; 6847 } 6848 6849 blob_verify_md_op(blob); 6850 6851 ctx = calloc(1, sizeof(*ctx)); 6852 if (ctx == NULL) { 6853 spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq); 6854 return; 6855 } 6856 6857 ctx->snapshot = blob; 6858 ctx->cb_fn = bs_delete_blob_finish; 6859 ctx->cb_arg = seq; 6860 6861 /* Check if blob can be removed and if it is a snapshot with clone on top of it */ 6862 ctx->bserrno = bs_is_blob_deletable(blob, &update_clone); 6863 if (ctx->bserrno) { 6864 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 6865 return; 6866 } 6867 6868 if (blob->locked_operation_in_progress) { 6869 SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n"); 6870 ctx->bserrno = -EBUSY; 6871 spdk_blob_close(blob, delete_blob_cleanup_finish, ctx); 6872 return; 6873 } 6874 6875 blob->locked_operation_in_progress = true; 6876 6877 /* 6878 * Remove the blob from the blob_store list now, to ensure it does not 6879 * get returned after this point by blob_lookup(). 
6880 */ 6881 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 6882 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 6883 6884 if (update_clone) { 6885 /* This blob is a snapshot with active clone - update clone first */ 6886 update_clone_on_snapshot_deletion(blob, ctx); 6887 } else { 6888 /* This blob does not have any clones - just remove it */ 6889 bs_blob_list_remove(blob); 6890 bs_delete_blob_finish(seq, blob, 0); 6891 free(ctx); 6892 } 6893 } 6894 6895 void 6896 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 6897 spdk_blob_op_complete cb_fn, void *cb_arg) 6898 { 6899 struct spdk_bs_cpl cpl; 6900 spdk_bs_sequence_t *seq; 6901 6902 SPDK_DEBUGLOG(blob, "Deleting blob %" PRIu64 "\n", blobid); 6903 6904 assert(spdk_get_thread() == bs->md_thread); 6905 6906 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 6907 cpl.u.blob_basic.cb_fn = cb_fn; 6908 cpl.u.blob_basic.cb_arg = cb_arg; 6909 6910 seq = bs_sequence_start(bs->md_channel, &cpl); 6911 if (!seq) { 6912 cb_fn(cb_arg, -ENOMEM); 6913 return; 6914 } 6915 6916 spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq); 6917 } 6918 6919 /* END spdk_bs_delete_blob */ 6920 6921 /* START spdk_bs_open_blob */ 6922 6923 static void 6924 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 6925 { 6926 struct spdk_blob *blob = cb_arg; 6927 struct spdk_blob *existing; 6928 6929 if (bserrno != 0) { 6930 blob_free(blob); 6931 seq->cpl.u.blob_handle.blob = NULL; 6932 bs_sequence_finish(seq, bserrno); 6933 return; 6934 } 6935 6936 existing = blob_lookup(blob->bs, blob->id); 6937 if (existing) { 6938 blob_free(blob); 6939 existing->open_ref++; 6940 seq->cpl.u.blob_handle.blob = existing; 6941 bs_sequence_finish(seq, 0); 6942 return; 6943 } 6944 6945 blob->open_ref++; 6946 6947 spdk_bit_array_set(blob->bs->open_blobids, blob->id); 6948 RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob); 6949 6950 bs_sequence_finish(seq, bserrno); 6951 } 6952 6953 static inline void 6954 blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst) 6955 { 6956 #define FIELD_OK(field) \ 6957 offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size 6958 6959 #define SET_FIELD(field) \ 6960 if (FIELD_OK(field)) { \ 6961 dst->field = src->field; \ 6962 } \ 6963 6964 SET_FIELD(clear_method); 6965 6966 dst->opts_size = src->opts_size; 6967 6968 /* You should not remove this statement, but need to update the assert statement 6969 * if you add a new field, and also add a corresponding SET_FIELD statement */ 6970 SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 16, "Incorrect size"); 6971 6972 #undef FIELD_OK 6973 #undef SET_FIELD 6974 } 6975 6976 static void 6977 bs_open_blob(struct spdk_blob_store *bs, 6978 spdk_blob_id blobid, 6979 struct spdk_blob_open_opts *opts, 6980 spdk_blob_op_with_handle_complete cb_fn, 6981 void *cb_arg) 6982 { 6983 struct spdk_blob *blob; 6984 struct spdk_bs_cpl cpl; 6985 struct spdk_blob_open_opts opts_local; 6986 spdk_bs_sequence_t *seq; 6987 uint32_t page_num; 6988 6989 SPDK_DEBUGLOG(blob, "Opening blob %" PRIu64 "\n", blobid); 6990 assert(spdk_get_thread() == bs->md_thread); 6991 6992 page_num = bs_blobid_to_page(blobid); 6993 if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { 6994 /* Invalid blobid */ 6995 cb_fn(cb_arg, NULL, -ENOENT); 6996 return; 6997 } 6998 6999 blob = blob_lookup(bs, blobid); 7000 if (blob) { 7001 blob->open_ref++; 7002 cb_fn(cb_arg, blob, 0); 7003 return; 7004 } 7005 7006 blob = blob_alloc(bs, blobid); 7007 if 
(!blob) { 7008 cb_fn(cb_arg, NULL, -ENOMEM); 7009 return; 7010 } 7011 7012 spdk_blob_open_opts_init(&opts_local, sizeof(opts_local)); 7013 if (opts) { 7014 blob_open_opts_copy(opts, &opts_local); 7015 } 7016 7017 blob->clear_method = opts_local.clear_method; 7018 7019 cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; 7020 cpl.u.blob_handle.cb_fn = cb_fn; 7021 cpl.u.blob_handle.cb_arg = cb_arg; 7022 cpl.u.blob_handle.blob = blob; 7023 7024 seq = bs_sequence_start(bs->md_channel, &cpl); 7025 if (!seq) { 7026 blob_free(blob); 7027 cb_fn(cb_arg, NULL, -ENOMEM); 7028 return; 7029 } 7030 7031 blob_load(seq, blob, bs_open_blob_cpl, blob); 7032 } 7033 7034 void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, 7035 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7036 { 7037 bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg); 7038 } 7039 7040 void spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid, 7041 struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7042 { 7043 bs_open_blob(bs, blobid, opts, cb_fn, cb_arg); 7044 } 7045 7046 /* END spdk_bs_open_blob */ 7047 7048 /* START spdk_blob_set_read_only */ 7049 int spdk_blob_set_read_only(struct spdk_blob *blob) 7050 { 7051 blob_verify_md_op(blob); 7052 7053 blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; 7054 7055 blob->state = SPDK_BLOB_STATE_DIRTY; 7056 return 0; 7057 } 7058 /* END spdk_blob_set_read_only */ 7059 7060 /* START spdk_blob_sync_md */ 7061 7062 static void 7063 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7064 { 7065 struct spdk_blob *blob = cb_arg; 7066 7067 if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { 7068 blob->data_ro = true; 7069 blob->md_ro = true; 7070 } 7071 7072 bs_sequence_finish(seq, bserrno); 7073 } 7074 7075 static void 7076 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7077 { 7078 struct spdk_bs_cpl cpl; 7079 spdk_bs_sequence_t *seq; 7080 7081 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7082 cpl.u.blob_basic.cb_fn = cb_fn; 7083 cpl.u.blob_basic.cb_arg = cb_arg; 7084 7085 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7086 if (!seq) { 7087 cb_fn(cb_arg, -ENOMEM); 7088 return; 7089 } 7090 7091 blob_persist(seq, blob, blob_sync_md_cpl, blob); 7092 } 7093 7094 void 7095 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7096 { 7097 blob_verify_md_op(blob); 7098 7099 SPDK_DEBUGLOG(blob, "Syncing blob %" PRIu64 "\n", blob->id); 7100 7101 if (blob->md_ro) { 7102 assert(blob->state == SPDK_BLOB_STATE_CLEAN); 7103 cb_fn(cb_arg, 0); 7104 return; 7105 } 7106 7107 blob_sync_md(blob, cb_fn, cb_arg); 7108 } 7109 7110 /* END spdk_blob_sync_md */ 7111 7112 struct spdk_blob_insert_cluster_ctx { 7113 struct spdk_thread *thread; 7114 struct spdk_blob *blob; 7115 uint32_t cluster_num; /* cluster index in blob */ 7116 uint32_t cluster; /* cluster on disk */ 7117 uint32_t extent_page; /* extent page on disk */ 7118 int rc; 7119 spdk_blob_op_complete cb_fn; 7120 void *cb_arg; 7121 }; 7122 7123 static void 7124 blob_insert_cluster_msg_cpl(void *arg) 7125 { 7126 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7127 7128 ctx->cb_fn(ctx->cb_arg, ctx->rc); 7129 free(ctx); 7130 } 7131 7132 static void 7133 blob_insert_cluster_msg_cb(void *arg, int bserrno) 7134 { 7135 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7136 7137 ctx->rc = bserrno; 7138 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7139 } 7140 7141 static void 7142 blob_insert_new_ep_cb(void 
*arg, int bserrno) 7143 { 7144 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7145 uint32_t *extent_page; 7146 7147 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7148 *extent_page = ctx->extent_page; 7149 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7150 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7151 } 7152 7153 static void 7154 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7155 { 7156 struct spdk_blob_md_page *page = cb_arg; 7157 7158 bs_sequence_finish(seq, bserrno); 7159 spdk_free(page); 7160 } 7161 7162 static void 7163 blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num, 7164 spdk_blob_op_complete cb_fn, void *cb_arg) 7165 { 7166 spdk_bs_sequence_t *seq; 7167 struct spdk_bs_cpl cpl; 7168 struct spdk_blob_md_page *page = NULL; 7169 uint32_t page_count = 0; 7170 int rc; 7171 7172 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7173 cpl.u.blob_basic.cb_fn = cb_fn; 7174 cpl.u.blob_basic.cb_arg = cb_arg; 7175 7176 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7177 if (!seq) { 7178 cb_fn(cb_arg, -ENOMEM); 7179 return; 7180 } 7181 rc = blob_serialize_add_page(blob, &page, &page_count, &page); 7182 if (rc < 0) { 7183 bs_sequence_finish(seq, rc); 7184 return; 7185 } 7186 7187 blob_serialize_extent_page(blob, cluster_num, page); 7188 7189 page->crc = blob_md_page_calc_crc(page); 7190 7191 assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true); 7192 7193 bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent), 7194 bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE), 7195 blob_persist_extent_page_cpl, page); 7196 } 7197 7198 static void 7199 blob_insert_cluster_msg(void *arg) 7200 { 7201 struct spdk_blob_insert_cluster_ctx *ctx = arg; 7202 uint32_t *extent_page; 7203 7204 ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); 7205 if (ctx->rc != 0) { 7206 spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx); 7207 return; 7208 } 7209 7210 if (ctx->blob->use_extent_table == false) { 7211 /* Extent table is not used, proceed with sync of md that will only use extents_rle. */ 7212 ctx->blob->state = SPDK_BLOB_STATE_DIRTY; 7213 blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx); 7214 return; 7215 } 7216 7217 extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num); 7218 if (*extent_page == 0) { 7219 /* Extent page requires allocation. 7220 * It was already claimed in the used_md_pages map and placed in ctx. */ 7221 assert(ctx->extent_page != 0); 7222 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7223 blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, 7224 blob_insert_new_ep_cb, ctx); 7225 } else { 7226 /* It is possible for original thread to allocate extent page for 7227 * different cluster in the same extent page. In such case proceed with 7228 * updating the existing extent page, but release the additional one. */ 7229 if (ctx->extent_page != 0) { 7230 assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true); 7231 bs_release_md_page(ctx->blob->bs, ctx->extent_page); 7232 ctx->extent_page = 0; 7233 } 7234 /* Extent page already allocated. 7235 * Every cluster allocation, requires just an update of single extent page. 
*/ 7236 blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, 7237 blob_insert_cluster_msg_cb, ctx); 7238 } 7239 } 7240 7241 static void 7242 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, 7243 uint64_t cluster, uint32_t extent_page, spdk_blob_op_complete cb_fn, void *cb_arg) 7244 { 7245 struct spdk_blob_insert_cluster_ctx *ctx; 7246 7247 ctx = calloc(1, sizeof(*ctx)); 7248 if (ctx == NULL) { 7249 cb_fn(cb_arg, -ENOMEM); 7250 return; 7251 } 7252 7253 ctx->thread = spdk_get_thread(); 7254 ctx->blob = blob; 7255 ctx->cluster_num = cluster_num; 7256 ctx->cluster = cluster; 7257 ctx->extent_page = extent_page; 7258 ctx->cb_fn = cb_fn; 7259 ctx->cb_arg = cb_arg; 7260 7261 spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx); 7262 } 7263 7264 /* START spdk_blob_close */ 7265 7266 static void 7267 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) 7268 { 7269 struct spdk_blob *blob = cb_arg; 7270 7271 if (bserrno == 0) { 7272 blob->open_ref--; 7273 if (blob->open_ref == 0) { 7274 /* 7275 * Blobs with active.num_pages == 0 are deleted blobs. 7276 * these blobs are removed from the blob_store list 7277 * when the deletion process starts - so don't try to 7278 * remove them again. 7279 */ 7280 if (blob->active.num_pages > 0) { 7281 spdk_bit_array_clear(blob->bs->open_blobids, blob->id); 7282 RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob); 7283 } 7284 blob_free(blob); 7285 } 7286 } 7287 7288 bs_sequence_finish(seq, bserrno); 7289 } 7290 7291 void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) 7292 { 7293 struct spdk_bs_cpl cpl; 7294 spdk_bs_sequence_t *seq; 7295 7296 blob_verify_md_op(blob); 7297 7298 SPDK_DEBUGLOG(blob, "Closing blob %" PRIu64 "\n", blob->id); 7299 7300 if (blob->open_ref == 0) { 7301 cb_fn(cb_arg, -EBADF); 7302 return; 7303 } 7304 7305 cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; 7306 cpl.u.blob_basic.cb_fn = cb_fn; 7307 cpl.u.blob_basic.cb_arg = cb_arg; 7308 7309 seq = bs_sequence_start(blob->bs->md_channel, &cpl); 7310 if (!seq) { 7311 cb_fn(cb_arg, -ENOMEM); 7312 return; 7313 } 7314 7315 /* Sync metadata */ 7316 blob_persist(seq, blob, blob_close_cpl, blob); 7317 } 7318 7319 /* END spdk_blob_close */ 7320 7321 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) 7322 { 7323 return spdk_get_io_channel(bs); 7324 } 7325 7326 void spdk_bs_free_io_channel(struct spdk_io_channel *channel) 7327 { 7328 spdk_put_io_channel(channel); 7329 } 7330 7331 void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, 7332 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7333 { 7334 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7335 SPDK_BLOB_UNMAP); 7336 } 7337 7338 void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, 7339 uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) 7340 { 7341 blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, 7342 SPDK_BLOB_WRITE_ZEROES); 7343 } 7344 7345 void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, 7346 void *payload, uint64_t offset, uint64_t length, 7347 spdk_blob_op_complete cb_fn, void *cb_arg) 7348 { 7349 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7350 SPDK_BLOB_WRITE); 7351 } 7352 7353 void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, 7354 void *payload, uint64_t 
offset, uint64_t length, 7355 spdk_blob_op_complete cb_fn, void *cb_arg) 7356 { 7357 blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, 7358 SPDK_BLOB_READ); 7359 } 7360 7361 void spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel, 7362 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7363 spdk_blob_op_complete cb_fn, void *cb_arg) 7364 { 7365 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false); 7366 } 7367 7368 void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, 7369 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, 7370 spdk_blob_op_complete cb_fn, void *cb_arg) 7371 { 7372 blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true); 7373 } 7374 7375 struct spdk_bs_iter_ctx { 7376 int64_t page_num; 7377 struct spdk_blob_store *bs; 7378 7379 spdk_blob_op_with_handle_complete cb_fn; 7380 void *cb_arg; 7381 }; 7382 7383 static void 7384 bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) 7385 { 7386 struct spdk_bs_iter_ctx *ctx = cb_arg; 7387 struct spdk_blob_store *bs = ctx->bs; 7388 spdk_blob_id id; 7389 7390 if (bserrno == 0) { 7391 ctx->cb_fn(ctx->cb_arg, _blob, bserrno); 7392 free(ctx); 7393 return; 7394 } 7395 7396 ctx->page_num++; 7397 ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num); 7398 if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) { 7399 ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT); 7400 free(ctx); 7401 return; 7402 } 7403 7404 id = bs_page_to_blobid(ctx->page_num); 7405 7406 spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx); 7407 } 7408 7409 void 7410 spdk_bs_iter_first(struct spdk_blob_store *bs, 7411 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7412 { 7413 struct spdk_bs_iter_ctx *ctx; 7414 7415 ctx = calloc(1, sizeof(*ctx)); 7416 if (!ctx) { 7417 cb_fn(cb_arg, NULL, -ENOMEM); 7418 return; 7419 } 7420 7421 ctx->page_num = -1; 7422 ctx->bs = bs; 7423 ctx->cb_fn = cb_fn; 7424 ctx->cb_arg = cb_arg; 7425 7426 bs_iter_cpl(ctx, NULL, -1); 7427 } 7428 7429 static void 7430 bs_iter_close_cpl(void *cb_arg, int bserrno) 7431 { 7432 struct spdk_bs_iter_ctx *ctx = cb_arg; 7433 7434 bs_iter_cpl(ctx, NULL, -1); 7435 } 7436 7437 void 7438 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob, 7439 spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) 7440 { 7441 struct spdk_bs_iter_ctx *ctx; 7442 7443 assert(blob != NULL); 7444 7445 ctx = calloc(1, sizeof(*ctx)); 7446 if (!ctx) { 7447 cb_fn(cb_arg, NULL, -ENOMEM); 7448 return; 7449 } 7450 7451 ctx->page_num = bs_blobid_to_page(blob->id); 7452 ctx->bs = bs; 7453 ctx->cb_fn = cb_fn; 7454 ctx->cb_arg = cb_arg; 7455 7456 /* Close the existing blob */ 7457 spdk_blob_close(blob, bs_iter_close_cpl, ctx); 7458 } 7459 7460 static int 7461 blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 7462 uint16_t value_len, bool internal) 7463 { 7464 struct spdk_xattr_tailq *xattrs; 7465 struct spdk_xattr *xattr; 7466 size_t desc_size; 7467 void *tmp; 7468 7469 blob_verify_md_op(blob); 7470 7471 if (blob->md_ro) { 7472 return -EPERM; 7473 } 7474 7475 desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len; 7476 if (desc_size > SPDK_BS_MAX_DESC_SIZE) { 7477 SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fix into single page %zu\n", name, 7478 desc_size, SPDK_BS_MAX_DESC_SIZE); 7479 return -ENOMEM; 7480 } 7481 7482 if (internal) { 7483 xattrs = 
&blob->xattrs_internal; 7484 blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR; 7485 } else { 7486 xattrs = &blob->xattrs; 7487 } 7488 7489 TAILQ_FOREACH(xattr, xattrs, link) { 7490 if (!strcmp(name, xattr->name)) { 7491 tmp = malloc(value_len); 7492 if (!tmp) { 7493 return -ENOMEM; 7494 } 7495 7496 free(xattr->value); 7497 xattr->value_len = value_len; 7498 xattr->value = tmp; 7499 memcpy(xattr->value, value, value_len); 7500 7501 blob->state = SPDK_BLOB_STATE_DIRTY; 7502 7503 return 0; 7504 } 7505 } 7506 7507 xattr = calloc(1, sizeof(*xattr)); 7508 if (!xattr) { 7509 return -ENOMEM; 7510 } 7511 7512 xattr->name = strdup(name); 7513 if (!xattr->name) { 7514 free(xattr); 7515 return -ENOMEM; 7516 } 7517 7518 xattr->value_len = value_len; 7519 xattr->value = malloc(value_len); 7520 if (!xattr->value) { 7521 free(xattr->name); 7522 free(xattr); 7523 return -ENOMEM; 7524 } 7525 memcpy(xattr->value, value, value_len); 7526 TAILQ_INSERT_TAIL(xattrs, xattr, link); 7527 7528 blob->state = SPDK_BLOB_STATE_DIRTY; 7529 7530 return 0; 7531 } 7532 7533 int 7534 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, 7535 uint16_t value_len) 7536 { 7537 return blob_set_xattr(blob, name, value, value_len, false); 7538 } 7539 7540 static int 7541 blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal) 7542 { 7543 struct spdk_xattr_tailq *xattrs; 7544 struct spdk_xattr *xattr; 7545 7546 blob_verify_md_op(blob); 7547 7548 if (blob->md_ro) { 7549 return -EPERM; 7550 } 7551 xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; 7552 7553 TAILQ_FOREACH(xattr, xattrs, link) { 7554 if (!strcmp(name, xattr->name)) { 7555 TAILQ_REMOVE(xattrs, xattr, link); 7556 free(xattr->value); 7557 free(xattr->name); 7558 free(xattr); 7559 7560 if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) { 7561 blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR; 7562 } 7563 blob->state = SPDK_BLOB_STATE_DIRTY; 7564 7565 return 0; 7566 } 7567 } 7568 7569 return -ENOENT; 7570 } 7571 7572 int 7573 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name) 7574 { 7575 return blob_remove_xattr(blob, name, false); 7576 } 7577 7578 static int 7579 blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7580 const void **value, size_t *value_len, bool internal) 7581 { 7582 struct spdk_xattr *xattr; 7583 struct spdk_xattr_tailq *xattrs; 7584 7585 xattrs = internal ? 
&blob->xattrs_internal : &blob->xattrs; 7586 7587 TAILQ_FOREACH(xattr, xattrs, link) { 7588 if (!strcmp(name, xattr->name)) { 7589 *value = xattr->value; 7590 *value_len = xattr->value_len; 7591 return 0; 7592 } 7593 } 7594 return -ENOENT; 7595 } 7596 7597 int 7598 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, 7599 const void **value, size_t *value_len) 7600 { 7601 blob_verify_md_op(blob); 7602 7603 return blob_get_xattr_value(blob, name, value, value_len, false); 7604 } 7605 7606 struct spdk_xattr_names { 7607 uint32_t count; 7608 const char *names[0]; 7609 }; 7610 7611 static int 7612 blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names) 7613 { 7614 struct spdk_xattr *xattr; 7615 int count = 0; 7616 7617 TAILQ_FOREACH(xattr, xattrs, link) { 7618 count++; 7619 } 7620 7621 *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *)); 7622 if (*names == NULL) { 7623 return -ENOMEM; 7624 } 7625 7626 TAILQ_FOREACH(xattr, xattrs, link) { 7627 (*names)->names[(*names)->count++] = xattr->name; 7628 } 7629 7630 return 0; 7631 } 7632 7633 int 7634 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names) 7635 { 7636 blob_verify_md_op(blob); 7637 7638 return blob_get_xattr_names(&blob->xattrs, names); 7639 } 7640 7641 uint32_t 7642 spdk_xattr_names_get_count(struct spdk_xattr_names *names) 7643 { 7644 assert(names != NULL); 7645 7646 return names->count; 7647 } 7648 7649 const char * 7650 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index) 7651 { 7652 if (index >= names->count) { 7653 return NULL; 7654 } 7655 7656 return names->names[index]; 7657 } 7658 7659 void 7660 spdk_xattr_names_free(struct spdk_xattr_names *names) 7661 { 7662 free(names); 7663 } 7664 7665 struct spdk_bs_type 7666 spdk_bs_get_bstype(struct spdk_blob_store *bs) 7667 { 7668 return bs->bstype; 7669 } 7670 7671 void 7672 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype) 7673 { 7674 memcpy(&bs->bstype, &bstype, sizeof(bstype)); 7675 } 7676 7677 bool 7678 spdk_blob_is_read_only(struct spdk_blob *blob) 7679 { 7680 assert(blob != NULL); 7681 return (blob->data_ro || blob->md_ro); 7682 } 7683 7684 bool 7685 spdk_blob_is_snapshot(struct spdk_blob *blob) 7686 { 7687 struct spdk_blob_list *snapshot_entry; 7688 7689 assert(blob != NULL); 7690 7691 snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id); 7692 if (snapshot_entry == NULL) { 7693 return false; 7694 } 7695 7696 return true; 7697 } 7698 7699 bool 7700 spdk_blob_is_clone(struct spdk_blob *blob) 7701 { 7702 assert(blob != NULL); 7703 7704 if (blob->parent_id != SPDK_BLOBID_INVALID) { 7705 assert(spdk_blob_is_thin_provisioned(blob)); 7706 return true; 7707 } 7708 7709 return false; 7710 } 7711 7712 bool 7713 spdk_blob_is_thin_provisioned(struct spdk_blob *blob) 7714 { 7715 assert(blob != NULL); 7716 return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV); 7717 } 7718 7719 static void 7720 blob_update_clear_method(struct spdk_blob *blob) 7721 { 7722 enum blob_clear_method stored_cm; 7723 7724 assert(blob != NULL); 7725 7726 /* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored 7727 * in metadata previously. If something other than the default was 7728 * specified, ignore stored value and used what was passed in. 
7729 */ 7730 stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT); 7731 7732 if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) { 7733 blob->clear_method = stored_cm; 7734 } else if (blob->clear_method != stored_cm) { 7735 SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n", 7736 blob->clear_method, stored_cm); 7737 } 7738 } 7739 7740 spdk_blob_id 7741 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id) 7742 { 7743 struct spdk_blob_list *snapshot_entry = NULL; 7744 struct spdk_blob_list *clone_entry = NULL; 7745 7746 TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { 7747 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 7748 if (clone_entry->id == blob_id) { 7749 return snapshot_entry->id; 7750 } 7751 } 7752 } 7753 7754 return SPDK_BLOBID_INVALID; 7755 } 7756 7757 int 7758 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids, 7759 size_t *count) 7760 { 7761 struct spdk_blob_list *snapshot_entry, *clone_entry; 7762 size_t n; 7763 7764 snapshot_entry = bs_get_snapshot_entry(bs, blobid); 7765 if (snapshot_entry == NULL) { 7766 *count = 0; 7767 return 0; 7768 } 7769 7770 if (ids == NULL || *count < snapshot_entry->clone_count) { 7771 *count = snapshot_entry->clone_count; 7772 return -ENOMEM; 7773 } 7774 *count = snapshot_entry->clone_count; 7775 7776 n = 0; 7777 TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { 7778 ids[n++] = clone_entry->id; 7779 } 7780 7781 return 0; 7782 } 7783 7784 SPDK_LOG_REGISTER_COMPONENT(blob) 7785
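/*
 * Illustrative usage sketch (not part of the blobstore implementation and
 * compiled out). A minimal, hedged example of driving the snapshot and clone
 * APIs implemented above from the blobstore metadata thread. The example_*
 * names and the callback wiring are hypothetical placeholders.
 */
#if 0
static void
example_clone_done(void *cb_arg, spdk_blob_id clone_id, int bserrno)
{
	if (bserrno != 0) {
		SPDK_ERRLOG("Clone creation failed: %d\n", bserrno);
		return;
	}
	SPDK_NOTICELOG("Created clone %" PRIu64 "\n", clone_id);
}

static void
example_snapshot_done(void *cb_arg, spdk_blob_id snapshot_id, int bserrno)
{
	struct spdk_blob_store *bs = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("Snapshot creation failed: %d\n", bserrno);
		return;
	}

	/* The snapshot is now the read-only parent of the original blob;
	 * create a writable, thin-provisioned clone on top of it. */
	spdk_bs_create_clone(bs, snapshot_id, NULL, example_clone_done, NULL);
}

static void
example_snapshot_then_clone(struct spdk_blob_store *bs, spdk_blob_id blobid)
{
	/* Both calls must be made on the blobstore metadata thread. */
	spdk_bs_create_snapshot(bs, blobid, NULL, example_snapshot_done, bs);
}
#endif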
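/*
 * Illustrative usage sketch (not part of the blobstore implementation and
 * compiled out). A hedged example of deleting a blob; when the blob is a
 * snapshot with a single clone, the delete_snapshot_* machinery above first
 * merges its cluster map into the clone before removing the snapshot itself.
 * The example_* names are hypothetical.
 */
#if 0
static void
example_delete_done(void *cb_arg, int bserrno)
{
	if (bserrno == -EBUSY) {
		SPDK_ERRLOG("Blob is open elsewhere or has more than one clone\n");
		return;
	}
	if (bserrno != 0) {
		SPDK_ERRLOG("Blob deletion failed: %d\n", bserrno);
		return;
	}
	SPDK_NOTICELOG("Blob deleted\n");
}

static void
example_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid)
{
	/* Must be called on the blobstore metadata thread. */
	spdk_bs_delete_blob(bs, blobid, example_delete_done, NULL);
}
#endif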
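/*
 * Illustrative usage sketch (not part of the blobstore implementation and
 * compiled out). A hedged example contrasting spdk_bs_inflate_blob(), which
 * allocates every unallocated cluster and removes thin provisioning, with
 * spdk_bs_blob_decouple_parent(), which only copies clusters owned by the
 * immediate parent and keeps the blob thin. The example_* names are
 * hypothetical.
 */
#if 0
static void
example_inflate_done(void *cb_arg, int bserrno)
{
	struct spdk_io_channel *channel = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("Inflate/decouple failed: %d\n", bserrno);
	}
	spdk_bs_free_io_channel(channel);
}

static void
example_inflate_or_decouple(struct spdk_blob_store *bs, spdk_blob_id blobid, bool full_inflate)
{
	/* Cluster copies are data-path operations, so an io channel is required. */
	struct spdk_io_channel *channel = spdk_bs_alloc_io_channel(bs);

	if (channel == NULL) {
		SPDK_ERRLOG("Failed to allocate io channel\n");
		return;
	}

	if (full_inflate) {
		spdk_bs_inflate_blob(bs, channel, blobid, example_inflate_done, channel);
	} else {
		spdk_bs_blob_decouple_parent(bs, channel, blobid, example_inflate_done, channel);
	}
}
#endif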
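/*
 * Illustrative usage sketch (not part of the blobstore implementation and
 * compiled out). A hedged example of the public xattr helpers above: set a
 * user xattr, read it back, enumerate the attached names, and persist the
 * change with spdk_blob_sync_md(). The "generation" key and the example_*
 * name are hypothetical.
 */
#if 0
static void
example_xattr_roundtrip(struct spdk_blob *blob, spdk_blob_op_complete sync_done, void *cb_arg)
{
	uint64_t generation = 1;
	const void *value;
	size_t value_len;
	struct spdk_xattr_names *names;
	uint32_t i;
	int rc;

	/* User xattrs may only be modified on the metadata thread and while
	 * the blob metadata is not read-only. */
	rc = spdk_blob_set_xattr(blob, "generation", &generation, sizeof(generation));
	if (rc != 0) {
		sync_done(cb_arg, rc);
		return;
	}

	/* The returned value pointer references memory owned by the blob. */
	rc = spdk_blob_get_xattr_value(blob, "generation", &value, &value_len);
	if (rc != 0 || value_len != sizeof(generation)) {
		sync_done(cb_arg, rc != 0 ? rc : -EINVAL);
		return;
	}

	/* Enumerate all user-visible xattr names currently on the blob. */
	if (spdk_blob_get_xattr_names(blob, &names) == 0) {
		for (i = 0; i < spdk_xattr_names_get_count(names); i++) {
			SPDK_NOTICELOG("xattr: %s\n", spdk_xattr_names_get_name(names, i));
		}
		spdk_xattr_names_free(names);
	}

	/* Setting an xattr only dirties in-memory state; syncing persists it. */
	spdk_blob_sync_md(blob, sync_done, cb_arg);
}
#endif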
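/*
 * Illustrative usage sketch (not part of the blobstore implementation and
 * compiled out). A hedged example of walking every blob in a blobstore with
 * spdk_bs_iter_first()/spdk_bs_iter_next(); iteration ends when the callback
 * receives -ENOENT. The example_* names are hypothetical.
 */
#if 0
struct example_iter_ctx {
	struct spdk_blob_store *bs;
	uint64_t count;
};

static void
example_iter_cb(void *cb_arg, struct spdk_blob *blob, int bserrno)
{
	struct example_iter_ctx *ctx = cb_arg;

	if (bserrno == -ENOENT) {
		/* No more blobs; blobs visited earlier were already closed by iter_next. */
		SPDK_NOTICELOG("Blobstore contains %" PRIu64 " blobs\n", ctx->count);
		free(ctx);
		return;
	}
	if (bserrno != 0) {
		SPDK_ERRLOG("Blob iteration failed: %d\n", bserrno);
		free(ctx);
		return;
	}

	ctx->count++;
	/* spdk_bs_iter_next() closes the current blob before opening the next one. */
	spdk_bs_iter_next(ctx->bs, blob, example_iter_cb, ctx);
}

static void
example_count_blobs(struct spdk_blob_store *bs)
{
	struct example_iter_ctx *ctx = calloc(1, sizeof(*ctx));

	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate iteration context\n");
		return;
	}
	ctx->bs = bs;
	spdk_bs_iter_first(bs, example_iter_cb, ctx);
}
#endif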