xref: /spdk/lib/blob/blobstore.c (revision ae0b53b1b63c826e412c8b12a73312be03c329c3)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2017 Intel Corporation.
3  *   All rights reserved.
4  *   Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "spdk/blob.h"
10 #include "spdk/crc32.h"
11 #include "spdk/env.h"
12 #include "spdk/queue.h"
13 #include "spdk/thread.h"
14 #include "spdk/bit_array.h"
15 #include "spdk/bit_pool.h"
16 #include "spdk/likely.h"
17 #include "spdk/util.h"
18 #include "spdk/string.h"
19 
20 #include "spdk_internal/assert.h"
21 #include "spdk/log.h"
22 
23 #include "blobstore.h"
24 
25 #define BLOB_CRC32C_INITIAL    0xffffffffUL
26 
27 static int bs_register_md_thread(struct spdk_blob_store *bs);
28 static int bs_unregister_md_thread(struct spdk_blob_store *bs);
29 static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
30 static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
31 		uint64_t cluster, uint32_t extent, struct spdk_blob_md_page *page,
32 		spdk_blob_op_complete cb_fn, void *cb_arg);
33 
34 static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
35 			  uint16_t value_len, bool internal);
36 static int blob_get_xattr_value(struct spdk_blob *blob, const char *name,
37 				const void **value, size_t *value_len, bool internal);
38 static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal);
39 
40 static void blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
41 				   struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg);
42 
43 /*
44  * External snapshots require a channel per thread per esnap bdev.  The tree
45  * is populated lazily as blob IOs are handled by the back_bs_dev. When this
46  * channel is destroyed, all the channels in the tree are destroyed.
47  */
48 
49 struct blob_esnap_channel {
50 	RB_ENTRY(blob_esnap_channel)	node;
51 	spdk_blob_id			blob_id;
52 	struct spdk_io_channel		*channel;
53 };
54 
55 static int blob_esnap_channel_compare(struct blob_esnap_channel *c1, struct blob_esnap_channel *c2);
56 static void blob_esnap_destroy_bs_dev_channels(struct spdk_blob *blob, bool abort_io,
57 		spdk_blob_op_with_handle_complete cb_fn, void *cb_arg);
58 static void blob_esnap_destroy_bs_channel(struct spdk_bs_channel *ch);
59 RB_GENERATE_STATIC(blob_esnap_channel_tree, blob_esnap_channel, node, blob_esnap_channel_compare)
60 
61 static inline bool
62 blob_is_esnap_clone(const struct spdk_blob *blob)
63 {
64 	assert(blob != NULL);
65 	return !!(blob->invalid_flags & SPDK_BLOB_EXTERNAL_SNAPSHOT);
66 }
67 
68 static int
69 blob_id_cmp(struct spdk_blob *blob1, struct spdk_blob *blob2)
70 {
71 	return (blob1->id < blob2->id ? -1 : blob1->id > blob2->id);
72 }
73 
74 RB_GENERATE_STATIC(spdk_blob_tree, spdk_blob, link, blob_id_cmp);
75 
76 static void
77 blob_verify_md_op(struct spdk_blob *blob)
78 {
79 	assert(blob != NULL);
80 	assert(spdk_get_thread() == blob->bs->md_thread);
81 	assert(blob->state != SPDK_BLOB_STATE_LOADING);
82 }
83 
84 static struct spdk_blob_list *
85 bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid)
86 {
87 	struct spdk_blob_list *snapshot_entry = NULL;
88 
89 	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
90 		if (snapshot_entry->id == blobid) {
91 			break;
92 		}
93 	}
94 
95 	return snapshot_entry;
96 }
97 
98 static void
99 bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page)
100 {
101 	assert(spdk_spin_held(&bs->used_lock));
102 	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
103 	assert(spdk_bit_array_get(bs->used_md_pages, page) == false);
104 
105 	spdk_bit_array_set(bs->used_md_pages, page);
106 }
107 
108 static void
109 bs_release_md_page(struct spdk_blob_store *bs, uint32_t page)
110 {
111 	assert(spdk_spin_held(&bs->used_lock));
112 	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
113 	assert(spdk_bit_array_get(bs->used_md_pages, page) == true);
114 
115 	spdk_bit_array_clear(bs->used_md_pages, page);
116 }
117 
118 static uint32_t
119 bs_claim_cluster(struct spdk_blob_store *bs)
120 {
121 	uint32_t cluster_num;
122 
123 	assert(spdk_spin_held(&bs->used_lock));
124 
125 	cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters);
126 	if (cluster_num == UINT32_MAX) {
127 		return UINT32_MAX;
128 	}
129 
130 	SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num);
131 	bs->num_free_clusters--;
132 
133 	return cluster_num;
134 }
135 
136 static void
137 bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
138 {
139 	assert(spdk_spin_held(&bs->used_lock));
140 	assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters));
141 	assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true);
142 	assert(bs->num_free_clusters < bs->total_clusters);
143 
144 	SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num);
145 
146 	spdk_bit_pool_free_bit(bs->used_clusters, cluster_num);
147 	bs->num_free_clusters++;
148 }
149 
150 static int
151 blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster)
152 {
153 	uint64_t *cluster_lba = &blob->active.clusters[cluster_num];
154 
155 	blob_verify_md_op(blob);
156 
157 	if (*cluster_lba != 0) {
158 		return -EEXIST;
159 	}
160 
161 	*cluster_lba = bs_cluster_to_lba(blob->bs, cluster);
162 	return 0;
163 }
164 
165 static int
166 bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num,
167 		    uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map)
168 {
169 	uint32_t *extent_page = 0;
170 
171 	assert(spdk_spin_held(&blob->bs->used_lock));
172 
173 	*cluster = bs_claim_cluster(blob->bs);
174 	if (*cluster == UINT32_MAX) {
175 		/* No more free clusters. Cannot satisfy the request */
176 		return -ENOSPC;
177 	}
178 
179 	if (blob->use_extent_table) {
180 		extent_page = bs_cluster_to_extent_page(blob, cluster_num);
181 		if (*extent_page == 0) {
182 			/* Extent page shall never occupy md_page so start the search from 1 */
183 			if (*lowest_free_md_page == 0) {
184 				*lowest_free_md_page = 1;
185 			}
186 			/* No extent_page is allocated for the cluster */
187 			*lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages,
188 					       *lowest_free_md_page);
189 			if (*lowest_free_md_page == UINT32_MAX) {
190 				/* No more free md pages. Cannot satisfy the request */
191 				bs_release_cluster(blob->bs, *cluster);
192 				return -ENOSPC;
193 			}
194 			bs_claim_md_page(blob->bs, *lowest_free_md_page);
195 		}
196 	}
197 
198 	SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob 0x%" PRIx64 "\n", *cluster,
199 		      blob->id);
200 
201 	if (update_map) {
202 		blob_insert_cluster(blob, cluster_num, *cluster);
203 		if (blob->use_extent_table && *extent_page == 0) {
204 			*extent_page = *lowest_free_md_page;
205 		}
206 	}
207 
208 	return 0;
209 }
210 
211 static void
212 blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs)
213 {
214 	xattrs->count = 0;
215 	xattrs->names = NULL;
216 	xattrs->ctx = NULL;
217 	xattrs->get_value = NULL;
218 }
219 
220 void
221 spdk_blob_opts_init(struct spdk_blob_opts *opts, size_t opts_size)
222 {
223 	if (!opts) {
224 		SPDK_ERRLOG("opts should not be NULL\n");
225 		return;
226 	}
227 
228 	if (!opts_size) {
229 		SPDK_ERRLOG("opts_size should not be zero value\n");
230 		return;
231 	}
232 
233 	memset(opts, 0, opts_size);
234 	opts->opts_size = opts_size;
235 
236 #define FIELD_OK(field) \
237         offsetof(struct spdk_blob_opts, field) + sizeof(opts->field) <= opts_size
238 
239 #define SET_FIELD(field, value) \
240         if (FIELD_OK(field)) { \
241                 opts->field = value; \
242         } \
243 
244 	SET_FIELD(num_clusters, 0);
245 	SET_FIELD(thin_provision, false);
246 	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);
247 
248 	if (FIELD_OK(xattrs)) {
249 		blob_xattrs_init(&opts->xattrs);
250 	}
251 
252 	SET_FIELD(use_extent_table, true);
253 
254 #undef FIELD_OK
255 #undef SET_FIELD
256 }
257 
258 void
259 spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts, size_t opts_size)
260 {
261 	if (!opts) {
262 		SPDK_ERRLOG("opts should not be NULL\n");
263 		return;
264 	}
265 
266 	if (!opts_size) {
267 		SPDK_ERRLOG("opts_size should not be zero value\n");
268 		return;
269 	}
270 
271 	memset(opts, 0, opts_size);
272 	opts->opts_size = opts_size;
273 
274 #define FIELD_OK(field) \
275         offsetof(struct spdk_blob_open_opts, field) + sizeof(opts->field) <= opts_size
276 
277 #define SET_FIELD(field, value) \
278         if (FIELD_OK(field)) { \
279                 opts->field = value; \
280         } \
281 
282 	SET_FIELD(clear_method, BLOB_CLEAR_WITH_DEFAULT);
283 
284 #undef FIELD_OK
285 #undef SET_FILED
286 }
287 
288 static struct spdk_blob *
289 blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
290 {
291 	struct spdk_blob *blob;
292 
293 	blob = calloc(1, sizeof(*blob));
294 	if (!blob) {
295 		return NULL;
296 	}
297 
298 	blob->id = id;
299 	blob->bs = bs;
300 
301 	blob->parent_id = SPDK_BLOBID_INVALID;
302 
303 	blob->state = SPDK_BLOB_STATE_DIRTY;
304 	blob->extent_rle_found = false;
305 	blob->extent_table_found = false;
306 	blob->active.num_pages = 1;
307 	blob->active.pages = calloc(1, sizeof(*blob->active.pages));
308 	if (!blob->active.pages) {
309 		free(blob);
310 		return NULL;
311 	}
312 
313 	blob->active.pages[0] = bs_blobid_to_page(id);
314 
315 	TAILQ_INIT(&blob->xattrs);
316 	TAILQ_INIT(&blob->xattrs_internal);
317 	TAILQ_INIT(&blob->pending_persists);
318 	TAILQ_INIT(&blob->persists_to_complete);
319 
320 	return blob;
321 }
322 
323 static void
324 xattrs_free(struct spdk_xattr_tailq *xattrs)
325 {
326 	struct spdk_xattr	*xattr, *xattr_tmp;
327 
328 	TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
329 		TAILQ_REMOVE(xattrs, xattr, link);
330 		free(xattr->name);
331 		free(xattr->value);
332 		free(xattr);
333 	}
334 }
335 
336 static void
337 blob_free(struct spdk_blob *blob)
338 {
339 	assert(blob != NULL);
340 	assert(TAILQ_EMPTY(&blob->pending_persists));
341 	assert(TAILQ_EMPTY(&blob->persists_to_complete));
342 
343 	free(blob->active.extent_pages);
344 	free(blob->clean.extent_pages);
345 	free(blob->active.clusters);
346 	free(blob->clean.clusters);
347 	free(blob->active.pages);
348 	free(blob->clean.pages);
349 
350 	xattrs_free(&blob->xattrs);
351 	xattrs_free(&blob->xattrs_internal);
352 
353 	if (blob->back_bs_dev) {
354 		blob->back_bs_dev->destroy(blob->back_bs_dev);
355 	}
356 
357 	free(blob);
358 }
359 
360 static void
361 blob_back_bs_destroy_esnap_done(void *ctx, struct spdk_blob *blob, int bserrno)
362 {
363 	struct spdk_bs_dev	*bs_dev = ctx;
364 
365 	if (bserrno != 0) {
366 		/*
367 		 * This is probably due to a memory allocation failure when creating the
368 		 * blob_esnap_destroy_ctx before iterating threads.
369 		 */
370 		SPDK_ERRLOG("blob 0x%" PRIx64 ": Unable to destroy bs dev channels: error %d\n",
371 			    blob->id, bserrno);
372 		assert(false);
373 	}
374 
375 	if (bs_dev == NULL) {
376 		/*
377 		 * This check exists to make scanbuild happy.
378 		 *
379 		 * blob->back_bs_dev for an esnap is NULL during the first iteration of blobs while
380 		 * the blobstore is being loaded. It could also be NULL if there was an error
381 		 * opening the esnap device. In each of these cases, no channels could have been
382 		 * created because back_bs_dev->create_channel() would have led to a NULL pointer
383 		 * deref.
384 		 */
385 		assert(false);
386 		return;
387 	}
388 
389 	SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": calling destroy on back_bs_dev\n", blob->id);
390 	bs_dev->destroy(bs_dev);
391 }
392 
393 static void
394 blob_back_bs_destroy(struct spdk_blob *blob)
395 {
396 	SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": preparing to destroy back_bs_dev\n",
397 		      blob->id);
398 
399 	blob_esnap_destroy_bs_dev_channels(blob, false, blob_back_bs_destroy_esnap_done,
400 					   blob->back_bs_dev);
401 	blob->back_bs_dev = NULL;
402 }
403 
404 struct freeze_io_ctx {
405 	struct spdk_bs_cpl cpl;
406 	struct spdk_blob *blob;
407 };
408 
409 static void
410 blob_io_sync(struct spdk_io_channel_iter *i)
411 {
412 	spdk_for_each_channel_continue(i, 0);
413 }
414 
415 static void
416 blob_execute_queued_io(struct spdk_io_channel_iter *i)
417 {
418 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
419 	struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
420 	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
421 	struct spdk_bs_request_set	*set;
422 	struct spdk_bs_user_op_args	*args;
423 	spdk_bs_user_op_t *op, *tmp;
424 
425 	TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
426 		set = (struct spdk_bs_request_set *)op;
427 		args = &set->u.user_op;
428 
429 		if (args->blob == ctx->blob) {
430 			TAILQ_REMOVE(&ch->queued_io, op, link);
431 			bs_user_op_execute(op);
432 		}
433 	}
434 
435 	spdk_for_each_channel_continue(i, 0);
436 }
437 
438 static void
439 blob_io_cpl(struct spdk_io_channel_iter *i, int status)
440 {
441 	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
442 
443 	ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);
444 
445 	free(ctx);
446 }
447 
448 static void
449 blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
450 {
451 	struct freeze_io_ctx *ctx;
452 
453 	blob_verify_md_op(blob);
454 
455 	ctx = calloc(1, sizeof(*ctx));
456 	if (!ctx) {
457 		cb_fn(cb_arg, -ENOMEM);
458 		return;
459 	}
460 
461 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
462 	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
463 	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
464 	ctx->blob = blob;
465 
466 	/* Freeze I/O on blob */
467 	blob->frozen_refcnt++;
468 
469 	spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl);
470 }
471 
472 static void
473 blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
474 {
475 	struct freeze_io_ctx *ctx;
476 
477 	blob_verify_md_op(blob);
478 
479 	ctx = calloc(1, sizeof(*ctx));
480 	if (!ctx) {
481 		cb_fn(cb_arg, -ENOMEM);
482 		return;
483 	}
484 
485 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
486 	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
487 	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
488 	ctx->blob = blob;
489 
490 	assert(blob->frozen_refcnt > 0);
491 
492 	blob->frozen_refcnt--;
493 
494 	spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl);
495 }
496 
497 static int
498 blob_mark_clean(struct spdk_blob *blob)
499 {
500 	uint32_t *extent_pages = NULL;
501 	uint64_t *clusters = NULL;
502 	uint32_t *pages = NULL;
503 
504 	assert(blob != NULL);
505 
506 	if (blob->active.num_extent_pages) {
507 		assert(blob->active.extent_pages);
508 		extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages));
509 		if (!extent_pages) {
510 			return -ENOMEM;
511 		}
512 		memcpy(extent_pages, blob->active.extent_pages,
513 		       blob->active.num_extent_pages * sizeof(*extent_pages));
514 	}
515 
516 	if (blob->active.num_clusters) {
517 		assert(blob->active.clusters);
518 		clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters));
519 		if (!clusters) {
520 			free(extent_pages);
521 			return -ENOMEM;
522 		}
523 		memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
524 	}
525 
526 	if (blob->active.num_pages) {
527 		assert(blob->active.pages);
528 		pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages));
529 		if (!pages) {
530 			free(extent_pages);
531 			free(clusters);
532 			return -ENOMEM;
533 		}
534 		memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
535 	}
536 
537 	free(blob->clean.extent_pages);
538 	free(blob->clean.clusters);
539 	free(blob->clean.pages);
540 
541 	blob->clean.num_extent_pages = blob->active.num_extent_pages;
542 	blob->clean.extent_pages = blob->active.extent_pages;
543 	blob->clean.num_clusters = blob->active.num_clusters;
544 	blob->clean.clusters = blob->active.clusters;
545 	blob->clean.num_pages = blob->active.num_pages;
546 	blob->clean.pages = blob->active.pages;
547 
548 	blob->active.extent_pages = extent_pages;
549 	blob->active.clusters = clusters;
550 	blob->active.pages = pages;
551 
552 	/* If the metadata was dirtied again while the metadata was being written to disk,
553 	 *  we do not want to revert the DIRTY state back to CLEAN here.
554 	 */
555 	if (blob->state == SPDK_BLOB_STATE_LOADING) {
556 		blob->state = SPDK_BLOB_STATE_CLEAN;
557 	}
558 
559 	return 0;
560 }
561 
562 static int
563 blob_deserialize_xattr(struct spdk_blob *blob,
564 		       struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal)
565 {
566 	struct spdk_xattr                       *xattr;
567 
568 	if (desc_xattr->length != sizeof(desc_xattr->name_length) +
569 	    sizeof(desc_xattr->value_length) +
570 	    desc_xattr->name_length + desc_xattr->value_length) {
571 		return -EINVAL;
572 	}
573 
574 	xattr = calloc(1, sizeof(*xattr));
575 	if (xattr == NULL) {
576 		return -ENOMEM;
577 	}
578 
579 	xattr->name = malloc(desc_xattr->name_length + 1);
580 	if (xattr->name == NULL) {
581 		free(xattr);
582 		return -ENOMEM;
583 	}
584 
585 	xattr->value = malloc(desc_xattr->value_length);
586 	if (xattr->value == NULL) {
587 		free(xattr->name);
588 		free(xattr);
589 		return -ENOMEM;
590 	}
591 
592 	memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length);
593 	xattr->name[desc_xattr->name_length] = '\0';
594 	xattr->value_len = desc_xattr->value_length;
595 	memcpy(xattr->value,
596 	       (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
597 	       desc_xattr->value_length);
598 
599 	TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link);
600 
601 	return 0;
602 }
603 
604 
605 static int
606 blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob)
607 {
608 	struct spdk_blob_md_descriptor *desc;
609 	size_t	cur_desc = 0;
610 	void *tmp;
611 
612 	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
613 	while (cur_desc < sizeof(page->descriptors)) {
614 		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
615 			if (desc->length == 0) {
616 				/* If padding and length are 0, this terminates the page */
617 				break;
618 			}
619 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
620 			struct spdk_blob_md_descriptor_flags	*desc_flags;
621 
622 			desc_flags = (struct spdk_blob_md_descriptor_flags *)desc;
623 
624 			if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) {
625 				return -EINVAL;
626 			}
627 
628 			if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) !=
629 			    SPDK_BLOB_INVALID_FLAGS_MASK) {
630 				return -EINVAL;
631 			}
632 
633 			if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) !=
634 			    SPDK_BLOB_DATA_RO_FLAGS_MASK) {
635 				blob->data_ro = true;
636 				blob->md_ro = true;
637 			}
638 
639 			if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) !=
640 			    SPDK_BLOB_MD_RO_FLAGS_MASK) {
641 				blob->md_ro = true;
642 			}
643 
644 			if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
645 				blob->data_ro = true;
646 				blob->md_ro = true;
647 			}
648 
649 			blob->invalid_flags = desc_flags->invalid_flags;
650 			blob->data_ro_flags = desc_flags->data_ro_flags;
651 			blob->md_ro_flags = desc_flags->md_ro_flags;
652 
653 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
654 			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
655 			unsigned int				i, j;
656 			unsigned int				cluster_count = blob->active.num_clusters;
657 
658 			if (blob->extent_table_found) {
659 				/* Extent Table already present in the md,
660 				 * both descriptors should never be at the same time. */
661 				return -EINVAL;
662 			}
663 			blob->extent_rle_found = true;
664 
665 			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
666 
667 			if (desc_extent_rle->length == 0 ||
668 			    (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) {
669 				return -EINVAL;
670 			}
671 
672 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
673 				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
674 					if (desc_extent_rle->extents[i].cluster_idx != 0) {
675 						if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters,
676 										desc_extent_rle->extents[i].cluster_idx + j)) {
677 							return -EINVAL;
678 						}
679 					}
680 					cluster_count++;
681 				}
682 			}
683 
684 			if (cluster_count == 0) {
685 				return -EINVAL;
686 			}
687 			tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
688 			if (tmp == NULL) {
689 				return -ENOMEM;
690 			}
691 			blob->active.clusters = tmp;
692 			blob->active.cluster_array_size = cluster_count;
693 
694 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
695 				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
696 					if (desc_extent_rle->extents[i].cluster_idx != 0) {
697 						blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
698 								desc_extent_rle->extents[i].cluster_idx + j);
699 					} else if (spdk_blob_is_thin_provisioned(blob)) {
700 						blob->active.clusters[blob->active.num_clusters++] = 0;
701 					} else {
702 						return -EINVAL;
703 					}
704 				}
705 			}
706 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
707 			struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
708 			uint32_t num_extent_pages = blob->active.num_extent_pages;
709 			uint32_t i, j;
710 			size_t extent_pages_length;
711 
712 			desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
713 			extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
714 
715 			if (blob->extent_rle_found) {
716 				/* This means that Extent RLE is present in MD,
717 				 * both should never be at the same time. */
718 				return -EINVAL;
719 			} else if (blob->extent_table_found &&
720 				   desc_extent_table->num_clusters != blob->remaining_clusters_in_et) {
721 				/* Number of clusters in this ET does not match number
722 				 * from previously read EXTENT_TABLE. */
723 				return -EINVAL;
724 			}
725 
726 			if (desc_extent_table->length == 0 ||
727 			    (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
728 				return -EINVAL;
729 			}
730 
731 			blob->extent_table_found = true;
732 
733 			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
734 				num_extent_pages += desc_extent_table->extent_page[i].num_pages;
735 			}
736 
737 			if (num_extent_pages > 0) {
738 				tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t));
739 				if (tmp == NULL) {
740 					return -ENOMEM;
741 				}
742 				blob->active.extent_pages = tmp;
743 			}
744 			blob->active.extent_pages_array_size = num_extent_pages;
745 
746 			blob->remaining_clusters_in_et = desc_extent_table->num_clusters;
747 
748 			/* Extent table entries contain md page numbers for extent pages.
749 			 * Zeroes represent unallocated extent pages, those are run-length-encoded.
750 			 */
751 			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
752 				if (desc_extent_table->extent_page[i].page_idx != 0) {
753 					assert(desc_extent_table->extent_page[i].num_pages == 1);
754 					blob->active.extent_pages[blob->active.num_extent_pages++] =
755 						desc_extent_table->extent_page[i].page_idx;
756 				} else if (spdk_blob_is_thin_provisioned(blob)) {
757 					for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) {
758 						blob->active.extent_pages[blob->active.num_extent_pages++] = 0;
759 					}
760 				} else {
761 					return -EINVAL;
762 				}
763 			}
764 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
765 			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
766 			unsigned int					i;
767 			unsigned int					cluster_count = 0;
768 			size_t						cluster_idx_length;
769 
770 			if (blob->extent_rle_found) {
771 				/* This means that Extent RLE is present in MD,
772 				 * both should never be at the same time. */
773 				return -EINVAL;
774 			}
775 
776 			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
777 			cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
778 
779 			if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
780 			    (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
781 				return -EINVAL;
782 			}
783 
784 			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
785 				if (desc_extent->cluster_idx[i] != 0) {
786 					if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) {
787 						return -EINVAL;
788 					}
789 				}
790 				cluster_count++;
791 			}
792 
793 			if (cluster_count == 0) {
794 				return -EINVAL;
795 			}
796 
797 			/* When reading extent pages sequentially starting cluster idx should match
798 			 * current size of a blob.
799 			 * If changed to batch reading, this check shall be removed. */
800 			if (desc_extent->start_cluster_idx != blob->active.num_clusters) {
801 				return -EINVAL;
802 			}
803 
804 			tmp = realloc(blob->active.clusters,
805 				      (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters));
806 			if (tmp == NULL) {
807 				return -ENOMEM;
808 			}
809 			blob->active.clusters = tmp;
810 			blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters);
811 
812 			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
813 				if (desc_extent->cluster_idx[i] != 0) {
814 					blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
815 							desc_extent->cluster_idx[i]);
816 				} else if (spdk_blob_is_thin_provisioned(blob)) {
817 					blob->active.clusters[blob->active.num_clusters++] = 0;
818 				} else {
819 					return -EINVAL;
820 				}
821 			}
822 			assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters);
823 			assert(blob->remaining_clusters_in_et >= cluster_count);
824 			blob->remaining_clusters_in_et -= cluster_count;
825 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
826 			int rc;
827 
828 			rc = blob_deserialize_xattr(blob,
829 						    (struct spdk_blob_md_descriptor_xattr *) desc, false);
830 			if (rc != 0) {
831 				return rc;
832 			}
833 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
834 			int rc;
835 
836 			rc = blob_deserialize_xattr(blob,
837 						    (struct spdk_blob_md_descriptor_xattr *) desc, true);
838 			if (rc != 0) {
839 				return rc;
840 			}
841 		} else {
842 			/* Unrecognized descriptor type.  Do not fail - just continue to the
843 			 *  next descriptor.  If this descriptor is associated with some feature
844 			 *  defined in a newer version of blobstore, that version of blobstore
845 			 *  should create and set an associated feature flag to specify if this
846 			 *  blob can be loaded or not.
847 			 */
848 		}
849 
850 		/* Advance to the next descriptor */
851 		cur_desc += sizeof(*desc) + desc->length;
852 		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
853 			break;
854 		}
855 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
856 	}
857 
858 	return 0;
859 }
860 
861 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page);
862 
863 static int
864 blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob)
865 {
866 	assert(blob != NULL);
867 	assert(blob->state == SPDK_BLOB_STATE_LOADING);
868 
869 	if (bs_load_cur_extent_page_valid(extent_page) == false) {
870 		return -ENOENT;
871 	}
872 
873 	return blob_parse_page(extent_page, blob);
874 }
875 
876 static int
877 blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
878 	   struct spdk_blob *blob)
879 {
880 	const struct spdk_blob_md_page *page;
881 	uint32_t i;
882 	int rc;
883 	void *tmp;
884 
885 	assert(page_count > 0);
886 	assert(pages[0].sequence_num == 0);
887 	assert(blob != NULL);
888 	assert(blob->state == SPDK_BLOB_STATE_LOADING);
889 	assert(blob->active.clusters == NULL);
890 
891 	/* The blobid provided doesn't match what's in the MD, this can
892 	 * happen for example if a bogus blobid is passed in through open.
893 	 */
894 	if (blob->id != pages[0].id) {
895 		SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n",
896 			    blob->id, pages[0].id);
897 		return -ENOENT;
898 	}
899 
900 	tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages));
901 	if (!tmp) {
902 		return -ENOMEM;
903 	}
904 	blob->active.pages = tmp;
905 
906 	blob->active.pages[0] = pages[0].id;
907 
908 	for (i = 1; i < page_count; i++) {
909 		assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next));
910 		blob->active.pages[i] = pages[i - 1].next;
911 	}
912 	blob->active.num_pages = page_count;
913 
914 	for (i = 0; i < page_count; i++) {
915 		page = &pages[i];
916 
917 		assert(page->id == blob->id);
918 		assert(page->sequence_num == i);
919 
920 		rc = blob_parse_page(page, blob);
921 		if (rc != 0) {
922 			return rc;
923 		}
924 	}
925 
926 	return 0;
927 }
928 
929 static int
930 blob_serialize_add_page(const struct spdk_blob *blob,
931 			struct spdk_blob_md_page **pages,
932 			uint32_t *page_count,
933 			struct spdk_blob_md_page **last_page)
934 {
935 	struct spdk_blob_md_page *page, *tmp_pages;
936 
937 	assert(pages != NULL);
938 	assert(page_count != NULL);
939 
940 	*last_page = NULL;
941 	if (*page_count == 0) {
942 		assert(*pages == NULL);
943 		*pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0,
944 				     NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
945 		if (*pages == NULL) {
946 			return -ENOMEM;
947 		}
948 		*page_count = 1;
949 	} else {
950 		assert(*pages != NULL);
951 		tmp_pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count + 1), 0);
952 		if (tmp_pages == NULL) {
953 			return -ENOMEM;
954 		}
955 		(*page_count)++;
956 		*pages = tmp_pages;
957 	}
958 
959 	page = &(*pages)[*page_count - 1];
960 	memset(page, 0, sizeof(*page));
961 	page->id = blob->id;
962 	page->sequence_num = *page_count - 1;
963 	page->next = SPDK_INVALID_MD_PAGE;
964 	*last_page = page;
965 
966 	return 0;
967 }
968 
969 /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor.
970  * Update required_sz on both success and failure.
971  *
972  */
973 static int
974 blob_serialize_xattr(const struct spdk_xattr *xattr,
975 		     uint8_t *buf, size_t buf_sz,
976 		     size_t *required_sz, bool internal)
977 {
978 	struct spdk_blob_md_descriptor_xattr	*desc;
979 
980 	*required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) +
981 		       strlen(xattr->name) +
982 		       xattr->value_len;
983 
984 	if (buf_sz < *required_sz) {
985 		return -1;
986 	}
987 
988 	desc = (struct spdk_blob_md_descriptor_xattr *)buf;
989 
990 	desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR;
991 	desc->length = sizeof(desc->name_length) +
992 		       sizeof(desc->value_length) +
993 		       strlen(xattr->name) +
994 		       xattr->value_len;
995 	desc->name_length = strlen(xattr->name);
996 	desc->value_length = xattr->value_len;
997 
998 	memcpy(desc->name, xattr->name, desc->name_length);
999 	memcpy((void *)((uintptr_t)desc->name + desc->name_length),
1000 	       xattr->value,
1001 	       desc->value_length);
1002 
1003 	return 0;
1004 }
1005 
1006 static void
1007 blob_serialize_extent_table_entry(const struct spdk_blob *blob,
1008 				  uint64_t start_ep, uint64_t *next_ep,
1009 				  uint8_t **buf, size_t *remaining_sz)
1010 {
1011 	struct spdk_blob_md_descriptor_extent_table *desc;
1012 	size_t cur_sz;
1013 	uint64_t i, et_idx;
1014 	uint32_t extent_page, ep_len;
1015 
1016 	/* The buffer must have room for at least num_clusters entry */
1017 	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters);
1018 	if (*remaining_sz < cur_sz) {
1019 		*next_ep = start_ep;
1020 		return;
1021 	}
1022 
1023 	desc = (struct spdk_blob_md_descriptor_extent_table *)*buf;
1024 	desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE;
1025 
1026 	desc->num_clusters = blob->active.num_clusters;
1027 
1028 	ep_len = 1;
1029 	et_idx = 0;
1030 	for (i = start_ep; i < blob->active.num_extent_pages; i++) {
1031 		if (*remaining_sz < cur_sz  + sizeof(desc->extent_page[0])) {
1032 			/* If we ran out of buffer space, return */
1033 			break;
1034 		}
1035 
1036 		extent_page = blob->active.extent_pages[i];
1037 		/* Verify that next extent_page is unallocated */
1038 		if (extent_page == 0 &&
1039 		    (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) {
1040 			ep_len++;
1041 			continue;
1042 		}
1043 		desc->extent_page[et_idx].page_idx = extent_page;
1044 		desc->extent_page[et_idx].num_pages = ep_len;
1045 		et_idx++;
1046 
1047 		ep_len = 1;
1048 		cur_sz += sizeof(desc->extent_page[et_idx]);
1049 	}
1050 	*next_ep = i;
1051 
1052 	desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx;
1053 	*remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length;
1054 	*buf += sizeof(struct spdk_blob_md_descriptor) + desc->length;
1055 }
1056 
1057 static int
1058 blob_serialize_extent_table(const struct spdk_blob *blob,
1059 			    struct spdk_blob_md_page **pages,
1060 			    struct spdk_blob_md_page *cur_page,
1061 			    uint32_t *page_count, uint8_t **buf,
1062 			    size_t *remaining_sz)
1063 {
1064 	uint64_t				last_extent_page;
1065 	int					rc;
1066 
1067 	last_extent_page = 0;
1068 	/* At least single extent table entry has to be always persisted.
1069 	 * Such case occurs with num_extent_pages == 0. */
1070 	while (last_extent_page <= blob->active.num_extent_pages) {
1071 		blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf,
1072 						  remaining_sz);
1073 
1074 		if (last_extent_page == blob->active.num_extent_pages) {
1075 			break;
1076 		}
1077 
1078 		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
1079 		if (rc < 0) {
1080 			return rc;
1081 		}
1082 
1083 		*buf = (uint8_t *)cur_page->descriptors;
1084 		*remaining_sz = sizeof(cur_page->descriptors);
1085 	}
1086 
1087 	return 0;
1088 }
1089 
1090 static void
1091 blob_serialize_extent_rle(const struct spdk_blob *blob,
1092 			  uint64_t start_cluster, uint64_t *next_cluster,
1093 			  uint8_t **buf, size_t *buf_sz)
1094 {
1095 	struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
1096 	size_t cur_sz;
1097 	uint64_t i, extent_idx;
1098 	uint64_t lba, lba_per_cluster, lba_count;
1099 
1100 	/* The buffer must have room for at least one extent */
1101 	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]);
1102 	if (*buf_sz < cur_sz) {
1103 		*next_cluster = start_cluster;
1104 		return;
1105 	}
1106 
1107 	desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf;
1108 	desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE;
1109 
1110 	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
1111 	/* Assert for scan-build false positive */
1112 	assert(lba_per_cluster > 0);
1113 
1114 	lba = blob->active.clusters[start_cluster];
1115 	lba_count = lba_per_cluster;
1116 	extent_idx = 0;
1117 	for (i = start_cluster + 1; i < blob->active.num_clusters; i++) {
1118 		if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) {
1119 			/* Run-length encode sequential non-zero LBA */
1120 			lba_count += lba_per_cluster;
1121 			continue;
1122 		} else if (lba == 0 && blob->active.clusters[i] == 0) {
1123 			/* Run-length encode unallocated clusters */
1124 			lba_count += lba_per_cluster;
1125 			continue;
1126 		}
1127 		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
1128 		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
1129 		extent_idx++;
1130 
1131 		cur_sz += sizeof(desc_extent_rle->extents[extent_idx]);
1132 
1133 		if (*buf_sz < cur_sz) {
1134 			/* If we ran out of buffer space, return */
1135 			*next_cluster = i;
1136 			break;
1137 		}
1138 
1139 		lba = blob->active.clusters[i];
1140 		lba_count = lba_per_cluster;
1141 	}
1142 
1143 	if (*buf_sz >= cur_sz) {
1144 		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
1145 		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
1146 		extent_idx++;
1147 
1148 		*next_cluster = blob->active.num_clusters;
1149 	}
1150 
1151 	desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx;
1152 	*buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
1153 	*buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
1154 }
1155 
1156 static int
1157 blob_serialize_extents_rle(const struct spdk_blob *blob,
1158 			   struct spdk_blob_md_page **pages,
1159 			   struct spdk_blob_md_page *cur_page,
1160 			   uint32_t *page_count, uint8_t **buf,
1161 			   size_t *remaining_sz)
1162 {
1163 	uint64_t				last_cluster;
1164 	int					rc;
1165 
1166 	last_cluster = 0;
1167 	while (last_cluster < blob->active.num_clusters) {
1168 		blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz);
1169 
1170 		if (last_cluster == blob->active.num_clusters) {
1171 			break;
1172 		}
1173 
1174 		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
1175 		if (rc < 0) {
1176 			return rc;
1177 		}
1178 
1179 		*buf = (uint8_t *)cur_page->descriptors;
1180 		*remaining_sz = sizeof(cur_page->descriptors);
1181 	}
1182 
1183 	return 0;
1184 }
1185 
1186 static void
1187 blob_serialize_extent_page(const struct spdk_blob *blob,
1188 			   uint64_t cluster, struct spdk_blob_md_page *page)
1189 {
1190 	struct spdk_blob_md_descriptor_extent_page *desc_extent;
1191 	uint64_t i, extent_idx;
1192 	uint64_t lba, lba_per_cluster;
1193 	uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;
1194 
1195 	desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors;
1196 	desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE;
1197 
1198 	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
1199 
1200 	desc_extent->start_cluster_idx = start_cluster_idx;
1201 	extent_idx = 0;
1202 	for (i = start_cluster_idx; i < blob->active.num_clusters; i++) {
1203 		lba = blob->active.clusters[i];
1204 		desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster;
1205 		if (extent_idx >= SPDK_EXTENTS_PER_EP) {
1206 			break;
1207 		}
1208 	}
1209 	desc_extent->length = sizeof(desc_extent->start_cluster_idx) +
1210 			      sizeof(desc_extent->cluster_idx[0]) * extent_idx;
1211 }
1212 
1213 static void
1214 blob_serialize_flags(const struct spdk_blob *blob,
1215 		     uint8_t *buf, size_t *buf_sz)
1216 {
1217 	struct spdk_blob_md_descriptor_flags *desc;
1218 
1219 	/*
1220 	 * Flags get serialized first, so we should always have room for the flags
1221 	 *  descriptor.
1222 	 */
1223 	assert(*buf_sz >= sizeof(*desc));
1224 
1225 	desc = (struct spdk_blob_md_descriptor_flags *)buf;
1226 	desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS;
1227 	desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor);
1228 	desc->invalid_flags = blob->invalid_flags;
1229 	desc->data_ro_flags = blob->data_ro_flags;
1230 	desc->md_ro_flags = blob->md_ro_flags;
1231 
1232 	*buf_sz -= sizeof(*desc);
1233 }
1234 
1235 static int
1236 blob_serialize_xattrs(const struct spdk_blob *blob,
1237 		      const struct spdk_xattr_tailq *xattrs, bool internal,
1238 		      struct spdk_blob_md_page **pages,
1239 		      struct spdk_blob_md_page *cur_page,
1240 		      uint32_t *page_count, uint8_t **buf,
1241 		      size_t *remaining_sz)
1242 {
1243 	const struct spdk_xattr	*xattr;
1244 	int	rc;
1245 
1246 	TAILQ_FOREACH(xattr, xattrs, link) {
1247 		size_t required_sz = 0;
1248 
1249 		rc = blob_serialize_xattr(xattr,
1250 					  *buf, *remaining_sz,
1251 					  &required_sz, internal);
1252 		if (rc < 0) {
1253 			/* Need to add a new page to the chain */
1254 			rc = blob_serialize_add_page(blob, pages, page_count,
1255 						     &cur_page);
1256 			if (rc < 0) {
1257 				spdk_free(*pages);
1258 				*pages = NULL;
1259 				*page_count = 0;
1260 				return rc;
1261 			}
1262 
1263 			*buf = (uint8_t *)cur_page->descriptors;
1264 			*remaining_sz = sizeof(cur_page->descriptors);
1265 
1266 			/* Try again */
1267 			required_sz = 0;
1268 			rc = blob_serialize_xattr(xattr,
1269 						  *buf, *remaining_sz,
1270 						  &required_sz, internal);
1271 
1272 			if (rc < 0) {
1273 				spdk_free(*pages);
1274 				*pages = NULL;
1275 				*page_count = 0;
1276 				return rc;
1277 			}
1278 		}
1279 
1280 		*remaining_sz -= required_sz;
1281 		*buf += required_sz;
1282 	}
1283 
1284 	return 0;
1285 }
1286 
1287 static int
1288 blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages,
1289 	       uint32_t *page_count)
1290 {
1291 	struct spdk_blob_md_page		*cur_page;
1292 	int					rc;
1293 	uint8_t					*buf;
1294 	size_t					remaining_sz;
1295 
1296 	assert(pages != NULL);
1297 	assert(page_count != NULL);
1298 	assert(blob != NULL);
1299 	assert(blob->state == SPDK_BLOB_STATE_DIRTY);
1300 
1301 	*pages = NULL;
1302 	*page_count = 0;
1303 
1304 	/* A blob always has at least 1 page, even if it has no descriptors */
1305 	rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
1306 	if (rc < 0) {
1307 		return rc;
1308 	}
1309 
1310 	buf = (uint8_t *)cur_page->descriptors;
1311 	remaining_sz = sizeof(cur_page->descriptors);
1312 
1313 	/* Serialize flags */
1314 	blob_serialize_flags(blob, buf, &remaining_sz);
1315 	buf += sizeof(struct spdk_blob_md_descriptor_flags);
1316 
1317 	/* Serialize xattrs */
1318 	rc = blob_serialize_xattrs(blob, &blob->xattrs, false,
1319 				   pages, cur_page, page_count, &buf, &remaining_sz);
1320 	if (rc < 0) {
1321 		return rc;
1322 	}
1323 
1324 	/* Serialize internal xattrs */
1325 	rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true,
1326 				   pages, cur_page, page_count, &buf, &remaining_sz);
1327 	if (rc < 0) {
1328 		return rc;
1329 	}
1330 
1331 	if (blob->use_extent_table) {
1332 		/* Serialize extent table */
1333 		rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz);
1334 	} else {
1335 		/* Serialize extents */
1336 		rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz);
1337 	}
1338 
1339 	return rc;
1340 }
1341 
1342 struct spdk_blob_load_ctx {
1343 	struct spdk_blob		*blob;
1344 
1345 	struct spdk_blob_md_page	*pages;
1346 	uint32_t			num_pages;
1347 	uint32_t			next_extent_page;
1348 	spdk_bs_sequence_t	        *seq;
1349 
1350 	spdk_bs_sequence_cpl		cb_fn;
1351 	void				*cb_arg;
1352 };
1353 
1354 static uint32_t
1355 blob_md_page_calc_crc(void *page)
1356 {
1357 	uint32_t		crc;
1358 
1359 	crc = BLOB_CRC32C_INITIAL;
1360 	crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc);
1361 	crc ^= BLOB_CRC32C_INITIAL;
1362 
1363 	return crc;
1364 
1365 }
1366 
1367 static void
1368 blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno)
1369 {
1370 	struct spdk_blob		*blob = ctx->blob;
1371 
1372 	if (bserrno == 0) {
1373 		blob_mark_clean(blob);
1374 	}
1375 
1376 	ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno);
1377 
1378 	/* Free the memory */
1379 	spdk_free(ctx->pages);
1380 	free(ctx);
1381 }
1382 
1383 static void
1384 blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
1385 {
1386 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1387 	struct spdk_blob		*blob = ctx->blob;
1388 
1389 	if (bserrno == 0) {
1390 		blob->back_bs_dev = bs_create_blob_bs_dev(snapshot);
1391 		if (blob->back_bs_dev == NULL) {
1392 			bserrno = -ENOMEM;
1393 		}
1394 	}
1395 	if (bserrno != 0) {
1396 		SPDK_ERRLOG("Snapshot fail\n");
1397 	}
1398 
1399 	blob_load_final(ctx, bserrno);
1400 }
1401 
1402 static void blob_update_clear_method(struct spdk_blob *blob);
1403 
1404 static int
1405 blob_load_esnap(struct spdk_blob *blob, void *blob_ctx)
1406 {
1407 	struct spdk_blob_store *bs = blob->bs;
1408 	struct spdk_bs_dev *bs_dev = NULL;
1409 	const void *esnap_id = NULL;
1410 	size_t id_len = 0;
1411 	int rc;
1412 
1413 	if (bs->esnap_bs_dev_create == NULL) {
1414 		SPDK_NOTICELOG("blob 0x%" PRIx64 " is an esnap clone but the blobstore was opened "
1415 			       "without support for esnap clones\n", blob->id);
1416 		return -ENOTSUP;
1417 	}
1418 	assert(blob->back_bs_dev == NULL);
1419 
1420 	rc = blob_get_xattr_value(blob, BLOB_EXTERNAL_SNAPSHOT_ID, &esnap_id, &id_len, true);
1421 	if (rc != 0) {
1422 		SPDK_ERRLOG("blob 0x%" PRIx64 " is an esnap clone but has no esnap ID\n", blob->id);
1423 		return -EINVAL;
1424 	}
1425 	assert(id_len > 0 && id_len < UINT32_MAX);
1426 
1427 	SPDK_INFOLOG(blob, "Creating external snapshot device\n");
1428 
1429 	rc = bs->esnap_bs_dev_create(bs->esnap_ctx, blob_ctx, blob, esnap_id, (uint32_t)id_len,
1430 				     &bs_dev);
1431 	if (rc != 0) {
1432 		SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": failed to load back_bs_dev "
1433 			      "with error %d\n", blob->id, rc);
1434 		return rc;
1435 	}
1436 
1437 	/*
1438 	 * Note: bs_dev might be NULL if the consumer chose to not open the external snapshot.
1439 	 * This especially might happen during spdk_bs_load() iteration.
1440 	 */
1441 	if (bs_dev != NULL) {
1442 		SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": loaded back_bs_dev\n", blob->id);
1443 		if ((bs->io_unit_size % bs_dev->blocklen) != 0) {
1444 			SPDK_NOTICELOG("blob 0x%" PRIx64 " external snapshot device block size %u "
1445 				       "is not compatible with blobstore block size %u\n",
1446 				       blob->id, bs_dev->blocklen, bs->io_unit_size);
1447 			bs_dev->destroy(bs_dev);
1448 			return -EINVAL;
1449 		}
1450 	}
1451 
1452 	blob->back_bs_dev = bs_dev;
1453 	blob->parent_id = SPDK_BLOBID_EXTERNAL_SNAPSHOT;
1454 
1455 	return 0;
1456 }
1457 
1458 static void
1459 blob_load_backing_dev(spdk_bs_sequence_t *seq, void *cb_arg)
1460 {
1461 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1462 	struct spdk_blob		*blob = ctx->blob;
1463 	const void			*value;
1464 	size_t				len;
1465 	int				rc;
1466 
1467 	if (blob_is_esnap_clone(blob)) {
1468 		rc = blob_load_esnap(blob, seq->cpl.u.blob_handle.esnap_ctx);
1469 		blob_load_final(ctx, rc);
1470 		return;
1471 	}
1472 
1473 	if (spdk_blob_is_thin_provisioned(blob)) {
1474 		rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true);
1475 		if (rc == 0) {
1476 			if (len != sizeof(spdk_blob_id)) {
1477 				blob_load_final(ctx, -EINVAL);
1478 				return;
1479 			}
1480 			/* open snapshot blob and continue in the callback function */
1481 			blob->parent_id = *(spdk_blob_id *)value;
1482 			spdk_bs_open_blob(blob->bs, blob->parent_id,
1483 					  blob_load_snapshot_cpl, ctx);
1484 			return;
1485 		} else {
1486 			/* add zeroes_dev for thin provisioned blob */
1487 			blob->back_bs_dev = bs_create_zeroes_dev();
1488 		}
1489 	} else {
1490 		/* standard blob */
1491 		blob->back_bs_dev = NULL;
1492 	}
1493 	blob_load_final(ctx, 0);
1494 }
1495 
1496 static void
1497 blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1498 {
1499 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1500 	struct spdk_blob		*blob = ctx->blob;
1501 	struct spdk_blob_md_page	*page;
1502 	uint64_t			i;
1503 	uint32_t			crc;
1504 	uint64_t			lba;
1505 	void				*tmp;
1506 	uint64_t			sz;
1507 
1508 	if (bserrno) {
1509 		SPDK_ERRLOG("Extent page read failed: %d\n", bserrno);
1510 		blob_load_final(ctx, bserrno);
1511 		return;
1512 	}
1513 
1514 	if (ctx->pages == NULL) {
1515 		/* First iteration of this function, allocate buffer for single EXTENT_PAGE */
1516 		ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
1517 					  NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
1518 		if (!ctx->pages) {
1519 			blob_load_final(ctx, -ENOMEM);
1520 			return;
1521 		}
1522 		ctx->num_pages = 1;
1523 		ctx->next_extent_page = 0;
1524 	} else {
1525 		page = &ctx->pages[0];
1526 		crc = blob_md_page_calc_crc(page);
1527 		if (crc != page->crc) {
1528 			blob_load_final(ctx, -EINVAL);
1529 			return;
1530 		}
1531 
1532 		if (page->next != SPDK_INVALID_MD_PAGE) {
1533 			blob_load_final(ctx, -EINVAL);
1534 			return;
1535 		}
1536 
1537 		bserrno = blob_parse_extent_page(page, blob);
1538 		if (bserrno) {
1539 			blob_load_final(ctx, bserrno);
1540 			return;
1541 		}
1542 	}
1543 
1544 	for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
1545 		if (blob->active.extent_pages[i] != 0) {
1546 			/* Extent page was allocated, read and parse it. */
1547 			lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]);
1548 			ctx->next_extent_page = i + 1;
1549 
1550 			bs_sequence_read_dev(seq, &ctx->pages[0], lba,
1551 					     bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
1552 					     blob_load_cpl_extents_cpl, ctx);
1553 			return;
1554 		} else {
1555 			/* Thin provisioned blobs can point to unallocated extent pages.
1556 			 * In this case blob size should be increased by up to the amount left in remaining_clusters_in_et. */
1557 
1558 			sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP);
1559 			blob->active.num_clusters += sz;
1560 			blob->remaining_clusters_in_et -= sz;
1561 
1562 			assert(spdk_blob_is_thin_provisioned(blob));
1563 			assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0);
1564 
1565 			tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
1566 			if (tmp == NULL) {
1567 				blob_load_final(ctx, -ENOMEM);
1568 				return;
1569 			}
1570 			memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0,
1571 			       sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size));
1572 			blob->active.clusters = tmp;
1573 			blob->active.cluster_array_size = blob->active.num_clusters;
1574 		}
1575 	}
1576 
1577 	blob_load_backing_dev(seq, ctx);
1578 }
1579 
1580 static void
1581 blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1582 {
1583 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1584 	struct spdk_blob		*blob = ctx->blob;
1585 	struct spdk_blob_md_page	*page;
1586 	int				rc;
1587 	uint32_t			crc;
1588 	uint32_t			current_page;
1589 
1590 	if (ctx->num_pages == 1) {
1591 		current_page = bs_blobid_to_page(blob->id);
1592 	} else {
1593 		assert(ctx->num_pages != 0);
1594 		page = &ctx->pages[ctx->num_pages - 2];
1595 		current_page = page->next;
1596 	}
1597 
1598 	if (bserrno) {
1599 		SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n",
1600 			    current_page, blob->id, bserrno);
1601 		blob_load_final(ctx, bserrno);
1602 		return;
1603 	}
1604 
1605 	page = &ctx->pages[ctx->num_pages - 1];
1606 	crc = blob_md_page_calc_crc(page);
1607 	if (crc != page->crc) {
1608 		SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n",
1609 			    current_page, blob->id);
1610 		blob_load_final(ctx, -EINVAL);
1611 		return;
1612 	}
1613 
1614 	if (page->next != SPDK_INVALID_MD_PAGE) {
1615 		struct spdk_blob_md_page *tmp_pages;
1616 		uint32_t next_page = page->next;
1617 		uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page);
1618 
1619 		/* Read the next page */
1620 		tmp_pages = spdk_realloc(ctx->pages, (sizeof(*page) * (ctx->num_pages + 1)), 0);
1621 		if (tmp_pages == NULL) {
1622 			blob_load_final(ctx, -ENOMEM);
1623 			return;
1624 		}
1625 		ctx->num_pages++;
1626 		ctx->pages = tmp_pages;
1627 
1628 		bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1],
1629 				     next_lba,
1630 				     bs_byte_to_lba(blob->bs, sizeof(*page)),
1631 				     blob_load_cpl, ctx);
1632 		return;
1633 	}
1634 
1635 	/* Parse the pages */
1636 	rc = blob_parse(ctx->pages, ctx->num_pages, blob);
1637 	if (rc) {
1638 		blob_load_final(ctx, rc);
1639 		return;
1640 	}
1641 
1642 	if (blob->extent_table_found == true) {
1643 		/* If EXTENT_TABLE was found, that means support for it should be enabled. */
1644 		assert(blob->extent_rle_found == false);
1645 		blob->use_extent_table = true;
1646 	} else {
1647 		/* If EXTENT_RLE or no extent_* descriptor was found disable support
1648 		 * for extent table. No extent_* descriptors means that blob has length of 0
1649 		 * and no extent_rle descriptors were persisted for it.
1650 		 * EXTENT_TABLE if used, is always present in metadata regardless of length. */
1651 		blob->use_extent_table = false;
1652 	}
1653 
1654 	/* Check the clear_method stored in metadata vs what may have been passed
1655 	 * via spdk_bs_open_blob_ext() and update accordingly.
1656 	 */
1657 	blob_update_clear_method(blob);
1658 
1659 	spdk_free(ctx->pages);
1660 	ctx->pages = NULL;
1661 
1662 	if (blob->extent_table_found) {
1663 		blob_load_cpl_extents_cpl(seq, ctx, 0);
1664 	} else {
1665 		blob_load_backing_dev(seq, ctx);
1666 	}
1667 }
1668 
1669 /* Load a blob from disk given a blobid */
1670 static void
1671 blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
1672 	  spdk_bs_sequence_cpl cb_fn, void *cb_arg)
1673 {
1674 	struct spdk_blob_load_ctx *ctx;
1675 	struct spdk_blob_store *bs;
1676 	uint32_t page_num;
1677 	uint64_t lba;
1678 
1679 	blob_verify_md_op(blob);
1680 
1681 	bs = blob->bs;
1682 
1683 	ctx = calloc(1, sizeof(*ctx));
1684 	if (!ctx) {
1685 		cb_fn(seq, cb_arg, -ENOMEM);
1686 		return;
1687 	}
1688 
1689 	ctx->blob = blob;
1690 	ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0);
1691 	if (!ctx->pages) {
1692 		free(ctx);
1693 		cb_fn(seq, cb_arg, -ENOMEM);
1694 		return;
1695 	}
1696 	ctx->num_pages = 1;
1697 	ctx->cb_fn = cb_fn;
1698 	ctx->cb_arg = cb_arg;
1699 	ctx->seq = seq;
1700 
1701 	page_num = bs_blobid_to_page(blob->id);
1702 	lba = bs_md_page_to_lba(blob->bs, page_num);
1703 
1704 	blob->state = SPDK_BLOB_STATE_LOADING;
1705 
1706 	bs_sequence_read_dev(seq, &ctx->pages[0], lba,
1707 			     bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE),
1708 			     blob_load_cpl, ctx);
1709 }
1710 
1711 struct spdk_blob_persist_ctx {
1712 	struct spdk_blob		*blob;
1713 
1714 	struct spdk_blob_md_page	*pages;
1715 	uint32_t			next_extent_page;
1716 	struct spdk_blob_md_page	*extent_page;
1717 
1718 	spdk_bs_sequence_t		*seq;
1719 	spdk_bs_sequence_cpl		cb_fn;
1720 	void				*cb_arg;
1721 	TAILQ_ENTRY(spdk_blob_persist_ctx) link;
1722 };
1723 
1724 static void
1725 bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba,
1726 		   uint64_t lba_count)
1727 {
1728 	switch (ctx->blob->clear_method) {
1729 	case BLOB_CLEAR_WITH_DEFAULT:
1730 	case BLOB_CLEAR_WITH_UNMAP:
1731 		bs_batch_unmap_dev(batch, lba, lba_count);
1732 		break;
1733 	case BLOB_CLEAR_WITH_WRITE_ZEROES:
1734 		bs_batch_write_zeroes_dev(batch, lba, lba_count);
1735 		break;
1736 	case BLOB_CLEAR_WITH_NONE:
1737 	default:
1738 		break;
1739 	}
1740 }
1741 
1742 static void bs_mark_dirty(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
1743 			  spdk_bs_sequence_cpl cb_fn, void *cb_arg);
1744 
1745 static void
1746 blob_persist_complete_cb(void *arg)
1747 {
1748 	struct spdk_blob_persist_ctx *ctx = arg;
1749 
1750 	/* Call user callback */
1751 	ctx->cb_fn(ctx->seq, ctx->cb_arg, 0);
1752 
1753 	/* Free the memory */
1754 	spdk_free(ctx->pages);
1755 	free(ctx);
1756 }
1757 
1758 static void blob_persist_start(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
1759 
1760 static void
1761 blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno)
1762 {
1763 	struct spdk_blob_persist_ctx	*next_persist, *tmp;
1764 	struct spdk_blob		*blob = ctx->blob;
1765 
1766 	if (bserrno == 0) {
1767 		blob_mark_clean(blob);
1768 	}
1769 
1770 	assert(ctx == TAILQ_FIRST(&blob->persists_to_complete));
1771 
1772 	/* Complete all persists that were pending when the current persist started */
1773 	TAILQ_FOREACH_SAFE(next_persist, &blob->persists_to_complete, link, tmp) {
1774 		TAILQ_REMOVE(&blob->persists_to_complete, next_persist, link);
1775 		spdk_thread_send_msg(spdk_get_thread(), blob_persist_complete_cb, next_persist);
1776 	}
1777 
1778 	if (TAILQ_EMPTY(&blob->pending_persists)) {
1779 		return;
1780 	}
1781 
1782 	/* Queue up all pending persists for completion and start blob persist with first one */
1783 	TAILQ_SWAP(&blob->persists_to_complete, &blob->pending_persists, spdk_blob_persist_ctx, link);
1784 	next_persist = TAILQ_FIRST(&blob->persists_to_complete);
1785 
1786 	blob->state = SPDK_BLOB_STATE_DIRTY;
1787 	bs_mark_dirty(seq, blob->bs, blob_persist_start, next_persist);
1788 }
1789 
1790 static void
1791 blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1792 {
1793 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1794 	struct spdk_blob		*blob = ctx->blob;
1795 	struct spdk_blob_store		*bs = blob->bs;
1796 	size_t				i;
1797 
1798 	if (bserrno != 0) {
1799 		blob_persist_complete(seq, ctx, bserrno);
1800 		return;
1801 	}
1802 
1803 	spdk_spin_lock(&bs->used_lock);
1804 
1805 	/* Release all extent_pages that were truncated */
1806 	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
1807 		/* Nothing to release if it was not allocated */
1808 		if (blob->active.extent_pages[i] != 0) {
1809 			bs_release_md_page(bs, blob->active.extent_pages[i]);
1810 		}
1811 	}
1812 
1813 	spdk_spin_unlock(&bs->used_lock);
1814 
1815 	if (blob->active.num_extent_pages == 0) {
1816 		free(blob->active.extent_pages);
1817 		blob->active.extent_pages = NULL;
1818 		blob->active.extent_pages_array_size = 0;
1819 	} else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) {
1820 #ifndef __clang_analyzer__
1821 		void *tmp;
1822 
1823 		/* scan-build really can't figure reallocs, workaround it */
1824 		tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages);
1825 		assert(tmp != NULL);
1826 		blob->active.extent_pages = tmp;
1827 #endif
1828 		blob->active.extent_pages_array_size = blob->active.num_extent_pages;
1829 	}
1830 
1831 	blob_persist_complete(seq, ctx, bserrno);
1832 }
1833 
1834 static void
1835 blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
1836 {
1837 	struct spdk_blob		*blob = ctx->blob;
1838 	struct spdk_blob_store		*bs = blob->bs;
1839 	size_t				i;
1840 	uint64_t                        lba;
1841 	uint64_t                        lba_count;
1842 	spdk_bs_batch_t                 *batch;
1843 
1844 	batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx);
1845 	lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);
1846 
1847 	/* Clear all extent_pages that were truncated */
1848 	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
1849 		/* Nothing to clear if it was not allocated */
1850 		if (blob->active.extent_pages[i] != 0) {
1851 			lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]);
1852 			bs_batch_write_zeroes_dev(batch, lba, lba_count);
1853 		}
1854 	}
1855 
1856 	bs_batch_close(batch);
1857 }
1858 
1859 static void
1860 blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1861 {
1862 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1863 	struct spdk_blob		*blob = ctx->blob;
1864 	struct spdk_blob_store		*bs = blob->bs;
1865 	size_t				i;
1866 
1867 	if (bserrno != 0) {
1868 		blob_persist_complete(seq, ctx, bserrno);
1869 		return;
1870 	}
1871 
1872 	spdk_spin_lock(&bs->used_lock);
1873 	/* Release all clusters that were truncated */
1874 	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
1875 		uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]);
1876 
1877 		/* Nothing to release if it was not allocated */
1878 		if (blob->active.clusters[i] != 0) {
1879 			bs_release_cluster(bs, cluster_num);
1880 		}
1881 	}
1882 	spdk_spin_unlock(&bs->used_lock);
1883 
1884 	if (blob->active.num_clusters == 0) {
1885 		free(blob->active.clusters);
1886 		blob->active.clusters = NULL;
1887 		blob->active.cluster_array_size = 0;
1888 	} else if (blob->active.num_clusters != blob->active.cluster_array_size) {
1889 #ifndef __clang_analyzer__
1890 		void *tmp;
1891 
1892 		/* scan-build really can't figure reallocs, workaround it */
1893 		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters);
1894 		assert(tmp != NULL);
1895 		blob->active.clusters = tmp;
1896 
1897 #endif
1898 		blob->active.cluster_array_size = blob->active.num_clusters;
1899 	}
1900 
1901 	/* Move on to clearing extent pages */
1902 	blob_persist_clear_extents(seq, ctx);
1903 }
1904 
1905 static void
1906 blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
1907 {
1908 	struct spdk_blob		*blob = ctx->blob;
1909 	struct spdk_blob_store		*bs = blob->bs;
1910 	spdk_bs_batch_t			*batch;
1911 	size_t				i;
1912 	uint64_t			lba;
1913 	uint64_t			lba_count;
1914 
1915 	/* Clusters don't move around in blobs. The list shrinks or grows
1916 	 * at the end, but no changes ever occur in the middle of the list.
1917 	 */
1918 
1919 	batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx);
1920 
1921 	/* Clear all clusters that were truncated */
1922 	lba = 0;
1923 	lba_count = 0;
1924 	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
1925 		uint64_t next_lba = blob->active.clusters[i];
1926 		uint64_t next_lba_count = bs_cluster_to_lba(bs, 1);
1927 
1928 		if (next_lba > 0 && (lba + lba_count) == next_lba) {
1929 			/* This cluster is contiguous with the previous one. */
1930 			lba_count += next_lba_count;
1931 			continue;
1932 		} else if (next_lba == 0) {
1933 			continue;
1934 		}
1935 
1936 		/* This cluster is not contiguous with the previous one. */
1937 
1938 		/* If a run of LBAs previously existing, clear them now */
1939 		if (lba_count > 0) {
1940 			bs_batch_clear_dev(ctx, batch, lba, lba_count);
1941 		}
1942 
1943 		/* Start building the next batch */
1944 		lba = next_lba;
1945 		if (next_lba > 0) {
1946 			lba_count = next_lba_count;
1947 		} else {
1948 			lba_count = 0;
1949 		}
1950 	}
1951 
1952 	/* If we ended with a contiguous set of LBAs, clear them now */
1953 	if (lba_count > 0) {
1954 		bs_batch_clear_dev(ctx, batch, lba, lba_count);
1955 	}
1956 
1957 	bs_batch_close(batch);
1958 }
1959 
1960 static void
1961 blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1962 {
1963 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1964 	struct spdk_blob		*blob = ctx->blob;
1965 	struct spdk_blob_store		*bs = blob->bs;
1966 	size_t				i;
1967 
1968 	if (bserrno != 0) {
1969 		blob_persist_complete(seq, ctx, bserrno);
1970 		return;
1971 	}
1972 
1973 	spdk_spin_lock(&bs->used_lock);
1974 
1975 	/* This loop starts at 1 because the first page is special and handled
1976 	 * below. The pages (except the first) are never written in place,
1977 	 * so any pages in the clean list must be zeroed.
1978 	 */
1979 	for (i = 1; i < blob->clean.num_pages; i++) {
1980 		bs_release_md_page(bs, blob->clean.pages[i]);
1981 	}
1982 
1983 	if (blob->active.num_pages == 0) {
1984 		uint32_t page_num;
1985 
1986 		page_num = bs_blobid_to_page(blob->id);
1987 		bs_release_md_page(bs, page_num);
1988 	}
1989 
1990 	spdk_spin_unlock(&bs->used_lock);
1991 
1992 	/* Move on to clearing clusters */
1993 	blob_persist_clear_clusters(seq, ctx);
1994 }
1995 
1996 static void
1997 blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1998 {
1999 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
2000 	struct spdk_blob		*blob = ctx->blob;
2001 	struct spdk_blob_store		*bs = blob->bs;
2002 	uint64_t			lba;
2003 	uint64_t			lba_count;
2004 	spdk_bs_batch_t			*batch;
2005 	size_t				i;
2006 
2007 	if (bserrno != 0) {
2008 		blob_persist_complete(seq, ctx, bserrno);
2009 		return;
2010 	}
2011 
2012 	batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx);
2013 
2014 	lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);
2015 
2016 	/* This loop starts at 1 because the first page is special and handled
2017 	 * below. The pages (except the first) are never written in place,
2018 	 * so any pages in the clean list must be zeroed.
2019 	 */
2020 	for (i = 1; i < blob->clean.num_pages; i++) {
2021 		lba = bs_md_page_to_lba(bs, blob->clean.pages[i]);
2022 
2023 		bs_batch_write_zeroes_dev(batch, lba, lba_count);
2024 	}
2025 
2026 	/* The first page will only be zeroed if this is a delete. */
2027 	if (blob->active.num_pages == 0) {
2028 		uint32_t page_num;
2029 
2030 		/* The first page in the metadata goes where the blobid indicates */
2031 		page_num = bs_blobid_to_page(blob->id);
2032 		lba = bs_md_page_to_lba(bs, page_num);
2033 
2034 		bs_batch_write_zeroes_dev(batch, lba, lba_count);
2035 	}
2036 
2037 	bs_batch_close(batch);
2038 }
2039 
2040 static void
2041 blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2042 {
2043 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
2044 	struct spdk_blob		*blob = ctx->blob;
2045 	struct spdk_blob_store		*bs = blob->bs;
2046 	uint64_t			lba;
2047 	uint32_t			lba_count;
2048 	struct spdk_blob_md_page	*page;
2049 
2050 	if (bserrno != 0) {
2051 		blob_persist_complete(seq, ctx, bserrno);
2052 		return;
2053 	}
2054 
2055 	if (blob->active.num_pages == 0) {
2056 		/* Move on to the next step */
2057 		blob_persist_zero_pages(seq, ctx, 0);
2058 		return;
2059 	}
2060 
2061 	lba_count = bs_byte_to_lba(bs, sizeof(*page));
2062 
2063 	page = &ctx->pages[0];
2064 	/* The first page in the metadata goes where the blobid indicates */
2065 	lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id));
2066 
2067 	bs_sequence_write_dev(seq, page, lba, lba_count,
2068 			      blob_persist_zero_pages, ctx);
2069 }
2070 
2071 static void
2072 blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
2073 {
2074 	struct spdk_blob		*blob = ctx->blob;
2075 	struct spdk_blob_store		*bs = blob->bs;
2076 	uint64_t			lba;
2077 	uint32_t			lba_count;
2078 	struct spdk_blob_md_page	*page;
2079 	spdk_bs_batch_t			*batch;
2080 	size_t				i;
2081 
2082 	/* Clusters don't move around in blobs. The list shrinks or grows
2083 	 * at the end, but no changes ever occur in the middle of the list.
2084 	 */
2085 
2086 	lba_count = bs_byte_to_lba(bs, sizeof(*page));
2087 
2088 	batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx);
2089 
2090 	/* This starts at 1. The root page is not written until
2091 	 * all of the others are finished
2092 	 */
2093 	for (i = 1; i < blob->active.num_pages; i++) {
2094 		page = &ctx->pages[i];
2095 		assert(page->sequence_num == i);
2096 
2097 		lba = bs_md_page_to_lba(bs, blob->active.pages[i]);
2098 
2099 		bs_batch_write_dev(batch, page, lba, lba_count);
2100 	}
2101 
2102 	bs_batch_close(batch);
2103 }
2104 
2105 static int
2106 blob_resize(struct spdk_blob *blob, uint64_t sz)
2107 {
2108 	uint64_t	i;
2109 	uint64_t	*tmp;
2110 	uint64_t	cluster;
2111 	uint32_t	lfmd; /*  lowest free md page */
2112 	uint64_t	num_clusters;
2113 	uint32_t	*ep_tmp;
2114 	uint64_t	new_num_ep = 0, current_num_ep = 0;
2115 	struct spdk_blob_store *bs;
2116 	int		rc;
2117 
2118 	bs = blob->bs;
2119 
2120 	blob_verify_md_op(blob);
2121 
2122 	if (blob->active.num_clusters == sz) {
2123 		return 0;
2124 	}
2125 
2126 	if (blob->active.num_clusters < blob->active.cluster_array_size) {
2127 		/* If this blob was resized to be larger, then smaller, then
2128 		 * larger without syncing, then the cluster array already
2129 		 * contains spare assigned clusters we can use.
2130 		 */
2131 		num_clusters = spdk_min(blob->active.cluster_array_size,
2132 					sz);
2133 	} else {
2134 		num_clusters = blob->active.num_clusters;
2135 	}
2136 
2137 	if (blob->use_extent_table) {
2138 		/* Round up since every cluster beyond current Extent Table size,
2139 		 * requires new extent page. */
2140 		new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP);
2141 		current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP);
2142 	}
2143 
2144 	assert(!spdk_spin_held(&bs->used_lock));
2145 
2146 	/* Check first that we have enough clusters and md pages before we start claiming them.
2147 	 * bs->used_lock is held to ensure that clusters we think are free are still free when we go
2148 	 * to claim them later in this function.
2149 	 */
2150 	if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) {
2151 		spdk_spin_lock(&bs->used_lock);
2152 		if ((sz - num_clusters) > bs->num_free_clusters) {
2153 			rc = -ENOSPC;
2154 			goto out;
2155 		}
2156 		lfmd = 0;
2157 		for (i = current_num_ep; i < new_num_ep ; i++) {
2158 			lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd);
2159 			if (lfmd == UINT32_MAX) {
2160 				/* No more free md pages. Cannot satisfy the request */
2161 				rc = -ENOSPC;
2162 				goto out;
2163 			}
2164 		}
2165 	}
2166 
2167 	if (sz > num_clusters) {
2168 		/* Expand the cluster array if necessary.
2169 		 * We only shrink the array when persisting.
2170 		 */
2171 		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz);
2172 		if (sz > 0 && tmp == NULL) {
2173 			rc = -ENOMEM;
2174 			goto out;
2175 		}
2176 		memset(tmp + blob->active.cluster_array_size, 0,
2177 		       sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size));
2178 		blob->active.clusters = tmp;
2179 		blob->active.cluster_array_size = sz;
2180 
2181 		/* Expand the extents table, only if enough clusters were added */
2182 		if (new_num_ep > current_num_ep && blob->use_extent_table) {
2183 			ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep);
2184 			if (new_num_ep > 0 && ep_tmp == NULL) {
2185 				rc = -ENOMEM;
2186 				goto out;
2187 			}
2188 			memset(ep_tmp + blob->active.extent_pages_array_size, 0,
2189 			       sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size));
2190 			blob->active.extent_pages = ep_tmp;
2191 			blob->active.extent_pages_array_size = new_num_ep;
2192 		}
2193 	}
2194 
2195 	blob->state = SPDK_BLOB_STATE_DIRTY;
2196 
2197 	if (spdk_blob_is_thin_provisioned(blob) == false) {
2198 		cluster = 0;
2199 		lfmd = 0;
2200 		for (i = num_clusters; i < sz; i++) {
2201 			bs_allocate_cluster(blob, i, &cluster, &lfmd, true);
2202 			/* Do not increment lfmd here.  lfmd will get updated
2203 			 * to the md_page allocated (if any) when a new extent
2204 			 * page is needed.  Just pass that value again,
2205 			 * bs_allocate_cluster will just start at that index
2206 			 * to find the next free md_page when needed.
2207 			 */
2208 		}
2209 	}
2210 
2211 	blob->active.num_clusters = sz;
2212 	blob->active.num_extent_pages = new_num_ep;
2213 
2214 	rc = 0;
2215 out:
2216 	if (spdk_spin_held(&bs->used_lock)) {
2217 		spdk_spin_unlock(&bs->used_lock);
2218 	}
2219 
2220 	return rc;
2221 }
2222 
2223 static void
2224 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx)
2225 {
2226 	spdk_bs_sequence_t *seq = ctx->seq;
2227 	struct spdk_blob *blob = ctx->blob;
2228 	struct spdk_blob_store *bs = blob->bs;
2229 	uint64_t i;
2230 	uint32_t page_num;
2231 	void *tmp;
2232 	int rc;
2233 
2234 	/* Generate the new metadata */
2235 	rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages);
2236 	if (rc < 0) {
2237 		blob_persist_complete(seq, ctx, rc);
2238 		return;
2239 	}
2240 
2241 	assert(blob->active.num_pages >= 1);
2242 
2243 	/* Resize the cache of page indices */
2244 	tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
2245 	if (!tmp) {
2246 		blob_persist_complete(seq, ctx, -ENOMEM);
2247 		return;
2248 	}
2249 	blob->active.pages = tmp;
2250 
2251 	/* Assign this metadata to pages. This requires two passes - one to verify that there are
2252 	 * enough pages and a second to actually claim them. The used_lock is held across
2253 	 * both passes to ensure things don't change in the middle.
2254 	 */
2255 	spdk_spin_lock(&bs->used_lock);
2256 	page_num = 0;
2257 	/* Note that this loop starts at one. The first page location is fixed by the blobid. */
2258 	for (i = 1; i < blob->active.num_pages; i++) {
2259 		page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
2260 		if (page_num == UINT32_MAX) {
2261 			spdk_spin_unlock(&bs->used_lock);
2262 			blob_persist_complete(seq, ctx, -ENOMEM);
2263 			return;
2264 		}
2265 		page_num++;
2266 	}
2267 
2268 	page_num = 0;
2269 	blob->active.pages[0] = bs_blobid_to_page(blob->id);
2270 	for (i = 1; i < blob->active.num_pages; i++) {
2271 		page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
2272 		ctx->pages[i - 1].next = page_num;
2273 		/* Now that previous metadata page is complete, calculate the crc for it. */
2274 		ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
2275 		blob->active.pages[i] = page_num;
2276 		bs_claim_md_page(bs, page_num);
2277 		SPDK_DEBUGLOG(blob, "Claiming page %u for blob 0x%" PRIx64 "\n", page_num,
2278 			      blob->id);
2279 		page_num++;
2280 	}
2281 	spdk_spin_unlock(&bs->used_lock);
2282 	ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
2283 	/* Start writing the metadata from last page to first */
2284 	blob->state = SPDK_BLOB_STATE_CLEAN;
2285 	blob_persist_write_page_chain(seq, ctx);
2286 }
2287 
2288 static void
2289 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2290 {
2291 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
2292 	struct spdk_blob		*blob = ctx->blob;
2293 	size_t				i;
2294 	uint32_t			extent_page_id;
2295 	uint32_t                        page_count = 0;
2296 	int				rc;
2297 
2298 	if (ctx->extent_page != NULL) {
2299 		spdk_free(ctx->extent_page);
2300 		ctx->extent_page = NULL;
2301 	}
2302 
2303 	if (bserrno != 0) {
2304 		blob_persist_complete(seq, ctx, bserrno);
2305 		return;
2306 	}
2307 
2308 	/* Only write out Extent Pages when blob was resized. */
2309 	for (i = ctx->next_extent_page; i < blob->active.extent_pages_array_size; i++) {
2310 		extent_page_id = blob->active.extent_pages[i];
2311 		if (extent_page_id == 0) {
2312 			/* No Extent Page to persist */
2313 			assert(spdk_blob_is_thin_provisioned(blob));
2314 			continue;
2315 		}
2316 		assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id));
2317 		ctx->next_extent_page = i + 1;
2318 		rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page);
2319 		if (rc < 0) {
2320 			blob_persist_complete(seq, ctx, rc);
2321 			return;
2322 		}
2323 
2324 		blob->state = SPDK_BLOB_STATE_DIRTY;
2325 		blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page);
2326 
2327 		ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page);
2328 
2329 		bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id),
2330 				      bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
2331 				      blob_persist_write_extent_pages, ctx);
2332 		return;
2333 	}
2334 
2335 	blob_persist_generate_new_md(ctx);
2336 }
2337 
2338 static void
2339 blob_persist_start(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2340 {
2341 	struct spdk_blob_persist_ctx *ctx = cb_arg;
2342 	struct spdk_blob *blob = ctx->blob;
2343 
2344 	if (bserrno != 0) {
2345 		blob_persist_complete(seq, ctx, bserrno);
2346 		return;
2347 	}
2348 
2349 	if (blob->active.num_pages == 0) {
2350 		/* This is the signal that the blob should be deleted.
2351 		 * Immediately jump to the clean up routine. */
2352 		assert(blob->clean.num_pages > 0);
2353 		blob->state = SPDK_BLOB_STATE_CLEAN;
2354 		blob_persist_zero_pages(seq, ctx, 0);
2355 		return;
2356 
2357 	}
2358 
2359 	if (blob->clean.num_clusters < blob->active.num_clusters) {
2360 		/* Blob was resized up */
2361 		assert(blob->clean.num_extent_pages <= blob->active.num_extent_pages);
2362 		ctx->next_extent_page = spdk_max(1, blob->clean.num_extent_pages) - 1;
2363 	} else if (blob->active.num_clusters < blob->active.cluster_array_size) {
2364 		/* Blob was resized down */
2365 		assert(blob->clean.num_extent_pages >= blob->active.num_extent_pages);
2366 		ctx->next_extent_page = spdk_max(1, blob->active.num_extent_pages) - 1;
2367 	} else {
2368 		/* No change in size occurred */
2369 		blob_persist_generate_new_md(ctx);
2370 		return;
2371 	}
2372 
2373 	blob_persist_write_extent_pages(seq, ctx, 0);
2374 }
2375 
2376 struct spdk_bs_mark_dirty {
2377 	struct spdk_blob_store		*bs;
2378 	struct spdk_bs_super_block	*super;
2379 	spdk_bs_sequence_cpl		cb_fn;
2380 	void				*cb_arg;
2381 };
2382 
2383 static void
2384 bs_mark_dirty_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2385 {
2386 	struct spdk_bs_mark_dirty *ctx = cb_arg;
2387 
2388 	if (bserrno == 0) {
2389 		ctx->bs->clean = 0;
2390 	}
2391 
2392 	ctx->cb_fn(seq, ctx->cb_arg, bserrno);
2393 
2394 	spdk_free(ctx->super);
2395 	free(ctx);
2396 }
2397 
2398 static void bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
2399 			   struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg);
2400 
2401 
2402 static void
2403 bs_mark_dirty_write(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2404 {
2405 	struct spdk_bs_mark_dirty *ctx = cb_arg;
2406 
2407 	if (bserrno != 0) {
2408 		bs_mark_dirty_write_cpl(seq, ctx, bserrno);
2409 		return;
2410 	}
2411 
2412 	ctx->super->clean = 0;
2413 	if (ctx->super->size == 0) {
2414 		ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
2415 	}
2416 
2417 	bs_write_super(seq, ctx->bs, ctx->super, bs_mark_dirty_write_cpl, ctx);
2418 }
2419 
2420 static void
2421 bs_mark_dirty(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
2422 	      spdk_bs_sequence_cpl cb_fn, void *cb_arg)
2423 {
2424 	struct spdk_bs_mark_dirty *ctx;
2425 
2426 	/* Blobstore is already marked dirty */
2427 	if (bs->clean == 0) {
2428 		cb_fn(seq, cb_arg, 0);
2429 		return;
2430 	}
2431 
2432 	ctx = calloc(1, sizeof(*ctx));
2433 	if (!ctx) {
2434 		cb_fn(seq, cb_arg, -ENOMEM);
2435 		return;
2436 	}
2437 	ctx->bs = bs;
2438 	ctx->cb_fn = cb_fn;
2439 	ctx->cb_arg = cb_arg;
2440 
2441 	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
2442 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2443 	if (!ctx->super) {
2444 		free(ctx);
2445 		cb_fn(seq, cb_arg, -ENOMEM);
2446 		return;
2447 	}
2448 
2449 	bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
2450 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
2451 			     bs_mark_dirty_write, ctx);
2452 }
2453 
2454 /* Write a blob to disk */
2455 static void
2456 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
2457 	     spdk_bs_sequence_cpl cb_fn, void *cb_arg)
2458 {
2459 	struct spdk_blob_persist_ctx *ctx;
2460 
2461 	blob_verify_md_op(blob);
2462 
2463 	if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->persists_to_complete)) {
2464 		cb_fn(seq, cb_arg, 0);
2465 		return;
2466 	}
2467 
2468 	ctx = calloc(1, sizeof(*ctx));
2469 	if (!ctx) {
2470 		cb_fn(seq, cb_arg, -ENOMEM);
2471 		return;
2472 	}
2473 	ctx->blob = blob;
2474 	ctx->seq = seq;
2475 	ctx->cb_fn = cb_fn;
2476 	ctx->cb_arg = cb_arg;
2477 
2478 	/* Multiple blob persists can affect one another, via blob->state or
2479 	 * blob mutable data changes. To prevent it, queue up the persists. */
2480 	if (!TAILQ_EMPTY(&blob->persists_to_complete)) {
2481 		TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link);
2482 		return;
2483 	}
2484 	TAILQ_INSERT_HEAD(&blob->persists_to_complete, ctx, link);
2485 
2486 	bs_mark_dirty(seq, blob->bs, blob_persist_start, ctx);
2487 }
2488 
2489 struct spdk_blob_copy_cluster_ctx {
2490 	struct spdk_blob *blob;
2491 	uint8_t *buf;
2492 	uint64_t page;
2493 	uint64_t new_cluster;
2494 	uint32_t new_extent_page;
2495 	spdk_bs_sequence_t *seq;
2496 	struct spdk_blob_md_page *new_cluster_page;
2497 };
2498 
2499 static void
2500 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno)
2501 {
2502 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2503 	struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq;
2504 	TAILQ_HEAD(, spdk_bs_request_set) requests;
2505 	spdk_bs_user_op_t *op;
2506 
2507 	TAILQ_INIT(&requests);
2508 	TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link);
2509 
2510 	while (!TAILQ_EMPTY(&requests)) {
2511 		op = TAILQ_FIRST(&requests);
2512 		TAILQ_REMOVE(&requests, op, link);
2513 		if (bserrno == 0) {
2514 			bs_user_op_execute(op);
2515 		} else {
2516 			bs_user_op_abort(op, bserrno);
2517 		}
2518 	}
2519 
2520 	spdk_free(ctx->buf);
2521 	free(ctx);
2522 }
2523 
2524 static void
2525 blob_insert_cluster_cpl(void *cb_arg, int bserrno)
2526 {
2527 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2528 
2529 	if (bserrno) {
2530 		if (bserrno == -EEXIST) {
2531 			/* The metadata insert failed because another thread
2532 			 * allocated the cluster first. Free our cluster
2533 			 * but continue without error. */
2534 			bserrno = 0;
2535 		}
2536 		spdk_spin_lock(&ctx->blob->bs->used_lock);
2537 		bs_release_cluster(ctx->blob->bs, ctx->new_cluster);
2538 		if (ctx->new_extent_page != 0) {
2539 			bs_release_md_page(ctx->blob->bs, ctx->new_extent_page);
2540 		}
2541 		spdk_spin_unlock(&ctx->blob->bs->used_lock);
2542 	}
2543 
2544 	bs_sequence_finish(ctx->seq, bserrno);
2545 }
2546 
2547 static void
2548 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2549 {
2550 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2551 	uint32_t cluster_number;
2552 
2553 	if (bserrno) {
2554 		/* The write failed, so jump to the final completion handler */
2555 		bs_sequence_finish(seq, bserrno);
2556 		return;
2557 	}
2558 
2559 	cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page);
2560 
2561 	blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
2562 					 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx);
2563 }
2564 
2565 static void
2566 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2567 {
2568 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2569 
2570 	if (bserrno != 0) {
2571 		/* The read failed, so jump to the final completion handler */
2572 		bs_sequence_finish(seq, bserrno);
2573 		return;
2574 	}
2575 
2576 	/* Write whole cluster */
2577 	bs_sequence_write_dev(seq, ctx->buf,
2578 			      bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster),
2579 			      bs_cluster_to_lba(ctx->blob->bs, 1),
2580 			      blob_write_copy_cpl, ctx);
2581 }
2582 
2583 static bool
2584 blob_can_copy(struct spdk_blob *blob, uint32_t cluster_start_page, uint64_t *base_lba)
2585 {
2586 	uint64_t lba = bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page);
2587 
2588 	return (!blob_is_esnap_clone(blob) && blob->bs->dev->copy != NULL) &&
2589 	       blob->back_bs_dev->translate_lba(blob->back_bs_dev, lba, base_lba);
2590 }
2591 
2592 static void
2593 blob_copy(struct spdk_blob_copy_cluster_ctx *ctx, spdk_bs_user_op_t *op, uint64_t src_lba)
2594 {
2595 	struct spdk_blob *blob = ctx->blob;
2596 	uint64_t lba_count = bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz);
2597 
2598 	bs_sequence_copy_dev(ctx->seq,
2599 			     bs_cluster_to_lba(blob->bs, ctx->new_cluster),
2600 			     src_lba,
2601 			     lba_count,
2602 			     blob_write_copy_cpl, ctx);
2603 }
2604 
2605 static void
2606 bs_allocate_and_copy_cluster(struct spdk_blob *blob,
2607 			     struct spdk_io_channel *_ch,
2608 			     uint64_t io_unit, spdk_bs_user_op_t *op)
2609 {
2610 	struct spdk_bs_cpl cpl;
2611 	struct spdk_bs_channel *ch;
2612 	struct spdk_blob_copy_cluster_ctx *ctx;
2613 	uint32_t cluster_start_page;
2614 	uint32_t cluster_number;
2615 	bool is_zeroes;
2616 	bool can_copy;
2617 	uint64_t copy_src_lba;
2618 	int rc;
2619 
2620 	ch = spdk_io_channel_get_ctx(_ch);
2621 
2622 	if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) {
2623 		/* There are already operations pending. Queue this user op
2624 		 * and return because it will be re-executed when the outstanding
2625 		 * cluster allocation completes. */
2626 		TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
2627 		return;
2628 	}
2629 
2630 	/* Round the io_unit offset down to the first page in the cluster */
2631 	cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit);
2632 
2633 	/* Calculate which index in the metadata cluster array the corresponding
2634 	 * cluster is supposed to be at. */
2635 	cluster_number = bs_io_unit_to_cluster_number(blob, io_unit);
2636 
2637 	ctx = calloc(1, sizeof(*ctx));
2638 	if (!ctx) {
2639 		bs_user_op_abort(op, -ENOMEM);
2640 		return;
2641 	}
2642 
2643 	assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0);
2644 
2645 	ctx->blob = blob;
2646 	ctx->page = cluster_start_page;
2647 	ctx->new_cluster_page = ch->new_cluster_page;
2648 	memset(ctx->new_cluster_page, 0, SPDK_BS_PAGE_SIZE);
2649 	can_copy = blob_can_copy(blob, cluster_start_page, &copy_src_lba);
2650 
2651 	is_zeroes = blob->back_bs_dev->is_zeroes(blob->back_bs_dev,
2652 			bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
2653 			bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz));
2654 	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes && !can_copy) {
2655 		ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
2656 				       NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2657 		if (!ctx->buf) {
2658 			SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n",
2659 				    blob->bs->cluster_sz);
2660 			free(ctx);
2661 			bs_user_op_abort(op, -ENOMEM);
2662 			return;
2663 		}
2664 	}
2665 
2666 	spdk_spin_lock(&blob->bs->used_lock);
2667 	rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page,
2668 				 false);
2669 	spdk_spin_unlock(&blob->bs->used_lock);
2670 	if (rc != 0) {
2671 		spdk_free(ctx->buf);
2672 		free(ctx);
2673 		bs_user_op_abort(op, rc);
2674 		return;
2675 	}
2676 
2677 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2678 	cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl;
2679 	cpl.u.blob_basic.cb_arg = ctx;
2680 
2681 	ctx->seq = bs_sequence_start_blob(_ch, &cpl, blob);
2682 	if (!ctx->seq) {
2683 		spdk_spin_lock(&blob->bs->used_lock);
2684 		bs_release_cluster(blob->bs, ctx->new_cluster);
2685 		spdk_spin_unlock(&blob->bs->used_lock);
2686 		spdk_free(ctx->buf);
2687 		free(ctx);
2688 		bs_user_op_abort(op, -ENOMEM);
2689 		return;
2690 	}
2691 
2692 	/* Queue the user op to block other incoming operations */
2693 	TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
2694 
2695 	if (blob->parent_id != SPDK_BLOBID_INVALID && !is_zeroes) {
2696 		if (can_copy) {
2697 			blob_copy(ctx, op, copy_src_lba);
2698 		} else {
2699 			/* Read cluster from backing device */
2700 			bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
2701 						bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
2702 						bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz),
2703 						blob_write_copy, ctx);
2704 		}
2705 
2706 	} else {
2707 		blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
2708 						 ctx->new_extent_page, ctx->new_cluster_page, blob_insert_cluster_cpl, ctx);
2709 	}
2710 }
2711 
2712 static inline bool
2713 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length,
2714 				 uint64_t *lba,	uint64_t *lba_count)
2715 {
2716 	*lba_count = length;
2717 
2718 	if (!bs_io_unit_is_allocated(blob, io_unit)) {
2719 		assert(blob->back_bs_dev != NULL);
2720 		*lba = bs_io_unit_to_back_dev_lba(blob, io_unit);
2721 		*lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count);
2722 		return false;
2723 	} else {
2724 		*lba = bs_blob_io_unit_to_lba(blob, io_unit);
2725 		return true;
2726 	}
2727 }
2728 
2729 struct op_split_ctx {
2730 	struct spdk_blob *blob;
2731 	struct spdk_io_channel *channel;
2732 	uint64_t io_unit_offset;
2733 	uint64_t io_units_remaining;
2734 	void *curr_payload;
2735 	enum spdk_blob_op_type op_type;
2736 	spdk_bs_sequence_t *seq;
2737 	bool in_submit_ctx;
2738 	bool completed_in_submit_ctx;
2739 	bool done;
2740 };
2741 
2742 static void
2743 blob_request_submit_op_split_next(void *cb_arg, int bserrno)
2744 {
2745 	struct op_split_ctx	*ctx = cb_arg;
2746 	struct spdk_blob	*blob = ctx->blob;
2747 	struct spdk_io_channel	*ch = ctx->channel;
2748 	enum spdk_blob_op_type	op_type = ctx->op_type;
2749 	uint8_t			*buf;
2750 	uint64_t		offset;
2751 	uint64_t		length;
2752 	uint64_t		op_length;
2753 
2754 	if (bserrno != 0 || ctx->io_units_remaining == 0) {
2755 		bs_sequence_finish(ctx->seq, bserrno);
2756 		if (ctx->in_submit_ctx) {
2757 			/* Defer freeing of the ctx object, since it will be
2758 			 * accessed when this unwinds back to the submisison
2759 			 * context.
2760 			 */
2761 			ctx->done = true;
2762 		} else {
2763 			free(ctx);
2764 		}
2765 		return;
2766 	}
2767 
2768 	if (ctx->in_submit_ctx) {
2769 		/* If this split operation completed in the context
2770 		 * of its submission, mark the flag and return immediately
2771 		 * to avoid recursion.
2772 		 */
2773 		ctx->completed_in_submit_ctx = true;
2774 		return;
2775 	}
2776 
2777 	while (true) {
2778 		ctx->completed_in_submit_ctx = false;
2779 
2780 		offset = ctx->io_unit_offset;
2781 		length = ctx->io_units_remaining;
2782 		buf = ctx->curr_payload;
2783 		op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob,
2784 				     offset));
2785 
2786 		/* Update length and payload for next operation */
2787 		ctx->io_units_remaining -= op_length;
2788 		ctx->io_unit_offset += op_length;
2789 		if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) {
2790 			ctx->curr_payload += op_length * blob->bs->io_unit_size;
2791 		}
2792 
2793 		assert(!ctx->in_submit_ctx);
2794 		ctx->in_submit_ctx = true;
2795 
2796 		switch (op_type) {
2797 		case SPDK_BLOB_READ:
2798 			spdk_blob_io_read(blob, ch, buf, offset, op_length,
2799 					  blob_request_submit_op_split_next, ctx);
2800 			break;
2801 		case SPDK_BLOB_WRITE:
2802 			spdk_blob_io_write(blob, ch, buf, offset, op_length,
2803 					   blob_request_submit_op_split_next, ctx);
2804 			break;
2805 		case SPDK_BLOB_UNMAP:
2806 			spdk_blob_io_unmap(blob, ch, offset, op_length,
2807 					   blob_request_submit_op_split_next, ctx);
2808 			break;
2809 		case SPDK_BLOB_WRITE_ZEROES:
2810 			spdk_blob_io_write_zeroes(blob, ch, offset, op_length,
2811 						  blob_request_submit_op_split_next, ctx);
2812 			break;
2813 		case SPDK_BLOB_READV:
2814 		case SPDK_BLOB_WRITEV:
2815 			SPDK_ERRLOG("readv/write not valid\n");
2816 			bs_sequence_finish(ctx->seq, -EINVAL);
2817 			free(ctx);
2818 			return;
2819 		}
2820 
2821 #ifndef __clang_analyzer__
2822 		/* scan-build reports a false positive around accessing the ctx here. It
2823 		 * forms a path that recursively calls this function, but then says
2824 		 * "assuming ctx->in_submit_ctx is false", when that isn't possible.
2825 		 * This path does free(ctx), returns to here, and reports a use-after-free
2826 		 * bug.  Wrapping this bit of code so that scan-build doesn't see it
2827 		 * works around the scan-build bug.
2828 		 */
2829 		assert(ctx->in_submit_ctx);
2830 		ctx->in_submit_ctx = false;
2831 
2832 		/* If the operation completed immediately, loop back and submit the
2833 		 * next operation.  Otherwise we can return and the next split
2834 		 * operation will get submitted when this current operation is
2835 		 * later completed asynchronously.
2836 		 */
2837 		if (ctx->completed_in_submit_ctx) {
2838 			continue;
2839 		} else if (ctx->done) {
2840 			free(ctx);
2841 		}
2842 #endif
2843 		break;
2844 	}
2845 }
2846 
2847 static void
2848 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob,
2849 			     void *payload, uint64_t offset, uint64_t length,
2850 			     spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
2851 {
2852 	struct op_split_ctx *ctx;
2853 	spdk_bs_sequence_t *seq;
2854 	struct spdk_bs_cpl cpl;
2855 
2856 	assert(blob != NULL);
2857 
2858 	ctx = calloc(1, sizeof(struct op_split_ctx));
2859 	if (ctx == NULL) {
2860 		cb_fn(cb_arg, -ENOMEM);
2861 		return;
2862 	}
2863 
2864 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2865 	cpl.u.blob_basic.cb_fn = cb_fn;
2866 	cpl.u.blob_basic.cb_arg = cb_arg;
2867 
2868 	seq = bs_sequence_start_blob(ch, &cpl, blob);
2869 	if (!seq) {
2870 		free(ctx);
2871 		cb_fn(cb_arg, -ENOMEM);
2872 		return;
2873 	}
2874 
2875 	ctx->blob = blob;
2876 	ctx->channel = ch;
2877 	ctx->curr_payload = payload;
2878 	ctx->io_unit_offset = offset;
2879 	ctx->io_units_remaining = length;
2880 	ctx->op_type = op_type;
2881 	ctx->seq = seq;
2882 
2883 	blob_request_submit_op_split_next(ctx, 0);
2884 }
2885 
2886 static void
2887 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob,
2888 			      void *payload, uint64_t offset, uint64_t length,
2889 			      spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
2890 {
2891 	struct spdk_bs_cpl cpl;
2892 	uint64_t lba;
2893 	uint64_t lba_count;
2894 	bool is_allocated;
2895 
2896 	assert(blob != NULL);
2897 
2898 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2899 	cpl.u.blob_basic.cb_fn = cb_fn;
2900 	cpl.u.blob_basic.cb_arg = cb_arg;
2901 
2902 	if (blob->frozen_refcnt) {
2903 		/* This blob I/O is frozen */
2904 		spdk_bs_user_op_t *op;
2905 		struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch);
2906 
2907 		op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
2908 		if (!op) {
2909 			cb_fn(cb_arg, -ENOMEM);
2910 			return;
2911 		}
2912 
2913 		TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
2914 
2915 		return;
2916 	}
2917 
2918 	is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
2919 
2920 	switch (op_type) {
2921 	case SPDK_BLOB_READ: {
2922 		spdk_bs_batch_t *batch;
2923 
2924 		batch = bs_batch_open(_ch, &cpl, blob);
2925 		if (!batch) {
2926 			cb_fn(cb_arg, -ENOMEM);
2927 			return;
2928 		}
2929 
2930 		if (is_allocated) {
2931 			/* Read from the blob */
2932 			bs_batch_read_dev(batch, payload, lba, lba_count);
2933 		} else {
2934 			/* Read from the backing block device */
2935 			bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count);
2936 		}
2937 
2938 		bs_batch_close(batch);
2939 		break;
2940 	}
2941 	case SPDK_BLOB_WRITE:
2942 	case SPDK_BLOB_WRITE_ZEROES: {
2943 		if (is_allocated) {
2944 			/* Write to the blob */
2945 			spdk_bs_batch_t *batch;
2946 
2947 			if (lba_count == 0) {
2948 				cb_fn(cb_arg, 0);
2949 				return;
2950 			}
2951 
2952 			batch = bs_batch_open(_ch, &cpl, blob);
2953 			if (!batch) {
2954 				cb_fn(cb_arg, -ENOMEM);
2955 				return;
2956 			}
2957 
2958 			if (op_type == SPDK_BLOB_WRITE) {
2959 				bs_batch_write_dev(batch, payload, lba, lba_count);
2960 			} else {
2961 				bs_batch_write_zeroes_dev(batch, lba, lba_count);
2962 			}
2963 
2964 			bs_batch_close(batch);
2965 		} else {
2966 			/* Queue this operation and allocate the cluster */
2967 			spdk_bs_user_op_t *op;
2968 
2969 			op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
2970 			if (!op) {
2971 				cb_fn(cb_arg, -ENOMEM);
2972 				return;
2973 			}
2974 
2975 			bs_allocate_and_copy_cluster(blob, _ch, offset, op);
2976 		}
2977 		break;
2978 	}
2979 	case SPDK_BLOB_UNMAP: {
2980 		spdk_bs_batch_t *batch;
2981 
2982 		batch = bs_batch_open(_ch, &cpl, blob);
2983 		if (!batch) {
2984 			cb_fn(cb_arg, -ENOMEM);
2985 			return;
2986 		}
2987 
2988 		if (is_allocated) {
2989 			bs_batch_unmap_dev(batch, lba, lba_count);
2990 		}
2991 
2992 		bs_batch_close(batch);
2993 		break;
2994 	}
2995 	case SPDK_BLOB_READV:
2996 	case SPDK_BLOB_WRITEV:
2997 		SPDK_ERRLOG("readv/write not valid\n");
2998 		cb_fn(cb_arg, -EINVAL);
2999 		break;
3000 	}
3001 }
3002 
3003 static void
3004 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel,
3005 		       void *payload, uint64_t offset, uint64_t length,
3006 		       spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
3007 {
3008 	assert(blob != NULL);
3009 
3010 	if (blob->data_ro && op_type != SPDK_BLOB_READ) {
3011 		cb_fn(cb_arg, -EPERM);
3012 		return;
3013 	}
3014 
3015 	if (length == 0) {
3016 		cb_fn(cb_arg, 0);
3017 		return;
3018 	}
3019 
3020 	if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
3021 		cb_fn(cb_arg, -EINVAL);
3022 		return;
3023 	}
3024 	if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) {
3025 		blob_request_submit_op_single(_channel, blob, payload, offset, length,
3026 					      cb_fn, cb_arg, op_type);
3027 	} else {
3028 		blob_request_submit_op_split(_channel, blob, payload, offset, length,
3029 					     cb_fn, cb_arg, op_type);
3030 	}
3031 }
3032 
3033 struct rw_iov_ctx {
3034 	struct spdk_blob *blob;
3035 	struct spdk_io_channel *channel;
3036 	spdk_blob_op_complete cb_fn;
3037 	void *cb_arg;
3038 	bool read;
3039 	int iovcnt;
3040 	struct iovec *orig_iov;
3041 	uint64_t io_unit_offset;
3042 	uint64_t io_units_remaining;
3043 	uint64_t io_units_done;
3044 	struct spdk_blob_ext_io_opts *ext_io_opts;
3045 	struct iovec iov[0];
3046 };
3047 
3048 static void
3049 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3050 {
3051 	assert(cb_arg == NULL);
3052 	bs_sequence_finish(seq, bserrno);
3053 }
3054 
3055 static void
3056 rw_iov_split_next(void *cb_arg, int bserrno)
3057 {
3058 	struct rw_iov_ctx *ctx = cb_arg;
3059 	struct spdk_blob *blob = ctx->blob;
3060 	struct iovec *iov, *orig_iov;
3061 	int iovcnt;
3062 	size_t orig_iovoff;
3063 	uint64_t io_units_count, io_units_to_boundary, io_unit_offset;
3064 	uint64_t byte_count;
3065 
3066 	if (bserrno != 0 || ctx->io_units_remaining == 0) {
3067 		ctx->cb_fn(ctx->cb_arg, bserrno);
3068 		free(ctx);
3069 		return;
3070 	}
3071 
3072 	io_unit_offset = ctx->io_unit_offset;
3073 	io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset);
3074 	io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary);
3075 	/*
3076 	 * Get index and offset into the original iov array for our current position in the I/O sequence.
3077 	 *  byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will
3078 	 *  point to the current position in the I/O sequence.
3079 	 */
3080 	byte_count = ctx->io_units_done * blob->bs->io_unit_size;
3081 	orig_iov = &ctx->orig_iov[0];
3082 	orig_iovoff = 0;
3083 	while (byte_count > 0) {
3084 		if (byte_count >= orig_iov->iov_len) {
3085 			byte_count -= orig_iov->iov_len;
3086 			orig_iov++;
3087 		} else {
3088 			orig_iovoff = byte_count;
3089 			byte_count = 0;
3090 		}
3091 	}
3092 
3093 	/*
3094 	 * Build an iov array for the next I/O in the sequence.  byte_count will keep track of how many
3095 	 *  bytes of this next I/O remain to be accounted for in the new iov array.
3096 	 */
3097 	byte_count = io_units_count * blob->bs->io_unit_size;
3098 	iov = &ctx->iov[0];
3099 	iovcnt = 0;
3100 	while (byte_count > 0) {
3101 		assert(iovcnt < ctx->iovcnt);
3102 		iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff);
3103 		iov->iov_base = orig_iov->iov_base + orig_iovoff;
3104 		byte_count -= iov->iov_len;
3105 		orig_iovoff = 0;
3106 		orig_iov++;
3107 		iov++;
3108 		iovcnt++;
3109 	}
3110 
3111 	ctx->io_unit_offset += io_units_count;
3112 	ctx->io_units_remaining -= io_units_count;
3113 	ctx->io_units_done += io_units_count;
3114 	iov = &ctx->iov[0];
3115 
3116 	if (ctx->read) {
3117 		spdk_blob_io_readv_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
3118 				       io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts);
3119 	} else {
3120 		spdk_blob_io_writev_ext(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
3121 					io_units_count, rw_iov_split_next, ctx, ctx->ext_io_opts);
3122 	}
3123 }
3124 
3125 static void
3126 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel,
3127 			   struct iovec *iov, int iovcnt,
3128 			   uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg, bool read,
3129 			   struct spdk_blob_ext_io_opts *ext_io_opts)
3130 {
3131 	struct spdk_bs_cpl	cpl;
3132 
3133 	assert(blob != NULL);
3134 
3135 	if (!read && blob->data_ro) {
3136 		cb_fn(cb_arg, -EPERM);
3137 		return;
3138 	}
3139 
3140 	if (length == 0) {
3141 		cb_fn(cb_arg, 0);
3142 		return;
3143 	}
3144 
3145 	if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
3146 		cb_fn(cb_arg, -EINVAL);
3147 		return;
3148 	}
3149 
3150 	/*
3151 	 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having
3152 	 *  to split a request that spans a cluster boundary.  For I/O that do not span a cluster boundary,
3153 	 *  there will be no noticeable difference compared to using a batch.  For I/O that do span a cluster
3154 	 *  boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need
3155 	 *  to allocate a separate iov array and split the I/O such that none of the resulting
3156 	 *  smaller I/O cross a cluster boundary.  These smaller I/O will be issued in sequence (not in parallel)
3157 	 *  but since this case happens very infrequently, any performance impact will be negligible.
3158 	 *
3159 	 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs
3160 	 *  for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them
3161 	 *  in a batch.  That would also require creating an intermediate spdk_bs_cpl that would get called
3162 	 *  when the batch was completed, to allow for freeing the memory for the iov arrays.
3163 	 */
3164 	if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) {
3165 		uint64_t lba_count;
3166 		uint64_t lba;
3167 		bool is_allocated;
3168 
3169 		cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
3170 		cpl.u.blob_basic.cb_fn = cb_fn;
3171 		cpl.u.blob_basic.cb_arg = cb_arg;
3172 
3173 		if (blob->frozen_refcnt) {
3174 			/* This blob I/O is frozen */
3175 			enum spdk_blob_op_type op_type;
3176 			spdk_bs_user_op_t *op;
3177 			struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel);
3178 
3179 			op_type = read ? SPDK_BLOB_READV : SPDK_BLOB_WRITEV;
3180 			op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length);
3181 			if (!op) {
3182 				cb_fn(cb_arg, -ENOMEM);
3183 				return;
3184 			}
3185 
3186 			TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
3187 
3188 			return;
3189 		}
3190 
3191 		is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
3192 
3193 		if (read) {
3194 			spdk_bs_sequence_t *seq;
3195 
3196 			seq = bs_sequence_start_blob(_channel, &cpl, blob);
3197 			if (!seq) {
3198 				cb_fn(cb_arg, -ENOMEM);
3199 				return;
3200 			}
3201 
3202 			seq->ext_io_opts = ext_io_opts;
3203 
3204 			if (is_allocated) {
3205 				bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
3206 			} else {
3207 				bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count,
3208 							 rw_iov_done, NULL);
3209 			}
3210 		} else {
3211 			if (is_allocated) {
3212 				spdk_bs_sequence_t *seq;
3213 
3214 				seq = bs_sequence_start_blob(_channel, &cpl, blob);
3215 				if (!seq) {
3216 					cb_fn(cb_arg, -ENOMEM);
3217 					return;
3218 				}
3219 
3220 				seq->ext_io_opts = ext_io_opts;
3221 
3222 				bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
3223 			} else {
3224 				/* Queue this operation and allocate the cluster */
3225 				spdk_bs_user_op_t *op;
3226 
3227 				op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset,
3228 						      length);
3229 				if (!op) {
3230 					cb_fn(cb_arg, -ENOMEM);
3231 					return;
3232 				}
3233 
3234 				op->ext_io_opts = ext_io_opts;
3235 
3236 				bs_allocate_and_copy_cluster(blob, _channel, offset, op);
3237 			}
3238 		}
3239 	} else {
3240 		struct rw_iov_ctx *ctx;
3241 
3242 		ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec));
3243 		if (ctx == NULL) {
3244 			cb_fn(cb_arg, -ENOMEM);
3245 			return;
3246 		}
3247 
3248 		ctx->blob = blob;
3249 		ctx->channel = _channel;
3250 		ctx->cb_fn = cb_fn;
3251 		ctx->cb_arg = cb_arg;
3252 		ctx->read = read;
3253 		ctx->orig_iov = iov;
3254 		ctx->iovcnt = iovcnt;
3255 		ctx->io_unit_offset = offset;
3256 		ctx->io_units_remaining = length;
3257 		ctx->io_units_done = 0;
3258 		ctx->ext_io_opts = ext_io_opts;
3259 
3260 		rw_iov_split_next(ctx, 0);
3261 	}
3262 }
3263 
3264 static struct spdk_blob *
3265 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid)
3266 {
3267 	struct spdk_blob find;
3268 
3269 	if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) {
3270 		return NULL;
3271 	}
3272 
3273 	find.id = blobid;
3274 	return RB_FIND(spdk_blob_tree, &bs->open_blobs, &find);
3275 }
3276 
3277 static void
3278 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob,
3279 				    struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry)
3280 {
3281 	assert(blob != NULL);
3282 	*snapshot_entry = NULL;
3283 	*clone_entry = NULL;
3284 
3285 	if (blob->parent_id == SPDK_BLOBID_INVALID) {
3286 		return;
3287 	}
3288 
3289 	TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) {
3290 		if ((*snapshot_entry)->id == blob->parent_id) {
3291 			break;
3292 		}
3293 	}
3294 
3295 	if (*snapshot_entry != NULL) {
3296 		TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) {
3297 			if ((*clone_entry)->id == blob->id) {
3298 				break;
3299 			}
3300 		}
3301 
3302 		assert(*clone_entry != NULL);
3303 	}
3304 }
3305 
3306 static int
3307 bs_channel_create(void *io_device, void *ctx_buf)
3308 {
3309 	struct spdk_blob_store		*bs = io_device;
3310 	struct spdk_bs_channel		*channel = ctx_buf;
3311 	struct spdk_bs_dev		*dev;
3312 	uint32_t			max_ops = bs->max_channel_ops;
3313 	uint32_t			i;
3314 
3315 	dev = bs->dev;
3316 
3317 	channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set));
3318 	if (!channel->req_mem) {
3319 		return -1;
3320 	}
3321 
3322 	TAILQ_INIT(&channel->reqs);
3323 
3324 	for (i = 0; i < max_ops; i++) {
3325 		TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link);
3326 	}
3327 
3328 	channel->bs = bs;
3329 	channel->dev = dev;
3330 	channel->dev_channel = dev->create_channel(dev);
3331 
3332 	if (!channel->dev_channel) {
3333 		SPDK_ERRLOG("Failed to create device channel.\n");
3334 		free(channel->req_mem);
3335 		return -1;
3336 	}
3337 
3338 	channel->new_cluster_page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY,
3339 				    SPDK_MALLOC_DMA);
3340 	if (!channel->new_cluster_page) {
3341 		SPDK_ERRLOG("Failed to allocate new cluster page\n");
3342 		free(channel->req_mem);
3343 		channel->dev->destroy_channel(channel->dev, channel->dev_channel);
3344 		return -1;
3345 	}
3346 
3347 	TAILQ_INIT(&channel->need_cluster_alloc);
3348 	TAILQ_INIT(&channel->queued_io);
3349 	RB_INIT(&channel->esnap_channels);
3350 
3351 	return 0;
3352 }
3353 
3354 static void
3355 bs_channel_destroy(void *io_device, void *ctx_buf)
3356 {
3357 	struct spdk_bs_channel *channel = ctx_buf;
3358 	spdk_bs_user_op_t *op;
3359 
3360 	while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) {
3361 		op = TAILQ_FIRST(&channel->need_cluster_alloc);
3362 		TAILQ_REMOVE(&channel->need_cluster_alloc, op, link);
3363 		bs_user_op_abort(op, -EIO);
3364 	}
3365 
3366 	while (!TAILQ_EMPTY(&channel->queued_io)) {
3367 		op = TAILQ_FIRST(&channel->queued_io);
3368 		TAILQ_REMOVE(&channel->queued_io, op, link);
3369 		bs_user_op_abort(op, -EIO);
3370 	}
3371 
3372 	blob_esnap_destroy_bs_channel(channel);
3373 
3374 	free(channel->req_mem);
3375 	spdk_free(channel->new_cluster_page);
3376 	channel->dev->destroy_channel(channel->dev, channel->dev_channel);
3377 }
3378 
3379 static void
3380 bs_dev_destroy(void *io_device)
3381 {
3382 	struct spdk_blob_store *bs = io_device;
3383 	struct spdk_blob	*blob, *blob_tmp;
3384 
3385 	bs->dev->destroy(bs->dev);
3386 
3387 	RB_FOREACH_SAFE(blob, spdk_blob_tree, &bs->open_blobs, blob_tmp) {
3388 		RB_REMOVE(spdk_blob_tree, &bs->open_blobs, blob);
3389 		spdk_bit_array_clear(bs->open_blobids, blob->id);
3390 		blob_free(blob);
3391 	}
3392 
3393 	spdk_spin_destroy(&bs->used_lock);
3394 
3395 	spdk_bit_array_free(&bs->open_blobids);
3396 	spdk_bit_array_free(&bs->used_blobids);
3397 	spdk_bit_array_free(&bs->used_md_pages);
3398 	spdk_bit_pool_free(&bs->used_clusters);
3399 	/*
3400 	 * If this function is called for any reason except a successful unload,
3401 	 * the unload_cpl type will be NONE and this will be a nop.
3402 	 */
3403 	bs_call_cpl(&bs->unload_cpl, bs->unload_err);
3404 
3405 	free(bs);
3406 }
3407 
3408 static int
3409 bs_blob_list_add(struct spdk_blob *blob)
3410 {
3411 	spdk_blob_id snapshot_id;
3412 	struct spdk_blob_list *snapshot_entry = NULL;
3413 	struct spdk_blob_list *clone_entry = NULL;
3414 
3415 	assert(blob != NULL);
3416 
3417 	snapshot_id = blob->parent_id;
3418 	if (snapshot_id == SPDK_BLOBID_INVALID ||
3419 	    snapshot_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
3420 		return 0;
3421 	}
3422 
3423 	snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id);
3424 	if (snapshot_entry == NULL) {
3425 		/* Snapshot not found */
3426 		snapshot_entry = calloc(1, sizeof(struct spdk_blob_list));
3427 		if (snapshot_entry == NULL) {
3428 			return -ENOMEM;
3429 		}
3430 		snapshot_entry->id = snapshot_id;
3431 		TAILQ_INIT(&snapshot_entry->clones);
3432 		TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link);
3433 	} else {
3434 		TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
3435 			if (clone_entry->id == blob->id) {
3436 				break;
3437 			}
3438 		}
3439 	}
3440 
3441 	if (clone_entry == NULL) {
3442 		/* Clone not found */
3443 		clone_entry = calloc(1, sizeof(struct spdk_blob_list));
3444 		if (clone_entry == NULL) {
3445 			return -ENOMEM;
3446 		}
3447 		clone_entry->id = blob->id;
3448 		TAILQ_INIT(&clone_entry->clones);
3449 		TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link);
3450 		snapshot_entry->clone_count++;
3451 	}
3452 
3453 	return 0;
3454 }
3455 
3456 static void
3457 bs_blob_list_remove(struct spdk_blob *blob)
3458 {
3459 	struct spdk_blob_list *snapshot_entry = NULL;
3460 	struct spdk_blob_list *clone_entry = NULL;
3461 
3462 	blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry);
3463 
3464 	if (snapshot_entry == NULL) {
3465 		return;
3466 	}
3467 
3468 	blob->parent_id = SPDK_BLOBID_INVALID;
3469 	TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
3470 	free(clone_entry);
3471 
3472 	snapshot_entry->clone_count--;
3473 }
3474 
3475 static int
3476 bs_blob_list_free(struct spdk_blob_store *bs)
3477 {
3478 	struct spdk_blob_list *snapshot_entry;
3479 	struct spdk_blob_list *snapshot_entry_tmp;
3480 	struct spdk_blob_list *clone_entry;
3481 	struct spdk_blob_list *clone_entry_tmp;
3482 
3483 	TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) {
3484 		TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) {
3485 			TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
3486 			free(clone_entry);
3487 		}
3488 		TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link);
3489 		free(snapshot_entry);
3490 	}
3491 
3492 	return 0;
3493 }
3494 
3495 static void
3496 bs_free(struct spdk_blob_store *bs)
3497 {
3498 	bs_blob_list_free(bs);
3499 
3500 	bs_unregister_md_thread(bs);
3501 	spdk_io_device_unregister(bs, bs_dev_destroy);
3502 }
3503 
3504 void
3505 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size)
3506 {
3507 
3508 	if (!opts) {
3509 		SPDK_ERRLOG("opts should not be NULL\n");
3510 		return;
3511 	}
3512 
3513 	if (!opts_size) {
3514 		SPDK_ERRLOG("opts_size should not be zero value\n");
3515 		return;
3516 	}
3517 
3518 	memset(opts, 0, opts_size);
3519 	opts->opts_size = opts_size;
3520 
3521 #define FIELD_OK(field) \
3522 	offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size
3523 
3524 #define SET_FIELD(field, value) \
3525 	if (FIELD_OK(field)) { \
3526 		opts->field = value; \
3527 	} \
3528 
3529 	SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ);
3530 	SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES);
3531 	SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES);
3532 	SET_FIELD(max_channel_ops, SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS);
3533 	SET_FIELD(clear_method,  BS_CLEAR_WITH_UNMAP);
3534 
3535 	if (FIELD_OK(bstype)) {
3536 		memset(&opts->bstype, 0, sizeof(opts->bstype));
3537 	}
3538 
3539 	SET_FIELD(iter_cb_fn, NULL);
3540 	SET_FIELD(iter_cb_arg, NULL);
3541 	SET_FIELD(force_recover, false);
3542 	SET_FIELD(esnap_bs_dev_create, NULL);
3543 	SET_FIELD(esnap_ctx, NULL);
3544 
3545 #undef FIELD_OK
3546 #undef SET_FIELD
3547 }
3548 
3549 static int
3550 bs_opts_verify(struct spdk_bs_opts *opts)
3551 {
3552 	if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 ||
3553 	    opts->max_channel_ops == 0) {
3554 		SPDK_ERRLOG("Blobstore options cannot be set to 0\n");
3555 		return -1;
3556 	}
3557 
3558 	return 0;
3559 }
3560 
3561 /* START spdk_bs_load */
3562 
3563 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */
3564 
3565 struct spdk_bs_load_ctx {
3566 	struct spdk_blob_store		*bs;
3567 	struct spdk_bs_super_block	*super;
3568 
3569 	struct spdk_bs_md_mask		*mask;
3570 	bool				in_page_chain;
3571 	uint32_t			page_index;
3572 	uint32_t			cur_page;
3573 	struct spdk_blob_md_page	*page;
3574 
3575 	uint64_t			num_extent_pages;
3576 	uint32_t			*extent_page_num;
3577 	struct spdk_blob_md_page	*extent_pages;
3578 	struct spdk_bit_array		*used_clusters;
3579 
3580 	spdk_bs_sequence_t			*seq;
3581 	spdk_blob_op_with_handle_complete	iter_cb_fn;
3582 	void					*iter_cb_arg;
3583 	struct spdk_blob			*blob;
3584 	spdk_blob_id				blobid;
3585 
3586 	bool					force_recover;
3587 
3588 	/* These fields are used in the spdk_bs_dump path. */
3589 	bool					dumping;
3590 	FILE					*fp;
3591 	spdk_bs_dump_print_xattr		print_xattr_fn;
3592 	char					xattr_name[4096];
3593 };
3594 
3595 static int
3596 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs,
3597 	 struct spdk_bs_load_ctx **_ctx)
3598 {
3599 	struct spdk_blob_store	*bs;
3600 	struct spdk_bs_load_ctx	*ctx;
3601 	uint64_t dev_size;
3602 	int rc;
3603 
3604 	dev_size = dev->blocklen * dev->blockcnt;
3605 	if (dev_size < opts->cluster_sz) {
3606 		/* Device size cannot be smaller than cluster size of blobstore */
3607 		SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n",
3608 			     dev_size, opts->cluster_sz);
3609 		return -ENOSPC;
3610 	}
3611 	if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) {
3612 		/* Cluster size cannot be smaller than page size */
3613 		SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n",
3614 			    opts->cluster_sz, SPDK_BS_PAGE_SIZE);
3615 		return -EINVAL;
3616 	}
3617 	bs = calloc(1, sizeof(struct spdk_blob_store));
3618 	if (!bs) {
3619 		return -ENOMEM;
3620 	}
3621 
3622 	ctx = calloc(1, sizeof(struct spdk_bs_load_ctx));
3623 	if (!ctx) {
3624 		free(bs);
3625 		return -ENOMEM;
3626 	}
3627 
3628 	ctx->bs = bs;
3629 	ctx->iter_cb_fn = opts->iter_cb_fn;
3630 	ctx->iter_cb_arg = opts->iter_cb_arg;
3631 	ctx->force_recover = opts->force_recover;
3632 
3633 	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
3634 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3635 	if (!ctx->super) {
3636 		free(ctx);
3637 		free(bs);
3638 		return -ENOMEM;
3639 	}
3640 
3641 	RB_INIT(&bs->open_blobs);
3642 	TAILQ_INIT(&bs->snapshots);
3643 	bs->dev = dev;
3644 	bs->md_thread = spdk_get_thread();
3645 	assert(bs->md_thread != NULL);
3646 
3647 	/*
3648 	 * Do not use bs_lba_to_cluster() here since blockcnt may not be an
3649 	 *  even multiple of the cluster size.
3650 	 */
3651 	bs->cluster_sz = opts->cluster_sz;
3652 	bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen);
3653 	ctx->used_clusters = spdk_bit_array_create(bs->total_clusters);
3654 	if (!ctx->used_clusters) {
3655 		spdk_free(ctx->super);
3656 		free(ctx);
3657 		free(bs);
3658 		return -ENOMEM;
3659 	}
3660 
3661 	bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE;
3662 	if (spdk_u32_is_pow2(bs->pages_per_cluster)) {
3663 		bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster);
3664 	}
3665 	bs->num_free_clusters = bs->total_clusters;
3666 	bs->io_unit_size = dev->blocklen;
3667 
3668 	bs->max_channel_ops = opts->max_channel_ops;
3669 	bs->super_blob = SPDK_BLOBID_INVALID;
3670 	memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype));
3671 	bs->esnap_bs_dev_create = opts->esnap_bs_dev_create;
3672 	bs->esnap_ctx = opts->esnap_ctx;
3673 
3674 	/* The metadata is assumed to be at least 1 page */
3675 	bs->used_md_pages = spdk_bit_array_create(1);
3676 	bs->used_blobids = spdk_bit_array_create(0);
3677 	bs->open_blobids = spdk_bit_array_create(0);
3678 
3679 	spdk_spin_init(&bs->used_lock);
3680 
3681 	spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy,
3682 				sizeof(struct spdk_bs_channel), "blobstore");
3683 	rc = bs_register_md_thread(bs);
3684 	if (rc == -1) {
3685 		spdk_io_device_unregister(bs, NULL);
3686 		spdk_spin_destroy(&bs->used_lock);
3687 		spdk_bit_array_free(&bs->open_blobids);
3688 		spdk_bit_array_free(&bs->used_blobids);
3689 		spdk_bit_array_free(&bs->used_md_pages);
3690 		spdk_bit_array_free(&ctx->used_clusters);
3691 		spdk_free(ctx->super);
3692 		free(ctx);
3693 		free(bs);
3694 		/* FIXME: this is a lie but don't know how to get a proper error code here */
3695 		return -ENOMEM;
3696 	}
3697 
3698 	*_ctx = ctx;
3699 	*_bs = bs;
3700 	return 0;
3701 }
3702 
3703 static void
3704 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno)
3705 {
3706 	assert(bserrno != 0);
3707 
3708 	spdk_free(ctx->super);
3709 	bs_sequence_finish(ctx->seq, bserrno);
3710 	bs_free(ctx->bs);
3711 	spdk_bit_array_free(&ctx->used_clusters);
3712 	free(ctx);
3713 }
3714 
3715 static void
3716 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
3717 	       struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
3718 {
3719 	/* Update the values in the super block */
3720 	super->super_blob = bs->super_blob;
3721 	memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype));
3722 	super->crc = blob_md_page_calc_crc(super);
3723 	bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0),
3724 			      bs_byte_to_lba(bs, sizeof(*super)),
3725 			      cb_fn, cb_arg);
3726 }
3727 
3728 static void
3729 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3730 {
3731 	struct spdk_bs_load_ctx	*ctx = arg;
3732 	uint64_t	mask_size, lba, lba_count;
3733 
3734 	/* Write out the used clusters mask */
3735 	mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
3736 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3737 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3738 	if (!ctx->mask) {
3739 		bs_load_ctx_fail(ctx, -ENOMEM);
3740 		return;
3741 	}
3742 
3743 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS;
3744 	ctx->mask->length = ctx->bs->total_clusters;
3745 	/* We could get here through the normal unload path, or through dirty
3746 	 * shutdown recovery.  For the normal unload path, we use the mask from
3747 	 * the bit pool.  For dirty shutdown recovery, we don't have a bit pool yet -
3748 	 * only the bit array from the load ctx.
3749 	 */
3750 	if (ctx->bs->used_clusters) {
3751 		assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters));
3752 		spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask);
3753 	} else {
3754 		assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters));
3755 		spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask);
3756 	}
3757 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
3758 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
3759 	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3760 }
3761 
3762 static void
3763 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3764 {
3765 	struct spdk_bs_load_ctx	*ctx = arg;
3766 	uint64_t	mask_size, lba, lba_count;
3767 
3768 	mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
3769 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3770 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3771 	if (!ctx->mask) {
3772 		bs_load_ctx_fail(ctx, -ENOMEM);
3773 		return;
3774 	}
3775 
3776 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES;
3777 	ctx->mask->length = ctx->super->md_len;
3778 	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages));
3779 
3780 	spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask);
3781 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
3782 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
3783 	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3784 }
3785 
3786 static void
3787 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3788 {
3789 	struct spdk_bs_load_ctx	*ctx = arg;
3790 	uint64_t	mask_size, lba, lba_count;
3791 
3792 	if (ctx->super->used_blobid_mask_len == 0) {
3793 		/*
3794 		 * This is a pre-v3 on-disk format where the blobid mask does not get
3795 		 *  written to disk.
3796 		 */
3797 		cb_fn(seq, arg, 0);
3798 		return;
3799 	}
3800 
3801 	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
3802 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
3803 				 SPDK_MALLOC_DMA);
3804 	if (!ctx->mask) {
3805 		bs_load_ctx_fail(ctx, -ENOMEM);
3806 		return;
3807 	}
3808 
3809 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS;
3810 	ctx->mask->length = ctx->super->md_len;
3811 	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids));
3812 
3813 	spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask);
3814 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
3815 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
3816 	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3817 }
3818 
3819 static void
3820 blob_set_thin_provision(struct spdk_blob *blob)
3821 {
3822 	blob_verify_md_op(blob);
3823 	blob->invalid_flags |= SPDK_BLOB_THIN_PROV;
3824 	blob->state = SPDK_BLOB_STATE_DIRTY;
3825 }
3826 
3827 static void
3828 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method)
3829 {
3830 	blob_verify_md_op(blob);
3831 	blob->clear_method = clear_method;
3832 	blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT);
3833 	blob->state = SPDK_BLOB_STATE_DIRTY;
3834 }
3835 
3836 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno);
3837 
3838 static void
3839 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno)
3840 {
3841 	struct spdk_bs_load_ctx *ctx = cb_arg;
3842 	spdk_blob_id id;
3843 	int64_t page_num;
3844 
3845 	/* Iterate to next blob (we can't use spdk_bs_iter_next function as our
3846 	 * last blob has been removed */
3847 	page_num = bs_blobid_to_page(ctx->blobid);
3848 	page_num++;
3849 	page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num);
3850 	if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) {
3851 		bs_load_iter(ctx, NULL, -ENOENT);
3852 		return;
3853 	}
3854 
3855 	id = bs_page_to_blobid(page_num);
3856 
3857 	spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx);
3858 }
3859 
3860 static void
3861 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno)
3862 {
3863 	struct spdk_bs_load_ctx *ctx = cb_arg;
3864 
3865 	if (bserrno != 0) {
3866 		SPDK_ERRLOG("Failed to close corrupted blob\n");
3867 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3868 		return;
3869 	}
3870 
3871 	spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx);
3872 }
3873 
3874 static void
3875 bs_delete_corrupted_blob(void *cb_arg, int bserrno)
3876 {
3877 	struct spdk_bs_load_ctx *ctx = cb_arg;
3878 	uint64_t i;
3879 
3880 	if (bserrno != 0) {
3881 		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
3882 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3883 		return;
3884 	}
3885 
3886 	/* Snapshot and clone have the same copy of cluster map and extent pages
3887 	 * at this point. Let's clear both for snapshot now,
3888 	 * so that it won't be cleared for clone later when we remove snapshot.
3889 	 * Also set thin provision to pass data corruption check */
3890 	for (i = 0; i < ctx->blob->active.num_clusters; i++) {
3891 		ctx->blob->active.clusters[i] = 0;
3892 	}
3893 	for (i = 0; i < ctx->blob->active.num_extent_pages; i++) {
3894 		ctx->blob->active.extent_pages[i] = 0;
3895 	}
3896 
3897 	ctx->blob->md_ro = false;
3898 
3899 	blob_set_thin_provision(ctx->blob);
3900 
3901 	ctx->blobid = ctx->blob->id;
3902 
3903 	spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx);
3904 }
3905 
3906 static void
3907 bs_update_corrupted_blob(void *cb_arg, int bserrno)
3908 {
3909 	struct spdk_bs_load_ctx *ctx = cb_arg;
3910 
3911 	if (bserrno != 0) {
3912 		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
3913 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3914 		return;
3915 	}
3916 
3917 	ctx->blob->md_ro = false;
3918 	blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true);
3919 	blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true);
3920 	spdk_blob_set_read_only(ctx->blob);
3921 
3922 	if (ctx->iter_cb_fn) {
3923 		ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0);
3924 	}
3925 	bs_blob_list_add(ctx->blob);
3926 
3927 	spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3928 }
3929 
3930 static void
3931 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno)
3932 {
3933 	struct spdk_bs_load_ctx *ctx = cb_arg;
3934 
3935 	if (bserrno != 0) {
3936 		SPDK_ERRLOG("Failed to open clone of a corrupted blob\n");
3937 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3938 		return;
3939 	}
3940 
3941 	if (blob->parent_id == ctx->blob->id) {
3942 		/* Power failure occurred before updating clone (snapshot delete case)
3943 		 * or after updating clone (creating snapshot case) - keep snapshot */
3944 		spdk_blob_close(blob, bs_update_corrupted_blob, ctx);
3945 	} else {
3946 		/* Power failure occurred after updating clone (snapshot delete case)
3947 		 * or before updating clone (creating snapshot case) - remove snapshot */
3948 		spdk_blob_close(blob, bs_delete_corrupted_blob, ctx);
3949 	}
3950 }
3951 
3952 static void
3953 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno)
3954 {
3955 	struct spdk_bs_load_ctx *ctx = arg;
3956 	const void *value;
3957 	size_t len;
3958 	int rc = 0;
3959 
3960 	if (bserrno == 0) {
3961 		/* Examine blob if it is corrupted after power failure. Fix
3962 		 * the ones that can be fixed and remove any other corrupted
3963 		 * ones. If it is not corrupted just process it */
3964 		rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true);
3965 		if (rc != 0) {
3966 			rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true);
3967 			if (rc != 0) {
3968 				/* Not corrupted - process it and continue with iterating through blobs */
3969 				if (ctx->iter_cb_fn) {
3970 					ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0);
3971 				}
3972 				bs_blob_list_add(blob);
3973 				spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx);
3974 				return;
3975 			}
3976 
3977 		}
3978 
3979 		assert(len == sizeof(spdk_blob_id));
3980 
3981 		ctx->blob = blob;
3982 
3983 		/* Open clone to check if we are able to fix this blob or should we remove it */
3984 		spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx);
3985 		return;
3986 	} else if (bserrno == -ENOENT) {
3987 		bserrno = 0;
3988 	} else {
3989 		/*
3990 		 * This case needs to be looked at further.  Same problem
3991 		 *  exists with applications that rely on explicit blob
3992 		 *  iteration.  We should just skip the blob that failed
3993 		 *  to load and continue on to the next one.
3994 		 */
3995 		SPDK_ERRLOG("Error in iterating blobs\n");
3996 	}
3997 
3998 	ctx->iter_cb_fn = NULL;
3999 
4000 	spdk_free(ctx->super);
4001 	spdk_free(ctx->mask);
4002 	bs_sequence_finish(ctx->seq, bserrno);
4003 	free(ctx);
4004 }
4005 
4006 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg);
4007 
4008 static void
4009 bs_load_complete(struct spdk_bs_load_ctx *ctx)
4010 {
4011 	ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters);
4012 	if (ctx->dumping) {
4013 		bs_dump_read_md_page(ctx->seq, ctx);
4014 		return;
4015 	}
4016 	spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx);
4017 }
4018 
4019 static void
4020 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4021 {
4022 	struct spdk_bs_load_ctx *ctx = cb_arg;
4023 	int rc;
4024 
4025 	/* The type must be correct */
4026 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS);
4027 
4028 	/* The length of the mask (in bits) must not be greater than
4029 	 * the length of the buffer (converted to bits) */
4030 	assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8));
4031 
4032 	/* The length of the mask must be exactly equal to the size
4033 	 * (in pages) of the metadata region */
4034 	assert(ctx->mask->length == ctx->super->md_len);
4035 
4036 	rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length);
4037 	if (rc < 0) {
4038 		spdk_free(ctx->mask);
4039 		bs_load_ctx_fail(ctx, rc);
4040 		return;
4041 	}
4042 
4043 	spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask);
4044 	bs_load_complete(ctx);
4045 }
4046 
4047 static void
4048 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4049 {
4050 	struct spdk_bs_load_ctx *ctx = cb_arg;
4051 	uint64_t		lba, lba_count, mask_size;
4052 	int			rc;
4053 
4054 	if (bserrno != 0) {
4055 		bs_load_ctx_fail(ctx, bserrno);
4056 		return;
4057 	}
4058 
4059 	/* The type must be correct */
4060 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
4061 	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
4062 	assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
4063 					     struct spdk_blob_md_page) * 8));
4064 	/*
4065 	 * The length of the mask must be equal to or larger than the total number of clusters. It may be
4066 	 * larger than the total number of clusters due to a failure spdk_bs_grow.
4067 	 */
4068 	assert(ctx->mask->length >= ctx->bs->total_clusters);
4069 	if (ctx->mask->length > ctx->bs->total_clusters) {
4070 		SPDK_WARNLOG("Shrink the used_custers mask length to total_clusters");
4071 		ctx->mask->length = ctx->bs->total_clusters;
4072 	}
4073 
4074 	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length);
4075 	if (rc < 0) {
4076 		spdk_free(ctx->mask);
4077 		bs_load_ctx_fail(ctx, rc);
4078 		return;
4079 	}
4080 
4081 	spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask);
4082 	ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters);
4083 	assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters);
4084 
4085 	spdk_free(ctx->mask);
4086 
4087 	/* Read the used blobids mask */
4088 	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
4089 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
4090 				 SPDK_MALLOC_DMA);
4091 	if (!ctx->mask) {
4092 		bs_load_ctx_fail(ctx, -ENOMEM);
4093 		return;
4094 	}
4095 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
4096 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
4097 	bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
4098 			     bs_load_used_blobids_cpl, ctx);
4099 }
4100 
4101 static void
4102 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4103 {
4104 	struct spdk_bs_load_ctx *ctx = cb_arg;
4105 	uint64_t		lba, lba_count, mask_size;
4106 	int			rc;
4107 
4108 	if (bserrno != 0) {
4109 		bs_load_ctx_fail(ctx, bserrno);
4110 		return;
4111 	}
4112 
4113 	/* The type must be correct */
4114 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES);
4115 	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
4116 	assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE *
4117 				     8));
4118 	/* The length of the mask must be exactly equal to the size (in pages) of the metadata region */
4119 	if (ctx->mask->length != ctx->super->md_len) {
4120 		SPDK_ERRLOG("mismatched md_len in used_pages mask: "
4121 			    "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n",
4122 			    ctx->mask->length, ctx->super->md_len);
4123 		assert(false);
4124 	}
4125 
4126 	rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length);
4127 	if (rc < 0) {
4128 		spdk_free(ctx->mask);
4129 		bs_load_ctx_fail(ctx, rc);
4130 		return;
4131 	}
4132 
4133 	spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask);
4134 	spdk_free(ctx->mask);
4135 
4136 	/* Read the used clusters mask */
4137 	mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
4138 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
4139 				 SPDK_MALLOC_DMA);
4140 	if (!ctx->mask) {
4141 		bs_load_ctx_fail(ctx, -ENOMEM);
4142 		return;
4143 	}
4144 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
4145 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
4146 	bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
4147 			     bs_load_used_clusters_cpl, ctx);
4148 }
4149 
4150 static void
4151 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx)
4152 {
4153 	uint64_t lba, lba_count, mask_size;
4154 
4155 	/* Read the used pages mask */
4156 	mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
4157 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
4158 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
4159 	if (!ctx->mask) {
4160 		bs_load_ctx_fail(ctx, -ENOMEM);
4161 		return;
4162 	}
4163 
4164 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
4165 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
4166 	bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
4167 			     bs_load_used_pages_cpl, ctx);
4168 }
4169 
4170 static int
4171 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page)
4172 {
4173 	struct spdk_blob_store *bs = ctx->bs;
4174 	struct spdk_blob_md_descriptor *desc;
4175 	size_t	cur_desc = 0;
4176 
4177 	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
4178 	while (cur_desc < sizeof(page->descriptors)) {
4179 		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
4180 			if (desc->length == 0) {
4181 				/* If padding and length are 0, this terminates the page */
4182 				break;
4183 			}
4184 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
4185 			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
4186 			unsigned int				i, j;
4187 			unsigned int				cluster_count = 0;
4188 			uint32_t				cluster_idx;
4189 
4190 			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
4191 
4192 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
4193 				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
4194 					cluster_idx = desc_extent_rle->extents[i].cluster_idx;
4195 					/*
4196 					 * cluster_idx = 0 means an unallocated cluster - don't mark that
4197 					 * in the used cluster map.
4198 					 */
4199 					if (cluster_idx != 0) {
4200 						SPDK_NOTICELOG("Recover: cluster %" PRIu32 "\n", cluster_idx + j);
4201 						spdk_bit_array_set(ctx->used_clusters, cluster_idx + j);
4202 						if (bs->num_free_clusters == 0) {
4203 							return -ENOSPC;
4204 						}
4205 						bs->num_free_clusters--;
4206 					}
4207 					cluster_count++;
4208 				}
4209 			}
4210 			if (cluster_count == 0) {
4211 				return -EINVAL;
4212 			}
4213 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
4214 			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
4215 			uint32_t					i;
4216 			uint32_t					cluster_count = 0;
4217 			uint32_t					cluster_idx;
4218 			size_t						cluster_idx_length;
4219 
4220 			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
4221 			cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
4222 
4223 			if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
4224 			    (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
4225 				return -EINVAL;
4226 			}
4227 
4228 			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
4229 				cluster_idx = desc_extent->cluster_idx[i];
4230 				/*
4231 				 * cluster_idx = 0 means an unallocated cluster - don't mark that
4232 				 * in the used cluster map.
4233 				 */
4234 				if (cluster_idx != 0) {
4235 					if (cluster_idx < desc_extent->start_cluster_idx &&
4236 					    cluster_idx >= desc_extent->start_cluster_idx + cluster_count) {
4237 						return -EINVAL;
4238 					}
4239 					spdk_bit_array_set(ctx->used_clusters, cluster_idx);
4240 					if (bs->num_free_clusters == 0) {
4241 						return -ENOSPC;
4242 					}
4243 					bs->num_free_clusters--;
4244 				}
4245 				cluster_count++;
4246 			}
4247 
4248 			if (cluster_count == 0) {
4249 				return -EINVAL;
4250 			}
4251 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
4252 			/* Skip this item */
4253 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
4254 			/* Skip this item */
4255 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
4256 			/* Skip this item */
4257 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
4258 			struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
4259 			uint32_t num_extent_pages = ctx->num_extent_pages;
4260 			uint32_t i;
4261 			size_t extent_pages_length;
4262 			void *tmp;
4263 
4264 			desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
4265 			extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
4266 
4267 			if (desc_extent_table->length == 0 ||
4268 			    (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
4269 				return -EINVAL;
4270 			}
4271 
4272 			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
4273 				if (desc_extent_table->extent_page[i].page_idx != 0) {
4274 					if (desc_extent_table->extent_page[i].num_pages != 1) {
4275 						return -EINVAL;
4276 					}
4277 					num_extent_pages += 1;
4278 				}
4279 			}
4280 
4281 			if (num_extent_pages > 0) {
4282 				tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t));
4283 				if (tmp == NULL) {
4284 					return -ENOMEM;
4285 				}
4286 				ctx->extent_page_num = tmp;
4287 
4288 				/* Extent table entries contain md page numbers for extent pages.
4289 				 * Zeroes represent unallocated extent pages, those are run-length-encoded.
4290 				 */
4291 				for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
4292 					if (desc_extent_table->extent_page[i].page_idx != 0) {
4293 						ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx;
4294 						ctx->num_extent_pages += 1;
4295 					}
4296 				}
4297 			}
4298 		} else {
4299 			/* Error */
4300 			return -EINVAL;
4301 		}
4302 		/* Advance to the next descriptor */
4303 		cur_desc += sizeof(*desc) + desc->length;
4304 		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
4305 			break;
4306 		}
4307 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
4308 	}
4309 	return 0;
4310 }
4311 
4312 static bool
4313 bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page)
4314 {
4315 	uint32_t crc;
4316 	struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors;
4317 	size_t desc_len;
4318 
4319 	crc = blob_md_page_calc_crc(page);
4320 	if (crc != page->crc) {
4321 		return false;
4322 	}
4323 
4324 	/* Extent page should always be of sequence num 0. */
4325 	if (page->sequence_num != 0) {
4326 		return false;
4327 	}
4328 
4329 	/* Descriptor type must be EXTENT_PAGE. */
4330 	if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
4331 		return false;
4332 	}
4333 
4334 	/* Descriptor length cannot exceed the page. */
4335 	desc_len = sizeof(*desc) + desc->length;
4336 	if (desc_len > sizeof(page->descriptors)) {
4337 		return false;
4338 	}
4339 
4340 	/* It has to be the only descriptor in the page. */
4341 	if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) {
4342 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len);
4343 		if (desc->length != 0) {
4344 			return false;
4345 		}
4346 	}
4347 
4348 	return true;
4349 }
4350 
4351 static bool
4352 bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx)
4353 {
4354 	uint32_t crc;
4355 	struct spdk_blob_md_page *page = ctx->page;
4356 
4357 	crc = blob_md_page_calc_crc(page);
4358 	if (crc != page->crc) {
4359 		return false;
4360 	}
4361 
4362 	/* First page of a sequence should match the blobid. */
4363 	if (page->sequence_num == 0 &&
4364 	    bs_page_to_blobid(ctx->cur_page) != page->id) {
4365 		return false;
4366 	}
4367 	assert(bs_load_cur_extent_page_valid(page) == false);
4368 
4369 	return true;
4370 }
4371 
4372 static void bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx);
4373 
4374 static void
4375 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4376 {
4377 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4378 
4379 	if (bserrno != 0) {
4380 		bs_load_ctx_fail(ctx, bserrno);
4381 		return;
4382 	}
4383 
4384 	bs_load_complete(ctx);
4385 }
4386 
4387 static void
4388 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4389 {
4390 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4391 
4392 	spdk_free(ctx->mask);
4393 	ctx->mask = NULL;
4394 
4395 	if (bserrno != 0) {
4396 		bs_load_ctx_fail(ctx, bserrno);
4397 		return;
4398 	}
4399 
4400 	bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl);
4401 }
4402 
4403 static void
4404 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4405 {
4406 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4407 
4408 	spdk_free(ctx->mask);
4409 	ctx->mask = NULL;
4410 
4411 	if (bserrno != 0) {
4412 		bs_load_ctx_fail(ctx, bserrno);
4413 		return;
4414 	}
4415 
4416 	bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl);
4417 }
4418 
4419 static void
4420 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx)
4421 {
4422 	bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl);
4423 }
4424 
4425 static void
4426 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx)
4427 {
4428 	uint64_t num_md_clusters;
4429 	uint64_t i;
4430 
4431 	ctx->in_page_chain = false;
4432 
4433 	do {
4434 		ctx->page_index++;
4435 	} while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true);
4436 
4437 	if (ctx->page_index < ctx->super->md_len) {
4438 		ctx->cur_page = ctx->page_index;
4439 		bs_load_replay_cur_md_page(ctx);
4440 	} else {
4441 		/* Claim all of the clusters used by the metadata */
4442 		num_md_clusters = spdk_divide_round_up(
4443 					  ctx->super->md_start + ctx->super->md_len, ctx->bs->pages_per_cluster);
4444 		for (i = 0; i < num_md_clusters; i++) {
4445 			spdk_bit_array_set(ctx->used_clusters, i);
4446 		}
4447 		ctx->bs->num_free_clusters -= num_md_clusters;
4448 		spdk_free(ctx->page);
4449 		bs_load_write_used_md(ctx);
4450 	}
4451 }
4452 
4453 static void
4454 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4455 {
4456 	struct spdk_bs_load_ctx *ctx = cb_arg;
4457 	uint32_t page_num;
4458 	uint64_t i;
4459 
4460 	if (bserrno != 0) {
4461 		spdk_free(ctx->extent_pages);
4462 		bs_load_ctx_fail(ctx, bserrno);
4463 		return;
4464 	}
4465 
4466 	for (i = 0; i < ctx->num_extent_pages; i++) {
4467 		/* Extent pages are only read when present within in chain md.
4468 		 * Integrity of md is not right if that page was not a valid extent page. */
4469 		if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) {
4470 			spdk_free(ctx->extent_pages);
4471 			bs_load_ctx_fail(ctx, -EILSEQ);
4472 			return;
4473 		}
4474 
4475 		page_num = ctx->extent_page_num[i];
4476 		spdk_bit_array_set(ctx->bs->used_md_pages, page_num);
4477 		if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) {
4478 			spdk_free(ctx->extent_pages);
4479 			bs_load_ctx_fail(ctx, -EILSEQ);
4480 			return;
4481 		}
4482 	}
4483 
4484 	spdk_free(ctx->extent_pages);
4485 	free(ctx->extent_page_num);
4486 	ctx->extent_page_num = NULL;
4487 	ctx->num_extent_pages = 0;
4488 
4489 	bs_load_replay_md_chain_cpl(ctx);
4490 }
4491 
4492 static void
4493 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx)
4494 {
4495 	spdk_bs_batch_t *batch;
4496 	uint32_t page;
4497 	uint64_t lba;
4498 	uint64_t i;
4499 
4500 	ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0,
4501 					 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
4502 	if (!ctx->extent_pages) {
4503 		bs_load_ctx_fail(ctx, -ENOMEM);
4504 		return;
4505 	}
4506 
4507 	batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx);
4508 
4509 	for (i = 0; i < ctx->num_extent_pages; i++) {
4510 		page = ctx->extent_page_num[i];
4511 		assert(page < ctx->super->md_len);
4512 		lba = bs_md_page_to_lba(ctx->bs, page);
4513 		bs_batch_read_dev(batch, &ctx->extent_pages[i], lba,
4514 				  bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE));
4515 	}
4516 
4517 	bs_batch_close(batch);
4518 }
4519 
4520 static void
4521 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4522 {
4523 	struct spdk_bs_load_ctx *ctx = cb_arg;
4524 	uint32_t page_num;
4525 	struct spdk_blob_md_page *page;
4526 
4527 	if (bserrno != 0) {
4528 		bs_load_ctx_fail(ctx, bserrno);
4529 		return;
4530 	}
4531 
4532 	page_num = ctx->cur_page;
4533 	page = ctx->page;
4534 	if (bs_load_cur_md_page_valid(ctx) == true) {
4535 		if (page->sequence_num == 0 || ctx->in_page_chain == true) {
4536 			spdk_spin_lock(&ctx->bs->used_lock);
4537 			bs_claim_md_page(ctx->bs, page_num);
4538 			spdk_spin_unlock(&ctx->bs->used_lock);
4539 			if (page->sequence_num == 0) {
4540 				SPDK_NOTICELOG("Recover: blob 0x%" PRIx32 "\n", page_num);
4541 				spdk_bit_array_set(ctx->bs->used_blobids, page_num);
4542 			}
4543 			if (bs_load_replay_md_parse_page(ctx, page)) {
4544 				bs_load_ctx_fail(ctx, -EILSEQ);
4545 				return;
4546 			}
4547 			if (page->next != SPDK_INVALID_MD_PAGE) {
4548 				ctx->in_page_chain = true;
4549 				ctx->cur_page = page->next;
4550 				bs_load_replay_cur_md_page(ctx);
4551 				return;
4552 			}
4553 			if (ctx->num_extent_pages != 0) {
4554 				bs_load_replay_extent_pages(ctx);
4555 				return;
4556 			}
4557 		}
4558 	}
4559 	bs_load_replay_md_chain_cpl(ctx);
4560 }
4561 
4562 static void
4563 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx)
4564 {
4565 	uint64_t lba;
4566 
4567 	assert(ctx->cur_page < ctx->super->md_len);
4568 	lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page);
4569 	bs_sequence_read_dev(ctx->seq, ctx->page, lba,
4570 			     bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
4571 			     bs_load_replay_md_cpl, ctx);
4572 }
4573 
4574 static void
4575 bs_load_replay_md(struct spdk_bs_load_ctx *ctx)
4576 {
4577 	ctx->page_index = 0;
4578 	ctx->cur_page = 0;
4579 	ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
4580 				 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
4581 	if (!ctx->page) {
4582 		bs_load_ctx_fail(ctx, -ENOMEM);
4583 		return;
4584 	}
4585 	bs_load_replay_cur_md_page(ctx);
4586 }
4587 
4588 static void
4589 bs_recover(struct spdk_bs_load_ctx *ctx)
4590 {
4591 	int		rc;
4592 
4593 	SPDK_NOTICELOG("Performing recovery on blobstore\n");
4594 	rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len);
4595 	if (rc < 0) {
4596 		bs_load_ctx_fail(ctx, -ENOMEM);
4597 		return;
4598 	}
4599 
4600 	rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len);
4601 	if (rc < 0) {
4602 		bs_load_ctx_fail(ctx, -ENOMEM);
4603 		return;
4604 	}
4605 
4606 	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
4607 	if (rc < 0) {
4608 		bs_load_ctx_fail(ctx, -ENOMEM);
4609 		return;
4610 	}
4611 
4612 	rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len);
4613 	if (rc < 0) {
4614 		bs_load_ctx_fail(ctx, -ENOMEM);
4615 		return;
4616 	}
4617 
4618 	ctx->bs->num_free_clusters = ctx->bs->total_clusters;
4619 	bs_load_replay_md(ctx);
4620 }
4621 
4622 static int
4623 bs_parse_super(struct spdk_bs_load_ctx *ctx)
4624 {
4625 	int rc;
4626 
4627 	if (ctx->super->size == 0) {
4628 		ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
4629 	}
4630 
4631 	if (ctx->super->io_unit_size == 0) {
4632 		ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
4633 	}
4634 
4635 	ctx->bs->clean = 1;
4636 	ctx->bs->cluster_sz = ctx->super->cluster_size;
4637 	ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
4638 	ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE;
4639 	if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) {
4640 		ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster);
4641 	}
4642 	ctx->bs->io_unit_size = ctx->super->io_unit_size;
4643 	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
4644 	if (rc < 0) {
4645 		return -ENOMEM;
4646 	}
4647 	ctx->bs->md_start = ctx->super->md_start;
4648 	ctx->bs->md_len = ctx->super->md_len;
4649 	rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len);
4650 	if (rc < 0) {
4651 		return -ENOMEM;
4652 	}
4653 
4654 	ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
4655 					       ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
4656 	ctx->bs->super_blob = ctx->super->super_blob;
4657 	memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
4658 
4659 	return 0;
4660 }
4661 
4662 static void
4663 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4664 {
4665 	struct spdk_bs_load_ctx *ctx = cb_arg;
4666 	uint32_t	crc;
4667 	int		rc;
4668 	static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
4669 
4670 	if (ctx->super->version > SPDK_BS_VERSION ||
4671 	    ctx->super->version < SPDK_BS_INITIAL_VERSION) {
4672 		bs_load_ctx_fail(ctx, -EILSEQ);
4673 		return;
4674 	}
4675 
4676 	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
4677 		   sizeof(ctx->super->signature)) != 0) {
4678 		bs_load_ctx_fail(ctx, -EILSEQ);
4679 		return;
4680 	}
4681 
4682 	crc = blob_md_page_calc_crc(ctx->super);
4683 	if (crc != ctx->super->crc) {
4684 		bs_load_ctx_fail(ctx, -EILSEQ);
4685 		return;
4686 	}
4687 
4688 	if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
4689 		SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n");
4690 	} else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
4691 		SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n");
4692 	} else {
4693 		SPDK_DEBUGLOG(blob, "Unexpected bstype\n");
4694 		SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
4695 		SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
4696 		bs_load_ctx_fail(ctx, -ENXIO);
4697 		return;
4698 	}
4699 
4700 	if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
4701 		SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
4702 			       ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
4703 		bs_load_ctx_fail(ctx, -EILSEQ);
4704 		return;
4705 	}
4706 
4707 	rc = bs_parse_super(ctx);
4708 	if (rc < 0) {
4709 		bs_load_ctx_fail(ctx, rc);
4710 		return;
4711 	}
4712 
4713 	if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0 || ctx->force_recover) {
4714 		bs_recover(ctx);
4715 	} else {
4716 		bs_load_read_used_pages(ctx);
4717 	}
4718 }
4719 
4720 static inline int
4721 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst)
4722 {
4723 
4724 	if (!src->opts_size) {
4725 		SPDK_ERRLOG("opts_size should not be zero value\n");
4726 		return -1;
4727 	}
4728 
4729 #define FIELD_OK(field) \
4730         offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size
4731 
4732 #define SET_FIELD(field) \
4733         if (FIELD_OK(field)) { \
4734                 dst->field = src->field; \
4735         } \
4736 
4737 	SET_FIELD(cluster_sz);
4738 	SET_FIELD(num_md_pages);
4739 	SET_FIELD(max_md_ops);
4740 	SET_FIELD(max_channel_ops);
4741 	SET_FIELD(clear_method);
4742 
4743 	if (FIELD_OK(bstype)) {
4744 		memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype));
4745 	}
4746 	SET_FIELD(iter_cb_fn);
4747 	SET_FIELD(iter_cb_arg);
4748 	SET_FIELD(force_recover);
4749 	SET_FIELD(esnap_bs_dev_create);
4750 	SET_FIELD(esnap_ctx);
4751 
4752 	dst->opts_size = src->opts_size;
4753 
4754 	/* You should not remove this statement, but need to update the assert statement
4755 	 * if you add a new field, and also add a corresponding SET_FIELD statement */
4756 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 88, "Incorrect size");
4757 
4758 #undef FIELD_OK
4759 #undef SET_FIELD
4760 
4761 	return 0;
4762 }
4763 
4764 void
4765 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
4766 	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
4767 {
4768 	struct spdk_blob_store	*bs;
4769 	struct spdk_bs_cpl	cpl;
4770 	struct spdk_bs_load_ctx *ctx;
4771 	struct spdk_bs_opts	opts = {};
4772 	int err;
4773 
4774 	SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);
4775 
4776 	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
4777 		SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
4778 		dev->destroy(dev);
4779 		cb_fn(cb_arg, NULL, -EINVAL);
4780 		return;
4781 	}
4782 
4783 	spdk_bs_opts_init(&opts, sizeof(opts));
4784 	if (o) {
4785 		if (bs_opts_copy(o, &opts)) {
4786 			return;
4787 		}
4788 	}
4789 
4790 	if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
4791 		dev->destroy(dev);
4792 		cb_fn(cb_arg, NULL, -EINVAL);
4793 		return;
4794 	}
4795 
4796 	err = bs_alloc(dev, &opts, &bs, &ctx);
4797 	if (err) {
4798 		dev->destroy(dev);
4799 		cb_fn(cb_arg, NULL, err);
4800 		return;
4801 	}
4802 
4803 	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
4804 	cpl.u.bs_handle.cb_fn = cb_fn;
4805 	cpl.u.bs_handle.cb_arg = cb_arg;
4806 	cpl.u.bs_handle.bs = bs;
4807 
4808 	ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
4809 	if (!ctx->seq) {
4810 		spdk_free(ctx->super);
4811 		free(ctx);
4812 		bs_free(bs);
4813 		cb_fn(cb_arg, NULL, -ENOMEM);
4814 		return;
4815 	}
4816 
4817 	/* Read the super block */
4818 	bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
4819 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
4820 			     bs_load_super_cpl, ctx);
4821 }
4822 
4823 /* END spdk_bs_load */
4824 
4825 /* START spdk_bs_dump */
4826 
4827 static void
4828 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno)
4829 {
4830 	spdk_free(ctx->super);
4831 
4832 	/*
4833 	 * We need to defer calling bs_call_cpl() until after
4834 	 * dev destruction, so tuck these away for later use.
4835 	 */
4836 	ctx->bs->unload_err = bserrno;
4837 	memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
4838 	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
4839 
4840 	bs_sequence_finish(seq, 0);
4841 	bs_free(ctx->bs);
4842 	free(ctx);
4843 }
4844 
4845 static void
4846 bs_dump_print_xattr(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc)
4847 {
4848 	struct spdk_blob_md_descriptor_xattr *desc_xattr;
4849 	uint32_t i;
4850 	const char *type;
4851 
4852 	desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc;
4853 
4854 	if (desc_xattr->length !=
4855 	    sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) +
4856 	    desc_xattr->name_length + desc_xattr->value_length) {
4857 	}
4858 
4859 	memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length);
4860 	ctx->xattr_name[desc_xattr->name_length] = '\0';
4861 	if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
4862 		type = "XATTR";
4863 	} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
4864 		type = "XATTR_INTERNAL";
4865 	} else {
4866 		assert(false);
4867 		type = "XATTR_?";
4868 	}
4869 	fprintf(ctx->fp, "%s: name = \"%s\"\n", type, ctx->xattr_name);
4870 	fprintf(ctx->fp, "       value = \"");
4871 	ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name,
4872 			    (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
4873 			    desc_xattr->value_length);
4874 	fprintf(ctx->fp, "\"\n");
4875 	for (i = 0; i < desc_xattr->value_length; i++) {
4876 		if (i % 16 == 0) {
4877 			fprintf(ctx->fp, "               ");
4878 		}
4879 		fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i));
4880 		if ((i + 1) % 16 == 0) {
4881 			fprintf(ctx->fp, "\n");
4882 		}
4883 	}
4884 	if (i % 16 != 0) {
4885 		fprintf(ctx->fp, "\n");
4886 	}
4887 }
4888 
4889 struct type_flag_desc {
4890 	uint64_t mask;
4891 	uint64_t val;
4892 	const char *name;
4893 };
4894 
4895 static void
4896 bs_dump_print_type_bits(struct spdk_bs_load_ctx *ctx, uint64_t flags,
4897 			struct type_flag_desc *desc, size_t numflags)
4898 {
4899 	uint64_t covered = 0;
4900 	size_t i;
4901 
4902 	for (i = 0; i < numflags; i++) {
4903 		if ((desc[i].mask & flags) != desc[i].val) {
4904 			continue;
4905 		}
4906 		fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " %s", desc[i].val, desc[i].name);
4907 		if (desc[i].mask != desc[i].val) {
4908 			fprintf(ctx->fp, " (mask 0x%" PRIx64 " value 0x%" PRIx64 ")",
4909 				desc[i].mask, desc[i].val);
4910 		}
4911 		fprintf(ctx->fp, "\n");
4912 		covered |= desc[i].mask;
4913 	}
4914 	if ((flags & ~covered) != 0) {
4915 		fprintf(ctx->fp, "\t\t 0x%016" PRIx64 " Unknown\n", flags & ~covered);
4916 	}
4917 }
4918 
4919 static void
4920 bs_dump_print_type_flags(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc)
4921 {
4922 	struct spdk_blob_md_descriptor_flags *type_desc;
4923 #define ADD_FLAG(f) { f, f, #f }
4924 #define ADD_MASK_VAL(m, v) { m, v, #v }
4925 	static struct type_flag_desc invalid[] = {
4926 		ADD_FLAG(SPDK_BLOB_THIN_PROV),
4927 		ADD_FLAG(SPDK_BLOB_INTERNAL_XATTR),
4928 		ADD_FLAG(SPDK_BLOB_EXTENT_TABLE),
4929 	};
4930 	static struct type_flag_desc data_ro[] = {
4931 		ADD_FLAG(SPDK_BLOB_READ_ONLY),
4932 	};
4933 	static struct type_flag_desc md_ro[] = {
4934 		ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_DEFAULT),
4935 		ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_NONE),
4936 		ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_UNMAP),
4937 		ADD_MASK_VAL(SPDK_BLOB_MD_RO_FLAGS_MASK, BLOB_CLEAR_WITH_WRITE_ZEROES),
4938 	};
4939 #undef ADD_FLAG
4940 #undef ADD_MASK_VAL
4941 
4942 	type_desc = (struct spdk_blob_md_descriptor_flags *)desc;
4943 	fprintf(ctx->fp, "Flags:\n");
4944 	fprintf(ctx->fp, "\tinvalid: 0x%016" PRIx64 "\n", type_desc->invalid_flags);
4945 	bs_dump_print_type_bits(ctx, type_desc->invalid_flags, invalid,
4946 				SPDK_COUNTOF(invalid));
4947 	fprintf(ctx->fp, "\tdata_ro: 0x%016" PRIx64 "\n", type_desc->data_ro_flags);
4948 	bs_dump_print_type_bits(ctx, type_desc->data_ro_flags, data_ro,
4949 				SPDK_COUNTOF(data_ro));
4950 	fprintf(ctx->fp, "\t  md_ro: 0x%016" PRIx64 "\n", type_desc->md_ro_flags);
4951 	bs_dump_print_type_bits(ctx, type_desc->md_ro_flags, md_ro,
4952 				SPDK_COUNTOF(md_ro));
4953 }
4954 
4955 static void
4956 bs_dump_print_extent_table(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_descriptor *desc)
4957 {
4958 	struct spdk_blob_md_descriptor_extent_table *et_desc;
4959 	uint64_t num_extent_pages;
4960 	uint32_t et_idx;
4961 
4962 	et_desc = (struct spdk_blob_md_descriptor_extent_table *)desc;
4963 	num_extent_pages = (et_desc->length - sizeof(et_desc->num_clusters)) /
4964 			   sizeof(et_desc->extent_page[0]);
4965 
4966 	fprintf(ctx->fp, "Extent table:\n");
4967 	for (et_idx = 0; et_idx < num_extent_pages; et_idx++) {
4968 		if (et_desc->extent_page[et_idx].page_idx == 0) {
4969 			/* Zeroes represent unallocated extent pages. */
4970 			continue;
4971 		}
4972 		fprintf(ctx->fp, "\tExtent page: %5" PRIu32 " length %3" PRIu32
4973 			" at LBA %" PRIu64 "\n", et_desc->extent_page[et_idx].page_idx,
4974 			et_desc->extent_page[et_idx].num_pages,
4975 			bs_md_page_to_lba(ctx->bs, et_desc->extent_page[et_idx].page_idx));
4976 	}
4977 }
4978 
4979 static void
4980 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx)
4981 {
4982 	uint32_t page_idx = ctx->cur_page;
4983 	struct spdk_blob_md_page *page = ctx->page;
4984 	struct spdk_blob_md_descriptor *desc;
4985 	size_t cur_desc = 0;
4986 	uint32_t crc;
4987 
4988 	fprintf(ctx->fp, "=========\n");
4989 	fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx);
4990 	fprintf(ctx->fp, "Start LBA: %" PRIu64 "\n", bs_md_page_to_lba(ctx->bs, page_idx));
4991 	fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id);
4992 	fprintf(ctx->fp, "Sequence: %" PRIu32 "\n", page->sequence_num);
4993 	if (page->next == SPDK_INVALID_MD_PAGE) {
4994 		fprintf(ctx->fp, "Next: None\n");
4995 	} else {
4996 		fprintf(ctx->fp, "Next: %" PRIu32 "\n", page->next);
4997 	}
4998 	fprintf(ctx->fp, "In used bit array%s:", ctx->super->clean ? "" : " (not clean: dubious)");
4999 	if (spdk_bit_array_get(ctx->bs->used_md_pages, page_idx)) {
5000 		fprintf(ctx->fp, " md");
5001 	}
5002 	if (spdk_bit_array_get(ctx->bs->used_blobids, page_idx)) {
5003 		fprintf(ctx->fp, " blob");
5004 	}
5005 	fprintf(ctx->fp, "\n");
5006 
5007 	crc = blob_md_page_calc_crc(page);
5008 	fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch");
5009 
5010 	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
5011 	while (cur_desc < sizeof(page->descriptors)) {
5012 		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
5013 			if (desc->length == 0) {
5014 				/* If padding and length are 0, this terminates the page */
5015 				break;
5016 			}
5017 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
5018 			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
5019 			unsigned int				i;
5020 
5021 			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
5022 
5023 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
5024 				if (desc_extent_rle->extents[i].cluster_idx != 0) {
5025 					fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
5026 						desc_extent_rle->extents[i].cluster_idx);
5027 				} else {
5028 					fprintf(ctx->fp, "Unallocated Extent - ");
5029 				}
5030 				fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length);
5031 				fprintf(ctx->fp, "\n");
5032 			}
5033 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
5034 			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
5035 			unsigned int					i;
5036 
5037 			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
5038 
5039 			for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) {
5040 				if (desc_extent->cluster_idx[i] != 0) {
5041 					fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
5042 						desc_extent->cluster_idx[i]);
5043 				} else {
5044 					fprintf(ctx->fp, "Unallocated Extent");
5045 				}
5046 				fprintf(ctx->fp, "\n");
5047 			}
5048 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
5049 			bs_dump_print_xattr(ctx, desc);
5050 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
5051 			bs_dump_print_xattr(ctx, desc);
5052 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
5053 			bs_dump_print_type_flags(ctx, desc);
5054 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
5055 			bs_dump_print_extent_table(ctx, desc);
5056 		} else {
5057 			/* Error */
5058 			fprintf(ctx->fp, "Unknown descriptor type %" PRIu8 "\n", desc->type);
5059 		}
5060 		/* Advance to the next descriptor */
5061 		cur_desc += sizeof(*desc) + desc->length;
5062 		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
5063 			break;
5064 		}
5065 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
5066 	}
5067 }
5068 
5069 static void
5070 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5071 {
5072 	struct spdk_bs_load_ctx *ctx = cb_arg;
5073 
5074 	if (bserrno != 0) {
5075 		bs_dump_finish(seq, ctx, bserrno);
5076 		return;
5077 	}
5078 
5079 	if (ctx->page->id != 0) {
5080 		bs_dump_print_md_page(ctx);
5081 	}
5082 
5083 	ctx->cur_page++;
5084 
5085 	if (ctx->cur_page < ctx->super->md_len) {
5086 		bs_dump_read_md_page(seq, ctx);
5087 	} else {
5088 		spdk_free(ctx->page);
5089 		bs_dump_finish(seq, ctx, 0);
5090 	}
5091 }
5092 
5093 static void
5094 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg)
5095 {
5096 	struct spdk_bs_load_ctx *ctx = cb_arg;
5097 	uint64_t lba;
5098 
5099 	assert(ctx->cur_page < ctx->super->md_len);
5100 	lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page);
5101 	bs_sequence_read_dev(seq, ctx->page, lba,
5102 			     bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
5103 			     bs_dump_read_md_page_cpl, ctx);
5104 }
5105 
5106 static void
5107 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5108 {
5109 	struct spdk_bs_load_ctx *ctx = cb_arg;
5110 	int rc;
5111 
5112 	fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature);
5113 	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
5114 		   sizeof(ctx->super->signature)) != 0) {
5115 		fprintf(ctx->fp, "(Mismatch)\n");
5116 		bs_dump_finish(seq, ctx, bserrno);
5117 		return;
5118 	} else {
5119 		fprintf(ctx->fp, "(OK)\n");
5120 	}
5121 	fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version);
5122 	fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc,
5123 		(ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch");
5124 	fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype);
5125 	fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size);
5126 	fprintf(ctx->fp, "Super Blob ID: ");
5127 	if (ctx->super->super_blob == SPDK_BLOBID_INVALID) {
5128 		fprintf(ctx->fp, "(None)\n");
5129 	} else {
5130 		fprintf(ctx->fp, "0x%" PRIx64 "\n", ctx->super->super_blob);
5131 	}
5132 	fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean);
5133 	fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start);
5134 	fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len);
5135 	fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start);
5136 	fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len);
5137 	fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start);
5138 	fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len);
5139 	fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start);
5140 	fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len);
5141 
5142 	ctx->cur_page = 0;
5143 	ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
5144 				 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
5145 	if (!ctx->page) {
5146 		bs_dump_finish(seq, ctx, -ENOMEM);
5147 		return;
5148 	}
5149 
5150 	rc = bs_parse_super(ctx);
5151 	if (rc < 0) {
5152 		bs_load_ctx_fail(ctx, rc);
5153 		return;
5154 	}
5155 
5156 	bs_load_read_used_pages(ctx);
5157 }
5158 
5159 void
5160 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn,
5161 	     spdk_bs_op_complete cb_fn, void *cb_arg)
5162 {
5163 	struct spdk_blob_store	*bs;
5164 	struct spdk_bs_cpl	cpl;
5165 	struct spdk_bs_load_ctx *ctx;
5166 	struct spdk_bs_opts	opts = {};
5167 	int err;
5168 
5169 	SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev);
5170 
5171 	spdk_bs_opts_init(&opts, sizeof(opts));
5172 
5173 	err = bs_alloc(dev, &opts, &bs, &ctx);
5174 	if (err) {
5175 		dev->destroy(dev);
5176 		cb_fn(cb_arg, err);
5177 		return;
5178 	}
5179 
5180 	ctx->dumping = true;
5181 	ctx->fp = fp;
5182 	ctx->print_xattr_fn = print_xattr_fn;
5183 
5184 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
5185 	cpl.u.bs_basic.cb_fn = cb_fn;
5186 	cpl.u.bs_basic.cb_arg = cb_arg;
5187 
5188 	ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
5189 	if (!ctx->seq) {
5190 		spdk_free(ctx->super);
5191 		free(ctx);
5192 		bs_free(bs);
5193 		cb_fn(cb_arg, -ENOMEM);
5194 		return;
5195 	}
5196 
5197 	/* Read the super block */
5198 	bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
5199 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
5200 			     bs_dump_super_cpl, ctx);
5201 }
5202 
5203 /* END spdk_bs_dump */
5204 
5205 /* START spdk_bs_init */
5206 
5207 static void
5208 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5209 {
5210 	struct spdk_bs_load_ctx *ctx = cb_arg;
5211 
5212 	ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters);
5213 	spdk_free(ctx->super);
5214 	free(ctx);
5215 
5216 	bs_sequence_finish(seq, bserrno);
5217 }
5218 
5219 static void
5220 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5221 {
5222 	struct spdk_bs_load_ctx *ctx = cb_arg;
5223 
5224 	/* Write super block */
5225 	bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
5226 			      bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
5227 			      bs_init_persist_super_cpl, ctx);
5228 }
5229 
5230 void
5231 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
5232 	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
5233 {
5234 	struct spdk_bs_load_ctx *ctx;
5235 	struct spdk_blob_store	*bs;
5236 	struct spdk_bs_cpl	cpl;
5237 	spdk_bs_sequence_t	*seq;
5238 	spdk_bs_batch_t		*batch;
5239 	uint64_t		num_md_lba;
5240 	uint64_t		num_md_pages;
5241 	uint64_t		num_md_clusters;
5242 	uint64_t		max_used_cluster_mask_len;
5243 	uint32_t		i;
5244 	struct spdk_bs_opts	opts = {};
5245 	int			rc;
5246 	uint64_t		lba, lba_count;
5247 
5248 	SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev);
5249 
5250 	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
5251 		SPDK_ERRLOG("unsupported dev block length of %d\n",
5252 			    dev->blocklen);
5253 		dev->destroy(dev);
5254 		cb_fn(cb_arg, NULL, -EINVAL);
5255 		return;
5256 	}
5257 
5258 	spdk_bs_opts_init(&opts, sizeof(opts));
5259 	if (o) {
5260 		if (bs_opts_copy(o, &opts)) {
5261 			return;
5262 		}
5263 	}
5264 
5265 	if (bs_opts_verify(&opts) != 0) {
5266 		dev->destroy(dev);
5267 		cb_fn(cb_arg, NULL, -EINVAL);
5268 		return;
5269 	}
5270 
5271 	rc = bs_alloc(dev, &opts, &bs, &ctx);
5272 	if (rc) {
5273 		dev->destroy(dev);
5274 		cb_fn(cb_arg, NULL, rc);
5275 		return;
5276 	}
5277 
5278 	if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) {
5279 		/* By default, allocate 1 page per cluster.
5280 		 * Technically, this over-allocates metadata
5281 		 * because more metadata will reduce the number
5282 		 * of usable clusters. This can be addressed with
5283 		 * more complex math in the future.
5284 		 */
5285 		bs->md_len = bs->total_clusters;
5286 	} else {
5287 		bs->md_len = opts.num_md_pages;
5288 	}
5289 	rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len);
5290 	if (rc < 0) {
5291 		spdk_free(ctx->super);
5292 		free(ctx);
5293 		bs_free(bs);
5294 		cb_fn(cb_arg, NULL, -ENOMEM);
5295 		return;
5296 	}
5297 
5298 	rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len);
5299 	if (rc < 0) {
5300 		spdk_free(ctx->super);
5301 		free(ctx);
5302 		bs_free(bs);
5303 		cb_fn(cb_arg, NULL, -ENOMEM);
5304 		return;
5305 	}
5306 
5307 	rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len);
5308 	if (rc < 0) {
5309 		spdk_free(ctx->super);
5310 		free(ctx);
5311 		bs_free(bs);
5312 		cb_fn(cb_arg, NULL, -ENOMEM);
5313 		return;
5314 	}
5315 
5316 	memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
5317 	       sizeof(ctx->super->signature));
5318 	ctx->super->version = SPDK_BS_VERSION;
5319 	ctx->super->length = sizeof(*ctx->super);
5320 	ctx->super->super_blob = bs->super_blob;
5321 	ctx->super->clean = 0;
5322 	ctx->super->cluster_size = bs->cluster_sz;
5323 	ctx->super->io_unit_size = bs->io_unit_size;
5324 	memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype));
5325 
5326 	/* Calculate how many pages the metadata consumes at the front
5327 	 * of the disk.
5328 	 */
5329 
5330 	/* The super block uses 1 page */
5331 	num_md_pages = 1;
5332 
5333 	/* The used_md_pages mask requires 1 bit per metadata page, rounded
5334 	 * up to the nearest page, plus a header.
5335 	 */
5336 	ctx->super->used_page_mask_start = num_md_pages;
5337 	ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
5338 					 spdk_divide_round_up(bs->md_len, 8),
5339 					 SPDK_BS_PAGE_SIZE);
5340 	num_md_pages += ctx->super->used_page_mask_len;
5341 
5342 	/* The used_clusters mask requires 1 bit per cluster, rounded
5343 	 * up to the nearest page, plus a header.
5344 	 */
5345 	ctx->super->used_cluster_mask_start = num_md_pages;
5346 	ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
5347 					    spdk_divide_round_up(bs->total_clusters, 8),
5348 					    SPDK_BS_PAGE_SIZE);
5349 	/* The blobstore might be extended, then the used_cluster bitmap will need more space.
5350 	 * Here we calculate the max clusters we can support according to the
5351 	 * num_md_pages (bs->md_len).
5352 	 */
5353 	max_used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
5354 				    spdk_divide_round_up(bs->md_len, 8),
5355 				    SPDK_BS_PAGE_SIZE);
5356 	max_used_cluster_mask_len = spdk_max(max_used_cluster_mask_len,
5357 					     ctx->super->used_cluster_mask_len);
5358 	num_md_pages += max_used_cluster_mask_len;
5359 
5360 	/* The used_blobids mask requires 1 bit per metadata page, rounded
5361 	 * up to the nearest page, plus a header.
5362 	 */
5363 	ctx->super->used_blobid_mask_start = num_md_pages;
5364 	ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
5365 					   spdk_divide_round_up(bs->md_len, 8),
5366 					   SPDK_BS_PAGE_SIZE);
5367 	num_md_pages += ctx->super->used_blobid_mask_len;
5368 
5369 	/* The metadata region size was chosen above */
5370 	ctx->super->md_start = bs->md_start = num_md_pages;
5371 	ctx->super->md_len = bs->md_len;
5372 	num_md_pages += bs->md_len;
5373 
5374 	num_md_lba = bs_page_to_lba(bs, num_md_pages);
5375 
5376 	ctx->super->size = dev->blockcnt * dev->blocklen;
5377 
5378 	ctx->super->crc = blob_md_page_calc_crc(ctx->super);
5379 
5380 	num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster);
5381 	if (num_md_clusters > bs->total_clusters) {
5382 		SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, "
5383 			    "please decrease number of pages reserved for metadata "
5384 			    "or increase cluster size.\n");
5385 		spdk_free(ctx->super);
5386 		spdk_bit_array_free(&ctx->used_clusters);
5387 		free(ctx);
5388 		bs_free(bs);
5389 		cb_fn(cb_arg, NULL, -ENOMEM);
5390 		return;
5391 	}
5392 	/* Claim all of the clusters used by the metadata */
5393 	for (i = 0; i < num_md_clusters; i++) {
5394 		spdk_bit_array_set(ctx->used_clusters, i);
5395 	}
5396 
5397 	bs->num_free_clusters -= num_md_clusters;
5398 	bs->total_data_clusters = bs->num_free_clusters;
5399 
5400 	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
5401 	cpl.u.bs_handle.cb_fn = cb_fn;
5402 	cpl.u.bs_handle.cb_arg = cb_arg;
5403 	cpl.u.bs_handle.bs = bs;
5404 
5405 	seq = bs_sequence_start_bs(bs->md_channel, &cpl);
5406 	if (!seq) {
5407 		spdk_free(ctx->super);
5408 		free(ctx);
5409 		bs_free(bs);
5410 		cb_fn(cb_arg, NULL, -ENOMEM);
5411 		return;
5412 	}
5413 
5414 	batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx);
5415 
5416 	/* Clear metadata space */
5417 	bs_batch_write_zeroes_dev(batch, 0, num_md_lba);
5418 
5419 	lba = num_md_lba;
5420 	lba_count = ctx->bs->dev->blockcnt - lba;
5421 	switch (opts.clear_method) {
5422 	case BS_CLEAR_WITH_UNMAP:
5423 		/* Trim data clusters */
5424 		bs_batch_unmap_dev(batch, lba, lba_count);
5425 		break;
5426 	case BS_CLEAR_WITH_WRITE_ZEROES:
5427 		/* Write_zeroes to data clusters */
5428 		bs_batch_write_zeroes_dev(batch, lba, lba_count);
5429 		break;
5430 	case BS_CLEAR_WITH_NONE:
5431 	default:
5432 		break;
5433 	}
5434 
5435 	bs_batch_close(batch);
5436 }
5437 
5438 /* END spdk_bs_init */
5439 
5440 /* START spdk_bs_destroy */
5441 
5442 static void
5443 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5444 {
5445 	struct spdk_bs_load_ctx *ctx = cb_arg;
5446 	struct spdk_blob_store *bs = ctx->bs;
5447 
5448 	/*
5449 	 * We need to defer calling bs_call_cpl() until after
5450 	 * dev destruction, so tuck these away for later use.
5451 	 */
5452 	bs->unload_err = bserrno;
5453 	memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
5454 	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
5455 
5456 	bs_sequence_finish(seq, bserrno);
5457 
5458 	bs_free(bs);
5459 	free(ctx);
5460 }
5461 
5462 void
5463 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn,
5464 		void *cb_arg)
5465 {
5466 	struct spdk_bs_cpl	cpl;
5467 	spdk_bs_sequence_t	*seq;
5468 	struct spdk_bs_load_ctx *ctx;
5469 
5470 	SPDK_DEBUGLOG(blob, "Destroying blobstore\n");
5471 
5472 	if (!RB_EMPTY(&bs->open_blobs)) {
5473 		SPDK_ERRLOG("Blobstore still has open blobs\n");
5474 		cb_fn(cb_arg, -EBUSY);
5475 		return;
5476 	}
5477 
5478 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
5479 	cpl.u.bs_basic.cb_fn = cb_fn;
5480 	cpl.u.bs_basic.cb_arg = cb_arg;
5481 
5482 	ctx = calloc(1, sizeof(*ctx));
5483 	if (!ctx) {
5484 		cb_fn(cb_arg, -ENOMEM);
5485 		return;
5486 	}
5487 
5488 	ctx->bs = bs;
5489 
5490 	seq = bs_sequence_start_bs(bs->md_channel, &cpl);
5491 	if (!seq) {
5492 		free(ctx);
5493 		cb_fn(cb_arg, -ENOMEM);
5494 		return;
5495 	}
5496 
5497 	/* Write zeroes to the super block */
5498 	bs_sequence_write_zeroes_dev(seq,
5499 				     bs_page_to_lba(bs, 0),
5500 				     bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)),
5501 				     bs_destroy_trim_cpl, ctx);
5502 }
5503 
5504 /* END spdk_bs_destroy */
5505 
5506 /* START spdk_bs_unload */
5507 
5508 static void
5509 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno)
5510 {
5511 	spdk_bs_sequence_t *seq = ctx->seq;
5512 
5513 	spdk_free(ctx->super);
5514 
5515 	/*
5516 	 * We need to defer calling bs_call_cpl() until after
5517 	 * dev destruction, so tuck these away for later use.
5518 	 */
5519 	ctx->bs->unload_err = bserrno;
5520 	memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
5521 	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
5522 
5523 	bs_sequence_finish(seq, bserrno);
5524 
5525 	bs_free(ctx->bs);
5526 	free(ctx);
5527 }
5528 
5529 static void
5530 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5531 {
5532 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5533 
5534 	bs_unload_finish(ctx, bserrno);
5535 }
5536 
5537 static void
5538 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5539 {
5540 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5541 
5542 	spdk_free(ctx->mask);
5543 
5544 	if (bserrno != 0) {
5545 		bs_unload_finish(ctx, bserrno);
5546 		return;
5547 	}
5548 
5549 	ctx->super->clean = 1;
5550 
5551 	bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx);
5552 }
5553 
5554 static void
5555 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5556 {
5557 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5558 
5559 	spdk_free(ctx->mask);
5560 	ctx->mask = NULL;
5561 
5562 	if (bserrno != 0) {
5563 		bs_unload_finish(ctx, bserrno);
5564 		return;
5565 	}
5566 
5567 	bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl);
5568 }
5569 
5570 static void
5571 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5572 {
5573 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5574 
5575 	spdk_free(ctx->mask);
5576 	ctx->mask = NULL;
5577 
5578 	if (bserrno != 0) {
5579 		bs_unload_finish(ctx, bserrno);
5580 		return;
5581 	}
5582 
5583 	bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl);
5584 }
5585 
5586 static void
5587 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5588 {
5589 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5590 
5591 	if (bserrno != 0) {
5592 		bs_unload_finish(ctx, bserrno);
5593 		return;
5594 	}
5595 
5596 	bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl);
5597 }
5598 
5599 void
5600 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg)
5601 {
5602 	struct spdk_bs_cpl	cpl;
5603 	struct spdk_bs_load_ctx *ctx;
5604 
5605 	SPDK_DEBUGLOG(blob, "Syncing blobstore\n");
5606 
5607 	/*
5608 	 * If external snapshot channels are being destroyed while the blobstore is unloaded, the
5609 	 * unload is deferred until after the channel destruction completes.
5610 	 */
5611 	if (bs->esnap_channels_unloading != 0) {
5612 		if (bs->esnap_unload_cb_fn != NULL) {
5613 			SPDK_ERRLOG("Blobstore unload in progress\n");
5614 			cb_fn(cb_arg, -EBUSY);
5615 			return;
5616 		}
5617 		SPDK_DEBUGLOG(blob_esnap, "Blobstore unload deferred: %" PRIu32
5618 			      " esnap clones are unloading\n", bs->esnap_channels_unloading);
5619 		bs->esnap_unload_cb_fn = cb_fn;
5620 		bs->esnap_unload_cb_arg = cb_arg;
5621 		return;
5622 	}
5623 	if (bs->esnap_unload_cb_fn != NULL) {
5624 		SPDK_DEBUGLOG(blob_esnap, "Blobstore deferred unload progressing\n");
5625 		assert(bs->esnap_unload_cb_fn == cb_fn);
5626 		assert(bs->esnap_unload_cb_arg == cb_arg);
5627 		bs->esnap_unload_cb_fn = NULL;
5628 		bs->esnap_unload_cb_arg = NULL;
5629 	}
5630 
5631 	if (!RB_EMPTY(&bs->open_blobs)) {
5632 		SPDK_ERRLOG("Blobstore still has open blobs\n");
5633 		cb_fn(cb_arg, -EBUSY);
5634 		return;
5635 	}
5636 
5637 	ctx = calloc(1, sizeof(*ctx));
5638 	if (!ctx) {
5639 		cb_fn(cb_arg, -ENOMEM);
5640 		return;
5641 	}
5642 
5643 	ctx->bs = bs;
5644 
5645 	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
5646 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
5647 	if (!ctx->super) {
5648 		free(ctx);
5649 		cb_fn(cb_arg, -ENOMEM);
5650 		return;
5651 	}
5652 
5653 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
5654 	cpl.u.bs_basic.cb_fn = cb_fn;
5655 	cpl.u.bs_basic.cb_arg = cb_arg;
5656 
5657 	ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
5658 	if (!ctx->seq) {
5659 		spdk_free(ctx->super);
5660 		free(ctx);
5661 		cb_fn(cb_arg, -ENOMEM);
5662 		return;
5663 	}
5664 
5665 	/* Read super block */
5666 	bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
5667 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
5668 			     bs_unload_read_super_cpl, ctx);
5669 }
5670 
5671 /* END spdk_bs_unload */
5672 
5673 /* START spdk_bs_set_super */
5674 
5675 struct spdk_bs_set_super_ctx {
5676 	struct spdk_blob_store		*bs;
5677 	struct spdk_bs_super_block	*super;
5678 };
5679 
5680 static void
5681 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5682 {
5683 	struct spdk_bs_set_super_ctx	*ctx = cb_arg;
5684 
5685 	if (bserrno != 0) {
5686 		SPDK_ERRLOG("Unable to write to super block of blobstore\n");
5687 	}
5688 
5689 	spdk_free(ctx->super);
5690 
5691 	bs_sequence_finish(seq, bserrno);
5692 
5693 	free(ctx);
5694 }
5695 
5696 static void
5697 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5698 {
5699 	struct spdk_bs_set_super_ctx	*ctx = cb_arg;
5700 
5701 	if (bserrno != 0) {
5702 		SPDK_ERRLOG("Unable to read super block of blobstore\n");
5703 		spdk_free(ctx->super);
5704 		bs_sequence_finish(seq, bserrno);
5705 		free(ctx);
5706 		return;
5707 	}
5708 
5709 	bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx);
5710 }
5711 
5712 void
5713 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid,
5714 		  spdk_bs_op_complete cb_fn, void *cb_arg)
5715 {
5716 	struct spdk_bs_cpl		cpl;
5717 	spdk_bs_sequence_t		*seq;
5718 	struct spdk_bs_set_super_ctx	*ctx;
5719 
5720 	SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n");
5721 
5722 	ctx = calloc(1, sizeof(*ctx));
5723 	if (!ctx) {
5724 		cb_fn(cb_arg, -ENOMEM);
5725 		return;
5726 	}
5727 
5728 	ctx->bs = bs;
5729 
5730 	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
5731 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
5732 	if (!ctx->super) {
5733 		free(ctx);
5734 		cb_fn(cb_arg, -ENOMEM);
5735 		return;
5736 	}
5737 
5738 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
5739 	cpl.u.bs_basic.cb_fn = cb_fn;
5740 	cpl.u.bs_basic.cb_arg = cb_arg;
5741 
5742 	seq = bs_sequence_start_bs(bs->md_channel, &cpl);
5743 	if (!seq) {
5744 		spdk_free(ctx->super);
5745 		free(ctx);
5746 		cb_fn(cb_arg, -ENOMEM);
5747 		return;
5748 	}
5749 
5750 	bs->super_blob = blobid;
5751 
5752 	/* Read super block */
5753 	bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
5754 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
5755 			     bs_set_super_read_cpl, ctx);
5756 }
5757 
5758 /* END spdk_bs_set_super */
5759 
5760 void
5761 spdk_bs_get_super(struct spdk_blob_store *bs,
5762 		  spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5763 {
5764 	if (bs->super_blob == SPDK_BLOBID_INVALID) {
5765 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT);
5766 	} else {
5767 		cb_fn(cb_arg, bs->super_blob, 0);
5768 	}
5769 }
5770 
5771 uint64_t
5772 spdk_bs_get_cluster_size(struct spdk_blob_store *bs)
5773 {
5774 	return bs->cluster_sz;
5775 }
5776 
5777 uint64_t
5778 spdk_bs_get_page_size(struct spdk_blob_store *bs)
5779 {
5780 	return SPDK_BS_PAGE_SIZE;
5781 }
5782 
5783 uint64_t
5784 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs)
5785 {
5786 	return bs->io_unit_size;
5787 }
5788 
5789 uint64_t
5790 spdk_bs_free_cluster_count(struct spdk_blob_store *bs)
5791 {
5792 	return bs->num_free_clusters;
5793 }
5794 
5795 uint64_t
5796 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs)
5797 {
5798 	return bs->total_data_clusters;
5799 }
5800 
5801 static int
5802 bs_register_md_thread(struct spdk_blob_store *bs)
5803 {
5804 	bs->md_channel = spdk_get_io_channel(bs);
5805 	if (!bs->md_channel) {
5806 		SPDK_ERRLOG("Failed to get IO channel.\n");
5807 		return -1;
5808 	}
5809 
5810 	return 0;
5811 }
5812 
5813 static int
5814 bs_unregister_md_thread(struct spdk_blob_store *bs)
5815 {
5816 	spdk_put_io_channel(bs->md_channel);
5817 
5818 	return 0;
5819 }
5820 
5821 spdk_blob_id
5822 spdk_blob_get_id(struct spdk_blob *blob)
5823 {
5824 	assert(blob != NULL);
5825 
5826 	return blob->id;
5827 }
5828 
5829 uint64_t
5830 spdk_blob_get_num_pages(struct spdk_blob *blob)
5831 {
5832 	assert(blob != NULL);
5833 
5834 	return bs_cluster_to_page(blob->bs, blob->active.num_clusters);
5835 }
5836 
5837 uint64_t
5838 spdk_blob_get_num_io_units(struct spdk_blob *blob)
5839 {
5840 	assert(blob != NULL);
5841 
5842 	return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs);
5843 }
5844 
5845 uint64_t
5846 spdk_blob_get_num_clusters(struct spdk_blob *blob)
5847 {
5848 	assert(blob != NULL);
5849 
5850 	return blob->active.num_clusters;
5851 }
5852 
5853 static uint64_t
5854 blob_find_io_unit(struct spdk_blob *blob, uint64_t offset, bool is_allocated)
5855 {
5856 	uint64_t blob_io_unit_num = spdk_blob_get_num_io_units(blob);
5857 
5858 	while (offset < blob_io_unit_num) {
5859 		if (bs_io_unit_is_allocated(blob, offset) == is_allocated) {
5860 			return offset;
5861 		}
5862 
5863 		offset += bs_num_io_units_to_cluster_boundary(blob, offset);
5864 	}
5865 
5866 	return UINT64_MAX;
5867 }
5868 
5869 uint64_t
5870 spdk_blob_get_next_allocated_io_unit(struct spdk_blob *blob, uint64_t offset)
5871 {
5872 	return blob_find_io_unit(blob, offset, true);
5873 }
5874 
5875 uint64_t
5876 spdk_blob_get_next_unallocated_io_unit(struct spdk_blob *blob, uint64_t offset)
5877 {
5878 	return blob_find_io_unit(blob, offset, false);
5879 }
5880 
5881 /* START spdk_bs_create_blob */
5882 
5883 static void
5884 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5885 {
5886 	struct spdk_blob *blob = cb_arg;
5887 	uint32_t page_idx = bs_blobid_to_page(blob->id);
5888 
5889 	if (bserrno != 0) {
5890 		spdk_spin_lock(&blob->bs->used_lock);
5891 		spdk_bit_array_clear(blob->bs->used_blobids, page_idx);
5892 		bs_release_md_page(blob->bs, page_idx);
5893 		spdk_spin_unlock(&blob->bs->used_lock);
5894 	}
5895 
5896 	blob_free(blob);
5897 
5898 	bs_sequence_finish(seq, bserrno);
5899 }
5900 
5901 static int
5902 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs,
5903 		bool internal)
5904 {
5905 	uint64_t i;
5906 	size_t value_len = 0;
5907 	int rc;
5908 	const void *value = NULL;
5909 	if (xattrs->count > 0 && xattrs->get_value == NULL) {
5910 		return -EINVAL;
5911 	}
5912 	for (i = 0; i < xattrs->count; i++) {
5913 		xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len);
5914 		if (value == NULL || value_len == 0) {
5915 			return -EINVAL;
5916 		}
5917 		rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal);
5918 		if (rc < 0) {
5919 			return rc;
5920 		}
5921 	}
5922 	return 0;
5923 }
5924 
5925 static void
5926 blob_opts_copy(const struct spdk_blob_opts *src, struct spdk_blob_opts *dst)
5927 {
5928 #define FIELD_OK(field) \
5929         offsetof(struct spdk_blob_opts, field) + sizeof(src->field) <= src->opts_size
5930 
5931 #define SET_FIELD(field) \
5932         if (FIELD_OK(field)) { \
5933                 dst->field = src->field; \
5934         } \
5935 
5936 	SET_FIELD(num_clusters);
5937 	SET_FIELD(thin_provision);
5938 	SET_FIELD(clear_method);
5939 
5940 	if (FIELD_OK(xattrs)) {
5941 		memcpy(&dst->xattrs, &src->xattrs, sizeof(src->xattrs));
5942 	}
5943 
5944 	SET_FIELD(use_extent_table);
5945 	SET_FIELD(esnap_id);
5946 	SET_FIELD(esnap_id_len);
5947 
5948 	dst->opts_size = src->opts_size;
5949 
5950 	/* You should not remove this statement, but need to update the assert statement
5951 	 * if you add a new field, and also add a corresponding SET_FIELD statement */
5952 	SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_opts) == 80, "Incorrect size");
5953 
5954 #undef FIELD_OK
5955 #undef SET_FIELD
5956 }
5957 
5958 static void
5959 bs_create_blob(struct spdk_blob_store *bs,
5960 	       const struct spdk_blob_opts *opts,
5961 	       const struct spdk_blob_xattr_opts *internal_xattrs,
5962 	       spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5963 {
5964 	struct spdk_blob	*blob;
5965 	uint32_t		page_idx;
5966 	struct spdk_bs_cpl	cpl;
5967 	struct spdk_blob_opts	opts_local;
5968 	struct spdk_blob_xattr_opts internal_xattrs_default;
5969 	spdk_bs_sequence_t	*seq;
5970 	spdk_blob_id		id;
5971 	int rc;
5972 
5973 	assert(spdk_get_thread() == bs->md_thread);
5974 
5975 	spdk_spin_lock(&bs->used_lock);
5976 	page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0);
5977 	if (page_idx == UINT32_MAX) {
5978 		spdk_spin_unlock(&bs->used_lock);
5979 		cb_fn(cb_arg, 0, -ENOMEM);
5980 		return;
5981 	}
5982 	spdk_bit_array_set(bs->used_blobids, page_idx);
5983 	bs_claim_md_page(bs, page_idx);
5984 	spdk_spin_unlock(&bs->used_lock);
5985 
5986 	id = bs_page_to_blobid(page_idx);
5987 
5988 	SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx);
5989 
5990 	spdk_blob_opts_init(&opts_local, sizeof(opts_local));
5991 	if (opts) {
5992 		blob_opts_copy(opts, &opts_local);
5993 	}
5994 
5995 	blob = blob_alloc(bs, id);
5996 	if (!blob) {
5997 		rc = -ENOMEM;
5998 		goto error;
5999 	}
6000 
6001 	blob->use_extent_table = opts_local.use_extent_table;
6002 	if (blob->use_extent_table) {
6003 		blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE;
6004 	}
6005 
6006 	if (!internal_xattrs) {
6007 		blob_xattrs_init(&internal_xattrs_default);
6008 		internal_xattrs = &internal_xattrs_default;
6009 	}
6010 
6011 	rc = blob_set_xattrs(blob, &opts_local.xattrs, false);
6012 	if (rc < 0) {
6013 		goto error;
6014 	}
6015 
6016 	rc = blob_set_xattrs(blob, internal_xattrs, true);
6017 	if (rc < 0) {
6018 		goto error;
6019 	}
6020 
6021 	if (opts_local.thin_provision) {
6022 		blob_set_thin_provision(blob);
6023 	}
6024 
6025 	blob_set_clear_method(blob, opts_local.clear_method);
6026 
6027 	if (opts_local.esnap_id != NULL) {
6028 		if (opts_local.esnap_id_len > UINT16_MAX) {
6029 			SPDK_ERRLOG("esnap id length %" PRIu64 "is too long\n",
6030 				    opts_local.esnap_id_len);
6031 			goto error;
6032 
6033 		}
6034 		blob_set_thin_provision(blob);
6035 		blob->invalid_flags |= SPDK_BLOB_EXTERNAL_SNAPSHOT;
6036 		rc = blob_set_xattr(blob, BLOB_EXTERNAL_SNAPSHOT_ID,
6037 				    opts_local.esnap_id, opts_local.esnap_id_len, true);
6038 		if (rc != 0) {
6039 			goto error;
6040 		}
6041 	}
6042 
6043 	rc = blob_resize(blob, opts_local.num_clusters);
6044 	if (rc < 0) {
6045 		goto error;
6046 	}
6047 	cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
6048 	cpl.u.blobid.cb_fn = cb_fn;
6049 	cpl.u.blobid.cb_arg = cb_arg;
6050 	cpl.u.blobid.blobid = blob->id;
6051 
6052 	seq = bs_sequence_start_bs(bs->md_channel, &cpl);
6053 	if (!seq) {
6054 		rc = -ENOMEM;
6055 		goto error;
6056 	}
6057 
6058 	blob_persist(seq, blob, bs_create_blob_cpl, blob);
6059 	return;
6060 
6061 error:
6062 	SPDK_ERRLOG("Failed to create blob: %s, size in clusters/size: %lu (clusters)\n",
6063 		    spdk_strerror(rc), opts_local.num_clusters);
6064 	if (blob != NULL) {
6065 		blob_free(blob);
6066 	}
6067 	spdk_spin_lock(&bs->used_lock);
6068 	spdk_bit_array_clear(bs->used_blobids, page_idx);
6069 	bs_release_md_page(bs, page_idx);
6070 	spdk_spin_unlock(&bs->used_lock);
6071 	cb_fn(cb_arg, 0, rc);
6072 }
6073 
6074 void
6075 spdk_bs_create_blob(struct spdk_blob_store *bs,
6076 		    spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
6077 {
6078 	bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg);
6079 }
6080 
6081 void
6082 spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts,
6083 			spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
6084 {
6085 	bs_create_blob(bs, opts, NULL, cb_fn, cb_arg);
6086 }
6087 
6088 /* END spdk_bs_create_blob */
6089 
6090 /* START blob_cleanup */
6091 
6092 struct spdk_clone_snapshot_ctx {
6093 	struct spdk_bs_cpl      cpl;
6094 	int bserrno;
6095 	bool frozen;
6096 
6097 	struct spdk_io_channel *channel;
6098 
6099 	/* Current cluster for inflate operation */
6100 	uint64_t cluster;
6101 
6102 	/* For inflation force allocation of all unallocated clusters and remove
6103 	 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */
6104 	bool allocate_all;
6105 
6106 	struct {
6107 		spdk_blob_id id;
6108 		struct spdk_blob *blob;
6109 		bool md_ro;
6110 	} original;
6111 	struct {
6112 		spdk_blob_id id;
6113 		struct spdk_blob *blob;
6114 	} new;
6115 
6116 	/* xattrs specified for snapshot/clones only. They have no impact on
6117 	 * the original blobs xattrs. */
6118 	const struct spdk_blob_xattr_opts *xattrs;
6119 };
6120 
6121 static void
6122 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno)
6123 {
6124 	struct spdk_clone_snapshot_ctx *ctx = cb_arg;
6125 	struct spdk_bs_cpl *cpl = &ctx->cpl;
6126 
6127 	if (bserrno != 0) {
6128 		if (ctx->bserrno != 0) {
6129 			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
6130 		} else {
6131 			ctx->bserrno = bserrno;
6132 		}
6133 	}
6134 
6135 	switch (cpl->type) {
6136 	case SPDK_BS_CPL_TYPE_BLOBID:
6137 		cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno);
6138 		break;
6139 	case SPDK_BS_CPL_TYPE_BLOB_BASIC:
6140 		cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno);
6141 		break;
6142 	default:
6143 		SPDK_UNREACHABLE();
6144 		break;
6145 	}
6146 
6147 	free(ctx);
6148 }
6149 
6150 static void
6151 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
6152 {
6153 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6154 	struct spdk_blob *origblob = ctx->original.blob;
6155 
6156 	if (bserrno != 0) {
6157 		if (ctx->bserrno != 0) {
6158 			SPDK_ERRLOG("Unfreeze error %d\n", bserrno);
6159 		} else {
6160 			ctx->bserrno = bserrno;
6161 		}
6162 	}
6163 
6164 	ctx->original.id = origblob->id;
6165 	origblob->locked_operation_in_progress = false;
6166 
6167 	/* Revert md_ro to original state */
6168 	origblob->md_ro = ctx->original.md_ro;
6169 
6170 	spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx);
6171 }
6172 
6173 static void
6174 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno)
6175 {
6176 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6177 	struct spdk_blob *origblob = ctx->original.blob;
6178 
6179 	if (bserrno != 0) {
6180 		if (ctx->bserrno != 0) {
6181 			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
6182 		} else {
6183 			ctx->bserrno = bserrno;
6184 		}
6185 	}
6186 
6187 	if (ctx->frozen) {
6188 		/* Unfreeze any outstanding I/O */
6189 		blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx);
6190 	} else {
6191 		bs_snapshot_unfreeze_cpl(ctx, 0);
6192 	}
6193 
6194 }
6195 
6196 static void
6197 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno)
6198 {
6199 	struct spdk_blob *newblob = ctx->new.blob;
6200 
6201 	if (bserrno != 0) {
6202 		if (ctx->bserrno != 0) {
6203 			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
6204 		} else {
6205 			ctx->bserrno = bserrno;
6206 		}
6207 	}
6208 
6209 	ctx->new.id = newblob->id;
6210 	spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
6211 }
6212 
6213 /* END blob_cleanup */
6214 
6215 /* START spdk_bs_create_snapshot */
6216 
6217 static void
6218 bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2)
6219 {
6220 	uint64_t *cluster_temp;
6221 	uint32_t *extent_page_temp;
6222 
6223 	cluster_temp = blob1->active.clusters;
6224 	blob1->active.clusters = blob2->active.clusters;
6225 	blob2->active.clusters = cluster_temp;
6226 
6227 	extent_page_temp = blob1->active.extent_pages;
6228 	blob1->active.extent_pages = blob2->active.extent_pages;
6229 	blob2->active.extent_pages = extent_page_temp;
6230 }
6231 
6232 /* Copies an internal xattr */
6233 static int
6234 bs_snapshot_copy_xattr(struct spdk_blob *toblob, struct spdk_blob *fromblob, const char *name)
6235 {
6236 	const void	*val = NULL;
6237 	size_t		len;
6238 	int		bserrno;
6239 
6240 	bserrno = blob_get_xattr_value(fromblob, name, &val, &len, true);
6241 	if (bserrno != 0) {
6242 		SPDK_ERRLOG("blob 0x%" PRIx64 " missing %s xattr"
6243 			    BLOB_EXTERNAL_SNAPSHOT_ID " XATTR\n", fromblob->id, name);
6244 		return bserrno;
6245 	}
6246 
6247 	bserrno = blob_set_xattr(toblob, name, val, len, true);
6248 	if (bserrno != 0) {
6249 		SPDK_ERRLOG("could not set %s XATTR on blob 0x%" PRIx64 "\n",
6250 			    name, toblob->id);
6251 		return bserrno;
6252 	}
6253 	return 0;
6254 }
6255 
6256 static void
6257 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
6258 {
6259 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6260 	struct spdk_blob *origblob = ctx->original.blob;
6261 	struct spdk_blob *newblob = ctx->new.blob;
6262 
6263 	if (bserrno != 0) {
6264 		bs_snapshot_swap_cluster_maps(newblob, origblob);
6265 		if (blob_is_esnap_clone(newblob)) {
6266 			bs_snapshot_copy_xattr(origblob, newblob, BLOB_EXTERNAL_SNAPSHOT_ID);
6267 			origblob->invalid_flags |= SPDK_BLOB_EXTERNAL_SNAPSHOT;
6268 		}
6269 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6270 		return;
6271 	}
6272 
6273 	/* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
6274 	bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
6275 	if (bserrno != 0) {
6276 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6277 		return;
6278 	}
6279 
6280 	bs_blob_list_add(ctx->original.blob);
6281 
6282 	spdk_blob_set_read_only(newblob);
6283 
6284 	/* sync snapshot metadata */
6285 	spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
6286 }
6287 
6288 static void
6289 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
6290 {
6291 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6292 	struct spdk_blob *origblob = ctx->original.blob;
6293 	struct spdk_blob *newblob = ctx->new.blob;
6294 
6295 	if (bserrno != 0) {
6296 		/* return cluster map back to original */
6297 		bs_snapshot_swap_cluster_maps(newblob, origblob);
6298 
6299 		/* Newblob md sync failed. Valid clusters are only present in origblob.
6300 		 * Since I/O is frozen on origblob, not changes to zeroed out cluster map should have occurred.
6301 		 * Newblob needs to be reverted to thin_provisioned state at creation to properly close. */
6302 		blob_set_thin_provision(newblob);
6303 		assert(spdk_mem_all_zero(newblob->active.clusters,
6304 					 newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
6305 		assert(spdk_mem_all_zero(newblob->active.extent_pages,
6306 					 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
6307 
6308 		bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
6309 		return;
6310 	}
6311 
6312 	/* Set internal xattr for snapshot id */
6313 	bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true);
6314 	if (bserrno != 0) {
6315 		/* return cluster map back to original */
6316 		bs_snapshot_swap_cluster_maps(newblob, origblob);
6317 		blob_set_thin_provision(newblob);
6318 		bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
6319 		return;
6320 	}
6321 
6322 	/* Create new back_bs_dev for snapshot */
6323 	origblob->back_bs_dev = bs_create_blob_bs_dev(newblob);
6324 	if (origblob->back_bs_dev == NULL) {
6325 		/* return cluster map back to original */
6326 		bs_snapshot_swap_cluster_maps(newblob, origblob);
6327 		blob_set_thin_provision(newblob);
6328 		bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL);
6329 		return;
6330 	}
6331 
6332 	/* Remove the xattr that references an external snapshot */
6333 	if (blob_is_esnap_clone(origblob)) {
6334 		origblob->invalid_flags &= ~SPDK_BLOB_EXTERNAL_SNAPSHOT;
6335 		bserrno = blob_remove_xattr(origblob, BLOB_EXTERNAL_SNAPSHOT_ID, true);
6336 		if (bserrno != 0) {
6337 			if (bserrno == -ENOENT) {
6338 				SPDK_ERRLOG("blob 0x%" PRIx64 " has no " BLOB_EXTERNAL_SNAPSHOT_ID
6339 					    " xattr to remove\n", origblob->id);
6340 				assert(false);
6341 			} else {
6342 				/* return cluster map back to original */
6343 				bs_snapshot_swap_cluster_maps(newblob, origblob);
6344 				blob_set_thin_provision(newblob);
6345 				bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
6346 				return;
6347 			}
6348 		}
6349 	}
6350 
6351 	bs_blob_list_remove(origblob);
6352 	origblob->parent_id = newblob->id;
6353 	/* set clone blob as thin provisioned */
6354 	blob_set_thin_provision(origblob);
6355 
6356 	bs_blob_list_add(newblob);
6357 
6358 	/* sync clone metadata */
6359 	spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx);
6360 }
6361 
6362 static void
6363 bs_snapshot_freeze_cpl(void *cb_arg, int rc)
6364 {
6365 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6366 	struct spdk_blob *origblob = ctx->original.blob;
6367 	struct spdk_blob *newblob = ctx->new.blob;
6368 	int bserrno;
6369 
6370 	if (rc != 0) {
6371 		bs_clone_snapshot_newblob_cleanup(ctx, rc);
6372 		return;
6373 	}
6374 
6375 	ctx->frozen = true;
6376 
6377 	if (blob_is_esnap_clone(origblob)) {
6378 		/* Clean up any channels associated with the original blob id because future IO will
6379 		 * perform IO using the snapshot blob_id.
6380 		 */
6381 		blob_esnap_destroy_bs_dev_channels(origblob, false, NULL, NULL);
6382 	}
6383 	if (newblob->back_bs_dev) {
6384 		blob_back_bs_destroy(newblob);
6385 	}
6386 	/* set new back_bs_dev for snapshot */
6387 	newblob->back_bs_dev = origblob->back_bs_dev;
6388 	/* Set invalid flags from origblob */
6389 	newblob->invalid_flags = origblob->invalid_flags;
6390 
6391 	/* inherit parent from original blob if set */
6392 	newblob->parent_id = origblob->parent_id;
6393 	switch (origblob->parent_id) {
6394 	case SPDK_BLOBID_EXTERNAL_SNAPSHOT:
6395 		bserrno = bs_snapshot_copy_xattr(newblob, origblob, BLOB_EXTERNAL_SNAPSHOT_ID);
6396 		if (bserrno != 0) {
6397 			bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
6398 			return;
6399 		}
6400 		break;
6401 	case SPDK_BLOBID_INVALID:
6402 		break;
6403 	default:
6404 		/* Set internal xattr for snapshot id */
6405 		bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT,
6406 					 &origblob->parent_id, sizeof(spdk_blob_id), true);
6407 		if (bserrno != 0) {
6408 			bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
6409 			return;
6410 		}
6411 	}
6412 
6413 	/* swap cluster maps */
6414 	bs_snapshot_swap_cluster_maps(newblob, origblob);
6415 
6416 	/* Set the clear method on the new blob to match the original. */
6417 	blob_set_clear_method(newblob, origblob->clear_method);
6418 
6419 	/* sync snapshot metadata */
6420 	spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx);
6421 }
6422 
6423 static void
6424 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
6425 {
6426 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6427 	struct spdk_blob *origblob = ctx->original.blob;
6428 	struct spdk_blob *newblob = _blob;
6429 
6430 	if (bserrno != 0) {
6431 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6432 		return;
6433 	}
6434 
6435 	ctx->new.blob = newblob;
6436 	assert(spdk_blob_is_thin_provisioned(newblob));
6437 	assert(spdk_mem_all_zero(newblob->active.clusters,
6438 				 newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
6439 	assert(spdk_mem_all_zero(newblob->active.extent_pages,
6440 				 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
6441 
6442 	blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx);
6443 }
6444 
6445 static void
6446 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
6447 {
6448 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6449 	struct spdk_blob *origblob = ctx->original.blob;
6450 
6451 	if (bserrno != 0) {
6452 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6453 		return;
6454 	}
6455 
6456 	ctx->new.id = blobid;
6457 	ctx->cpl.u.blobid.blobid = blobid;
6458 
6459 	spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx);
6460 }
6461 
6462 
6463 static void
6464 bs_xattr_snapshot(void *arg, const char *name,
6465 		  const void **value, size_t *value_len)
6466 {
6467 	assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0);
6468 
6469 	struct spdk_blob *blob = (struct spdk_blob *)arg;
6470 	*value = &blob->id;
6471 	*value_len = sizeof(blob->id);
6472 }
6473 
6474 static void
6475 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
6476 {
6477 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6478 	struct spdk_blob_opts opts;
6479 	struct spdk_blob_xattr_opts internal_xattrs;
6480 	char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS };
6481 
6482 	if (bserrno != 0) {
6483 		bs_clone_snapshot_cleanup_finish(ctx, bserrno);
6484 		return;
6485 	}
6486 
6487 	ctx->original.blob = _blob;
6488 
6489 	if (_blob->data_ro || _blob->md_ro) {
6490 		SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n",
6491 			      _blob->id);
6492 		ctx->bserrno = -EINVAL;
6493 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
6494 		return;
6495 	}
6496 
6497 	if (_blob->locked_operation_in_progress) {
6498 		SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n");
6499 		ctx->bserrno = -EBUSY;
6500 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
6501 		return;
6502 	}
6503 
6504 	_blob->locked_operation_in_progress = true;
6505 
6506 	spdk_blob_opts_init(&opts, sizeof(opts));
6507 	blob_xattrs_init(&internal_xattrs);
6508 
6509 	/* Change the size of new blob to the same as in original blob,
6510 	 * but do not allocate clusters */
6511 	opts.thin_provision = true;
6512 	opts.num_clusters = spdk_blob_get_num_clusters(_blob);
6513 	opts.use_extent_table = _blob->use_extent_table;
6514 
6515 	/* If there are any xattrs specified for snapshot, set them now */
6516 	if (ctx->xattrs) {
6517 		memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
6518 	}
6519 	/* Set internal xattr SNAPSHOT_IN_PROGRESS */
6520 	internal_xattrs.count = 1;
6521 	internal_xattrs.ctx = _blob;
6522 	internal_xattrs.names = xattrs_names;
6523 	internal_xattrs.get_value = bs_xattr_snapshot;
6524 
6525 	bs_create_blob(_blob->bs, &opts, &internal_xattrs,
6526 		       bs_snapshot_newblob_create_cpl, ctx);
6527 }
6528 
6529 void
6530 spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid,
6531 			const struct spdk_blob_xattr_opts *snapshot_xattrs,
6532 			spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
6533 {
6534 	struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
6535 
6536 	if (!ctx) {
6537 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
6538 		return;
6539 	}
6540 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
6541 	ctx->cpl.u.blobid.cb_fn = cb_fn;
6542 	ctx->cpl.u.blobid.cb_arg = cb_arg;
6543 	ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
6544 	ctx->bserrno = 0;
6545 	ctx->frozen = false;
6546 	ctx->original.id = blobid;
6547 	ctx->xattrs = snapshot_xattrs;
6548 
6549 	spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx);
6550 }
6551 /* END spdk_bs_create_snapshot */
6552 
6553 /* START spdk_bs_create_clone */
6554 
6555 static void
6556 bs_xattr_clone(void *arg, const char *name,
6557 	       const void **value, size_t *value_len)
6558 {
6559 	assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0);
6560 
6561 	struct spdk_blob *blob = (struct spdk_blob *)arg;
6562 	*value = &blob->id;
6563 	*value_len = sizeof(blob->id);
6564 }
6565 
6566 static void
6567 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
6568 {
6569 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6570 	struct spdk_blob *clone = _blob;
6571 
6572 	ctx->new.blob = clone;
6573 	bs_blob_list_add(clone);
6574 
6575 	spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx);
6576 }
6577 
6578 static void
6579 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
6580 {
6581 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6582 
6583 	ctx->cpl.u.blobid.blobid = blobid;
6584 	spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx);
6585 }
6586 
6587 static void
6588 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
6589 {
6590 	struct spdk_clone_snapshot_ctx	*ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6591 	struct spdk_blob_opts		opts;
6592 	struct spdk_blob_xattr_opts internal_xattrs;
6593 	char *xattr_names[] = { BLOB_SNAPSHOT };
6594 
6595 	if (bserrno != 0) {
6596 		bs_clone_snapshot_cleanup_finish(ctx, bserrno);
6597 		return;
6598 	}
6599 
6600 	ctx->original.blob = _blob;
6601 	ctx->original.md_ro = _blob->md_ro;
6602 
6603 	if (!_blob->data_ro || !_blob->md_ro) {
6604 		SPDK_DEBUGLOG(blob, "Clone not from read-only blob\n");
6605 		ctx->bserrno = -EINVAL;
6606 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
6607 		return;
6608 	}
6609 
6610 	if (_blob->locked_operation_in_progress) {
6611 		SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n");
6612 		ctx->bserrno = -EBUSY;
6613 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
6614 		return;
6615 	}
6616 
6617 	_blob->locked_operation_in_progress = true;
6618 
6619 	spdk_blob_opts_init(&opts, sizeof(opts));
6620 	blob_xattrs_init(&internal_xattrs);
6621 
6622 	opts.thin_provision = true;
6623 	opts.num_clusters = spdk_blob_get_num_clusters(_blob);
6624 	opts.use_extent_table = _blob->use_extent_table;
6625 	if (ctx->xattrs) {
6626 		memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
6627 	}
6628 
6629 	/* Set internal xattr BLOB_SNAPSHOT */
6630 	internal_xattrs.count = 1;
6631 	internal_xattrs.ctx = _blob;
6632 	internal_xattrs.names = xattr_names;
6633 	internal_xattrs.get_value = bs_xattr_clone;
6634 
6635 	bs_create_blob(_blob->bs, &opts, &internal_xattrs,
6636 		       bs_clone_newblob_create_cpl, ctx);
6637 }
6638 
6639 void
6640 spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid,
6641 		     const struct spdk_blob_xattr_opts *clone_xattrs,
6642 		     spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
6643 {
6644 	struct spdk_clone_snapshot_ctx	*ctx = calloc(1, sizeof(*ctx));
6645 
6646 	if (!ctx) {
6647 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
6648 		return;
6649 	}
6650 
6651 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
6652 	ctx->cpl.u.blobid.cb_fn = cb_fn;
6653 	ctx->cpl.u.blobid.cb_arg = cb_arg;
6654 	ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
6655 	ctx->bserrno = 0;
6656 	ctx->xattrs = clone_xattrs;
6657 	ctx->original.id = blobid;
6658 
6659 	spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx);
6660 }
6661 
6662 /* END spdk_bs_create_clone */
6663 
6664 /* START spdk_bs_inflate_blob */
6665 
6666 static void
6667 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno)
6668 {
6669 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6670 	struct spdk_blob *_blob = ctx->original.blob;
6671 
6672 	if (bserrno != 0) {
6673 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6674 		return;
6675 	}
6676 
6677 	/* Temporarily override md_ro flag for MD modification */
6678 	_blob->md_ro = false;
6679 
6680 	bserrno = blob_set_xattr(_blob, BLOB_SNAPSHOT, &_parent->id, sizeof(spdk_blob_id), true);
6681 	if (bserrno != 0) {
6682 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6683 		return;
6684 	}
6685 
6686 	assert(_parent != NULL);
6687 
6688 	bs_blob_list_remove(_blob);
6689 	_blob->parent_id = _parent->id;
6690 
6691 	blob_back_bs_destroy(_blob);
6692 	_blob->back_bs_dev = bs_create_blob_bs_dev(_parent);
6693 	bs_blob_list_add(_blob);
6694 
6695 	spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
6696 }
6697 
6698 static void
6699 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx)
6700 {
6701 	struct spdk_blob *_blob = ctx->original.blob;
6702 	struct spdk_blob *_parent;
6703 
6704 	if (ctx->allocate_all) {
6705 		/* remove thin provisioning */
6706 		bs_blob_list_remove(_blob);
6707 		if (_blob->parent_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
6708 			blob_remove_xattr(_blob, BLOB_EXTERNAL_SNAPSHOT_ID, true);
6709 			_blob->invalid_flags &= ~SPDK_BLOB_EXTERNAL_SNAPSHOT;
6710 		} else {
6711 			blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
6712 		}
6713 		_blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV;
6714 		blob_back_bs_destroy(_blob);
6715 		_blob->parent_id = SPDK_BLOBID_INVALID;
6716 	} else {
6717 		/* For now, esnap clones always have allocate_all set. */
6718 		assert(!blob_is_esnap_clone(_blob));
6719 
6720 		_parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob;
6721 		if (_parent->parent_id != SPDK_BLOBID_INVALID) {
6722 			/* We must change the parent of the inflated blob */
6723 			spdk_bs_open_blob(_blob->bs, _parent->parent_id,
6724 					  bs_inflate_blob_set_parent_cpl, ctx);
6725 			return;
6726 		}
6727 
6728 		bs_blob_list_remove(_blob);
6729 		_blob->parent_id = SPDK_BLOBID_INVALID;
6730 		blob_back_bs_destroy(_blob);
6731 		_blob->back_bs_dev = bs_create_zeroes_dev();
6732 	}
6733 
6734 	/* Temporarily override md_ro flag for MD modification */
6735 	_blob->md_ro = false;
6736 	blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
6737 	_blob->state = SPDK_BLOB_STATE_DIRTY;
6738 
6739 	spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
6740 }
6741 
6742 /* Check if cluster needs allocation */
6743 static inline bool
6744 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all)
6745 {
6746 	struct spdk_blob_bs_dev *b;
6747 
6748 	assert(blob != NULL);
6749 
6750 	if (blob->active.clusters[cluster] != 0) {
6751 		/* Cluster is already allocated */
6752 		return false;
6753 	}
6754 
6755 	if (blob->parent_id == SPDK_BLOBID_INVALID) {
6756 		/* Blob have no parent blob */
6757 		return allocate_all;
6758 	}
6759 
6760 	if (blob->parent_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
6761 		return true;
6762 	}
6763 
6764 	b = (struct spdk_blob_bs_dev *)blob->back_bs_dev;
6765 	return (allocate_all || b->blob->active.clusters[cluster] != 0);
6766 }
6767 
6768 static void
6769 bs_inflate_blob_touch_next(void *cb_arg, int bserrno)
6770 {
6771 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6772 	struct spdk_blob *_blob = ctx->original.blob;
6773 	struct spdk_bs_cpl cpl;
6774 	spdk_bs_user_op_t *op;
6775 	uint64_t offset;
6776 
6777 	if (bserrno != 0) {
6778 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6779 		return;
6780 	}
6781 
6782 	for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) {
6783 		if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) {
6784 			break;
6785 		}
6786 	}
6787 
6788 	if (ctx->cluster < _blob->active.num_clusters) {
6789 		offset = bs_cluster_to_lba(_blob->bs, ctx->cluster);
6790 
6791 		/* We may safely increment a cluster before copying */
6792 		ctx->cluster++;
6793 
6794 		/* Use a dummy 0B read as a context for cluster copy */
6795 		cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6796 		cpl.u.blob_basic.cb_fn = bs_inflate_blob_touch_next;
6797 		cpl.u.blob_basic.cb_arg = ctx;
6798 
6799 		op = bs_user_op_alloc(ctx->channel, &cpl, SPDK_BLOB_READ, _blob,
6800 				      NULL, 0, offset, 0);
6801 		if (!op) {
6802 			bs_clone_snapshot_origblob_cleanup(ctx, -ENOMEM);
6803 			return;
6804 		}
6805 
6806 		bs_allocate_and_copy_cluster(_blob, ctx->channel, offset, op);
6807 	} else {
6808 		bs_inflate_blob_done(ctx);
6809 	}
6810 }
6811 
6812 static void
6813 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
6814 {
6815 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6816 	uint64_t clusters_needed;
6817 	uint64_t i;
6818 
6819 	if (bserrno != 0) {
6820 		bs_clone_snapshot_cleanup_finish(ctx, bserrno);
6821 		return;
6822 	}
6823 
6824 	ctx->original.blob = _blob;
6825 	ctx->original.md_ro = _blob->md_ro;
6826 
6827 	if (_blob->locked_operation_in_progress) {
6828 		SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n");
6829 		ctx->bserrno = -EBUSY;
6830 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
6831 		return;
6832 	}
6833 
6834 	_blob->locked_operation_in_progress = true;
6835 
6836 	switch (_blob->parent_id) {
6837 	case SPDK_BLOBID_INVALID:
6838 		if (!ctx->allocate_all) {
6839 			/* This blob has no parent, so we cannot decouple it. */
6840 			SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n");
6841 			bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL);
6842 			return;
6843 		}
6844 		break;
6845 	case SPDK_BLOBID_EXTERNAL_SNAPSHOT:
6846 		/*
6847 		 * It would be better to rely on back_bs_dev->is_zeroes(), to determine which
6848 		 * clusters require allocation. Until there is a blobstore consumer that
6849 		 * uses esnaps with an spdk_bs_dev that implements a useful is_zeroes() it is not
6850 		 * worth the effort.
6851 		 */
6852 		ctx->allocate_all = true;
6853 		break;
6854 	default:
6855 		break;
6856 	}
6857 
6858 	if (spdk_blob_is_thin_provisioned(_blob) == false) {
6859 		/* This is not thin provisioned blob. No need to inflate. */
6860 		bs_clone_snapshot_origblob_cleanup(ctx, 0);
6861 		return;
6862 	}
6863 
6864 	/* Do two passes - one to verify that we can obtain enough clusters
6865 	 * and another to actually claim them.
6866 	 */
6867 	clusters_needed = 0;
6868 	for (i = 0; i < _blob->active.num_clusters; i++) {
6869 		if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) {
6870 			clusters_needed++;
6871 		}
6872 	}
6873 
6874 	if (clusters_needed > _blob->bs->num_free_clusters) {
6875 		/* Not enough free clusters. Cannot satisfy the request. */
6876 		bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC);
6877 		return;
6878 	}
6879 
6880 	ctx->cluster = 0;
6881 	bs_inflate_blob_touch_next(ctx, 0);
6882 }
6883 
6884 static void
6885 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
6886 		spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg)
6887 {
6888 	struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
6889 
6890 	if (!ctx) {
6891 		cb_fn(cb_arg, -ENOMEM);
6892 		return;
6893 	}
6894 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6895 	ctx->cpl.u.bs_basic.cb_fn = cb_fn;
6896 	ctx->cpl.u.bs_basic.cb_arg = cb_arg;
6897 	ctx->bserrno = 0;
6898 	ctx->original.id = blobid;
6899 	ctx->channel = channel;
6900 	ctx->allocate_all = allocate_all;
6901 
6902 	spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx);
6903 }
6904 
6905 void
6906 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
6907 		     spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
6908 {
6909 	bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg);
6910 }
6911 
6912 void
6913 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
6914 			     spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
6915 {
6916 	bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg);
6917 }
6918 /* END spdk_bs_inflate_blob */
6919 
6920 /* START spdk_blob_resize */
6921 struct spdk_bs_resize_ctx {
6922 	spdk_blob_op_complete cb_fn;
6923 	void *cb_arg;
6924 	struct spdk_blob *blob;
6925 	uint64_t sz;
6926 	int rc;
6927 };
6928 
6929 static void
6930 bs_resize_unfreeze_cpl(void *cb_arg, int rc)
6931 {
6932 	struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
6933 
6934 	if (rc != 0) {
6935 		SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc);
6936 	}
6937 
6938 	if (ctx->rc != 0) {
6939 		SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc);
6940 		rc = ctx->rc;
6941 	}
6942 
6943 	ctx->blob->locked_operation_in_progress = false;
6944 
6945 	ctx->cb_fn(ctx->cb_arg, rc);
6946 	free(ctx);
6947 }
6948 
6949 static void
6950 bs_resize_freeze_cpl(void *cb_arg, int rc)
6951 {
6952 	struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
6953 
6954 	if (rc != 0) {
6955 		ctx->blob->locked_operation_in_progress = false;
6956 		ctx->cb_fn(ctx->cb_arg, rc);
6957 		free(ctx);
6958 		return;
6959 	}
6960 
6961 	ctx->rc = blob_resize(ctx->blob, ctx->sz);
6962 
6963 	blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx);
6964 }
6965 
6966 void
6967 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg)
6968 {
6969 	struct spdk_bs_resize_ctx *ctx;
6970 
6971 	blob_verify_md_op(blob);
6972 
6973 	SPDK_DEBUGLOG(blob, "Resizing blob 0x%" PRIx64 " to %" PRIu64 " clusters\n", blob->id, sz);
6974 
6975 	if (blob->md_ro) {
6976 		cb_fn(cb_arg, -EPERM);
6977 		return;
6978 	}
6979 
6980 	if (sz == blob->active.num_clusters) {
6981 		cb_fn(cb_arg, 0);
6982 		return;
6983 	}
6984 
6985 	if (blob->locked_operation_in_progress) {
6986 		cb_fn(cb_arg, -EBUSY);
6987 		return;
6988 	}
6989 
6990 	ctx = calloc(1, sizeof(*ctx));
6991 	if (!ctx) {
6992 		cb_fn(cb_arg, -ENOMEM);
6993 		return;
6994 	}
6995 
6996 	blob->locked_operation_in_progress = true;
6997 	ctx->cb_fn = cb_fn;
6998 	ctx->cb_arg = cb_arg;
6999 	ctx->blob = blob;
7000 	ctx->sz = sz;
7001 	blob_freeze_io(blob, bs_resize_freeze_cpl, ctx);
7002 }
7003 
7004 /* END spdk_blob_resize */
7005 
7006 
7007 /* START spdk_bs_delete_blob */
7008 
7009 static void
7010 bs_delete_close_cpl(void *cb_arg, int bserrno)
7011 {
7012 	spdk_bs_sequence_t *seq = cb_arg;
7013 
7014 	bs_sequence_finish(seq, bserrno);
7015 }
7016 
7017 static void
7018 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
7019 {
7020 	struct spdk_blob *blob = cb_arg;
7021 
7022 	if (bserrno != 0) {
7023 		/*
7024 		 * We already removed this blob from the blobstore tailq, so
7025 		 *  we need to free it here since this is the last reference
7026 		 *  to it.
7027 		 */
7028 		blob_free(blob);
7029 		bs_delete_close_cpl(seq, bserrno);
7030 		return;
7031 	}
7032 
7033 	/*
7034 	 * This will immediately decrement the ref_count and call
7035 	 *  the completion routine since the metadata state is clean.
7036 	 *  By calling spdk_blob_close, we reduce the number of call
7037 	 *  points into code that touches the blob->open_ref count
7038 	 *  and the blobstore's blob list.
7039 	 */
7040 	spdk_blob_close(blob, bs_delete_close_cpl, seq);
7041 }
7042 
7043 struct delete_snapshot_ctx {
7044 	struct spdk_blob_list *parent_snapshot_entry;
7045 	struct spdk_blob *snapshot;
7046 	struct spdk_blob_md_page *page;
7047 	bool snapshot_md_ro;
7048 	struct spdk_blob *clone;
7049 	bool clone_md_ro;
7050 	spdk_blob_op_with_handle_complete cb_fn;
7051 	void *cb_arg;
7052 	int bserrno;
7053 	uint32_t next_extent_page;
7054 };
7055 
7056 static void
7057 delete_blob_cleanup_finish(void *cb_arg, int bserrno)
7058 {
7059 	struct delete_snapshot_ctx *ctx = cb_arg;
7060 
7061 	if (bserrno != 0) {
7062 		SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno);
7063 	}
7064 
7065 	assert(ctx != NULL);
7066 
7067 	if (bserrno != 0 && ctx->bserrno == 0) {
7068 		ctx->bserrno = bserrno;
7069 	}
7070 
7071 	ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno);
7072 	spdk_free(ctx->page);
7073 	free(ctx);
7074 }
7075 
7076 static void
7077 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno)
7078 {
7079 	struct delete_snapshot_ctx *ctx = cb_arg;
7080 
7081 	if (bserrno != 0) {
7082 		ctx->bserrno = bserrno;
7083 		SPDK_ERRLOG("Clone cleanup error %d\n", bserrno);
7084 	}
7085 
7086 	if (ctx->bserrno != 0) {
7087 		assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL);
7088 		RB_INSERT(spdk_blob_tree, &ctx->snapshot->bs->open_blobs, ctx->snapshot);
7089 		spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id);
7090 	}
7091 
7092 	ctx->snapshot->locked_operation_in_progress = false;
7093 	ctx->snapshot->md_ro = ctx->snapshot_md_ro;
7094 
7095 	spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx);
7096 }
7097 
7098 static void
7099 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno)
7100 {
7101 	struct delete_snapshot_ctx *ctx = cb_arg;
7102 
7103 	ctx->clone->locked_operation_in_progress = false;
7104 	ctx->clone->md_ro = ctx->clone_md_ro;
7105 
7106 	spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
7107 }
7108 
7109 static void
7110 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
7111 {
7112 	struct delete_snapshot_ctx *ctx = cb_arg;
7113 
7114 	if (bserrno) {
7115 		ctx->bserrno = bserrno;
7116 		delete_snapshot_cleanup_clone(ctx, 0);
7117 		return;
7118 	}
7119 
7120 	ctx->clone->locked_operation_in_progress = false;
7121 	spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx);
7122 }
7123 
7124 static void
7125 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno)
7126 {
7127 	struct delete_snapshot_ctx *ctx = cb_arg;
7128 	struct spdk_blob_list *parent_snapshot_entry = NULL;
7129 	struct spdk_blob_list *snapshot_entry = NULL;
7130 	struct spdk_blob_list *clone_entry = NULL;
7131 	struct spdk_blob_list *snapshot_clone_entry = NULL;
7132 
7133 	if (bserrno) {
7134 		SPDK_ERRLOG("Failed to sync MD on blob\n");
7135 		ctx->bserrno = bserrno;
7136 		delete_snapshot_cleanup_clone(ctx, 0);
7137 		return;
7138 	}
7139 
7140 	/* Get snapshot entry for the snapshot we want to remove */
7141 	snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id);
7142 
7143 	assert(snapshot_entry != NULL);
7144 
7145 	/* Remove clone entry in this snapshot (at this point there can be only one clone) */
7146 	clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
7147 	assert(clone_entry != NULL);
7148 	TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
7149 	snapshot_entry->clone_count--;
7150 	assert(TAILQ_EMPTY(&snapshot_entry->clones));
7151 
7152 	switch (ctx->snapshot->parent_id) {
7153 	case SPDK_BLOBID_INVALID:
7154 	case SPDK_BLOBID_EXTERNAL_SNAPSHOT:
7155 		/* No parent snapshot - just remove clone entry */
7156 		free(clone_entry);
7157 		break;
7158 	default:
7159 		/* This snapshot is at the same time a clone of another snapshot - we need to
7160 		 * update parent snapshot (remove current clone, add new one inherited from
7161 		 * the snapshot that is being removed) */
7162 
7163 		/* Get snapshot entry for parent snapshot and clone entry within that snapshot for
7164 		 * snapshot that we are removing */
7165 		blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry,
7166 						    &snapshot_clone_entry);
7167 
7168 		/* Switch clone entry in parent snapshot */
7169 		TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link);
7170 		TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link);
7171 		free(snapshot_clone_entry);
7172 	}
7173 
7174 	/* Restore md_ro flags */
7175 	ctx->clone->md_ro = ctx->clone_md_ro;
7176 	ctx->snapshot->md_ro = ctx->snapshot_md_ro;
7177 
7178 	blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx);
7179 }
7180 
7181 static void
7182 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno)
7183 {
7184 	struct delete_snapshot_ctx *ctx = cb_arg;
7185 	uint64_t i;
7186 
7187 	ctx->snapshot->md_ro = false;
7188 
7189 	if (bserrno) {
7190 		SPDK_ERRLOG("Failed to sync MD on clone\n");
7191 		ctx->bserrno = bserrno;
7192 
7193 		/* Restore snapshot to previous state */
7194 		bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true);
7195 		if (bserrno != 0) {
7196 			delete_snapshot_cleanup_clone(ctx, bserrno);
7197 			return;
7198 		}
7199 
7200 		spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx);
7201 		return;
7202 	}
7203 
7204 	/* Clear cluster map entries for snapshot */
7205 	for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
7206 		if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) {
7207 			ctx->snapshot->active.clusters[i] = 0;
7208 		}
7209 	}
7210 	for (i = 0; i < ctx->snapshot->active.num_extent_pages &&
7211 	     i < ctx->clone->active.num_extent_pages; i++) {
7212 		if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) {
7213 			ctx->snapshot->active.extent_pages[i] = 0;
7214 		}
7215 	}
7216 
7217 	blob_set_thin_provision(ctx->snapshot);
7218 	ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY;
7219 
7220 	if (ctx->parent_snapshot_entry != NULL) {
7221 		ctx->snapshot->back_bs_dev = NULL;
7222 	}
7223 
7224 	spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx);
7225 }
7226 
7227 static void
7228 delete_snapshot_update_extent_pages_cpl(struct delete_snapshot_ctx *ctx)
7229 {
7230 	int bserrno;
7231 
7232 	/* Delete old backing bs_dev from clone (related to snapshot that will be removed) */
7233 	blob_back_bs_destroy(ctx->clone);
7234 
7235 	/* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */
7236 	if (ctx->snapshot->parent_id == SPDK_BLOBID_EXTERNAL_SNAPSHOT) {
7237 		bserrno = bs_snapshot_copy_xattr(ctx->clone, ctx->snapshot,
7238 						 BLOB_EXTERNAL_SNAPSHOT_ID);
7239 		if (bserrno != 0) {
7240 			ctx->bserrno = bserrno;
7241 
7242 			/* Restore snapshot to previous state */
7243 			bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true);
7244 			if (bserrno != 0) {
7245 				delete_snapshot_cleanup_clone(ctx, bserrno);
7246 				return;
7247 			}
7248 
7249 			spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx);
7250 			return;
7251 		}
7252 		ctx->clone->parent_id = SPDK_BLOBID_EXTERNAL_SNAPSHOT;
7253 		ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev;
7254 		/* Do not delete the external snapshot along with this snapshot */
7255 		ctx->snapshot->back_bs_dev = NULL;
7256 		ctx->clone->invalid_flags |= SPDK_BLOB_EXTERNAL_SNAPSHOT;
7257 	} else if (ctx->parent_snapshot_entry != NULL) {
7258 		/* ...to parent snapshot */
7259 		ctx->clone->parent_id = ctx->parent_snapshot_entry->id;
7260 		ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev;
7261 		blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id,
7262 			       sizeof(spdk_blob_id),
7263 			       true);
7264 	} else {
7265 		/* ...to blobid invalid and zeroes dev */
7266 		ctx->clone->parent_id = SPDK_BLOBID_INVALID;
7267 		ctx->clone->back_bs_dev = bs_create_zeroes_dev();
7268 		blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true);
7269 	}
7270 
7271 	spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx);
7272 }
7273 
7274 static void
7275 delete_snapshot_update_extent_pages(void *cb_arg, int bserrno)
7276 {
7277 	struct delete_snapshot_ctx *ctx = cb_arg;
7278 	uint32_t *extent_page;
7279 	uint64_t i;
7280 
7281 	for (i = ctx->next_extent_page; i < ctx->snapshot->active.num_extent_pages &&
7282 	     i < ctx->clone->active.num_extent_pages; i++) {
7283 		if (ctx->snapshot->active.extent_pages[i] == 0) {
7284 			/* No extent page to use from snapshot */
7285 			continue;
7286 		}
7287 
7288 		extent_page = &ctx->clone->active.extent_pages[i];
7289 		if (*extent_page == 0) {
7290 			/* Copy extent page from snapshot when clone did not have a matching one */
7291 			*extent_page = ctx->snapshot->active.extent_pages[i];
7292 			continue;
7293 		}
7294 
7295 		/* Clone and snapshot both contain partially filled matching extent pages.
7296 		 * Update the clone extent page in place with cluster map containing the mix of both. */
7297 		ctx->next_extent_page = i + 1;
7298 		memset(ctx->page, 0, SPDK_BS_PAGE_SIZE);
7299 
7300 		blob_write_extent_page(ctx->clone, *extent_page, i * SPDK_EXTENTS_PER_EP, ctx->page,
7301 				       delete_snapshot_update_extent_pages, ctx);
7302 		return;
7303 	}
7304 	delete_snapshot_update_extent_pages_cpl(ctx);
7305 }
7306 
7307 static void
7308 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno)
7309 {
7310 	struct delete_snapshot_ctx *ctx = cb_arg;
7311 	uint64_t i;
7312 
7313 	/* Temporarily override md_ro flag for clone for MD modification */
7314 	ctx->clone_md_ro = ctx->clone->md_ro;
7315 	ctx->clone->md_ro = false;
7316 
7317 	if (bserrno) {
7318 		SPDK_ERRLOG("Failed to sync MD with xattr on blob\n");
7319 		ctx->bserrno = bserrno;
7320 		delete_snapshot_cleanup_clone(ctx, 0);
7321 		return;
7322 	}
7323 
7324 	/* Copy snapshot map to clone map (only unallocated clusters in clone) */
7325 	for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
7326 		if (ctx->clone->active.clusters[i] == 0) {
7327 			ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i];
7328 		}
7329 	}
7330 	ctx->next_extent_page = 0;
7331 	delete_snapshot_update_extent_pages(ctx, 0);
7332 }
7333 
7334 static void
7335 delete_snapshot_esnap_channels_destroyed_cb(void *cb_arg, struct spdk_blob *blob, int bserrno)
7336 {
7337 	struct delete_snapshot_ctx *ctx = cb_arg;
7338 
7339 	if (bserrno != 0) {
7340 		SPDK_ERRLOG("blob 0x%" PRIx64 ": failed to destroy esnap channels: %d\n",
7341 			    blob->id, bserrno);
7342 		/* That error should not stop us from syncing metadata. */
7343 	}
7344 
7345 	spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx);
7346 }
7347 
7348 static void
7349 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno)
7350 {
7351 	struct delete_snapshot_ctx *ctx = cb_arg;
7352 
7353 	if (bserrno) {
7354 		SPDK_ERRLOG("Failed to freeze I/O on clone\n");
7355 		ctx->bserrno = bserrno;
7356 		delete_snapshot_cleanup_clone(ctx, 0);
7357 		return;
7358 	}
7359 
7360 	/* Temporarily override md_ro flag for snapshot for MD modification */
7361 	ctx->snapshot_md_ro = ctx->snapshot->md_ro;
7362 	ctx->snapshot->md_ro = false;
7363 
7364 	/* Mark blob as pending for removal for power failure safety, use clone id for recovery */
7365 	ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id,
7366 				      sizeof(spdk_blob_id), true);
7367 	if (ctx->bserrno != 0) {
7368 		delete_snapshot_cleanup_clone(ctx, 0);
7369 		return;
7370 	}
7371 
7372 	if (blob_is_esnap_clone(ctx->snapshot)) {
7373 		blob_esnap_destroy_bs_dev_channels(ctx->snapshot, false,
7374 						   delete_snapshot_esnap_channels_destroyed_cb,
7375 						   ctx);
7376 		return;
7377 	}
7378 
7379 	spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx);
7380 }
7381 
7382 static void
7383 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno)
7384 {
7385 	struct delete_snapshot_ctx *ctx = cb_arg;
7386 
7387 	if (bserrno) {
7388 		SPDK_ERRLOG("Failed to open clone\n");
7389 		ctx->bserrno = bserrno;
7390 		delete_snapshot_cleanup_snapshot(ctx, 0);
7391 		return;
7392 	}
7393 
7394 	ctx->clone = clone;
7395 
7396 	if (clone->locked_operation_in_progress) {
7397 		SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n");
7398 		ctx->bserrno = -EBUSY;
7399 		spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
7400 		return;
7401 	}
7402 
7403 	clone->locked_operation_in_progress = true;
7404 
7405 	blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx);
7406 }
7407 
7408 static void
7409 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx)
7410 {
7411 	struct spdk_blob_list *snapshot_entry = NULL;
7412 	struct spdk_blob_list *clone_entry = NULL;
7413 	struct spdk_blob_list *snapshot_clone_entry = NULL;
7414 
7415 	/* Get snapshot entry for the snapshot we want to remove */
7416 	snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id);
7417 
7418 	assert(snapshot_entry != NULL);
7419 
7420 	/* Get clone of the snapshot (at this point there can be only one clone) */
7421 	clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
7422 	assert(snapshot_entry->clone_count == 1);
7423 	assert(clone_entry != NULL);
7424 
7425 	/* Get snapshot entry for parent snapshot and clone entry within that snapshot for
7426 	 * snapshot that we are removing */
7427 	blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry,
7428 					    &snapshot_clone_entry);
7429 
7430 	spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx);
7431 }
7432 
7433 static void
7434 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno)
7435 {
7436 	spdk_bs_sequence_t *seq = cb_arg;
7437 	struct spdk_blob_list *snapshot_entry = NULL;
7438 	uint32_t page_num;
7439 
7440 	if (bserrno) {
7441 		SPDK_ERRLOG("Failed to remove blob\n");
7442 		bs_sequence_finish(seq, bserrno);
7443 		return;
7444 	}
7445 
7446 	/* Remove snapshot from the list */
7447 	snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
7448 	if (snapshot_entry != NULL) {
7449 		TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link);
7450 		free(snapshot_entry);
7451 	}
7452 
7453 	page_num = bs_blobid_to_page(blob->id);
7454 	spdk_bit_array_clear(blob->bs->used_blobids, page_num);
7455 	blob->state = SPDK_BLOB_STATE_DIRTY;
7456 	blob->active.num_pages = 0;
7457 	blob_resize(blob, 0);
7458 
7459 	blob_persist(seq, blob, bs_delete_persist_cpl, blob);
7460 }
7461 
7462 static int
7463 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone)
7464 {
7465 	struct spdk_blob_list *snapshot_entry = NULL;
7466 	struct spdk_blob_list *clone_entry = NULL;
7467 	struct spdk_blob *clone = NULL;
7468 	bool has_one_clone = false;
7469 
7470 	/* Check if this is a snapshot with clones */
7471 	snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
7472 	if (snapshot_entry != NULL) {
7473 		if (snapshot_entry->clone_count > 1) {
7474 			SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n");
7475 			return -EBUSY;
7476 		} else if (snapshot_entry->clone_count == 1) {
7477 			has_one_clone = true;
7478 		}
7479 	}
7480 
7481 	/* Check if someone has this blob open (besides this delete context):
7482 	 * - open_ref = 1 - only this context opened blob, so it is ok to remove it
7483 	 * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot
7484 	 *	and that is ok, because we will update it accordingly */
7485 	if (blob->open_ref <= 2 && has_one_clone) {
7486 		clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
7487 		assert(clone_entry != NULL);
7488 		clone = blob_lookup(blob->bs, clone_entry->id);
7489 
7490 		if (blob->open_ref == 2 && clone == NULL) {
7491 			/* Clone is closed and someone else opened this blob */
7492 			SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
7493 			return -EBUSY;
7494 		}
7495 
7496 		*update_clone = true;
7497 		return 0;
7498 	}
7499 
7500 	if (blob->open_ref > 1) {
7501 		SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
7502 		return -EBUSY;
7503 	}
7504 
7505 	assert(has_one_clone == false);
7506 	*update_clone = false;
7507 	return 0;
7508 }
7509 
7510 static void
7511 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno)
7512 {
7513 	spdk_bs_sequence_t *seq = cb_arg;
7514 
7515 	bs_sequence_finish(seq, -ENOMEM);
7516 }
7517 
7518 static void
7519 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno)
7520 {
7521 	spdk_bs_sequence_t *seq = cb_arg;
7522 	struct delete_snapshot_ctx *ctx;
7523 	bool update_clone = false;
7524 
7525 	if (bserrno != 0) {
7526 		bs_sequence_finish(seq, bserrno);
7527 		return;
7528 	}
7529 
7530 	blob_verify_md_op(blob);
7531 
7532 	ctx = calloc(1, sizeof(*ctx));
7533 	if (ctx == NULL) {
7534 		spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq);
7535 		return;
7536 	}
7537 
7538 	ctx->snapshot = blob;
7539 	ctx->cb_fn = bs_delete_blob_finish;
7540 	ctx->cb_arg = seq;
7541 
7542 	/* Check if blob can be removed and if it is a snapshot with clone on top of it */
7543 	ctx->bserrno = bs_is_blob_deletable(blob, &update_clone);
7544 	if (ctx->bserrno) {
7545 		spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
7546 		return;
7547 	}
7548 
7549 	if (blob->locked_operation_in_progress) {
7550 		SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n");
7551 		ctx->bserrno = -EBUSY;
7552 		spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
7553 		return;
7554 	}
7555 
7556 	blob->locked_operation_in_progress = true;
7557 
7558 	/*
7559 	 * Remove the blob from the blob_store list now, to ensure it does not
7560 	 *  get returned after this point by blob_lookup().
7561 	 */
7562 	spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
7563 	RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob);
7564 
7565 	if (update_clone) {
7566 		ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
7567 		if (!ctx->page) {
7568 			ctx->bserrno = -ENOMEM;
7569 			spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
7570 			return;
7571 		}
7572 		/* This blob is a snapshot with active clone - update clone first */
7573 		update_clone_on_snapshot_deletion(blob, ctx);
7574 	} else {
7575 		/* This blob does not have any clones - just remove it */
7576 		bs_blob_list_remove(blob);
7577 		bs_delete_blob_finish(seq, blob, 0);
7578 		free(ctx);
7579 	}
7580 }
7581 
7582 void
7583 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
7584 		    spdk_blob_op_complete cb_fn, void *cb_arg)
7585 {
7586 	struct spdk_bs_cpl	cpl;
7587 	spdk_bs_sequence_t	*seq;
7588 
7589 	SPDK_DEBUGLOG(blob, "Deleting blob 0x%" PRIx64 "\n", blobid);
7590 
7591 	assert(spdk_get_thread() == bs->md_thread);
7592 
7593 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
7594 	cpl.u.blob_basic.cb_fn = cb_fn;
7595 	cpl.u.blob_basic.cb_arg = cb_arg;
7596 
7597 	seq = bs_sequence_start_bs(bs->md_channel, &cpl);
7598 	if (!seq) {
7599 		cb_fn(cb_arg, -ENOMEM);
7600 		return;
7601 	}
7602 
7603 	spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq);
7604 }
7605 
7606 /* END spdk_bs_delete_blob */
7607 
7608 /* START spdk_bs_open_blob */
7609 
7610 static void
7611 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
7612 {
7613 	struct spdk_blob *blob = cb_arg;
7614 	struct spdk_blob *existing;
7615 
7616 	if (bserrno != 0) {
7617 		blob_free(blob);
7618 		seq->cpl.u.blob_handle.blob = NULL;
7619 		bs_sequence_finish(seq, bserrno);
7620 		return;
7621 	}
7622 
7623 	existing = blob_lookup(blob->bs, blob->id);
7624 	if (existing) {
7625 		blob_free(blob);
7626 		existing->open_ref++;
7627 		seq->cpl.u.blob_handle.blob = existing;
7628 		bs_sequence_finish(seq, 0);
7629 		return;
7630 	}
7631 
7632 	blob->open_ref++;
7633 
7634 	spdk_bit_array_set(blob->bs->open_blobids, blob->id);
7635 	RB_INSERT(spdk_blob_tree, &blob->bs->open_blobs, blob);
7636 
7637 	bs_sequence_finish(seq, bserrno);
7638 }
7639 
7640 static inline void
7641 blob_open_opts_copy(const struct spdk_blob_open_opts *src, struct spdk_blob_open_opts *dst)
7642 {
7643 #define FIELD_OK(field) \
7644         offsetof(struct spdk_blob_open_opts, field) + sizeof(src->field) <= src->opts_size
7645 
7646 #define SET_FIELD(field) \
7647         if (FIELD_OK(field)) { \
7648                 dst->field = src->field; \
7649         } \
7650 
7651 	SET_FIELD(clear_method);
7652 	SET_FIELD(esnap_ctx);
7653 
7654 	dst->opts_size = src->opts_size;
7655 
7656 	/* You should not remove this statement, but need to update the assert statement
7657 	 * if you add a new field, and also add a corresponding SET_FIELD statement */
7658 	SPDK_STATIC_ASSERT(sizeof(struct spdk_blob_open_opts) == 24, "Incorrect size");
7659 
7660 #undef FIELD_OK
7661 #undef SET_FIELD
7662 }
7663 
7664 static void
7665 bs_open_blob(struct spdk_blob_store *bs,
7666 	     spdk_blob_id blobid,
7667 	     struct spdk_blob_open_opts *opts,
7668 	     spdk_blob_op_with_handle_complete cb_fn,
7669 	     void *cb_arg)
7670 {
7671 	struct spdk_blob		*blob;
7672 	struct spdk_bs_cpl		cpl;
7673 	struct spdk_blob_open_opts	opts_local;
7674 	spdk_bs_sequence_t		*seq;
7675 	uint32_t			page_num;
7676 
7677 	SPDK_DEBUGLOG(blob, "Opening blob 0x%" PRIx64 "\n", blobid);
7678 	assert(spdk_get_thread() == bs->md_thread);
7679 
7680 	page_num = bs_blobid_to_page(blobid);
7681 	if (spdk_bit_array_get(bs->used_blobids, page_num) == false) {
7682 		/* Invalid blobid */
7683 		cb_fn(cb_arg, NULL, -ENOENT);
7684 		return;
7685 	}
7686 
7687 	blob = blob_lookup(bs, blobid);
7688 	if (blob) {
7689 		blob->open_ref++;
7690 		cb_fn(cb_arg, blob, 0);
7691 		return;
7692 	}
7693 
7694 	blob = blob_alloc(bs, blobid);
7695 	if (!blob) {
7696 		cb_fn(cb_arg, NULL, -ENOMEM);
7697 		return;
7698 	}
7699 
7700 	spdk_blob_open_opts_init(&opts_local, sizeof(opts_local));
7701 	if (opts) {
7702 		blob_open_opts_copy(opts, &opts_local);
7703 	}
7704 
7705 	blob->clear_method = opts_local.clear_method;
7706 
7707 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE;
7708 	cpl.u.blob_handle.cb_fn = cb_fn;
7709 	cpl.u.blob_handle.cb_arg = cb_arg;
7710 	cpl.u.blob_handle.blob = blob;
7711 	cpl.u.blob_handle.esnap_ctx = opts_local.esnap_ctx;
7712 
7713 	seq = bs_sequence_start_bs(bs->md_channel, &cpl);
7714 	if (!seq) {
7715 		blob_free(blob);
7716 		cb_fn(cb_arg, NULL, -ENOMEM);
7717 		return;
7718 	}
7719 
7720 	blob_load(seq, blob, bs_open_blob_cpl, blob);
7721 }
7722 
7723 void
7724 spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
7725 		  spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7726 {
7727 	bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg);
7728 }
7729 
7730 void
7731 spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid,
7732 		      struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7733 {
7734 	bs_open_blob(bs, blobid, opts, cb_fn, cb_arg);
7735 }
7736 
7737 /* END spdk_bs_open_blob */
7738 
7739 /* START spdk_blob_set_read_only */
7740 int
7741 spdk_blob_set_read_only(struct spdk_blob *blob)
7742 {
7743 	blob_verify_md_op(blob);
7744 
7745 	blob->data_ro_flags |= SPDK_BLOB_READ_ONLY;
7746 
7747 	blob->state = SPDK_BLOB_STATE_DIRTY;
7748 	return 0;
7749 }
7750 /* END spdk_blob_set_read_only */
7751 
7752 /* START spdk_blob_sync_md */
7753 
7754 static void
7755 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
7756 {
7757 	struct spdk_blob *blob = cb_arg;
7758 
7759 	if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
7760 		blob->data_ro = true;
7761 		blob->md_ro = true;
7762 	}
7763 
7764 	bs_sequence_finish(seq, bserrno);
7765 }
7766 
7767 static void
7768 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
7769 {
7770 	struct spdk_bs_cpl	cpl;
7771 	spdk_bs_sequence_t	*seq;
7772 
7773 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
7774 	cpl.u.blob_basic.cb_fn = cb_fn;
7775 	cpl.u.blob_basic.cb_arg = cb_arg;
7776 
7777 	seq = bs_sequence_start_bs(blob->bs->md_channel, &cpl);
7778 	if (!seq) {
7779 		cb_fn(cb_arg, -ENOMEM);
7780 		return;
7781 	}
7782 
7783 	blob_persist(seq, blob, blob_sync_md_cpl, blob);
7784 }
7785 
7786 void
7787 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
7788 {
7789 	blob_verify_md_op(blob);
7790 
7791 	SPDK_DEBUGLOG(blob, "Syncing blob 0x%" PRIx64 "\n", blob->id);
7792 
7793 	if (blob->md_ro) {
7794 		assert(blob->state == SPDK_BLOB_STATE_CLEAN);
7795 		cb_fn(cb_arg, 0);
7796 		return;
7797 	}
7798 
7799 	blob_sync_md(blob, cb_fn, cb_arg);
7800 }
7801 
7802 /* END spdk_blob_sync_md */
7803 
7804 struct spdk_blob_insert_cluster_ctx {
7805 	struct spdk_thread	*thread;
7806 	struct spdk_blob	*blob;
7807 	uint32_t		cluster_num;	/* cluster index in blob */
7808 	uint32_t		cluster;	/* cluster on disk */
7809 	uint32_t		extent_page;	/* extent page on disk */
7810 	struct spdk_blob_md_page *page; /* preallocated extent page */
7811 	int			rc;
7812 	spdk_blob_op_complete	cb_fn;
7813 	void			*cb_arg;
7814 };
7815 
7816 static void
7817 blob_insert_cluster_msg_cpl(void *arg)
7818 {
7819 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
7820 
7821 	ctx->cb_fn(ctx->cb_arg, ctx->rc);
7822 	free(ctx);
7823 }
7824 
7825 static void
7826 blob_insert_cluster_msg_cb(void *arg, int bserrno)
7827 {
7828 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
7829 
7830 	ctx->rc = bserrno;
7831 	spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx);
7832 }
7833 
7834 static void
7835 blob_insert_new_ep_cb(void *arg, int bserrno)
7836 {
7837 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
7838 	uint32_t *extent_page;
7839 
7840 	extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
7841 	*extent_page = ctx->extent_page;
7842 	ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
7843 	blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx);
7844 }
7845 
7846 struct spdk_blob_write_extent_page_ctx {
7847 	struct spdk_blob_store		*bs;
7848 
7849 	uint32_t			extent;
7850 	struct spdk_blob_md_page	*page;
7851 };
7852 
7853 static void
7854 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
7855 {
7856 	struct spdk_blob_write_extent_page_ctx *ctx = cb_arg;
7857 
7858 	free(ctx);
7859 	bs_sequence_finish(seq, bserrno);
7860 }
7861 
7862 static void
7863 blob_write_extent_page_ready(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
7864 {
7865 	struct spdk_blob_write_extent_page_ctx *ctx = cb_arg;
7866 
7867 	if (bserrno != 0) {
7868 		blob_persist_extent_page_cpl(seq, ctx, bserrno);
7869 		return;
7870 	}
7871 	bs_sequence_write_dev(seq, ctx->page, bs_md_page_to_lba(ctx->bs, ctx->extent),
7872 			      bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
7873 			      blob_persist_extent_page_cpl, ctx);
7874 }
7875 
7876 static void
7877 blob_write_extent_page(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
7878 		       struct spdk_blob_md_page *page, spdk_blob_op_complete cb_fn, void *cb_arg)
7879 {
7880 	struct spdk_blob_write_extent_page_ctx	*ctx;
7881 	spdk_bs_sequence_t			*seq;
7882 	struct spdk_bs_cpl			cpl;
7883 
7884 	ctx = calloc(1, sizeof(*ctx));
7885 	if (!ctx) {
7886 		cb_fn(cb_arg, -ENOMEM);
7887 		return;
7888 	}
7889 	ctx->bs = blob->bs;
7890 	ctx->extent = extent;
7891 	ctx->page = page;
7892 
7893 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
7894 	cpl.u.blob_basic.cb_fn = cb_fn;
7895 	cpl.u.blob_basic.cb_arg = cb_arg;
7896 
7897 	seq = bs_sequence_start_bs(blob->bs->md_channel, &cpl);
7898 	if (!seq) {
7899 		free(ctx);
7900 		cb_fn(cb_arg, -ENOMEM);
7901 		return;
7902 	}
7903 
7904 	assert(page);
7905 	page->next = SPDK_INVALID_MD_PAGE;
7906 	page->id = blob->id;
7907 	page->sequence_num = 0;
7908 
7909 	blob_serialize_extent_page(blob, cluster_num, page);
7910 
7911 	page->crc = blob_md_page_calc_crc(page);
7912 
7913 	assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true);
7914 
7915 	bs_mark_dirty(seq, blob->bs, blob_write_extent_page_ready, ctx);
7916 }
7917 
7918 static void
7919 blob_insert_cluster_msg(void *arg)
7920 {
7921 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
7922 	uint32_t *extent_page;
7923 
7924 	ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster);
7925 	if (ctx->rc != 0) {
7926 		spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx);
7927 		return;
7928 	}
7929 
7930 	if (ctx->blob->use_extent_table == false) {
7931 		/* Extent table is not used, proceed with sync of md that will only use extents_rle. */
7932 		ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
7933 		blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx);
7934 		return;
7935 	}
7936 
7937 	extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
7938 	if (*extent_page == 0) {
7939 		/* Extent page requires allocation.
7940 		 * It was already claimed in the used_md_pages map and placed in ctx. */
7941 		assert(ctx->extent_page != 0);
7942 		assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
7943 		blob_write_extent_page(ctx->blob, ctx->extent_page, ctx->cluster_num, ctx->page,
7944 				       blob_insert_new_ep_cb, ctx);
7945 	} else {
7946 		/* It is possible for original thread to allocate extent page for
7947 		 * different cluster in the same extent page. In such case proceed with
7948 		 * updating the existing extent page, but release the additional one. */
7949 		if (ctx->extent_page != 0) {
7950 			spdk_spin_lock(&ctx->blob->bs->used_lock);
7951 			assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
7952 			bs_release_md_page(ctx->blob->bs, ctx->extent_page);
7953 			spdk_spin_unlock(&ctx->blob->bs->used_lock);
7954 			ctx->extent_page = 0;
7955 		}
7956 		/* Extent page already allocated.
7957 		 * Every cluster allocation, requires just an update of single extent page. */
7958 		blob_write_extent_page(ctx->blob, *extent_page, ctx->cluster_num, ctx->page,
7959 				       blob_insert_cluster_msg_cb, ctx);
7960 	}
7961 }
7962 
7963 static void
7964 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
7965 				 uint64_t cluster, uint32_t extent_page, struct spdk_blob_md_page *page,
7966 				 spdk_blob_op_complete cb_fn, void *cb_arg)
7967 {
7968 	struct spdk_blob_insert_cluster_ctx *ctx;
7969 
7970 	ctx = calloc(1, sizeof(*ctx));
7971 	if (ctx == NULL) {
7972 		cb_fn(cb_arg, -ENOMEM);
7973 		return;
7974 	}
7975 
7976 	ctx->thread = spdk_get_thread();
7977 	ctx->blob = blob;
7978 	ctx->cluster_num = cluster_num;
7979 	ctx->cluster = cluster;
7980 	ctx->extent_page = extent_page;
7981 	ctx->page = page;
7982 	ctx->cb_fn = cb_fn;
7983 	ctx->cb_arg = cb_arg;
7984 
7985 	spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx);
7986 }
7987 
7988 /* START spdk_blob_close */
7989 
7990 static void
7991 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
7992 {
7993 	struct spdk_blob *blob = cb_arg;
7994 
7995 	if (bserrno == 0) {
7996 		blob->open_ref--;
7997 		if (blob->open_ref == 0) {
7998 			/*
7999 			 * Blobs with active.num_pages == 0 are deleted blobs.
8000 			 *  these blobs are removed from the blob_store list
8001 			 *  when the deletion process starts - so don't try to
8002 			 *  remove them again.
8003 			 */
8004 			if (blob->active.num_pages > 0) {
8005 				spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
8006 				RB_REMOVE(spdk_blob_tree, &blob->bs->open_blobs, blob);
8007 			}
8008 			blob_free(blob);
8009 		}
8010 	}
8011 
8012 	bs_sequence_finish(seq, bserrno);
8013 }
8014 
8015 static void
8016 blob_close_esnap_done(void *cb_arg, struct spdk_blob *blob, int bserrno)
8017 {
8018 	spdk_bs_sequence_t	*seq = cb_arg;
8019 
8020 	if (bserrno != 0) {
8021 		SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": close failed with error %d\n",
8022 			      blob->id, bserrno);
8023 		bs_sequence_finish(seq, bserrno);
8024 		return;
8025 	}
8026 
8027 	SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": closed, syncing metadata on thread %s\n",
8028 		      blob->id, spdk_thread_get_name(spdk_get_thread()));
8029 
8030 	/* Sync metadata */
8031 	blob_persist(seq, blob, blob_close_cpl, blob);
8032 }
8033 
8034 void
8035 spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
8036 {
8037 	struct spdk_bs_cpl	cpl;
8038 	spdk_bs_sequence_t	*seq;
8039 
8040 	blob_verify_md_op(blob);
8041 
8042 	SPDK_DEBUGLOG(blob, "Closing blob 0x%" PRIx64 "\n", blob->id);
8043 
8044 	if (blob->open_ref == 0) {
8045 		cb_fn(cb_arg, -EBADF);
8046 		return;
8047 	}
8048 
8049 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
8050 	cpl.u.blob_basic.cb_fn = cb_fn;
8051 	cpl.u.blob_basic.cb_arg = cb_arg;
8052 
8053 	seq = bs_sequence_start_bs(blob->bs->md_channel, &cpl);
8054 	if (!seq) {
8055 		cb_fn(cb_arg, -ENOMEM);
8056 		return;
8057 	}
8058 
8059 	if (blob->open_ref == 1 && blob_is_esnap_clone(blob)) {
8060 		blob_esnap_destroy_bs_dev_channels(blob, false, blob_close_esnap_done, seq);
8061 		return;
8062 	}
8063 
8064 	/* Sync metadata */
8065 	blob_persist(seq, blob, blob_close_cpl, blob);
8066 }
8067 
8068 /* END spdk_blob_close */
8069 
8070 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs)
8071 {
8072 	return spdk_get_io_channel(bs);
8073 }
8074 
8075 void
8076 spdk_bs_free_io_channel(struct spdk_io_channel *channel)
8077 {
8078 	blob_esnap_destroy_bs_channel(spdk_io_channel_get_ctx(channel));
8079 	spdk_put_io_channel(channel);
8080 }
8081 
8082 void
8083 spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel,
8084 		   uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
8085 {
8086 	blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
8087 			       SPDK_BLOB_UNMAP);
8088 }
8089 
8090 void
8091 spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel,
8092 			  uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
8093 {
8094 	blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
8095 			       SPDK_BLOB_WRITE_ZEROES);
8096 }
8097 
8098 void
8099 spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel,
8100 		   void *payload, uint64_t offset, uint64_t length,
8101 		   spdk_blob_op_complete cb_fn, void *cb_arg)
8102 {
8103 	blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
8104 			       SPDK_BLOB_WRITE);
8105 }
8106 
8107 void
8108 spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel,
8109 		  void *payload, uint64_t offset, uint64_t length,
8110 		  spdk_blob_op_complete cb_fn, void *cb_arg)
8111 {
8112 	blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
8113 			       SPDK_BLOB_READ);
8114 }
8115 
8116 void
8117 spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel,
8118 		    struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
8119 		    spdk_blob_op_complete cb_fn, void *cb_arg)
8120 {
8121 	blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false, NULL);
8122 }
8123 
8124 void
8125 spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel,
8126 		   struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
8127 		   spdk_blob_op_complete cb_fn, void *cb_arg)
8128 {
8129 	blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true, NULL);
8130 }
8131 
8132 void
8133 spdk_blob_io_writev_ext(struct spdk_blob *blob, struct spdk_io_channel *channel,
8134 			struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
8135 			spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts)
8136 {
8137 	blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false,
8138 				   io_opts);
8139 }
8140 
8141 void
8142 spdk_blob_io_readv_ext(struct spdk_blob *blob, struct spdk_io_channel *channel,
8143 		       struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
8144 		       spdk_blob_op_complete cb_fn, void *cb_arg, struct spdk_blob_ext_io_opts *io_opts)
8145 {
8146 	blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true,
8147 				   io_opts);
8148 }
8149 
8150 struct spdk_bs_iter_ctx {
8151 	int64_t page_num;
8152 	struct spdk_blob_store *bs;
8153 
8154 	spdk_blob_op_with_handle_complete cb_fn;
8155 	void *cb_arg;
8156 };
8157 
8158 static void
8159 bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
8160 {
8161 	struct spdk_bs_iter_ctx *ctx = cb_arg;
8162 	struct spdk_blob_store *bs = ctx->bs;
8163 	spdk_blob_id id;
8164 
8165 	if (bserrno == 0) {
8166 		ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
8167 		free(ctx);
8168 		return;
8169 	}
8170 
8171 	ctx->page_num++;
8172 	ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
8173 	if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
8174 		ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
8175 		free(ctx);
8176 		return;
8177 	}
8178 
8179 	id = bs_page_to_blobid(ctx->page_num);
8180 
8181 	spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx);
8182 }
8183 
8184 void
8185 spdk_bs_iter_first(struct spdk_blob_store *bs,
8186 		   spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
8187 {
8188 	struct spdk_bs_iter_ctx *ctx;
8189 
8190 	ctx = calloc(1, sizeof(*ctx));
8191 	if (!ctx) {
8192 		cb_fn(cb_arg, NULL, -ENOMEM);
8193 		return;
8194 	}
8195 
8196 	ctx->page_num = -1;
8197 	ctx->bs = bs;
8198 	ctx->cb_fn = cb_fn;
8199 	ctx->cb_arg = cb_arg;
8200 
8201 	bs_iter_cpl(ctx, NULL, -1);
8202 }
8203 
8204 static void
8205 bs_iter_close_cpl(void *cb_arg, int bserrno)
8206 {
8207 	struct spdk_bs_iter_ctx *ctx = cb_arg;
8208 
8209 	bs_iter_cpl(ctx, NULL, -1);
8210 }
8211 
8212 void
8213 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
8214 		  spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
8215 {
8216 	struct spdk_bs_iter_ctx *ctx;
8217 
8218 	assert(blob != NULL);
8219 
8220 	ctx = calloc(1, sizeof(*ctx));
8221 	if (!ctx) {
8222 		cb_fn(cb_arg, NULL, -ENOMEM);
8223 		return;
8224 	}
8225 
8226 	ctx->page_num = bs_blobid_to_page(blob->id);
8227 	ctx->bs = bs;
8228 	ctx->cb_fn = cb_fn;
8229 	ctx->cb_arg = cb_arg;
8230 
8231 	/* Close the existing blob */
8232 	spdk_blob_close(blob, bs_iter_close_cpl, ctx);
8233 }
8234 
8235 static int
8236 blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
8237 	       uint16_t value_len, bool internal)
8238 {
8239 	struct spdk_xattr_tailq *xattrs;
8240 	struct spdk_xattr	*xattr;
8241 	size_t			desc_size;
8242 	void			*tmp;
8243 
8244 	blob_verify_md_op(blob);
8245 
8246 	if (blob->md_ro) {
8247 		return -EPERM;
8248 	}
8249 
8250 	desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
8251 	if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
8252 		SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fix into single page %zu\n", name,
8253 			      desc_size, SPDK_BS_MAX_DESC_SIZE);
8254 		return -ENOMEM;
8255 	}
8256 
8257 	if (internal) {
8258 		xattrs = &blob->xattrs_internal;
8259 		blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
8260 	} else {
8261 		xattrs = &blob->xattrs;
8262 	}
8263 
8264 	TAILQ_FOREACH(xattr, xattrs, link) {
8265 		if (!strcmp(name, xattr->name)) {
8266 			tmp = malloc(value_len);
8267 			if (!tmp) {
8268 				return -ENOMEM;
8269 			}
8270 
8271 			free(xattr->value);
8272 			xattr->value_len = value_len;
8273 			xattr->value = tmp;
8274 			memcpy(xattr->value, value, value_len);
8275 
8276 			blob->state = SPDK_BLOB_STATE_DIRTY;
8277 
8278 			return 0;
8279 		}
8280 	}
8281 
8282 	xattr = calloc(1, sizeof(*xattr));
8283 	if (!xattr) {
8284 		return -ENOMEM;
8285 	}
8286 
8287 	xattr->name = strdup(name);
8288 	if (!xattr->name) {
8289 		free(xattr);
8290 		return -ENOMEM;
8291 	}
8292 
8293 	xattr->value_len = value_len;
8294 	xattr->value = malloc(value_len);
8295 	if (!xattr->value) {
8296 		free(xattr->name);
8297 		free(xattr);
8298 		return -ENOMEM;
8299 	}
8300 	memcpy(xattr->value, value, value_len);
8301 	TAILQ_INSERT_TAIL(xattrs, xattr, link);
8302 
8303 	blob->state = SPDK_BLOB_STATE_DIRTY;
8304 
8305 	return 0;
8306 }
8307 
8308 int
8309 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
8310 		    uint16_t value_len)
8311 {
8312 	return blob_set_xattr(blob, name, value, value_len, false);
8313 }
8314 
8315 static int
8316 blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal)
8317 {
8318 	struct spdk_xattr_tailq *xattrs;
8319 	struct spdk_xattr	*xattr;
8320 
8321 	blob_verify_md_op(blob);
8322 
8323 	if (blob->md_ro) {
8324 		return -EPERM;
8325 	}
8326 	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
8327 
8328 	TAILQ_FOREACH(xattr, xattrs, link) {
8329 		if (!strcmp(name, xattr->name)) {
8330 			TAILQ_REMOVE(xattrs, xattr, link);
8331 			free(xattr->value);
8332 			free(xattr->name);
8333 			free(xattr);
8334 
8335 			if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) {
8336 				blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR;
8337 			}
8338 			blob->state = SPDK_BLOB_STATE_DIRTY;
8339 
8340 			return 0;
8341 		}
8342 	}
8343 
8344 	return -ENOENT;
8345 }
8346 
8347 int
8348 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name)
8349 {
8350 	return blob_remove_xattr(blob, name, false);
8351 }
8352 
8353 static int
8354 blob_get_xattr_value(struct spdk_blob *blob, const char *name,
8355 		     const void **value, size_t *value_len, bool internal)
8356 {
8357 	struct spdk_xattr	*xattr;
8358 	struct spdk_xattr_tailq *xattrs;
8359 
8360 	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
8361 
8362 	TAILQ_FOREACH(xattr, xattrs, link) {
8363 		if (!strcmp(name, xattr->name)) {
8364 			*value = xattr->value;
8365 			*value_len = xattr->value_len;
8366 			return 0;
8367 		}
8368 	}
8369 	return -ENOENT;
8370 }
8371 
8372 int
8373 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
8374 			  const void **value, size_t *value_len)
8375 {
8376 	blob_verify_md_op(blob);
8377 
8378 	return blob_get_xattr_value(blob, name, value, value_len, false);
8379 }
8380 
8381 struct spdk_xattr_names {
8382 	uint32_t	count;
8383 	const char	*names[0];
8384 };
8385 
8386 static int
8387 blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names)
8388 {
8389 	struct spdk_xattr	*xattr;
8390 	int			count = 0;
8391 
8392 	TAILQ_FOREACH(xattr, xattrs, link) {
8393 		count++;
8394 	}
8395 
8396 	*names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *));
8397 	if (*names == NULL) {
8398 		return -ENOMEM;
8399 	}
8400 
8401 	TAILQ_FOREACH(xattr, xattrs, link) {
8402 		(*names)->names[(*names)->count++] = xattr->name;
8403 	}
8404 
8405 	return 0;
8406 }
8407 
8408 int
8409 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names)
8410 {
8411 	blob_verify_md_op(blob);
8412 
8413 	return blob_get_xattr_names(&blob->xattrs, names);
8414 }
8415 
8416 uint32_t
8417 spdk_xattr_names_get_count(struct spdk_xattr_names *names)
8418 {
8419 	assert(names != NULL);
8420 
8421 	return names->count;
8422 }
8423 
8424 const char *
8425 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index)
8426 {
8427 	if (index >= names->count) {
8428 		return NULL;
8429 	}
8430 
8431 	return names->names[index];
8432 }
8433 
8434 void
8435 spdk_xattr_names_free(struct spdk_xattr_names *names)
8436 {
8437 	free(names);
8438 }
8439 
8440 struct spdk_bs_type
8441 spdk_bs_get_bstype(struct spdk_blob_store *bs)
8442 {
8443 	return bs->bstype;
8444 }
8445 
8446 void
8447 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype)
8448 {
8449 	memcpy(&bs->bstype, &bstype, sizeof(bstype));
8450 }
8451 
8452 bool
8453 spdk_blob_is_read_only(struct spdk_blob *blob)
8454 {
8455 	assert(blob != NULL);
8456 	return (blob->data_ro || blob->md_ro);
8457 }
8458 
8459 bool
8460 spdk_blob_is_snapshot(struct spdk_blob *blob)
8461 {
8462 	struct spdk_blob_list *snapshot_entry;
8463 
8464 	assert(blob != NULL);
8465 
8466 	snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
8467 	if (snapshot_entry == NULL) {
8468 		return false;
8469 	}
8470 
8471 	return true;
8472 }
8473 
8474 bool
8475 spdk_blob_is_clone(struct spdk_blob *blob)
8476 {
8477 	assert(blob != NULL);
8478 
8479 	if (blob->parent_id != SPDK_BLOBID_INVALID) {
8480 		assert(spdk_blob_is_thin_provisioned(blob));
8481 		return true;
8482 	}
8483 
8484 	return false;
8485 }
8486 
8487 bool
8488 spdk_blob_is_thin_provisioned(struct spdk_blob *blob)
8489 {
8490 	assert(blob != NULL);
8491 	return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV);
8492 }
8493 
8494 bool
8495 spdk_blob_is_esnap_clone(const struct spdk_blob *blob)
8496 {
8497 	return blob_is_esnap_clone(blob);
8498 }
8499 
8500 static void
8501 blob_update_clear_method(struct spdk_blob *blob)
8502 {
8503 	enum blob_clear_method stored_cm;
8504 
8505 	assert(blob != NULL);
8506 
8507 	/* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored
8508 	 * in metadata previously.  If something other than the default was
8509 	 * specified, ignore stored value and used what was passed in.
8510 	 */
8511 	stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT);
8512 
8513 	if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) {
8514 		blob->clear_method = stored_cm;
8515 	} else if (blob->clear_method != stored_cm) {
8516 		SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n",
8517 			     blob->clear_method, stored_cm);
8518 	}
8519 }
8520 
8521 spdk_blob_id
8522 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id)
8523 {
8524 	struct spdk_blob_list *snapshot_entry = NULL;
8525 	struct spdk_blob_list *clone_entry = NULL;
8526 
8527 	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
8528 		TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
8529 			if (clone_entry->id == blob_id) {
8530 				return snapshot_entry->id;
8531 			}
8532 		}
8533 	}
8534 
8535 	return SPDK_BLOBID_INVALID;
8536 }
8537 
8538 int
8539 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids,
8540 		     size_t *count)
8541 {
8542 	struct spdk_blob_list *snapshot_entry, *clone_entry;
8543 	size_t n;
8544 
8545 	snapshot_entry = bs_get_snapshot_entry(bs, blobid);
8546 	if (snapshot_entry == NULL) {
8547 		*count = 0;
8548 		return 0;
8549 	}
8550 
8551 	if (ids == NULL || *count < snapshot_entry->clone_count) {
8552 		*count = snapshot_entry->clone_count;
8553 		return -ENOMEM;
8554 	}
8555 	*count = snapshot_entry->clone_count;
8556 
8557 	n = 0;
8558 	TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
8559 		ids[n++] = clone_entry->id;
8560 	}
8561 
8562 	return 0;
8563 }
8564 
8565 static void
8566 bs_load_grow_continue(struct spdk_bs_load_ctx *ctx)
8567 {
8568 	int rc;
8569 
8570 	if (ctx->super->size == 0) {
8571 		ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8572 	}
8573 
8574 	if (ctx->super->io_unit_size == 0) {
8575 		ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
8576 	}
8577 
8578 	/* Parse the super block */
8579 	ctx->bs->clean = 1;
8580 	ctx->bs->cluster_sz = ctx->super->cluster_size;
8581 	ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
8582 	ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE;
8583 	if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) {
8584 		ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster);
8585 	}
8586 	ctx->bs->io_unit_size = ctx->super->io_unit_size;
8587 	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
8588 	if (rc < 0) {
8589 		bs_load_ctx_fail(ctx, -ENOMEM);
8590 		return;
8591 	}
8592 	ctx->bs->md_start = ctx->super->md_start;
8593 	ctx->bs->md_len = ctx->super->md_len;
8594 	rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->bs->md_len);
8595 	if (rc < 0) {
8596 		bs_load_ctx_fail(ctx, -ENOMEM);
8597 		return;
8598 	}
8599 
8600 	ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
8601 					       ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
8602 	ctx->bs->super_blob = ctx->super->super_blob;
8603 	memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
8604 
8605 	if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) {
8606 		SPDK_ERRLOG("Can not grow an unclean blobstore, please load it normally to clean it.\n");
8607 		bs_load_ctx_fail(ctx, -EIO);
8608 		return;
8609 	} else {
8610 		bs_load_read_used_pages(ctx);
8611 	}
8612 }
8613 
8614 static void
8615 bs_load_grow_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8616 {
8617 	struct spdk_bs_load_ctx	*ctx = cb_arg;
8618 
8619 	if (bserrno != 0) {
8620 		bs_load_ctx_fail(ctx, bserrno);
8621 		return;
8622 	}
8623 	bs_load_grow_continue(ctx);
8624 }
8625 
8626 static void
8627 bs_load_grow_used_clusters_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8628 {
8629 	struct spdk_bs_load_ctx	*ctx = cb_arg;
8630 
8631 	if (bserrno != 0) {
8632 		bs_load_ctx_fail(ctx, bserrno);
8633 		return;
8634 	}
8635 
8636 	spdk_free(ctx->mask);
8637 
8638 	bs_sequence_write_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
8639 			      bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
8640 			      bs_load_grow_super_write_cpl, ctx);
8641 }
8642 
8643 static void
8644 bs_load_grow_used_clusters_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8645 {
8646 	struct spdk_bs_load_ctx *ctx = cb_arg;
8647 	uint64_t		lba, lba_count;
8648 	uint64_t		dev_size;
8649 	uint64_t		total_clusters;
8650 
8651 	if (bserrno != 0) {
8652 		bs_load_ctx_fail(ctx, bserrno);
8653 		return;
8654 	}
8655 
8656 	/* The type must be correct */
8657 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
8658 	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
8659 	assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
8660 					     struct spdk_blob_md_page) * 8));
8661 	dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8662 	total_clusters = dev_size / ctx->super->cluster_size;
8663 	ctx->mask->length = total_clusters;
8664 
8665 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
8666 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
8667 	bs_sequence_write_dev(ctx->seq, ctx->mask, lba, lba_count,
8668 			      bs_load_grow_used_clusters_write_cpl, ctx);
8669 }
8670 
8671 static void
8672 bs_load_try_to_grow(struct spdk_bs_load_ctx *ctx)
8673 {
8674 	uint64_t dev_size, total_clusters, used_cluster_mask_len, max_used_cluster_mask;
8675 	uint64_t lba, lba_count, mask_size;
8676 
8677 	dev_size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
8678 	total_clusters = dev_size / ctx->super->cluster_size;
8679 	used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
8680 				spdk_divide_round_up(total_clusters, 8),
8681 				SPDK_BS_PAGE_SIZE);
8682 	max_used_cluster_mask = ctx->super->used_blobid_mask_start - ctx->super->used_cluster_mask_start;
8683 	/* No necessary to grow or no space to grow */
8684 	if (ctx->super->size >= dev_size || used_cluster_mask_len > max_used_cluster_mask) {
8685 		SPDK_DEBUGLOG(blob, "No grow\n");
8686 		bs_load_grow_continue(ctx);
8687 		return;
8688 	}
8689 
8690 	SPDK_DEBUGLOG(blob, "Resize blobstore\n");
8691 
8692 	ctx->super->size = dev_size;
8693 	ctx->super->used_cluster_mask_len = used_cluster_mask_len;
8694 	ctx->super->crc = blob_md_page_calc_crc(ctx->super);
8695 
8696 	mask_size = used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
8697 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
8698 				 SPDK_MALLOC_DMA);
8699 	if (!ctx->mask) {
8700 		bs_load_ctx_fail(ctx, -ENOMEM);
8701 		return;
8702 	}
8703 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
8704 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
8705 	bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
8706 			     bs_load_grow_used_clusters_read_cpl, ctx);
8707 }
8708 
8709 static void
8710 bs_grow_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
8711 {
8712 	struct spdk_bs_load_ctx *ctx = cb_arg;
8713 	uint32_t	crc;
8714 	static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
8715 
8716 	if (ctx->super->version > SPDK_BS_VERSION ||
8717 	    ctx->super->version < SPDK_BS_INITIAL_VERSION) {
8718 		bs_load_ctx_fail(ctx, -EILSEQ);
8719 		return;
8720 	}
8721 
8722 	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
8723 		   sizeof(ctx->super->signature)) != 0) {
8724 		bs_load_ctx_fail(ctx, -EILSEQ);
8725 		return;
8726 	}
8727 
8728 	crc = blob_md_page_calc_crc(ctx->super);
8729 	if (crc != ctx->super->crc) {
8730 		bs_load_ctx_fail(ctx, -EILSEQ);
8731 		return;
8732 	}
8733 
8734 	if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
8735 		SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n");
8736 	} else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
8737 		SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless bstype\n");
8738 	} else {
8739 		SPDK_DEBUGLOG(blob, "Unexpected bstype\n");
8740 		SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
8741 		SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
8742 		bs_load_ctx_fail(ctx, -ENXIO);
8743 		return;
8744 	}
8745 
8746 	if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
8747 		SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
8748 			       ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
8749 		bs_load_ctx_fail(ctx, -EILSEQ);
8750 		return;
8751 	}
8752 
8753 	bs_load_try_to_grow(ctx);
8754 
8755 }
8756 
8757 void
8758 spdk_bs_grow(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
8759 	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
8760 {
8761 	struct spdk_blob_store	*bs;
8762 	struct spdk_bs_cpl	cpl;
8763 	struct spdk_bs_load_ctx *ctx;
8764 	struct spdk_bs_opts	opts = {};
8765 	int err;
8766 
8767 	SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);
8768 
8769 	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
8770 		SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
8771 		dev->destroy(dev);
8772 		cb_fn(cb_arg, NULL, -EINVAL);
8773 		return;
8774 	}
8775 
8776 	spdk_bs_opts_init(&opts, sizeof(opts));
8777 	if (o) {
8778 		if (bs_opts_copy(o, &opts)) {
8779 			return;
8780 		}
8781 	}
8782 
8783 	if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
8784 		dev->destroy(dev);
8785 		cb_fn(cb_arg, NULL, -EINVAL);
8786 		return;
8787 	}
8788 
8789 	err = bs_alloc(dev, &opts, &bs, &ctx);
8790 	if (err) {
8791 		dev->destroy(dev);
8792 		cb_fn(cb_arg, NULL, err);
8793 		return;
8794 	}
8795 
8796 	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
8797 	cpl.u.bs_handle.cb_fn = cb_fn;
8798 	cpl.u.bs_handle.cb_arg = cb_arg;
8799 	cpl.u.bs_handle.bs = bs;
8800 
8801 	ctx->seq = bs_sequence_start_bs(bs->md_channel, &cpl);
8802 	if (!ctx->seq) {
8803 		spdk_free(ctx->super);
8804 		free(ctx);
8805 		bs_free(bs);
8806 		cb_fn(cb_arg, NULL, -ENOMEM);
8807 		return;
8808 	}
8809 
8810 	/* Read the super block */
8811 	bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
8812 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
8813 			     bs_grow_load_super_cpl, ctx);
8814 }
8815 
8816 int
8817 spdk_blob_get_esnap_id(struct spdk_blob *blob, const void **id, size_t *len)
8818 {
8819 	if (!blob_is_esnap_clone(blob)) {
8820 		return -EINVAL;
8821 	}
8822 
8823 	return blob_get_xattr_value(blob, BLOB_EXTERNAL_SNAPSHOT_ID, id, len, true);
8824 }
8825 
8826 struct spdk_io_channel *
8827 blob_esnap_get_io_channel(struct spdk_io_channel *ch, struct spdk_blob *blob)
8828 {
8829 	struct spdk_bs_channel		*bs_channel = spdk_io_channel_get_ctx(ch);
8830 	struct spdk_bs_dev		*bs_dev = blob->back_bs_dev;
8831 	struct blob_esnap_channel	find = {};
8832 	struct blob_esnap_channel	*esnap_channel, *existing;
8833 
8834 	find.blob_id = blob->id;
8835 	esnap_channel = RB_FIND(blob_esnap_channel_tree, &bs_channel->esnap_channels, &find);
8836 	if (spdk_likely(esnap_channel != NULL)) {
8837 		SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": using cached channel on thread %s\n",
8838 			      blob->id, spdk_thread_get_name(spdk_get_thread()));
8839 		return esnap_channel->channel;
8840 	}
8841 
8842 	SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": allocating channel on thread %s\n",
8843 		      blob->id, spdk_thread_get_name(spdk_get_thread()));
8844 
8845 	esnap_channel = calloc(1, sizeof(*esnap_channel));
8846 	if (esnap_channel == NULL) {
8847 		SPDK_NOTICELOG("blob 0x%" PRIx64 " channel allocation failed: no memory\n",
8848 			       find.blob_id);
8849 		return NULL;
8850 	}
8851 	esnap_channel->channel = bs_dev->create_channel(bs_dev);
8852 	if (esnap_channel->channel == NULL) {
8853 		SPDK_NOTICELOG("blob 0x%" PRIx64 " back channel allocation failed\n", blob->id);
8854 		free(esnap_channel);
8855 		return NULL;
8856 	}
8857 	esnap_channel->blob_id = find.blob_id;
8858 	existing = RB_INSERT(blob_esnap_channel_tree, &bs_channel->esnap_channels, esnap_channel);
8859 	if (spdk_unlikely(existing != NULL)) {
8860 		/*
8861 		 * This should be unreachable: all modifications to this tree happen on this thread.
8862 		 */
8863 		SPDK_ERRLOG("blob 0x%" PRIx64 "lost race to allocate a channel\n", find.blob_id);
8864 		assert(false);
8865 
8866 		bs_dev->destroy_channel(bs_dev, esnap_channel->channel);
8867 		free(esnap_channel);
8868 
8869 		return existing->channel;
8870 	}
8871 
8872 	return esnap_channel->channel;
8873 }
8874 
8875 static int
8876 blob_esnap_channel_compare(struct blob_esnap_channel *c1, struct blob_esnap_channel *c2)
8877 {
8878 	return (c1->blob_id < c2->blob_id ? -1 : c1->blob_id > c2->blob_id);
8879 }
8880 
8881 struct blob_esnap_destroy_ctx {
8882 	spdk_blob_op_with_handle_complete	cb_fn;
8883 	void					*cb_arg;
8884 	struct spdk_blob			*blob;
8885 	struct spdk_bs_dev			*back_bs_dev;
8886 	bool					abort_io;
8887 };
8888 
8889 static void
8890 blob_esnap_destroy_channels_done(struct spdk_io_channel_iter *i, int status)
8891 {
8892 	struct blob_esnap_destroy_ctx	*ctx = spdk_io_channel_iter_get_ctx(i);
8893 	struct spdk_blob		*blob = ctx->blob;
8894 	struct spdk_blob_store		*bs = blob->bs;
8895 
8896 	SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": done destroying channels for this blob\n",
8897 		      blob->id);
8898 
8899 	if (ctx->cb_fn != NULL) {
8900 		ctx->cb_fn(ctx->cb_arg, blob, status);
8901 	}
8902 	free(ctx);
8903 
8904 	bs->esnap_channels_unloading--;
8905 	if (bs->esnap_channels_unloading == 0 && bs->esnap_unload_cb_fn != NULL) {
8906 		spdk_bs_unload(bs, bs->esnap_unload_cb_fn, bs->esnap_unload_cb_arg);
8907 	}
8908 }
8909 
8910 static void
8911 blob_esnap_destroy_one_channel(struct spdk_io_channel_iter *i)
8912 {
8913 	struct blob_esnap_destroy_ctx	*ctx = spdk_io_channel_iter_get_ctx(i);
8914 	struct spdk_blob		*blob = ctx->blob;
8915 	struct spdk_bs_dev		*bs_dev = ctx->back_bs_dev;
8916 	struct spdk_io_channel		*channel = spdk_io_channel_iter_get_channel(i);
8917 	struct spdk_bs_channel		*bs_channel = spdk_io_channel_get_ctx(channel);
8918 	struct blob_esnap_channel	*esnap_channel;
8919 	struct blob_esnap_channel	find = {};
8920 
8921 	assert(spdk_get_thread() == spdk_io_channel_get_thread(channel));
8922 
8923 	find.blob_id = blob->id;
8924 	esnap_channel = RB_FIND(blob_esnap_channel_tree, &bs_channel->esnap_channels, &find);
8925 	if (esnap_channel != NULL) {
8926 		SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": destroying channel on thread %s\n",
8927 			      blob->id, spdk_thread_get_name(spdk_get_thread()));
8928 		RB_REMOVE(blob_esnap_channel_tree, &bs_channel->esnap_channels, esnap_channel);
8929 
8930 		if (ctx->abort_io) {
8931 			spdk_bs_user_op_t *op, *tmp;
8932 
8933 			TAILQ_FOREACH_SAFE(op, &bs_channel->queued_io, link, tmp) {
8934 				if (op->back_channel == esnap_channel->channel) {
8935 					TAILQ_REMOVE(&bs_channel->queued_io, op, link);
8936 					bs_user_op_abort(op, -EIO);
8937 				}
8938 			}
8939 		}
8940 
8941 		bs_dev->destroy_channel(bs_dev, esnap_channel->channel);
8942 		free(esnap_channel);
8943 	}
8944 
8945 	spdk_for_each_channel_continue(i, 0);
8946 }
8947 
8948 /*
8949  * Destroy the channels for a specific blob on each thread with a blobstore channel. This should be
8950  * used when closing an esnap clone blob and after decoupling from the parent.
8951  */
8952 static void
8953 blob_esnap_destroy_bs_dev_channels(struct spdk_blob *blob, bool abort_io,
8954 				   spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
8955 {
8956 	struct blob_esnap_destroy_ctx	*ctx;
8957 
8958 	if (!blob_is_esnap_clone(blob) || blob->back_bs_dev == NULL) {
8959 		if (cb_fn != NULL) {
8960 			cb_fn(cb_arg, blob, 0);
8961 		}
8962 		return;
8963 	}
8964 
8965 	ctx = calloc(1, sizeof(*ctx));
8966 	if (ctx == NULL) {
8967 		if (cb_fn != NULL) {
8968 			cb_fn(cb_arg, blob, -ENOMEM);
8969 		}
8970 		return;
8971 	}
8972 	ctx->cb_fn = cb_fn;
8973 	ctx->cb_arg = cb_arg;
8974 	ctx->blob = blob;
8975 	ctx->back_bs_dev = blob->back_bs_dev;
8976 	ctx->abort_io = abort_io;
8977 
8978 	SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64 ": destroying channels for this blob\n",
8979 		      blob->id);
8980 
8981 	blob->bs->esnap_channels_unloading++;
8982 	spdk_for_each_channel(blob->bs, blob_esnap_destroy_one_channel, ctx,
8983 			      blob_esnap_destroy_channels_done);
8984 }
8985 
8986 /*
8987  * Destroy all bs_dev channels on a specific blobstore channel. This should be used when a
8988  * bs_channel is destroyed.
8989  */
8990 static void
8991 blob_esnap_destroy_bs_channel(struct spdk_bs_channel *ch)
8992 {
8993 	struct blob_esnap_channel *esnap_channel, *esnap_channel_tmp;
8994 
8995 	assert(spdk_get_thread() == spdk_io_channel_get_thread(spdk_io_channel_from_ctx(ch)));
8996 
8997 	SPDK_DEBUGLOG(blob_esnap, "destroying channels on thread %s\n",
8998 		      spdk_thread_get_name(spdk_get_thread()));
8999 	RB_FOREACH_SAFE(esnap_channel, blob_esnap_channel_tree, &ch->esnap_channels,
9000 			esnap_channel_tmp) {
9001 		SPDK_DEBUGLOG(blob_esnap, "blob 0x%" PRIx64
9002 			      ": destroying one channel in thread %s\n",
9003 			      esnap_channel->blob_id, spdk_thread_get_name(spdk_get_thread()));
9004 		RB_REMOVE(blob_esnap_channel_tree, &ch->esnap_channels, esnap_channel);
9005 		spdk_put_io_channel(esnap_channel->channel);
9006 		free(esnap_channel);
9007 	}
9008 	SPDK_DEBUGLOG(blob_esnap, "done destroying channels on thread %s\n",
9009 		      spdk_thread_get_name(spdk_get_thread()));
9010 }
9011 
9012 struct set_bs_dev_ctx {
9013 	struct spdk_blob	*blob;
9014 	struct spdk_bs_dev	*back_bs_dev;
9015 	spdk_blob_op_complete	cb_fn;
9016 	void			*cb_arg;
9017 	int			bserrno;
9018 };
9019 
9020 static void
9021 blob_set_back_bs_dev_done(void *_ctx, int bserrno)
9022 {
9023 	struct set_bs_dev_ctx	*ctx = _ctx;
9024 
9025 	if (bserrno != 0) {
9026 		/* Even though the unfreeze failed, the update may have succeed. */
9027 		SPDK_ERRLOG("blob 0x%" PRIx64 ": unfreeze failed with error %d\n", ctx->blob->id,
9028 			    bserrno);
9029 	}
9030 	ctx->cb_fn(ctx->cb_arg, ctx->bserrno);
9031 	free(ctx);
9032 }
9033 
9034 static void
9035 blob_frozen_set_back_bs_dev(void *_ctx, struct spdk_blob *blob, int bserrno)
9036 {
9037 	struct set_bs_dev_ctx	*ctx = _ctx;
9038 
9039 	if (bserrno != 0) {
9040 		SPDK_ERRLOG("blob 0x%" PRIx64 ": failed to release old back_bs_dev with error %d\n",
9041 			    blob->id, bserrno);
9042 		ctx->bserrno = bserrno;
9043 		blob_unfreeze_io(blob, blob_set_back_bs_dev_done, ctx);
9044 		return;
9045 	}
9046 
9047 	if (blob->back_bs_dev != NULL) {
9048 		blob->back_bs_dev->destroy(blob->back_bs_dev);
9049 	}
9050 
9051 	SPDK_NOTICELOG("blob 0x%" PRIx64 ": hotplugged back_bs_dev\n", blob->id);
9052 	blob->back_bs_dev = ctx->back_bs_dev;
9053 	ctx->bserrno = 0;
9054 
9055 	blob_unfreeze_io(blob, blob_set_back_bs_dev_done, ctx);
9056 }
9057 
9058 static void
9059 blob_frozen_destroy_esnap_channels(void *_ctx, int bserrno)
9060 {
9061 	struct set_bs_dev_ctx	*ctx = _ctx;
9062 	struct spdk_blob	*blob = ctx->blob;
9063 
9064 	if (bserrno != 0) {
9065 		SPDK_ERRLOG("blob 0x%" PRIx64 ": failed to freeze with error %d\n", blob->id,
9066 			    bserrno);
9067 		ctx->cb_fn(ctx->cb_arg, bserrno);
9068 		free(ctx);
9069 		return;
9070 	}
9071 
9072 	/*
9073 	 * This does not prevent future reads from the esnap device because any future IO will
9074 	 * lazily create a new esnap IO channel.
9075 	 */
9076 	blob_esnap_destroy_bs_dev_channels(blob, true, blob_frozen_set_back_bs_dev, ctx);
9077 }
9078 
9079 void
9080 spdk_blob_set_esnap_bs_dev(struct spdk_blob *blob, struct spdk_bs_dev *back_bs_dev,
9081 			   spdk_blob_op_complete cb_fn, void *cb_arg)
9082 {
9083 	struct set_bs_dev_ctx	*ctx;
9084 
9085 	if (!blob_is_esnap_clone(blob)) {
9086 		SPDK_ERRLOG("blob 0x%" PRIx64 ": not an esnap clone\n", blob->id);
9087 		cb_fn(cb_arg, -EINVAL);
9088 		return;
9089 	}
9090 
9091 	ctx = calloc(1, sizeof(*ctx));
9092 	if (ctx == NULL) {
9093 		SPDK_ERRLOG("blob 0x%" PRIx64 ": out of memory while setting back_bs_dev\n",
9094 			    blob->id);
9095 		cb_fn(cb_arg, -ENOMEM);
9096 		return;
9097 	}
9098 	ctx->cb_fn = cb_fn;
9099 	ctx->cb_arg = cb_arg;
9100 	ctx->back_bs_dev = back_bs_dev;
9101 	ctx->blob = blob;
9102 	blob_freeze_io(blob, blob_frozen_destroy_esnap_channels, ctx);
9103 }
9104 
9105 SPDK_LOG_REGISTER_COMPONENT(blob)
9106 SPDK_LOG_REGISTER_COMPONENT(blob_esnap)
9107