xref: /spdk/lib/blob/blobstore.c (revision 3de9887d6d0ef46706c3f8807e6ffbeb73107510)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/blob.h"
37 #include "spdk/crc32.h"
38 #include "spdk/env.h"
39 #include "spdk/queue.h"
40 #include "spdk/thread.h"
41 #include "spdk/bit_array.h"
42 #include "spdk/bit_pool.h"
43 #include "spdk/likely.h"
44 #include "spdk/util.h"
45 #include "spdk/string.h"
46 
47 #include "spdk_internal/assert.h"
48 #include "spdk/log.h"
49 
50 #include "blobstore.h"
51 
52 #define BLOB_CRC32C_INITIAL    0xffffffffUL
53 
54 static int bs_register_md_thread(struct spdk_blob_store *bs);
55 static int bs_unregister_md_thread(struct spdk_blob_store *bs);
56 static void blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
57 static void blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
58 		uint64_t cluster, uint32_t extent, spdk_blob_op_complete cb_fn, void *cb_arg);
59 
60 static int blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
61 			  uint16_t value_len, bool internal);
62 static int blob_get_xattr_value(struct spdk_blob *blob, const char *name,
63 				const void **value, size_t *value_len, bool internal);
64 static int blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal);
65 
66 static void blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
67 			       spdk_blob_op_complete cb_fn, void *cb_arg);
68 
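/* Sanity-check helper: metadata-mutating operations must run on the blobstore's
 * metadata thread and never against a blob that is still loading.
 */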
69 static void
70 blob_verify_md_op(struct spdk_blob *blob)
71 {
72 	assert(blob != NULL);
73 	assert(spdk_get_thread() == blob->bs->md_thread);
74 	assert(blob->state != SPDK_BLOB_STATE_LOADING);
75 }
76 
77 static struct spdk_blob_list *
78 bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid)
79 {
80 	struct spdk_blob_list *snapshot_entry = NULL;
81 
82 	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
83 		if (snapshot_entry->id == blobid) {
84 			break;
85 		}
86 	}
87 
88 	return snapshot_entry;
89 }
90 
91 static void
92 bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page)
93 {
94 	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
95 	assert(spdk_bit_array_get(bs->used_md_pages, page) == false);
96 
97 	spdk_bit_array_set(bs->used_md_pages, page);
98 }
99 
100 static void
101 bs_release_md_page(struct spdk_blob_store *bs, uint32_t page)
102 {
103 	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
104 	assert(spdk_bit_array_get(bs->used_md_pages, page) == true);
105 
106 	spdk_bit_array_clear(bs->used_md_pages, page);
107 }
108 
109 static uint32_t
110 bs_claim_cluster(struct spdk_blob_store *bs)
111 {
112 	uint32_t cluster_num;
113 
114 	cluster_num = spdk_bit_pool_allocate_bit(bs->used_clusters);
115 	if (cluster_num == UINT32_MAX) {
116 		return UINT32_MAX;
117 	}
118 
119 	SPDK_DEBUGLOG(blob, "Claiming cluster %u\n", cluster_num);
120 	bs->num_free_clusters--;
121 
122 	return cluster_num;
123 }
124 
125 static void
126 bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
127 {
128 	assert(cluster_num < spdk_bit_pool_capacity(bs->used_clusters));
129 	assert(spdk_bit_pool_is_allocated(bs->used_clusters, cluster_num) == true);
130 	assert(bs->num_free_clusters < bs->total_clusters);
131 
132 	SPDK_DEBUGLOG(blob, "Releasing cluster %u\n", cluster_num);
133 
134 	spdk_bit_pool_free_bit(bs->used_clusters, cluster_num);
135 	bs->num_free_clusters++;
136 }
137 
138 static int
139 blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster)
140 {
141 	uint64_t *cluster_lba = &blob->active.clusters[cluster_num];
142 
143 	blob_verify_md_op(blob);
144 
145 	if (*cluster_lba != 0) {
146 		return -EEXIST;
147 	}
148 
149 	*cluster_lba = bs_cluster_to_lba(blob->bs, cluster);
150 	return 0;
151 }
152 
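/* Claim a free cluster from the blobstore for 'cluster_num' of the blob.
 * When the extent table is in use and no extent page is allocated for that
 * cluster yet, this also claims the lowest free metadata page so the new
 * extent page can be persisted later. If update_map is true, the in-memory
 * cluster map is updated immediately.
 */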
153 static int
154 bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num,
155 		    uint64_t *cluster, uint32_t *lowest_free_md_page, bool update_map)
156 {
157 	uint32_t *extent_page = 0;
158 
159 	*cluster = bs_claim_cluster(blob->bs);
160 	if (*cluster == UINT32_MAX) {
161 		/* No more free clusters. Cannot satisfy the request */
162 		return -ENOSPC;
163 	}
164 
165 	if (blob->use_extent_table) {
166 		extent_page = bs_cluster_to_extent_page(blob, cluster_num);
167 		if (*extent_page == 0) {
168 			/* An extent page never occupies md page 0 (0 marks an unallocated extent page), so start the search from 1 */
169 			if (*lowest_free_md_page == 0) {
170 				*lowest_free_md_page = 1;
171 			}
172 			/* No extent_page is allocated for the cluster */
173 			*lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages,
174 					       *lowest_free_md_page);
175 			if (*lowest_free_md_page == UINT32_MAX) {
176 				/* No more free md pages. Cannot satisfy the request */
177 				bs_release_cluster(blob->bs, *cluster);
178 				return -ENOSPC;
179 			}
180 			bs_claim_md_page(blob->bs, *lowest_free_md_page);
181 		}
182 	}
183 
184 	SPDK_DEBUGLOG(blob, "Claiming cluster %" PRIu64 " for blob %" PRIu64 "\n", *cluster, blob->id);
185 
186 	if (update_map) {
187 		blob_insert_cluster(blob, cluster_num, *cluster);
188 		if (blob->use_extent_table && *extent_page == 0) {
189 			*extent_page = *lowest_free_md_page;
190 		}
191 	}
192 
193 	return 0;
194 }
195 
196 static void
197 blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs)
198 {
199 	xattrs->count = 0;
200 	xattrs->names = NULL;
201 	xattrs->ctx = NULL;
202 	xattrs->get_value = NULL;
203 }
204 
205 void
206 spdk_blob_opts_init(struct spdk_blob_opts *opts)
207 {
208 	opts->num_clusters = 0;
209 	opts->thin_provision = false;
210 	opts->clear_method = BLOB_CLEAR_WITH_DEFAULT;
211 	blob_xattrs_init(&opts->xattrs);
212 	opts->use_extent_table = true;
213 }
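/* Illustrative sketch (not part of the original source): a typical caller
 * initializes the options on the stack, overrides selected fields and then
 * creates the blob, e.g.:
 *
 *     struct spdk_blob_opts opts;
 *
 *     spdk_blob_opts_init(&opts);
 *     opts.thin_provision = true;
 *     opts.num_clusters = 16;
 *     spdk_bs_create_blob_ext(bs, &opts, create_cb, cb_arg);
 *
 * The blobstore handle, callback and context ('bs', 'create_cb', 'cb_arg')
 * are assumed to be supplied by the caller.
 */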
214 
215 void
216 spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts)
217 {
218 	opts->clear_method = BLOB_CLEAR_WITH_DEFAULT;
219 }
220 
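/* Allocate and initialize the in-memory representation of a blob. The blob
 * starts in the DIRTY state with a single metadata page derived from its id;
 * on-disk state is filled in later by blob_load()/blob_parse().
 */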
221 static struct spdk_blob *
222 blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
223 {
224 	struct spdk_blob *blob;
225 
226 	blob = calloc(1, sizeof(*blob));
227 	if (!blob) {
228 		return NULL;
229 	}
230 
231 	blob->id = id;
232 	blob->bs = bs;
233 
234 	blob->parent_id = SPDK_BLOBID_INVALID;
235 
236 	blob->state = SPDK_BLOB_STATE_DIRTY;
237 	blob->extent_rle_found = false;
238 	blob->extent_table_found = false;
239 	blob->active.num_pages = 1;
240 	blob->active.pages = calloc(1, sizeof(*blob->active.pages));
241 	if (!blob->active.pages) {
242 		free(blob);
243 		return NULL;
244 	}
245 
246 	blob->active.pages[0] = bs_blobid_to_page(id);
247 
248 	TAILQ_INIT(&blob->xattrs);
249 	TAILQ_INIT(&blob->xattrs_internal);
250 	TAILQ_INIT(&blob->pending_persists);
251 
252 	return blob;
253 }
254 
255 static void
256 xattrs_free(struct spdk_xattr_tailq *xattrs)
257 {
258 	struct spdk_xattr	*xattr, *xattr_tmp;
259 
260 	TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
261 		TAILQ_REMOVE(xattrs, xattr, link);
262 		free(xattr->name);
263 		free(xattr->value);
264 		free(xattr);
265 	}
266 }
267 
268 static void
269 blob_free(struct spdk_blob *blob)
270 {
271 	assert(blob != NULL);
272 	assert(TAILQ_EMPTY(&blob->pending_persists));
273 
274 	free(blob->active.extent_pages);
275 	free(blob->clean.extent_pages);
276 	free(blob->active.clusters);
277 	free(blob->clean.clusters);
278 	free(blob->active.pages);
279 	free(blob->clean.pages);
280 
281 	xattrs_free(&blob->xattrs);
282 	xattrs_free(&blob->xattrs_internal);
283 
284 	if (blob->back_bs_dev) {
285 		blob->back_bs_dev->destroy(blob->back_bs_dev);
286 	}
287 
288 	free(blob);
289 }
290 
291 struct freeze_io_ctx {
292 	struct spdk_bs_cpl cpl;
293 	struct spdk_blob *blob;
294 };
295 
296 static void
297 blob_io_sync(struct spdk_io_channel_iter *i)
298 {
299 	spdk_for_each_channel_continue(i, 0);
300 }
301 
302 static void
303 blob_execute_queued_io(struct spdk_io_channel_iter *i)
304 {
305 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
306 	struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
307 	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
308 	struct spdk_bs_request_set	*set;
309 	struct spdk_bs_user_op_args	*args;
310 	spdk_bs_user_op_t *op, *tmp;
311 
312 	TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
313 		set = (struct spdk_bs_request_set *)op;
314 		args = &set->u.user_op;
315 
316 		if (args->blob == ctx->blob) {
317 			TAILQ_REMOVE(&ch->queued_io, op, link);
318 			bs_user_op_execute(op);
319 		}
320 	}
321 
322 	spdk_for_each_channel_continue(i, 0);
323 }
324 
325 static void
326 blob_io_cpl(struct spdk_io_channel_iter *i, int status)
327 {
328 	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
329 
330 	ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);
331 
332 	free(ctx);
333 }
334 
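/* Freeze I/O on the blob. The frozen refcount is incremented and, on the
 * first freeze, an spdk_for_each_channel() pass is used as a barrier so that
 * every channel has observed the frozen state before the callback runs.
 */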
335 static void
336 blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
337 {
338 	struct freeze_io_ctx *ctx;
339 
340 	ctx = calloc(1, sizeof(*ctx));
341 	if (!ctx) {
342 		cb_fn(cb_arg, -ENOMEM);
343 		return;
344 	}
345 
346 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
347 	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
348 	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
349 	ctx->blob = blob;
350 
351 	/* Freeze I/O on blob */
352 	blob->frozen_refcnt++;
353 
354 	if (blob->frozen_refcnt == 1) {
355 		spdk_for_each_channel(blob->bs, blob_io_sync, ctx, blob_io_cpl);
356 	} else {
357 		cb_fn(cb_arg, 0);
358 		free(ctx);
359 	}
360 }
361 
362 static void
363 blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
364 {
365 	struct freeze_io_ctx *ctx;
366 
367 	ctx = calloc(1, sizeof(*ctx));
368 	if (!ctx) {
369 		cb_fn(cb_arg, -ENOMEM);
370 		return;
371 	}
372 
373 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
374 	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
375 	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
376 	ctx->blob = blob;
377 
378 	assert(blob->frozen_refcnt > 0);
379 
380 	blob->frozen_refcnt--;
381 
382 	if (blob->frozen_refcnt == 0) {
383 		spdk_for_each_channel(blob->bs, blob_execute_queued_io, ctx, blob_io_cpl);
384 	} else {
385 		cb_fn(cb_arg, 0);
386 		free(ctx);
387 	}
388 }
389 
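/* Move the 'active' metadata arrays into 'clean' and give 'active' fresh
 * copies of the same contents. The previous 'clean' arrays are freed and,
 * if the blob was LOADING, it is marked CLEAN.
 */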
390 static int
391 blob_mark_clean(struct spdk_blob *blob)
392 {
393 	uint32_t *extent_pages = NULL;
394 	uint64_t *clusters = NULL;
395 	uint32_t *pages = NULL;
396 
397 	assert(blob != NULL);
398 
399 	if (blob->active.num_extent_pages) {
400 		assert(blob->active.extent_pages);
401 		extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages));
402 		if (!extent_pages) {
403 			return -ENOMEM;
404 		}
405 		memcpy(extent_pages, blob->active.extent_pages,
406 		       blob->active.num_extent_pages * sizeof(*extent_pages));
407 	}
408 
409 	if (blob->active.num_clusters) {
410 		assert(blob->active.clusters);
411 		clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters));
412 		if (!clusters) {
413 			free(extent_pages);
414 			return -ENOMEM;
415 		}
416 		memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
417 	}
418 
419 	if (blob->active.num_pages) {
420 		assert(blob->active.pages);
421 		pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages));
422 		if (!pages) {
423 			free(extent_pages);
424 			free(clusters);
425 			return -ENOMEM;
426 		}
427 		memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
428 	}
429 
430 	free(blob->clean.extent_pages);
431 	free(blob->clean.clusters);
432 	free(blob->clean.pages);
433 
434 	blob->clean.num_extent_pages = blob->active.num_extent_pages;
435 	blob->clean.extent_pages = blob->active.extent_pages;
436 	blob->clean.num_clusters = blob->active.num_clusters;
437 	blob->clean.clusters = blob->active.clusters;
438 	blob->clean.num_pages = blob->active.num_pages;
439 	blob->clean.pages = blob->active.pages;
440 
441 	blob->active.extent_pages = extent_pages;
442 	blob->active.clusters = clusters;
443 	blob->active.pages = pages;
444 
445 	/* If the metadata was dirtied again while the metadata was being written to disk,
446 	 *  we do not want to revert the DIRTY state back to CLEAN here.
447 	 */
448 	if (blob->state == SPDK_BLOB_STATE_LOADING) {
449 		blob->state = SPDK_BLOB_STATE_CLEAN;
450 	}
451 
452 	return 0;
453 }
454 
455 static int
456 blob_deserialize_xattr(struct spdk_blob *blob,
457 		       struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal)
458 {
459 	struct spdk_xattr                       *xattr;
460 
461 	if (desc_xattr->length != sizeof(desc_xattr->name_length) +
462 	    sizeof(desc_xattr->value_length) +
463 	    desc_xattr->name_length + desc_xattr->value_length) {
464 		return -EINVAL;
465 	}
466 
467 	xattr = calloc(1, sizeof(*xattr));
468 	if (xattr == NULL) {
469 		return -ENOMEM;
470 	}
471 
472 	xattr->name = malloc(desc_xattr->name_length + 1);
473 	if (xattr->name == NULL) {
474 		free(xattr);
475 		return -ENOMEM;
476 	}
477 	memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length);
478 	xattr->name[desc_xattr->name_length] = '\0';
479 
480 	xattr->value = malloc(desc_xattr->value_length);
481 	if (xattr->value == NULL) {
482 		free(xattr->name);
483 		free(xattr);
484 		return -ENOMEM;
485 	}
486 	xattr->value_len = desc_xattr->value_length;
487 	memcpy(xattr->value,
488 	       (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
489 	       desc_xattr->value_length);
490 
491 	TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link);
492 
493 	return 0;
494 }
495 
496 
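/* Walk the descriptors stored in a single metadata page and apply them to the
 * in-memory blob: flags, extent RLE runs, the extent table, extent pages and
 * xattrs. Unknown descriptor types are skipped so that newer on-disk formats
 * do not fail to load here.
 */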
497 static int
498 blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob)
499 {
500 	struct spdk_blob_md_descriptor *desc;
501 	size_t	cur_desc = 0;
502 	void *tmp;
503 
504 	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
505 	while (cur_desc < sizeof(page->descriptors)) {
506 		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
507 			if (desc->length == 0) {
508 				/* If padding and length are 0, this terminates the page */
509 				break;
510 			}
511 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
512 			struct spdk_blob_md_descriptor_flags	*desc_flags;
513 
514 			desc_flags = (struct spdk_blob_md_descriptor_flags *)desc;
515 
516 			if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) {
517 				return -EINVAL;
518 			}
519 
520 			if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) !=
521 			    SPDK_BLOB_INVALID_FLAGS_MASK) {
522 				return -EINVAL;
523 			}
524 
525 			if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) !=
526 			    SPDK_BLOB_DATA_RO_FLAGS_MASK) {
527 				blob->data_ro = true;
528 				blob->md_ro = true;
529 			}
530 
531 			if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) !=
532 			    SPDK_BLOB_MD_RO_FLAGS_MASK) {
533 				blob->md_ro = true;
534 			}
535 
536 			if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
537 				blob->data_ro = true;
538 				blob->md_ro = true;
539 			}
540 
541 			blob->invalid_flags = desc_flags->invalid_flags;
542 			blob->data_ro_flags = desc_flags->data_ro_flags;
543 			blob->md_ro_flags = desc_flags->md_ro_flags;
544 
545 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
546 			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
547 			unsigned int				i, j;
548 			unsigned int				cluster_count = blob->active.num_clusters;
549 
550 			if (blob->extent_table_found) {
551 				/* Extent Table already present in the md;
552 				 * both descriptor types should never be present at the same time. */
553 				return -EINVAL;
554 			}
555 			blob->extent_rle_found = true;
556 
557 			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
558 
559 			if (desc_extent_rle->length == 0 ||
560 			    (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) {
561 				return -EINVAL;
562 			}
563 
564 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
565 				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
566 					if (desc_extent_rle->extents[i].cluster_idx != 0) {
567 						if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters,
568 										desc_extent_rle->extents[i].cluster_idx + j)) {
569 							return -EINVAL;
570 						}
571 					}
572 					cluster_count++;
573 				}
574 			}
575 
576 			if (cluster_count == 0) {
577 				return -EINVAL;
578 			}
579 			tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
580 			if (tmp == NULL) {
581 				return -ENOMEM;
582 			}
583 			blob->active.clusters = tmp;
584 			blob->active.cluster_array_size = cluster_count;
585 
586 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
587 				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
588 					if (desc_extent_rle->extents[i].cluster_idx != 0) {
589 						blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
590 								desc_extent_rle->extents[i].cluster_idx + j);
591 					} else if (spdk_blob_is_thin_provisioned(blob)) {
592 						blob->active.clusters[blob->active.num_clusters++] = 0;
593 					} else {
594 						return -EINVAL;
595 					}
596 				}
597 			}
598 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
599 			struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
600 			uint32_t num_extent_pages = blob->active.num_extent_pages;
601 			uint32_t i, j;
602 			size_t extent_pages_length;
603 
604 			desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
605 			extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
606 
607 			if (blob->extent_rle_found) {
608 				/* This means that Extent RLE is present in MD;
609 				 * both should never be present at the same time. */
610 				return -EINVAL;
611 			} else if (blob->extent_table_found &&
612 				   desc_extent_table->num_clusters != blob->remaining_clusters_in_et) {
613 				/* Number of clusters in this ET does not match number
614 				 * from previously read EXTENT_TABLE. */
615 				return -EINVAL;
616 			}
617 
618 			blob->extent_table_found = true;
619 
620 			if (desc_extent_table->length == 0 ||
621 			    (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
622 				return -EINVAL;
623 			}
624 
625 			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
626 				num_extent_pages += desc_extent_table->extent_page[i].num_pages;
627 			}
628 
629 			tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t));
630 			if (tmp == NULL) {
631 				return -ENOMEM;
632 			}
633 			blob->active.extent_pages = tmp;
634 			blob->active.extent_pages_array_size = num_extent_pages;
635 
636 			blob->remaining_clusters_in_et = desc_extent_table->num_clusters;
637 
638 			/* Extent table entries contain md page numbers for extent pages.
639 			 * Zeroes represent unallocated extent pages; those are run-length encoded.
640 			 */
641 			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
642 				if (desc_extent_table->extent_page[i].page_idx != 0) {
643 					assert(desc_extent_table->extent_page[i].num_pages == 1);
644 					blob->active.extent_pages[blob->active.num_extent_pages++] =
645 						desc_extent_table->extent_page[i].page_idx;
646 				} else if (spdk_blob_is_thin_provisioned(blob)) {
647 					for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) {
648 						blob->active.extent_pages[blob->active.num_extent_pages++] = 0;
649 					}
650 				} else {
651 					return -EINVAL;
652 				}
653 			}
654 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
655 			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
656 			unsigned int					i;
657 			unsigned int					cluster_count = 0;
658 			size_t						cluster_idx_length;
659 
660 			if (blob->extent_rle_found) {
661 				/* This means that Extent RLE is present in MD;
662 				 * both should never be present at the same time. */
663 				return -EINVAL;
664 			}
665 
666 			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
667 			cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
668 
669 			if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
670 			    (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
671 				return -EINVAL;
672 			}
673 
674 			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
675 				if (desc_extent->cluster_idx[i] != 0) {
676 					if (!spdk_bit_pool_is_allocated(blob->bs->used_clusters, desc_extent->cluster_idx[i])) {
677 						return -EINVAL;
678 					}
679 				}
680 				cluster_count++;
681 			}
682 
683 			if (cluster_count == 0) {
684 				return -EINVAL;
685 			}
686 
687 			/* When reading extent pages sequentially, the starting cluster idx should match
688 			 * the current size of the blob.
689 			 * If this is changed to batch reading, this check shall be removed. */
690 			if (desc_extent->start_cluster_idx != blob->active.num_clusters) {
691 				return -EINVAL;
692 			}
693 
694 			tmp = realloc(blob->active.clusters,
695 				      (cluster_count + blob->active.num_clusters) * sizeof(*blob->active.clusters));
696 			if (tmp == NULL) {
697 				return -ENOMEM;
698 			}
699 			blob->active.clusters = tmp;
700 			blob->active.cluster_array_size = (cluster_count + blob->active.num_clusters);
701 
702 			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
703 				if (desc_extent->cluster_idx[i] != 0) {
704 					blob->active.clusters[blob->active.num_clusters++] = bs_cluster_to_lba(blob->bs,
705 							desc_extent->cluster_idx[i]);
706 				} else if (spdk_blob_is_thin_provisioned(blob)) {
707 					blob->active.clusters[blob->active.num_clusters++] = 0;
708 				} else {
709 					return -EINVAL;
710 				}
711 			}
712 			assert(desc_extent->start_cluster_idx + cluster_count == blob->active.num_clusters);
713 			assert(blob->remaining_clusters_in_et >= cluster_count);
714 			blob->remaining_clusters_in_et -= cluster_count;
715 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
716 			int rc;
717 
718 			rc = blob_deserialize_xattr(blob,
719 						    (struct spdk_blob_md_descriptor_xattr *) desc, false);
720 			if (rc != 0) {
721 				return rc;
722 			}
723 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
724 			int rc;
725 
726 			rc = blob_deserialize_xattr(blob,
727 						    (struct spdk_blob_md_descriptor_xattr *) desc, true);
728 			if (rc != 0) {
729 				return rc;
730 			}
731 		} else {
732 			/* Unrecognized descriptor type.  Do not fail - just continue to the
733 			 *  next descriptor.  If this descriptor is associated with some feature
734 			 *  defined in a newer version of blobstore, that version of blobstore
735 			 *  should create and set an associated feature flag to specify if this
736 			 *  blob can be loaded or not.
737 			 */
738 		}
739 
740 		/* Advance to the next descriptor */
741 		cur_desc += sizeof(*desc) + desc->length;
742 		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
743 			break;
744 		}
745 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
746 	}
747 
748 	return 0;
749 }
750 
751 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page);
752 
753 static int
754 blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob)
755 {
756 	assert(blob != NULL);
757 	assert(blob->state == SPDK_BLOB_STATE_LOADING);
758 
759 	if (bs_load_cur_extent_page_valid(extent_page) == false) {
760 		return -ENOENT;
761 	}
762 
763 	return blob_parse_page(extent_page, blob);
764 }
765 
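/* Parse a complete chain of metadata pages for a blob. The pages must start
 * with sequence number 0 and belong to the blob id being loaded.
 */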
766 static int
767 blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
768 	   struct spdk_blob *blob)
769 {
770 	const struct spdk_blob_md_page *page;
771 	uint32_t i;
772 	int rc;
773 	void *tmp;
774 
775 	assert(page_count > 0);
776 	assert(pages[0].sequence_num == 0);
777 	assert(blob != NULL);
778 	assert(blob->state == SPDK_BLOB_STATE_LOADING);
779 	assert(blob->active.clusters == NULL);
780 
781 	/* The blobid provided doesn't match what's in the MD, this can
782 	 * happen for example if a bogus blobid is passed in through open.
783 	 */
784 	if (blob->id != pages[0].id) {
785 		SPDK_ERRLOG("Blobid (%" PRIu64 ") doesn't match what's in metadata (%" PRIu64 ")\n",
786 			    blob->id, pages[0].id);
787 		return -ENOENT;
788 	}
789 
790 	tmp = realloc(blob->active.pages, page_count * sizeof(*blob->active.pages));
791 	if (!tmp) {
792 		return -ENOMEM;
793 	}
794 	blob->active.pages = tmp;
795 
796 	blob->active.pages[0] = pages[0].id;
797 
798 	for (i = 1; i < page_count; i++) {
799 		assert(spdk_bit_array_get(blob->bs->used_md_pages, pages[i - 1].next));
800 		blob->active.pages[i] = pages[i - 1].next;
801 	}
802 	blob->active.num_pages = page_count;
803 
804 	for (i = 0; i < page_count; i++) {
805 		page = &pages[i];
806 
807 		assert(page->id == blob->id);
808 		assert(page->sequence_num == i);
809 
810 		rc = blob_parse_page(page, blob);
811 		if (rc != 0) {
812 			return rc;
813 		}
814 	}
815 
816 	return 0;
817 }
818 
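/* Append a zeroed metadata page to the serialization buffer, growing the DMA
 * allocation as needed, and initialize its id, sequence number and next pointer.
 */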
819 static int
820 blob_serialize_add_page(const struct spdk_blob *blob,
821 			struct spdk_blob_md_page **pages,
822 			uint32_t *page_count,
823 			struct spdk_blob_md_page **last_page)
824 {
825 	struct spdk_blob_md_page *page;
826 
827 	assert(pages != NULL);
828 	assert(page_count != NULL);
829 
830 	if (*page_count == 0) {
831 		assert(*pages == NULL);
832 		*page_count = 1;
833 		*pages = spdk_malloc(SPDK_BS_PAGE_SIZE, 0,
834 				     NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
835 	} else {
836 		assert(*pages != NULL);
837 		(*page_count)++;
838 		*pages = spdk_realloc(*pages, SPDK_BS_PAGE_SIZE * (*page_count), 0);
839 	}
840 
841 	if (*pages == NULL) {
842 		*page_count = 0;
843 		*last_page = NULL;
844 		return -ENOMEM;
845 	}
846 
847 	page = &(*pages)[*page_count - 1];
848 	memset(page, 0, sizeof(*page));
849 	page->id = blob->id;
850 	page->sequence_num = *page_count - 1;
851 	page->next = SPDK_INVALID_MD_PAGE;
852 	*last_page = page;
853 
854 	return 0;
855 }
856 
857 /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor.
858  * Update required_sz on both success and failure.
859  */
861 static int
862 blob_serialize_xattr(const struct spdk_xattr *xattr,
863 		     uint8_t *buf, size_t buf_sz,
864 		     size_t *required_sz, bool internal)
865 {
866 	struct spdk_blob_md_descriptor_xattr	*desc;
867 
868 	*required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) +
869 		       strlen(xattr->name) +
870 		       xattr->value_len;
871 
872 	if (buf_sz < *required_sz) {
873 		return -1;
874 	}
875 
876 	desc = (struct spdk_blob_md_descriptor_xattr *)buf;
877 
878 	desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR;
879 	desc->length = sizeof(desc->name_length) +
880 		       sizeof(desc->value_length) +
881 		       strlen(xattr->name) +
882 		       xattr->value_len;
883 	desc->name_length = strlen(xattr->name);
884 	desc->value_length = xattr->value_len;
885 
886 	memcpy(desc->name, xattr->name, desc->name_length);
887 	memcpy((void *)((uintptr_t)desc->name + desc->name_length),
888 	       xattr->value,
889 	       desc->value_length);
890 
891 	return 0;
892 }
893 
894 static void
895 blob_serialize_extent_table_entry(const struct spdk_blob *blob,
896 				  uint64_t start_ep, uint64_t *next_ep,
897 				  uint8_t **buf, size_t *remaining_sz)
898 {
899 	struct spdk_blob_md_descriptor_extent_table *desc;
900 	size_t cur_sz;
901 	uint64_t i, et_idx;
902 	uint32_t extent_page, ep_len;
903 
904 	/* The buffer must have room for at least the num_clusters entry */
905 	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters);
906 	if (*remaining_sz < cur_sz) {
907 		*next_ep = start_ep;
908 		return;
909 	}
910 
911 	desc = (struct spdk_blob_md_descriptor_extent_table *)*buf;
912 	desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE;
913 
914 	desc->num_clusters = blob->active.num_clusters;
915 
916 	ep_len = 1;
917 	et_idx = 0;
918 	for (i = start_ep; i < blob->active.num_extent_pages; i++) {
919 		if (*remaining_sz < cur_sz  + sizeof(desc->extent_page[0])) {
920 			/* If we ran out of buffer space, return */
921 			break;
922 		}
923 
924 		extent_page = blob->active.extent_pages[i];
925 		/* If this extent page and the next one are both unallocated, run-length encode them together */
926 		if (extent_page == 0 &&
927 		    (i + 1 < blob->active.num_extent_pages && blob->active.extent_pages[i + 1] == 0)) {
928 			ep_len++;
929 			continue;
930 		}
931 		desc->extent_page[et_idx].page_idx = extent_page;
932 		desc->extent_page[et_idx].num_pages = ep_len;
933 		et_idx++;
934 
935 		ep_len = 1;
936 		cur_sz += sizeof(desc->extent_page[et_idx]);
937 	}
938 	*next_ep = i;
939 
940 	desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx;
941 	*remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length;
942 	*buf += sizeof(struct spdk_blob_md_descriptor) + desc->length;
943 }
944 
945 static int
946 blob_serialize_extent_table(const struct spdk_blob *blob,
947 			    struct spdk_blob_md_page **pages,
948 			    struct spdk_blob_md_page *cur_page,
949 			    uint32_t *page_count, uint8_t **buf,
950 			    size_t *remaining_sz)
951 {
952 	uint64_t				last_extent_page;
953 	int					rc;
954 
955 	last_extent_page = 0;
956 	/* At least a single extent table entry always has to be persisted.
957 	 * Such a case occurs when num_extent_pages == 0. */
958 	while (last_extent_page <= blob->active.num_extent_pages) {
959 		blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf,
960 						  remaining_sz);
961 
962 		if (last_extent_page == blob->active.num_extent_pages) {
963 			break;
964 		}
965 
966 		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
967 		if (rc < 0) {
968 			return rc;
969 		}
970 
971 		*buf = (uint8_t *)cur_page->descriptors;
972 		*remaining_sz = sizeof(cur_page->descriptors);
973 	}
974 
975 	return 0;
976 }
977 
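/* Serialize a run-length encoded extent descriptor starting at start_cluster.
 * Consecutive allocated clusters with contiguous LBAs, as well as runs of
 * unallocated (zero) clusters, are collapsed into single extents until either
 * all clusters are consumed or the descriptor buffer runs out of space;
 * *next_cluster reports where the next descriptor should resume.
 */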
978 static void
979 blob_serialize_extent_rle(const struct spdk_blob *blob,
980 			  uint64_t start_cluster, uint64_t *next_cluster,
981 			  uint8_t **buf, size_t *buf_sz)
982 {
983 	struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
984 	size_t cur_sz;
985 	uint64_t i, extent_idx;
986 	uint64_t lba, lba_per_cluster, lba_count;
987 
988 	/* The buffer must have room for at least one extent */
989 	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]);
990 	if (*buf_sz < cur_sz) {
991 		*next_cluster = start_cluster;
992 		return;
993 	}
994 
995 	desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf;
996 	desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE;
997 
998 	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
999 
1000 	lba = blob->active.clusters[start_cluster];
1001 	lba_count = lba_per_cluster;
1002 	extent_idx = 0;
1003 	for (i = start_cluster + 1; i < blob->active.num_clusters; i++) {
1004 		if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) {
1005 			/* Run-length encode sequential non-zero LBA */
1006 			lba_count += lba_per_cluster;
1007 			continue;
1008 		} else if (lba == 0 && blob->active.clusters[i] == 0) {
1009 			/* Run-length encode unallocated clusters */
1010 			lba_count += lba_per_cluster;
1011 			continue;
1012 		}
1013 		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
1014 		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
1015 		extent_idx++;
1016 
1017 		cur_sz += sizeof(desc_extent_rle->extents[extent_idx]);
1018 
1019 		if (*buf_sz < cur_sz) {
1020 			/* If we ran out of buffer space, return */
1021 			*next_cluster = i;
1022 			break;
1023 		}
1024 
1025 		lba = blob->active.clusters[i];
1026 		lba_count = lba_per_cluster;
1027 	}
1028 
1029 	if (*buf_sz >= cur_sz) {
1030 		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
1031 		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
1032 		extent_idx++;
1033 
1034 		*next_cluster = blob->active.num_clusters;
1035 	}
1036 
1037 	desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx;
1038 	*buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
1039 	*buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
1040 }
1041 
1042 static int
1043 blob_serialize_extents_rle(const struct spdk_blob *blob,
1044 			   struct spdk_blob_md_page **pages,
1045 			   struct spdk_blob_md_page *cur_page,
1046 			   uint32_t *page_count, uint8_t **buf,
1047 			   size_t *remaining_sz)
1048 {
1049 	uint64_t				last_cluster;
1050 	int					rc;
1051 
1052 	last_cluster = 0;
1053 	while (last_cluster < blob->active.num_clusters) {
1054 		blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz);
1055 
1056 		if (last_cluster == blob->active.num_clusters) {
1057 			break;
1058 		}
1059 
1060 		rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
1061 		if (rc < 0) {
1062 			return rc;
1063 		}
1064 
1065 		*buf = (uint8_t *)cur_page->descriptors;
1066 		*remaining_sz = sizeof(cur_page->descriptors);
1067 	}
1068 
1069 	return 0;
1070 }
1071 
1072 static void
1073 blob_serialize_extent_page(const struct spdk_blob *blob,
1074 			   uint64_t cluster, struct spdk_blob_md_page *page)
1075 {
1076 	struct spdk_blob_md_descriptor_extent_page *desc_extent;
1077 	uint64_t i, extent_idx;
1078 	uint64_t lba, lba_per_cluster;
1079 	uint64_t start_cluster_idx = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;
1080 
1081 	desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors;
1082 	desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE;
1083 
1084 	lba_per_cluster = bs_cluster_to_lba(blob->bs, 1);
1085 
1086 	desc_extent->start_cluster_idx = start_cluster_idx;
1087 	extent_idx = 0;
1088 	for (i = start_cluster_idx; i < blob->active.num_clusters; i++) {
1089 		lba = blob->active.clusters[i];
1090 		desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster;
1091 		if (extent_idx >= SPDK_EXTENTS_PER_EP) {
1092 			break;
1093 		}
1094 	}
1095 	desc_extent->length = sizeof(desc_extent->start_cluster_idx) +
1096 			      sizeof(desc_extent->cluster_idx[0]) * extent_idx;
1097 }
1098 
1099 static void
1100 blob_serialize_flags(const struct spdk_blob *blob,
1101 		     uint8_t *buf, size_t *buf_sz)
1102 {
1103 	struct spdk_blob_md_descriptor_flags *desc;
1104 
1105 	/*
1106 	 * Flags get serialized first, so we should always have room for the flags
1107 	 *  descriptor.
1108 	 */
1109 	assert(*buf_sz >= sizeof(*desc));
1110 
1111 	desc = (struct spdk_blob_md_descriptor_flags *)buf;
1112 	desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS;
1113 	desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor);
1114 	desc->invalid_flags = blob->invalid_flags;
1115 	desc->data_ro_flags = blob->data_ro_flags;
1116 	desc->md_ro_flags = blob->md_ro_flags;
1117 
1118 	*buf_sz -= sizeof(*desc);
1119 }
1120 
1121 static int
1122 blob_serialize_xattrs(const struct spdk_blob *blob,
1123 		      const struct spdk_xattr_tailq *xattrs, bool internal,
1124 		      struct spdk_blob_md_page **pages,
1125 		      struct spdk_blob_md_page *cur_page,
1126 		      uint32_t *page_count, uint8_t **buf,
1127 		      size_t *remaining_sz)
1128 {
1129 	const struct spdk_xattr	*xattr;
1130 	int	rc;
1131 
1132 	TAILQ_FOREACH(xattr, xattrs, link) {
1133 		size_t required_sz = 0;
1134 
1135 		rc = blob_serialize_xattr(xattr,
1136 					  *buf, *remaining_sz,
1137 					  &required_sz, internal);
1138 		if (rc < 0) {
1139 			/* Need to add a new page to the chain */
1140 			rc = blob_serialize_add_page(blob, pages, page_count,
1141 						     &cur_page);
1142 			if (rc < 0) {
1143 				spdk_free(*pages);
1144 				*pages = NULL;
1145 				*page_count = 0;
1146 				return rc;
1147 			}
1148 
1149 			*buf = (uint8_t *)cur_page->descriptors;
1150 			*remaining_sz = sizeof(cur_page->descriptors);
1151 
1152 			/* Try again */
1153 			required_sz = 0;
1154 			rc = blob_serialize_xattr(xattr,
1155 						  *buf, *remaining_sz,
1156 						  &required_sz, internal);
1157 
1158 			if (rc < 0) {
1159 				spdk_free(*pages);
1160 				*pages = NULL;
1161 				*page_count = 0;
1162 				return rc;
1163 			}
1164 		}
1165 
1166 		*remaining_sz -= required_sz;
1167 		*buf += required_sz;
1168 	}
1169 
1170 	return 0;
1171 }
1172 
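/* Serialize the blob metadata into a chain of pages: flags first, then user
 * and internal xattrs, and finally either the extent table or the RLE extent
 * descriptors, depending on whether the extent table feature is in use.
 */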
1173 static int
1174 blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages,
1175 	       uint32_t *page_count)
1176 {
1177 	struct spdk_blob_md_page		*cur_page;
1178 	int					rc;
1179 	uint8_t					*buf;
1180 	size_t					remaining_sz;
1181 
1182 	assert(pages != NULL);
1183 	assert(page_count != NULL);
1184 	assert(blob != NULL);
1185 	assert(blob->state == SPDK_BLOB_STATE_DIRTY);
1186 
1187 	*pages = NULL;
1188 	*page_count = 0;
1189 
1190 	/* A blob always has at least 1 page, even if it has no descriptors */
1191 	rc = blob_serialize_add_page(blob, pages, page_count, &cur_page);
1192 	if (rc < 0) {
1193 		return rc;
1194 	}
1195 
1196 	buf = (uint8_t *)cur_page->descriptors;
1197 	remaining_sz = sizeof(cur_page->descriptors);
1198 
1199 	/* Serialize flags */
1200 	blob_serialize_flags(blob, buf, &remaining_sz);
1201 	buf += sizeof(struct spdk_blob_md_descriptor_flags);
1202 
1203 	/* Serialize xattrs */
1204 	rc = blob_serialize_xattrs(blob, &blob->xattrs, false,
1205 				   pages, cur_page, page_count, &buf, &remaining_sz);
1206 	if (rc < 0) {
1207 		return rc;
1208 	}
1209 
1210 	/* Serialize internal xattrs */
1211 	rc = blob_serialize_xattrs(blob, &blob->xattrs_internal, true,
1212 				   pages, cur_page, page_count, &buf, &remaining_sz);
1213 	if (rc < 0) {
1214 		return rc;
1215 	}
1216 
1217 	if (blob->use_extent_table) {
1218 		/* Serialize extent table */
1219 		rc = blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz);
1220 	} else {
1221 		/* Serialize extents */
1222 		rc = blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz);
1223 	}
1224 
1225 	return rc;
1226 }
1227 
1228 struct spdk_blob_load_ctx {
1229 	struct spdk_blob		*blob;
1230 
1231 	struct spdk_blob_md_page	*pages;
1232 	uint32_t			num_pages;
1233 	uint32_t			next_extent_page;
1234 	spdk_bs_sequence_t	        *seq;
1235 
1236 	spdk_bs_sequence_cpl		cb_fn;
1237 	void				*cb_arg;
1238 };
1239 
1240 static uint32_t
1241 blob_md_page_calc_crc(void *page)
1242 {
1243 	uint32_t		crc;
1244 
1245 	crc = BLOB_CRC32C_INITIAL;
1246 	crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc);
1247 	crc ^= BLOB_CRC32C_INITIAL;
1248 
1249 	return crc;
1250 
1251 }
1252 
1253 static void
1254 blob_load_final(struct spdk_blob_load_ctx *ctx, int bserrno)
1255 {
1256 	struct spdk_blob		*blob = ctx->blob;
1257 
1258 	if (bserrno == 0) {
1259 		blob_mark_clean(blob);
1260 	}
1261 
1262 	ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno);
1263 
1264 	/* Free the memory */
1265 	spdk_free(ctx->pages);
1266 	free(ctx);
1267 }
1268 
1269 static void
1270 blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
1271 {
1272 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1273 	struct spdk_blob		*blob = ctx->blob;
1274 
1275 	if (bserrno == 0) {
1276 		blob->back_bs_dev = bs_create_blob_bs_dev(snapshot);
1277 		if (blob->back_bs_dev == NULL) {
1278 			bserrno = -ENOMEM;
1279 		}
1280 	}
1281 	if (bserrno != 0) {
1282 		SPDK_ERRLOG("Snapshot open failed\n");
1283 	}
1284 
1285 	blob_load_final(ctx, bserrno);
1286 }
1287 
1288 static void blob_update_clear_method(struct spdk_blob *blob);
1289 
1290 static void
1291 blob_load_backing_dev(void *cb_arg)
1292 {
1293 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1294 	struct spdk_blob		*blob = ctx->blob;
1295 	const void			*value;
1296 	size_t				len;
1297 	int				rc;
1298 
1299 	if (spdk_blob_is_thin_provisioned(blob)) {
1300 		rc = blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true);
1301 		if (rc == 0) {
1302 			if (len != sizeof(spdk_blob_id)) {
1303 				blob_load_final(ctx, -EINVAL);
1304 				return;
1305 			}
1306 			/* open snapshot blob and continue in the callback function */
1307 			blob->parent_id = *(spdk_blob_id *)value;
1308 			spdk_bs_open_blob(blob->bs, blob->parent_id,
1309 					  blob_load_snapshot_cpl, ctx);
1310 			return;
1311 		} else {
1312 			/* add zeroes_dev for thin provisioned blob */
1313 			blob->back_bs_dev = bs_create_zeroes_dev();
1314 		}
1315 	} else {
1316 		/* standard blob */
1317 		blob->back_bs_dev = NULL;
1318 	}
1319 	blob_load_final(ctx, 0);
1320 }
1321 
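/* Completion handler for reading extent pages one at a time. Each allocated
 * extent page is CRC-checked and parsed; unallocated entries (valid only for
 * thin provisioned blobs) simply grow the cluster array with zeroes using
 * remaining_clusters_in_et. Once all extent pages are processed, the backing
 * device is set up.
 */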
1322 static void
1323 blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1324 {
1325 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1326 	struct spdk_blob		*blob = ctx->blob;
1327 	struct spdk_blob_md_page	*page;
1328 	uint64_t			i;
1329 	uint32_t			crc;
1330 	uint64_t			lba;
1331 	void				*tmp;
1332 	uint64_t			sz;
1333 
1334 	if (bserrno) {
1335 		SPDK_ERRLOG("Extent page read failed: %d\n", bserrno);
1336 		blob_load_final(ctx, bserrno);
1337 		return;
1338 	}
1339 
1340 	if (ctx->pages == NULL) {
1341 		/* First iteration of this function, allocate buffer for single EXTENT_PAGE */
1342 		ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
1343 					  NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
1344 		if (!ctx->pages) {
1345 			blob_load_final(ctx, -ENOMEM);
1346 			return;
1347 		}
1348 		ctx->num_pages = 1;
1349 		ctx->next_extent_page = 0;
1350 	} else {
1351 		page = &ctx->pages[0];
1352 		crc = blob_md_page_calc_crc(page);
1353 		if (crc != page->crc) {
1354 			blob_load_final(ctx, -EINVAL);
1355 			return;
1356 		}
1357 
1358 		if (page->next != SPDK_INVALID_MD_PAGE) {
1359 			blob_load_final(ctx, -EINVAL);
1360 			return;
1361 		}
1362 
1363 		bserrno = blob_parse_extent_page(page, blob);
1364 		if (bserrno) {
1365 			blob_load_final(ctx, bserrno);
1366 			return;
1367 		}
1368 	}
1369 
1370 	for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
1371 		if (blob->active.extent_pages[i] != 0) {
1372 			/* Extent page was allocated, read and parse it. */
1373 			lba = bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]);
1374 			ctx->next_extent_page = i + 1;
1375 
1376 			bs_sequence_read_dev(seq, &ctx->pages[0], lba,
1377 					     bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
1378 					     blob_load_cpl_extents_cpl, ctx);
1379 			return;
1380 		} else {
1381 			/* Thin provisioned blobs can point to unallocated extent pages.
1382 			 * In this case, the blob size should be increased by up to the amount left in remaining_clusters_in_et. */
1383 
1384 			sz = spdk_min(blob->remaining_clusters_in_et, SPDK_EXTENTS_PER_EP);
1385 			blob->active.num_clusters += sz;
1386 			blob->remaining_clusters_in_et -= sz;
1387 
1388 			assert(spdk_blob_is_thin_provisioned(blob));
1389 			assert(i + 1 < blob->active.num_extent_pages || blob->remaining_clusters_in_et == 0);
1390 
1391 			tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
1392 			if (tmp == NULL) {
1393 				blob_load_final(ctx, -ENOMEM);
1394 				return;
1395 			}
1396 			memset(tmp + sizeof(*blob->active.clusters) * blob->active.cluster_array_size, 0,
1397 			       sizeof(*blob->active.clusters) * (blob->active.num_clusters - blob->active.cluster_array_size));
1398 			blob->active.clusters = tmp;
1399 			blob->active.cluster_array_size = blob->active.num_clusters;
1400 		}
1401 	}
1402 
1403 	blob_load_backing_dev(ctx);
1404 }
1405 
1406 static void
1407 blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1408 {
1409 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1410 	struct spdk_blob		*blob = ctx->blob;
1411 	struct spdk_blob_md_page	*page;
1412 	int				rc;
1413 	uint32_t			crc;
1414 	uint32_t			current_page;
1415 
1416 	if (ctx->num_pages == 1) {
1417 		current_page = bs_blobid_to_page(blob->id);
1418 	} else {
1419 		assert(ctx->num_pages != 0);
1420 		page = &ctx->pages[ctx->num_pages - 2];
1421 		current_page = page->next;
1422 	}
1423 
1424 	if (bserrno) {
1425 		SPDK_ERRLOG("Metadata page %d read failed for blobid %" PRIu64 ": %d\n",
1426 			    current_page, blob->id, bserrno);
1427 		blob_load_final(ctx, bserrno);
1428 		return;
1429 	}
1430 
1431 	page = &ctx->pages[ctx->num_pages - 1];
1432 	crc = blob_md_page_calc_crc(page);
1433 	if (crc != page->crc) {
1434 		SPDK_ERRLOG("Metadata page %d crc mismatch for blobid %" PRIu64 "\n",
1435 			    current_page, blob->id);
1436 		blob_load_final(ctx, -EINVAL);
1437 		return;
1438 	}
1439 
1440 	if (page->next != SPDK_INVALID_MD_PAGE) {
1441 		uint32_t next_page = page->next;
1442 		uint64_t next_lba = bs_md_page_to_lba(blob->bs, next_page);
1443 
1444 		/* Read the next page */
1445 		ctx->num_pages++;
1446 		ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages), 0);
1447 		if (ctx->pages == NULL) {
1448 			blob_load_final(ctx, -ENOMEM);
1449 			return;
1450 		}
1451 
1452 		bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1],
1453 				     next_lba,
1454 				     bs_byte_to_lba(blob->bs, sizeof(*page)),
1455 				     blob_load_cpl, ctx);
1456 		return;
1457 	}
1458 
1459 	/* Parse the pages */
1460 	rc = blob_parse(ctx->pages, ctx->num_pages, blob);
1461 	if (rc) {
1462 		blob_load_final(ctx, rc);
1463 		return;
1464 	}
1465 
1466 	if (blob->extent_table_found == true) {
1467 		/* If EXTENT_TABLE was found, that means support for it should be enabled. */
1468 		assert(blob->extent_rle_found == false);
1469 		blob->use_extent_table = true;
1470 	} else {
1471 		/* If EXTENT_RLE was found, or no extent_* descriptor at all, disable support
1472 		 * for the extent table. No extent_* descriptors means that the blob has a length of 0
1473 		 * and no extent_rle descriptors were persisted for it.
1474 		 * EXTENT_TABLE, if used, is always present in metadata regardless of length. */
1475 		blob->use_extent_table = false;
1476 	}
1477 
1478 	/* Check the clear_method stored in metadata vs what may have been passed
1479 	 * via spdk_bs_open_blob_ext() and update accordingly.
1480 	 */
1481 	blob_update_clear_method(blob);
1482 
1483 	spdk_free(ctx->pages);
1484 	ctx->pages = NULL;
1485 
1486 	if (blob->extent_table_found) {
1487 		blob_load_cpl_extents_cpl(seq, ctx, 0);
1488 	} else {
1489 		blob_load_backing_dev(ctx);
1490 	}
1491 }
1492 
1493 /* Load a blob from disk given a blobid */
1494 static void
1495 blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
1496 	  spdk_bs_sequence_cpl cb_fn, void *cb_arg)
1497 {
1498 	struct spdk_blob_load_ctx *ctx;
1499 	struct spdk_blob_store *bs;
1500 	uint32_t page_num;
1501 	uint64_t lba;
1502 
1503 	blob_verify_md_op(blob);
1504 
1505 	bs = blob->bs;
1506 
1507 	ctx = calloc(1, sizeof(*ctx));
1508 	if (!ctx) {
1509 		cb_fn(seq, cb_arg, -ENOMEM);
1510 		return;
1511 	}
1512 
1513 	ctx->blob = blob;
1514 	ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, 0);
1515 	if (!ctx->pages) {
1516 		free(ctx);
1517 		cb_fn(seq, cb_arg, -ENOMEM);
1518 		return;
1519 	}
1520 	ctx->num_pages = 1;
1521 	ctx->cb_fn = cb_fn;
1522 	ctx->cb_arg = cb_arg;
1523 	ctx->seq = seq;
1524 
1525 	page_num = bs_blobid_to_page(blob->id);
1526 	lba = bs_md_page_to_lba(blob->bs, page_num);
1527 
1528 	blob->state = SPDK_BLOB_STATE_LOADING;
1529 
1530 	bs_sequence_read_dev(seq, &ctx->pages[0], lba,
1531 			     bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE),
1532 			     blob_load_cpl, ctx);
1533 }
1534 
1535 struct spdk_blob_persist_ctx {
1536 	struct spdk_blob		*blob;
1537 
1538 	struct spdk_bs_super_block	*super;
1539 
1540 	struct spdk_blob_md_page	*pages;
1541 	uint32_t			next_extent_page;
1542 	struct spdk_blob_md_page	*extent_page;
1543 
1544 	spdk_bs_sequence_t		*seq;
1545 	spdk_bs_sequence_cpl		cb_fn;
1546 	void				*cb_arg;
1547 	TAILQ_ENTRY(spdk_blob_persist_ctx) link;
1548 };
1549 
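/* Queue the appropriate "clear" operation for a range of LBAs based on the
 * blob's clear_method: unmap for DEFAULT/UNMAP, write-zeroes for WRITE_ZEROES,
 * and nothing at all for NONE.
 */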
1550 static void
1551 bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba,
1552 		   uint32_t lba_count)
1553 {
1554 	switch (ctx->blob->clear_method) {
1555 	case BLOB_CLEAR_WITH_DEFAULT:
1556 	case BLOB_CLEAR_WITH_UNMAP:
1557 		bs_batch_unmap_dev(batch, lba, lba_count);
1558 		break;
1559 	case BLOB_CLEAR_WITH_WRITE_ZEROES:
1560 		bs_batch_write_zeroes_dev(batch, lba, lba_count);
1561 		break;
1562 	case BLOB_CLEAR_WITH_NONE:
1563 	default:
1564 		break;
1565 	}
1566 }
1567 
1568 static void blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx);
1569 
1570 static void
1571 blob_persist_complete(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx, int bserrno)
1572 {
1573 	struct spdk_blob_persist_ctx	*next_persist;
1574 	struct spdk_blob		*blob = ctx->blob;
1575 
1576 	if (bserrno == 0) {
1577 		blob_mark_clean(blob);
1578 	}
1579 
1580 	assert(ctx == TAILQ_FIRST(&blob->pending_persists));
1581 	TAILQ_REMOVE(&blob->pending_persists, ctx, link);
1582 
1583 	next_persist = TAILQ_FIRST(&blob->pending_persists);
1584 
1585 	/* Call user callback */
1586 	ctx->cb_fn(seq, ctx->cb_arg, bserrno);
1587 
1588 	/* Free the memory */
1589 	spdk_free(ctx->pages);
1590 	free(ctx);
1591 
1592 	if (next_persist != NULL) {
1593 		blob_persist_check_dirty(next_persist);
1594 	}
1595 }
1596 
1597 static void
1598 blob_persist_clear_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1599 {
1600 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1601 	struct spdk_blob		*blob = ctx->blob;
1602 	struct spdk_blob_store		*bs = blob->bs;
1603 	size_t				i;
1604 
1605 	if (bserrno != 0) {
1606 		blob_persist_complete(seq, ctx, bserrno);
1607 		return;
1608 	}
1609 
1610 	/* Release all extent_pages that were truncated */
1611 	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
1612 		/* Nothing to release if it was not allocated */
1613 		if (blob->active.extent_pages[i] != 0) {
1614 			bs_release_md_page(bs, blob->active.extent_pages[i]);
1615 		}
1616 	}
1617 
1618 	if (blob->active.num_extent_pages == 0) {
1619 		free(blob->active.extent_pages);
1620 		blob->active.extent_pages = NULL;
1621 		blob->active.extent_pages_array_size = 0;
1622 	} else if (blob->active.num_extent_pages != blob->active.extent_pages_array_size) {
1623 #ifndef __clang_analyzer__
1624 		void *tmp;
1625 
1626 		/* scan-build really can't figure reallocs, workaround it */
1627 		tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages);
1628 		assert(tmp != NULL);
1629 		blob->active.extent_pages = tmp;
1630 #endif
1631 		blob->active.extent_pages_array_size = blob->active.num_extent_pages;
1632 	}
1633 
1634 	blob_persist_complete(seq, ctx, bserrno);
1635 }
1636 
1637 static void
1638 blob_persist_clear_extents(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
1639 {
1640 	struct spdk_blob		*blob = ctx->blob;
1641 	struct spdk_blob_store		*bs = blob->bs;
1642 	size_t				i;
1643 	uint64_t                        lba;
1644 	uint32_t                        lba_count;
1645 	spdk_bs_batch_t                 *batch;
1646 
1647 	batch = bs_sequence_to_batch(seq, blob_persist_clear_extents_cpl, ctx);
1648 	lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);
1649 
1650 	/* Clear all extent_pages that were truncated */
1651 	for (i = blob->active.num_extent_pages; i < blob->active.extent_pages_array_size; i++) {
1652 		/* Nothing to clear if it was not allocated */
1653 		if (blob->active.extent_pages[i] != 0) {
1654 			lba = bs_md_page_to_lba(bs, blob->active.extent_pages[i]);
1655 			bs_batch_write_zeroes_dev(batch, lba, lba_count);
1656 		}
1657 	}
1658 
1659 	bs_batch_close(batch);
1660 }
1661 
1662 static void
1663 blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1664 {
1665 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1666 	struct spdk_blob		*blob = ctx->blob;
1667 	struct spdk_blob_store		*bs = blob->bs;
1668 	size_t				i;
1669 
1670 	if (bserrno != 0) {
1671 		blob_persist_complete(seq, ctx, bserrno);
1672 		return;
1673 	}
1674 
1675 	pthread_mutex_lock(&bs->used_clusters_mutex);
1676 	/* Release all clusters that were truncated */
1677 	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
1678 		uint32_t cluster_num = bs_lba_to_cluster(bs, blob->active.clusters[i]);
1679 
1680 		/* Nothing to release if it was not allocated */
1681 		if (blob->active.clusters[i] != 0) {
1682 			bs_release_cluster(bs, cluster_num);
1683 		}
1684 	}
1685 	pthread_mutex_unlock(&bs->used_clusters_mutex);
1686 
1687 	if (blob->active.num_clusters == 0) {
1688 		free(blob->active.clusters);
1689 		blob->active.clusters = NULL;
1690 		blob->active.cluster_array_size = 0;
1691 	} else if (blob->active.num_clusters != blob->active.cluster_array_size) {
1692 #ifndef __clang_analyzer__
1693 		void *tmp;
1694 
1695 		/* scan-build really can't figure reallocs, workaround it */
1696 		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters);
1697 		assert(tmp != NULL);
1698 		blob->active.clusters = tmp;
1699 
1700 #endif
1701 		blob->active.cluster_array_size = blob->active.num_clusters;
1702 	}
1703 
1704 	/* Move on to clearing extent pages */
1705 	blob_persist_clear_extents(seq, ctx);
1706 }
1707 
1708 static void
1709 blob_persist_clear_clusters(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
1710 {
1711 	struct spdk_blob		*blob = ctx->blob;
1712 	struct spdk_blob_store		*bs = blob->bs;
1713 	spdk_bs_batch_t			*batch;
1714 	size_t				i;
1715 	uint64_t			lba;
1716 	uint32_t			lba_count;
1717 
1718 	/* Clusters don't move around in blobs. The list shrinks or grows
1719 	 * at the end, but no changes ever occur in the middle of the list.
1720 	 */
1721 
1722 	batch = bs_sequence_to_batch(seq, blob_persist_clear_clusters_cpl, ctx);
1723 
1724 	/* Clear all clusters that were truncated */
1725 	lba = 0;
1726 	lba_count = 0;
1727 	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
1728 		uint64_t next_lba = blob->active.clusters[i];
1729 		uint32_t next_lba_count = bs_cluster_to_lba(bs, 1);
1730 
1731 		if (next_lba > 0 && (lba + lba_count) == next_lba) {
1732 			/* This cluster is contiguous with the previous one. */
1733 			lba_count += next_lba_count;
1734 			continue;
1735 		} else if (next_lba == 0) {
1736 			continue;
1737 		}
1738 
1739 		/* This cluster is not contiguous with the previous one. */
1740 
1741 		/* If a run of LBAs was previously accumulated, clear it now */
1742 		if (lba_count > 0) {
1743 			bs_batch_clear_dev(ctx, batch, lba, lba_count);
1744 		}
1745 
1746 		/* Start building the next batch */
1747 		lba = next_lba;
1748 		if (next_lba > 0) {
1749 			lba_count = next_lba_count;
1750 		} else {
1751 			lba_count = 0;
1752 		}
1753 	}
1754 
1755 	/* If we ended with a contiguous set of LBAs, clear them now */
1756 	if (lba_count > 0) {
1757 		bs_batch_clear_dev(ctx, batch, lba, lba_count);
1758 	}
1759 
1760 	bs_batch_close(batch);
1761 }
1762 
1763 static void
1764 blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1765 {
1766 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1767 	struct spdk_blob		*blob = ctx->blob;
1768 	struct spdk_blob_store		*bs = blob->bs;
1769 	size_t				i;
1770 
1771 	if (bserrno != 0) {
1772 		blob_persist_complete(seq, ctx, bserrno);
1773 		return;
1774 	}
1775 
1776 	/* This loop starts at 1 because the first page is special and handled
1777 	 * below. The pages (except the first) are never written in place, so
1778 	 * every page in the clean list can now be released back to the md page pool.
1779 	 */
1780 	for (i = 1; i < blob->clean.num_pages; i++) {
1781 		bs_release_md_page(bs, blob->clean.pages[i]);
1782 	}
1783 
1784 	if (blob->active.num_pages == 0) {
1785 		uint32_t page_num;
1786 
1787 		page_num = bs_blobid_to_page(blob->id);
1788 		bs_release_md_page(bs, page_num);
1789 	}
1790 
1791 	/* Move on to clearing clusters */
1792 	blob_persist_clear_clusters(seq, ctx);
1793 }
1794 
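/* Zero out on the device every metadata page used by the previous (clean)
 * version of the blob except the first one, plus the first page itself when
 * the blob is being deleted (active.num_pages == 0).
 */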
1795 static void
1796 blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1797 {
1798 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1799 	struct spdk_blob		*blob = ctx->blob;
1800 	struct spdk_blob_store		*bs = blob->bs;
1801 	uint64_t			lba;
1802 	uint32_t			lba_count;
1803 	spdk_bs_batch_t			*batch;
1804 	size_t				i;
1805 
1806 	if (bserrno != 0) {
1807 		blob_persist_complete(seq, ctx, bserrno);
1808 		return;
1809 	}
1810 
1811 	batch = bs_sequence_to_batch(seq, blob_persist_zero_pages_cpl, ctx);
1812 
1813 	lba_count = bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);
1814 
1815 	/* This loop starts at 1 because the first page is special and handled
1816 	 * below. The pages (except the first) are never written in place,
1817 	 * so any pages in the clean list must be zeroed.
1818 	 */
1819 	for (i = 1; i < blob->clean.num_pages; i++) {
1820 		lba = bs_md_page_to_lba(bs, blob->clean.pages[i]);
1821 
1822 		bs_batch_write_zeroes_dev(batch, lba, lba_count);
1823 	}
1824 
1825 	/* The first page will only be zeroed if this is a delete. */
1826 	if (blob->active.num_pages == 0) {
1827 		uint32_t page_num;
1828 
1829 		/* The first page in the metadata goes where the blobid indicates */
1830 		page_num = bs_blobid_to_page(blob->id);
1831 		lba = bs_md_page_to_lba(bs, page_num);
1832 
1833 		bs_batch_write_zeroes_dev(batch, lba, lba_count);
1834 	}
1835 
1836 	bs_batch_close(batch);
1837 }
1838 
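/* Write the first (root) metadata page to the fixed location derived from the
 * blobid. This runs only after all of the other metadata pages have been written.
 * If there are no active pages (delete), skip straight to zeroing the old pages.
 */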
1839 static void
1840 blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1841 {
1842 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1843 	struct spdk_blob		*blob = ctx->blob;
1844 	struct spdk_blob_store		*bs = blob->bs;
1845 	uint64_t			lba;
1846 	uint32_t			lba_count;
1847 	struct spdk_blob_md_page	*page;
1848 
1849 	if (bserrno != 0) {
1850 		blob_persist_complete(seq, ctx, bserrno);
1851 		return;
1852 	}
1853 
1854 	if (blob->active.num_pages == 0) {
1855 		/* Move on to the next step */
1856 		blob_persist_zero_pages(seq, ctx, 0);
1857 		return;
1858 	}
1859 
1860 	lba_count = bs_byte_to_lba(bs, sizeof(*page));
1861 
1862 	page = &ctx->pages[0];
1863 	/* The first page in the metadata goes where the blobid indicates */
1864 	lba = bs_md_page_to_lba(bs, bs_blobid_to_page(blob->id));
1865 
1866 	bs_sequence_write_dev(seq, page, lba, lba_count,
1867 			      blob_persist_zero_pages, ctx);
1868 }
1869 
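/* Write all of the newly generated metadata pages except the root in a single
 * batch. The root page is written afterwards by blob_persist_write_page_root().
 */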
1870 static void
1871 blob_persist_write_page_chain(spdk_bs_sequence_t *seq, struct spdk_blob_persist_ctx *ctx)
1872 {
1873 	struct spdk_blob		*blob = ctx->blob;
1874 	struct spdk_blob_store		*bs = blob->bs;
1875 	uint64_t			lba;
1876 	uint32_t			lba_count;
1877 	struct spdk_blob_md_page	*page;
1878 	spdk_bs_batch_t			*batch;
1879 	size_t				i;
1880 
1881 	/* Clusters don't move around in blobs. The list shrinks or grows
1882 	 * at the end, but no changes ever occur in the middle of the list.
1883 	 */
1884 
1885 	lba_count = bs_byte_to_lba(bs, sizeof(*page));
1886 
1887 	batch = bs_sequence_to_batch(seq, blob_persist_write_page_root, ctx);
1888 
1889 	/* This loop starts at 1. The root page is not written until
1890 	 * all of the other pages have been written.
1891 	 */
1892 	for (i = 1; i < blob->active.num_pages; i++) {
1893 		page = &ctx->pages[i];
1894 		assert(page->sequence_num == i);
1895 
1896 		lba = bs_md_page_to_lba(bs, blob->active.pages[i]);
1897 
1898 		bs_batch_write_dev(batch, page, lba, lba_count);
1899 	}
1900 
1901 	bs_batch_close(batch);
1902 }
1903 
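/* Resize the blob's in-memory cluster bookkeeping (and, when the extent table is
 * in use, its extent page bookkeeping) to sz clusters. For blobs that are not thin
 * provisioned, the new clusters and any required extent pages are claimed here;
 * the change only becomes durable on the next persist.
 */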
1904 static int
1905 blob_resize(struct spdk_blob *blob, uint64_t sz)
1906 {
1907 	uint64_t	i;
1908 	uint64_t	*tmp;
1909 	uint64_t	cluster;
1910 	uint32_t	lfmd; /*  lowest free md page */
1911 	uint64_t	num_clusters;
1912 	uint32_t	*ep_tmp;
1913 	uint64_t	new_num_ep = 0, current_num_ep = 0;
1914 	struct spdk_blob_store *bs;
1915 
1916 	bs = blob->bs;
1917 
1918 	blob_verify_md_op(blob);
1919 
1920 	if (blob->active.num_clusters == sz) {
1921 		return 0;
1922 	}
1923 
1924 	if (blob->active.num_clusters < blob->active.cluster_array_size) {
1925 		/* If this blob was resized to be larger, then smaller, then
1926 		 * larger without syncing, then the cluster array already
1927 		 * contains spare assigned clusters we can use.
1928 		 */
1929 		num_clusters = spdk_min(blob->active.cluster_array_size,
1930 					sz);
1931 	} else {
1932 		num_clusters = blob->active.num_clusters;
1933 	}
1934 
1935 	if (blob->use_extent_table) {
1936 		/* Round up since every cluster beyond the current Extent Table size
1937 		 * requires a new extent page. */
1938 		new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP);
1939 		current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP);
1940 	}
1941 
1942 	/* Check first that we have enough clusters and md pages before we start claiming them. */
1943 	if (sz > num_clusters && spdk_blob_is_thin_provisioned(blob) == false) {
1944 		if ((sz - num_clusters) > bs->num_free_clusters) {
1945 			return -ENOSPC;
1946 		}
1947 		lfmd = 0;
1948 		for (i = current_num_ep; i < new_num_ep ; i++) {
1949 			lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd);
1950 			if (lfmd == UINT32_MAX) {
1951 				/* No more free md pages. Cannot satisfy the request */
1952 				return -ENOSPC;
1953 			}
1954 		}
1955 	}
1956 
1957 	if (sz > num_clusters) {
1958 		/* Expand the cluster array if necessary.
1959 		 * We only shrink the array when persisting.
1960 		 */
1961 		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz);
1962 		if (sz > 0 && tmp == NULL) {
1963 			return -ENOMEM;
1964 		}
1965 		memset(tmp + blob->active.cluster_array_size, 0,
1966 		       sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size));
1967 		blob->active.clusters = tmp;
1968 		blob->active.cluster_array_size = sz;
1969 
1970 		/* Expand the extents table, only if enough clusters were added */
1971 		if (new_num_ep > current_num_ep && blob->use_extent_table) {
1972 			ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep);
1973 			if (new_num_ep > 0 && ep_tmp == NULL) {
1974 				return -ENOMEM;
1975 			}
1976 			memset(ep_tmp + blob->active.extent_pages_array_size, 0,
1977 			       sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size));
1978 			blob->active.extent_pages = ep_tmp;
1979 			blob->active.extent_pages_array_size = new_num_ep;
1980 		}
1981 	}
1982 
1983 	blob->state = SPDK_BLOB_STATE_DIRTY;
1984 
1985 	if (spdk_blob_is_thin_provisioned(blob) == false) {
1986 		cluster = 0;
1987 		lfmd = 0;
1988 		pthread_mutex_lock(&blob->bs->used_clusters_mutex);
1989 		for (i = num_clusters; i < sz; i++) {
1990 			bs_allocate_cluster(blob, i, &cluster, &lfmd, true);
1991 			lfmd++;
1992 		}
1993 		pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
1994 	}
1995 
1996 	blob->active.num_clusters = sz;
1997 	blob->active.num_extent_pages = new_num_ep;
1998 
1999 	return 0;
2000 }
2001 
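/* Serialize the blob's current state into metadata pages, claim a metadata page
 * for every page after the first (whose location is fixed by the blobid), link
 * the pages together and compute their CRCs, then start writing the chain.
 */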
2002 static void
2003 blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx)
2004 {
2005 	spdk_bs_sequence_t *seq = ctx->seq;
2006 	struct spdk_blob *blob = ctx->blob;
2007 	struct spdk_blob_store *bs = blob->bs;
2008 	uint64_t i;
2009 	uint32_t page_num;
2010 	void *tmp;
2011 	int rc;
2012 
2013 	/* Generate the new metadata */
2014 	rc = blob_serialize(blob, &ctx->pages, &blob->active.num_pages);
2015 	if (rc < 0) {
2016 		blob_persist_complete(seq, ctx, rc);
2017 		return;
2018 	}
2019 
2020 	assert(blob->active.num_pages >= 1);
2021 
2022 	/* Resize the cache of page indices */
2023 	tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
2024 	if (!tmp) {
2025 		blob_persist_complete(seq, ctx, -ENOMEM);
2026 		return;
2027 	}
2028 	blob->active.pages = tmp;
2029 
2030 	/* Assign this metadata to pages. This requires two passes -
2031 	 * one to verify that there are enough pages and a second
2032 	 * to actually claim them. */
2033 	page_num = 0;
2034 	/* Note that this loop starts at one. The first page location is fixed by the blobid. */
2035 	for (i = 1; i < blob->active.num_pages; i++) {
2036 		page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
2037 		if (page_num == UINT32_MAX) {
2038 			blob_persist_complete(seq, ctx, -ENOMEM);
2039 			return;
2040 		}
2041 		page_num++;
2042 	}
2043 
2044 	page_num = 0;
2045 	blob->active.pages[0] = bs_blobid_to_page(blob->id);
2046 	for (i = 1; i < blob->active.num_pages; i++) {
2047 		page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
2048 		ctx->pages[i - 1].next = page_num;
2049 		/* Now that the previous metadata page is complete, calculate the crc for it. */
2050 		ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
2051 		blob->active.pages[i] = page_num;
2052 		bs_claim_md_page(bs, page_num);
2053 		SPDK_DEBUGLOG(blob, "Claiming page %u for blob %" PRIu64 "\n", page_num, blob->id);
2054 		page_num++;
2055 	}
2056 	ctx->pages[i - 1].crc = blob_md_page_calc_crc(&ctx->pages[i - 1]);
2057 	/* Start writing the metadata from last page to first */
2058 	blob->state = SPDK_BLOB_STATE_CLEAN;
2059 	blob_persist_write_page_chain(seq, ctx);
2060 }
2061 
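/* Write out any extent pages allocated since the last persist, one page per
 * iteration (this function is also the write completion callback, so it is
 * re-entered after each write). Once all extent pages are on disk, move on to
 * generating the new metadata.
 */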
2062 static void
2063 blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2064 {
2065 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
2066 	struct spdk_blob		*blob = ctx->blob;
2067 	size_t				i;
2068 	uint32_t			extent_page_id;
2069 	uint32_t                        page_count = 0;
2070 	int				rc;
2071 
2072 	if (ctx->extent_page != NULL) {
2073 		spdk_free(ctx->extent_page);
2074 		ctx->extent_page = NULL;
2075 	}
2076 
2077 	if (bserrno != 0) {
2078 		blob_persist_complete(seq, ctx, bserrno);
2079 		return;
2080 	}
2081 
2082 	/* Only write out changed extent pages */
2083 	for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
2084 		extent_page_id = blob->active.extent_pages[i];
2085 		if (extent_page_id == 0) {
2086 			/* No Extent Page to persist */
2087 			assert(spdk_blob_is_thin_provisioned(blob));
2088 			continue;
2089 		}
2090 		/* Writing out a new extent page for the first time. Either the active extent page array is
2091 		 * larger than the clean one, or no extent page was assigned yet due to thin provisioning. */
2092 		if (i >= blob->clean.extent_pages_array_size || blob->clean.extent_pages[i] == 0) {
2093 			blob->state = SPDK_BLOB_STATE_DIRTY;
2094 			assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id));
2095 			ctx->next_extent_page = i + 1;
2096 			rc = blob_serialize_add_page(ctx->blob, &ctx->extent_page, &page_count, &ctx->extent_page);
2097 			if (rc < 0) {
2098 				blob_persist_complete(seq, ctx, rc);
2099 				return;
2100 			}
2101 
2102 			blob_serialize_extent_page(blob, i * SPDK_EXTENTS_PER_EP, ctx->extent_page);
2103 
2104 			ctx->extent_page->crc = blob_md_page_calc_crc(ctx->extent_page);
2105 
2106 			bs_sequence_write_dev(seq, ctx->extent_page, bs_md_page_to_lba(blob->bs, extent_page_id),
2107 					      bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
2108 					      blob_persist_write_extent_pages, ctx);
2109 			return;
2110 		}
2111 		assert(blob->clean.extent_pages[i] != 0);
2112 	}
2113 
2114 	blob_persist_generate_new_md(ctx);
2115 }
2116 
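/* Entry point of the persist sequence. A blob with zero active pages is being
 * deleted, so jump straight to zeroing its old metadata pages; otherwise start
 * by writing out the extent pages.
 */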
2117 static void
2118 blob_persist_start(struct spdk_blob_persist_ctx *ctx)
2119 {
2120 	spdk_bs_sequence_t *seq = ctx->seq;
2121 	struct spdk_blob *blob = ctx->blob;
2122 
2123 	if (blob->active.num_pages == 0) {
2124 		/* This is the signal that the blob should be deleted.
2125 		 * Immediately jump to the clean up routine. */
2126 		assert(blob->clean.num_pages > 0);
2127 		blob->state = SPDK_BLOB_STATE_CLEAN;
2128 		blob_persist_zero_pages(seq, ctx, 0);
2129 		return;
2130 
2131 	}
2132 
2133 	blob_persist_write_extent_pages(seq, ctx, 0);
2134 }
2135 
2136 static void
2137 blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2138 {
2139 	struct spdk_blob_persist_ctx *ctx = cb_arg;
2140 
2141 	spdk_free(ctx->super);
2142 
2143 	if (bserrno != 0) {
2144 		blob_persist_complete(seq, ctx, bserrno);
2145 		return;
2146 	}
2147 
2148 	ctx->blob->bs->clean = 0;
2149 
2150 	blob_persist_start(ctx);
2151 }
2152 
2153 static void
2154 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
2155 	       struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg);
2156 
2157 
2158 static void
2159 blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2160 {
2161 	struct spdk_blob_persist_ctx *ctx = cb_arg;
2162 
2163 	if (bserrno != 0) {
2164 		spdk_free(ctx->super);
2165 		blob_persist_complete(seq, ctx, bserrno);
2166 		return;
2167 	}
2168 
2169 	ctx->super->clean = 0;
2170 	if (ctx->super->size == 0) {
2171 		ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen;
2172 	}
2173 
2174 	bs_write_super(seq, ctx->blob->bs, ctx->super, blob_persist_dirty_cpl, ctx);
2175 }
2176 
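/* If the blobstore is still marked clean on disk, read the super block, clear its
 * clean flag and write it back before starting the actual persist.
 */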
2177 static void
2178 blob_persist_check_dirty(struct spdk_blob_persist_ctx *ctx)
2179 {
2180 	if (ctx->blob->bs->clean) {
2181 		ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
2182 					  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2183 		if (!ctx->super) {
2184 			blob_persist_complete(ctx->seq, ctx, -ENOMEM);
2185 			return;
2186 		}
2187 
2188 		bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(ctx->blob->bs, 0),
2189 				     bs_byte_to_lba(ctx->blob->bs, sizeof(*ctx->super)),
2190 				     blob_persist_dirty, ctx);
2191 	} else {
2192 		blob_persist_start(ctx);
2193 	}
2194 }
2195 
2196 /* Write a blob to disk */
2197 static void
2198 blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
2199 	     spdk_bs_sequence_cpl cb_fn, void *cb_arg)
2200 {
2201 	struct spdk_blob_persist_ctx *ctx;
2202 
2203 	blob_verify_md_op(blob);
2204 
2205 	if (blob->state == SPDK_BLOB_STATE_CLEAN && TAILQ_EMPTY(&blob->pending_persists)) {
2206 		cb_fn(seq, cb_arg, 0);
2207 		return;
2208 	}
2209 
2210 	ctx = calloc(1, sizeof(*ctx));
2211 	if (!ctx) {
2212 		cb_fn(seq, cb_arg, -ENOMEM);
2213 		return;
2214 	}
2215 	ctx->blob = blob;
2216 	ctx->seq = seq;
2217 	ctx->cb_fn = cb_fn;
2218 	ctx->cb_arg = cb_arg;
2219 	ctx->next_extent_page = 0;
2220 
2221 	/* Multiple blob persists can interfere with one another, via blob->state or
2222 	 * changes to the blob's mutable data. To prevent this, queue up the persists. */
2223 	if (!TAILQ_EMPTY(&blob->pending_persists)) {
2224 		TAILQ_INSERT_TAIL(&blob->pending_persists, ctx, link);
2225 		return;
2226 	}
2227 	TAILQ_INSERT_HEAD(&blob->pending_persists, ctx, link);
2228 
2229 	blob_persist_check_dirty(ctx);
2230 }
2231 
2232 struct spdk_blob_copy_cluster_ctx {
2233 	struct spdk_blob *blob;
2234 	uint8_t *buf;
2235 	uint64_t page;
2236 	uint64_t new_cluster;
2237 	uint32_t new_extent_page;
2238 	spdk_bs_sequence_t *seq;
2239 };
2240 
2241 static void
2242 blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno)
2243 {
2244 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2245 	struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq;
2246 	TAILQ_HEAD(, spdk_bs_request_set) requests;
2247 	spdk_bs_user_op_t *op;
2248 
2249 	TAILQ_INIT(&requests);
2250 	TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link);
2251 
2252 	while (!TAILQ_EMPTY(&requests)) {
2253 		op = TAILQ_FIRST(&requests);
2254 		TAILQ_REMOVE(&requests, op, link);
2255 		if (bserrno == 0) {
2256 			bs_user_op_execute(op);
2257 		} else {
2258 			bs_user_op_abort(op);
2259 		}
2260 	}
2261 
2262 	spdk_free(ctx->buf);
2263 	free(ctx);
2264 }
2265 
2266 static void
2267 blob_insert_cluster_cpl(void *cb_arg, int bserrno)
2268 {
2269 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2270 
2271 	if (bserrno) {
2272 		if (bserrno == -EEXIST) {
2273 			/* The metadata insert failed because another thread
2274 			 * allocated the cluster first. Free our cluster
2275 			 * but continue without error. */
2276 			bserrno = 0;
2277 		}
2278 		pthread_mutex_lock(&ctx->blob->bs->used_clusters_mutex);
2279 		bs_release_cluster(ctx->blob->bs, ctx->new_cluster);
2280 		pthread_mutex_unlock(&ctx->blob->bs->used_clusters_mutex);
2281 		if (ctx->new_extent_page != 0) {
2282 			bs_release_md_page(ctx->blob->bs, ctx->new_extent_page);
2283 		}
2284 	}
2285 
2286 	bs_sequence_finish(ctx->seq, bserrno);
2287 }
2288 
2289 static void
2290 blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2291 {
2292 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2293 	uint32_t cluster_number;
2294 
2295 	if (bserrno) {
2296 		/* The write failed, so jump to the final completion handler */
2297 		bs_sequence_finish(seq, bserrno);
2298 		return;
2299 	}
2300 
2301 	cluster_number = bs_page_to_cluster(ctx->blob->bs, ctx->page);
2302 
2303 	blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
2304 					 ctx->new_extent_page, blob_insert_cluster_cpl, ctx);
2305 }
2306 
2307 static void
2308 blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2309 {
2310 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2311 
2312 	if (bserrno != 0) {
2313 		/* The read failed, so jump to the final completion handler */
2314 		bs_sequence_finish(seq, bserrno);
2315 		return;
2316 	}
2317 
2318 	/* Write whole cluster */
2319 	bs_sequence_write_dev(seq, ctx->buf,
2320 			      bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster),
2321 			      bs_cluster_to_lba(ctx->blob->bs, 1),
2322 			      blob_write_copy_cpl, ctx);
2323 }
2324 
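/* Handle the first write to an unallocated cluster: allocate a new cluster and,
 * if the blob has a parent, copy the parent's data for that cluster into it before
 * inserting it into the blob's cluster map on the metadata thread. The triggering
 * user op is queued on the channel and re-executed (or aborted) once the
 * allocation completes.
 */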
2325 static void
2326 bs_allocate_and_copy_cluster(struct spdk_blob *blob,
2327 			     struct spdk_io_channel *_ch,
2328 			     uint64_t io_unit, spdk_bs_user_op_t *op)
2329 {
2330 	struct spdk_bs_cpl cpl;
2331 	struct spdk_bs_channel *ch;
2332 	struct spdk_blob_copy_cluster_ctx *ctx;
2333 	uint32_t cluster_start_page;
2334 	uint32_t cluster_number;
2335 	int rc;
2336 
2337 	ch = spdk_io_channel_get_ctx(_ch);
2338 
2339 	if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) {
2340 		/* There are already operations pending. Queue this user op
2341 		 * and return because it will be re-executed when the outstanding
2342 		 * cluster allocation completes. */
2343 		TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
2344 		return;
2345 	}
2346 
2347 	/* Round the io_unit offset down to the first page in the cluster */
2348 	cluster_start_page = bs_io_unit_to_cluster_start(blob, io_unit);
2349 
2350 	/* Calculate which index in the metadata cluster array the corresponding
2351 	 * cluster is supposed to be at. */
2352 	cluster_number = bs_io_unit_to_cluster_number(blob, io_unit);
2353 
2354 	ctx = calloc(1, sizeof(*ctx));
2355 	if (!ctx) {
2356 		bs_user_op_abort(op);
2357 		return;
2358 	}
2359 
2360 	assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0);
2361 
2362 	ctx->blob = blob;
2363 	ctx->page = cluster_start_page;
2364 
2365 	if (blob->parent_id != SPDK_BLOBID_INVALID) {
2366 		ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
2367 				       NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2368 		if (!ctx->buf) {
2369 			SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n",
2370 				    blob->bs->cluster_sz);
2371 			free(ctx);
2372 			bs_user_op_abort(op);
2373 			return;
2374 		}
2375 	}
2376 
2377 	pthread_mutex_lock(&blob->bs->used_clusters_mutex);
2378 	rc = bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page,
2379 				 false);
2380 	pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
2381 	if (rc != 0) {
2382 		spdk_free(ctx->buf);
2383 		free(ctx);
2384 		bs_user_op_abort(op);
2385 		return;
2386 	}
2387 
2388 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2389 	cpl.u.blob_basic.cb_fn = blob_allocate_and_copy_cluster_cpl;
2390 	cpl.u.blob_basic.cb_arg = ctx;
2391 
2392 	ctx->seq = bs_sequence_start(_ch, &cpl);
2393 	if (!ctx->seq) {
2394 		pthread_mutex_lock(&blob->bs->used_clusters_mutex);
2395 		bs_release_cluster(blob->bs, ctx->new_cluster);
2396 		pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
2397 		spdk_free(ctx->buf);
2398 		free(ctx);
2399 		bs_user_op_abort(op);
2400 		return;
2401 	}
2402 
2403 	/* Queue the user op to block other incoming operations */
2404 	TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
2405 
2406 	if (blob->parent_id != SPDK_BLOBID_INVALID) {
2407 		/* Read cluster from backing device */
2408 		bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
2409 					bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
2410 					bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz),
2411 					blob_write_copy, ctx);
2412 	} else {
2413 		blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
2414 						 ctx->new_extent_page, blob_insert_cluster_cpl, ctx);
2415 	}
2416 }
2417 
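/* Translate an io_unit offset within the blob into an LBA and a length in blocks
 * on the device that holds the data. Returns true if the cluster is allocated
 * (the LBA is on the blobstore device) and false if it is not (the LBA is on the
 * backing device).
 */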
2418 static inline bool
2419 blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length,
2420 				 uint64_t *lba,	uint32_t *lba_count)
2421 {
2422 	*lba_count = length;
2423 
2424 	if (!bs_io_unit_is_allocated(blob, io_unit)) {
2425 		assert(blob->back_bs_dev != NULL);
2426 		*lba = bs_io_unit_to_back_dev_lba(blob, io_unit);
2427 		*lba_count = bs_io_unit_to_back_dev_lba(blob, *lba_count);
2428 		return false;
2429 	} else {
2430 		*lba = bs_blob_io_unit_to_lba(blob, io_unit);
2431 		return true;
2432 	}
2433 }
2434 
2435 struct op_split_ctx {
2436 	struct spdk_blob *blob;
2437 	struct spdk_io_channel *channel;
2438 	uint64_t io_unit_offset;
2439 	uint64_t io_units_remaining;
2440 	void *curr_payload;
2441 	enum spdk_blob_op_type op_type;
2442 	spdk_bs_sequence_t *seq;
2443 };
2444 
2445 static void
2446 blob_request_submit_op_split_next(void *cb_arg, int bserrno)
2447 {
2448 	struct op_split_ctx	*ctx = cb_arg;
2449 	struct spdk_blob	*blob = ctx->blob;
2450 	struct spdk_io_channel	*ch = ctx->channel;
2451 	enum spdk_blob_op_type	op_type = ctx->op_type;
2452 	uint8_t			*buf = ctx->curr_payload;
2453 	uint64_t		offset = ctx->io_unit_offset;
2454 	uint64_t		length = ctx->io_units_remaining;
2455 	uint64_t		op_length;
2456 
2457 	if (bserrno != 0 || ctx->io_units_remaining == 0) {
2458 		bs_sequence_finish(ctx->seq, bserrno);
2459 		free(ctx);
2460 		return;
2461 	}
2462 
2463 	op_length = spdk_min(length, bs_num_io_units_to_cluster_boundary(blob,
2464 			     offset));
2465 
2466 	/* Update length and payload for next operation */
2467 	ctx->io_units_remaining -= op_length;
2468 	ctx->io_unit_offset += op_length;
2469 	if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) {
2470 		ctx->curr_payload += op_length * blob->bs->io_unit_size;
2471 	}
2472 
2473 	switch (op_type) {
2474 	case SPDK_BLOB_READ:
2475 		spdk_blob_io_read(blob, ch, buf, offset, op_length,
2476 				  blob_request_submit_op_split_next, ctx);
2477 		break;
2478 	case SPDK_BLOB_WRITE:
2479 		spdk_blob_io_write(blob, ch, buf, offset, op_length,
2480 				   blob_request_submit_op_split_next, ctx);
2481 		break;
2482 	case SPDK_BLOB_UNMAP:
2483 		spdk_blob_io_unmap(blob, ch, offset, op_length,
2484 				   blob_request_submit_op_split_next, ctx);
2485 		break;
2486 	case SPDK_BLOB_WRITE_ZEROES:
2487 		spdk_blob_io_write_zeroes(blob, ch, offset, op_length,
2488 					  blob_request_submit_op_split_next, ctx);
2489 		break;
2490 	case SPDK_BLOB_READV:
2491 	case SPDK_BLOB_WRITEV:
2492 		SPDK_ERRLOG("readv/writev not valid\n");
2493 		bs_sequence_finish(ctx->seq, -EINVAL);
2494 		free(ctx);
2495 		break;
2496 	}
2497 }
2498 
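/* Split an I/O that crosses cluster boundaries into per-cluster operations that
 * are issued one after another by blob_request_submit_op_split_next().
 */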
2499 static void
2500 blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob,
2501 			     void *payload, uint64_t offset, uint64_t length,
2502 			     spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
2503 {
2504 	struct op_split_ctx *ctx;
2505 	spdk_bs_sequence_t *seq;
2506 	struct spdk_bs_cpl cpl;
2507 
2508 	assert(blob != NULL);
2509 
2510 	ctx = calloc(1, sizeof(struct op_split_ctx));
2511 	if (ctx == NULL) {
2512 		cb_fn(cb_arg, -ENOMEM);
2513 		return;
2514 	}
2515 
2516 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2517 	cpl.u.blob_basic.cb_fn = cb_fn;
2518 	cpl.u.blob_basic.cb_arg = cb_arg;
2519 
2520 	seq = bs_sequence_start(ch, &cpl);
2521 	if (!seq) {
2522 		free(ctx);
2523 		cb_fn(cb_arg, -ENOMEM);
2524 		return;
2525 	}
2526 
2527 	ctx->blob = blob;
2528 	ctx->channel = ch;
2529 	ctx->curr_payload = payload;
2530 	ctx->io_unit_offset = offset;
2531 	ctx->io_units_remaining = length;
2532 	ctx->op_type = op_type;
2533 	ctx->seq = seq;
2534 
2535 	blob_request_submit_op_split_next(ctx, 0);
2536 }
2537 
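/* Submit an I/O that does not cross a cluster boundary. Writes that target an
 * unallocated cluster trigger a cluster allocation and copy; I/O against a frozen
 * blob is queued on the channel instead of being submitted.
 */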
2538 static void
2539 blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob,
2540 			      void *payload, uint64_t offset, uint64_t length,
2541 			      spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
2542 {
2543 	struct spdk_bs_cpl cpl;
2544 	uint64_t lba;
2545 	uint32_t lba_count;
2546 	bool is_allocated;
2547 
2548 	assert(blob != NULL);
2549 
2550 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2551 	cpl.u.blob_basic.cb_fn = cb_fn;
2552 	cpl.u.blob_basic.cb_arg = cb_arg;
2553 
2554 	is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
2555 
2556 	if (blob->frozen_refcnt) {
2557 		/* This blob I/O is frozen */
2558 		spdk_bs_user_op_t *op;
2559 		struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch);
2560 
2561 		op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
2562 		if (!op) {
2563 			cb_fn(cb_arg, -ENOMEM);
2564 			return;
2565 		}
2566 
2567 		TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
2568 
2569 		return;
2570 	}
2571 
2572 	switch (op_type) {
2573 	case SPDK_BLOB_READ: {
2574 		spdk_bs_batch_t *batch;
2575 
2576 		batch = bs_batch_open(_ch, &cpl);
2577 		if (!batch) {
2578 			cb_fn(cb_arg, -ENOMEM);
2579 			return;
2580 		}
2581 
2582 		if (is_allocated) {
2583 			/* Read from the blob */
2584 			bs_batch_read_dev(batch, payload, lba, lba_count);
2585 		} else {
2586 			/* Read from the backing block device */
2587 			bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count);
2588 		}
2589 
2590 		bs_batch_close(batch);
2591 		break;
2592 	}
2593 	case SPDK_BLOB_WRITE:
2594 	case SPDK_BLOB_WRITE_ZEROES: {
2595 		if (is_allocated) {
2596 			/* Write to the blob */
2597 			spdk_bs_batch_t *batch;
2598 
2599 			if (lba_count == 0) {
2600 				cb_fn(cb_arg, 0);
2601 				return;
2602 			}
2603 
2604 			batch = bs_batch_open(_ch, &cpl);
2605 			if (!batch) {
2606 				cb_fn(cb_arg, -ENOMEM);
2607 				return;
2608 			}
2609 
2610 			if (op_type == SPDK_BLOB_WRITE) {
2611 				bs_batch_write_dev(batch, payload, lba, lba_count);
2612 			} else {
2613 				bs_batch_write_zeroes_dev(batch, lba, lba_count);
2614 			}
2615 
2616 			bs_batch_close(batch);
2617 		} else {
2618 			/* Queue this operation and allocate the cluster */
2619 			spdk_bs_user_op_t *op;
2620 
2621 			op = bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
2622 			if (!op) {
2623 				cb_fn(cb_arg, -ENOMEM);
2624 				return;
2625 			}
2626 
2627 			bs_allocate_and_copy_cluster(blob, _ch, offset, op);
2628 		}
2629 		break;
2630 	}
2631 	case SPDK_BLOB_UNMAP: {
2632 		spdk_bs_batch_t *batch;
2633 
2634 		batch = bs_batch_open(_ch, &cpl);
2635 		if (!batch) {
2636 			cb_fn(cb_arg, -ENOMEM);
2637 			return;
2638 		}
2639 
2640 		if (is_allocated) {
2641 			bs_batch_unmap_dev(batch, lba, lba_count);
2642 		}
2643 
2644 		bs_batch_close(batch);
2645 		break;
2646 	}
2647 	case SPDK_BLOB_READV:
2648 	case SPDK_BLOB_WRITEV:
2649 		SPDK_ERRLOG("readv/writev not valid\n");
2650 		cb_fn(cb_arg, -EINVAL);
2651 		break;
2652 	}
2653 }
2654 
2655 static void
2656 blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel,
2657 		       void *payload, uint64_t offset, uint64_t length,
2658 		       spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
2659 {
2660 	assert(blob != NULL);
2661 
2662 	if (blob->data_ro && op_type != SPDK_BLOB_READ) {
2663 		cb_fn(cb_arg, -EPERM);
2664 		return;
2665 	}
2666 
2667 	if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
2668 		cb_fn(cb_arg, -EINVAL);
2669 		return;
2670 	}
2671 	if (length <= bs_num_io_units_to_cluster_boundary(blob, offset)) {
2672 		blob_request_submit_op_single(_channel, blob, payload, offset, length,
2673 					      cb_fn, cb_arg, op_type);
2674 	} else {
2675 		blob_request_submit_op_split(_channel, blob, payload, offset, length,
2676 					     cb_fn, cb_arg, op_type);
2677 	}
2678 }
2679 
2680 struct rw_iov_ctx {
2681 	struct spdk_blob *blob;
2682 	struct spdk_io_channel *channel;
2683 	spdk_blob_op_complete cb_fn;
2684 	void *cb_arg;
2685 	bool read;
2686 	int iovcnt;
2687 	struct iovec *orig_iov;
2688 	uint64_t io_unit_offset;
2689 	uint64_t io_units_remaining;
2690 	uint64_t io_units_done;
2691 	struct iovec iov[0];
2692 };
2693 
2694 static void
2695 rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2696 {
2697 	assert(cb_arg == NULL);
2698 	bs_sequence_finish(seq, bserrno);
2699 }
2700 
2701 static void
2702 rw_iov_split_next(void *cb_arg, int bserrno)
2703 {
2704 	struct rw_iov_ctx *ctx = cb_arg;
2705 	struct spdk_blob *blob = ctx->blob;
2706 	struct iovec *iov, *orig_iov;
2707 	int iovcnt;
2708 	size_t orig_iovoff;
2709 	uint64_t io_units_count, io_units_to_boundary, io_unit_offset;
2710 	uint64_t byte_count;
2711 
2712 	if (bserrno != 0 || ctx->io_units_remaining == 0) {
2713 		ctx->cb_fn(ctx->cb_arg, bserrno);
2714 		free(ctx);
2715 		return;
2716 	}
2717 
2718 	io_unit_offset = ctx->io_unit_offset;
2719 	io_units_to_boundary = bs_num_io_units_to_cluster_boundary(blob, io_unit_offset);
2720 	io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary);
2721 	/*
2722 	 * Get index and offset into the original iov array for our current position in the I/O sequence.
2723 	 *  byte_count keeps track of how many bytes remain until orig_iov and orig_iovoff
2724 	 *  point to the current position in the I/O sequence.
2725 	 */
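	/* For example, assuming a 4 KiB io_unit size and io_units_done == 3, byte_count
	 * starts at 12 KiB and is walked off the front of orig_iov[] until it lands on
	 * the iov element (and offset within it) that corresponds to io_unit 3.
	 */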
2726 	byte_count = ctx->io_units_done * blob->bs->io_unit_size;
2727 	orig_iov = &ctx->orig_iov[0];
2728 	orig_iovoff = 0;
2729 	while (byte_count > 0) {
2730 		if (byte_count >= orig_iov->iov_len) {
2731 			byte_count -= orig_iov->iov_len;
2732 			orig_iov++;
2733 		} else {
2734 			orig_iovoff = byte_count;
2735 			byte_count = 0;
2736 		}
2737 	}
2738 
2739 	/*
2740 	 * Build an iov array for the next I/O in the sequence.  byte_count will keep track of how many
2741 	 *  bytes of this next I/O remain to be accounted for in the new iov array.
2742 	 */
2743 	byte_count = io_units_count * blob->bs->io_unit_size;
2744 	iov = &ctx->iov[0];
2745 	iovcnt = 0;
2746 	while (byte_count > 0) {
2747 		assert(iovcnt < ctx->iovcnt);
2748 		iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff);
2749 		iov->iov_base = orig_iov->iov_base + orig_iovoff;
2750 		byte_count -= iov->iov_len;
2751 		orig_iovoff = 0;
2752 		orig_iov++;
2753 		iov++;
2754 		iovcnt++;
2755 	}
2756 
2757 	ctx->io_unit_offset += io_units_count;
2758 	ctx->io_units_remaining -= io_units_count;
2759 	ctx->io_units_done += io_units_count;
2760 	iov = &ctx->iov[0];
2761 
2762 	if (ctx->read) {
2763 		spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
2764 				   io_units_count, rw_iov_split_next, ctx);
2765 	} else {
2766 		spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
2767 				    io_units_count, rw_iov_split_next, ctx);
2768 	}
2769 }
2770 
2771 static void
2772 blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel,
2773 			   struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
2774 			   spdk_blob_op_complete cb_fn, void *cb_arg, bool read)
2775 {
2776 	struct spdk_bs_cpl	cpl;
2777 
2778 	assert(blob != NULL);
2779 
2780 	if (!read && blob->data_ro) {
2781 		cb_fn(cb_arg, -EPERM);
2782 		return;
2783 	}
2784 
2785 	if (length == 0) {
2786 		cb_fn(cb_arg, 0);
2787 		return;
2788 	}
2789 
2790 	if (offset + length > bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
2791 		cb_fn(cb_arg, -EINVAL);
2792 		return;
2793 	}
2794 
2795 	/*
2796 	 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having
2797 	 *  to split a request that spans a cluster boundary.  For I/O that do not span a cluster boundary,
2798 	 *  there will be no noticeable difference compared to using a batch.  For I/O that do span a cluster
2799 	 *  boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need
2800 	 *  to allocate a separate iov array and split the I/O such that none of the resulting
2801 	 *  smaller I/O cross a cluster boundary.  These smaller I/O will be issued in sequence (not in parallel)
2802 	 *  but since this case happens very infrequently, any performance impact will be negligible.
2803 	 *
2804 	 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs
2805 	 *  for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them
2806 	 *  in a batch.  That would also require creating an intermediate spdk_bs_cpl that would get called
2807 	 *  when the batch was completed, to allow for freeing the memory for the iov arrays.
2808 	 */
2809 	if (spdk_likely(length <= bs_num_io_units_to_cluster_boundary(blob, offset))) {
2810 		uint32_t lba_count;
2811 		uint64_t lba;
2812 		bool is_allocated;
2813 
2814 		cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2815 		cpl.u.blob_basic.cb_fn = cb_fn;
2816 		cpl.u.blob_basic.cb_arg = cb_arg;
2817 
2818 		if (blob->frozen_refcnt) {
2819 			/* This blob I/O is frozen */
2820 			enum spdk_blob_op_type op_type;
2821 			spdk_bs_user_op_t *op;
2822 			struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel);
2823 
2824 			op_type = read ? SPDK_BLOB_READV : SPDK_BLOB_WRITEV;
2825 			op = bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length);
2826 			if (!op) {
2827 				cb_fn(cb_arg, -ENOMEM);
2828 				return;
2829 			}
2830 
2831 			TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);
2832 
2833 			return;
2834 		}
2835 
2836 		is_allocated = blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);
2837 
2838 		if (read) {
2839 			spdk_bs_sequence_t *seq;
2840 
2841 			seq = bs_sequence_start(_channel, &cpl);
2842 			if (!seq) {
2843 				cb_fn(cb_arg, -ENOMEM);
2844 				return;
2845 			}
2846 
2847 			if (is_allocated) {
2848 				bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
2849 			} else {
2850 				bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count,
2851 							 rw_iov_done, NULL);
2852 			}
2853 		} else {
2854 			if (is_allocated) {
2855 				spdk_bs_sequence_t *seq;
2856 
2857 				seq = bs_sequence_start(_channel, &cpl);
2858 				if (!seq) {
2859 					cb_fn(cb_arg, -ENOMEM);
2860 					return;
2861 				}
2862 
2863 				bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, rw_iov_done, NULL);
2864 			} else {
2865 				/* Queue this operation and allocate the cluster */
2866 				spdk_bs_user_op_t *op;
2867 
2868 				op = bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset,
2869 						      length);
2870 				if (!op) {
2871 					cb_fn(cb_arg, -ENOMEM);
2872 					return;
2873 				}
2874 
2875 				bs_allocate_and_copy_cluster(blob, _channel, offset, op);
2876 			}
2877 		}
2878 	} else {
2879 		struct rw_iov_ctx *ctx;
2880 
2881 		ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec));
2882 		if (ctx == NULL) {
2883 			cb_fn(cb_arg, -ENOMEM);
2884 			return;
2885 		}
2886 
2887 		ctx->blob = blob;
2888 		ctx->channel = _channel;
2889 		ctx->cb_fn = cb_fn;
2890 		ctx->cb_arg = cb_arg;
2891 		ctx->read = read;
2892 		ctx->orig_iov = iov;
2893 		ctx->iovcnt = iovcnt;
2894 		ctx->io_unit_offset = offset;
2895 		ctx->io_units_remaining = length;
2896 		ctx->io_units_done = 0;
2897 
2898 		rw_iov_split_next(ctx, 0);
2899 	}
2900 }
2901 
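/* Return the open blob with the given id, or NULL if it is not currently open.
 * The open_blobids bit array is checked first to avoid walking the blob list.
 */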
2902 static struct spdk_blob *
2903 blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid)
2904 {
2905 	struct spdk_blob *blob;
2906 
2907 	if (spdk_bit_array_get(bs->open_blobids, blobid) == 0) {
2908 		return NULL;
2909 	}
2910 
2911 	TAILQ_FOREACH(blob, &bs->blobs, link) {
2912 		if (blob->id == blobid) {
2913 			return blob;
2914 		}
2915 	}
2916 
2917 	return NULL;
2918 }
2919 
2920 static void
2921 blob_get_snapshot_and_clone_entries(struct spdk_blob *blob,
2922 				    struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry)
2923 {
2924 	assert(blob != NULL);
2925 	*snapshot_entry = NULL;
2926 	*clone_entry = NULL;
2927 
2928 	if (blob->parent_id == SPDK_BLOBID_INVALID) {
2929 		return;
2930 	}
2931 
2932 	TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) {
2933 		if ((*snapshot_entry)->id == blob->parent_id) {
2934 			break;
2935 		}
2936 	}
2937 
2938 	if (*snapshot_entry != NULL) {
2939 		TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) {
2940 			if ((*clone_entry)->id == blob->id) {
2941 				break;
2942 			}
2943 		}
2944 
2945 		assert(*clone_entry != NULL);
2946 	}
2947 }
2948 
2949 static int
2950 bs_channel_create(void *io_device, void *ctx_buf)
2951 {
2952 	struct spdk_blob_store		*bs = io_device;
2953 	struct spdk_bs_channel		*channel = ctx_buf;
2954 	struct spdk_bs_dev		*dev;
2955 	uint32_t			max_ops = bs->max_channel_ops;
2956 	uint32_t			i;
2957 
2958 	dev = bs->dev;
2959 
2960 	channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set));
2961 	if (!channel->req_mem) {
2962 		return -1;
2963 	}
2964 
2965 	TAILQ_INIT(&channel->reqs);
2966 
2967 	for (i = 0; i < max_ops; i++) {
2968 		TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link);
2969 	}
2970 
2971 	channel->bs = bs;
2972 	channel->dev = dev;
2973 	channel->dev_channel = dev->create_channel(dev);
2974 
2975 	if (!channel->dev_channel) {
2976 		SPDK_ERRLOG("Failed to create device channel.\n");
2977 		free(channel->req_mem);
2978 		return -1;
2979 	}
2980 
2981 	TAILQ_INIT(&channel->need_cluster_alloc);
2982 	TAILQ_INIT(&channel->queued_io);
2983 
2984 	return 0;
2985 }
2986 
2987 static void
2988 bs_channel_destroy(void *io_device, void *ctx_buf)
2989 {
2990 	struct spdk_bs_channel *channel = ctx_buf;
2991 	spdk_bs_user_op_t *op;
2992 
2993 	while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) {
2994 		op = TAILQ_FIRST(&channel->need_cluster_alloc);
2995 		TAILQ_REMOVE(&channel->need_cluster_alloc, op, link);
2996 		bs_user_op_abort(op);
2997 	}
2998 
2999 	while (!TAILQ_EMPTY(&channel->queued_io)) {
3000 		op = TAILQ_FIRST(&channel->queued_io);
3001 		TAILQ_REMOVE(&channel->queued_io, op, link);
3002 		bs_user_op_abort(op);
3003 	}
3004 
3005 	free(channel->req_mem);
3006 	channel->dev->destroy_channel(channel->dev, channel->dev_channel);
3007 }
3008 
3009 static void
3010 bs_dev_destroy(void *io_device)
3011 {
3012 	struct spdk_blob_store *bs = io_device;
3013 	struct spdk_blob	*blob, *blob_tmp;
3014 
3015 	bs->dev->destroy(bs->dev);
3016 
3017 	TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) {
3018 		TAILQ_REMOVE(&bs->blobs, blob, link);
3019 		spdk_bit_array_clear(bs->open_blobids, blob->id);
3020 		blob_free(blob);
3021 	}
3022 
3023 	pthread_mutex_destroy(&bs->used_clusters_mutex);
3024 
3025 	spdk_bit_array_free(&bs->open_blobids);
3026 	spdk_bit_array_free(&bs->used_blobids);
3027 	spdk_bit_array_free(&bs->used_md_pages);
3028 	spdk_bit_pool_free(&bs->used_clusters);
3029 	/*
3030 	 * If this function is called for any reason except a successful unload,
3031 	 * the unload_cpl type will be NONE and this will be a nop.
3032 	 */
3033 	bs_call_cpl(&bs->unload_cpl, bs->unload_err);
3034 
3035 	free(bs);
3036 }
3037 
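/* Register the blob as a clone of its parent snapshot in the blobstore's
 * snapshot list, creating the snapshot and clone entries if they do not exist yet.
 */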
3038 static int
3039 bs_blob_list_add(struct spdk_blob *blob)
3040 {
3041 	spdk_blob_id snapshot_id;
3042 	struct spdk_blob_list *snapshot_entry = NULL;
3043 	struct spdk_blob_list *clone_entry = NULL;
3044 
3045 	assert(blob != NULL);
3046 
3047 	snapshot_id = blob->parent_id;
3048 	if (snapshot_id == SPDK_BLOBID_INVALID) {
3049 		return 0;
3050 	}
3051 
3052 	snapshot_entry = bs_get_snapshot_entry(blob->bs, snapshot_id);
3053 	if (snapshot_entry == NULL) {
3054 		/* Snapshot not found */
3055 		snapshot_entry = calloc(1, sizeof(struct spdk_blob_list));
3056 		if (snapshot_entry == NULL) {
3057 			return -ENOMEM;
3058 		}
3059 		snapshot_entry->id = snapshot_id;
3060 		TAILQ_INIT(&snapshot_entry->clones);
3061 		TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link);
3062 	} else {
3063 		TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
3064 			if (clone_entry->id == blob->id) {
3065 				break;
3066 			}
3067 		}
3068 	}
3069 
3070 	if (clone_entry == NULL) {
3071 		/* Clone not found */
3072 		clone_entry = calloc(1, sizeof(struct spdk_blob_list));
3073 		if (clone_entry == NULL) {
3074 			return -ENOMEM;
3075 		}
3076 		clone_entry->id = blob->id;
3077 		TAILQ_INIT(&clone_entry->clones);
3078 		TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link);
3079 		snapshot_entry->clone_count++;
3080 	}
3081 
3082 	return 0;
3083 }
3084 
3085 static void
3086 bs_blob_list_remove(struct spdk_blob *blob)
3087 {
3088 	struct spdk_blob_list *snapshot_entry = NULL;
3089 	struct spdk_blob_list *clone_entry = NULL;
3090 
3091 	blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry);
3092 
3093 	if (snapshot_entry == NULL) {
3094 		return;
3095 	}
3096 
3097 	blob->parent_id = SPDK_BLOBID_INVALID;
3098 	TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
3099 	free(clone_entry);
3100 
3101 	snapshot_entry->clone_count--;
3102 }
3103 
3104 static int
3105 bs_blob_list_free(struct spdk_blob_store *bs)
3106 {
3107 	struct spdk_blob_list *snapshot_entry;
3108 	struct spdk_blob_list *snapshot_entry_tmp;
3109 	struct spdk_blob_list *clone_entry;
3110 	struct spdk_blob_list *clone_entry_tmp;
3111 
3112 	TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) {
3113 		TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) {
3114 			TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
3115 			free(clone_entry);
3116 		}
3117 		TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link);
3118 		free(snapshot_entry);
3119 	}
3120 
3121 	return 0;
3122 }
3123 
3124 static void
3125 bs_free(struct spdk_blob_store *bs)
3126 {
3127 	bs_blob_list_free(bs);
3128 
3129 	bs_unregister_md_thread(bs);
3130 	spdk_io_device_unregister(bs, bs_dev_destroy);
3131 }
3132 
3133 void
3134 spdk_bs_opts_init(struct spdk_bs_opts *opts, size_t opts_size)
3135 {
3136 
3137 	if (!opts) {
3138 		SPDK_ERRLOG("opts should not be NULL\n");
3139 		return;
3140 	}
3141 
3142 	if (!opts_size) {
3143 		SPDK_ERRLOG("opts_size should not be zero\n");
3144 		return;
3145 	}
3146 
3147 	memset(opts, 0, opts_size);
3148 	opts->opts_size = opts_size;
3149 
3150 #define FIELD_OK(field) \
3151 	offsetof(struct spdk_bs_opts, field) + sizeof(opts->field) <= opts_size
3152 
3153 #define SET_FIELD(field, value) \
3154 	if (FIELD_OK(field)) { \
3155 		opts->field = value; \
3156 	} \
3157 
3158 	SET_FIELD(cluster_sz, SPDK_BLOB_OPTS_CLUSTER_SZ);
3159 	SET_FIELD(num_md_pages, SPDK_BLOB_OPTS_NUM_MD_PAGES);
3160 	SET_FIELD(max_md_ops, SPDK_BLOB_OPTS_NUM_MD_PAGES);
3161 	SET_FIELD(max_channel_ops, SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS);
3162 	SET_FIELD(clear_method,  BS_CLEAR_WITH_UNMAP);
3163 
3164 	if (FIELD_OK(bstype)) {
3165 		memset(&opts->bstype, 0, sizeof(opts->bstype));
3166 	}
3167 
3168 	SET_FIELD(iter_cb_fn, NULL);
3169 	SET_FIELD(iter_cb_arg, NULL);
3170 
3171 #undef FIELD_OK
3172 #undef SET_FIELD
3173 }
3174 
3175 static int
3176 bs_opts_verify(struct spdk_bs_opts *opts)
3177 {
3178 	if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 ||
3179 	    opts->max_channel_ops == 0) {
3180 		SPDK_ERRLOG("Blobstore options cannot be set to 0\n");
3181 		return -1;
3182 	}
3183 
3184 	return 0;
3185 }
3186 
3187 /* START spdk_bs_load */
3188 
3189 /* spdk_bs_load_ctx is used for init, load, unload and dump code paths. */
3190 
3191 struct spdk_bs_load_ctx {
3192 	struct spdk_blob_store		*bs;
3193 	struct spdk_bs_super_block	*super;
3194 
3195 	struct spdk_bs_md_mask		*mask;
3196 	bool				in_page_chain;
3197 	uint32_t			page_index;
3198 	uint32_t			cur_page;
3199 	struct spdk_blob_md_page	*page;
3200 
3201 	uint64_t			num_extent_pages;
3202 	uint32_t			*extent_page_num;
3203 	struct spdk_blob_md_page	*extent_pages;
3204 	struct spdk_bit_array		*used_clusters;
3205 
3206 	spdk_bs_sequence_t			*seq;
3207 	spdk_blob_op_with_handle_complete	iter_cb_fn;
3208 	void					*iter_cb_arg;
3209 	struct spdk_blob			*blob;
3210 	spdk_blob_id				blobid;
3211 
3212 	/* These fields are used in the spdk_bs_dump path. */
3213 	FILE					*fp;
3214 	spdk_bs_dump_print_xattr		print_xattr_fn;
3215 	char					xattr_name[4096];
3216 };
3217 
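/* Allocate the blobstore structure and the load context and initialize the fields
 * that do not depend on the on-disk super block.
 */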
3218 static int
3219 bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs,
3220 	 struct spdk_bs_load_ctx **_ctx)
3221 {
3222 	struct spdk_blob_store	*bs;
3223 	struct spdk_bs_load_ctx	*ctx;
3224 	uint64_t dev_size;
3225 	int rc;
3226 
3227 	dev_size = dev->blocklen * dev->blockcnt;
3228 	if (dev_size < opts->cluster_sz) {
3229 		/* Device size cannot be smaller than cluster size of blobstore */
3230 		SPDK_INFOLOG(blob, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n",
3231 			     dev_size, opts->cluster_sz);
3232 		return -ENOSPC;
3233 	}
3234 	if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) {
3235 		/* Cluster size cannot be smaller than page size */
3236 		SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n",
3237 			    opts->cluster_sz, SPDK_BS_PAGE_SIZE);
3238 		return -EINVAL;
3239 	}
3240 	bs = calloc(1, sizeof(struct spdk_blob_store));
3241 	if (!bs) {
3242 		return -ENOMEM;
3243 	}
3244 
3245 	ctx = calloc(1, sizeof(struct spdk_bs_load_ctx));
3246 	if (!ctx) {
3247 		free(bs);
3248 		return -ENOMEM;
3249 	}
3250 
3251 	ctx->bs = bs;
3252 	ctx->iter_cb_fn = opts->iter_cb_fn;
3253 	ctx->iter_cb_arg = opts->iter_cb_arg;
3254 
3255 	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
3256 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3257 	if (!ctx->super) {
3258 		free(ctx);
3259 		free(bs);
3260 		return -ENOMEM;
3261 	}
3262 
3263 	TAILQ_INIT(&bs->blobs);
3264 	TAILQ_INIT(&bs->snapshots);
3265 	bs->dev = dev;
3266 	bs->md_thread = spdk_get_thread();
3267 	assert(bs->md_thread != NULL);
3268 
3269 	/*
3270 	 * Do not use bs_lba_to_cluster() here since blockcnt may not be an
3271 	 *  even multiple of the cluster size.
3272 	 */
3273 	bs->cluster_sz = opts->cluster_sz;
3274 	bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen);
3275 	ctx->used_clusters = spdk_bit_array_create(bs->total_clusters);
3276 	if (!ctx->used_clusters) {
3277 		spdk_free(ctx->super);
3278 		free(ctx);
3279 		free(bs);
3280 		return -ENOMEM;
3281 	}
3282 
3283 	bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE;
3284 	if (spdk_u32_is_pow2(bs->pages_per_cluster)) {
3285 		bs->pages_per_cluster_shift = spdk_u32log2(bs->pages_per_cluster);
3286 	}
3287 	bs->num_free_clusters = bs->total_clusters;
3288 	bs->io_unit_size = dev->blocklen;
3289 
3290 	bs->max_channel_ops = opts->max_channel_ops;
3291 	bs->super_blob = SPDK_BLOBID_INVALID;
3292 	memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype));
3293 
3294 	/* The metadata is assumed to be at least 1 page */
3295 	bs->used_md_pages = spdk_bit_array_create(1);
3296 	bs->used_blobids = spdk_bit_array_create(0);
3297 	bs->open_blobids = spdk_bit_array_create(0);
3298 
3299 	pthread_mutex_init(&bs->used_clusters_mutex, NULL);
3300 
3301 	spdk_io_device_register(bs, bs_channel_create, bs_channel_destroy,
3302 				sizeof(struct spdk_bs_channel), "blobstore");
3303 	rc = bs_register_md_thread(bs);
3304 	if (rc == -1) {
3305 		spdk_io_device_unregister(bs, NULL);
3306 		pthread_mutex_destroy(&bs->used_clusters_mutex);
3307 		spdk_bit_array_free(&bs->open_blobids);
3308 		spdk_bit_array_free(&bs->used_blobids);
3309 		spdk_bit_array_free(&bs->used_md_pages);
3310 		spdk_bit_array_free(&ctx->used_clusters);
3311 		spdk_free(ctx->super);
3312 		free(ctx);
3313 		free(bs);
3314 		/* FIXME: this is a lie but don't know how to get a proper error code here */
3315 		return -ENOMEM;
3316 	}
3317 
3318 	*_ctx = ctx;
3319 	*_bs = bs;
3320 	return 0;
3321 }
3322 
3323 static void
3324 bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno)
3325 {
3326 	assert(bserrno != 0);
3327 
3328 	spdk_free(ctx->super);
3329 	bs_sequence_finish(ctx->seq, bserrno);
3330 	bs_free(ctx->bs);
3331 	spdk_bit_array_free(&ctx->used_clusters);
3332 	free(ctx);
3333 }
3334 
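/* Refresh the mutable fields of the super block, recompute its CRC and write it
 * back to page 0 of the device.
 */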
3335 static void
3336 bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
3337 	       struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
3338 {
3339 	/* Update the values in the super block */
3340 	super->super_blob = bs->super_blob;
3341 	memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype));
3342 	super->crc = blob_md_page_calc_crc(super);
3343 	bs_sequence_write_dev(seq, super, bs_page_to_lba(bs, 0),
3344 			      bs_byte_to_lba(bs, sizeof(*super)),
3345 			      cb_fn, cb_arg);
3346 }
3347 
3348 static void
3349 bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3350 {
3351 	struct spdk_bs_load_ctx	*ctx = arg;
3352 	uint64_t	mask_size, lba, lba_count;
3353 
3354 	/* Write out the used clusters mask */
3355 	mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
3356 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3357 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3358 	if (!ctx->mask) {
3359 		bs_load_ctx_fail(ctx, -ENOMEM);
3360 		return;
3361 	}
3362 
3363 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS;
3364 	ctx->mask->length = ctx->bs->total_clusters;
3365 	/* We could get here through the normal unload path, or through dirty
3366 	 * shutdown recovery.  For the normal unload path, we use the mask from
3367 	 * the bit pool.  For dirty shutdown recovery, we don't have a bit pool yet -
3368 	 * only the bit array from the load ctx.
3369 	 */
3370 	if (ctx->bs->used_clusters) {
3371 		assert(ctx->mask->length == spdk_bit_pool_capacity(ctx->bs->used_clusters));
3372 		spdk_bit_pool_store_mask(ctx->bs->used_clusters, ctx->mask->mask);
3373 	} else {
3374 		assert(ctx->mask->length == spdk_bit_array_capacity(ctx->used_clusters));
3375 		spdk_bit_array_store_mask(ctx->used_clusters, ctx->mask->mask);
3376 	}
3377 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
3378 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
3379 	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3380 }
3381 
3382 static void
3383 bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3384 {
3385 	struct spdk_bs_load_ctx	*ctx = arg;
3386 	uint64_t	mask_size, lba, lba_count;
3387 
3388 	mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
3389 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3390 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3391 	if (!ctx->mask) {
3392 		bs_load_ctx_fail(ctx, -ENOMEM);
3393 		return;
3394 	}
3395 
3396 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES;
3397 	ctx->mask->length = ctx->super->md_len;
3398 	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages));
3399 
3400 	spdk_bit_array_store_mask(ctx->bs->used_md_pages, ctx->mask->mask);
3401 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
3402 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
3403 	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3404 }
3405 
3406 static void
3407 bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3408 {
3409 	struct spdk_bs_load_ctx	*ctx = arg;
3410 	uint64_t	mask_size, lba, lba_count;
3411 
3412 	if (ctx->super->used_blobid_mask_len == 0) {
3413 		/*
3414 		 * This is a pre-v3 on-disk format where the blobid mask does not get
3415 		 *  written to disk.
3416 		 */
3417 		cb_fn(seq, arg, 0);
3418 		return;
3419 	}
3420 
3421 	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
3422 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
3423 				 SPDK_MALLOC_DMA);
3424 	if (!ctx->mask) {
3425 		bs_load_ctx_fail(ctx, -ENOMEM);
3426 		return;
3427 	}
3428 
3429 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS;
3430 	ctx->mask->length = ctx->super->md_len;
3431 	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids));
3432 
3433 	spdk_bit_array_store_mask(ctx->bs->used_blobids, ctx->mask->mask);
3434 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
3435 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
3436 	bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3437 }
3438 
3439 static void
3440 blob_set_thin_provision(struct spdk_blob *blob)
3441 {
3442 	blob_verify_md_op(blob);
3443 	blob->invalid_flags |= SPDK_BLOB_THIN_PROV;
3444 	blob->state = SPDK_BLOB_STATE_DIRTY;
3445 }
3446 
3447 static void
3448 blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method)
3449 {
3450 	blob_verify_md_op(blob);
3451 	blob->clear_method = clear_method;
3452 	blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT);
3453 	blob->state = SPDK_BLOB_STATE_DIRTY;
3454 }
3455 
3456 static void bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno);
3457 
3458 static void
3459 bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno)
3460 {
3461 	struct spdk_bs_load_ctx *ctx = cb_arg;
3462 	spdk_blob_id id;
3463 	int64_t page_num;
3464 
3465 	/* Iterate to the next blob (we can't use the spdk_bs_iter_next function as our
3466 	 * last blob has been removed) */
3467 	page_num = bs_blobid_to_page(ctx->blobid);
3468 	page_num++;
3469 	page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num);
3470 	if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) {
3471 		bs_load_iter(ctx, NULL, -ENOENT);
3472 		return;
3473 	}
3474 
3475 	id = bs_page_to_blobid(page_num);
3476 
3477 	spdk_bs_open_blob(ctx->bs, id, bs_load_iter, ctx);
3478 }
3479 
3480 static void
3481 bs_delete_corrupted_close_cb(void *cb_arg, int bserrno)
3482 {
3483 	struct spdk_bs_load_ctx *ctx = cb_arg;
3484 
3485 	if (bserrno != 0) {
3486 		SPDK_ERRLOG("Failed to close corrupted blob\n");
3487 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3488 		return;
3489 	}
3490 
3491 	spdk_bs_delete_blob(ctx->bs, ctx->blobid, bs_delete_corrupted_blob_cpl, ctx);
3492 }
3493 
3494 static void
3495 bs_delete_corrupted_blob(void *cb_arg, int bserrno)
3496 {
3497 	struct spdk_bs_load_ctx *ctx = cb_arg;
3498 	uint64_t i;
3499 
3500 	if (bserrno != 0) {
3501 		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
3502 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3503 		return;
3504 	}
3505 
3506 	/* Snapshot and clone have the same copy of the cluster map and extent pages
3507 	 * at this point. Clear both for the snapshot now,
3508 	 * so that they won't be cleared for the clone later when we remove the snapshot.
3509 	 * Also set thin provisioning to pass the data corruption check. */
3510 	for (i = 0; i < ctx->blob->active.num_clusters; i++) {
3511 		ctx->blob->active.clusters[i] = 0;
3512 	}
3513 	for (i = 0; i < ctx->blob->active.num_extent_pages; i++) {
3514 		ctx->blob->active.extent_pages[i] = 0;
3515 	}
3516 
3517 	ctx->blob->md_ro = false;
3518 
3519 	blob_set_thin_provision(ctx->blob);
3520 
3521 	ctx->blobid = ctx->blob->id;
3522 
3523 	spdk_blob_close(ctx->blob, bs_delete_corrupted_close_cb, ctx);
3524 }
3525 
3526 static void
3527 bs_update_corrupted_blob(void *cb_arg, int bserrno)
3528 {
3529 	struct spdk_bs_load_ctx *ctx = cb_arg;
3530 
3531 	if (bserrno != 0) {
3532 		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
3533 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3534 		return;
3535 	}
3536 
3537 	ctx->blob->md_ro = false;
3538 	blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true);
3539 	blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true);
3540 	spdk_blob_set_read_only(ctx->blob);
3541 
3542 	if (ctx->iter_cb_fn) {
3543 		ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0);
3544 	}
3545 	bs_blob_list_add(ctx->blob);
3546 
3547 	spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3548 }
3549 
3550 static void
3551 bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno)
3552 {
3553 	struct spdk_bs_load_ctx *ctx = cb_arg;
3554 
3555 	if (bserrno != 0) {
3556 		SPDK_ERRLOG("Failed to open clone of a corrupted blob\n");
3557 		spdk_bs_iter_next(ctx->bs, ctx->blob, bs_load_iter, ctx);
3558 		return;
3559 	}
3560 
3561 	if (blob->parent_id == ctx->blob->id) {
3562 		/* Power failure occurred before updating the clone (snapshot delete case)
3563 		 * or after updating the clone (snapshot create case) - keep the snapshot */
3564 		spdk_blob_close(blob, bs_update_corrupted_blob, ctx);
3565 	} else {
3566 		/* Power failure occurred after updating the clone (snapshot delete case)
3567 		 * or before updating the clone (snapshot create case) - remove the snapshot */
3568 		spdk_blob_close(blob, bs_delete_corrupted_blob, ctx);
3569 	}
3570 }
3571 
3572 static void
3573 bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno)
3574 {
3575 	struct spdk_bs_load_ctx *ctx = arg;
3576 	const void *value;
3577 	size_t len;
3578 	int rc = 0;
3579 
3580 	if (bserrno == 0) {
3581 		/* Examine the blob to see if it was left corrupted by a power failure.
3582 		 * Fix the ones that can be fixed and remove any other corrupted
3583 		 * ones. If it is not corrupted, just process it. */
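		/* To summarize the checks that follow: if neither the SNAPSHOT_PENDING_REMOVAL
		 * nor the SNAPSHOT_IN_PROGRESS internal xattr is present, the blob is healthy
		 * and is simply handed to the iteration callback. If either xattr is present,
		 * its value is the blob id of the clone involved in the interrupted snapshot
		 * operation; bs_examine_clone() opens that clone and, depending on whether the
		 * clone already points at this blob as its parent, the snapshot is either kept
		 * and cleaned up (bs_update_corrupted_blob) or deleted (bs_delete_corrupted_blob). */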
3584 		rc = blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true);
3585 		if (rc != 0) {
3586 			rc = blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true);
3587 			if (rc != 0) {
3588 				/* Not corrupted - process it and continue with iterating through blobs */
3589 				if (ctx->iter_cb_fn) {
3590 					ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0);
3591 				}
3592 				bs_blob_list_add(blob);
3593 				spdk_bs_iter_next(ctx->bs, blob, bs_load_iter, ctx);
3594 				return;
3595 			}
3596 
3597 		}
3598 
3599 		assert(len == sizeof(spdk_blob_id));
3600 
3601 		ctx->blob = blob;
3602 
3603 		/* Open the clone to check whether we can fix this blob or whether it should be removed */
3604 		spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, bs_examine_clone, ctx);
3605 		return;
3606 	} else if (bserrno == -ENOENT) {
3607 		bserrno = 0;
3608 	} else {
3609 		/*
3610 		 * This case needs to be looked at further.  Same problem
3611 		 *  exists with applications that rely on explicit blob
3612 		 *  iteration.  We should just skip the blob that failed
3613 		 *  to load and continue on to the next one.
3614 		 */
3615 		SPDK_ERRLOG("Error in iterating blobs\n");
3616 	}
3617 
3618 	ctx->iter_cb_fn = NULL;
3619 
3620 	spdk_free(ctx->super);
3621 	spdk_free(ctx->mask);
3622 	bs_sequence_finish(ctx->seq, bserrno);
3623 	free(ctx);
3624 }
3625 
3626 static void
3627 bs_load_complete(struct spdk_bs_load_ctx *ctx)
3628 {
3629 	ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters);
3630 	spdk_bs_iter_first(ctx->bs, bs_load_iter, ctx);
3631 }
3632 
3633 static void
3634 bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3635 {
3636 	struct spdk_bs_load_ctx *ctx = cb_arg;
3637 	int rc;
3638 
3639 	/* The type must be correct */
3640 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS);
3641 
3642 	/* The length of the mask (in bits) must not be greater than
3643 	 * the length of the buffer (converted to bits) */
3644 	assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8));
3645 
3646 	/* The length of the mask must be exactly equal to the size
3647 	 * (in pages) of the metadata region */
3648 	assert(ctx->mask->length == ctx->super->md_len);
3649 
3650 	rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->mask->length);
3651 	if (rc < 0) {
3652 		spdk_free(ctx->mask);
3653 		bs_load_ctx_fail(ctx, rc);
3654 		return;
3655 	}
3656 
3657 	spdk_bit_array_load_mask(ctx->bs->used_blobids, ctx->mask->mask);
3658 	bs_load_complete(ctx);
3659 }
3660 
3661 static void
3662 bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3663 {
3664 	struct spdk_bs_load_ctx *ctx = cb_arg;
3665 	uint64_t		lba, lba_count, mask_size;
3666 	int			rc;
3667 
3668 	if (bserrno != 0) {
3669 		bs_load_ctx_fail(ctx, bserrno);
3670 		return;
3671 	}
3672 
3673 	/* The type must be correct */
3674 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
3675 	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
3676 	assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
3677 					     struct spdk_blob_md_page) * 8));
3678 	/* The length of the mask must be exactly equal to the total number of clusters */
3679 	assert(ctx->mask->length == ctx->bs->total_clusters);
3680 
3681 	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->mask->length);
3682 	if (rc < 0) {
3683 		spdk_free(ctx->mask);
3684 		bs_load_ctx_fail(ctx, rc);
3685 		return;
3686 	}
3687 
3688 	spdk_bit_array_load_mask(ctx->used_clusters, ctx->mask->mask);
3689 	ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->used_clusters);
3690 	assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters);
3691 
3692 	spdk_free(ctx->mask);
3693 
3694 	/* Read the used blobids mask */
3695 	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
3696 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
3697 				 SPDK_MALLOC_DMA);
3698 	if (!ctx->mask) {
3699 		bs_load_ctx_fail(ctx, -ENOMEM);
3700 		return;
3701 	}
3702 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
3703 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
3704 	bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
3705 			     bs_load_used_blobids_cpl, ctx);
3706 }
3707 
3708 static void
3709 bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3710 {
3711 	struct spdk_bs_load_ctx *ctx = cb_arg;
3712 	uint64_t		lba, lba_count, mask_size;
3713 	int			rc;
3714 
3715 	if (bserrno != 0) {
3716 		bs_load_ctx_fail(ctx, bserrno);
3717 		return;
3718 	}
3719 
3720 	/* The type must be correct */
3721 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES);
3722 	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
3723 	assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE *
3724 				     8));
3725 	/* The length of the mask must be exactly equal to the size (in pages) of the metadata region */
3726 	if (ctx->mask->length != ctx->super->md_len) {
3727 		SPDK_ERRLOG("mismatched md_len in used_pages mask: "
3728 			    "mask->length=%" PRIu32 " super->md_len=%" PRIu32 "\n",
3729 			    ctx->mask->length, ctx->super->md_len);
3730 		assert(false);
3731 	}
3732 
3733 	rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->mask->length);
3734 	if (rc < 0) {
3735 		spdk_free(ctx->mask);
3736 		bs_load_ctx_fail(ctx, rc);
3737 		return;
3738 	}
3739 
3740 	spdk_bit_array_load_mask(ctx->bs->used_md_pages, ctx->mask->mask);
3741 	spdk_free(ctx->mask);
3742 
3743 	/* Read the used clusters mask */
3744 	mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
3745 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
3746 				 SPDK_MALLOC_DMA);
3747 	if (!ctx->mask) {
3748 		bs_load_ctx_fail(ctx, -ENOMEM);
3749 		return;
3750 	}
3751 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
3752 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
3753 	bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
3754 			     bs_load_used_clusters_cpl, ctx);
3755 }
3756 
3757 static void
3758 bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx)
3759 {
3760 	uint64_t lba, lba_count, mask_size;
3761 
3762 	/* Read the used pages mask */
3763 	mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
3764 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3765 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3766 	if (!ctx->mask) {
3767 		bs_load_ctx_fail(ctx, -ENOMEM);
3768 		return;
3769 	}
3770 
3771 	lba = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
3772 	lba_count = bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
3773 	bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
3774 			     bs_load_used_pages_cpl, ctx);
3775 }
3776 
3777 static int
3778 bs_load_replay_md_parse_page(struct spdk_bs_load_ctx *ctx, struct spdk_blob_md_page *page)
3779 {
3780 	struct spdk_blob_store *bs = ctx->bs;
3781 	struct spdk_blob_md_descriptor *desc;
3782 	size_t	cur_desc = 0;
3783 
3784 	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
3785 	while (cur_desc < sizeof(page->descriptors)) {
3786 		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
3787 			if (desc->length == 0) {
3788 				/* If padding and length are 0, this terminates the page */
3789 				break;
3790 			}
3791 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
3792 			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
3793 			unsigned int				i, j;
3794 			unsigned int				cluster_count = 0;
3795 			uint32_t				cluster_idx;
3796 
3797 			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
3798 
3799 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
3800 				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
3801 					cluster_idx = desc_extent_rle->extents[i].cluster_idx;
3802 					/*
3803 					 * cluster_idx = 0 means an unallocated cluster - don't mark that
3804 					 * in the used cluster map.
3805 					 */
3806 					if (cluster_idx != 0) {
3807 						spdk_bit_array_set(ctx->used_clusters, cluster_idx + j);
3808 						if (bs->num_free_clusters == 0) {
3809 							return -ENOSPC;
3810 						}
3811 						bs->num_free_clusters--;
3812 					}
3813 					cluster_count++;
3814 				}
3815 			}
3816 			if (cluster_count == 0) {
3817 				return -EINVAL;
3818 			}
3819 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
3820 			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
3821 			uint32_t					i;
3822 			uint32_t					cluster_count = 0;
3823 			uint32_t					cluster_idx;
3824 			size_t						cluster_idx_length;
3825 
3826 			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
3827 			cluster_idx_length = desc_extent->length - sizeof(desc_extent->start_cluster_idx);
3828 
3829 			if (desc_extent->length <= sizeof(desc_extent->start_cluster_idx) ||
3830 			    (cluster_idx_length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
3831 				return -EINVAL;
3832 			}
3833 
3834 			for (i = 0; i < cluster_idx_length / sizeof(desc_extent->cluster_idx[0]); i++) {
3835 				cluster_idx = desc_extent->cluster_idx[i];
3836 				/*
3837 				 * cluster_idx = 0 means an unallocated cluster - don't mark that
3838 				 * in the used cluster map.
3839 				 */
3840 				if (cluster_idx != 0) {
3841 					if (cluster_idx < desc_extent->start_cluster_idx &&
3842 					    cluster_idx >= desc_extent->start_cluster_idx + cluster_count) {
3843 						return -EINVAL;
3844 					}
3845 					spdk_bit_array_set(ctx->used_clusters, cluster_idx);
3846 					if (bs->num_free_clusters == 0) {
3847 						return -ENOSPC;
3848 					}
3849 					bs->num_free_clusters--;
3850 				}
3851 				cluster_count++;
3852 			}
3853 
3854 			if (cluster_count == 0) {
3855 				return -EINVAL;
3856 			}
3857 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
3858 			/* Skip this item */
3859 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
3860 			/* Skip this item */
3861 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
3862 			/* Skip this item */
3863 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
3864 			struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
3865 			uint32_t num_extent_pages = ctx->num_extent_pages;
3866 			uint32_t i;
3867 			size_t extent_pages_length;
3868 			void *tmp;
3869 
3870 			desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
3871 			extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);
3872 
3873 			if (desc_extent_table->length == 0 ||
3874 			    (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
3875 				return -EINVAL;
3876 			}
3877 
3878 			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
3879 				if (desc_extent_table->extent_page[i].page_idx != 0) {
3880 					if (desc_extent_table->extent_page[i].num_pages != 1) {
3881 						return -EINVAL;
3882 					}
3883 					num_extent_pages += 1;
3884 				}
3885 			}
3886 
3887 			if (num_extent_pages > 0) {
3888 				tmp = realloc(ctx->extent_page_num, num_extent_pages * sizeof(uint32_t));
3889 				if (tmp == NULL) {
3890 					return -ENOMEM;
3891 				}
3892 				ctx->extent_page_num = tmp;
3893 
3894 				/* Extent table entries contain md page numbers for extent pages.
3895 				 * Zeroes represent unallocated extent pages; those are run-length-encoded.
3896 				 */
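				/* For illustration (these numbers are invented, not taken from any
				 * particular on-disk image): entries of
				 *   { page_idx = 12, num_pages = 1 },
				 *   { page_idx = 0,  num_pages = 3 },
				 *   { page_idx = 47, num_pages = 1 }
				 * describe five extent pages - the first in md page 12, the next
				 * three unallocated (one run-length-encoded entry), and the fifth
				 * in md page 47. Only the non-zero page indexes are collected into
				 * ctx->extent_page_num here. */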
3897 				for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
3898 					if (desc_extent_table->extent_page[i].page_idx != 0) {
3899 						ctx->extent_page_num[ctx->num_extent_pages] = desc_extent_table->extent_page[i].page_idx;
3900 						ctx->num_extent_pages += 1;
3901 					}
3902 				}
3903 			}
3904 		} else {
3905 			/* Error */
3906 			return -EINVAL;
3907 		}
3908 		/* Advance to the next descriptor */
3909 		cur_desc += sizeof(*desc) + desc->length;
3910 		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
3911 			break;
3912 		}
3913 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
3914 	}
3915 	return 0;
3916 }
3917 
3918 static bool bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page)
3919 {
3920 	uint32_t crc;
3921 	struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors;
3922 	size_t desc_len;
3923 
3924 	crc = blob_md_page_calc_crc(page);
3925 	if (crc != page->crc) {
3926 		return false;
3927 	}
3928 
3929 	/* An extent page should always have sequence_num 0. */
3930 	if (page->sequence_num != 0) {
3931 		return false;
3932 	}
3933 
3934 	/* Descriptor type must be EXTENT_PAGE. */
3935 	if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
3936 		return false;
3937 	}
3938 
3939 	/* Descriptor length cannot exceed the page. */
3940 	desc_len = sizeof(*desc) + desc->length;
3941 	if (desc_len > sizeof(page->descriptors)) {
3942 		return false;
3943 	}
3944 
3945 	/* It has to be the only descriptor in the page. */
3946 	if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) {
3947 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len);
3948 		if (desc->length != 0) {
3949 			return false;
3950 		}
3951 	}
3952 
3953 	return true;
3954 }
3955 
3956 static bool bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx)
3957 {
3958 	uint32_t crc;
3959 	struct spdk_blob_md_page *page = ctx->page;
3960 
3961 	crc = blob_md_page_calc_crc(page);
3962 	if (crc != page->crc) {
3963 		return false;
3964 	}
3965 
3966 	/* First page of a sequence should match the blobid. */
3967 	if (page->sequence_num == 0 &&
3968 	    bs_page_to_blobid(ctx->cur_page) != page->id) {
3969 		return false;
3970 	}
3971 	assert(bs_load_cur_extent_page_valid(page) == false);
3972 
3973 	return true;
3974 }
3975 
3976 static void
3977 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx);
3978 
3979 static void
3980 bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3981 {
3982 	struct spdk_bs_load_ctx	*ctx = cb_arg;
3983 
3984 	if (bserrno != 0) {
3985 		bs_load_ctx_fail(ctx, bserrno);
3986 		return;
3987 	}
3988 
3989 	bs_load_complete(ctx);
3990 }
3991 
3992 static void
3993 bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3994 {
3995 	struct spdk_bs_load_ctx	*ctx = cb_arg;
3996 
3997 	spdk_free(ctx->mask);
3998 	ctx->mask = NULL;
3999 
4000 	if (bserrno != 0) {
4001 		bs_load_ctx_fail(ctx, bserrno);
4002 		return;
4003 	}
4004 
4005 	bs_write_used_clusters(seq, ctx, bs_load_write_used_clusters_cpl);
4006 }
4007 
4008 static void
4009 bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4010 {
4011 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4012 
4013 	spdk_free(ctx->mask);
4014 	ctx->mask = NULL;
4015 
4016 	if (bserrno != 0) {
4017 		bs_load_ctx_fail(ctx, bserrno);
4018 		return;
4019 	}
4020 
4021 	bs_write_used_blobids(seq, ctx, bs_load_write_used_blobids_cpl);
4022 }
4023 
4024 static void
4025 bs_load_write_used_md(struct spdk_bs_load_ctx *ctx)
4026 {
4027 	bs_write_used_md(ctx->seq, ctx, bs_load_write_used_pages_cpl);
4028 }
4029 
4030 static void
4031 bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx)
4032 {
4033 	uint64_t num_md_clusters;
4034 	uint64_t i;
4035 
4036 	ctx->in_page_chain = false;
4037 
4038 	do {
4039 		ctx->page_index++;
4040 	} while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true);
4041 
4042 	if (ctx->page_index < ctx->super->md_len) {
4043 		ctx->cur_page = ctx->page_index;
4044 		bs_load_replay_cur_md_page(ctx);
4045 	} else {
4046 		/* Claim all of the clusters used by the metadata */
4047 		num_md_clusters = spdk_divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster);
4048 		for (i = 0; i < num_md_clusters; i++) {
4049 			spdk_bit_array_set(ctx->used_clusters, i);
4050 		}
4051 		ctx->bs->num_free_clusters -= num_md_clusters;
4052 		spdk_free(ctx->page);
4053 		bs_load_write_used_md(ctx);
4054 	}
4055 }
4056 
4057 static void
4058 bs_load_replay_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4059 {
4060 	struct spdk_bs_load_ctx *ctx = cb_arg;
4061 	uint32_t page_num;
4062 	uint64_t i;
4063 
4064 	if (bserrno != 0) {
4065 		spdk_free(ctx->extent_pages);
4066 		bs_load_ctx_fail(ctx, bserrno);
4067 		return;
4068 	}
4069 
4070 	for (i = 0; i < ctx->num_extent_pages; i++) {
4071 		/* Extent pages are only read when referenced from in-chain metadata.
4072 		 * The metadata is not intact if such a page is not a valid extent page. */
4073 		if (bs_load_cur_extent_page_valid(&ctx->extent_pages[i]) != true) {
4074 			spdk_free(ctx->extent_pages);
4075 			bs_load_ctx_fail(ctx, -EILSEQ);
4076 			return;
4077 		}
4078 
4079 		page_num = ctx->extent_page_num[i];
4080 		spdk_bit_array_set(ctx->bs->used_md_pages, page_num);
4081 		if (bs_load_replay_md_parse_page(ctx, &ctx->extent_pages[i])) {
4082 			spdk_free(ctx->extent_pages);
4083 			bs_load_ctx_fail(ctx, -EILSEQ);
4084 			return;
4085 		}
4086 	}
4087 
4088 	spdk_free(ctx->extent_pages);
4089 	free(ctx->extent_page_num);
4090 	ctx->extent_page_num = NULL;
4091 	ctx->num_extent_pages = 0;
4092 
4093 	bs_load_replay_md_chain_cpl(ctx);
4094 }
4095 
4096 static void
4097 bs_load_replay_extent_pages(struct spdk_bs_load_ctx *ctx)
4098 {
4099 	spdk_bs_batch_t *batch;
4100 	uint32_t page;
4101 	uint64_t lba;
4102 	uint64_t i;
4103 
4104 	ctx->extent_pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE * ctx->num_extent_pages, 0,
4105 					 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
4106 	if (!ctx->extent_pages) {
4107 		bs_load_ctx_fail(ctx, -ENOMEM);
4108 		return;
4109 	}
4110 
4111 	batch = bs_sequence_to_batch(ctx->seq, bs_load_replay_extent_page_cpl, ctx);
4112 
4113 	for (i = 0; i < ctx->num_extent_pages; i++) {
4114 		page = ctx->extent_page_num[i];
4115 		assert(page < ctx->super->md_len);
4116 		lba = bs_md_page_to_lba(ctx->bs, page);
4117 		bs_batch_read_dev(batch, &ctx->extent_pages[i], lba,
4118 				  bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE));
4119 	}
4120 
4121 	bs_batch_close(batch);
4122 }
4123 
4124 static void
4125 bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4126 {
4127 	struct spdk_bs_load_ctx *ctx = cb_arg;
4128 	uint32_t page_num;
4129 	struct spdk_blob_md_page *page;
4130 
4131 	if (bserrno != 0) {
4132 		bs_load_ctx_fail(ctx, bserrno);
4133 		return;
4134 	}
4135 
4136 	page_num = ctx->cur_page;
4137 	page = ctx->page;
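	/* Replay walk, in brief: a page is only consumed if it passes the checks in
	 * bs_load_cur_md_page_valid() (CRC and, for sequence_num 0, a blobid that
	 * matches its position). Pages with sequence_num 0 start a blob, so they are
	 * also marked in used_blobids; pages reached via page->next continue that
	 * blob's metadata chain (in_page_chain lets them be claimed even though their
	 * sequence_num is non-zero). Once a chain ends, any extent pages it referenced
	 * are read and parsed, and bs_load_replay_md_chain_cpl() then advances the
	 * scan to the next md page that has not been claimed yet. */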
4138 	if (bs_load_cur_md_page_valid(ctx) == true) {
4139 		if (page->sequence_num == 0 || ctx->in_page_chain == true) {
4140 			bs_claim_md_page(ctx->bs, page_num);
4141 			if (page->sequence_num == 0) {
4142 				spdk_bit_array_set(ctx->bs->used_blobids, page_num);
4143 			}
4144 			if (bs_load_replay_md_parse_page(ctx, page)) {
4145 				bs_load_ctx_fail(ctx, -EILSEQ);
4146 				return;
4147 			}
4148 			if (page->next != SPDK_INVALID_MD_PAGE) {
4149 				ctx->in_page_chain = true;
4150 				ctx->cur_page = page->next;
4151 				bs_load_replay_cur_md_page(ctx);
4152 				return;
4153 			}
4154 			if (ctx->num_extent_pages != 0) {
4155 				bs_load_replay_extent_pages(ctx);
4156 				return;
4157 			}
4158 		}
4159 	}
4160 	bs_load_replay_md_chain_cpl(ctx);
4161 }
4162 
4163 static void
4164 bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx)
4165 {
4166 	uint64_t lba;
4167 
4168 	assert(ctx->cur_page < ctx->super->md_len);
4169 	lba = bs_md_page_to_lba(ctx->bs, ctx->cur_page);
4170 	bs_sequence_read_dev(ctx->seq, ctx->page, lba,
4171 			     bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
4172 			     bs_load_replay_md_cpl, ctx);
4173 }
4174 
4175 static void
4176 bs_load_replay_md(struct spdk_bs_load_ctx *ctx)
4177 {
4178 	ctx->page_index = 0;
4179 	ctx->cur_page = 0;
4180 	ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
4181 				 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
4182 	if (!ctx->page) {
4183 		bs_load_ctx_fail(ctx, -ENOMEM);
4184 		return;
4185 	}
4186 	bs_load_replay_cur_md_page(ctx);
4187 }
4188 
4189 static void
4190 bs_recover(struct spdk_bs_load_ctx *ctx)
4191 {
4192 	int		rc;
4193 
4194 	rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len);
4195 	if (rc < 0) {
4196 		bs_load_ctx_fail(ctx, -ENOMEM);
4197 		return;
4198 	}
4199 
4200 	rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len);
4201 	if (rc < 0) {
4202 		bs_load_ctx_fail(ctx, -ENOMEM);
4203 		return;
4204 	}
4205 
4206 	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
4207 	if (rc < 0) {
4208 		bs_load_ctx_fail(ctx, -ENOMEM);
4209 		return;
4210 	}
4211 
4212 	rc = spdk_bit_array_resize(&ctx->bs->open_blobids, ctx->super->md_len);
4213 	if (rc < 0) {
4214 		bs_load_ctx_fail(ctx, -ENOMEM);
4215 		return;
4216 	}
4217 
4218 	ctx->bs->num_free_clusters = ctx->bs->total_clusters;
4219 	bs_load_replay_md(ctx);
4220 }
4221 
4222 static void
4223 bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4224 {
4225 	struct spdk_bs_load_ctx *ctx = cb_arg;
4226 	uint32_t	crc;
4227 	int		rc;
4228 	static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
4229 
4230 	if (ctx->super->version > SPDK_BS_VERSION ||
4231 	    ctx->super->version < SPDK_BS_INITIAL_VERSION) {
4232 		bs_load_ctx_fail(ctx, -EILSEQ);
4233 		return;
4234 	}
4235 
4236 	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
4237 		   sizeof(ctx->super->signature)) != 0) {
4238 		bs_load_ctx_fail(ctx, -EILSEQ);
4239 		return;
4240 	}
4241 
4242 	crc = blob_md_page_calc_crc(ctx->super);
4243 	if (crc != ctx->super->crc) {
4244 		bs_load_ctx_fail(ctx, -EILSEQ);
4245 		return;
4246 	}
4247 
4248 	if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
4249 		SPDK_DEBUGLOG(blob, "Bstype matched - loading blobstore\n");
4250 	} else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
4251 		SPDK_DEBUGLOG(blob, "Bstype wildcard used - loading blobstore regardless of bstype\n");
4252 	} else {
4253 		SPDK_DEBUGLOG(blob, "Unexpected bstype\n");
4254 		SPDK_LOGDUMP(blob, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
4255 		SPDK_LOGDUMP(blob, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
4256 		bs_load_ctx_fail(ctx, -ENXIO);
4257 		return;
4258 	}
4259 
4260 	if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
4261 		SPDK_NOTICELOG("Size mismatch, dev size: %" PRIu64 ", blobstore size: %" PRIu64 "\n",
4262 			       ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
4263 		bs_load_ctx_fail(ctx, -EILSEQ);
4264 		return;
4265 	}
4266 
4267 	if (ctx->super->size == 0) {
4268 		ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
4269 	}
4270 
4271 	if (ctx->super->io_unit_size == 0) {
4272 		ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
4273 	}
4274 
4275 	/* Parse the super block */
4276 	ctx->bs->clean = 1;
4277 	ctx->bs->cluster_sz = ctx->super->cluster_size;
4278 	ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
4279 	ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE;
4280 	if (spdk_u32_is_pow2(ctx->bs->pages_per_cluster)) {
4281 		ctx->bs->pages_per_cluster_shift = spdk_u32log2(ctx->bs->pages_per_cluster);
4282 	}
4283 	ctx->bs->io_unit_size = ctx->super->io_unit_size;
4284 	rc = spdk_bit_array_resize(&ctx->used_clusters, ctx->bs->total_clusters);
4285 	if (rc < 0) {
4286 		bs_load_ctx_fail(ctx, -ENOMEM);
4287 		return;
4288 	}
4289 	ctx->bs->md_start = ctx->super->md_start;
4290 	ctx->bs->md_len = ctx->super->md_len;
4291 	ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
4292 					       ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
4293 	ctx->bs->super_blob = ctx->super->super_blob;
4294 	memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
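	/* A worked example with assumed values (nothing here requires them): a 4 GiB
	 * device with a 1 MiB cluster_size gives total_clusters = 4096 and, with the
	 * 4 KiB SPDK_BS_PAGE_SIZE, pages_per_cluster = 256 (a power of two, so
	 * pages_per_cluster_shift = 8). With md_start = 4 and md_len = 4096 the
	 * metadata region ends at page 4100, which spans ceil(4100 / 256) = 17
	 * clusters, so total_data_clusters = 4096 - 17 = 4079. */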
4295 
4296 	if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) {
4297 		bs_recover(ctx);
4298 	} else {
4299 		bs_load_read_used_pages(ctx);
4300 	}
4301 }
4302 
4303 static int
4304 bs_opts_copy(struct spdk_bs_opts *src, struct spdk_bs_opts *dst)
4305 {
4306 
4307 	if (!src->opts_size) {
4308 		SPDK_ERRLOG("opts_size should not be zero value\n");
4309 		return -1;
4310 	}
4311 
4312 #define FIELD_OK(field) \
4313         offsetof(struct spdk_bs_opts, field) + sizeof(src->field) <= src->opts_size
4314 
4315 #define SET_FIELD(field) \
4316         if (FIELD_OK(field)) { \
4317                 dst->field = src->field; \
4318         } \
4319 
4320 	SET_FIELD(cluster_sz);
4321 	SET_FIELD(num_md_pages);
4322 	SET_FIELD(max_md_ops);
4323 	SET_FIELD(max_channel_ops);
4324 	SET_FIELD(clear_method);
4325 
4326 	if (FIELD_OK(bstype)) {
4327 		memcpy(&dst->bstype, &src->bstype, sizeof(dst->bstype));
4328 	}
4329 	SET_FIELD(iter_cb_fn);
4330 	SET_FIELD(iter_cb_arg);
4331 
4332 	dst->opts_size = src->opts_size;
4333 
4334 	/* Do not remove this statement. If you add a new field, update the assert
4335 	 * below and add a corresponding SET_FIELD statement above. */
4336 	SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_opts) == 64, "Incorrect size");
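	/* For illustration only (new_option is a hypothetical field, not part of
	 * spdk_bs_opts): extending the options would look like appending the field to
	 * the struct, adding
	 *
	 *	SET_FIELD(new_option);
	 *
	 * above, and bumping the size expected by the SPDK_STATIC_ASSERT so the build
	 * fails if the copy logic and the struct definition ever drift apart. */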
4337 
4338 #undef FIELD_OK
4339 #undef SET_FIELD
4340 
4341 	return 0;
4342 }
4343 
4344 void
4345 spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
4346 	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
4347 {
4348 	struct spdk_blob_store	*bs;
4349 	struct spdk_bs_cpl	cpl;
4350 	struct spdk_bs_load_ctx *ctx;
4351 	struct spdk_bs_opts	opts = {};
4352 	int err;
4353 
4354 	SPDK_DEBUGLOG(blob, "Loading blobstore from dev %p\n", dev);
4355 
4356 	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
4357 		SPDK_DEBUGLOG(blob, "unsupported dev block length of %d\n", dev->blocklen);
4358 		dev->destroy(dev);
4359 		cb_fn(cb_arg, NULL, -EINVAL);
4360 		return;
4361 	}
4362 
4363 	spdk_bs_opts_init(&opts, sizeof(opts));
4364 	if (o) {
4365 		if (bs_opts_copy(o, &opts)) {
4366 			return;
4367 		}
4368 	}
4369 
4370 	if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
4371 		dev->destroy(dev);
4372 		cb_fn(cb_arg, NULL, -EINVAL);
4373 		return;
4374 	}
4375 
4376 	err = bs_alloc(dev, &opts, &bs, &ctx);
4377 	if (err) {
4378 		dev->destroy(dev);
4379 		cb_fn(cb_arg, NULL, err);
4380 		return;
4381 	}
4382 
4383 	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
4384 	cpl.u.bs_handle.cb_fn = cb_fn;
4385 	cpl.u.bs_handle.cb_arg = cb_arg;
4386 	cpl.u.bs_handle.bs = bs;
4387 
4388 	ctx->seq = bs_sequence_start(bs->md_channel, &cpl);
4389 	if (!ctx->seq) {
4390 		spdk_free(ctx->super);
4391 		free(ctx);
4392 		bs_free(bs);
4393 		cb_fn(cb_arg, NULL, -ENOMEM);
4394 		return;
4395 	}
4396 
4397 	/* Read the super block */
4398 	bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
4399 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
4400 			     bs_load_super_cpl, ctx);
4401 }
4402 
4403 /* END spdk_bs_load */
4404 
4405 /* START spdk_bs_dump */
4406 
4407 static void
4408 bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno)
4409 {
4410 	spdk_free(ctx->super);
4411 
4412 	/*
4413 	 * We need to defer calling bs_call_cpl() until after
4414 	 * dev destruction, so tuck these away for later use.
4415 	 */
4416 	ctx->bs->unload_err = bserrno;
4417 	memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
4418 	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
4419 
4420 	bs_sequence_finish(seq, 0);
4421 	bs_free(ctx->bs);
4422 	free(ctx);
4423 }
4424 
4425 static void bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg);
4426 
4427 static void
4428 bs_dump_print_md_page(struct spdk_bs_load_ctx *ctx)
4429 {
4430 	uint32_t page_idx = ctx->cur_page;
4431 	struct spdk_blob_md_page *page = ctx->page;
4432 	struct spdk_blob_md_descriptor *desc;
4433 	size_t cur_desc = 0;
4434 	uint32_t crc;
4435 
4436 	fprintf(ctx->fp, "=========\n");
4437 	fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx);
4438 	fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id);
4439 
4440 	crc = blob_md_page_calc_crc(page);
4441 	fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch");
4442 
4443 	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
4444 	while (cur_desc < sizeof(page->descriptors)) {
4445 		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
4446 			if (desc->length == 0) {
4447 				/* If padding and length are 0, this terminates the page */
4448 				break;
4449 			}
4450 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
4451 			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
4452 			unsigned int				i;
4453 
4454 			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
4455 
4456 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
4457 				if (desc_extent_rle->extents[i].cluster_idx != 0) {
4458 					fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
4459 						desc_extent_rle->extents[i].cluster_idx);
4460 				} else {
4461 					fprintf(ctx->fp, "Unallocated Extent - ");
4462 				}
4463 				fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length);
4464 				fprintf(ctx->fp, "\n");
4465 			}
4466 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
4467 			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
4468 			unsigned int					i;
4469 
4470 			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
4471 
4472 			for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) {
4473 				if (desc_extent->cluster_idx[i] != 0) {
4474 					fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
4475 						desc_extent->cluster_idx[i]);
4476 				} else {
4477 					fprintf(ctx->fp, "Unallocated Extent");
4478 				}
4479 				fprintf(ctx->fp, "\n");
4480 			}
4481 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
4482 			struct spdk_blob_md_descriptor_xattr *desc_xattr;
4483 			uint32_t i;
4484 
4485 			desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc;
4486 
4487 			if (desc_xattr->length !=
4488 			    sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) +
4489 			    desc_xattr->name_length + desc_xattr->value_length) {
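				/* A mismatch between the descriptor length and the
				 * name/value sizes is detected here but not reported;
				 * the xattr is still dumped below as-is. */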
4490 			}
4491 
4492 			memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length);
4493 			ctx->xattr_name[desc_xattr->name_length] = '\0';
4494 			fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name);
4495 			fprintf(ctx->fp, "       value = \"");
4496 			ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name,
4497 					    (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
4498 					    desc_xattr->value_length);
4499 			fprintf(ctx->fp, "\"\n");
4500 			for (i = 0; i < desc_xattr->value_length; i++) {
4501 				if (i % 16 == 0) {
4502 					fprintf(ctx->fp, "               ");
4503 				}
4504 				fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i));
4505 				if ((i + 1) % 16 == 0) {
4506 					fprintf(ctx->fp, "\n");
4507 				}
4508 			}
4509 			if (i % 16 != 0) {
4510 				fprintf(ctx->fp, "\n");
4511 			}
4512 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
4513 			/* TODO */
4514 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
4515 			/* TODO */
4516 		} else {
4517 			/* Error */
4518 		}
4519 		/* Advance to the next descriptor */
4520 		cur_desc += sizeof(*desc) + desc->length;
4521 		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
4522 			break;
4523 		}
4524 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
4525 	}
4526 }
4527 
4528 static void
4529 bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4530 {
4531 	struct spdk_bs_load_ctx *ctx = cb_arg;
4532 
4533 	if (bserrno != 0) {
4534 		bs_dump_finish(seq, ctx, bserrno);
4535 		return;
4536 	}
4537 
4538 	if (ctx->page->id != 0) {
4539 		bs_dump_print_md_page(ctx);
4540 	}
4541 
4542 	ctx->cur_page++;
4543 
4544 	if (ctx->cur_page < ctx->super->md_len) {
4545 		bs_dump_read_md_page(seq, ctx);
4546 	} else {
4547 		spdk_free(ctx->page);
4548 		bs_dump_finish(seq, ctx, 0);
4549 	}
4550 }
4551 
4552 static void
4553 bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg)
4554 {
4555 	struct spdk_bs_load_ctx *ctx = cb_arg;
4556 	uint64_t lba;
4557 
4558 	assert(ctx->cur_page < ctx->super->md_len);
4559 	lba = bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page);
4560 	bs_sequence_read_dev(seq, ctx->page, lba,
4561 			     bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
4562 			     bs_dump_read_md_page_cpl, ctx);
4563 }
4564 
4565 static void
4566 bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4567 {
4568 	struct spdk_bs_load_ctx *ctx = cb_arg;
4569 
4570 	fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature);
4571 	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
4572 		   sizeof(ctx->super->signature)) != 0) {
4573 		fprintf(ctx->fp, "(Mismatch)\n");
4574 		bs_dump_finish(seq, ctx, bserrno);
4575 		return;
4576 	} else {
4577 		fprintf(ctx->fp, "(OK)\n");
4578 	}
4579 	fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version);
4580 	fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc,
4581 		(ctx->super->crc == blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch");
4582 	fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype);
4583 	fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size);
4584 	fprintf(ctx->fp, "Super Blob ID: ");
4585 	if (ctx->super->super_blob == SPDK_BLOBID_INVALID) {
4586 		fprintf(ctx->fp, "(None)\n");
4587 	} else {
4588 		fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob);
4589 	}
4590 	fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean);
4591 	fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start);
4592 	fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len);
4593 	fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start);
4594 	fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len);
4595 	fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start);
4596 	fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len);
4597 	fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start);
4598 	fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len);
4599 
4600 	ctx->cur_page = 0;
4601 	ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, 0,
4602 				 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
4603 	if (!ctx->page) {
4604 		bs_dump_finish(seq, ctx, -ENOMEM);
4605 		return;
4606 	}
4607 	bs_dump_read_md_page(seq, ctx);
4608 }
4609 
4610 void
4611 spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn,
4612 	     spdk_bs_op_complete cb_fn, void *cb_arg)
4613 {
4614 	struct spdk_blob_store	*bs;
4615 	struct spdk_bs_cpl	cpl;
4616 	spdk_bs_sequence_t	*seq;
4617 	struct spdk_bs_load_ctx *ctx;
4618 	struct spdk_bs_opts	opts = {};
4619 	int err;
4620 
4621 	SPDK_DEBUGLOG(blob, "Dumping blobstore from dev %p\n", dev);
4622 
4623 	spdk_bs_opts_init(&opts, sizeof(opts));
4624 
4625 	err = bs_alloc(dev, &opts, &bs, &ctx);
4626 	if (err) {
4627 		dev->destroy(dev);
4628 		cb_fn(cb_arg, err);
4629 		return;
4630 	}
4631 
4632 	ctx->fp = fp;
4633 	ctx->print_xattr_fn = print_xattr_fn;
4634 
4635 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
4636 	cpl.u.bs_basic.cb_fn = cb_fn;
4637 	cpl.u.bs_basic.cb_arg = cb_arg;
4638 
4639 	seq = bs_sequence_start(bs->md_channel, &cpl);
4640 	if (!seq) {
4641 		spdk_free(ctx->super);
4642 		free(ctx);
4643 		bs_free(bs);
4644 		cb_fn(cb_arg, -ENOMEM);
4645 		return;
4646 	}
4647 
4648 	/* Read the super block */
4649 	bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
4650 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
4651 			     bs_dump_super_cpl, ctx);
4652 }
4653 
4654 /* END spdk_bs_dump */
4655 
4656 /* START spdk_bs_init */
4657 
4658 static void
4659 bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4660 {
4661 	struct spdk_bs_load_ctx *ctx = cb_arg;
4662 
4663 	ctx->bs->used_clusters = spdk_bit_pool_create_from_array(ctx->used_clusters);
4664 	spdk_free(ctx->super);
4665 	free(ctx);
4666 
4667 	bs_sequence_finish(seq, bserrno);
4668 }
4669 
4670 static void
4671 bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4672 {
4673 	struct spdk_bs_load_ctx *ctx = cb_arg;
4674 
4675 	/* Write super block */
4676 	bs_sequence_write_dev(seq, ctx->super, bs_page_to_lba(ctx->bs, 0),
4677 			      bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
4678 			      bs_init_persist_super_cpl, ctx);
4679 }
4680 
4681 void
4682 spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
4683 	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
4684 {
4685 	struct spdk_bs_load_ctx *ctx;
4686 	struct spdk_blob_store	*bs;
4687 	struct spdk_bs_cpl	cpl;
4688 	spdk_bs_sequence_t	*seq;
4689 	spdk_bs_batch_t		*batch;
4690 	uint64_t		num_md_lba;
4691 	uint64_t		num_md_pages;
4692 	uint64_t		num_md_clusters;
4693 	uint32_t		i;
4694 	struct spdk_bs_opts	opts = {};
4695 	int			rc;
4696 	uint64_t		lba, lba_count;
4697 
4698 	SPDK_DEBUGLOG(blob, "Initializing blobstore on dev %p\n", dev);
4699 
4700 	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
4701 		SPDK_ERRLOG("unsupported dev block length of %d\n",
4702 			    dev->blocklen);
4703 		dev->destroy(dev);
4704 		cb_fn(cb_arg, NULL, -EINVAL);
4705 		return;
4706 	}
4707 
4708 	spdk_bs_opts_init(&opts, sizeof(opts));
4709 	if (o) {
4710 		if (bs_opts_copy(o, &opts)) {
4711 			return;
4712 		}
4713 	}
4714 
4715 	if (bs_opts_verify(&opts) != 0) {
4716 		dev->destroy(dev);
4717 		cb_fn(cb_arg, NULL, -EINVAL);
4718 		return;
4719 	}
4720 
4721 	rc = bs_alloc(dev, &opts, &bs, &ctx);
4722 	if (rc) {
4723 		dev->destroy(dev);
4724 		cb_fn(cb_arg, NULL, rc);
4725 		return;
4726 	}
4727 
4728 	if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) {
4729 		/* By default, allocate 1 page per cluster.
4730 		 * Technically, this over-allocates metadata
4731 		 * because more metadata will reduce the number
4732 		 * of usable clusters. This can be addressed with
4733 		 * more complex math in the future.
4734 		 */
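		/* For a rough sense of the cost (illustrative numbers only): with a
		 * 1 MiB cluster and 4 KiB metadata pages, reserving one md page per
		 * cluster sets aside roughly 1/256th (about 0.4%) of the device for
		 * the metadata region, in addition to the mask pages and the super
		 * block accounted for below. */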
4735 		bs->md_len = bs->total_clusters;
4736 	} else {
4737 		bs->md_len = opts.num_md_pages;
4738 	}
4739 	rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len);
4740 	if (rc < 0) {
4741 		spdk_free(ctx->super);
4742 		free(ctx);
4743 		bs_free(bs);
4744 		cb_fn(cb_arg, NULL, -ENOMEM);
4745 		return;
4746 	}
4747 
4748 	rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len);
4749 	if (rc < 0) {
4750 		spdk_free(ctx->super);
4751 		free(ctx);
4752 		bs_free(bs);
4753 		cb_fn(cb_arg, NULL, -ENOMEM);
4754 		return;
4755 	}
4756 
4757 	rc = spdk_bit_array_resize(&bs->open_blobids, bs->md_len);
4758 	if (rc < 0) {
4759 		spdk_free(ctx->super);
4760 		free(ctx);
4761 		bs_free(bs);
4762 		cb_fn(cb_arg, NULL, -ENOMEM);
4763 		return;
4764 	}
4765 
4766 	memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
4767 	       sizeof(ctx->super->signature));
4768 	ctx->super->version = SPDK_BS_VERSION;
4769 	ctx->super->length = sizeof(*ctx->super);
4770 	ctx->super->super_blob = bs->super_blob;
4771 	ctx->super->clean = 0;
4772 	ctx->super->cluster_size = bs->cluster_sz;
4773 	ctx->super->io_unit_size = bs->io_unit_size;
4774 	memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype));
4775 
4776 	/* Calculate how many pages the metadata consumes at the front
4777 	 * of the disk.
4778 	 */
4779 
4780 	/* The super block uses 1 page */
4781 	num_md_pages = 1;
4782 
4783 	/* The used_md_pages mask requires 1 bit per metadata page, rounded
4784 	 * up to the nearest page, plus a header.
4785 	 */
4786 	ctx->super->used_page_mask_start = num_md_pages;
4787 	ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
4788 					 spdk_divide_round_up(bs->md_len, 8),
4789 					 SPDK_BS_PAGE_SIZE);
4790 	num_md_pages += ctx->super->used_page_mask_len;
4791 
4792 	/* The used_clusters mask requires 1 bit per cluster, rounded
4793 	 * up to the nearest page, plus a header.
4794 	 */
4795 	ctx->super->used_cluster_mask_start = num_md_pages;
4796 	ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
4797 					    spdk_divide_round_up(bs->total_clusters, 8),
4798 					    SPDK_BS_PAGE_SIZE);
4799 	num_md_pages += ctx->super->used_cluster_mask_len;
4800 
4801 	/* The used_blobids mask requires 1 bit per metadata page, rounded
4802 	 * up to the nearest page, plus a header.
4803 	 */
4804 	ctx->super->used_blobid_mask_start = num_md_pages;
4805 	ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
4806 					   spdk_divide_round_up(bs->md_len, 8),
4807 					   SPDK_BS_PAGE_SIZE);
4808 	num_md_pages += ctx->super->used_blobid_mask_len;
4809 
4810 	/* The metadata region size was chosen above */
4811 	ctx->super->md_start = bs->md_start = num_md_pages;
4812 	ctx->super->md_len = bs->md_len;
4813 	num_md_pages += bs->md_len;
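	/* Illustrative layout (assumed sizes, not defaults enforced here): with
	 * md_len = 4096 and total_clusters = 4096, each mask needs
	 * sizeof(struct spdk_bs_md_mask) + 4096 / 8 bytes - a little over 512 bytes -
	 * which rounds up to a single 4 KiB page apiece. The front of the disk is then
	 * page 0: super block, page 1: used_page mask, page 2: used_cluster mask,
	 * page 3: used_blobid mask, pages 4..4099: metadata region, giving
	 * num_md_pages = 4100. */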
4814 
4815 	num_md_lba = bs_page_to_lba(bs, num_md_pages);
4816 
4817 	ctx->super->size = dev->blockcnt * dev->blocklen;
4818 
4819 	ctx->super->crc = blob_md_page_calc_crc(ctx->super);
4820 
4821 	num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster);
4822 	if (num_md_clusters > bs->total_clusters) {
4823 		SPDK_ERRLOG("Blobstore metadata cannot use more clusters than are available, "
4824 			    "please decrease number of pages reserved for metadata "
4825 			    "or increase cluster size.\n");
4826 		spdk_free(ctx->super);
4827 		spdk_bit_array_free(&ctx->used_clusters);
4828 		free(ctx);
4829 		bs_free(bs);
4830 		cb_fn(cb_arg, NULL, -ENOMEM);
4831 		return;
4832 	}
4833 	/* Claim all of the clusters used by the metadata */
4834 	for (i = 0; i < num_md_clusters; i++) {
4835 		spdk_bit_array_set(ctx->used_clusters, i);
4836 	}
4837 
4838 	bs->num_free_clusters -= num_md_clusters;
4839 	bs->total_data_clusters = bs->num_free_clusters;
4840 
4841 	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
4842 	cpl.u.bs_handle.cb_fn = cb_fn;
4843 	cpl.u.bs_handle.cb_arg = cb_arg;
4844 	cpl.u.bs_handle.bs = bs;
4845 
4846 	seq = bs_sequence_start(bs->md_channel, &cpl);
4847 	if (!seq) {
4848 		spdk_free(ctx->super);
4849 		free(ctx);
4850 		bs_free(bs);
4851 		cb_fn(cb_arg, NULL, -ENOMEM);
4852 		return;
4853 	}
4854 
4855 	batch = bs_sequence_to_batch(seq, bs_init_trim_cpl, ctx);
4856 
4857 	/* Clear metadata space */
4858 	bs_batch_write_zeroes_dev(batch, 0, num_md_lba);
4859 
4860 	lba = num_md_lba;
4861 	while (lba < ctx->bs->dev->blockcnt) {
4862 		lba_count = spdk_min(UINT32_MAX, ctx->bs->dev->blockcnt - lba);
4863 		switch (opts.clear_method) {
4864 		case BS_CLEAR_WITH_UNMAP:
4865 			/* Trim data clusters */
4866 			bs_batch_unmap_dev(batch, lba, lba_count);
4867 			break;
4868 		case BS_CLEAR_WITH_WRITE_ZEROES:
4869 			/* Write_zeroes to data clusters */
4870 			bs_batch_write_zeroes_dev(batch, lba, lba_count);
4871 			break;
4872 		case BS_CLEAR_WITH_NONE:
4873 		default:
4874 			break;
4875 		}
4876 		lba += lba_count;
4877 	}
4878 
4879 	bs_batch_close(batch);
4880 }
4881 
4882 /* END spdk_bs_init */
4883 
4884 /* START spdk_bs_destroy */
4885 
4886 static void
4887 bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4888 {
4889 	struct spdk_bs_load_ctx *ctx = cb_arg;
4890 	struct spdk_blob_store *bs = ctx->bs;
4891 
4892 	/*
4893 	 * We need to defer calling bs_call_cpl() until after
4894 	 * dev destruction, so tuck these away for later use.
4895 	 */
4896 	bs->unload_err = bserrno;
4897 	memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
4898 	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
4899 
4900 	bs_sequence_finish(seq, bserrno);
4901 
4902 	bs_free(bs);
4903 	free(ctx);
4904 }
4905 
4906 void
4907 spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn,
4908 		void *cb_arg)
4909 {
4910 	struct spdk_bs_cpl	cpl;
4911 	spdk_bs_sequence_t	*seq;
4912 	struct spdk_bs_load_ctx *ctx;
4913 
4914 	SPDK_DEBUGLOG(blob, "Destroying blobstore\n");
4915 
4916 	if (!TAILQ_EMPTY(&bs->blobs)) {
4917 		SPDK_ERRLOG("Blobstore still has open blobs\n");
4918 		cb_fn(cb_arg, -EBUSY);
4919 		return;
4920 	}
4921 
4922 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
4923 	cpl.u.bs_basic.cb_fn = cb_fn;
4924 	cpl.u.bs_basic.cb_arg = cb_arg;
4925 
4926 	ctx = calloc(1, sizeof(*ctx));
4927 	if (!ctx) {
4928 		cb_fn(cb_arg, -ENOMEM);
4929 		return;
4930 	}
4931 
4932 	ctx->bs = bs;
4933 
4934 	seq = bs_sequence_start(bs->md_channel, &cpl);
4935 	if (!seq) {
4936 		free(ctx);
4937 		cb_fn(cb_arg, -ENOMEM);
4938 		return;
4939 	}
4940 
4941 	/* Write zeroes to the super block */
4942 	bs_sequence_write_zeroes_dev(seq,
4943 				     bs_page_to_lba(bs, 0),
4944 				     bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)),
4945 				     bs_destroy_trim_cpl, ctx);
4946 }
4947 
4948 /* END spdk_bs_destroy */
4949 
4950 /* START spdk_bs_unload */
4951 
4952 static void
4953 bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno)
4954 {
4955 	spdk_bs_sequence_t *seq = ctx->seq;
4956 
4957 	spdk_free(ctx->super);
4958 
4959 	/*
4960 	 * We need to defer calling bs_call_cpl() until after
4961 	 * dev destruction, so tuck these away for later use.
4962 	 */
4963 	ctx->bs->unload_err = bserrno;
4964 	memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
4965 	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;
4966 
4967 	bs_sequence_finish(seq, bserrno);
4968 
4969 	bs_free(ctx->bs);
4970 	free(ctx);
4971 }
4972 
4973 static void
4974 bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4975 {
4976 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4977 
4978 	bs_unload_finish(ctx, bserrno);
4979 }
4980 
4981 static void
4982 bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4983 {
4984 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4985 
4986 	spdk_free(ctx->mask);
4987 
4988 	if (bserrno != 0) {
4989 		bs_unload_finish(ctx, bserrno);
4990 		return;
4991 	}
4992 
4993 	ctx->super->clean = 1;
4994 
4995 	bs_write_super(seq, ctx->bs, ctx->super, bs_unload_write_super_cpl, ctx);
4996 }
4997 
4998 static void
4999 bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5000 {
5001 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5002 
5003 	spdk_free(ctx->mask);
5004 	ctx->mask = NULL;
5005 
5006 	if (bserrno != 0) {
5007 		bs_unload_finish(ctx, bserrno);
5008 		return;
5009 	}
5010 
5011 	bs_write_used_clusters(seq, ctx, bs_unload_write_used_clusters_cpl);
5012 }
5013 
5014 static void
5015 bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5016 {
5017 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5018 
5019 	spdk_free(ctx->mask);
5020 	ctx->mask = NULL;
5021 
5022 	if (bserrno != 0) {
5023 		bs_unload_finish(ctx, bserrno);
5024 		return;
5025 	}
5026 
5027 	bs_write_used_blobids(seq, ctx, bs_unload_write_used_blobids_cpl);
5028 }
5029 
5030 static void
5031 bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5032 {
5033 	struct spdk_bs_load_ctx	*ctx = cb_arg;
5034 
5035 	if (bserrno != 0) {
5036 		bs_unload_finish(ctx, bserrno);
5037 		return;
5038 	}
5039 
5040 	bs_write_used_md(seq, cb_arg, bs_unload_write_used_pages_cpl);
5041 }
5042 
5043 void
5044 spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg)
5045 {
5046 	struct spdk_bs_cpl	cpl;
5047 	struct spdk_bs_load_ctx *ctx;
5048 
5049 	SPDK_DEBUGLOG(blob, "Syncing blobstore\n");
5050 
5051 	if (!TAILQ_EMPTY(&bs->blobs)) {
5052 		SPDK_ERRLOG("Blobstore still has open blobs\n");
5053 		cb_fn(cb_arg, -EBUSY);
5054 		return;
5055 	}
5056 
5057 	ctx = calloc(1, sizeof(*ctx));
5058 	if (!ctx) {
5059 		cb_fn(cb_arg, -ENOMEM);
5060 		return;
5061 	}
5062 
5063 	ctx->bs = bs;
5064 
5065 	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
5066 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
5067 	if (!ctx->super) {
5068 		free(ctx);
5069 		cb_fn(cb_arg, -ENOMEM);
5070 		return;
5071 	}
5072 
5073 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
5074 	cpl.u.bs_basic.cb_fn = cb_fn;
5075 	cpl.u.bs_basic.cb_arg = cb_arg;
5076 
5077 	ctx->seq = bs_sequence_start(bs->md_channel, &cpl);
5078 	if (!ctx->seq) {
5079 		spdk_free(ctx->super);
5080 		free(ctx);
5081 		cb_fn(cb_arg, -ENOMEM);
5082 		return;
5083 	}
5084 
5085 	/* Read super block */
5086 	bs_sequence_read_dev(ctx->seq, ctx->super, bs_page_to_lba(bs, 0),
5087 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
5088 			     bs_unload_read_super_cpl, ctx);
5089 }
5090 
5091 /* END spdk_bs_unload */
5092 
5093 /* START spdk_bs_set_super */
5094 
5095 struct spdk_bs_set_super_ctx {
5096 	struct spdk_blob_store		*bs;
5097 	struct spdk_bs_super_block	*super;
5098 };
5099 
5100 static void
5101 bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5102 {
5103 	struct spdk_bs_set_super_ctx	*ctx = cb_arg;
5104 
5105 	if (bserrno != 0) {
5106 		SPDK_ERRLOG("Unable to write to super block of blobstore\n");
5107 	}
5108 
5109 	spdk_free(ctx->super);
5110 
5111 	bs_sequence_finish(seq, bserrno);
5112 
5113 	free(ctx);
5114 }
5115 
5116 static void
5117 bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5118 {
5119 	struct spdk_bs_set_super_ctx	*ctx = cb_arg;
5120 
5121 	if (bserrno != 0) {
5122 		SPDK_ERRLOG("Unable to read super block of blobstore\n");
5123 		spdk_free(ctx->super);
5124 		bs_sequence_finish(seq, bserrno);
5125 		free(ctx);
5126 		return;
5127 	}
5128 
5129 	bs_write_super(seq, ctx->bs, ctx->super, bs_set_super_write_cpl, ctx);
5130 }
5131 
5132 void
5133 spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid,
5134 		  spdk_bs_op_complete cb_fn, void *cb_arg)
5135 {
5136 	struct spdk_bs_cpl		cpl;
5137 	spdk_bs_sequence_t		*seq;
5138 	struct spdk_bs_set_super_ctx	*ctx;
5139 
5140 	SPDK_DEBUGLOG(blob, "Setting super blob id on blobstore\n");
5141 
5142 	ctx = calloc(1, sizeof(*ctx));
5143 	if (!ctx) {
5144 		cb_fn(cb_arg, -ENOMEM);
5145 		return;
5146 	}
5147 
5148 	ctx->bs = bs;
5149 
5150 	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
5151 				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
5152 	if (!ctx->super) {
5153 		free(ctx);
5154 		cb_fn(cb_arg, -ENOMEM);
5155 		return;
5156 	}
5157 
5158 	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
5159 	cpl.u.bs_basic.cb_fn = cb_fn;
5160 	cpl.u.bs_basic.cb_arg = cb_arg;
5161 
5162 	seq = bs_sequence_start(bs->md_channel, &cpl);
5163 	if (!seq) {
5164 		spdk_free(ctx->super);
5165 		free(ctx);
5166 		cb_fn(cb_arg, -ENOMEM);
5167 		return;
5168 	}
5169 
5170 	bs->super_blob = blobid;
5171 
5172 	/* Read super block */
5173 	bs_sequence_read_dev(seq, ctx->super, bs_page_to_lba(bs, 0),
5174 			     bs_byte_to_lba(bs, sizeof(*ctx->super)),
5175 			     bs_set_super_read_cpl, ctx);
5176 }
5177 
5178 /* END spdk_bs_set_super */
5179 
5180 void
5181 spdk_bs_get_super(struct spdk_blob_store *bs,
5182 		  spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5183 {
5184 	if (bs->super_blob == SPDK_BLOBID_INVALID) {
5185 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT);
5186 	} else {
5187 		cb_fn(cb_arg, bs->super_blob, 0);
5188 	}
5189 }
5190 
5191 uint64_t
5192 spdk_bs_get_cluster_size(struct spdk_blob_store *bs)
5193 {
5194 	return bs->cluster_sz;
5195 }
5196 
5197 uint64_t
5198 spdk_bs_get_page_size(struct spdk_blob_store *bs)
5199 {
5200 	return SPDK_BS_PAGE_SIZE;
5201 }
5202 
5203 uint64_t
5204 spdk_bs_get_io_unit_size(struct spdk_blob_store *bs)
5205 {
5206 	return bs->io_unit_size;
5207 }
5208 
5209 uint64_t
5210 spdk_bs_free_cluster_count(struct spdk_blob_store *bs)
5211 {
5212 	return bs->num_free_clusters;
5213 }
5214 
5215 uint64_t
5216 spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs)
5217 {
5218 	return bs->total_data_clusters;
5219 }
5220 
5221 static int
5222 bs_register_md_thread(struct spdk_blob_store *bs)
5223 {
5224 	bs->md_channel = spdk_get_io_channel(bs);
5225 	if (!bs->md_channel) {
5226 		SPDK_ERRLOG("Failed to get IO channel.\n");
5227 		return -1;
5228 	}
5229 
5230 	return 0;
5231 }
5232 
5233 static int
5234 bs_unregister_md_thread(struct spdk_blob_store *bs)
5235 {
5236 	spdk_put_io_channel(bs->md_channel);
5237 
5238 	return 0;
5239 }
5240 
5241 spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob)
5242 {
5243 	assert(blob != NULL);
5244 
5245 	return blob->id;
5246 }
5247 
5248 uint64_t spdk_blob_get_num_pages(struct spdk_blob *blob)
5249 {
5250 	assert(blob != NULL);
5251 
5252 	return bs_cluster_to_page(blob->bs, blob->active.num_clusters);
5253 }
5254 
5255 uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob)
5256 {
5257 	assert(blob != NULL);
5258 
5259 	return spdk_blob_get_num_pages(blob) * bs_io_unit_per_page(blob->bs);
5260 }
5261 
5262 uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob)
5263 {
5264 	assert(blob != NULL);
5265 
5266 	return blob->active.num_clusters;
5267 }
5268 
5269 /* START spdk_bs_create_blob */
5270 
5271 static void
5272 bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
5273 {
5274 	struct spdk_blob *blob = cb_arg;
5275 	uint32_t page_idx = bs_blobid_to_page(blob->id);
5276 
5277 	if (bserrno != 0) {
5278 		spdk_bit_array_clear(blob->bs->used_blobids, page_idx);
5279 		bs_release_md_page(blob->bs, page_idx);
5280 	}
5281 
5282 	blob_free(blob);
5283 
5284 	bs_sequence_finish(seq, bserrno);
5285 }
5286 
5287 static int
5288 blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs,
5289 		bool internal)
5290 {
5291 	uint64_t i;
5292 	size_t value_len = 0;
5293 	int rc;
5294 	const void *value = NULL;
5295 	if (xattrs->count > 0 && xattrs->get_value == NULL) {
5296 		return -EINVAL;
5297 	}
5298 	for (i = 0; i < xattrs->count; i++) {
5299 		xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len);
5300 		if (value == NULL || value_len == 0) {
5301 			return -EINVAL;
5302 		}
5303 		rc = blob_set_xattr(blob, xattrs->names[i], value, value_len, internal);
5304 		if (rc < 0) {
5305 			return rc;
5306 		}
5307 	}
5308 	return 0;
5309 }
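
/* A minimal sketch of the get_value contract assumed by blob_set_xattrs() above
 * (the callback, the context type and the xattr name below are made up for
 * illustration): the callback must point *value at a buffer that remains valid
 * for the duration of the call and set *value_len to a non-zero length;
 * otherwise blob creation fails with -EINVAL.
 *
 *	static void
 *	example_get_xattr_value(void *arg, const char *name,
 *				const void **value, size_t *value_len)
 *	{
 *		struct example_ctx *ctx = arg;
 *
 *		if (strcmp(name, "name") == 0) {
 *			*value = ctx->name;
 *			*value_len = strlen(ctx->name) + 1;
 *		}
 *	}
 */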
5310 
5311 static void
5312 bs_create_blob(struct spdk_blob_store *bs,
5313 	       const struct spdk_blob_opts *opts,
5314 	       const struct spdk_blob_xattr_opts *internal_xattrs,
5315 	       spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5316 {
5317 	struct spdk_blob	*blob;
5318 	uint32_t		page_idx;
5319 	struct spdk_bs_cpl	cpl;
5320 	struct spdk_blob_opts	opts_default;
5321 	struct spdk_blob_xattr_opts internal_xattrs_default;
5322 	spdk_bs_sequence_t	*seq;
5323 	spdk_blob_id		id;
5324 	int rc;
5325 
5326 	assert(spdk_get_thread() == bs->md_thread);
5327 
5328 	page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0);
5329 	if (page_idx == UINT32_MAX) {
5330 		cb_fn(cb_arg, 0, -ENOMEM);
5331 		return;
5332 	}
5333 	spdk_bit_array_set(bs->used_blobids, page_idx);
5334 	bs_claim_md_page(bs, page_idx);
5335 
5336 	id = bs_page_to_blobid(page_idx);
5337 
5338 	SPDK_DEBUGLOG(blob, "Creating blob with id %" PRIu64 " at page %u\n", id, page_idx);
5339 
5340 	blob = blob_alloc(bs, id);
5341 	if (!blob) {
5342 		spdk_bit_array_clear(bs->used_blobids, page_idx);
5343 		bs_release_md_page(bs, page_idx);
5344 		cb_fn(cb_arg, 0, -ENOMEM);
5345 		return;
5346 	}
5347 
5348 	if (!opts) {
5349 		spdk_blob_opts_init(&opts_default);
5350 		opts = &opts_default;
5351 	}
5352 
5353 	blob->use_extent_table = opts->use_extent_table;
5354 	if (blob->use_extent_table) {
5355 		blob->invalid_flags |= SPDK_BLOB_EXTENT_TABLE;
5356 	}
5357 
5358 	if (!internal_xattrs) {
5359 		blob_xattrs_init(&internal_xattrs_default);
5360 		internal_xattrs = &internal_xattrs_default;
5361 	}
5362 
5363 	rc = blob_set_xattrs(blob, &opts->xattrs, false);
5364 	if (rc < 0) {
5365 		blob_free(blob);
5366 		spdk_bit_array_clear(bs->used_blobids, page_idx);
5367 		bs_release_md_page(bs, page_idx);
5368 		cb_fn(cb_arg, 0, rc);
5369 		return;
5370 	}
5371 
5372 	rc = blob_set_xattrs(blob, internal_xattrs, true);
5373 	if (rc < 0) {
5374 		blob_free(blob);
5375 		spdk_bit_array_clear(bs->used_blobids, page_idx);
5376 		bs_release_md_page(bs, page_idx);
5377 		cb_fn(cb_arg, 0, rc);
5378 		return;
5379 	}
5380 
5381 	if (opts->thin_provision) {
5382 		blob_set_thin_provision(blob);
5383 	}
5384 
5385 	blob_set_clear_method(blob, opts->clear_method);
5386 
5387 	rc = blob_resize(blob, opts->num_clusters);
5388 	if (rc < 0) {
5389 		blob_free(blob);
5390 		spdk_bit_array_clear(bs->used_blobids, page_idx);
5391 		bs_release_md_page(bs, page_idx);
5392 		cb_fn(cb_arg, 0, rc);
5393 		return;
5394 	}
5395 	cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
5396 	cpl.u.blobid.cb_fn = cb_fn;
5397 	cpl.u.blobid.cb_arg = cb_arg;
5398 	cpl.u.blobid.blobid = blob->id;
5399 
5400 	seq = bs_sequence_start(bs->md_channel, &cpl);
5401 	if (!seq) {
5402 		blob_free(blob);
5403 		spdk_bit_array_clear(bs->used_blobids, page_idx);
5404 		bs_release_md_page(bs, page_idx);
5405 		cb_fn(cb_arg, 0, -ENOMEM);
5406 		return;
5407 	}
5408 
5409 	blob_persist(seq, blob, bs_create_blob_cpl, blob);
5410 }
5411 
5412 void spdk_bs_create_blob(struct spdk_blob_store *bs,
5413 			 spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5414 {
5415 	bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg);
5416 }
5417 
5418 void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts,
5419 			     spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5420 {
5421 	bs_create_blob(bs, opts, NULL, cb_fn, cb_arg);
5422 }
5423 
5424 /* END spdk_bs_create_blob */
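
/*
 * Usage sketch (placeholder names, not part of the implementation): creating a
 * thin-provisioned blob with spdk_bs_create_blob_ext() from the metadata
 * thread. The completion callback receives the new blob ID.
 *
 *	static void
 *	example_create_done(void *cb_arg, spdk_blob_id blobid, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("blob create failed: %d\n", bserrno);
 *			return;
 *		}
 *		SPDK_NOTICELOG("created blob %" PRIu64 "\n", blobid);
 *	}
 *
 *	static void
 *	example_create(struct spdk_blob_store *bs)
 *	{
 *		struct spdk_blob_opts opts;
 *
 *		spdk_blob_opts_init(&opts);
 *		opts.thin_provision = true;
 *		opts.num_clusters = 16;
 *		spdk_bs_create_blob_ext(bs, &opts, example_create_done, NULL);
 *	}
 */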
5425 
5426 /* START blob_cleanup */
5427 
5428 struct spdk_clone_snapshot_ctx {
5429 	struct spdk_bs_cpl      cpl;
5430 	int bserrno;
5431 	bool frozen;
5432 
5433 	struct spdk_io_channel *channel;
5434 
5435 	/* Current cluster for inflate operation */
5436 	uint64_t cluster;
5437 
5438 	/* For inflation, force allocation of all unallocated clusters and remove
5439 	 * thin provisioning. Otherwise only decouple the parent and keep the clone thin. */
5440 	bool allocate_all;
5441 
5442 	struct {
5443 		spdk_blob_id id;
5444 		struct spdk_blob *blob;
5445 	} original;
5446 	struct {
5447 		spdk_blob_id id;
5448 		struct spdk_blob *blob;
5449 	} new;
5450 
5451 	/* xattrs specified for snapshot/clones only. They have no impact on
5452 	 * the original blob's xattrs. */
5453 	const struct spdk_blob_xattr_opts *xattrs;
5454 };
5455 
5456 static void
5457 bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno)
5458 {
5459 	struct spdk_clone_snapshot_ctx *ctx = cb_arg;
5460 	struct spdk_bs_cpl *cpl = &ctx->cpl;
5461 
5462 	if (bserrno != 0) {
5463 		if (ctx->bserrno != 0) {
5464 			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
5465 		} else {
5466 			ctx->bserrno = bserrno;
5467 		}
5468 	}
5469 
5470 	switch (cpl->type) {
5471 	case SPDK_BS_CPL_TYPE_BLOBID:
5472 		cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno);
5473 		break;
5474 	case SPDK_BS_CPL_TYPE_BLOB_BASIC:
5475 		cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno);
5476 		break;
5477 	default:
5478 		SPDK_UNREACHABLE();
5479 		break;
5480 	}
5481 
5482 	free(ctx);
5483 }
5484 
5485 static void
5486 bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
5487 {
5488 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5489 	struct spdk_blob *origblob = ctx->original.blob;
5490 
5491 	if (bserrno != 0) {
5492 		if (ctx->bserrno != 0) {
5493 			SPDK_ERRLOG("Unfreeze error %d\n", bserrno);
5494 		} else {
5495 			ctx->bserrno = bserrno;
5496 		}
5497 	}
5498 
5499 	ctx->original.id = origblob->id;
5500 	origblob->locked_operation_in_progress = false;
5501 
5502 	spdk_blob_close(origblob, bs_clone_snapshot_cleanup_finish, ctx);
5503 }
5504 
5505 static void
5506 bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno)
5507 {
5508 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5509 	struct spdk_blob *origblob = ctx->original.blob;
5510 
5511 	if (bserrno != 0) {
5512 		if (ctx->bserrno != 0) {
5513 			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
5514 		} else {
5515 			ctx->bserrno = bserrno;
5516 		}
5517 	}
5518 
5519 	if (ctx->frozen) {
5520 		/* Unfreeze any outstanding I/O */
5521 		blob_unfreeze_io(origblob, bs_snapshot_unfreeze_cpl, ctx);
5522 	} else {
5523 		bs_snapshot_unfreeze_cpl(ctx, 0);
5524 	}
5525 
5526 }
5527 
5528 static void
5529 bs_clone_snapshot_newblob_cleanup(struct spdk_clone_snapshot_ctx *ctx, int bserrno)
5530 {
5531 	struct spdk_blob *newblob = ctx->new.blob;
5532 
5533 	if (bserrno != 0) {
5534 		if (ctx->bserrno != 0) {
5535 			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
5536 		} else {
5537 			ctx->bserrno = bserrno;
5538 		}
5539 	}
5540 
5541 	ctx->new.id = newblob->id;
5542 	spdk_blob_close(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
5543 }
5544 
5545 /* END blob_cleanup */
5546 
5547 /* START spdk_bs_create_snapshot */
5548 
5549 static void
5550 bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2)
5551 {
5552 	uint64_t *cluster_temp;
5553 	uint32_t *extent_page_temp;
5554 
5555 	cluster_temp = blob1->active.clusters;
5556 	blob1->active.clusters = blob2->active.clusters;
5557 	blob2->active.clusters = cluster_temp;
5558 
5559 	extent_page_temp = blob1->active.extent_pages;
5560 	blob1->active.extent_pages = blob2->active.extent_pages;
5561 	blob2->active.extent_pages = extent_page_temp;
5562 }
5563 
5564 static void
5565 bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
5566 {
5567 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5568 	struct spdk_blob *origblob = ctx->original.blob;
5569 	struct spdk_blob *newblob = ctx->new.blob;
5570 
5571 	if (bserrno != 0) {
5572 		bs_snapshot_swap_cluster_maps(newblob, origblob);
5573 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5574 		return;
5575 	}
5576 
5577 	/* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
5578 	bserrno = blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
5579 	if (bserrno != 0) {
5580 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5581 		return;
5582 	}
5583 
5584 	bs_blob_list_add(ctx->original.blob);
5585 
5586 	spdk_blob_set_read_only(newblob);
5587 
5588 	/* sync snapshot metadata */
5589 	spdk_blob_sync_md(newblob, bs_clone_snapshot_origblob_cleanup, ctx);
5590 }
5591 
5592 static void
5593 bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
5594 {
5595 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5596 	struct spdk_blob *origblob = ctx->original.blob;
5597 	struct spdk_blob *newblob = ctx->new.blob;
5598 
5599 	if (bserrno != 0) {
5600 		/* return cluster map back to original */
5601 		bs_snapshot_swap_cluster_maps(newblob, origblob);
5602 
5603 		/* Newblob md sync failed. Valid clusters are only present in origblob.
5604 		 * Since I/O is frozen on origblob, no changes to the zeroed out cluster map should have occurred.
5605 		 * Newblob needs to be reverted to the thin_provisioned state it had at creation to properly close. */
5606 		blob_set_thin_provision(newblob);
5607 		assert(spdk_mem_all_zero(newblob->active.clusters,
5608 					 newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
5609 		assert(spdk_mem_all_zero(newblob->active.extent_pages,
5610 					 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
5611 
5612 		bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
5613 		return;
5614 	}
5615 
5616 	/* Set internal xattr for snapshot id */
5617 	bserrno = blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true);
5618 	if (bserrno != 0) {
5619 		/* return cluster map back to original */
5620 		bs_snapshot_swap_cluster_maps(newblob, origblob);
5621 		blob_set_thin_provision(newblob);
5622 		bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
5623 		return;
5624 	}
5625 
5626 	/* Create new back_bs_dev for snapshot */
5627 	origblob->back_bs_dev = bs_create_blob_bs_dev(newblob);
5628 	if (origblob->back_bs_dev == NULL) {
5629 		/* return cluster map back to original */
5630 		bs_snapshot_swap_cluster_maps(newblob, origblob);
5631 		blob_set_thin_provision(newblob);
5632 		bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL);
5633 		return;
5634 	}
5635 
5636 	bs_blob_list_remove(origblob);
5637 	origblob->parent_id = newblob->id;
5638 	/* set clone blob as thin provisioned */
5639 	blob_set_thin_provision(origblob);
5640 
5641 	bs_blob_list_add(newblob);
5642 
5643 	/* sync clone metadata */
5644 	spdk_blob_sync_md(origblob, bs_snapshot_origblob_sync_cpl, ctx);
5645 }
5646 
5647 static void
5648 bs_snapshot_freeze_cpl(void *cb_arg, int rc)
5649 {
5650 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5651 	struct spdk_blob *origblob = ctx->original.blob;
5652 	struct spdk_blob *newblob = ctx->new.blob;
5653 	int bserrno;
5654 
5655 	if (rc != 0) {
5656 		bs_clone_snapshot_newblob_cleanup(ctx, rc);
5657 		return;
5658 	}
5659 
5660 	ctx->frozen = true;
5661 
5662 	/* set new back_bs_dev for snapshot */
5663 	newblob->back_bs_dev = origblob->back_bs_dev;
5664 	/* Set invalid flags from origblob */
5665 	newblob->invalid_flags = origblob->invalid_flags;
5666 
5667 	/* inherit parent from original blob if set */
5668 	newblob->parent_id = origblob->parent_id;
5669 	if (origblob->parent_id != SPDK_BLOBID_INVALID) {
5670 		/* Set internal xattr for snapshot id */
5671 		bserrno = blob_set_xattr(newblob, BLOB_SNAPSHOT,
5672 					 &origblob->parent_id, sizeof(spdk_blob_id), true);
5673 		if (bserrno != 0) {
5674 			bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
5675 			return;
5676 		}
5677 	}
5678 
5679 	/* swap cluster maps */
5680 	bs_snapshot_swap_cluster_maps(newblob, origblob);
5681 
5682 	/* Set the clear method on the new blob to match the original. */
5683 	blob_set_clear_method(newblob, origblob->clear_method);
5684 
5685 	/* sync snapshot metadata */
5686 	spdk_blob_sync_md(newblob, bs_snapshot_newblob_sync_cpl, ctx);
5687 }
5688 
5689 static void
5690 bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
5691 {
5692 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5693 	struct spdk_blob *origblob = ctx->original.blob;
5694 	struct spdk_blob *newblob = _blob;
5695 
5696 	if (bserrno != 0) {
5697 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5698 		return;
5699 	}
5700 
5701 	ctx->new.blob = newblob;
5702 	assert(spdk_blob_is_thin_provisioned(newblob));
5703 	assert(spdk_mem_all_zero(newblob->active.clusters,
5704 				 newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
5705 	assert(spdk_mem_all_zero(newblob->active.extent_pages,
5706 				 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));
5707 
5708 	blob_freeze_io(origblob, bs_snapshot_freeze_cpl, ctx);
5709 }
5710 
5711 static void
5712 bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
5713 {
5714 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5715 	struct spdk_blob *origblob = ctx->original.blob;
5716 
5717 	if (bserrno != 0) {
5718 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5719 		return;
5720 	}
5721 
5722 	ctx->new.id = blobid;
5723 	ctx->cpl.u.blobid.blobid = blobid;
5724 
5725 	spdk_bs_open_blob(origblob->bs, ctx->new.id, bs_snapshot_newblob_open_cpl, ctx);
5726 }
5727 
5728 
5729 static void
5730 bs_xattr_snapshot(void *arg, const char *name,
5731 		  const void **value, size_t *value_len)
5732 {
5733 	assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0);
5734 
5735 	struct spdk_blob *blob = (struct spdk_blob *)arg;
5736 	*value = &blob->id;
5737 	*value_len = sizeof(blob->id);
5738 }
5739 
5740 static void
5741 bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
5742 {
5743 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5744 	struct spdk_blob_opts opts;
5745 	struct spdk_blob_xattr_opts internal_xattrs;
5746 	char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS };
5747 
5748 	if (bserrno != 0) {
5749 		bs_clone_snapshot_cleanup_finish(ctx, bserrno);
5750 		return;
5751 	}
5752 
5753 	ctx->original.blob = _blob;
5754 
5755 	if (_blob->data_ro || _blob->md_ro) {
5756 		SPDK_DEBUGLOG(blob, "Cannot create snapshot from read only blob with id %" PRIu64 "\n",
5757 			      _blob->id);
5758 		ctx->bserrno = -EINVAL;
5759 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
5760 		return;
5761 	}
5762 
5763 	if (_blob->locked_operation_in_progress) {
5764 		SPDK_DEBUGLOG(blob, "Cannot create snapshot - another operation in progress\n");
5765 		ctx->bserrno = -EBUSY;
5766 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
5767 		return;
5768 	}
5769 
5770 	_blob->locked_operation_in_progress = true;
5771 
5772 	spdk_blob_opts_init(&opts);
5773 	blob_xattrs_init(&internal_xattrs);
5774 
5775 	/* Change the size of the new blob to match the original blob,
5776 	 * but do not allocate clusters */
5777 	opts.thin_provision = true;
5778 	opts.num_clusters = spdk_blob_get_num_clusters(_blob);
5779 	opts.use_extent_table = _blob->use_extent_table;
5780 
5781 	/* If there are any xattrs specified for snapshot, set them now */
5782 	if (ctx->xattrs) {
5783 		memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
5784 	}
5785 	/* Set internal xattr SNAPSHOT_IN_PROGRESS */
5786 	internal_xattrs.count = 1;
5787 	internal_xattrs.ctx = _blob;
5788 	internal_xattrs.names = xattrs_names;
5789 	internal_xattrs.get_value = bs_xattr_snapshot;
5790 
5791 	bs_create_blob(_blob->bs, &opts, &internal_xattrs,
5792 		       bs_snapshot_newblob_create_cpl, ctx);
5793 }
5794 
5795 void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid,
5796 			     const struct spdk_blob_xattr_opts *snapshot_xattrs,
5797 			     spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5798 {
5799 	struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
5800 
5801 	if (!ctx) {
5802 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
5803 		return;
5804 	}
5805 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
5806 	ctx->cpl.u.blobid.cb_fn = cb_fn;
5807 	ctx->cpl.u.blobid.cb_arg = cb_arg;
5808 	ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
5809 	ctx->bserrno = 0;
5810 	ctx->frozen = false;
5811 	ctx->original.id = blobid;
5812 	ctx->xattrs = snapshot_xattrs;
5813 
5814 	spdk_bs_open_blob(bs, ctx->original.id, bs_snapshot_origblob_open_cpl, ctx);
5815 }
5816 /* END spdk_bs_create_snapshot */
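
/*
 * Usage sketch (placeholder names, not part of the implementation): taking a
 * snapshot of an existing blob. The source blob must not be read-only and must
 * not have another locked operation in progress; on success the callback
 * receives the snapshot's blob ID.
 *
 *	static void
 *	example_snapshot_done(void *cb_arg, spdk_blob_id snapshot_id, int bserrno)
 *	{
 *		if (bserrno == 0) {
 *			SPDK_NOTICELOG("snapshot %" PRIu64 " created\n", snapshot_id);
 *		}
 *	}
 *
 *	static void
 *	example_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid)
 *	{
 *		spdk_bs_create_snapshot(bs, blobid, NULL, example_snapshot_done, NULL);
 *	}
 */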
5817 
5818 /* START spdk_bs_create_clone */
5819 
5820 static void
5821 bs_xattr_clone(void *arg, const char *name,
5822 	       const void **value, size_t *value_len)
5823 {
5824 	assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0);
5825 
5826 	struct spdk_blob *blob = (struct spdk_blob *)arg;
5827 	*value = &blob->id;
5828 	*value_len = sizeof(blob->id);
5829 }
5830 
5831 static void
5832 bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
5833 {
5834 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5835 	struct spdk_blob *clone = _blob;
5836 
5837 	ctx->new.blob = clone;
5838 	bs_blob_list_add(clone);
5839 
5840 	spdk_blob_close(clone, bs_clone_snapshot_origblob_cleanup, ctx);
5841 }
5842 
5843 static void
5844 bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
5845 {
5846 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5847 
5848 	ctx->cpl.u.blobid.blobid = blobid;
5849 	spdk_bs_open_blob(ctx->original.blob->bs, blobid, bs_clone_newblob_open_cpl, ctx);
5850 }
5851 
5852 static void
5853 bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
5854 {
5855 	struct spdk_clone_snapshot_ctx	*ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5856 	struct spdk_blob_opts		opts;
5857 	struct spdk_blob_xattr_opts internal_xattrs;
5858 	char *xattr_names[] = { BLOB_SNAPSHOT };
5859 
5860 	if (bserrno != 0) {
5861 		bs_clone_snapshot_cleanup_finish(ctx, bserrno);
5862 		return;
5863 	}
5864 
5865 	ctx->original.blob = _blob;
5866 
5867 	if (!_blob->data_ro || !_blob->md_ro) {
5868 		SPDK_DEBUGLOG(blob, "Cannot create clone from a blob that is not read-only\n");
5869 		ctx->bserrno = -EINVAL;
5870 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
5871 		return;
5872 	}
5873 
5874 	if (_blob->locked_operation_in_progress) {
5875 		SPDK_DEBUGLOG(blob, "Cannot create clone - another operation in progress\n");
5876 		ctx->bserrno = -EBUSY;
5877 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
5878 		return;
5879 	}
5880 
5881 	_blob->locked_operation_in_progress = true;
5882 
5883 	spdk_blob_opts_init(&opts);
5884 	blob_xattrs_init(&internal_xattrs);
5885 
5886 	opts.thin_provision = true;
5887 	opts.num_clusters = spdk_blob_get_num_clusters(_blob);
5888 	opts.use_extent_table = _blob->use_extent_table;
5889 	if (ctx->xattrs) {
5890 		memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
5891 	}
5892 
5893 	/* Set internal xattr BLOB_SNAPSHOT */
5894 	internal_xattrs.count = 1;
5895 	internal_xattrs.ctx = _blob;
5896 	internal_xattrs.names = xattr_names;
5897 	internal_xattrs.get_value = bs_xattr_clone;
5898 
5899 	bs_create_blob(_blob->bs, &opts, &internal_xattrs,
5900 		       bs_clone_newblob_create_cpl, ctx);
5901 }
5902 
5903 void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid,
5904 			  const struct spdk_blob_xattr_opts *clone_xattrs,
5905 			  spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5906 {
5907 	struct spdk_clone_snapshot_ctx	*ctx = calloc(1, sizeof(*ctx));
5908 
5909 	if (!ctx) {
5910 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
5911 		return;
5912 	}
5913 
5914 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
5915 	ctx->cpl.u.blobid.cb_fn = cb_fn;
5916 	ctx->cpl.u.blobid.cb_arg = cb_arg;
5917 	ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
5918 	ctx->bserrno = 0;
5919 	ctx->xattrs = clone_xattrs;
5920 	ctx->original.id = blobid;
5921 
5922 	spdk_bs_open_blob(bs, ctx->original.id, bs_clone_origblob_open_cpl, ctx);
5923 }
5924 
5925 /* END spdk_bs_create_clone */
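
/*
 * Usage sketch (placeholder names, not part of the implementation): creating a
 * clone. Unlike a snapshot, the source must already be a read-only blob
 * (typically a snapshot created as above); the clone is thin provisioned and
 * backed by that snapshot.
 *
 *	static void
 *	example_clone_done(void *cb_arg, spdk_blob_id clone_id, int bserrno)
 *	{
 *		if (bserrno == 0) {
 *			SPDK_NOTICELOG("clone %" PRIu64 " created\n", clone_id);
 *		}
 *	}
 *
 *	static void
 *	example_clone(struct spdk_blob_store *bs, spdk_blob_id snapshot_id)
 *	{
 *		spdk_bs_create_clone(bs, snapshot_id, NULL, example_clone_done, NULL);
 *	}
 */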
5926 
5927 /* START spdk_bs_inflate_blob */
5928 
5929 static void
5930 bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno)
5931 {
5932 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5933 	struct spdk_blob *_blob = ctx->original.blob;
5934 
5935 	if (bserrno != 0) {
5936 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
5937 		return;
5938 	}
5939 
5940 	assert(_parent != NULL);
5941 
5942 	bs_blob_list_remove(_blob);
5943 	_blob->parent_id = _parent->id;
5944 	blob_set_xattr(_blob, BLOB_SNAPSHOT, &_blob->parent_id,
5945 		       sizeof(spdk_blob_id), true);
5946 
5947 	_blob->back_bs_dev->destroy(_blob->back_bs_dev);
5948 	_blob->back_bs_dev = bs_create_blob_bs_dev(_parent);
5949 	bs_blob_list_add(_blob);
5950 
5951 	spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
5952 }
5953 
5954 static void
5955 bs_inflate_blob_done(struct spdk_clone_snapshot_ctx *ctx)
5956 {
5957 	struct spdk_blob *_blob = ctx->original.blob;
5958 	struct spdk_blob *_parent;
5959 
5960 	if (ctx->allocate_all) {
5961 		/* remove thin provisioning */
5962 		bs_blob_list_remove(_blob);
5963 		blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
5964 		_blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV;
5965 		_blob->back_bs_dev->destroy(_blob->back_bs_dev);
5966 		_blob->back_bs_dev = NULL;
5967 		_blob->parent_id = SPDK_BLOBID_INVALID;
5968 	} else {
5969 		_parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob;
5970 		if (_parent->parent_id != SPDK_BLOBID_INVALID) {
5971 			/* We must change the parent of the inflated blob */
5972 			spdk_bs_open_blob(_blob->bs, _parent->parent_id,
5973 					  bs_inflate_blob_set_parent_cpl, ctx);
5974 			return;
5975 		}
5976 
5977 		bs_blob_list_remove(_blob);
5978 		blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
5979 		_blob->parent_id = SPDK_BLOBID_INVALID;
5980 		_blob->back_bs_dev->destroy(_blob->back_bs_dev);
5981 		_blob->back_bs_dev = bs_create_zeroes_dev();
5982 	}
5983 
5984 	_blob->state = SPDK_BLOB_STATE_DIRTY;
5985 	spdk_blob_sync_md(_blob, bs_clone_snapshot_origblob_cleanup, ctx);
5986 }
5987 
5988 /* Check if cluster needs allocation */
5989 static inline bool
5990 bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all)
5991 {
5992 	struct spdk_blob_bs_dev *b;
5993 
5994 	assert(blob != NULL);
5995 
5996 	if (blob->active.clusters[cluster] != 0) {
5997 		/* Cluster is already allocated */
5998 		return false;
5999 	}
6000 
6001 	if (blob->parent_id == SPDK_BLOBID_INVALID) {
6002 		/* Blob has no parent blob */
6003 		return allocate_all;
6004 	}
6005 
6006 	b = (struct spdk_blob_bs_dev *)blob->back_bs_dev;
6007 	return (allocate_all || b->blob->active.clusters[cluster] != 0);
6008 }
6009 
6010 static void
6011 bs_inflate_blob_touch_next(void *cb_arg, int bserrno)
6012 {
6013 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6014 	struct spdk_blob *_blob = ctx->original.blob;
6015 	uint64_t offset;
6016 
6017 	if (bserrno != 0) {
6018 		bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
6019 		return;
6020 	}
6021 
6022 	for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) {
6023 		if (bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) {
6024 			break;
6025 		}
6026 	}
6027 
6028 	if (ctx->cluster < _blob->active.num_clusters) {
6029 		offset = bs_cluster_to_lba(_blob->bs, ctx->cluster);
6030 
6031 		/* We may safely increment the cluster before the write */
6032 		ctx->cluster++;
6033 
6034 		/* Use zero length write to touch a cluster */
6035 		spdk_blob_io_write(_blob, ctx->channel, NULL, offset, 0,
6036 				   bs_inflate_blob_touch_next, ctx);
6037 	} else {
6038 		bs_inflate_blob_done(ctx);
6039 	}
6040 }
6041 
6042 static void
6043 bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
6044 {
6045 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
6046 	uint64_t clusters_needed;
6047 	uint64_t i;
6048 
6049 	if (bserrno != 0) {
6050 		bs_clone_snapshot_cleanup_finish(ctx, bserrno);
6051 		return;
6052 	}
6053 
6054 	ctx->original.blob = _blob;
6055 
6056 	if (_blob->locked_operation_in_progress) {
6057 		SPDK_DEBUGLOG(blob, "Cannot inflate blob - another operation in progress\n");
6058 		ctx->bserrno = -EBUSY;
6059 		spdk_blob_close(_blob, bs_clone_snapshot_cleanup_finish, ctx);
6060 		return;
6061 	}
6062 
6063 	_blob->locked_operation_in_progress = true;
6064 
6065 	if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) {
6066 		/* This blob has no parent, so we cannot decouple it. */
6067 		SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n");
6068 		bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL);
6069 		return;
6070 	}
6071 
6072 	if (spdk_blob_is_thin_provisioned(_blob) == false) {
6073 		/* This is not a thin provisioned blob. No need to inflate. */
6074 		bs_clone_snapshot_origblob_cleanup(ctx, 0);
6075 		return;
6076 	}
6077 
6078 	/* Do two passes - one to verify that we can obtain enough clusters
6079 	 * and another to actually claim them.
6080 	 */
6081 	clusters_needed = 0;
6082 	for (i = 0; i < _blob->active.num_clusters; i++) {
6083 		if (bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) {
6084 			clusters_needed++;
6085 		}
6086 	}
6087 
6088 	if (clusters_needed > _blob->bs->num_free_clusters) {
6089 		/* Not enough free clusters. Cannot satisfy the request. */
6090 		bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC);
6091 		return;
6092 	}
6093 
6094 	ctx->cluster = 0;
6095 	bs_inflate_blob_touch_next(ctx, 0);
6096 }
6097 
6098 static void
6099 bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
6100 		spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg)
6101 {
6102 	struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
6103 
6104 	if (!ctx) {
6105 		cb_fn(cb_arg, -ENOMEM);
6106 		return;
6107 	}
6108 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6109 	ctx->cpl.u.bs_basic.cb_fn = cb_fn;
6110 	ctx->cpl.u.bs_basic.cb_arg = cb_arg;
6111 	ctx->bserrno = 0;
6112 	ctx->original.id = blobid;
6113 	ctx->channel = channel;
6114 	ctx->allocate_all = allocate_all;
6115 
6116 	spdk_bs_open_blob(bs, ctx->original.id, bs_inflate_blob_open_cpl, ctx);
6117 }
6118 
6119 void
6120 spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
6121 		     spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
6122 {
6123 	bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg);
6124 }
6125 
6126 void
6127 spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
6128 			     spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
6129 {
6130 	bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg);
6131 }
6132 /* END spdk_bs_inflate_blob */
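
/*
 * Usage sketch (placeholder names, not part of the implementation):
 * spdk_bs_inflate_blob() copies every cluster still provided by the parent
 * chain into the blob and drops thin provisioning, while
 * spdk_bs_blob_decouple_parent() only copies clusters owned by the direct
 * parent and keeps the blob thin. Both are called from the metadata thread and
 * need an I/O channel because they issue zero-length writes to touch clusters.
 *
 *	static void
 *	example_inflate_done(void *cb_arg, int bserrno)
 *	{
 *		struct spdk_io_channel *channel = cb_arg;
 *
 *		spdk_bs_free_io_channel(channel);
 *	}
 *
 *	static void
 *	example_inflate(struct spdk_blob_store *bs, spdk_blob_id blobid)
 *	{
 *		struct spdk_io_channel *channel = spdk_bs_alloc_io_channel(bs);
 *
 *		spdk_bs_inflate_blob(bs, channel, blobid, example_inflate_done, channel);
 *	}
 */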
6133 
6134 /* START spdk_blob_resize */
6135 struct spdk_bs_resize_ctx {
6136 	spdk_blob_op_complete cb_fn;
6137 	void *cb_arg;
6138 	struct spdk_blob *blob;
6139 	uint64_t sz;
6140 	int rc;
6141 };
6142 
6143 static void
6144 bs_resize_unfreeze_cpl(void *cb_arg, int rc)
6145 {
6146 	struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
6147 
6148 	if (rc != 0) {
6149 		SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc);
6150 	}
6151 
6152 	if (ctx->rc != 0) {
6153 		SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc);
6154 		rc = ctx->rc;
6155 	}
6156 
6157 	ctx->blob->locked_operation_in_progress = false;
6158 
6159 	ctx->cb_fn(ctx->cb_arg, rc);
6160 	free(ctx);
6161 }
6162 
6163 static void
6164 bs_resize_freeze_cpl(void *cb_arg, int rc)
6165 {
6166 	struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
6167 
6168 	if (rc != 0) {
6169 		ctx->blob->locked_operation_in_progress = false;
6170 		ctx->cb_fn(ctx->cb_arg, rc);
6171 		free(ctx);
6172 		return;
6173 	}
6174 
6175 	ctx->rc = blob_resize(ctx->blob, ctx->sz);
6176 
6177 	blob_unfreeze_io(ctx->blob, bs_resize_unfreeze_cpl, ctx);
6178 }
6179 
6180 void
6181 spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg)
6182 {
6183 	struct spdk_bs_resize_ctx *ctx;
6184 
6185 	blob_verify_md_op(blob);
6186 
6187 	SPDK_DEBUGLOG(blob, "Resizing blob %" PRIu64 " to %" PRIu64 " clusters\n", blob->id, sz);
6188 
6189 	if (blob->md_ro) {
6190 		cb_fn(cb_arg, -EPERM);
6191 		return;
6192 	}
6193 
6194 	if (sz == blob->active.num_clusters) {
6195 		cb_fn(cb_arg, 0);
6196 		return;
6197 	}
6198 
6199 	if (blob->locked_operation_in_progress) {
6200 		cb_fn(cb_arg, -EBUSY);
6201 		return;
6202 	}
6203 
6204 	ctx = calloc(1, sizeof(*ctx));
6205 	if (!ctx) {
6206 		cb_fn(cb_arg, -ENOMEM);
6207 		return;
6208 	}
6209 
6210 	blob->locked_operation_in_progress = true;
6211 	ctx->cb_fn = cb_fn;
6212 	ctx->cb_arg = cb_arg;
6213 	ctx->blob = blob;
6214 	ctx->sz = sz;
6215 	blob_freeze_io(blob, bs_resize_freeze_cpl, ctx);
6216 }
6217 
6218 /* END spdk_blob_resize */
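
/*
 * Usage sketch (placeholder names, not part of the implementation): resizing a
 * blob to a new cluster count. I/O is frozen for the duration of the in-memory
 * update; the new size is persisted on a later metadata sync or close, and for
 * thin-provisioned blobs growing does not allocate clusters until they are
 * written.
 *
 *	static void
 *	example_resize_done(void *cb_arg, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("resize failed: %d\n", bserrno);
 *		}
 *	}
 *
 *	static void
 *	example_resize(struct spdk_blob *blob)
 *	{
 *		uint64_t new_sz = spdk_blob_get_num_clusters(blob) + 8;
 *
 *		spdk_blob_resize(blob, new_sz, example_resize_done, NULL);
 *	}
 */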
6219 
6220 
6221 /* START spdk_bs_delete_blob */
6222 
6223 static void
6224 bs_delete_close_cpl(void *cb_arg, int bserrno)
6225 {
6226 	spdk_bs_sequence_t *seq = cb_arg;
6227 
6228 	bs_sequence_finish(seq, bserrno);
6229 }
6230 
6231 static void
6232 bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
6233 {
6234 	struct spdk_blob *blob = cb_arg;
6235 
6236 	if (bserrno != 0) {
6237 		/*
6238 		 * We already removed this blob from the blobstore tailq, so
6239 		 *  we need to free it here since this is the last reference
6240 		 *  to it.
6241 		 */
6242 		blob_free(blob);
6243 		bs_delete_close_cpl(seq, bserrno);
6244 		return;
6245 	}
6246 
6247 	/*
6248 	 * This will immediately decrement the ref_count and call
6249 	 *  the completion routine since the metadata state is clean.
6250 	 *  By calling spdk_blob_close, we reduce the number of call
6251 	 *  points into code that touches the blob->open_ref count
6252 	 *  and the blobstore's blob list.
6253 	 */
6254 	spdk_blob_close(blob, bs_delete_close_cpl, seq);
6255 }
6256 
6257 struct delete_snapshot_ctx {
6258 	struct spdk_blob_list *parent_snapshot_entry;
6259 	struct spdk_blob *snapshot;
6260 	bool snapshot_md_ro;
6261 	struct spdk_blob *clone;
6262 	bool clone_md_ro;
6263 	spdk_blob_op_with_handle_complete cb_fn;
6264 	void *cb_arg;
6265 	int bserrno;
6266 };
6267 
6268 static void
6269 delete_blob_cleanup_finish(void *cb_arg, int bserrno)
6270 {
6271 	struct delete_snapshot_ctx *ctx = cb_arg;
6272 
6273 	if (bserrno != 0) {
6274 		SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno);
6275 	}
6276 
6277 	assert(ctx != NULL);
6278 
6279 	if (bserrno != 0 && ctx->bserrno == 0) {
6280 		ctx->bserrno = bserrno;
6281 	}
6282 
6283 	ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno);
6284 	free(ctx);
6285 }
6286 
6287 static void
6288 delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno)
6289 {
6290 	struct delete_snapshot_ctx *ctx = cb_arg;
6291 
6292 	if (bserrno != 0) {
6293 		ctx->bserrno = bserrno;
6294 		SPDK_ERRLOG("Clone cleanup error %d\n", bserrno);
6295 	}
6296 
6297 	if (ctx->bserrno != 0) {
6298 		assert(blob_lookup(ctx->snapshot->bs, ctx->snapshot->id) == NULL);
6299 		TAILQ_INSERT_HEAD(&ctx->snapshot->bs->blobs, ctx->snapshot, link);
6300 		spdk_bit_array_set(ctx->snapshot->bs->open_blobids, ctx->snapshot->id);
6301 	}
6302 
6303 	ctx->snapshot->locked_operation_in_progress = false;
6304 	ctx->snapshot->md_ro = ctx->snapshot_md_ro;
6305 
6306 	spdk_blob_close(ctx->snapshot, delete_blob_cleanup_finish, ctx);
6307 }
6308 
6309 static void
6310 delete_snapshot_cleanup_clone(void *cb_arg, int bserrno)
6311 {
6312 	struct delete_snapshot_ctx *ctx = cb_arg;
6313 
6314 	ctx->clone->locked_operation_in_progress = false;
6315 	ctx->clone->md_ro = ctx->clone_md_ro;
6316 
6317 	spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
6318 }
6319 
6320 static void
6321 delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
6322 {
6323 	struct delete_snapshot_ctx *ctx = cb_arg;
6324 
6325 	if (bserrno) {
6326 		ctx->bserrno = bserrno;
6327 		delete_snapshot_cleanup_clone(ctx, 0);
6328 		return;
6329 	}
6330 
6331 	ctx->clone->locked_operation_in_progress = false;
6332 	spdk_blob_close(ctx->clone, delete_blob_cleanup_finish, ctx);
6333 }
6334 
6335 static void
6336 delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno)
6337 {
6338 	struct delete_snapshot_ctx *ctx = cb_arg;
6339 	struct spdk_blob_list *parent_snapshot_entry = NULL;
6340 	struct spdk_blob_list *snapshot_entry = NULL;
6341 	struct spdk_blob_list *clone_entry = NULL;
6342 	struct spdk_blob_list *snapshot_clone_entry = NULL;
6343 
6344 	if (bserrno) {
6345 		SPDK_ERRLOG("Failed to sync MD on blob\n");
6346 		ctx->bserrno = bserrno;
6347 		delete_snapshot_cleanup_clone(ctx, 0);
6348 		return;
6349 	}
6350 
6351 	/* Get snapshot entry for the snapshot we want to remove */
6352 	snapshot_entry = bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id);
6353 
6354 	assert(snapshot_entry != NULL);
6355 
6356 	/* Remove clone entry in this snapshot (at this point there can be only one clone) */
6357 	clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
6358 	assert(clone_entry != NULL);
6359 	TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
6360 	snapshot_entry->clone_count--;
6361 	assert(TAILQ_EMPTY(&snapshot_entry->clones));
6362 
6363 	if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) {
6364 		/* This snapshot is at the same time a clone of another snapshot - we need to
6365 		 * update parent snapshot (remove current clone, add new one inherited from
6366 		 * the snapshot that is being removed) */
6367 
6368 		/* Get snapshot entry for parent snapshot and clone entry within that snapshot for
6369 		 * snapshot that we are removing */
6370 		blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry,
6371 						    &snapshot_clone_entry);
6372 
6373 		/* Switch clone entry in parent snapshot */
6374 		TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link);
6375 		TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link);
6376 		free(snapshot_clone_entry);
6377 	} else {
6378 		/* No parent snapshot - just remove clone entry */
6379 		free(clone_entry);
6380 	}
6381 
6382 	/* Restore md_ro flags */
6383 	ctx->clone->md_ro = ctx->clone_md_ro;
6384 	ctx->snapshot->md_ro = ctx->snapshot_md_ro;
6385 
6386 	blob_unfreeze_io(ctx->clone, delete_snapshot_unfreeze_cpl, ctx);
6387 }
6388 
6389 static void
6390 delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno)
6391 {
6392 	struct delete_snapshot_ctx *ctx = cb_arg;
6393 	uint64_t i;
6394 
6395 	ctx->snapshot->md_ro = false;
6396 
6397 	if (bserrno) {
6398 		SPDK_ERRLOG("Failed to sync MD on clone\n");
6399 		ctx->bserrno = bserrno;
6400 
6401 		/* Restore snapshot to previous state */
6402 		bserrno = blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true);
6403 		if (bserrno != 0) {
6404 			delete_snapshot_cleanup_clone(ctx, bserrno);
6405 			return;
6406 		}
6407 
6408 		spdk_blob_sync_md(ctx->snapshot, delete_snapshot_cleanup_clone, ctx);
6409 		return;
6410 	}
6411 
6412 	/* Clear cluster map entries for snapshot */
6413 	for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
6414 		if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) {
6415 			ctx->snapshot->active.clusters[i] = 0;
6416 		}
6417 	}
6418 	for (i = 0; i < ctx->snapshot->active.num_extent_pages &&
6419 	     i < ctx->clone->active.num_extent_pages; i++) {
6420 		if (ctx->clone->active.extent_pages[i] == ctx->snapshot->active.extent_pages[i]) {
6421 			ctx->snapshot->active.extent_pages[i] = 0;
6422 		}
6423 	}
6424 
6425 	blob_set_thin_provision(ctx->snapshot);
6426 	ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY;
6427 
6428 	if (ctx->parent_snapshot_entry != NULL) {
6429 		ctx->snapshot->back_bs_dev = NULL;
6430 	}
6431 
6432 	spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_cpl, ctx);
6433 }
6434 
6435 static void
6436 delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno)
6437 {
6438 	struct delete_snapshot_ctx *ctx = cb_arg;
6439 	uint64_t i;
6440 
6441 	/* Temporarily override md_ro flag for clone for MD modification */
6442 	ctx->clone_md_ro = ctx->clone->md_ro;
6443 	ctx->clone->md_ro = false;
6444 
6445 	if (bserrno) {
6446 		SPDK_ERRLOG("Failed to sync MD with xattr on blob\n");
6447 		ctx->bserrno = bserrno;
6448 		delete_snapshot_cleanup_clone(ctx, 0);
6449 		return;
6450 	}
6451 
6452 	/* Copy snapshot map to clone map (only unallocated clusters in clone) */
6453 	for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
6454 		if (ctx->clone->active.clusters[i] == 0) {
6455 			ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i];
6456 		}
6457 	}
6458 	for (i = 0; i < ctx->snapshot->active.num_extent_pages &&
6459 	     i < ctx->clone->active.num_extent_pages; i++) {
6460 		if (ctx->clone->active.extent_pages[i] == 0) {
6461 			ctx->clone->active.extent_pages[i] = ctx->snapshot->active.extent_pages[i];
6462 		}
6463 	}
6464 
6465 	/* Delete old backing bs_dev from clone (related to snapshot that will be removed) */
6466 	ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev);
6467 
6468 	/* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */
6469 	if (ctx->parent_snapshot_entry != NULL) {
6470 		/* ...to parent snapshot */
6471 		ctx->clone->parent_id = ctx->parent_snapshot_entry->id;
6472 		ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev;
6473 		blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id,
6474 			       sizeof(spdk_blob_id),
6475 			       true);
6476 	} else {
6477 		/* ...to blobid invalid and zeroes dev */
6478 		ctx->clone->parent_id = SPDK_BLOBID_INVALID;
6479 		ctx->clone->back_bs_dev = bs_create_zeroes_dev();
6480 		blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true);
6481 	}
6482 
6483 	spdk_blob_sync_md(ctx->clone, delete_snapshot_sync_clone_cpl, ctx);
6484 }
6485 
6486 static void
6487 delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno)
6488 {
6489 	struct delete_snapshot_ctx *ctx = cb_arg;
6490 
6491 	if (bserrno) {
6492 		SPDK_ERRLOG("Failed to freeze I/O on clone\n");
6493 		ctx->bserrno = bserrno;
6494 		delete_snapshot_cleanup_clone(ctx, 0);
6495 		return;
6496 	}
6497 
6498 	/* Temporarily override md_ro flag for snapshot for MD modification */
6499 	ctx->snapshot_md_ro = ctx->snapshot->md_ro;
6500 	ctx->snapshot->md_ro = false;
6501 
6502 	/* Mark the blob as pending removal for power failure safety; use the clone id for recovery */
6503 	ctx->bserrno = blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id,
6504 				      sizeof(spdk_blob_id), true);
6505 	if (ctx->bserrno != 0) {
6506 		delete_snapshot_cleanup_clone(ctx, 0);
6507 		return;
6508 	}
6509 
6510 	spdk_blob_sync_md(ctx->snapshot, delete_snapshot_sync_snapshot_xattr_cpl, ctx);
6511 }
6512 
6513 static void
6514 delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno)
6515 {
6516 	struct delete_snapshot_ctx *ctx = cb_arg;
6517 
6518 	if (bserrno) {
6519 		SPDK_ERRLOG("Failed to open clone\n");
6520 		ctx->bserrno = bserrno;
6521 		delete_snapshot_cleanup_snapshot(ctx, 0);
6522 		return;
6523 	}
6524 
6525 	ctx->clone = clone;
6526 
6527 	if (clone->locked_operation_in_progress) {
6528 		SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress on its clone\n");
6529 		ctx->bserrno = -EBUSY;
6530 		spdk_blob_close(ctx->clone, delete_snapshot_cleanup_snapshot, ctx);
6531 		return;
6532 	}
6533 
6534 	clone->locked_operation_in_progress = true;
6535 
6536 	blob_freeze_io(clone, delete_snapshot_freeze_io_cb, ctx);
6537 }
6538 
6539 static void
6540 update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx)
6541 {
6542 	struct spdk_blob_list *snapshot_entry = NULL;
6543 	struct spdk_blob_list *clone_entry = NULL;
6544 	struct spdk_blob_list *snapshot_clone_entry = NULL;
6545 
6546 	/* Get snapshot entry for the snapshot we want to remove */
6547 	snapshot_entry = bs_get_snapshot_entry(snapshot->bs, snapshot->id);
6548 
6549 	assert(snapshot_entry != NULL);
6550 
6551 	/* Get clone of the snapshot (at this point there can be only one clone) */
6552 	clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
6553 	assert(snapshot_entry->clone_count == 1);
6554 	assert(clone_entry != NULL);
6555 
6556 	/* Get snapshot entry for parent snapshot and clone entry within that snapshot for
6557 	 * snapshot that we are removing */
6558 	blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry,
6559 					    &snapshot_clone_entry);
6560 
6561 	spdk_bs_open_blob(snapshot->bs, clone_entry->id, delete_snapshot_open_clone_cb, ctx);
6562 }
6563 
6564 static void
6565 bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno)
6566 {
6567 	spdk_bs_sequence_t *seq = cb_arg;
6568 	struct spdk_blob_list *snapshot_entry = NULL;
6569 	uint32_t page_num;
6570 
6571 	if (bserrno) {
6572 		SPDK_ERRLOG("Failed to remove blob\n");
6573 		bs_sequence_finish(seq, bserrno);
6574 		return;
6575 	}
6576 
6577 	/* Remove snapshot from the list */
6578 	snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
6579 	if (snapshot_entry != NULL) {
6580 		TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link);
6581 		free(snapshot_entry);
6582 	}
6583 
6584 	page_num = bs_blobid_to_page(blob->id);
6585 	spdk_bit_array_clear(blob->bs->used_blobids, page_num);
6586 	blob->state = SPDK_BLOB_STATE_DIRTY;
6587 	blob->active.num_pages = 0;
6588 	blob_resize(blob, 0);
6589 
6590 	blob_persist(seq, blob, bs_delete_persist_cpl, blob);
6591 }
6592 
6593 static int
6594 bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone)
6595 {
6596 	struct spdk_blob_list *snapshot_entry = NULL;
6597 	struct spdk_blob_list *clone_entry = NULL;
6598 	struct spdk_blob *clone = NULL;
6599 	bool has_one_clone = false;
6600 
6601 	/* Check if this is a snapshot with clones */
6602 	snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
6603 	if (snapshot_entry != NULL) {
6604 		if (snapshot_entry->clone_count > 1) {
6605 			SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n");
6606 			return -EBUSY;
6607 		} else if (snapshot_entry->clone_count == 1) {
6608 			has_one_clone = true;
6609 		}
6610 	}
6611 
6612 	/* Check if someone has this blob open (besides this delete context):
6613 	 * - open_ref == 1 - only this context opened the blob, so it is ok to remove it
6614 	 * - open_ref <= 2 && has_one_clone == true - the clone is holding the snapshot open
6615 	 *	and that is ok, because we will update it accordingly */
6616 	if (blob->open_ref <= 2 && has_one_clone) {
6617 		clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
6618 		assert(clone_entry != NULL);
6619 		clone = blob_lookup(blob->bs, clone_entry->id);
6620 
6621 		if (blob->open_ref == 2 && clone == NULL) {
6622 			/* Clone is closed and someone else opened this blob */
6623 			SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
6624 			return -EBUSY;
6625 		}
6626 
6627 		*update_clone = true;
6628 		return 0;
6629 	}
6630 
6631 	if (blob->open_ref > 1) {
6632 		SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
6633 		return -EBUSY;
6634 	}
6635 
6636 	assert(has_one_clone == false);
6637 	*update_clone = false;
6638 	return 0;
6639 }
6640 
6641 static void
6642 bs_delete_enomem_close_cpl(void *cb_arg, int bserrno)
6643 {
6644 	spdk_bs_sequence_t *seq = cb_arg;
6645 
6646 	bs_sequence_finish(seq, -ENOMEM);
6647 }
6648 
6649 static void
6650 bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno)
6651 {
6652 	spdk_bs_sequence_t *seq = cb_arg;
6653 	struct delete_snapshot_ctx *ctx;
6654 	bool update_clone = false;
6655 
6656 	if (bserrno != 0) {
6657 		bs_sequence_finish(seq, bserrno);
6658 		return;
6659 	}
6660 
6661 	blob_verify_md_op(blob);
6662 
6663 	ctx = calloc(1, sizeof(*ctx));
6664 	if (ctx == NULL) {
6665 		spdk_blob_close(blob, bs_delete_enomem_close_cpl, seq);
6666 		return;
6667 	}
6668 
6669 	ctx->snapshot = blob;
6670 	ctx->cb_fn = bs_delete_blob_finish;
6671 	ctx->cb_arg = seq;
6672 
6673 	/* Check if blob can be removed and if it is a snapshot with clone on top of it */
6674 	ctx->bserrno = bs_is_blob_deletable(blob, &update_clone);
6675 	if (ctx->bserrno) {
6676 		spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
6677 		return;
6678 	}
6679 
6680 	if (blob->locked_operation_in_progress) {
6681 		SPDK_DEBUGLOG(blob, "Cannot remove blob - another operation in progress\n");
6682 		ctx->bserrno = -EBUSY;
6683 		spdk_blob_close(blob, delete_blob_cleanup_finish, ctx);
6684 		return;
6685 	}
6686 
6687 	blob->locked_operation_in_progress = true;
6688 
6689 	/*
6690 	 * Remove the blob from the blob_store list now, to ensure it does not
6691 	 *  get returned after this point by blob_lookup().
6692 	 */
6693 	spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
6694 	TAILQ_REMOVE(&blob->bs->blobs, blob, link);
6695 
6696 	if (update_clone) {
6697 		/* This blob is a snapshot with active clone - update clone first */
6698 		update_clone_on_snapshot_deletion(blob, ctx);
6699 	} else {
6700 		/* This blob does not have any clones - just remove it */
6701 		bs_blob_list_remove(blob);
6702 		bs_delete_blob_finish(seq, blob, 0);
6703 		free(ctx);
6704 	}
6705 }
6706 
6707 void
6708 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
6709 		    spdk_blob_op_complete cb_fn, void *cb_arg)
6710 {
6711 	struct spdk_bs_cpl	cpl;
6712 	spdk_bs_sequence_t	*seq;
6713 
6714 	SPDK_DEBUGLOG(blob, "Deleting blob %" PRIu64 "\n", blobid);
6715 
6716 	assert(spdk_get_thread() == bs->md_thread);
6717 
6718 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6719 	cpl.u.blob_basic.cb_fn = cb_fn;
6720 	cpl.u.blob_basic.cb_arg = cb_arg;
6721 
6722 	seq = bs_sequence_start(bs->md_channel, &cpl);
6723 	if (!seq) {
6724 		cb_fn(cb_arg, -ENOMEM);
6725 		return;
6726 	}
6727 
6728 	spdk_bs_open_blob(bs, blobid, bs_delete_open_cpl, seq);
6729 }
6730 
6731 /* END spdk_bs_delete_blob */
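
/*
 * Usage sketch (placeholder names, not part of the implementation): deleting a
 * blob by ID from the metadata thread. Deleting a snapshot is only allowed
 * when it has at most one clone; in that case the clone is rewritten first, as
 * implemented above.
 *
 *	static void
 *	example_delete_done(void *cb_arg, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("delete failed: %d\n", bserrno);
 *		}
 *	}
 *
 *	static void
 *	example_delete(struct spdk_blob_store *bs, spdk_blob_id blobid)
 *	{
 *		spdk_bs_delete_blob(bs, blobid, example_delete_done, NULL);
 *	}
 */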
6732 
6733 /* START spdk_bs_open_blob */
6734 
6735 static void
6736 bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
6737 {
6738 	struct spdk_blob *blob = cb_arg;
6739 	struct spdk_blob *existing;
6740 
6741 	if (bserrno != 0) {
6742 		blob_free(blob);
6743 		seq->cpl.u.blob_handle.blob = NULL;
6744 		bs_sequence_finish(seq, bserrno);
6745 		return;
6746 	}
6747 
6748 	existing = blob_lookup(blob->bs, blob->id);
6749 	if (existing) {
6750 		blob_free(blob);
6751 		existing->open_ref++;
6752 		seq->cpl.u.blob_handle.blob = existing;
6753 		bs_sequence_finish(seq, 0);
6754 		return;
6755 	}
6756 
6757 	blob->open_ref++;
6758 
6759 	spdk_bit_array_set(blob->bs->open_blobids, blob->id);
6760 	TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link);
6761 
6762 	bs_sequence_finish(seq, bserrno);
6763 }
6764 
6765 static void
6766 bs_open_blob(struct spdk_blob_store *bs,
6767 	     spdk_blob_id blobid,
6768 	     struct spdk_blob_open_opts *opts,
6769 	     spdk_blob_op_with_handle_complete cb_fn,
6770 	     void *cb_arg)
6771 {
6772 	struct spdk_blob		*blob;
6773 	struct spdk_bs_cpl		cpl;
6774 	struct spdk_blob_open_opts	opts_default;
6775 	spdk_bs_sequence_t		*seq;
6776 	uint32_t			page_num;
6777 
6778 	SPDK_DEBUGLOG(blob, "Opening blob %" PRIu64 "\n", blobid);
6779 	assert(spdk_get_thread() == bs->md_thread);
6780 
6781 	page_num = bs_blobid_to_page(blobid);
6782 	if (spdk_bit_array_get(bs->used_blobids, page_num) == false) {
6783 		/* Invalid blobid */
6784 		cb_fn(cb_arg, NULL, -ENOENT);
6785 		return;
6786 	}
6787 
6788 	blob = blob_lookup(bs, blobid);
6789 	if (blob) {
6790 		blob->open_ref++;
6791 		cb_fn(cb_arg, blob, 0);
6792 		return;
6793 	}
6794 
6795 	blob = blob_alloc(bs, blobid);
6796 	if (!blob) {
6797 		cb_fn(cb_arg, NULL, -ENOMEM);
6798 		return;
6799 	}
6800 
6801 	if (!opts) {
6802 		spdk_blob_open_opts_init(&opts_default);
6803 		opts = &opts_default;
6804 	}
6805 
6806 	blob->clear_method = opts->clear_method;
6807 
6808 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE;
6809 	cpl.u.blob_handle.cb_fn = cb_fn;
6810 	cpl.u.blob_handle.cb_arg = cb_arg;
6811 	cpl.u.blob_handle.blob = blob;
6812 
6813 	seq = bs_sequence_start(bs->md_channel, &cpl);
6814 	if (!seq) {
6815 		blob_free(blob);
6816 		cb_fn(cb_arg, NULL, -ENOMEM);
6817 		return;
6818 	}
6819 
6820 	blob_load(seq, blob, bs_open_blob_cpl, blob);
6821 }
6822 
6823 void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
6824 		       spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
6825 {
6826 	bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg);
6827 }
6828 
6829 void spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid,
6830 			   struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
6831 {
6832 	bs_open_blob(bs, blobid, opts, cb_fn, cb_arg);
6833 }
6834 
6835 /* END spdk_bs_open_blob */
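
/*
 * Usage sketch (placeholder names, not part of the implementation): opening a
 * blob. Opening an already open blob just bumps its reference count and
 * returns the same handle, so every successful open must be paired with an
 * spdk_blob_close().
 *
 *	static void
 *	example_open_done(void *cb_arg, struct spdk_blob *blob, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("open failed: %d\n", bserrno);
 *			return;
 *		}
 *		SPDK_NOTICELOG("opened blob %" PRIu64 "\n", spdk_blob_get_id(blob));
 *	}
 *
 *	static void
 *	example_open(struct spdk_blob_store *bs, spdk_blob_id blobid)
 *	{
 *		spdk_bs_open_blob(bs, blobid, example_open_done, NULL);
 *	}
 */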
6836 
6837 /* START spdk_blob_set_read_only */
6838 int spdk_blob_set_read_only(struct spdk_blob *blob)
6839 {
6840 	blob_verify_md_op(blob);
6841 
6842 	blob->data_ro_flags |= SPDK_BLOB_READ_ONLY;
6843 
6844 	blob->state = SPDK_BLOB_STATE_DIRTY;
6845 	return 0;
6846 }
6847 /* END spdk_blob_set_read_only */
6848 
6849 /* START spdk_blob_sync_md */
6850 
6851 static void
6852 blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
6853 {
6854 	struct spdk_blob *blob = cb_arg;
6855 
6856 	if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
6857 		blob->data_ro = true;
6858 		blob->md_ro = true;
6859 	}
6860 
6861 	bs_sequence_finish(seq, bserrno);
6862 }
6863 
6864 static void
6865 blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
6866 {
6867 	struct spdk_bs_cpl	cpl;
6868 	spdk_bs_sequence_t	*seq;
6869 
6870 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6871 	cpl.u.blob_basic.cb_fn = cb_fn;
6872 	cpl.u.blob_basic.cb_arg = cb_arg;
6873 
6874 	seq = bs_sequence_start(blob->bs->md_channel, &cpl);
6875 	if (!seq) {
6876 		cb_fn(cb_arg, -ENOMEM);
6877 		return;
6878 	}
6879 
6880 	blob_persist(seq, blob, blob_sync_md_cpl, blob);
6881 }
6882 
6883 void
6884 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
6885 {
6886 	blob_verify_md_op(blob);
6887 
6888 	SPDK_DEBUGLOG(blob, "Syncing blob %" PRIu64 "\n", blob->id);
6889 
6890 	if (blob->md_ro) {
6891 		assert(blob->state == SPDK_BLOB_STATE_CLEAN);
6892 		cb_fn(cb_arg, 0);
6893 		return;
6894 	}
6895 
6896 	blob_sync_md(blob, cb_fn, cb_arg);
6897 }
6898 
6899 /* END spdk_blob_sync_md */
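
/*
 * Usage sketch (placeholder names, not part of the implementation): making a
 * blob read-only. spdk_blob_set_read_only() only sets the flag in memory;
 * spdk_blob_sync_md() persists it, and data_ro/md_ro take effect in the sync
 * completion (see blob_sync_md_cpl above).
 *
 *	static void
 *	example_ro_done(void *cb_arg, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("sync_md failed: %d\n", bserrno);
 *		}
 *	}
 *
 *	static void
 *	example_make_read_only(struct spdk_blob *blob)
 *	{
 *		spdk_blob_set_read_only(blob);
 *		spdk_blob_sync_md(blob, example_ro_done, NULL);
 *	}
 */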
6900 
6901 struct spdk_blob_insert_cluster_ctx {
6902 	struct spdk_thread	*thread;
6903 	struct spdk_blob	*blob;
6904 	uint32_t		cluster_num;	/* cluster index in blob */
6905 	uint32_t		cluster;	/* cluster on disk */
6906 	uint32_t		extent_page;	/* extent page on disk */
6907 	int			rc;
6908 	spdk_blob_op_complete	cb_fn;
6909 	void			*cb_arg;
6910 };
6911 
6912 static void
6913 blob_insert_cluster_msg_cpl(void *arg)
6914 {
6915 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
6916 
6917 	ctx->cb_fn(ctx->cb_arg, ctx->rc);
6918 	free(ctx);
6919 }
6920 
6921 static void
6922 blob_insert_cluster_msg_cb(void *arg, int bserrno)
6923 {
6924 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
6925 
6926 	ctx->rc = bserrno;
6927 	spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx);
6928 }
6929 
6930 static void
6931 blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
6932 {
6933 	struct spdk_blob_md_page        *page = cb_arg;
6934 
6935 	bs_sequence_finish(seq, bserrno);
6936 	spdk_free(page);
6937 }
6938 
6939 static void
6940 blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
6941 		   spdk_blob_op_complete cb_fn, void *cb_arg)
6942 {
6943 	spdk_bs_sequence_t		*seq;
6944 	struct spdk_bs_cpl		cpl;
6945 	struct spdk_blob_md_page	*page = NULL;
6946 	uint32_t			page_count = 0;
6947 	int				rc;
6948 
6949 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6950 	cpl.u.blob_basic.cb_fn = cb_fn;
6951 	cpl.u.blob_basic.cb_arg = cb_arg;
6952 
6953 	seq = bs_sequence_start(blob->bs->md_channel, &cpl);
6954 	if (!seq) {
6955 		cb_fn(cb_arg, -ENOMEM);
6956 		return;
6957 	}
6958 	rc = blob_serialize_add_page(blob, &page, &page_count, &page);
6959 	if (rc < 0) {
6960 		bs_sequence_finish(seq, rc);
6961 		return;
6962 	}
6963 
6964 	blob_serialize_extent_page(blob, cluster_num, page);
6965 
6966 	page->crc = blob_md_page_calc_crc(page);
6967 
6968 	assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true);
6969 
6970 	bs_sequence_write_dev(seq, page, bs_md_page_to_lba(blob->bs, extent),
6971 			      bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
6972 			      blob_persist_extent_page_cpl, page);
6973 }
6974 
6975 static void
6976 blob_insert_cluster_msg(void *arg)
6977 {
6978 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
6979 	uint32_t *extent_page;
6980 
6981 	ctx->rc = blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster);
6982 	if (ctx->rc != 0) {
6983 		spdk_thread_send_msg(ctx->thread, blob_insert_cluster_msg_cpl, ctx);
6984 		return;
6985 	}
6986 
6987 	if (ctx->blob->use_extent_table == false) {
6988 		/* Extent table is not used, proceed with sync of md that will only use extents_rle. */
6989 		ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
6990 		blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx);
6991 		return;
6992 	}
6993 
6994 	extent_page = bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
6995 	if (*extent_page == 0) {
6996 		/* Extent page requires allocation.
6997 		 * It was already claimed in the used_md_pages map and placed in ctx.
6998 		 * Blob persist will take care of writing out the new extent page to disk. */
6999 		assert(ctx->extent_page != 0);
7000 		assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
7001 		*extent_page = ctx->extent_page;
7002 		ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
7003 		blob_sync_md(ctx->blob, blob_insert_cluster_msg_cb, ctx);
7004 	} else {
7005 		/* It is possible for the original thread to have allocated an extent page for
7006 		 * a different cluster in the same extent page. In such a case, proceed with
7007 		 * updating the existing extent page, but release the additional one. */
7008 		if (ctx->extent_page != 0) {
7009 			assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
7010 			bs_release_md_page(ctx->blob->bs, ctx->extent_page);
7011 			ctx->extent_page = 0;
7012 		}
7013 		/* Extent page already allocated.
7014 		 * Every cluster allocation requires just an update of a single extent page. */
7015 		blob_insert_extent(ctx->blob, *extent_page, ctx->cluster_num,
7016 				   blob_insert_cluster_msg_cb, ctx);
7017 	}
7018 }
7019 
7020 static void
7021 blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
7022 				 uint64_t cluster, uint32_t extent_page, spdk_blob_op_complete cb_fn, void *cb_arg)
7023 {
7024 	struct spdk_blob_insert_cluster_ctx *ctx;
7025 
7026 	ctx = calloc(1, sizeof(*ctx));
7027 	if (ctx == NULL) {
7028 		cb_fn(cb_arg, -ENOMEM);
7029 		return;
7030 	}
7031 
7032 	ctx->thread = spdk_get_thread();
7033 	ctx->blob = blob;
7034 	ctx->cluster_num = cluster_num;
7035 	ctx->cluster = cluster;
7036 	ctx->extent_page = extent_page;
7037 	ctx->cb_fn = cb_fn;
7038 	ctx->cb_arg = cb_arg;
7039 
7040 	spdk_thread_send_msg(blob->bs->md_thread, blob_insert_cluster_msg, ctx);
7041 }
7042 
7043 /* START spdk_blob_close */
7044 
7045 static void
7046 blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
7047 {
7048 	struct spdk_blob *blob = cb_arg;
7049 
7050 	if (bserrno == 0) {
7051 		blob->open_ref--;
7052 		if (blob->open_ref == 0) {
7053 			/*
7054 			 * Blobs with active.num_pages == 0 are deleted blobs.
7055 			 *  these blobs are removed from the blob_store list
7056 			 *  These blobs are removed from the blob_store list
7057 			 *  remove them again.
7058 			 */
7059 			if (blob->active.num_pages > 0) {
7060 				spdk_bit_array_clear(blob->bs->open_blobids, blob->id);
7061 				TAILQ_REMOVE(&blob->bs->blobs, blob, link);
7062 			}
7063 			blob_free(blob);
7064 		}
7065 	}
7066 
7067 	bs_sequence_finish(seq, bserrno);
7068 }
7069 
7070 void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
7071 {
7072 	struct spdk_bs_cpl	cpl;
7073 	spdk_bs_sequence_t	*seq;
7074 
7075 	blob_verify_md_op(blob);
7076 
7077 	SPDK_DEBUGLOG(blob, "Closing blob %" PRIu64 "\n", blob->id);
7078 
7079 	if (blob->open_ref == 0) {
7080 		cb_fn(cb_arg, -EBADF);
7081 		return;
7082 	}
7083 
7084 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
7085 	cpl.u.blob_basic.cb_fn = cb_fn;
7086 	cpl.u.blob_basic.cb_arg = cb_arg;
7087 
7088 	seq = bs_sequence_start(blob->bs->md_channel, &cpl);
7089 	if (!seq) {
7090 		cb_fn(cb_arg, -ENOMEM);
7091 		return;
7092 	}
7093 
7094 	/* Sync metadata */
7095 	blob_persist(seq, blob, blob_close_cpl, blob);
7096 }
7097 
7098 /* END spdk_blob_close */
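
/*
 * Usage sketch (placeholder names, not part of the implementation): closing a
 * blob on the metadata thread. Close drops one reference; when the last
 * reference goes away the metadata is persisted and the in-memory blob is
 * freed.
 *
 *	static void
 *	example_close_done(void *cb_arg, int bserrno)
 *	{
 *		if (bserrno != 0) {
 *			SPDK_ERRLOG("close failed: %d\n", bserrno);
 *		}
 *	}
 *
 *	static void
 *	example_close(struct spdk_blob *blob)
 *	{
 *		spdk_blob_close(blob, example_close_done, NULL);
 *	}
 */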
7099 
7100 struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs)
7101 {
7102 	return spdk_get_io_channel(bs);
7103 }
7104 
7105 void spdk_bs_free_io_channel(struct spdk_io_channel *channel)
7106 {
7107 	spdk_put_io_channel(channel);
7108 }
7109 
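/* The spdk_blob_io_*() calls below are thin wrappers that submit blob I/O on the
 * given channel.  Each forwards to blob_request_submit_op() (or
 * blob_request_submit_rw_iov() for the vectored variants) with the matching
 * operation type; offset and length are expressed in io_units. */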
7110 void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel,
7111 			uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
7112 {
7113 	blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
7114 			       SPDK_BLOB_UNMAP);
7115 }
7116 
7117 void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel,
7118 			       uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
7119 {
7120 	blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
7121 			       SPDK_BLOB_WRITE_ZEROES);
7122 }
7123 
7124 void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel,
7125 			void *payload, uint64_t offset, uint64_t length,
7126 			spdk_blob_op_complete cb_fn, void *cb_arg)
7127 {
7128 	blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
7129 			       SPDK_BLOB_WRITE);
7130 }
7131 
7132 void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel,
7133 		       void *payload, uint64_t offset, uint64_t length,
7134 		       spdk_blob_op_complete cb_fn, void *cb_arg)
7135 {
7136 	blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
7137 			       SPDK_BLOB_READ);
7138 }
7139 
7140 void spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel,
7141 			 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
7142 			 spdk_blob_op_complete cb_fn, void *cb_arg)
7143 {
7144 	blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false);
7145 }
7146 
7147 void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel,
7148 			struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
7149 			spdk_blob_op_complete cb_fn, void *cb_arg)
7150 {
7151 	blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true);
7152 }
7153 
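/* Context carried across the asynchronous steps of blob iteration
 * (spdk_bs_iter_first()/spdk_bs_iter_next()).  page_num tracks the metadata page of
 * the blob most recently visited in the used_blobids bit array. */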
7154 struct spdk_bs_iter_ctx {
7155 	int64_t page_num;
7156 	struct spdk_blob_store *bs;
7157 
7158 	spdk_blob_op_with_handle_complete cb_fn;
7159 	void *cb_arg;
7160 };
7161 
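/* Open-blob completion shared by the iterator paths.  On success the newly opened
 * blob is handed to the user callback and the context is freed.  On any failure
 * (including the sentinel -1 used to start or advance iteration) the next set bit
 * in used_blobids is located and that blob is opened; -ENOENT is reported once no
 * allocated blobs remain. */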
7162 static void
7163 bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
7164 {
7165 	struct spdk_bs_iter_ctx *ctx = cb_arg;
7166 	struct spdk_blob_store *bs = ctx->bs;
7167 	spdk_blob_id id;
7168 
7169 	if (bserrno == 0) {
7170 		ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
7171 		free(ctx);
7172 		return;
7173 	}
7174 
7175 	ctx->page_num++;
7176 	ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
7177 	if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
7178 		ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
7179 		free(ctx);
7180 		return;
7181 	}
7182 
7183 	id = bs_page_to_blobid(ctx->page_num);
7184 
7185 	spdk_bs_open_blob(bs, id, bs_iter_cpl, ctx);
7186 }
7187 
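/* Begin iterating over all blobs in the blobstore.  The first allocated blob, if
 * any, is opened and passed to cb_fn; iteration continues by handing that open blob
 * to spdk_bs_iter_next() from within the callback, and cb_fn receives -ENOENT once
 * no blobs remain.  A minimal usage sketch (callback and variable names are
 * illustrative only):
 *
 *	static void
 *	iter_cb(void *arg, struct spdk_blob *blob, int bserrno)
 *	{
 *		struct spdk_blob_store *bs = arg;
 *
 *		if (bserrno != 0) {
 *			return;
 *		}
 *		... inspect blob ...
 *		spdk_bs_iter_next(bs, blob, iter_cb, bs);
 *	}
 *
 *	spdk_bs_iter_first(bs, iter_cb, bs);
 */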
7188 void
7189 spdk_bs_iter_first(struct spdk_blob_store *bs,
7190 		   spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7191 {
7192 	struct spdk_bs_iter_ctx *ctx;
7193 
7194 	ctx = calloc(1, sizeof(*ctx));
7195 	if (!ctx) {
7196 		cb_fn(cb_arg, NULL, -ENOMEM);
7197 		return;
7198 	}
7199 
7200 	ctx->page_num = -1;
7201 	ctx->bs = bs;
7202 	ctx->cb_fn = cb_fn;
7203 	ctx->cb_arg = cb_arg;
7204 
7205 	bs_iter_cpl(ctx, NULL, -1);
7206 }
7207 
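/* Completion for the spdk_blob_close() issued by spdk_bs_iter_next().  The close
 * status is ignored; bs_iter_cpl() is re-entered with a nonzero bserrno so that
 * iteration advances to the next allocated blob. */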
7208 static void
7209 bs_iter_close_cpl(void *cb_arg, int bserrno)
7210 {
7211 	struct spdk_bs_iter_ctx *ctx = cb_arg;
7212 
7213 	bs_iter_cpl(ctx, NULL, -1);
7214 }
7215 
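/* Advance iteration: remember the metadata page of the current blob, close it, and,
 * from the close completion, open the next allocated blob (or report -ENOENT when
 * none remain). */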
7216 void
7217 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
7218 		  spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
7219 {
7220 	struct spdk_bs_iter_ctx *ctx;
7221 
7222 	assert(blob != NULL);
7223 
7224 	ctx = calloc(1, sizeof(*ctx));
7225 	if (!ctx) {
7226 		cb_fn(cb_arg, NULL, -ENOMEM);
7227 		return;
7228 	}
7229 
7230 	ctx->page_num = bs_blobid_to_page(blob->id);
7231 	ctx->bs = bs;
7232 	ctx->cb_fn = cb_fn;
7233 	ctx->cb_arg = cb_arg;
7234 
7235 	/* Close the existing blob */
7236 	spdk_blob_close(blob, bs_iter_close_cpl, ctx);
7237 }
7238 
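/* Set or replace an extended attribute.  Fails with -EPERM if the blob's metadata
 * is read-only and with -ENOMEM if the resulting descriptor would not fit into a
 * single metadata page.  When internal is true the attribute is kept on the
 * internal xattr list and SPDK_BLOB_INTERNAL_XATTR is set.  The blob is marked
 * dirty so the change is written out on the next metadata sync. */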
7239 static int
7240 blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
7241 	       uint16_t value_len, bool internal)
7242 {
7243 	struct spdk_xattr_tailq *xattrs;
7244 	struct spdk_xattr	*xattr;
7245 	size_t			desc_size;
7246 	void			*tmp;
7247 
7248 	blob_verify_md_op(blob);
7249 
7250 	if (blob->md_ro) {
7251 		return -EPERM;
7252 	}
7253 
7254 	desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
7255 	if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
		SPDK_DEBUGLOG(blob, "Xattr '%s' of size %zu does not fit into a single page (max %zu)\n", name,
7257 			      desc_size, SPDK_BS_MAX_DESC_SIZE);
7258 		return -ENOMEM;
7259 	}
7260 
7261 	if (internal) {
7262 		xattrs = &blob->xattrs_internal;
7263 		blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
7264 	} else {
7265 		xattrs = &blob->xattrs;
7266 	}
7267 
7268 	TAILQ_FOREACH(xattr, xattrs, link) {
7269 		if (!strcmp(name, xattr->name)) {
7270 			tmp = malloc(value_len);
7271 			if (!tmp) {
7272 				return -ENOMEM;
7273 			}
7274 
7275 			free(xattr->value);
7276 			xattr->value_len = value_len;
7277 			xattr->value = tmp;
7278 			memcpy(xattr->value, value, value_len);
7279 
7280 			blob->state = SPDK_BLOB_STATE_DIRTY;
7281 
7282 			return 0;
7283 		}
7284 	}
7285 
7286 	xattr = calloc(1, sizeof(*xattr));
7287 	if (!xattr) {
7288 		return -ENOMEM;
7289 	}
7290 
7291 	xattr->name = strdup(name);
7292 	if (!xattr->name) {
7293 		free(xattr);
7294 		return -ENOMEM;
7295 	}
7296 
7297 	xattr->value_len = value_len;
7298 	xattr->value = malloc(value_len);
7299 	if (!xattr->value) {
7300 		free(xattr->name);
7301 		free(xattr);
7302 		return -ENOMEM;
7303 	}
7304 	memcpy(xattr->value, value, value_len);
7305 	TAILQ_INSERT_TAIL(xattrs, xattr, link);
7306 
7307 	blob->state = SPDK_BLOB_STATE_DIRTY;
7308 
7309 	return 0;
7310 }
7311 
7312 int
7313 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
7314 		    uint16_t value_len)
7315 {
7316 	return blob_set_xattr(blob, name, value, value_len, false);
7317 }
7318 
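/* Remove an extended attribute by name from the public or internal list.  The
 * SPDK_BLOB_INTERNAL_XATTR flag is cleared once the internal list becomes empty.
 * Returns -EPERM for metadata read-only blobs and -ENOENT if the attribute does
 * not exist. */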
7319 static int
7320 blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal)
7321 {
7322 	struct spdk_xattr_tailq *xattrs;
7323 	struct spdk_xattr	*xattr;
7324 
7325 	blob_verify_md_op(blob);
7326 
7327 	if (blob->md_ro) {
7328 		return -EPERM;
7329 	}
7330 	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
7331 
7332 	TAILQ_FOREACH(xattr, xattrs, link) {
7333 		if (!strcmp(name, xattr->name)) {
7334 			TAILQ_REMOVE(xattrs, xattr, link);
7335 			free(xattr->value);
7336 			free(xattr->name);
7337 			free(xattr);
7338 
7339 			if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) {
7340 				blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR;
7341 			}
7342 			blob->state = SPDK_BLOB_STATE_DIRTY;
7343 
7344 			return 0;
7345 		}
7346 	}
7347 
7348 	return -ENOENT;
7349 }
7350 
7351 int
7352 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name)
7353 {
7354 	return blob_remove_xattr(blob, name, false);
7355 }
7356 
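/* Look up an extended attribute by name.  On success *value points at the blob's
 * in-memory copy of the data (nothing is duplicated) and *value_len is set;
 * returns -ENOENT if the attribute is not found. */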
7357 static int
7358 blob_get_xattr_value(struct spdk_blob *blob, const char *name,
7359 		     const void **value, size_t *value_len, bool internal)
7360 {
7361 	struct spdk_xattr	*xattr;
7362 	struct spdk_xattr_tailq *xattrs;
7363 
7364 	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
7365 
7366 	TAILQ_FOREACH(xattr, xattrs, link) {
7367 		if (!strcmp(name, xattr->name)) {
7368 			*value = xattr->value;
7369 			*value_len = xattr->value_len;
7370 			return 0;
7371 		}
7372 	}
7373 	return -ENOENT;
7374 }
7375 
7376 int
7377 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
7378 			  const void **value, size_t *value_len)
7379 {
7380 	blob_verify_md_op(blob);
7381 
7382 	return blob_get_xattr_value(blob, name, value, value_len, false);
7383 }
7384 
7385 struct spdk_xattr_names {
7386 	uint32_t	count;
7387 	const char	*names[0];
7388 };
7389 
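/* Build an spdk_xattr_names structure listing every attribute on the given queue.
 * The structure only references the name strings owned by the blob (it does not
 * copy them) and must be released with spdk_xattr_names_free(). */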
7390 static int
7391 blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names)
7392 {
7393 	struct spdk_xattr	*xattr;
7394 	int			count = 0;
7395 
7396 	TAILQ_FOREACH(xattr, xattrs, link) {
7397 		count++;
7398 	}
7399 
7400 	*names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *));
7401 	if (*names == NULL) {
7402 		return -ENOMEM;
7403 	}
7404 
7405 	TAILQ_FOREACH(xattr, xattrs, link) {
7406 		(*names)->names[(*names)->count++] = xattr->name;
7407 	}
7408 
7409 	return 0;
7410 }
7411 
7412 int
7413 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names)
7414 {
7415 	blob_verify_md_op(blob);
7416 
7417 	return blob_get_xattr_names(&blob->xattrs, names);
7418 }
7419 
7420 uint32_t
7421 spdk_xattr_names_get_count(struct spdk_xattr_names *names)
7422 {
7423 	assert(names != NULL);
7424 
7425 	return names->count;
7426 }
7427 
7428 const char *
7429 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index)
7430 {
7431 	if (index >= names->count) {
7432 		return NULL;
7433 	}
7434 
7435 	return names->names[index];
7436 }
7437 
7438 void
7439 spdk_xattr_names_free(struct spdk_xattr_names *names)
7440 {
7441 	free(names);
7442 }
7443 
7444 struct spdk_bs_type
7445 spdk_bs_get_bstype(struct spdk_blob_store *bs)
7446 {
7447 	return bs->bstype;
7448 }
7449 
7450 void
7451 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype)
7452 {
7453 	memcpy(&bs->bstype, &bstype, sizeof(bstype));
7454 }
7455 
7456 bool
7457 spdk_blob_is_read_only(struct spdk_blob *blob)
7458 {
7459 	assert(blob != NULL);
7460 	return (blob->data_ro || blob->md_ro);
7461 }
7462 
7463 bool
7464 spdk_blob_is_snapshot(struct spdk_blob *blob)
7465 {
7466 	struct spdk_blob_list *snapshot_entry;
7467 
7468 	assert(blob != NULL);
7469 
7470 	snapshot_entry = bs_get_snapshot_entry(blob->bs, blob->id);
7471 	if (snapshot_entry == NULL) {
7472 		return false;
7473 	}
7474 
7475 	return true;
7476 }
7477 
7478 bool
7479 spdk_blob_is_clone(struct spdk_blob *blob)
7480 {
7481 	assert(blob != NULL);
7482 
7483 	if (blob->parent_id != SPDK_BLOBID_INVALID) {
7484 		assert(spdk_blob_is_thin_provisioned(blob));
7485 		return true;
7486 	}
7487 
7488 	return false;
7489 }
7490 
7491 bool
7492 spdk_blob_is_thin_provisioned(struct spdk_blob *blob)
7493 {
7494 	assert(blob != NULL);
7495 	return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV);
7496 }
7497 
7498 static void
7499 blob_update_clear_method(struct spdk_blob *blob)
7500 {
7501 	enum blob_clear_method stored_cm;
7502 
7503 	assert(blob != NULL);
7504 
7505 	/* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored
7506 	 * in metadata previously.  If something other than the default was
	 * specified, ignore the stored value and use what was passed in.
7508 	 */
7509 	stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT);
7510 
7511 	if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) {
7512 		blob->clear_method = stored_cm;
7513 	} else if (blob->clear_method != stored_cm) {
7514 		SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n",
7515 			     blob->clear_method, stored_cm);
7516 	}
7517 }
7518 
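/* Return the id of the snapshot that blob_id was cloned from, or
 * SPDK_BLOBID_INVALID if the blob is not registered as a clone of any snapshot. */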
7519 spdk_blob_id
7520 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id)
7521 {
7522 	struct spdk_blob_list *snapshot_entry = NULL;
7523 	struct spdk_blob_list *clone_entry = NULL;
7524 
7525 	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
7526 		TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
7527 			if (clone_entry->id == blob_id) {
7528 				return snapshot_entry->id;
7529 			}
7530 		}
7531 	}
7532 
7533 	return SPDK_BLOBID_INVALID;
7534 }
7535 
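/* Report the clones of the given snapshot.  *count is updated to the number of
 * clones; if ids is NULL or *count is too small, -ENOMEM is returned so the caller
 * can retry with a larger array.  A blobid without a snapshot entry is reported as
 * having zero clones. */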
7536 int
7537 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids,
7538 		     size_t *count)
7539 {
7540 	struct spdk_blob_list *snapshot_entry, *clone_entry;
7541 	size_t n;
7542 
7543 	snapshot_entry = bs_get_snapshot_entry(bs, blobid);
7544 	if (snapshot_entry == NULL) {
7545 		*count = 0;
7546 		return 0;
7547 	}
7548 
7549 	if (ids == NULL || *count < snapshot_entry->clone_count) {
7550 		*count = snapshot_entry->clone_count;
7551 		return -ENOMEM;
7552 	}
7553 	*count = snapshot_entry->clone_count;
7554 
7555 	n = 0;
7556 	TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
7557 		ids[n++] = clone_entry->id;
7558 	}
7559 
7560 	return 0;
7561 }
7562 
7563 SPDK_LOG_REGISTER_COMPONENT(blob)
7564