xref: /spdk/lib/blob/blobstore.c (revision b5e993483f951b8faa855512ceb3ee3723f9b456)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "spdk/blob.h"
37 #include "spdk/crc32.h"
38 #include "spdk/env.h"
39 #include "spdk/queue.h"
40 #include "spdk/thread.h"
41 #include "spdk/bit_array.h"
42 #include "spdk/likely.h"
43 #include "spdk/util.h"
44 #include "spdk/string.h"
45 
46 #include "spdk_internal/assert.h"
47 #include "spdk_internal/log.h"
48 
49 #include "blobstore.h"
50 
51 #define BLOB_CRC32C_INITIAL    0xffffffffUL
52 
53 static int spdk_bs_register_md_thread(struct spdk_blob_store *bs);
54 static int spdk_bs_unregister_md_thread(struct spdk_blob_store *bs);
55 static void _spdk_blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno);
56 static void _spdk_blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
57 		uint64_t cluster, uint32_t extent, spdk_blob_op_complete cb_fn, void *cb_arg);
58 
59 static int _spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
60 				uint16_t value_len, bool internal);
61 static int _spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
62 				      const void **value, size_t *value_len, bool internal);
63 static int _spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal);
64 
65 static void _spdk_blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
66 				     spdk_blob_op_complete cb_fn, void *cb_arg);
67 
static void
_spdk_blob_verify_md_op(struct spdk_blob *blob)
{
	/* Sanity checks common to every metadata operation: the blob must
	 * exist, the call must be made on the blobstore's metadata thread,
	 * and the blob must not still be loading from disk. */
	assert(blob != NULL);
	assert(spdk_get_thread() == blob->bs->md_thread);
	assert(blob->state != SPDK_BLOB_STATE_LOADING);
}
75 
76 static struct spdk_blob_list *
77 _spdk_bs_get_snapshot_entry(struct spdk_blob_store *bs, spdk_blob_id blobid)
78 {
79 	struct spdk_blob_list *snapshot_entry = NULL;
80 
81 	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
82 		if (snapshot_entry->id == blobid) {
83 			break;
84 		}
85 	}
86 
87 	return snapshot_entry;
88 }
89 
/* Mark metadata page 'page' as in use in the blobstore's used_md_pages
 * bit array. The page must be in range and currently free. */
static void
_spdk_bs_claim_md_page(struct spdk_blob_store *bs, uint32_t page)
{
	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
	assert(spdk_bit_array_get(bs->used_md_pages, page) == false);

	spdk_bit_array_set(bs->used_md_pages, page);
}
98 
/* Return metadata page 'page' to the free pool. The page must be in range
 * and currently marked as in use. */
static void
_spdk_bs_release_md_page(struct spdk_blob_store *bs, uint32_t page)
{
	assert(page < spdk_bit_array_capacity(bs->used_md_pages));
	assert(spdk_bit_array_get(bs->used_md_pages, page) == true);

	spdk_bit_array_clear(bs->used_md_pages, page);
}
107 
/* Mark cluster 'cluster_num' as allocated and decrement the free count.
 * Called with bs->used_clusters_mutex held (see _spdk_bs_allocate_cluster). */
static void
_spdk_bs_claim_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
{
	assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters));
	assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == false);
	assert(bs->num_free_clusters > 0);

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %u\n", cluster_num);

	spdk_bit_array_set(bs->used_clusters, cluster_num);
	bs->num_free_clusters--;
}
120 
121 static int
122 _spdk_blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster)
123 {
124 	uint64_t *cluster_lba = &blob->active.clusters[cluster_num];
125 
126 	_spdk_blob_verify_md_op(blob);
127 
128 	if (*cluster_lba != 0) {
129 		return -EEXIST;
130 	}
131 
132 	*cluster_lba = _spdk_bs_cluster_to_lba(blob->bs, cluster);
133 	return 0;
134 }
135 
/* Find and claim the lowest free cluster at or above *lowest_free_cluster,
 * and - for extent-table blobs whose extent page for cluster_num is not yet
 * allocated - also claim a free metadata page at or above
 * *lowest_free_md_page. The chosen indices are returned through those
 * pointers. When update_map is true, the blob's active cluster map (and the
 * extent page slot) are updated as well.
 * Returns 0, or -ENOSPC when no free cluster or md page is available.
 */
static int
_spdk_bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num,
			  uint64_t *lowest_free_cluster, uint32_t *lowest_free_md_page, bool update_map)
{
	uint32_t *extent_page;

	pthread_mutex_lock(&blob->bs->used_clusters_mutex);
	*lowest_free_cluster = spdk_bit_array_find_first_clear(blob->bs->used_clusters,
			       *lowest_free_cluster);
	if (*lowest_free_cluster == UINT32_MAX) {
		/* No more free clusters. Cannot satisfy the request */
		pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
		return -ENOSPC;
	}

	if (blob->use_extent_table) {
		extent_page = _spdk_bs_cluster_to_extent_page(blob, cluster_num);
		if (*extent_page == 0) {
			/* No extent_page is allocated for the cluster */
			*lowest_free_md_page = spdk_bit_array_find_first_clear(blob->bs->used_md_pages,
					       *lowest_free_md_page);
			if (*lowest_free_md_page == UINT32_MAX) {
				/* No more free md pages. Cannot satisfy the request */
				pthread_mutex_unlock(&blob->bs->used_clusters_mutex);
				return -ENOSPC;
			}
			/* Claim the md page before the cluster so both are reserved
			 * under the same mutex hold. */
			_spdk_bs_claim_md_page(blob->bs, *lowest_free_md_page);
		}
	}

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %lu for blob %lu\n", *lowest_free_cluster, blob->id);
	_spdk_bs_claim_cluster(blob->bs, *lowest_free_cluster);

	pthread_mutex_unlock(&blob->bs->used_clusters_mutex);

	if (update_map) {
		/* NOTE(review): the return value of _spdk_blob_insert_cluster() is
		 * ignored; a -EEXIST here would leave the just-claimed cluster
		 * unreferenced - confirm callers guarantee the slot is empty. */
		_spdk_blob_insert_cluster(blob, cluster_num, *lowest_free_cluster);
		if (blob->use_extent_table && *extent_page == 0) {
			*extent_page = *lowest_free_md_page;
		}
	}

	return 0;
}
180 
/* Return cluster 'cluster_num' to the free pool and bump the free count.
 * NOTE(review): the asserts read the bit array before taking
 * used_clusters_mutex - presumably benign on the md thread, but verify. */
static void
_spdk_bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num)
{
	assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters));
	assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == true);
	assert(bs->num_free_clusters < bs->total_clusters);

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Releasing cluster %u\n", cluster_num);

	pthread_mutex_lock(&bs->used_clusters_mutex);
	spdk_bit_array_clear(bs->used_clusters, cluster_num);
	bs->num_free_clusters++;
	pthread_mutex_unlock(&bs->used_clusters_mutex);
}
195 
196 static void
197 _spdk_blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs)
198 {
199 	xattrs->count = 0;
200 	xattrs->names = NULL;
201 	xattrs->ctx = NULL;
202 	xattrs->get_value = NULL;
203 }
204 
205 void
206 spdk_blob_opts_init(struct spdk_blob_opts *opts)
207 {
208 	opts->num_clusters = 0;
209 	opts->thin_provision = false;
210 	opts->clear_method = BLOB_CLEAR_WITH_DEFAULT;
211 	_spdk_blob_xattrs_init(&opts->xattrs);
212 	opts->use_extent_table = false;
213 }
214 
/* Populate blob open options with their defaults. */
void
spdk_blob_open_opts_init(struct spdk_blob_open_opts *opts)
{
	opts->clear_method = BLOB_CLEAR_WITH_DEFAULT;
}
220 
221 static struct spdk_blob *
222 _spdk_blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id)
223 {
224 	struct spdk_blob *blob;
225 
226 	blob = calloc(1, sizeof(*blob));
227 	if (!blob) {
228 		return NULL;
229 	}
230 
231 	blob->id = id;
232 	blob->bs = bs;
233 
234 	blob->parent_id = SPDK_BLOBID_INVALID;
235 
236 	blob->state = SPDK_BLOB_STATE_DIRTY;
237 	blob->extent_rle_found = false;
238 	blob->extent_table_found = false;
239 	blob->active.num_pages = 1;
240 	blob->active.pages = calloc(1, sizeof(*blob->active.pages));
241 	if (!blob->active.pages) {
242 		free(blob);
243 		return NULL;
244 	}
245 
246 	blob->active.pages[0] = _spdk_bs_blobid_to_page(id);
247 
248 	TAILQ_INIT(&blob->xattrs);
249 	TAILQ_INIT(&blob->xattrs_internal);
250 
251 	return blob;
252 }
253 
254 static void
255 _spdk_xattrs_free(struct spdk_xattr_tailq *xattrs)
256 {
257 	struct spdk_xattr	*xattr, *xattr_tmp;
258 
259 	TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) {
260 		TAILQ_REMOVE(xattrs, xattr, link);
261 		free(xattr->name);
262 		free(xattr->value);
263 		free(xattr);
264 	}
265 }
266 
267 static void
268 _spdk_blob_free(struct spdk_blob *blob)
269 {
270 	assert(blob != NULL);
271 
272 	free(blob->active.extent_pages);
273 	free(blob->clean.extent_pages);
274 	free(blob->active.clusters);
275 	free(blob->clean.clusters);
276 	free(blob->active.pages);
277 	free(blob->clean.pages);
278 
279 	_spdk_xattrs_free(&blob->xattrs);
280 	_spdk_xattrs_free(&blob->xattrs_internal);
281 
282 	if (blob->back_bs_dev) {
283 		blob->back_bs_dev->destroy(blob->back_bs_dev);
284 	}
285 
286 	free(blob);
287 }
288 
/* Context carried through a spdk_for_each_channel walk while freezing or
 * unfreezing I/O on a blob: the user completion to invoke at the end and
 * the blob being targeted. */
struct freeze_io_ctx {
	struct spdk_bs_cpl cpl;	/* completion (type + cb_fn/cb_arg) */
	struct spdk_blob *blob;	/* blob whose I/O is being frozen/unfrozen */
};
293 
static void
_spdk_blob_io_sync(struct spdk_io_channel_iter *i)
{
	/* Per-channel step of the freeze walk. There is nothing to do on the
	 * channel itself; visiting every channel appears to serve purely as a
	 * synchronization point before the completion runs. */
	spdk_for_each_channel_continue(i, 0);
}
299 
300 static void
301 _spdk_blob_execute_queued_io(struct spdk_io_channel_iter *i)
302 {
303 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
304 	struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch);
305 	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
306 	struct spdk_bs_request_set	*set;
307 	struct spdk_bs_user_op_args	*args;
308 	spdk_bs_user_op_t *op, *tmp;
309 
310 	TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) {
311 		set = (struct spdk_bs_request_set *)op;
312 		args = &set->u.user_op;
313 
314 		if (args->blob == ctx->blob) {
315 			TAILQ_REMOVE(&ch->queued_io, op, link);
316 			spdk_bs_user_op_execute(op);
317 		}
318 	}
319 
320 	spdk_for_each_channel_continue(i, 0);
321 }
322 
/* Completion for both the freeze and unfreeze channel walks: invoke the
 * user's callback with success and release the walk context. The per-walk
 * status argument is intentionally unused - each step always reports 0. */
static void
_spdk_blob_io_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0);

	free(ctx);
}
332 
333 static void
334 _spdk_blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
335 {
336 	struct freeze_io_ctx *ctx;
337 
338 	ctx = calloc(1, sizeof(*ctx));
339 	if (!ctx) {
340 		cb_fn(cb_arg, -ENOMEM);
341 		return;
342 	}
343 
344 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
345 	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
346 	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
347 	ctx->blob = blob;
348 
349 	/* Freeze I/O on blob */
350 	blob->frozen_refcnt++;
351 
352 	if (blob->frozen_refcnt == 1) {
353 		spdk_for_each_channel(blob->bs, _spdk_blob_io_sync, ctx, _spdk_blob_io_cpl);
354 	} else {
355 		cb_fn(cb_arg, 0);
356 		free(ctx);
357 	}
358 }
359 
360 static void
361 _spdk_blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
362 {
363 	struct freeze_io_ctx *ctx;
364 
365 	ctx = calloc(1, sizeof(*ctx));
366 	if (!ctx) {
367 		cb_fn(cb_arg, -ENOMEM);
368 		return;
369 	}
370 
371 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
372 	ctx->cpl.u.blob_basic.cb_fn = cb_fn;
373 	ctx->cpl.u.blob_basic.cb_arg = cb_arg;
374 	ctx->blob = blob;
375 
376 	assert(blob->frozen_refcnt > 0);
377 
378 	blob->frozen_refcnt--;
379 
380 	if (blob->frozen_refcnt == 0) {
381 		spdk_for_each_channel(blob->bs, _spdk_blob_execute_queued_io, ctx, _spdk_blob_io_cpl);
382 	} else {
383 		cb_fn(cb_arg, 0);
384 		free(ctx);
385 	}
386 }
387 
388 static int
389 _spdk_blob_mark_clean(struct spdk_blob *blob)
390 {
391 	uint32_t *extent_pages = NULL;
392 	uint64_t *clusters = NULL;
393 	uint32_t *pages = NULL;
394 
395 	assert(blob != NULL);
396 
397 	if (blob->active.num_extent_pages) {
398 		assert(blob->active.extent_pages);
399 		extent_pages = calloc(blob->active.num_extent_pages, sizeof(*blob->active.extent_pages));
400 		if (!extent_pages) {
401 			return -ENOMEM;
402 		}
403 		memcpy(extent_pages, blob->active.extent_pages,
404 		       blob->active.num_extent_pages * sizeof(*extent_pages));
405 	}
406 
407 	if (blob->active.num_clusters) {
408 		assert(blob->active.clusters);
409 		clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters));
410 		if (!clusters) {
411 			free(extent_pages);
412 			return -ENOMEM;
413 		}
414 		memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
415 	}
416 
417 	if (blob->active.num_pages) {
418 		assert(blob->active.pages);
419 		pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages));
420 		if (!pages) {
421 			free(extent_pages);
422 			free(clusters);
423 			return -ENOMEM;
424 		}
425 		memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
426 	}
427 
428 	free(blob->clean.extent_pages);
429 	free(blob->clean.clusters);
430 	free(blob->clean.pages);
431 
432 	blob->clean.num_extent_pages = blob->active.num_extent_pages;
433 	blob->clean.extent_pages = blob->active.extent_pages;
434 	blob->clean.num_clusters = blob->active.num_clusters;
435 	blob->clean.clusters = blob->active.clusters;
436 	blob->clean.num_pages = blob->active.num_pages;
437 	blob->clean.pages = blob->active.pages;
438 
439 	blob->active.extent_pages = extent_pages;
440 	blob->active.clusters = clusters;
441 	blob->active.pages = pages;
442 
443 	/* If the metadata was dirtied again while the metadata was being written to disk,
444 	 *  we do not want to revert the DIRTY state back to CLEAN here.
445 	 */
446 	if (blob->state == SPDK_BLOB_STATE_LOADING) {
447 		blob->state = SPDK_BLOB_STATE_CLEAN;
448 	}
449 
450 	return 0;
451 }
452 
453 static int
454 _spdk_blob_deserialize_xattr(struct spdk_blob *blob,
455 			     struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal)
456 {
457 	struct spdk_xattr                       *xattr;
458 
459 	if (desc_xattr->length != sizeof(desc_xattr->name_length) +
460 	    sizeof(desc_xattr->value_length) +
461 	    desc_xattr->name_length + desc_xattr->value_length) {
462 		return -EINVAL;
463 	}
464 
465 	xattr = calloc(1, sizeof(*xattr));
466 	if (xattr == NULL) {
467 		return -ENOMEM;
468 	}
469 
470 	xattr->name = malloc(desc_xattr->name_length + 1);
471 	if (xattr->name == NULL) {
472 		free(xattr);
473 		return -ENOMEM;
474 	}
475 	memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length);
476 	xattr->name[desc_xattr->name_length] = '\0';
477 
478 	xattr->value = malloc(desc_xattr->value_length);
479 	if (xattr->value == NULL) {
480 		free(xattr->name);
481 		free(xattr);
482 		return -ENOMEM;
483 	}
484 	xattr->value_len = desc_xattr->value_length;
485 	memcpy(xattr->value,
486 	       (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
487 	       desc_xattr->value_length);
488 
489 	TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link);
490 
491 	return 0;
492 }
493 
494 
/* Parse every metadata descriptor in a single md page and apply it to the
 * in-memory blob: feature flags, cluster maps (EXTENT_RLE or extent-table
 * based EXTENT_TABLE/EXTENT_PAGE), and xattrs. EXTENT_RLE and extent-table
 * descriptors are mutually exclusive for one blob.
 * Returns 0 on success, -EINVAL for malformed/contradictory descriptors,
 * or -ENOMEM on allocation failure. */
static int
_spdk_blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob)
{
	struct spdk_blob_md_descriptor *desc;
	size_t	cur_desc = 0;
	void *tmp;

	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
	while (cur_desc < sizeof(page->descriptors)) {
		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
			if (desc->length == 0) {
				/* If padding and length are 0, this terminates the page */
				break;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
			struct spdk_blob_md_descriptor_flags	*desc_flags;

			desc_flags = (struct spdk_blob_md_descriptor_flags *)desc;

			if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) {
				return -EINVAL;
			}

			/* Unknown invalid_flags bits mean this blob cannot be
			 * loaded by this version of the code at all. */
			if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) !=
			    SPDK_BLOB_INVALID_FLAGS_MASK) {
				return -EINVAL;
			}

			/* Unknown data_ro/md_ro bits degrade the blob to
			 * read-only instead of failing the load. */
			if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) !=
			    SPDK_BLOB_DATA_RO_FLAGS_MASK) {
				blob->data_ro = true;
				blob->md_ro = true;
			}

			if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) !=
			    SPDK_BLOB_MD_RO_FLAGS_MASK) {
				blob->md_ro = true;
			}

			if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
				blob->data_ro = true;
				blob->md_ro = true;
			}

			blob->invalid_flags = desc_flags->invalid_flags;
			blob->data_ro_flags = desc_flags->data_ro_flags;
			blob->md_ro_flags = desc_flags->md_ro_flags;

		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
			unsigned int				i, j;
			unsigned int				cluster_count = blob->active.num_clusters;

			if (blob->extent_table_found) {
				/* Extent Table already present in the md,
				 * both descriptors should never be at the same time. */
				return -EINVAL;
			}
			blob->extent_rle_found = true;

			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;

			if (desc_extent_rle->length == 0 ||
			    (desc_extent_rle->length % sizeof(desc_extent_rle->extents[0]) != 0)) {
				return -EINVAL;
			}

			/* First pass: validate every referenced cluster is marked
			 * used in the blobstore and count the total clusters. */
			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
					if (desc_extent_rle->extents[i].cluster_idx != 0) {
						if (!spdk_bit_array_get(blob->bs->used_clusters,
									desc_extent_rle->extents[i].cluster_idx + j)) {
							return -EINVAL;
						}
					}
					cluster_count++;
				}
			}

			if (cluster_count == 0) {
				return -EINVAL;
			}
			tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				return -ENOMEM;
			}
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = cluster_count;

			/* Second pass: expand the run-length-encoded extents into
			 * per-cluster LBAs. A zero cluster_idx (unallocated) is only
			 * legal for thin-provisioned blobs. */
			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
					if (desc_extent_rle->extents[i].cluster_idx != 0) {
						blob->active.clusters[blob->active.num_clusters++] = _spdk_bs_cluster_to_lba(blob->bs,
								desc_extent_rle->extents[i].cluster_idx + j);
					} else if (spdk_blob_is_thin_provisioned(blob)) {
						blob->active.clusters[blob->active.num_clusters++] = 0;
					} else {
						return -EINVAL;
					}
				}
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
			struct spdk_blob_md_descriptor_extent_table *desc_extent_table;
			uint32_t num_extent_pages = blob->active.num_extent_pages;
			uint32_t i, j;
			size_t extent_pages_length;

			desc_extent_table = (struct spdk_blob_md_descriptor_extent_table *)desc;
			extent_pages_length = desc_extent_table->length - sizeof(desc_extent_table->num_clusters);

			if (blob->extent_rle_found) {
				/* This means that Extent RLE is present in MD,
				 * both should never be at the same time. */
				return -EINVAL;
			} else if (blob->extent_table_found &&
				   desc_extent_table->num_clusters != blob->num_clusters_in_et) {
				/* Number of clusters in this ET does not match number
				 * from previously read EXTENT_TABLE. */
				return -EINVAL;
			}

			blob->extent_table_found = true;

			if (desc_extent_table->length == 0 ||
			    (extent_pages_length % sizeof(desc_extent_table->extent_page[0]) != 0)) {
				return -EINVAL;
			}

			/* First pass: total the extent pages so the array can be
			 * grown once. */
			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
				num_extent_pages += desc_extent_table->extent_page[i].num_pages;
			}

			tmp = realloc(blob->active.extent_pages, num_extent_pages * sizeof(uint32_t));
			if (tmp == NULL) {
				return -ENOMEM;
			}
			blob->active.extent_pages = tmp;
			blob->active.extent_pages_array_size = num_extent_pages;

			blob->num_clusters_in_et = desc_extent_table->num_clusters;

			/* Extent table entries contain md page numbers for extent pages.
			 * Zeroes represent unallocated extent pages, those are run-length-encoded.
			 */
			for (i = 0; i < extent_pages_length / sizeof(desc_extent_table->extent_page[0]); i++) {
				if (desc_extent_table->extent_page[i].page_idx != 0) {
					assert(desc_extent_table->extent_page[i].num_pages == 1);
					blob->active.extent_pages[blob->active.num_extent_pages++] =
						desc_extent_table->extent_page[i].page_idx;
				} else if (spdk_blob_is_thin_provisioned(blob)) {
					for (j = 0; j < desc_extent_table->extent_page[i].num_pages; j++) {
						blob->active.extent_pages[blob->active.num_extent_pages++] = 0;
					}
				} else {
					return -EINVAL;
				}
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
			unsigned int					i;
			unsigned int					cluster_count = blob->active.num_clusters;

			if (blob->extent_rle_found) {
				/* This means that Extent RLE is present in MD,
				 * both should never be at the same time. */
				return -EINVAL;
			}

			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;

			if (desc_extent->length == 0 ||
			    (desc_extent->length % sizeof(desc_extent->cluster_idx[0]) != 0)) {
				return -EINVAL;
			}

			/* First pass: validate referenced clusters and count. */
			for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) {
				if (desc_extent->cluster_idx[i] != 0) {
					if (!spdk_bit_array_get(blob->bs->used_clusters, desc_extent->cluster_idx[i])) {
						return -EINVAL;
					}
				}
				cluster_count++;
			}

			if (cluster_count == 0) {
				return -EINVAL;
			}
			tmp = realloc(blob->active.clusters, cluster_count * sizeof(*blob->active.clusters));
			if (tmp == NULL) {
				return -ENOMEM;
			}
			blob->active.clusters = tmp;
			blob->active.cluster_array_size = cluster_count;

			/* Second pass: record per-cluster LBAs; zero entries are only
			 * legal for thin-provisioned blobs. */
			for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) {
				if (desc_extent->cluster_idx[i] != 0) {
					blob->active.clusters[blob->active.num_clusters++] = _spdk_bs_cluster_to_lba(blob->bs,
							desc_extent->cluster_idx[i]);
				} else if (spdk_blob_is_thin_provisioned(blob)) {
					blob->active.clusters[blob->active.num_clusters++] = 0;
				} else {
					return -EINVAL;
				}
			}
			/* Track how many clusters announced by the extent table are
			 * still unaccounted for by extent pages. */
			assert(blob->num_clusters_in_et >= cluster_count);
			blob->num_clusters_in_et -= cluster_count;
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
			int rc;

			rc = _spdk_blob_deserialize_xattr(blob,
							  (struct spdk_blob_md_descriptor_xattr *) desc, false);
			if (rc != 0) {
				return rc;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
			int rc;

			rc = _spdk_blob_deserialize_xattr(blob,
							  (struct spdk_blob_md_descriptor_xattr *) desc, true);
			if (rc != 0) {
				return rc;
			}
		} else {
			/* Unrecognized descriptor type.  Do not fail - just continue to the
			 *  next descriptor.  If this descriptor is associated with some feature
			 *  defined in a newer version of blobstore, that version of blobstore
			 *  should create and set an associated feature flag to specify if this
			 *  blob can be loaded or not.
			 */
		}

		/* Advance to the next descriptor */
		cur_desc += sizeof(*desc) + desc->length;
		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
			break;
		}
		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
	}

	return 0;
}
736 
737 static bool _spdk_bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page);
738 
739 static int
740 _spdk_blob_parse_extent_page(struct spdk_blob_md_page *extent_page, struct spdk_blob *blob)
741 {
742 	assert(blob != NULL);
743 	assert(blob->state == SPDK_BLOB_STATE_LOADING);
744 	assert(blob->active.clusters == NULL);
745 
746 	if (_spdk_bs_load_cur_extent_page_valid(extent_page) == false) {
747 		return -ENOENT;
748 	}
749 
750 	return _spdk_blob_parse_page(extent_page, blob);
751 }
752 
753 static int
754 _spdk_blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count,
755 		 struct spdk_blob *blob)
756 {
757 	const struct spdk_blob_md_page *page;
758 	uint32_t i;
759 	int rc;
760 
761 	assert(page_count > 0);
762 	assert(pages[0].sequence_num == 0);
763 	assert(blob != NULL);
764 	assert(blob->state == SPDK_BLOB_STATE_LOADING);
765 	assert(blob->active.clusters == NULL);
766 
767 	/* The blobid provided doesn't match what's in the MD, this can
768 	 * happen for example if a bogus blobid is passed in through open.
769 	 */
770 	if (blob->id != pages[0].id) {
771 		SPDK_ERRLOG("Blobid (%lu) doesn't match what's in metadata (%lu)\n",
772 			    blob->id, pages[0].id);
773 		return -ENOENT;
774 	}
775 
776 	for (i = 0; i < page_count; i++) {
777 		page = &pages[i];
778 
779 		assert(page->id == blob->id);
780 		assert(page->sequence_num == i);
781 
782 		rc = _spdk_blob_parse_page(page, blob);
783 		if (rc != 0) {
784 			return rc;
785 		}
786 	}
787 
788 	return 0;
789 }
790 
791 static int
792 _spdk_blob_serialize_add_page(const struct spdk_blob *blob,
793 			      struct spdk_blob_md_page **pages,
794 			      uint32_t *page_count,
795 			      struct spdk_blob_md_page **last_page)
796 {
797 	struct spdk_blob_md_page *page;
798 
799 	assert(pages != NULL);
800 	assert(page_count != NULL);
801 
802 	if (*page_count == 0) {
803 		assert(*pages == NULL);
804 		*page_count = 1;
805 		*pages = spdk_malloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE,
806 				     NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
807 	} else {
808 		assert(*pages != NULL);
809 		(*page_count)++;
810 		*pages = spdk_realloc(*pages,
811 				      SPDK_BS_PAGE_SIZE * (*page_count),
812 				      SPDK_BS_PAGE_SIZE);
813 	}
814 
815 	if (*pages == NULL) {
816 		*page_count = 0;
817 		*last_page = NULL;
818 		return -ENOMEM;
819 	}
820 
821 	page = &(*pages)[*page_count - 1];
822 	memset(page, 0, sizeof(*page));
823 	page->id = blob->id;
824 	page->sequence_num = *page_count - 1;
825 	page->next = SPDK_INVALID_MD_PAGE;
826 	*last_page = page;
827 
828 	return 0;
829 }
830 
831 /* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor.
832  * Update required_sz on both success and failure.
833  *
834  */
835 static int
836 _spdk_blob_serialize_xattr(const struct spdk_xattr *xattr,
837 			   uint8_t *buf, size_t buf_sz,
838 			   size_t *required_sz, bool internal)
839 {
840 	struct spdk_blob_md_descriptor_xattr	*desc;
841 
842 	*required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) +
843 		       strlen(xattr->name) +
844 		       xattr->value_len;
845 
846 	if (buf_sz < *required_sz) {
847 		return -1;
848 	}
849 
850 	desc = (struct spdk_blob_md_descriptor_xattr *)buf;
851 
852 	desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR;
853 	desc->length = sizeof(desc->name_length) +
854 		       sizeof(desc->value_length) +
855 		       strlen(xattr->name) +
856 		       xattr->value_len;
857 	desc->name_length = strlen(xattr->name);
858 	desc->value_length = xattr->value_len;
859 
860 	memcpy(desc->name, xattr->name, desc->name_length);
861 	memcpy((void *)((uintptr_t)desc->name + desc->name_length),
862 	       xattr->value,
863 	       desc->value_length);
864 
865 	return 0;
866 }
867 
/* Serialize one EXTENT_TABLE descriptor into *buf, run-length-encoding
 * consecutive unallocated (zero) extent pages. Serialization starts at
 * extent page index start_ep; *next_ep is set to the first index NOT
 * consumed: num_extent_pages when everything fit, start_ep when there was
 * no room for even a single entry, or the resume point when the buffer
 * filled mid-way. *buf and *remaining_sz are advanced past the descriptor. */
static void
_spdk_blob_serialize_extent_table_entry(const struct spdk_blob *blob,
					uint64_t start_ep, uint64_t *next_ep,
					uint8_t **buf, size_t *remaining_sz)
{
	struct spdk_blob_md_descriptor_extent_table *desc;
	size_t cur_sz;
	uint64_t i, et_idx;
	uint32_t extent_page, ep_len;

	/* The buffer must have room for at least one extent page */
	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->num_clusters) + sizeof(
			 desc->extent_page[0]);
	if (*remaining_sz < cur_sz) {
		*next_ep = start_ep;
		return;
	}

	desc = (struct spdk_blob_md_descriptor_extent_table *)*buf;
	desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE;

	desc->num_clusters = blob->active.num_clusters;

	/* Walk the extent pages, flushing one (page_idx, run length) entry
	 * each time the current run ends. */
	extent_page = blob->active.extent_pages[start_ep];
	ep_len = 1;
	et_idx = 0;
	for (i = start_ep + 1; i < blob->active.num_extent_pages; i++) {
		/* Extent table entries contain md page offsets for extent pages.
		 * Zeroes represent unallocated extent pages, which are run-length-encoded.
		 */
		if (extent_page == 0 && blob->active.extent_pages[i] == 0) {
			ep_len++;
			continue;
		}
		desc->extent_page[et_idx].page_idx = extent_page;
		desc->extent_page[et_idx].num_pages = ep_len;
		et_idx++;

		/* sizeof() is unevaluated, so indexing with the just-incremented
		 * et_idx here is safe; this accounts for the NEXT entry's size. */
		cur_sz += sizeof(desc->extent_page[et_idx]);

		if (*remaining_sz < cur_sz) {
			/* If we ran out of buffer space, return */
			*next_ep = i;
			break;
		}
		extent_page = blob->active.extent_pages[i];
		ep_len = 1;
	}

	/* Emit the final pending run if it still fits. */
	if (*remaining_sz >= cur_sz) {
		desc->extent_page[et_idx].page_idx = extent_page;
		desc->extent_page[et_idx].num_pages = ep_len;
		et_idx++;

		*next_ep = blob->active.num_extent_pages;
	}

	desc->length = sizeof(desc->num_clusters) + sizeof(desc->extent_page[0]) * et_idx;
	*remaining_sz -= sizeof(struct spdk_blob_md_descriptor) + desc->length;
	*buf += sizeof(struct spdk_blob_md_descriptor) + desc->length;
}
929 
930 static int
931 _spdk_blob_serialize_extent_table(const struct spdk_blob *blob,
932 				  struct spdk_blob_md_page **pages,
933 				  struct spdk_blob_md_page *cur_page,
934 				  uint32_t *page_count, uint8_t **buf,
935 				  size_t *remaining_sz)
936 {
937 	uint64_t				last_extent_page;
938 	int					rc;
939 
940 	last_extent_page = 0;
941 	while (last_extent_page < blob->active.num_extent_pages) {
942 		_spdk_blob_serialize_extent_table_entry(blob, last_extent_page, &last_extent_page, buf,
943 							remaining_sz);
944 
945 		if (last_extent_page == blob->active.num_extent_pages) {
946 			break;
947 		}
948 
949 		rc = _spdk_blob_serialize_add_page(blob, pages, page_count, &cur_page);
950 		if (rc < 0) {
951 			return rc;
952 		}
953 
954 		*buf = (uint8_t *)cur_page->descriptors;
955 		*remaining_sz = sizeof(cur_page->descriptors);
956 	}
957 
958 	return 0;
959 }
960 
/* Serialize a run of clusters into a single EXTENT_RLE descriptor at *buf.
 *
 * Clusters starting at start_cluster are run-length encoded: consecutive
 * allocated LBAs collapse into one extent, as do consecutive unallocated
 * (zero) clusters. On return, *next_cluster is the first cluster NOT yet
 * serialized - blob->active.num_clusters when everything fit, otherwise the
 * index to resume from after the caller supplies a fresh buffer. *buf and
 * *buf_sz are advanced/shrunk past the emitted descriptor.
 */
static void
_spdk_blob_serialize_extent_rle(const struct spdk_blob *blob,
				uint64_t start_cluster, uint64_t *next_cluster,
				uint8_t **buf, size_t *buf_sz)
{
	struct spdk_blob_md_descriptor_extent_rle *desc_extent_rle;
	size_t cur_sz;
	uint64_t i, extent_idx;
	uint64_t lba, lba_per_cluster, lba_count;

	/* The buffer must have room for at least one extent */
	cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc_extent_rle->extents[0]);
	if (*buf_sz < cur_sz) {
		*next_cluster = start_cluster;
		return;
	}

	desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)*buf;
	desc_extent_rle->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE;

	lba_per_cluster = _spdk_bs_cluster_to_lba(blob->bs, 1);

	/* Track the current run: its starting LBA and accumulated LBA length. */
	lba = blob->active.clusters[start_cluster];
	lba_count = lba_per_cluster;
	extent_idx = 0;
	for (i = start_cluster + 1; i < blob->active.num_clusters; i++) {
		if ((lba + lba_count) == blob->active.clusters[i] && lba != 0) {
			/* Run-length encode sequential non-zero LBA */
			lba_count += lba_per_cluster;
			continue;
		} else if (lba == 0 && blob->active.clusters[i] == 0) {
			/* Run-length encode unallocated clusters */
			lba_count += lba_per_cluster;
			continue;
		}
		/* The run ended at cluster i; emit it as one extent. */
		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
		extent_idx++;

		/* Reserve space for the next extent. (All extents entries have
		 * the same size, so using the already-incremented extent_idx
		 * inside sizeof is harmless.) */
		cur_sz += sizeof(desc_extent_rle->extents[extent_idx]);

		if (*buf_sz < cur_sz) {
			/* If we ran out of buffer space, return */
			*next_cluster = i;
			break;
		}

		/* Start a new run at cluster i. */
		lba = blob->active.clusters[i];
		lba_count = lba_per_cluster;
	}

	if (*buf_sz >= cur_sz) {
		/* Room remains: emit the final (still open) run and report that
		 * every cluster was serialized. */
		desc_extent_rle->extents[extent_idx].cluster_idx = lba / lba_per_cluster;
		desc_extent_rle->extents[extent_idx].length = lba_count / lba_per_cluster;
		extent_idx++;

		*next_cluster = blob->active.num_clusters;
	}

	/* Account for the bytes consumed by the descriptor just written. */
	desc_extent_rle->length = sizeof(desc_extent_rle->extents[0]) * extent_idx;
	*buf_sz -= sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
	*buf += sizeof(struct spdk_blob_md_descriptor) + desc_extent_rle->length;
}
1024 
1025 static int
1026 _spdk_blob_serialize_extents_rle(const struct spdk_blob *blob,
1027 				 struct spdk_blob_md_page **pages,
1028 				 struct spdk_blob_md_page *cur_page,
1029 				 uint32_t *page_count, uint8_t **buf,
1030 				 size_t *remaining_sz)
1031 {
1032 	uint64_t				last_cluster;
1033 	int					rc;
1034 
1035 	last_cluster = 0;
1036 	while (last_cluster < blob->active.num_clusters) {
1037 		_spdk_blob_serialize_extent_rle(blob, last_cluster, &last_cluster, buf, remaining_sz);
1038 
1039 		if (last_cluster == blob->active.num_clusters) {
1040 			break;
1041 		}
1042 
1043 		rc = _spdk_blob_serialize_add_page(blob, pages, page_count, &cur_page);
1044 		if (rc < 0) {
1045 			return rc;
1046 		}
1047 
1048 		*buf = (uint8_t *)cur_page->descriptors;
1049 		*remaining_sz = sizeof(cur_page->descriptors);
1050 	}
1051 
1052 	return 0;
1053 }
1054 
1055 static void
1056 _spdk_blob_serialize_extent_page(const struct spdk_blob *blob,
1057 				 uint64_t cluster, struct spdk_blob_md_page *page)
1058 {
1059 	struct spdk_blob_md_descriptor_extent_page *desc_extent;
1060 	uint64_t i, extent_idx;
1061 	uint64_t lba, lba_per_cluster;
1062 	uint64_t start_cluster = (cluster / SPDK_EXTENTS_PER_EP) * SPDK_EXTENTS_PER_EP;
1063 	uint64_t end_cluster = spdk_min(start_cluster + SPDK_EXTENTS_PER_EP, blob->active.num_clusters);
1064 
1065 	desc_extent = (struct spdk_blob_md_descriptor_extent_page *) page->descriptors;
1066 	desc_extent->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE;
1067 
1068 	lba_per_cluster = _spdk_bs_cluster_to_lba(blob->bs, 1);
1069 
1070 	extent_idx = 0;
1071 	for (i = start_cluster; i < end_cluster; i++) {
1072 		lba = blob->active.clusters[i];
1073 		desc_extent->cluster_idx[extent_idx++] = lba / lba_per_cluster;
1074 	}
1075 
1076 	desc_extent->length = sizeof(desc_extent->cluster_idx[0]) * extent_idx;
1077 }
1078 
1079 static void
1080 _spdk_blob_serialize_flags(const struct spdk_blob *blob,
1081 			   uint8_t *buf, size_t *buf_sz)
1082 {
1083 	struct spdk_blob_md_descriptor_flags *desc;
1084 
1085 	/*
1086 	 * Flags get serialized first, so we should always have room for the flags
1087 	 *  descriptor.
1088 	 */
1089 	assert(*buf_sz >= sizeof(*desc));
1090 
1091 	desc = (struct spdk_blob_md_descriptor_flags *)buf;
1092 	desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS;
1093 	desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor);
1094 	desc->invalid_flags = blob->invalid_flags;
1095 	desc->data_ro_flags = blob->data_ro_flags;
1096 	desc->md_ro_flags = blob->md_ro_flags;
1097 
1098 	*buf_sz -= sizeof(*desc);
1099 }
1100 
/* Serialize every xattr in the given list into XATTR descriptors.
 *
 * Each xattr is appended at *buf on the current metadata page. When one does
 * not fit, a new page is added to the chain and the same xattr is retried
 * exactly once; failure of the page allocation or of the retry frees the
 * whole page chain and returns the error. 'internal' selects internal vs.
 * user xattr descriptor type (handled inside _spdk_blob_serialize_xattr).
 */
static int
_spdk_blob_serialize_xattrs(const struct spdk_blob *blob,
			    const struct spdk_xattr_tailq *xattrs, bool internal,
			    struct spdk_blob_md_page **pages,
			    struct spdk_blob_md_page *cur_page,
			    uint32_t *page_count, uint8_t **buf,
			    size_t *remaining_sz)
{
	const struct spdk_xattr	*xattr;
	int	rc;

	TAILQ_FOREACH(xattr, xattrs, link) {
		size_t required_sz = 0;

		rc = _spdk_blob_serialize_xattr(xattr,
						*buf, *remaining_sz,
						&required_sz, internal);
		if (rc < 0) {
			/* Need to add a new page to the chain */
			rc = _spdk_blob_serialize_add_page(blob, pages, page_count,
							   &cur_page);
			if (rc < 0) {
				/* On failure, release the entire chain built so far. */
				spdk_free(*pages);
				*pages = NULL;
				*page_count = 0;
				return rc;
			}

			*buf = (uint8_t *)cur_page->descriptors;
			*remaining_sz = sizeof(cur_page->descriptors);

			/* Try again */
			required_sz = 0;
			rc = _spdk_blob_serialize_xattr(xattr,
							*buf, *remaining_sz,
							&required_sz, internal);

			/* An xattr that does not fit even on an empty page is
			 * unserializable - give up and free the chain. */
			if (rc < 0) {
				spdk_free(*pages);
				*pages = NULL;
				*page_count = 0;
				return rc;
			}
		}

		/* Advance past the descriptor that was just written. */
		*remaining_sz -= required_sz;
		*buf += required_sz;
	}

	return 0;
}
1152 
/* Serialize a dirty blob's in-memory metadata into a chain of md pages.
 *
 * Allocates the first page, then serializes in a fixed order: flags, user
 * xattrs, internal xattrs, and finally the cluster layout (EXTENT_TABLE or
 * EXTENT_RLE depending on blob->use_extent_table). On success *pages holds
 * the chain and *page_count its length; ownership passes to the caller.
 * Returns 0 on success, negative errno on allocation failure.
 */
static int
_spdk_blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages,
		     uint32_t *page_count)
{
	struct spdk_blob_md_page		*cur_page;
	int					rc;
	uint8_t					*buf;
	size_t					remaining_sz;

	assert(pages != NULL);
	assert(page_count != NULL);
	assert(blob != NULL);
	/* Only dirty blobs need (re)serialization. */
	assert(blob->state == SPDK_BLOB_STATE_DIRTY);

	*pages = NULL;
	*page_count = 0;

	/* A blob always has at least 1 page, even if it has no descriptors */
	rc = _spdk_blob_serialize_add_page(blob, pages, page_count, &cur_page);
	if (rc < 0) {
		return rc;
	}

	buf = (uint8_t *)cur_page->descriptors;
	remaining_sz = sizeof(cur_page->descriptors);

	/* Serialize flags */
	_spdk_blob_serialize_flags(blob, buf, &remaining_sz);
	buf += sizeof(struct spdk_blob_md_descriptor_flags);

	/* Serialize xattrs */
	rc = _spdk_blob_serialize_xattrs(blob, &blob->xattrs, false,
					 pages, cur_page, page_count, &buf, &remaining_sz);
	if (rc < 0) {
		return rc;
	}

	/* Serialize internal xattrs */
	rc = _spdk_blob_serialize_xattrs(blob, &blob->xattrs_internal, true,
					 pages, cur_page, page_count, &buf, &remaining_sz);
	if (rc < 0) {
		return rc;
	}

	if (blob->use_extent_table) {
		/* Serialize extent table */
		rc = _spdk_blob_serialize_extent_table(blob, pages, cur_page, page_count, &buf, &remaining_sz);
	} else {
		/* Serialize extents */
		rc = _spdk_blob_serialize_extents_rle(blob, pages, cur_page, page_count, &buf, &remaining_sz);
	}

	return rc;
}
1207 
/* Context carried across the asynchronous steps of loading a blob's
 * metadata chain (and, for extent-table blobs, its extent pages) from disk. */
struct spdk_blob_load_ctx {
	struct spdk_blob		*blob;

	struct spdk_blob_md_page	*pages;	/* md page buffer read so far */
	uint32_t			num_pages;	/* number of pages in 'pages' */
	uint32_t			next_extent_page;	/* next extent page index to read */
	spdk_bs_sequence_t	        *seq;

	spdk_bs_sequence_cpl		cb_fn;	/* completion invoked when load finishes */
	void				*cb_arg;
};
1219 
1220 static uint32_t
1221 _spdk_blob_md_page_calc_crc(void *page)
1222 {
1223 	uint32_t		crc;
1224 
1225 	crc = BLOB_CRC32C_INITIAL;
1226 	crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc);
1227 	crc ^= BLOB_CRC32C_INITIAL;
1228 
1229 	return crc;
1230 
1231 }
1232 
1233 static void
1234 _spdk_blob_load_final(void *cb_arg, int bserrno)
1235 {
1236 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1237 	struct spdk_blob		*blob = ctx->blob;
1238 
1239 	if (bserrno == 0) {
1240 		_spdk_blob_mark_clean(blob);
1241 	}
1242 
1243 	ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno);
1244 
1245 	/* Free the memory */
1246 	spdk_free(ctx->pages);
1247 	free(ctx);
1248 }
1249 
1250 static void
1251 _spdk_blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno)
1252 {
1253 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1254 	struct spdk_blob		*blob = ctx->blob;
1255 
1256 	if (bserrno == 0) {
1257 		blob->back_bs_dev = spdk_bs_create_blob_bs_dev(snapshot);
1258 		if (blob->back_bs_dev == NULL) {
1259 			bserrno = -ENOMEM;
1260 		}
1261 	}
1262 	if (bserrno != 0) {
1263 		SPDK_ERRLOG("Snapshot fail\n");
1264 	}
1265 
1266 	_spdk_blob_load_final(ctx, bserrno);
1267 }
1268 
1269 static void _spdk_blob_update_clear_method(struct spdk_blob *blob);
1270 
1271 static void
1272 _spdk_blob_load_backing_dev(void *cb_arg)
1273 {
1274 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1275 	struct spdk_blob		*blob = ctx->blob;
1276 	const void			*value;
1277 	size_t				len;
1278 	int				rc;
1279 
1280 	if (spdk_blob_is_thin_provisioned(blob)) {
1281 		rc = _spdk_blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true);
1282 		if (rc == 0) {
1283 			if (len != sizeof(spdk_blob_id)) {
1284 				_spdk_blob_load_final(ctx, -EINVAL);
1285 				return;
1286 			}
1287 			/* open snapshot blob and continue in the callback function */
1288 			blob->parent_id = *(spdk_blob_id *)value;
1289 			spdk_bs_open_blob(blob->bs, blob->parent_id,
1290 					  _spdk_blob_load_snapshot_cpl, ctx);
1291 			return;
1292 		} else {
1293 			/* add zeroes_dev for thin provisioned blob */
1294 			blob->back_bs_dev = spdk_bs_create_zeroes_dev();
1295 		}
1296 	} else {
1297 		/* standard blob */
1298 		blob->back_bs_dev = NULL;
1299 	}
1300 	_spdk_blob_load_final(ctx, 0);
1301 }
1302 
1303 static void
1304 _spdk_blob_load_cpl_extents_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1305 {
1306 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1307 	struct spdk_blob		*blob = ctx->blob;
1308 	struct spdk_blob_md_page	*page;
1309 	uint64_t			i;
1310 	uint32_t			crc;
1311 	uint64_t			lba;
1312 	void				*tmp;
1313 	uint64_t			sz;
1314 
1315 	if (bserrno) {
1316 		SPDK_ERRLOG("Extent page read failed: %d\n", bserrno);
1317 		_spdk_blob_load_final(ctx, bserrno);
1318 		return;
1319 	}
1320 
1321 	if (ctx->pages == NULL) {
1322 		/* First iteration of this function, allocate buffer for single EXTENT_PAGE */
1323 		ctx->pages = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE, NULL, SPDK_ENV_SOCKET_ID_ANY,
1324 					  SPDK_MALLOC_DMA);
1325 		if (!ctx->pages) {
1326 			_spdk_blob_load_final(ctx, -ENOMEM);
1327 			return;
1328 		}
1329 		ctx->num_pages = 1;
1330 		ctx->next_extent_page = 0;
1331 	} else {
1332 		page = &ctx->pages[0];
1333 		crc = _spdk_blob_md_page_calc_crc(page);
1334 		if (crc != page->crc) {
1335 			_spdk_blob_load_final(ctx, -EINVAL);
1336 			return;
1337 		}
1338 
1339 		if (page->next != SPDK_INVALID_MD_PAGE) {
1340 			_spdk_blob_load_final(ctx, -EINVAL);
1341 			return;
1342 		}
1343 
1344 		bserrno = _spdk_blob_parse_extent_page(page, blob);
1345 		if (bserrno) {
1346 			_spdk_blob_load_final(ctx, bserrno);
1347 			return;
1348 		}
1349 	}
1350 
1351 	for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
1352 		if (blob->active.extent_pages[i] != 0) {
1353 			/* Extent page was allocated, read and parse it. */
1354 			lba = _spdk_bs_md_page_to_lba(blob->bs, blob->active.extent_pages[i]);
1355 			ctx->next_extent_page = i + 1;
1356 
1357 			spdk_bs_sequence_read_dev(seq, &ctx->pages[0], lba,
1358 						  _spdk_bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
1359 						  _spdk_blob_load_cpl_extents_cpl, ctx);
1360 			return;
1361 		} else {
1362 			/* Thin provisioned blobs can point to unallocated extent pages.
1363 			 * In this case blob size should be increased by up to the amount left in num_clusters_in_et. */
1364 
1365 			sz = spdk_min(blob->num_clusters_in_et, SPDK_EXTENTS_PER_EP);
1366 			blob->active.num_clusters += sz;
1367 			blob->num_clusters_in_et -= sz;
1368 
1369 			assert(spdk_blob_is_thin_provisioned(blob));
1370 			assert(i + 1 < blob->active.num_extent_pages || blob->num_clusters_in_et == 0);
1371 
1372 			tmp = realloc(blob->active.clusters, blob->active.num_clusters * sizeof(*blob->active.clusters));
1373 			if (tmp == NULL) {
1374 				_spdk_blob_load_final(ctx, -ENOMEM);
1375 				return;
1376 			}
1377 			memset(tmp + blob->active.cluster_array_size, 0,
1378 			       sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size));
1379 			blob->active.clusters = tmp;
1380 			blob->active.cluster_array_size = blob->active.num_clusters;
1381 		}
1382 	}
1383 
1384 	_spdk_blob_load_backing_dev(ctx);
1385 }
1386 
1387 static void
1388 _spdk_blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1389 {
1390 	struct spdk_blob_load_ctx	*ctx = cb_arg;
1391 	struct spdk_blob		*blob = ctx->blob;
1392 	struct spdk_blob_md_page	*page;
1393 	int				rc;
1394 	uint32_t			crc;
1395 
1396 	if (bserrno) {
1397 		SPDK_ERRLOG("Metadata page read failed: %d\n", bserrno);
1398 		_spdk_blob_load_final(ctx, bserrno);
1399 		return;
1400 	}
1401 
1402 	page = &ctx->pages[ctx->num_pages - 1];
1403 	crc = _spdk_blob_md_page_calc_crc(page);
1404 	if (crc != page->crc) {
1405 		SPDK_ERRLOG("Metadata page %d crc mismatch\n", ctx->num_pages);
1406 		_spdk_blob_load_final(ctx, -EINVAL);
1407 		return;
1408 	}
1409 
1410 	if (page->next != SPDK_INVALID_MD_PAGE) {
1411 		uint32_t next_page = page->next;
1412 		uint64_t next_lba = _spdk_bs_md_page_to_lba(blob->bs, next_page);
1413 
1414 		/* Read the next page */
1415 		ctx->num_pages++;
1416 		ctx->pages = spdk_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages),
1417 					  sizeof(*page));
1418 		if (ctx->pages == NULL) {
1419 			_spdk_blob_load_final(ctx, -ENOMEM);
1420 			return;
1421 		}
1422 
1423 		spdk_bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1],
1424 					  next_lba,
1425 					  _spdk_bs_byte_to_lba(blob->bs, sizeof(*page)),
1426 					  _spdk_blob_load_cpl, ctx);
1427 		return;
1428 	}
1429 
1430 	/* Parse the pages */
1431 	rc = _spdk_blob_parse(ctx->pages, ctx->num_pages, blob);
1432 	if (rc) {
1433 		_spdk_blob_load_final(ctx, rc);
1434 		return;
1435 	}
1436 
1437 	if (blob->extent_table_found == true) {
1438 		/* If EXTENT_TABLE was found, that means support for it should be enabled. */
1439 		assert(blob->extent_rle_found == false);
1440 		blob->use_extent_table = true;
1441 	} else {
1442 		/* If EXTENT_RLE or no extent_* descriptor was found disable support
1443 		 * for extent table. No extent_* descriptors means that blob has length of 0
1444 		 * and no extent_rle descriptors were persisted for it.
1445 		 * EXTENT_TABLE if used, is always present in metadata regardless of length. */
1446 		blob->use_extent_table = false;
1447 	}
1448 
1449 	/* Check the clear_method stored in metadata vs what may have been passed
1450 	 * via spdk_bs_open_blob_ext() and update accordingly.
1451 	 */
1452 	_spdk_blob_update_clear_method(blob);
1453 
1454 	spdk_free(ctx->pages);
1455 	ctx->pages = NULL;
1456 
1457 	if (blob->extent_table_found) {
1458 		_spdk_blob_load_cpl_extents_cpl(seq, ctx, 0);
1459 	} else {
1460 		_spdk_blob_load_backing_dev(ctx);
1461 	}
1462 }
1463 
1464 /* Load a blob from disk given a blobid */
1465 static void
1466 _spdk_blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
1467 		spdk_bs_sequence_cpl cb_fn, void *cb_arg)
1468 {
1469 	struct spdk_blob_load_ctx *ctx;
1470 	struct spdk_blob_store *bs;
1471 	uint32_t page_num;
1472 	uint64_t lba;
1473 
1474 	_spdk_blob_verify_md_op(blob);
1475 
1476 	bs = blob->bs;
1477 
1478 	ctx = calloc(1, sizeof(*ctx));
1479 	if (!ctx) {
1480 		cb_fn(seq, cb_arg, -ENOMEM);
1481 		return;
1482 	}
1483 
1484 	ctx->blob = blob;
1485 	ctx->pages = spdk_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE);
1486 	if (!ctx->pages) {
1487 		free(ctx);
1488 		cb_fn(seq, cb_arg, -ENOMEM);
1489 		return;
1490 	}
1491 	ctx->num_pages = 1;
1492 	ctx->cb_fn = cb_fn;
1493 	ctx->cb_arg = cb_arg;
1494 	ctx->seq = seq;
1495 
1496 	page_num = _spdk_bs_blobid_to_page(blob->id);
1497 	lba = _spdk_bs_md_page_to_lba(blob->bs, page_num);
1498 
1499 	blob->state = SPDK_BLOB_STATE_LOADING;
1500 
1501 	spdk_bs_sequence_read_dev(seq, &ctx->pages[0], lba,
1502 				  _spdk_bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE),
1503 				  _spdk_blob_load_cpl, ctx);
1504 }
1505 
/* Context carried across the asynchronous steps of persisting a blob:
 * serialize md, write extent pages, write the md page chain, zero the old
 * pages, and clear truncated clusters. */
struct spdk_blob_persist_ctx {
	struct spdk_blob		*blob;

	struct spdk_bs_super_block	*super;

	struct spdk_blob_md_page	*pages;	/* serialized metadata page chain */
	uint32_t			next_extent_page;	/* next extent page index to write */
	struct spdk_blob_md_page	*extent_page;	/* buffer for the extent page in flight */

	spdk_bs_sequence_t		*seq;
	spdk_bs_sequence_cpl		cb_fn;	/* user completion */
	void				*cb_arg;
};
1519 
1520 static void
1521 spdk_bs_batch_clear_dev(struct spdk_blob_persist_ctx *ctx, spdk_bs_batch_t *batch, uint64_t lba,
1522 			uint32_t lba_count)
1523 {
1524 	switch (ctx->blob->clear_method) {
1525 	case BLOB_CLEAR_WITH_DEFAULT:
1526 	case BLOB_CLEAR_WITH_UNMAP:
1527 		spdk_bs_batch_unmap_dev(batch, lba, lba_count);
1528 		break;
1529 	case BLOB_CLEAR_WITH_WRITE_ZEROES:
1530 		spdk_bs_batch_write_zeroes_dev(batch, lba, lba_count);
1531 		break;
1532 	case BLOB_CLEAR_WITH_NONE:
1533 	default:
1534 		break;
1535 	}
1536 }
1537 
1538 static void
1539 _spdk_blob_persist_complete(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1540 {
1541 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1542 	struct spdk_blob		*blob = ctx->blob;
1543 
1544 	if (bserrno == 0) {
1545 		_spdk_blob_mark_clean(blob);
1546 	}
1547 
1548 	/* Call user callback */
1549 	ctx->cb_fn(seq, ctx->cb_arg, bserrno);
1550 
1551 	/* Free the memory */
1552 	spdk_free(ctx->pages);
1553 	free(ctx);
1554 }
1555 
/* Completion for the cluster-clearing batch: return truncated clusters to
 * the blobstore's free pool and shrink the in-memory arrays to match the
 * blob's new (smaller) size. */
static void
_spdk_blob_persist_clear_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx	*ctx = cb_arg;
	struct spdk_blob		*blob = ctx->blob;
	struct spdk_blob_store		*bs = blob->bs;
	size_t				i;

	/* Release all clusters that were truncated */
	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
		uint32_t cluster_num = _spdk_bs_lba_to_cluster(bs, blob->active.clusters[i]);

		/* Nothing to release if it was not allocated */
		if (blob->active.clusters[i] != 0) {
			_spdk_bs_release_cluster(bs, cluster_num);
		}
	}

	if (blob->active.num_clusters == 0) {
		/* The blob shrank to zero length - drop the array entirely. */
		free(blob->active.clusters);
		blob->active.clusters = NULL;
		blob->active.cluster_array_size = 0;
	} else if (blob->active.num_clusters != blob->active.cluster_array_size) {
#ifndef __clang_analyzer__
		void *tmp;

		/* scan-build really can't figure reallocs, workaround it */
		/* Shrinking reallocs are assumed to succeed (asserted below). */
		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * blob->active.num_clusters);
		assert(tmp != NULL);
		blob->active.clusters = tmp;

		tmp = realloc(blob->active.extent_pages, sizeof(uint32_t) * blob->active.num_extent_pages);
		assert(tmp != NULL);
		blob->active.extent_pages = tmp;
#endif
		blob->active.extent_pages_array_size = blob->active.num_extent_pages;
		blob->active.cluster_array_size = blob->active.num_clusters;
	}

	/* TODO: Add path to persist clear extent pages. */
	_spdk_blob_persist_complete(seq, ctx, bserrno);
}
1598 
/* Queue clears for every cluster that was truncated off the end of the blob,
 * coalescing physically contiguous clusters into single device operations.
 * Unallocated clusters (LBA 0) are skipped. Continues in
 * _spdk_blob_persist_clear_clusters_cpl() when the batch completes. */
static void
_spdk_blob_persist_clear_clusters(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx	*ctx = cb_arg;
	struct spdk_blob		*blob = ctx->blob;
	struct spdk_blob_store		*bs = blob->bs;
	spdk_bs_batch_t			*batch;
	size_t				i;
	uint64_t			lba;
	uint32_t			lba_count;

	/* Clusters don't move around in blobs. The list shrinks or grows
	 * at the end, but no changes ever occur in the middle of the list.
	 */

	batch = spdk_bs_sequence_to_batch(seq, _spdk_blob_persist_clear_clusters_cpl, ctx);

	/* Clear all clusters that were truncated */
	lba = 0;
	lba_count = 0;
	for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) {
		uint64_t next_lba = blob->active.clusters[i];
		uint32_t next_lba_count = _spdk_bs_cluster_to_lba(bs, 1);

		if (next_lba > 0 && (lba + lba_count) == next_lba) {
			/* This cluster is contiguous with the previous one. */
			lba_count += next_lba_count;
			continue;
		}

		/* This cluster is not contiguous with the previous one. */

		/* If a run of LBAs previously existing, clear them now */
		if (lba_count > 0) {
			spdk_bs_batch_clear_dev(ctx, batch, lba, lba_count);
		}

		/* Start building the next batch */
		lba = next_lba;
		if (next_lba > 0) {
			lba_count = next_lba_count;
		} else {
			/* Unallocated cluster - nothing to clear for it. */
			lba_count = 0;
		}
	}

	/* If we ended with a contiguous set of LBAs, clear them now */
	if (lba_count > 0) {
		spdk_bs_batch_clear_dev(ctx, batch, lba, lba_count);
	}

	spdk_bs_batch_close(batch);
}
1652 
1653 static void
1654 _spdk_blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1655 {
1656 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1657 	struct spdk_blob		*blob = ctx->blob;
1658 	struct spdk_blob_store		*bs = blob->bs;
1659 	size_t				i;
1660 
1661 	/* This loop starts at 1 because the first page is special and handled
1662 	 * below. The pages (except the first) are never written in place,
1663 	 * so any pages in the clean list must be zeroed.
1664 	 */
1665 	for (i = 1; i < blob->clean.num_pages; i++) {
1666 		_spdk_bs_release_md_page(bs, blob->clean.pages[i]);
1667 	}
1668 
1669 	if (blob->active.num_pages == 0) {
1670 		uint32_t page_num;
1671 
1672 		page_num = _spdk_bs_blobid_to_page(blob->id);
1673 		_spdk_bs_release_md_page(bs, page_num);
1674 	}
1675 
1676 	/* Move on to clearing clusters */
1677 	_spdk_blob_persist_clear_clusters(seq, ctx, 0);
1678 }
1679 
1680 static void
1681 _spdk_blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1682 {
1683 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1684 	struct spdk_blob		*blob = ctx->blob;
1685 	struct spdk_blob_store		*bs = blob->bs;
1686 	uint64_t			lba;
1687 	uint32_t			lba_count;
1688 	spdk_bs_batch_t			*batch;
1689 	size_t				i;
1690 
1691 	batch = spdk_bs_sequence_to_batch(seq, _spdk_blob_persist_zero_pages_cpl, ctx);
1692 
1693 	lba_count = _spdk_bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE);
1694 
1695 	/* This loop starts at 1 because the first page is special and handled
1696 	 * below. The pages (except the first) are never written in place,
1697 	 * so any pages in the clean list must be zeroed.
1698 	 */
1699 	for (i = 1; i < blob->clean.num_pages; i++) {
1700 		lba = _spdk_bs_md_page_to_lba(bs, blob->clean.pages[i]);
1701 
1702 		spdk_bs_batch_write_zeroes_dev(batch, lba, lba_count);
1703 	}
1704 
1705 	/* The first page will only be zeroed if this is a delete. */
1706 	if (blob->active.num_pages == 0) {
1707 		uint32_t page_num;
1708 
1709 		/* The first page in the metadata goes where the blobid indicates */
1710 		page_num = _spdk_bs_blobid_to_page(blob->id);
1711 		lba = _spdk_bs_md_page_to_lba(bs, page_num);
1712 
1713 		spdk_bs_batch_write_zeroes_dev(batch, lba, lba_count);
1714 	}
1715 
1716 	spdk_bs_batch_close(batch);
1717 }
1718 
1719 static void
1720 _spdk_blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1721 {
1722 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1723 	struct spdk_blob		*blob = ctx->blob;
1724 	struct spdk_blob_store		*bs = blob->bs;
1725 	uint64_t			lba;
1726 	uint32_t			lba_count;
1727 	struct spdk_blob_md_page	*page;
1728 
1729 	if (blob->active.num_pages == 0) {
1730 		/* Move on to the next step */
1731 		_spdk_blob_persist_zero_pages(seq, ctx, 0);
1732 		return;
1733 	}
1734 
1735 	lba_count = _spdk_bs_byte_to_lba(bs, sizeof(*page));
1736 
1737 	page = &ctx->pages[0];
1738 	/* The first page in the metadata goes where the blobid indicates */
1739 	lba = _spdk_bs_md_page_to_lba(bs, _spdk_bs_blobid_to_page(blob->id));
1740 
1741 	spdk_bs_sequence_write_dev(seq, page, lba, lba_count,
1742 				   _spdk_blob_persist_zero_pages, ctx);
1743 }
1744 
1745 static void
1746 _spdk_blob_persist_write_page_chain(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
1747 {
1748 	struct spdk_blob_persist_ctx	*ctx = cb_arg;
1749 	struct spdk_blob		*blob = ctx->blob;
1750 	struct spdk_blob_store		*bs = blob->bs;
1751 	uint64_t			lba;
1752 	uint32_t			lba_count;
1753 	struct spdk_blob_md_page	*page;
1754 	spdk_bs_batch_t			*batch;
1755 	size_t				i;
1756 
1757 	/* Clusters don't move around in blobs. The list shrinks or grows
1758 	 * at the end, but no changes ever occur in the middle of the list.
1759 	 */
1760 
1761 	lba_count = _spdk_bs_byte_to_lba(bs, sizeof(*page));
1762 
1763 	batch = spdk_bs_sequence_to_batch(seq, _spdk_blob_persist_write_page_root, ctx);
1764 
1765 	/* This starts at 1. The root page is not written until
1766 	 * all of the others are finished
1767 	 */
1768 	for (i = 1; i < blob->active.num_pages; i++) {
1769 		page = &ctx->pages[i];
1770 		assert(page->sequence_num == i);
1771 
1772 		lba = _spdk_bs_md_page_to_lba(bs, blob->active.pages[i]);
1773 
1774 		spdk_bs_batch_write_dev(batch, page, lba, lba_count);
1775 	}
1776 
1777 	spdk_bs_batch_close(batch);
1778 }
1779 
1780 static int
1781 _spdk_blob_resize(struct spdk_blob *blob, uint64_t sz)
1782 {
1783 	uint64_t	i;
1784 	uint64_t	*tmp;
1785 	uint64_t	lfc; /* lowest free cluster */
1786 	uint32_t	lfmd; /*  lowest free md page */
1787 	uint64_t	num_clusters;
1788 	uint32_t	*ep_tmp;
1789 	uint64_t	new_num_ep = 0, current_num_ep = 0;
1790 	struct spdk_blob_store *bs;
1791 
1792 	bs = blob->bs;
1793 
1794 	_spdk_blob_verify_md_op(blob);
1795 
1796 	if (blob->active.num_clusters == sz) {
1797 		return 0;
1798 	}
1799 
1800 	if (blob->active.num_clusters < blob->active.cluster_array_size) {
1801 		/* If this blob was resized to be larger, then smaller, then
1802 		 * larger without syncing, then the cluster array already
1803 		 * contains spare assigned clusters we can use.
1804 		 */
1805 		num_clusters = spdk_min(blob->active.cluster_array_size,
1806 					sz);
1807 	} else {
1808 		num_clusters = blob->active.num_clusters;
1809 	}
1810 
1811 	if (blob->use_extent_table) {
1812 		/* Round up since every cluster beyond current Extent Table size,
1813 		 * requires new extent page. */
1814 		new_num_ep = spdk_divide_round_up(sz, SPDK_EXTENTS_PER_EP);
1815 		current_num_ep = spdk_divide_round_up(num_clusters, SPDK_EXTENTS_PER_EP);
1816 	}
1817 
1818 	/* Do two passes - one to verify that we can obtain enough clusters
1819 	 * and md pages, another to actually claim them.
1820 	 */
1821 
1822 	if (spdk_blob_is_thin_provisioned(blob) == false) {
1823 		lfc = 0;
1824 		for (i = num_clusters; i < sz; i++) {
1825 			lfc = spdk_bit_array_find_first_clear(bs->used_clusters, lfc);
1826 			if (lfc == UINT32_MAX) {
1827 				/* No more free clusters. Cannot satisfy the request */
1828 				return -ENOSPC;
1829 			}
1830 			lfc++;
1831 		}
1832 		lfmd = 0;
1833 		for (i = current_num_ep; i < new_num_ep ; i++) {
1834 			lfmd = spdk_bit_array_find_first_clear(blob->bs->used_md_pages, lfmd);
1835 			if (lfmd == UINT32_MAX) {
1836 				/* No more free md pages. Cannot satisfy the request */
1837 				return -ENOSPC;
1838 			}
1839 		}
1840 	}
1841 
1842 	if (sz > num_clusters) {
1843 		/* Expand the cluster array if necessary.
1844 		 * We only shrink the array when persisting.
1845 		 */
1846 		tmp = realloc(blob->active.clusters, sizeof(*blob->active.clusters) * sz);
1847 		if (sz > 0 && tmp == NULL) {
1848 			return -ENOMEM;
1849 		}
1850 		memset(tmp + blob->active.cluster_array_size, 0,
1851 		       sizeof(*blob->active.clusters) * (sz - blob->active.cluster_array_size));
1852 		blob->active.clusters = tmp;
1853 		blob->active.cluster_array_size = sz;
1854 
1855 		/* Expand the extents table, only if enough clusters were added */
1856 		if (new_num_ep > current_num_ep && blob->use_extent_table) {
1857 			ep_tmp = realloc(blob->active.extent_pages, sizeof(*blob->active.extent_pages) * new_num_ep);
1858 			if (new_num_ep > 0 && ep_tmp == NULL) {
1859 				return -ENOMEM;
1860 			}
1861 			memset(ep_tmp + blob->active.extent_pages_array_size, 0,
1862 			       sizeof(*blob->active.extent_pages) * (new_num_ep - blob->active.extent_pages_array_size));
1863 			blob->active.extent_pages = ep_tmp;
1864 			blob->active.extent_pages_array_size = new_num_ep;
1865 		}
1866 	}
1867 
1868 	blob->state = SPDK_BLOB_STATE_DIRTY;
1869 
1870 	if (spdk_blob_is_thin_provisioned(blob) == false) {
1871 		lfc = 0;
1872 		lfmd = 0;
1873 		for (i = num_clusters; i < sz; i++) {
1874 			_spdk_bs_allocate_cluster(blob, i, &lfc, &lfmd, true);
1875 			lfc++;
1876 			lfmd++;
1877 		}
1878 	}
1879 
1880 	blob->active.num_clusters = sz;
1881 	blob->active.num_extent_pages = new_num_ep;
1882 
1883 	return 0;
1884 }
1885 
/* Serialize the blob's metadata and assign on-disk md pages to the chain.
 *
 * Two passes over the used_md_pages bitmap: the first merely verifies that
 * enough free pages exist, the second claims them and links the chain via
 * each page's 'next' field. Page crcs are computed as each page is finalized
 * (a page is complete once its 'next' is known). Writing then starts with
 * _spdk_blob_persist_write_page_chain().
 */
static void
_spdk_blob_persist_generate_new_md(struct spdk_blob_persist_ctx *ctx)
{
	spdk_bs_sequence_t *seq = ctx->seq;
	struct spdk_blob *blob = ctx->blob;
	struct spdk_blob_store *bs = blob->bs;
	uint64_t i;
	uint32_t page_num;
	void *tmp;
	int rc;

	/* Generate the new metadata */
	rc = _spdk_blob_serialize(blob, &ctx->pages, &blob->active.num_pages);
	if (rc < 0) {
		_spdk_blob_persist_complete(seq, ctx, rc);
		return;
	}

	assert(blob->active.num_pages >= 1);

	/* Resize the cache of page indices */
	tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages));
	if (!tmp) {
		_spdk_blob_persist_complete(seq, ctx, -ENOMEM);
		return;
	}
	blob->active.pages = tmp;

	/* Assign this metadata to pages. This requires two passes -
	 * one to verify that there are enough pages and a second
	 * to actually claim them. */
	page_num = 0;
	/* Note that this loop starts at one. The first page location is fixed by the blobid. */
	for (i = 1; i < blob->active.num_pages; i++) {
		page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
		if (page_num == UINT32_MAX) {
			_spdk_blob_persist_complete(seq, ctx, -ENOMEM);
			return;
		}
		page_num++;
	}

	/* Second pass: claim the pages and link each page to its successor. */
	page_num = 0;
	blob->active.pages[0] = _spdk_bs_blobid_to_page(blob->id);
	for (i = 1; i < blob->active.num_pages; i++) {
		page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num);
		ctx->pages[i - 1].next = page_num;
		/* Now that previous metadata page is complete, calculate the crc for it. */
		ctx->pages[i - 1].crc = _spdk_blob_md_page_calc_crc(&ctx->pages[i - 1]);
		blob->active.pages[i] = page_num;
		_spdk_bs_claim_md_page(bs, page_num);
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming page %u for blob %lu\n", page_num, blob->id);
		page_num++;
	}
	/* Finalize the crc of the last page (or the only page when num_pages == 1,
	 * in which case the loop above did not run and i == 1 here). */
	ctx->pages[i - 1].crc = _spdk_blob_md_page_calc_crc(&ctx->pages[i - 1]);
	/* Start writing the metadata from last page to first */
	blob->state = SPDK_BLOB_STATE_CLEAN;
	_spdk_blob_persist_write_page_chain(seq, ctx, 0);
}
1945 
1946 static void _spdk_blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg,
1947 		int bserrno);
1948 
/* Serialize and write a single extent page for the cluster range starting at
 * cluster_num, then continue walking the extent-page table by installing
 * _spdk_blob_persist_write_extent_pages as the write completion.
 *
 * extent is the metadata page index the extent page occupies on disk.  The
 * serialized page buffer lives in ctx->extent_page and is freed by the
 * completion callback. */
static void
_spdk_blob_persist_write_extent_page(uint32_t extent, uint64_t cluster_num,
				     struct spdk_blob_persist_ctx *ctx)
{
	spdk_bs_sequence_t		*seq = ctx->seq;
	uint32_t                        page_count = 0;
	struct spdk_blob		*blob = ctx->blob;
	int				rc;

	/* Allocate a zeroed md page buffer into ctx->extent_page.  A negative
	 * rc is an allocation failure; it is only asserted here, so on a
	 * release build the persist sequence would silently stall —
	 * NOTE(review): confirm this is acceptable on this path. */
	rc = _spdk_blob_serialize_add_page(blob, &ctx->extent_page, &page_count, &ctx->extent_page);
	if (rc < 0) {
		assert(false);
		return;
	}

	_spdk_blob_serialize_extent_page(blob, cluster_num, ctx->extent_page);

	/* CRC must be computed last, over the fully serialized page. */
	ctx->extent_page->crc = _spdk_blob_md_page_calc_crc(ctx->extent_page);

	spdk_bs_sequence_write_dev(seq, ctx->extent_page, _spdk_bs_md_page_to_lba(blob->bs, extent),
				   _spdk_bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
				   _spdk_blob_persist_write_extent_pages, ctx);
}
1972 
/* Write completion / iterator for persisting extent pages.  Scans the active
 * extent-page table starting at ctx->next_extent_page, issues a write for the
 * first page that is new relative to the clean (on-disk) state, and re-enters
 * itself from that write's completion.  When no pages remain it proceeds to
 * generating the new blob metadata.
 * NOTE(review): bserrno from the previous write is not examined here —
 * confirm device write errors are surfaced elsewhere in the sequence. */
static void
_spdk_blob_persist_write_extent_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob_persist_ctx	*ctx = cb_arg;
	struct spdk_blob		*blob = ctx->blob;
	size_t				i;
	uint32_t			extent_page_id;

	/* Free the buffer serialized for the previous pass, if any. */
	if (ctx->extent_page != NULL) {
		spdk_free(ctx->extent_page);
		ctx->extent_page = NULL;
	}

	/* Only write out changed extent pages */
	for (i = ctx->next_extent_page; i < blob->active.num_extent_pages; i++) {
		extent_page_id = blob->active.extent_pages[i];
		if (extent_page_id == 0) {
			/* No Extent Page to persist */
			assert(spdk_blob_is_thin_provisioned(blob));
			continue;
		}
		/* Writing out new extent page for the first time. Either active extent pages is larger
		 * than clean extent pages or there was no extent page assigned due to thin provisioning. */
		if (i >= blob->clean.extent_pages_array_size || blob->clean.extent_pages[i] == 0) {
			assert(spdk_bit_array_get(blob->bs->used_md_pages, extent_page_id));
			/* Record where to resume once this write completes. */
			ctx->next_extent_page = i + 1;
			_spdk_blob_persist_write_extent_page(extent_page_id, i * SPDK_EXTENTS_PER_EP, ctx);
			return;
		}
		assert(blob->clean.extent_pages[i] != 0);
	}

	/* All extent pages are on disk; move on to the blob metadata itself. */
	_spdk_blob_persist_generate_new_md(ctx);
}
2007 
2008 static void
2009 _spdk_blob_persist_start(struct spdk_blob_persist_ctx *ctx)
2010 {
2011 	spdk_bs_sequence_t *seq = ctx->seq;
2012 	struct spdk_blob *blob = ctx->blob;
2013 
2014 	if (blob->active.num_pages == 0) {
2015 		/* This is the signal that the blob should be deleted.
2016 		 * Immediately jump to the clean up routine. */
2017 		assert(blob->clean.num_pages > 0);
2018 		blob->state = SPDK_BLOB_STATE_CLEAN;
2019 		_spdk_blob_persist_zero_pages(seq, ctx, 0);
2020 		return;
2021 
2022 	}
2023 
2024 	_spdk_blob_persist_write_extent_pages(seq, ctx, 0);
2025 }
2026 
2027 static void
2028 _spdk_blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2029 {
2030 	struct spdk_blob_persist_ctx *ctx = cb_arg;
2031 
2032 	ctx->blob->bs->clean = 0;
2033 
2034 	spdk_free(ctx->super);
2035 
2036 	_spdk_blob_persist_start(ctx);
2037 }
2038 
2039 static void
2040 _spdk_bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
2041 		     struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg);
2042 
2043 
2044 static void
2045 _spdk_blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2046 {
2047 	struct spdk_blob_persist_ctx *ctx = cb_arg;
2048 
2049 	ctx->super->clean = 0;
2050 	if (ctx->super->size == 0) {
2051 		ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen;
2052 	}
2053 
2054 	_spdk_bs_write_super(seq, ctx->blob->bs, ctx->super, _spdk_blob_persist_dirty_cpl, ctx);
2055 }
2056 
2057 
2058 /* Write a blob to disk */
2059 static void
2060 _spdk_blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob,
2061 		   spdk_bs_sequence_cpl cb_fn, void *cb_arg)
2062 {
2063 	struct spdk_blob_persist_ctx *ctx;
2064 
2065 	_spdk_blob_verify_md_op(blob);
2066 
2067 	if (blob->state == SPDK_BLOB_STATE_CLEAN) {
2068 		cb_fn(seq, cb_arg, 0);
2069 		return;
2070 	}
2071 
2072 	ctx = calloc(1, sizeof(*ctx));
2073 	if (!ctx) {
2074 		cb_fn(seq, cb_arg, -ENOMEM);
2075 		return;
2076 	}
2077 	ctx->blob = blob;
2078 	ctx->seq = seq;
2079 	ctx->cb_fn = cb_fn;
2080 	ctx->cb_arg = cb_arg;
2081 	ctx->next_extent_page = 0;
2082 
2083 	if (blob->bs->clean) {
2084 		ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
2085 					  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2086 		if (!ctx->super) {
2087 			cb_fn(seq, cb_arg, -ENOMEM);
2088 			free(ctx);
2089 			return;
2090 		}
2091 
2092 		spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(blob->bs, 0),
2093 					  _spdk_bs_byte_to_lba(blob->bs, sizeof(*ctx->super)),
2094 					  _spdk_blob_persist_dirty, ctx);
2095 	} else {
2096 		_spdk_blob_persist_start(ctx);
2097 	}
2098 }
2099 
/* Context for the copy-on-write cluster allocation path
 * (_spdk_bs_allocate_and_copy_cluster and its completions). */
struct spdk_blob_copy_cluster_ctx {
	struct spdk_blob *blob;
	uint8_t *buf;			/* DMA buffer for one cluster; only allocated when the blob has a parent */
	uint64_t page;			/* first page of the cluster being copied */
	uint64_t new_cluster;		/* newly allocated cluster index */
	uint32_t new_extent_page;	/* md page claimed for a new extent page, or 0 */
	spdk_bs_sequence_t *seq;	/* sequence driving read -> write -> md insert */
};
2108 
/* Final completion of the allocate-and-copy-cluster sequence.  Re-executes
 * (on success) or aborts (on error) every user op that was queued on this
 * channel while the allocation was in flight, then frees the context. */
static void
_spdk_blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno)
{
	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
	struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq;
	TAILQ_HEAD(, spdk_bs_request_set) requests;
	spdk_bs_user_op_t *op;

	/* Detach the whole pending list onto the stack first, so re-executed
	 * ops below may safely queue fresh entries on the channel list. */
	TAILQ_INIT(&requests);
	TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link);

	while (!TAILQ_EMPTY(&requests)) {
		op = TAILQ_FIRST(&requests);
		TAILQ_REMOVE(&requests, op, link);
		if (bserrno == 0) {
			spdk_bs_user_op_execute(op);
		} else {
			spdk_bs_user_op_abort(op);
		}
	}

	spdk_free(ctx->buf);
	free(ctx);
}
2133 
2134 static void
2135 _spdk_blob_insert_cluster_cpl(void *cb_arg, int bserrno)
2136 {
2137 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2138 
2139 	if (bserrno) {
2140 		if (bserrno == -EEXIST) {
2141 			/* The metadata insert failed because another thread
2142 			 * allocated the cluster first. Free our cluster
2143 			 * but continue without error. */
2144 			bserrno = 0;
2145 		}
2146 		_spdk_bs_release_cluster(ctx->blob->bs, ctx->new_cluster);
2147 		if (ctx->new_extent_page != 0) {
2148 			_spdk_bs_release_md_page(ctx->blob->bs, ctx->new_extent_page);
2149 		}
2150 	}
2151 
2152 	spdk_bs_sequence_finish(ctx->seq, bserrno);
2153 }
2154 
2155 static void
2156 _spdk_blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2157 {
2158 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2159 	uint32_t cluster_number;
2160 
2161 	if (bserrno) {
2162 		/* The write failed, so jump to the final completion handler */
2163 		spdk_bs_sequence_finish(seq, bserrno);
2164 		return;
2165 	}
2166 
2167 	cluster_number = _spdk_bs_page_to_cluster(ctx->blob->bs, ctx->page);
2168 
2169 	_spdk_blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
2170 					       ctx->new_extent_page, _spdk_blob_insert_cluster_cpl, ctx);
2171 }
2172 
2173 static void
2174 _spdk_blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
2175 {
2176 	struct spdk_blob_copy_cluster_ctx *ctx = cb_arg;
2177 
2178 	if (bserrno != 0) {
2179 		/* The read failed, so jump to the final completion handler */
2180 		spdk_bs_sequence_finish(seq, bserrno);
2181 		return;
2182 	}
2183 
2184 	/* Write whole cluster */
2185 	spdk_bs_sequence_write_dev(seq, ctx->buf,
2186 				   _spdk_bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster),
2187 				   _spdk_bs_cluster_to_lba(ctx->blob->bs, 1),
2188 				   _spdk_blob_write_copy_cpl, ctx);
2189 }
2190 
2191 static void
2192 _spdk_bs_allocate_and_copy_cluster(struct spdk_blob *blob,
2193 				   struct spdk_io_channel *_ch,
2194 				   uint64_t io_unit, spdk_bs_user_op_t *op)
2195 {
2196 	struct spdk_bs_cpl cpl;
2197 	struct spdk_bs_channel *ch;
2198 	struct spdk_blob_copy_cluster_ctx *ctx;
2199 	uint32_t cluster_start_page;
2200 	uint32_t cluster_number;
2201 	int rc;
2202 
2203 	ch = spdk_io_channel_get_ctx(_ch);
2204 
2205 	if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) {
2206 		/* There are already operations pending. Queue this user op
2207 		 * and return because it will be re-executed when the outstanding
2208 		 * cluster allocation completes. */
2209 		TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
2210 		return;
2211 	}
2212 
2213 	/* Round the io_unit offset down to the first page in the cluster */
2214 	cluster_start_page = _spdk_bs_io_unit_to_cluster_start(blob, io_unit);
2215 
2216 	/* Calculate which index in the metadata cluster array the corresponding
2217 	 * cluster is supposed to be at. */
2218 	cluster_number = _spdk_bs_io_unit_to_cluster_number(blob, io_unit);
2219 
2220 	ctx = calloc(1, sizeof(*ctx));
2221 	if (!ctx) {
2222 		spdk_bs_user_op_abort(op);
2223 		return;
2224 	}
2225 
2226 	assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0);
2227 
2228 	ctx->blob = blob;
2229 	ctx->page = cluster_start_page;
2230 
2231 	if (blob->parent_id != SPDK_BLOBID_INVALID) {
2232 		ctx->buf = spdk_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen,
2233 				       NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
2234 		if (!ctx->buf) {
2235 			SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n",
2236 				    blob->bs->cluster_sz);
2237 			free(ctx);
2238 			spdk_bs_user_op_abort(op);
2239 			return;
2240 		}
2241 	}
2242 
2243 	rc = _spdk_bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, &ctx->new_extent_page,
2244 				       false);
2245 	if (rc != 0) {
2246 		spdk_free(ctx->buf);
2247 		free(ctx);
2248 		spdk_bs_user_op_abort(op);
2249 		return;
2250 	}
2251 
2252 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2253 	cpl.u.blob_basic.cb_fn = _spdk_blob_allocate_and_copy_cluster_cpl;
2254 	cpl.u.blob_basic.cb_arg = ctx;
2255 
2256 	ctx->seq = spdk_bs_sequence_start(_ch, &cpl);
2257 	if (!ctx->seq) {
2258 		_spdk_bs_release_cluster(blob->bs, ctx->new_cluster);
2259 		spdk_free(ctx->buf);
2260 		free(ctx);
2261 		spdk_bs_user_op_abort(op);
2262 		return;
2263 	}
2264 
2265 	/* Queue the user op to block other incoming operations */
2266 	TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link);
2267 
2268 	if (blob->parent_id != SPDK_BLOBID_INVALID) {
2269 		/* Read cluster from backing device */
2270 		spdk_bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf,
2271 					     _spdk_bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page),
2272 					     _spdk_bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz),
2273 					     _spdk_blob_write_copy, ctx);
2274 	} else {
2275 		_spdk_blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster,
2276 						       ctx->new_extent_page, _spdk_blob_insert_cluster_cpl, ctx);
2277 	}
2278 }
2279 
2280 static void
2281 _spdk_blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length,
2282 				       uint64_t *lba,	uint32_t *lba_count)
2283 {
2284 	*lba_count = length;
2285 
2286 	if (!_spdk_bs_io_unit_is_allocated(blob, io_unit)) {
2287 		assert(blob->back_bs_dev != NULL);
2288 		*lba = _spdk_bs_io_unit_to_back_dev_lba(blob, io_unit);
2289 		*lba_count = _spdk_bs_io_unit_to_back_dev_lba(blob, *lba_count);
2290 	} else {
2291 		*lba = _spdk_bs_blob_io_unit_to_lba(blob, io_unit);
2292 	}
2293 }
2294 
/* Context for splitting a blob I/O that crosses cluster boundaries into
 * per-cluster sub-operations (see _spdk_blob_request_submit_op_split). */
struct op_split_ctx {
	struct spdk_blob *blob;
	struct spdk_io_channel *channel;
	uint64_t io_unit_offset;	/* offset of the next sub-operation */
	uint64_t io_units_remaining;	/* io units not yet submitted */
	void *curr_payload;		/* payload cursor; advanced for read/write only */
	enum spdk_blob_op_type op_type;
	spdk_bs_sequence_t *seq;	/* sequence finished when the split completes */
};
2304 
/* Issue the next cluster-bounded sub-operation of a split request, or finish
 * the sequence when an error occurred or nothing remains.  Re-entered as the
 * completion callback of each sub-operation.
 *
 * Note the locals (buf, offset, length) are captured BEFORE ctx is advanced,
 * so this iteration uses the pre-advance values while ctx already points at
 * the following chunk. */
static void
_spdk_blob_request_submit_op_split_next(void *cb_arg, int bserrno)
{
	struct op_split_ctx	*ctx = cb_arg;
	struct spdk_blob	*blob = ctx->blob;
	struct spdk_io_channel	*ch = ctx->channel;
	enum spdk_blob_op_type	op_type = ctx->op_type;
	uint8_t			*buf = ctx->curr_payload;
	uint64_t		offset = ctx->io_unit_offset;
	uint64_t		length = ctx->io_units_remaining;
	uint64_t		op_length;

	/* Done: either the previous sub-op failed or everything is submitted. */
	if (bserrno != 0 || ctx->io_units_remaining == 0) {
		spdk_bs_sequence_finish(ctx->seq, bserrno);
		free(ctx);
		return;
	}

	/* Clamp this sub-operation at the next cluster boundary. */
	op_length = spdk_min(length, _spdk_bs_num_io_units_to_cluster_boundary(blob,
			     offset));

	/* Update length and payload for next operation */
	ctx->io_units_remaining -= op_length;
	ctx->io_unit_offset += op_length;
	if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) {
		/* Only data-carrying ops advance the payload cursor. */
		ctx->curr_payload += op_length * blob->bs->io_unit_size;
	}

	switch (op_type) {
	case SPDK_BLOB_READ:
		spdk_blob_io_read(blob, ch, buf, offset, op_length,
				  _spdk_blob_request_submit_op_split_next, ctx);
		break;
	case SPDK_BLOB_WRITE:
		spdk_blob_io_write(blob, ch, buf, offset, op_length,
				   _spdk_blob_request_submit_op_split_next, ctx);
		break;
	case SPDK_BLOB_UNMAP:
		spdk_blob_io_unmap(blob, ch, offset, op_length,
				   _spdk_blob_request_submit_op_split_next, ctx);
		break;
	case SPDK_BLOB_WRITE_ZEROES:
		spdk_blob_io_write_zeroes(blob, ch, offset, op_length,
					  _spdk_blob_request_submit_op_split_next, ctx);
		break;
	case SPDK_BLOB_READV:
	case SPDK_BLOB_WRITEV:
		/* iov ops are split by _spdk_rw_iov_split_next, never here. */
		SPDK_ERRLOG("readv/write not valid\n");
		spdk_bs_sequence_finish(ctx->seq, -EINVAL);
		free(ctx);
		break;
	}
}
2358 
2359 static void
2360 _spdk_blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob,
2361 				   void *payload, uint64_t offset, uint64_t length,
2362 				   spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
2363 {
2364 	struct op_split_ctx *ctx;
2365 	spdk_bs_sequence_t *seq;
2366 	struct spdk_bs_cpl cpl;
2367 
2368 	assert(blob != NULL);
2369 
2370 	ctx = calloc(1, sizeof(struct op_split_ctx));
2371 	if (ctx == NULL) {
2372 		cb_fn(cb_arg, -ENOMEM);
2373 		return;
2374 	}
2375 
2376 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
2377 	cpl.u.blob_basic.cb_fn = cb_fn;
2378 	cpl.u.blob_basic.cb_arg = cb_arg;
2379 
2380 	seq = spdk_bs_sequence_start(ch, &cpl);
2381 	if (!seq) {
2382 		free(ctx);
2383 		cb_fn(cb_arg, -ENOMEM);
2384 		return;
2385 	}
2386 
2387 	ctx->blob = blob;
2388 	ctx->channel = ch;
2389 	ctx->curr_payload = payload;
2390 	ctx->io_unit_offset = offset;
2391 	ctx->io_units_remaining = length;
2392 	ctx->op_type = op_type;
2393 	ctx->seq = seq;
2394 
2395 	_spdk_blob_request_submit_op_split_next(ctx, 0);
2396 }
2397 
/* Submit a blob I/O that fits within a single cluster.
 *
 * Frozen blobs get the op queued on the channel until thawed.  Reads and
 * writes to unallocated clusters are redirected: reads go to the backing
 * device, writes trigger copy-on-write cluster allocation.  iov ops must
 * never reach this function. */
static void
_spdk_blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob,
				    void *payload, uint64_t offset, uint64_t length,
				    spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
{
	struct spdk_bs_cpl cpl;
	uint64_t lba;
	uint32_t lba_count;

	assert(blob != NULL);

	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
	cpl.u.blob_basic.cb_fn = cb_fn;
	cpl.u.blob_basic.cb_arg = cb_arg;

	_spdk_blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);

	if (blob->frozen_refcnt) {
		/* This blob I/O is frozen */
		spdk_bs_user_op_t *op;
		struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch);

		op = spdk_bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
		if (!op) {
			cb_fn(cb_arg, -ENOMEM);
			return;
		}

		/* Held here until the blob is unfrozen; see _spdk_bs_channel_destroy
		 * for the abort path. */
		TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);

		return;
	}

	switch (op_type) {
	case SPDK_BLOB_READ: {
		spdk_bs_batch_t *batch;

		batch = spdk_bs_batch_open(_ch, &cpl);
		if (!batch) {
			cb_fn(cb_arg, -ENOMEM);
			return;
		}

		if (_spdk_bs_io_unit_is_allocated(blob, offset)) {
			/* Read from the blob */
			spdk_bs_batch_read_dev(batch, payload, lba, lba_count);
		} else {
			/* Read from the backing block device */
			spdk_bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count);
		}

		spdk_bs_batch_close(batch);
		break;
	}
	case SPDK_BLOB_WRITE:
	case SPDK_BLOB_WRITE_ZEROES: {
		if (_spdk_bs_io_unit_is_allocated(blob, offset)) {
			/* Write to the blob */
			spdk_bs_batch_t *batch;

			/* Zero-length write: complete immediately. */
			if (lba_count == 0) {
				cb_fn(cb_arg, 0);
				return;
			}

			batch = spdk_bs_batch_open(_ch, &cpl);
			if (!batch) {
				cb_fn(cb_arg, -ENOMEM);
				return;
			}

			if (op_type == SPDK_BLOB_WRITE) {
				spdk_bs_batch_write_dev(batch, payload, lba, lba_count);
			} else {
				spdk_bs_batch_write_zeroes_dev(batch, lba, lba_count);
			}

			spdk_bs_batch_close(batch);
		} else {
			/* Queue this operation and allocate the cluster */
			spdk_bs_user_op_t *op;

			op = spdk_bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length);
			if (!op) {
				cb_fn(cb_arg, -ENOMEM);
				return;
			}

			/* Copy-on-write: the op re-executes once the cluster exists. */
			_spdk_bs_allocate_and_copy_cluster(blob, _ch, offset, op);
		}
		break;
	}
	case SPDK_BLOB_UNMAP: {
		spdk_bs_batch_t *batch;

		batch = spdk_bs_batch_open(_ch, &cpl);
		if (!batch) {
			cb_fn(cb_arg, -ENOMEM);
			return;
		}

		/* Unmapping an unallocated cluster is a no-op. */
		if (_spdk_bs_io_unit_is_allocated(blob, offset)) {
			spdk_bs_batch_unmap_dev(batch, lba, lba_count);
		}

		spdk_bs_batch_close(batch);
		break;
	}
	case SPDK_BLOB_READV:
	case SPDK_BLOB_WRITEV:
		SPDK_ERRLOG("readv/write not valid\n");
		cb_fn(cb_arg, -EINVAL);
		break;
	}
}
2513 
2514 static void
2515 _spdk_blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel,
2516 			     void *payload, uint64_t offset, uint64_t length,
2517 			     spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type)
2518 {
2519 	assert(blob != NULL);
2520 
2521 	if (blob->data_ro && op_type != SPDK_BLOB_READ) {
2522 		cb_fn(cb_arg, -EPERM);
2523 		return;
2524 	}
2525 
2526 	if (offset + length > _spdk_bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
2527 		cb_fn(cb_arg, -EINVAL);
2528 		return;
2529 	}
2530 	if (length <= _spdk_bs_num_io_units_to_cluster_boundary(blob, offset)) {
2531 		_spdk_blob_request_submit_op_single(_channel, blob, payload, offset, length,
2532 						    cb_fn, cb_arg, op_type);
2533 	} else {
2534 		_spdk_blob_request_submit_op_split(_channel, blob, payload, offset, length,
2535 						   cb_fn, cb_arg, op_type);
2536 	}
2537 }
2538 
/* Context for splitting an iov-based blob I/O across cluster boundaries
 * (see _spdk_rw_iov_split_next). */
struct rw_iov_ctx {
	struct spdk_blob *blob;
	struct spdk_io_channel *channel;
	spdk_blob_op_complete cb_fn;
	void *cb_arg;
	bool read;			/* true for readv, false for writev */
	int iovcnt;			/* number of entries in orig_iov (and capacity of iov[]) */
	struct iovec *orig_iov;		/* caller's iov array, never modified */
	uint64_t io_unit_offset;	/* offset of the next sub-operation */
	uint64_t io_units_remaining;	/* io units not yet submitted */
	uint64_t io_units_done;		/* io units already submitted */
	struct iovec iov[0];		/* scratch iov array for each sub-operation */
};
2552 
/* Completion for the single-cluster iov path: no per-request context is
 * allocated there, so cb_arg must be NULL; just finish the sequence. */
static void
_spdk_rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	assert(cb_arg == NULL);
	spdk_bs_sequence_finish(seq, bserrno);
}
2559 
/* Issue the next cluster-bounded sub-operation of an iov-based split request.
 * Re-entered as the completion callback of each sub-operation.  Each pass
 * re-derives where the previous sub-ops left off inside the caller's iov
 * array and builds a fresh sub-iov array in ctx->iov. */
static void
_spdk_rw_iov_split_next(void *cb_arg, int bserrno)
{
	struct rw_iov_ctx *ctx = cb_arg;
	struct spdk_blob *blob = ctx->blob;
	struct iovec *iov, *orig_iov;
	int iovcnt;
	size_t orig_iovoff;
	uint64_t io_units_count, io_units_to_boundary, io_unit_offset;
	uint64_t byte_count;

	/* Done: either the previous sub-op failed or everything is submitted. */
	if (bserrno != 0 || ctx->io_units_remaining == 0) {
		ctx->cb_fn(ctx->cb_arg, bserrno);
		free(ctx);
		return;
	}

	/* Clamp this sub-operation at the next cluster boundary. */
	io_unit_offset = ctx->io_unit_offset;
	io_units_to_boundary = _spdk_bs_num_io_units_to_cluster_boundary(blob, io_unit_offset);
	io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary);
	/*
	 * Get index and offset into the original iov array for our current position in the I/O sequence.
	 *  byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will
	 *  point to the current position in the I/O sequence.
	 */
	byte_count = ctx->io_units_done * blob->bs->io_unit_size;
	orig_iov = &ctx->orig_iov[0];
	orig_iovoff = 0;
	while (byte_count > 0) {
		if (byte_count >= orig_iov->iov_len) {
			byte_count -= orig_iov->iov_len;
			orig_iov++;
		} else {
			orig_iovoff = byte_count;
			byte_count = 0;
		}
	}

	/*
	 * Build an iov array for the next I/O in the sequence.  byte_count will keep track of how many
	 *  bytes of this next I/O remain to be accounted for in the new iov array.
	 */
	byte_count = io_units_count * blob->bs->io_unit_size;
	iov = &ctx->iov[0];
	iovcnt = 0;
	while (byte_count > 0) {
		/* The sub-iov can never need more entries than the original. */
		assert(iovcnt < ctx->iovcnt);
		iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff);
		iov->iov_base = orig_iov->iov_base + orig_iovoff;
		byte_count -= iov->iov_len;
		orig_iovoff = 0;
		orig_iov++;
		iov++;
		iovcnt++;
	}

	/* Advance the split state before submitting — this function is the
	 * completion callback of the submission below. */
	ctx->io_unit_offset += io_units_count;
	ctx->io_units_remaining -= io_units_count;
	ctx->io_units_done += io_units_count;
	iov = &ctx->iov[0];

	if (ctx->read) {
		spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
				   io_units_count, _spdk_rw_iov_split_next, ctx);
	} else {
		spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset,
				    io_units_count, _spdk_rw_iov_split_next, ctx);
	}
}
2629 
/* Submit an iov-based read or write to a blob.
 *
 * Requests within one cluster take a direct sequence path (with frozen-I/O
 * queueing and copy-on-write allocation as needed); requests that cross a
 * cluster boundary are handed to the iov splitter.  cb_fn receives 0 on
 * success, -EPERM for writes to a read-only blob, -EINVAL for out-of-range
 * I/O, or -ENOMEM on allocation failure. */
static void
_spdk_blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel,
				 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
				 spdk_blob_op_complete cb_fn, void *cb_arg, bool read)
{
	struct spdk_bs_cpl	cpl;

	assert(blob != NULL);

	if (!read && blob->data_ro) {
		cb_fn(cb_arg, -EPERM);
		return;
	}

	if (length == 0) {
		cb_fn(cb_arg, 0);
		return;
	}

	if (offset + length > _spdk_bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) {
		cb_fn(cb_arg, -EINVAL);
		return;
	}

	/*
	 * For now, we implement readv/writev using a sequence (instead of a batch) to account for having
	 *  to split a request that spans a cluster boundary.  For I/O that do not span a cluster boundary,
	 *  there will be no noticeable difference compared to using a batch.  For I/O that do span a cluster
	 *  boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need
	 *  to allocate a separate iov array and split the I/O such that none of the resulting
	 *  smaller I/O cross a cluster boundary.  These smaller I/O will be issued in sequence (not in parallel)
	 *  but since this case happens very infrequently, any performance impact will be negligible.
	 *
	 * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs
	 *  for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them
	 *  in a batch.  That would also require creating an intermediate spdk_bs_cpl that would get called
	 *  when the batch was completed, to allow for freeing the memory for the iov arrays.
	 */
	if (spdk_likely(length <= _spdk_bs_num_io_units_to_cluster_boundary(blob, offset))) {
		uint32_t lba_count;
		uint64_t lba;

		cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
		cpl.u.blob_basic.cb_fn = cb_fn;
		cpl.u.blob_basic.cb_arg = cb_arg;

		if (blob->frozen_refcnt) {
			/* This blob I/O is frozen */
			enum spdk_blob_op_type op_type;
			spdk_bs_user_op_t *op;
			struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel);

			op_type = read ? SPDK_BLOB_READV : SPDK_BLOB_WRITEV;
			op = spdk_bs_user_op_alloc(_channel, &cpl, op_type, blob, iov, iovcnt, offset, length);
			if (!op) {
				cb_fn(cb_arg, -ENOMEM);
				return;
			}

			/* Held here until the blob is unfrozen. */
			TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link);

			return;
		}

		_spdk_blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count);

		if (read) {
			spdk_bs_sequence_t *seq;

			seq = spdk_bs_sequence_start(_channel, &cpl);
			if (!seq) {
				cb_fn(cb_arg, -ENOMEM);
				return;
			}

			if (_spdk_bs_io_unit_is_allocated(blob, offset)) {
				spdk_bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, _spdk_rw_iov_done, NULL);
			} else {
				/* Unallocated cluster: read from the backing device. */
				spdk_bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count,
							      _spdk_rw_iov_done, NULL);
			}
		} else {
			if (_spdk_bs_io_unit_is_allocated(blob, offset)) {
				spdk_bs_sequence_t *seq;

				seq = spdk_bs_sequence_start(_channel, &cpl);
				if (!seq) {
					cb_fn(cb_arg, -ENOMEM);
					return;
				}

				spdk_bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, _spdk_rw_iov_done, NULL);
			} else {
				/* Queue this operation and allocate the cluster */
				spdk_bs_user_op_t *op;

				op = spdk_bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset,
							   length);
				if (!op) {
					cb_fn(cb_arg, -ENOMEM);
					return;
				}

				/* Copy-on-write: the op re-executes once the cluster exists. */
				_spdk_bs_allocate_and_copy_cluster(blob, _channel, offset, op);
			}
		}
	} else {
		struct rw_iov_ctx *ctx;

		/* ctx->iov[] is a flexible array; reserve one slot per caller iov. */
		ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec));
		if (ctx == NULL) {
			cb_fn(cb_arg, -ENOMEM);
			return;
		}

		ctx->blob = blob;
		ctx->channel = _channel;
		ctx->cb_fn = cb_fn;
		ctx->cb_arg = cb_arg;
		ctx->read = read;
		ctx->orig_iov = iov;
		ctx->iovcnt = iovcnt;
		ctx->io_unit_offset = offset;
		ctx->io_units_remaining = length;
		ctx->io_units_done = 0;

		_spdk_rw_iov_split_next(ctx, 0);
	}
}
2759 
2760 static struct spdk_blob *
2761 _spdk_blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid)
2762 {
2763 	struct spdk_blob *blob;
2764 
2765 	TAILQ_FOREACH(blob, &bs->blobs, link) {
2766 		if (blob->id == blobid) {
2767 			return blob;
2768 		}
2769 	}
2770 
2771 	return NULL;
2772 }
2773 
2774 static void
2775 _spdk_blob_get_snapshot_and_clone_entries(struct spdk_blob *blob,
2776 		struct spdk_blob_list **snapshot_entry, struct spdk_blob_list **clone_entry)
2777 {
2778 	assert(blob != NULL);
2779 	*snapshot_entry = NULL;
2780 	*clone_entry = NULL;
2781 
2782 	if (blob->parent_id == SPDK_BLOBID_INVALID) {
2783 		return;
2784 	}
2785 
2786 	TAILQ_FOREACH(*snapshot_entry, &blob->bs->snapshots, link) {
2787 		if ((*snapshot_entry)->id == blob->parent_id) {
2788 			break;
2789 		}
2790 	}
2791 
2792 	if (*snapshot_entry != NULL) {
2793 		TAILQ_FOREACH(*clone_entry, &(*snapshot_entry)->clones, link) {
2794 			if ((*clone_entry)->id == blob->id) {
2795 				break;
2796 			}
2797 		}
2798 
2799 		assert(clone_entry != NULL);
2800 	}
2801 }
2802 
2803 static int
2804 _spdk_bs_channel_create(void *io_device, void *ctx_buf)
2805 {
2806 	struct spdk_blob_store		*bs = io_device;
2807 	struct spdk_bs_channel		*channel = ctx_buf;
2808 	struct spdk_bs_dev		*dev;
2809 	uint32_t			max_ops = bs->max_channel_ops;
2810 	uint32_t			i;
2811 
2812 	dev = bs->dev;
2813 
2814 	channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set));
2815 	if (!channel->req_mem) {
2816 		return -1;
2817 	}
2818 
2819 	TAILQ_INIT(&channel->reqs);
2820 
2821 	for (i = 0; i < max_ops; i++) {
2822 		TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link);
2823 	}
2824 
2825 	channel->bs = bs;
2826 	channel->dev = dev;
2827 	channel->dev_channel = dev->create_channel(dev);
2828 
2829 	if (!channel->dev_channel) {
2830 		SPDK_ERRLOG("Failed to create device channel.\n");
2831 		free(channel->req_mem);
2832 		return -1;
2833 	}
2834 
2835 	TAILQ_INIT(&channel->need_cluster_alloc);
2836 	TAILQ_INIT(&channel->queued_io);
2837 
2838 	return 0;
2839 }
2840 
2841 static void
2842 _spdk_bs_channel_destroy(void *io_device, void *ctx_buf)
2843 {
2844 	struct spdk_bs_channel *channel = ctx_buf;
2845 	spdk_bs_user_op_t *op;
2846 
2847 	while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) {
2848 		op = TAILQ_FIRST(&channel->need_cluster_alloc);
2849 		TAILQ_REMOVE(&channel->need_cluster_alloc, op, link);
2850 		spdk_bs_user_op_abort(op);
2851 	}
2852 
2853 	while (!TAILQ_EMPTY(&channel->queued_io)) {
2854 		op = TAILQ_FIRST(&channel->queued_io);
2855 		TAILQ_REMOVE(&channel->queued_io, op, link);
2856 		spdk_bs_user_op_abort(op);
2857 	}
2858 
2859 	free(channel->req_mem);
2860 	channel->dev->destroy_channel(channel->dev, channel->dev_channel);
2861 }
2862 
2863 static void
2864 _spdk_bs_dev_destroy(void *io_device)
2865 {
2866 	struct spdk_blob_store *bs = io_device;
2867 	struct spdk_blob	*blob, *blob_tmp;
2868 
2869 	bs->dev->destroy(bs->dev);
2870 
2871 	TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) {
2872 		TAILQ_REMOVE(&bs->blobs, blob, link);
2873 		_spdk_blob_free(blob);
2874 	}
2875 
2876 	pthread_mutex_destroy(&bs->used_clusters_mutex);
2877 
2878 	spdk_bit_array_free(&bs->used_blobids);
2879 	spdk_bit_array_free(&bs->used_md_pages);
2880 	spdk_bit_array_free(&bs->used_clusters);
2881 	/*
2882 	 * If this function is called for any reason except a successful unload,
2883 	 * the unload_cpl type will be NONE and this will be a nop.
2884 	 */
2885 	spdk_bs_call_cpl(&bs->unload_cpl, bs->unload_err);
2886 
2887 	free(bs);
2888 }
2889 
2890 static int
2891 _spdk_bs_blob_list_add(struct spdk_blob *blob)
2892 {
2893 	spdk_blob_id snapshot_id;
2894 	struct spdk_blob_list *snapshot_entry = NULL;
2895 	struct spdk_blob_list *clone_entry = NULL;
2896 
2897 	assert(blob != NULL);
2898 
2899 	snapshot_id = blob->parent_id;
2900 	if (snapshot_id == SPDK_BLOBID_INVALID) {
2901 		return 0;
2902 	}
2903 
2904 	snapshot_entry = _spdk_bs_get_snapshot_entry(blob->bs, snapshot_id);
2905 	if (snapshot_entry == NULL) {
2906 		/* Snapshot not found */
2907 		snapshot_entry = calloc(1, sizeof(struct spdk_blob_list));
2908 		if (snapshot_entry == NULL) {
2909 			return -ENOMEM;
2910 		}
2911 		snapshot_entry->id = snapshot_id;
2912 		TAILQ_INIT(&snapshot_entry->clones);
2913 		TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link);
2914 	} else {
2915 		TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
2916 			if (clone_entry->id == blob->id) {
2917 				break;
2918 			}
2919 		}
2920 	}
2921 
2922 	if (clone_entry == NULL) {
2923 		/* Clone not found */
2924 		clone_entry = calloc(1, sizeof(struct spdk_blob_list));
2925 		if (clone_entry == NULL) {
2926 			return -ENOMEM;
2927 		}
2928 		clone_entry->id = blob->id;
2929 		TAILQ_INIT(&clone_entry->clones);
2930 		TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link);
2931 		snapshot_entry->clone_count++;
2932 	}
2933 
2934 	return 0;
2935 }
2936 
2937 static void
2938 _spdk_bs_blob_list_remove(struct spdk_blob *blob)
2939 {
2940 	struct spdk_blob_list *snapshot_entry = NULL;
2941 	struct spdk_blob_list *clone_entry = NULL;
2942 
2943 	_spdk_blob_get_snapshot_and_clone_entries(blob, &snapshot_entry, &clone_entry);
2944 
2945 	if (snapshot_entry == NULL) {
2946 		return;
2947 	}
2948 
2949 	blob->parent_id = SPDK_BLOBID_INVALID;
2950 	TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
2951 	free(clone_entry);
2952 
2953 	snapshot_entry->clone_count--;
2954 }
2955 
2956 static int
2957 _spdk_bs_blob_list_free(struct spdk_blob_store *bs)
2958 {
2959 	struct spdk_blob_list *snapshot_entry;
2960 	struct spdk_blob_list *snapshot_entry_tmp;
2961 	struct spdk_blob_list *clone_entry;
2962 	struct spdk_blob_list *clone_entry_tmp;
2963 
2964 	TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, snapshot_entry_tmp) {
2965 		TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) {
2966 			TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
2967 			free(clone_entry);
2968 		}
2969 		TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link);
2970 		free(snapshot_entry);
2971 	}
2972 
2973 	return 0;
2974 }
2975 
/*
 * Release a blobstore: drop the snapshot/clone tracking lists, then
 * unregister the metadata thread and the io_device.  Final teardown of
 * the bs structure is deferred to the unregister callback
 * (_spdk_bs_dev_destroy).
 */
static void
_spdk_bs_free(struct spdk_blob_store *bs)
{
	_spdk_bs_blob_list_free(bs);

	spdk_bs_unregister_md_thread(bs);
	spdk_io_device_unregister(bs, _spdk_bs_dev_destroy);
}
2984 
2985 void
2986 spdk_bs_opts_init(struct spdk_bs_opts *opts)
2987 {
2988 	opts->cluster_sz = SPDK_BLOB_OPTS_CLUSTER_SZ;
2989 	opts->num_md_pages = SPDK_BLOB_OPTS_NUM_MD_PAGES;
2990 	opts->max_md_ops = SPDK_BLOB_OPTS_MAX_MD_OPS;
2991 	opts->max_channel_ops = SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS;
2992 	opts->clear_method = BS_CLEAR_WITH_UNMAP;
2993 	memset(&opts->bstype, 0, sizeof(opts->bstype));
2994 	opts->iter_cb_fn = NULL;
2995 	opts->iter_cb_arg = NULL;
2996 }
2997 
2998 static int
2999 _spdk_bs_opts_verify(struct spdk_bs_opts *opts)
3000 {
3001 	if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 ||
3002 	    opts->max_channel_ops == 0) {
3003 		SPDK_ERRLOG("Blobstore options cannot be set to 0\n");
3004 		return -1;
3005 	}
3006 
3007 	return 0;
3008 }
3009 
3010 static int
3011 _spdk_bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs)
3012 {
3013 	struct spdk_blob_store	*bs;
3014 	uint64_t dev_size;
3015 	int rc;
3016 
3017 	dev_size = dev->blocklen * dev->blockcnt;
3018 	if (dev_size < opts->cluster_sz) {
3019 		/* Device size cannot be smaller than cluster size of blobstore */
3020 		SPDK_INFOLOG(SPDK_LOG_BLOB, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n",
3021 			     dev_size, opts->cluster_sz);
3022 		return -ENOSPC;
3023 	}
3024 	if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) {
3025 		/* Cluster size cannot be smaller than page size */
3026 		SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n",
3027 			    opts->cluster_sz, SPDK_BS_PAGE_SIZE);
3028 		return -EINVAL;
3029 	}
3030 	bs = calloc(1, sizeof(struct spdk_blob_store));
3031 	if (!bs) {
3032 		return -ENOMEM;
3033 	}
3034 
3035 	TAILQ_INIT(&bs->blobs);
3036 	TAILQ_INIT(&bs->snapshots);
3037 	bs->dev = dev;
3038 	bs->md_thread = spdk_get_thread();
3039 	assert(bs->md_thread != NULL);
3040 
3041 	/*
3042 	 * Do not use _spdk_bs_lba_to_cluster() here since blockcnt may not be an
3043 	 *  even multiple of the cluster size.
3044 	 */
3045 	bs->cluster_sz = opts->cluster_sz;
3046 	bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen);
3047 	bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE;
3048 	bs->num_free_clusters = bs->total_clusters;
3049 	bs->used_clusters = spdk_bit_array_create(bs->total_clusters);
3050 	bs->io_unit_size = dev->blocklen;
3051 	if (bs->used_clusters == NULL) {
3052 		free(bs);
3053 		return -ENOMEM;
3054 	}
3055 
3056 	bs->max_channel_ops = opts->max_channel_ops;
3057 	bs->super_blob = SPDK_BLOBID_INVALID;
3058 	memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype));
3059 
3060 	/* The metadata is assumed to be at least 1 page */
3061 	bs->used_md_pages = spdk_bit_array_create(1);
3062 	bs->used_blobids = spdk_bit_array_create(0);
3063 
3064 	pthread_mutex_init(&bs->used_clusters_mutex, NULL);
3065 
3066 	spdk_io_device_register(bs, _spdk_bs_channel_create, _spdk_bs_channel_destroy,
3067 				sizeof(struct spdk_bs_channel), "blobstore");
3068 	rc = spdk_bs_register_md_thread(bs);
3069 	if (rc == -1) {
3070 		spdk_io_device_unregister(bs, NULL);
3071 		pthread_mutex_destroy(&bs->used_clusters_mutex);
3072 		spdk_bit_array_free(&bs->used_blobids);
3073 		spdk_bit_array_free(&bs->used_md_pages);
3074 		spdk_bit_array_free(&bs->used_clusters);
3075 		free(bs);
3076 		/* FIXME: this is a lie but don't know how to get a proper error code here */
3077 		return -ENOMEM;
3078 	}
3079 
3080 	*_bs = bs;
3081 	return 0;
3082 }
3083 
3084 /* START spdk_bs_load, spdk_bs_load_ctx will used for both load and unload. */
3085 
/* Context carried through the blobstore load (and unload) I/O sequences. */
struct spdk_bs_load_ctx {
	struct spdk_blob_store		*bs;		/* blobstore being loaded/unloaded */
	struct spdk_bs_super_block	*super;		/* DMA buffer holding the super block */

	struct spdk_bs_md_mask		*mask;		/* scratch DMA buffer for one on-disk mask */
	bool				in_page_chain;	/* currently following a multi-page md chain? */
	uint32_t			page_index;	/* index of the chain head during replay */
	uint32_t			cur_page;	/* md page currently being read */
	struct spdk_blob_md_page	*page;		/* DMA buffer for the current md page */

	spdk_bs_sequence_t			*seq;		/* I/O sequence driving the load */
	spdk_blob_op_with_handle_complete	iter_cb_fn;	/* per-blob callback from the load opts */
	void					*iter_cb_arg;	/* argument for iter_cb_fn */
	struct spdk_blob			*blob;		/* blob under corruption examination */
	spdk_blob_id				blobid;		/* id used to resume iteration after a delete */
};
3102 
/*
 * Abort an in-progress load: complete the I/O sequence with @bserrno,
 * then release the partially built blobstore and the load context.
 */
static void
_spdk_bs_load_ctx_fail(struct spdk_bs_load_ctx *ctx, int bserrno)
{
	/* Only valid as a failure path. */
	assert(bserrno != 0);

	spdk_free(ctx->super);
	/* NOTE(review): the sequence is finished before _spdk_bs_free() -
	 * ordering looks intentional; confirm before rearranging. */
	spdk_bs_sequence_finish(ctx->seq, bserrno);
	_spdk_bs_free(ctx->bs);
	free(ctx);
}
3113 
3114 static void
3115 _spdk_bs_set_mask(struct spdk_bit_array *array, struct spdk_bs_md_mask *mask)
3116 {
3117 	uint32_t i = 0;
3118 
3119 	while (true) {
3120 		i = spdk_bit_array_find_first_set(array, i);
3121 		if (i >= mask->length) {
3122 			break;
3123 		}
3124 		mask->mask[i / 8] |= 1U << (i % 8);
3125 		i++;
3126 	}
3127 }
3128 
3129 static int
3130 _spdk_bs_load_mask(struct spdk_bit_array **array_ptr, struct spdk_bs_md_mask *mask)
3131 {
3132 	struct spdk_bit_array *array;
3133 	uint32_t i;
3134 
3135 	if (spdk_bit_array_resize(array_ptr, mask->length) < 0) {
3136 		return -ENOMEM;
3137 	}
3138 
3139 	array = *array_ptr;
3140 	for (i = 0; i < mask->length; i++) {
3141 		if (mask->mask[i / 8] & (1U << (i % 8))) {
3142 			spdk_bit_array_set(array, i);
3143 		}
3144 	}
3145 
3146 	return 0;
3147 }
3148 
/*
 * Refresh the mutable fields of @super from @bs and write it to page 0
 * of the device.  Completion is delivered via cb_fn/cb_arg.
 */
static void
_spdk_bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs,
		     struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg)
{
	/* Update the values in the super block */
	super->super_blob = bs->super_blob;
	memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype));
	/* CRC must be recomputed after the fields above are updated. */
	super->crc = _spdk_blob_md_page_calc_crc(super);
	spdk_bs_sequence_write_dev(seq, super, _spdk_bs_page_to_lba(bs, 0),
				   _spdk_bs_byte_to_lba(bs, sizeof(*super)),
				   cb_fn, cb_arg);
}
3161 
3162 static void
3163 _spdk_bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3164 {
3165 	struct spdk_bs_load_ctx	*ctx = arg;
3166 	uint64_t	mask_size, lba, lba_count;
3167 
3168 	/* Write out the used clusters mask */
3169 	mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
3170 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3171 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3172 	if (!ctx->mask) {
3173 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3174 		return;
3175 	}
3176 
3177 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS;
3178 	ctx->mask->length = ctx->bs->total_clusters;
3179 	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_clusters));
3180 
3181 	_spdk_bs_set_mask(ctx->bs->used_clusters, ctx->mask);
3182 	lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
3183 	lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
3184 	spdk_bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3185 }
3186 
3187 static void
3188 _spdk_bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3189 {
3190 	struct spdk_bs_load_ctx	*ctx = arg;
3191 	uint64_t	mask_size, lba, lba_count;
3192 
3193 	mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
3194 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3195 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3196 	if (!ctx->mask) {
3197 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3198 		return;
3199 	}
3200 
3201 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES;
3202 	ctx->mask->length = ctx->super->md_len;
3203 	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages));
3204 
3205 	_spdk_bs_set_mask(ctx->bs->used_md_pages, ctx->mask);
3206 	lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
3207 	lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
3208 	spdk_bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3209 }
3210 
3211 static void
3212 _spdk_bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn)
3213 {
3214 	struct spdk_bs_load_ctx	*ctx = arg;
3215 	uint64_t	mask_size, lba, lba_count;
3216 
3217 	if (ctx->super->used_blobid_mask_len == 0) {
3218 		/*
3219 		 * This is a pre-v3 on-disk format where the blobid mask does not get
3220 		 *  written to disk.
3221 		 */
3222 		cb_fn(seq, arg, 0);
3223 		return;
3224 	}
3225 
3226 	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
3227 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
3228 				 SPDK_MALLOC_DMA);
3229 	if (!ctx->mask) {
3230 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3231 		return;
3232 	}
3233 
3234 	ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS;
3235 	ctx->mask->length = ctx->super->md_len;
3236 	assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids));
3237 
3238 	_spdk_bs_set_mask(ctx->bs->used_blobids, ctx->mask);
3239 	lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
3240 	lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
3241 	spdk_bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg);
3242 }
3243 
/* Flag @blob as thin provisioned and mark its metadata state dirty. */
static void
_spdk_blob_set_thin_provision(struct spdk_blob *blob)
{
	_spdk_blob_verify_md_op(blob);
	blob->invalid_flags |= SPDK_BLOB_THIN_PROV;
	blob->state = SPDK_BLOB_STATE_DIRTY;
}
3251 
/* Record @clear_method on @blob: cache it and encode it into the
 * md_ro_flags field; mark the metadata state dirty. */
static void
_spdk_blob_set_clear_method(struct spdk_blob *blob, enum blob_clear_method clear_method)
{
	_spdk_blob_verify_md_op(blob);
	blob->clear_method = clear_method;
	blob->md_ro_flags |= (clear_method << SPDK_BLOB_CLEAR_METHOD_SHIFT);
	blob->state = SPDK_BLOB_STATE_DIRTY;
}
3260 
3261 static void _spdk_bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno);
3262 
/*
 * Completion of deleting a corrupted snapshot during load.  Resumes
 * blob iteration at the blob following the deleted one.  The delete's
 * bserrno is not examined - iteration continues regardless.
 */
static void
_spdk_bs_delete_corrupted_blob_cpl(void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;
	spdk_blob_id id;
	int64_t page_num;

	/* Iterate to next blob (we can't use spdk_bs_iter_next function as our
	 * last blob has been removed */
	page_num = _spdk_bs_blobid_to_page(ctx->blobid);
	page_num++;
	page_num = spdk_bit_array_find_first_set(ctx->bs->used_blobids, page_num);
	if (page_num >= spdk_bit_array_capacity(ctx->bs->used_blobids)) {
		/* No further blobids set - signal end of iteration. */
		_spdk_bs_load_iter(ctx, NULL, -ENOENT);
		return;
	}

	id = _spdk_bs_page_to_blobid(page_num);

	spdk_bs_open_blob(ctx->bs, id, _spdk_bs_load_iter, ctx);
}
3284 
3285 static void
3286 _spdk_bs_delete_corrupted_close_cb(void *cb_arg, int bserrno)
3287 {
3288 	struct spdk_bs_load_ctx *ctx = cb_arg;
3289 
3290 	if (bserrno != 0) {
3291 		SPDK_ERRLOG("Failed to close corrupted blob\n");
3292 		spdk_bs_iter_next(ctx->bs, ctx->blob, _spdk_bs_load_iter, ctx);
3293 		return;
3294 	}
3295 
3296 	spdk_bs_delete_blob(ctx->bs, ctx->blobid, _spdk_bs_delete_corrupted_blob_cpl, ctx);
3297 }
3298 
/*
 * Clone-close completion in the "remove snapshot" repair path.
 * Prepares the corrupted snapshot for deletion: clears its cluster map
 * (the clone holds an identical copy) and marks it thin provisioned,
 * then closes the snapshot so it can be deleted.
 */
static void
_spdk_bs_delete_corrupted_blob(void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;
	uint64_t i;

	if (bserrno != 0) {
		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
		/* Best effort - skip this blob and keep iterating. */
		spdk_bs_iter_next(ctx->bs, ctx->blob, _spdk_bs_load_iter, ctx);
		return;
	}

	/* Snapshot and clone have the same copy of cluster map at this point.
	 * Let's clear cluster map for snapshot now so that it won't be cleared
	 * for clone later when we remove snapshot. Also set thin provision to
	 * pass data corruption check */
	for (i = 0; i < ctx->blob->active.num_clusters; i++) {
		ctx->blob->active.clusters[i] = 0;
	}

	ctx->blob->md_ro = false;

	_spdk_blob_set_thin_provision(ctx->blob);

	/* Remember the id so iteration can resume after the delete. */
	ctx->blobid = ctx->blob->id;

	spdk_blob_close(ctx->blob, _spdk_bs_delete_corrupted_close_cb, ctx);
}
3327 
/*
 * Clone-close completion in the "keep snapshot" repair path.
 * Strips the crash-marker xattrs from the snapshot, restores it to
 * read-only, reports it to the user's iterator callback and continues
 * blob iteration.
 */
static void
_spdk_bs_update_corrupted_blob(void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("Failed to close clone of a corrupted blob\n");
		spdk_bs_iter_next(ctx->bs, ctx->blob, _spdk_bs_load_iter, ctx);
		return;
	}

	/* Remove the markers left by the interrupted snapshot operation. */
	ctx->blob->md_ro = false;
	_spdk_blob_remove_xattr(ctx->blob, SNAPSHOT_PENDING_REMOVAL, true);
	_spdk_blob_remove_xattr(ctx->blob, SNAPSHOT_IN_PROGRESS, true);
	spdk_blob_set_read_only(ctx->blob);

	if (ctx->iter_cb_fn) {
		ctx->iter_cb_fn(ctx->iter_cb_arg, ctx->blob, 0);
	}
	_spdk_bs_blob_list_add(ctx->blob);

	spdk_bs_iter_next(ctx->bs, ctx->blob, _spdk_bs_load_iter, ctx);
}
3351 
/*
 * Open-completion for the clone referenced by a corrupted snapshot's
 * crash-marker xattr.  Whether the clone already points at the snapshot
 * decides if the snapshot is repaired and kept, or rolled back and
 * removed.
 */
static void
_spdk_bs_examine_clone(void *cb_arg, struct spdk_blob *blob, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("Failed to open clone of a corrupted blob\n");
		/* Best effort - skip this blob and keep iterating. */
		spdk_bs_iter_next(ctx->bs, ctx->blob, _spdk_bs_load_iter, ctx);
		return;
	}

	if (blob->parent_id == ctx->blob->id) {
		/* Power failure occurred before updating clone (snapshot delete case)
		 * or after updating clone (creating snapshot case) - keep snapshot */
		spdk_blob_close(blob, _spdk_bs_update_corrupted_blob, ctx);
	} else {
		/* Power failure occurred after updating clone (snapshot delete case)
		 * or before updating clone (creating snapshot case) - remove snapshot */
		spdk_blob_close(blob, _spdk_bs_delete_corrupted_blob, ctx);
	}
}
3373 
/*
 * Per-blob iteration callback used while loading the blobstore.
 *
 * Successfully opened blobs are checked for crash markers left by an
 * interrupted snapshot create/delete (SNAPSHOT_PENDING_REMOVAL /
 * SNAPSHOT_IN_PROGRESS xattrs).  Clean blobs are handed to the user's
 * iter callback and iteration continues; marked blobs are examined via
 * their clone to decide between repair and removal.  -ENOENT marks the
 * normal end of iteration and completes the load sequence.
 */
static void
_spdk_bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = arg;
	const void *value;
	size_t len;
	int rc = 0;

	if (bserrno == 0) {
		/* Examine blob if it is corrupted after power failure. Fix
		 * the ones that can be fixed and remove any other corrupted
		 * ones. If it is not corrupted just process it */
		rc = _spdk_blob_get_xattr_value(blob, SNAPSHOT_PENDING_REMOVAL, &value, &len, true);
		if (rc != 0) {
			rc = _spdk_blob_get_xattr_value(blob, SNAPSHOT_IN_PROGRESS, &value, &len, true);
			if (rc != 0) {
				/* Not corrupted - process it and continue with iterating through blobs */
				if (ctx->iter_cb_fn) {
					ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0);
				}
				_spdk_bs_blob_list_add(blob);
				spdk_bs_iter_next(ctx->bs, blob, _spdk_bs_load_iter, ctx);
				return;
			}

		}

		/* The crash-marker xattr value is the clone's blob id. */
		assert(len == sizeof(spdk_blob_id));

		ctx->blob = blob;

		/* Open clone to check if we are able to fix this blob or should we remove it */
		spdk_bs_open_blob(ctx->bs, *(spdk_blob_id *)value, _spdk_bs_examine_clone, ctx);
		return;
	} else if (bserrno == -ENOENT) {
		/* Normal end of iteration - not an error. */
		bserrno = 0;
	} else {
		/*
		 * This case needs to be looked at further.  Same problem
		 *  exists with applications that rely on explicit blob
		 *  iteration.  We should just skip the blob that failed
		 *  to load and continue on to the next one.
		 */
		SPDK_ERRLOG("Error in iterating blobs\n");
	}

	ctx->iter_cb_fn = NULL;

	/* Iteration finished (or aborted) - complete the load sequence. */
	spdk_free(ctx->super);
	spdk_free(ctx->mask);
	spdk_bs_sequence_finish(ctx->seq, bserrno);
	free(ctx);
}
3427 
/* All masks are loaded/recovered - iterate every blob (repairing any
 * crash-corrupted snapshots via _spdk_bs_load_iter) to finish the load. */
static void
_spdk_bs_load_complete(struct spdk_bs_load_ctx *ctx)
{
	spdk_bs_iter_first(ctx->bs, _spdk_bs_load_iter, ctx);
}
3433 
3434 static void
3435 _spdk_bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3436 {
3437 	struct spdk_bs_load_ctx *ctx = cb_arg;
3438 	int rc;
3439 
3440 	/* The type must be correct */
3441 	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS);
3442 
3443 	/* The length of the mask (in bits) must not be greater than
3444 	 * the length of the buffer (converted to bits) */
3445 	assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8));
3446 
3447 	/* The length of the mask must be exactly equal to the size
3448 	 * (in pages) of the metadata region */
3449 	assert(ctx->mask->length == ctx->super->md_len);
3450 
3451 	rc = _spdk_bs_load_mask(&ctx->bs->used_blobids, ctx->mask);
3452 	if (rc < 0) {
3453 		spdk_free(ctx->mask);
3454 		_spdk_bs_load_ctx_fail(ctx, rc);
3455 		return;
3456 	}
3457 
3458 	_spdk_bs_load_complete(ctx);
3459 }
3460 
/*
 * Completion for the used-clusters mask read.  Loads the mask into the
 * in-memory bit array, recomputes the free-cluster count, then issues
 * the read for the used-blobids mask.
 */
static void
_spdk_bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;
	uint64_t		lba, lba_count, mask_size;
	int			rc;

	if (bserrno != 0) {
		_spdk_bs_load_ctx_fail(ctx, bserrno);
		return;
	}

	/* The type must be correct */
	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS);
	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
	assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof(
					     struct spdk_blob_md_page) * 8));
	/* The length of the mask must be exactly equal to the total number of clusters */
	assert(ctx->mask->length == ctx->bs->total_clusters);

	rc = _spdk_bs_load_mask(&ctx->bs->used_clusters, ctx->mask);
	if (rc < 0) {
		spdk_free(ctx->mask);
		_spdk_bs_load_ctx_fail(ctx, rc);
		return;
	}

	/* Free clusters are those still clear in the loaded mask. */
	ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->bs->used_clusters);
	assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters);

	spdk_free(ctx->mask);

	/* Read the used blobids mask */
	mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE;
	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
				 SPDK_MALLOC_DMA);
	if (!ctx->mask) {
		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
		return;
	}
	lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start);
	lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len);
	spdk_bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
				  _spdk_bs_load_used_blobids_cpl, ctx);
}
3506 
/*
 * Completion for the used-metadata-pages mask read.  Loads the mask
 * into the used_md_pages bit array, then issues the read for the
 * used-clusters mask.
 */
static void
_spdk_bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;
	uint64_t		lba, lba_count, mask_size;
	int			rc;

	if (bserrno != 0) {
		_spdk_bs_load_ctx_fail(ctx, bserrno);
		return;
	}

	/* The type must be correct */
	assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES);
	/* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */
	assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE *
				     8));
	/* The length of the mask must be exactly equal to the size (in pages) of the metadata region */
	assert(ctx->mask->length == ctx->super->md_len);

	rc = _spdk_bs_load_mask(&ctx->bs->used_md_pages, ctx->mask);
	if (rc < 0) {
		spdk_free(ctx->mask);
		_spdk_bs_load_ctx_fail(ctx, rc);
		return;
	}

	spdk_free(ctx->mask);

	/* Read the used clusters mask */
	mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE;
	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY,
				 SPDK_MALLOC_DMA);
	if (!ctx->mask) {
		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
		return;
	}
	lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start);
	lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len);
	spdk_bs_sequence_read_dev(seq, ctx->mask, lba, lba_count,
				  _spdk_bs_load_used_clusters_cpl, ctx);
}
3549 
3550 static void
3551 _spdk_bs_load_read_used_pages(struct spdk_bs_load_ctx *ctx)
3552 {
3553 	uint64_t lba, lba_count, mask_size;
3554 
3555 	/* Read the used pages mask */
3556 	mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE;
3557 	ctx->mask = spdk_zmalloc(mask_size, 0x1000, NULL,
3558 				 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3559 	if (!ctx->mask) {
3560 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3561 		return;
3562 	}
3563 
3564 	lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start);
3565 	lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len);
3566 	spdk_bs_sequence_read_dev(ctx->seq, ctx->mask, lba, lba_count,
3567 				  _spdk_bs_load_used_pages_cpl, ctx);
3568 }
3569 
/*
 * Replay one metadata page while recovering an unclean blobstore:
 * walk its descriptors and mark every referenced cluster as used.
 * Returns 0 on success, -EINVAL on a malformed descriptor, -ENOSPC if
 * more clusters are referenced than exist.
 */
static int
_spdk_bs_load_replay_md_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob_store *bs)
{
	struct spdk_blob_md_descriptor *desc;
	size_t	cur_desc = 0;

	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
	while (cur_desc < sizeof(page->descriptors)) {
		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
			if (desc->length == 0) {
				/* If padding and length are 0, this terminates the page */
				break;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
			unsigned int				i, j;
			unsigned int				cluster_count = 0;
			uint32_t				cluster_idx;

			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;

			/* Each run covers extents[i].length consecutive clusters
			 * starting at extents[i].cluster_idx. */
			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
				for (j = 0; j < desc_extent_rle->extents[i].length; j++) {
					cluster_idx = desc_extent_rle->extents[i].cluster_idx;
					/*
					 * cluster_idx = 0 means an unallocated cluster - don't mark that
					 * in the used cluster map.
					 */
					if (cluster_idx != 0) {
						spdk_bit_array_set(bs->used_clusters, cluster_idx + j);
						if (bs->num_free_clusters == 0) {
							return -ENOSPC;
						}
						bs->num_free_clusters--;
					}
					cluster_count++;
				}
			}
			if (cluster_count == 0) {
				/* An extent descriptor describing zero clusters is invalid. */
				return -EINVAL;
			}
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
			/* Skip this item */
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
			/* Skip this item */
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
			/* Skip this item */
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
			/* Skip this item */
		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_TABLE) {
			/* TODO: Read the extent pages when replaying the md,
			 * only after particular blob md chain was read */
		} else {
			/* Error */
			return -EINVAL;
		}
		/* Advance to the next descriptor */
		cur_desc += sizeof(*desc) + desc->length;
		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
			break;
		}
		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
	}
	return 0;
}
3635 
/*
 * Validate a page as an extent page: the CRC must match, the sequence
 * number must be 0, and the page must hold exactly one EXTENT_PAGE
 * descriptor that fits within the page.
 */
static bool _spdk_bs_load_cur_extent_page_valid(struct spdk_blob_md_page *page)
{
	uint32_t crc;
	struct spdk_blob_md_descriptor *desc = (struct spdk_blob_md_descriptor *)page->descriptors;
	size_t desc_len;

	crc = _spdk_blob_md_page_calc_crc(page);
	if (crc != page->crc) {
		return false;
	}

	/* Extent page should always be of sequence num 0. */
	if (page->sequence_num != 0) {
		return false;
	}

	/* Descriptor type must be EXTENT_PAGE. */
	if (desc->type != SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
		return false;
	}

	/* Descriptor length cannot exceed the page. */
	desc_len = sizeof(*desc) + desc->length;
	if (desc_len > sizeof(page->descriptors)) {
		return false;
	}

	/* It has to be the only descriptor in the page. */
	if (desc_len + sizeof(*desc) <= sizeof(page->descriptors)) {
		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + desc_len);
		/* A second descriptor with non-zero length would follow. */
		if (desc->length != 0) {
			return false;
		}
	}

	return true;
}
3673 
3674 static bool _spdk_bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx)
3675 {
3676 	uint32_t crc;
3677 
3678 	crc = _spdk_blob_md_page_calc_crc(ctx->page);
3679 	if (crc != ctx->page->crc) {
3680 		return false;
3681 	}
3682 
3683 	/* First page of a sequence should match the blobid. */
3684 	if (ctx->page->sequence_num == 0 &&
3685 	    _spdk_bs_page_to_blobid(ctx->cur_page) != ctx->page->id) {
3686 		return false;
3687 	}
3688 	return true;
3689 }
3690 
3691 static void
3692 _spdk_bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx);
3693 
3694 static void
3695 _spdk_bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3696 {
3697 	struct spdk_bs_load_ctx	*ctx = cb_arg;
3698 
3699 	if (bserrno != 0) {
3700 		_spdk_bs_load_ctx_fail(ctx, bserrno);
3701 		return;
3702 	}
3703 
3704 	_spdk_bs_load_complete(ctx);
3705 }
3706 
3707 static void
3708 _spdk_bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3709 {
3710 	struct spdk_bs_load_ctx	*ctx = cb_arg;
3711 
3712 	spdk_free(ctx->mask);
3713 	ctx->mask = NULL;
3714 
3715 	if (bserrno != 0) {
3716 		_spdk_bs_load_ctx_fail(ctx, bserrno);
3717 		return;
3718 	}
3719 
3720 	_spdk_bs_write_used_clusters(seq, ctx, _spdk_bs_load_write_used_clusters_cpl);
3721 }
3722 
3723 static void
3724 _spdk_bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3725 {
3726 	struct spdk_bs_load_ctx	*ctx = cb_arg;
3727 
3728 	spdk_free(ctx->mask);
3729 	ctx->mask = NULL;
3730 
3731 	if (bserrno != 0) {
3732 		_spdk_bs_load_ctx_fail(ctx, bserrno);
3733 		return;
3734 	}
3735 
3736 	_spdk_bs_write_used_blobids(seq, ctx, _spdk_bs_load_write_used_blobids_cpl);
3737 }
3738 
/* Start persisting the masks rebuilt by recovery: used md pages first,
 * then used blobids, then used clusters (via the chained completions). */
static void
_spdk_bs_load_write_used_md(struct spdk_bs_load_ctx *ctx)
{
	_spdk_bs_write_used_md(ctx->seq, ctx, _spdk_bs_load_write_used_pages_cpl);
}
3744 
/*
 * A metadata page chain has been fully replayed.  Advance to the next
 * unclaimed metadata page and replay it; once all pages are processed,
 * claim the clusters occupied by the metadata region and persist the
 * rebuilt masks.
 */
static void
_spdk_bs_load_replay_md_chain_cpl(struct spdk_bs_load_ctx *ctx)
{
	uint64_t num_md_clusters;
	uint64_t i;

	ctx->in_page_chain = false;

	/* Skip pages already claimed while following earlier chains. */
	do {
		ctx->page_index++;
	} while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true);

	if (ctx->page_index < ctx->super->md_len) {
		ctx->cur_page = ctx->page_index;
		_spdk_bs_load_replay_cur_md_page(ctx);
	} else {
		/* Claim all of the clusters used by the metadata */
		num_md_clusters = spdk_divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster);
		for (i = 0; i < num_md_clusters; i++) {
			_spdk_bs_claim_cluster(ctx->bs, i);
		}
		spdk_free(ctx->page);
		_spdk_bs_load_write_used_md(ctx);
	}
}
3770 
/*
 * Completion of reading one metadata page during replay.  A valid page
 * that starts (sequence_num == 0) or continues a chain gets its md page
 * claimed, its blobid recorded (chain head only), and its descriptors
 * parsed; the chain is then followed through page->next.  Invalid pages
 * simply end the current chain.
 */
static void
_spdk_bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx *ctx = cb_arg;
	uint32_t page_num;

	if (bserrno != 0) {
		_spdk_bs_load_ctx_fail(ctx, bserrno);
		return;
	}

	page_num = ctx->cur_page;
	if (_spdk_bs_load_cur_md_page_valid(ctx) == true) {
		if (ctx->page->sequence_num == 0 || ctx->in_page_chain == true) {
			_spdk_bs_claim_md_page(ctx->bs, page_num);
			if (ctx->page->sequence_num == 0) {
				/* Chain head - this page index is a live blob id. */
				spdk_bit_array_set(ctx->bs->used_blobids, page_num);
			}
			if (_spdk_bs_load_replay_md_parse_page(ctx->page, ctx->bs)) {
				_spdk_bs_load_ctx_fail(ctx, -EILSEQ);
				return;
			}
			if (ctx->page->next != SPDK_INVALID_MD_PAGE) {
				/* Follow the chain before moving to the next page index. */
				ctx->in_page_chain = true;
				ctx->cur_page = ctx->page->next;
				_spdk_bs_load_replay_cur_md_page(ctx);
				return;
			}
		}
	}
	_spdk_bs_load_replay_md_chain_cpl(ctx);
}
3803 
/* Read metadata page ctx->cur_page into ctx->page; replay continues in
 * _spdk_bs_load_replay_md_cpl. */
static void
_spdk_bs_load_replay_cur_md_page(struct spdk_bs_load_ctx *ctx)
{
	uint64_t lba;

	assert(ctx->cur_page < ctx->super->md_len);
	lba = _spdk_bs_md_page_to_lba(ctx->bs, ctx->cur_page);
	spdk_bs_sequence_read_dev(ctx->seq, ctx->page, lba,
				  _spdk_bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
				  _spdk_bs_load_replay_md_cpl, ctx);
}
3815 
3816 static void
3817 _spdk_bs_load_replay_md(struct spdk_bs_load_ctx *ctx)
3818 {
3819 	ctx->page_index = 0;
3820 	ctx->cur_page = 0;
3821 	ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE,
3822 				 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
3823 	if (!ctx->page) {
3824 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3825 		return;
3826 	}
3827 	_spdk_bs_load_replay_cur_md_page(ctx);
3828 }
3829 
3830 static void
3831 _spdk_bs_recover(struct spdk_bs_load_ctx *ctx)
3832 {
3833 	int		rc;
3834 
3835 	rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len);
3836 	if (rc < 0) {
3837 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3838 		return;
3839 	}
3840 
3841 	rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len);
3842 	if (rc < 0) {
3843 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3844 		return;
3845 	}
3846 
3847 	rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters);
3848 	if (rc < 0) {
3849 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3850 		return;
3851 	}
3852 
3853 	ctx->bs->num_free_clusters = ctx->bs->total_clusters;
3854 	_spdk_bs_load_replay_md(ctx);
3855 }
3856 
3857 static void
3858 _spdk_bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
3859 {
3860 	struct spdk_bs_load_ctx *ctx = cb_arg;
3861 	uint32_t	crc;
3862 	int		rc;
3863 	static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH];
3864 
3865 	if (ctx->super->version > SPDK_BS_VERSION ||
3866 	    ctx->super->version < SPDK_BS_INITIAL_VERSION) {
3867 		_spdk_bs_load_ctx_fail(ctx, -EILSEQ);
3868 		return;
3869 	}
3870 
3871 	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
3872 		   sizeof(ctx->super->signature)) != 0) {
3873 		_spdk_bs_load_ctx_fail(ctx, -EILSEQ);
3874 		return;
3875 	}
3876 
3877 	crc = _spdk_blob_md_page_calc_crc(ctx->super);
3878 	if (crc != ctx->super->crc) {
3879 		_spdk_bs_load_ctx_fail(ctx, -EILSEQ);
3880 		return;
3881 	}
3882 
3883 	if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
3884 		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype matched - loading blobstore\n");
3885 	} else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) {
3886 		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype wildcard used - loading blobstore regardless bstype\n");
3887 	} else {
3888 		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Unexpected bstype\n");
3889 		SPDK_LOGDUMP(SPDK_LOG_BLOB, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
3890 		SPDK_LOGDUMP(SPDK_LOG_BLOB, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH);
3891 		_spdk_bs_load_ctx_fail(ctx, -ENXIO);
3892 		return;
3893 	}
3894 
3895 	if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) {
3896 		SPDK_NOTICELOG("Size mismatch, dev size: %lu, blobstore size: %lu\n",
3897 			       ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size);
3898 		_spdk_bs_load_ctx_fail(ctx, -EILSEQ);
3899 		return;
3900 	}
3901 
3902 	if (ctx->super->size == 0) {
3903 		ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen;
3904 	}
3905 
3906 	if (ctx->super->io_unit_size == 0) {
3907 		ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE;
3908 	}
3909 
3910 	/* Parse the super block */
3911 	ctx->bs->clean = 1;
3912 	ctx->bs->cluster_sz = ctx->super->cluster_size;
3913 	ctx->bs->total_clusters = ctx->super->size / ctx->super->cluster_size;
3914 	ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE;
3915 	ctx->bs->io_unit_size = ctx->super->io_unit_size;
3916 	rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters);
3917 	if (rc < 0) {
3918 		_spdk_bs_load_ctx_fail(ctx, -ENOMEM);
3919 		return;
3920 	}
3921 	ctx->bs->md_start = ctx->super->md_start;
3922 	ctx->bs->md_len = ctx->super->md_len;
3923 	ctx->bs->total_data_clusters = ctx->bs->total_clusters - spdk_divide_round_up(
3924 					       ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster);
3925 	ctx->bs->super_blob = ctx->super->super_blob;
3926 	memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype));
3927 
3928 	if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) {
3929 		_spdk_bs_recover(ctx);
3930 	} else {
3931 		_spdk_bs_load_read_used_pages(ctx);
3932 	}
3933 }
3934 
/*
 * Load an existing blobstore from @dev.
 *
 * Validates the options, allocates the in-memory blobstore and a load
 * context, then starts an async read of the super block; loading
 * continues in _spdk_bs_load_super_cpl().  On any early failure the
 * device is destroyed (or the partially built bs freed) and cb_fn is
 * invoked with a negative errno and a NULL blobstore handle.
 */
void
spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
{
	struct spdk_blob_store	*bs;
	struct spdk_bs_cpl	cpl;
	struct spdk_bs_load_ctx *ctx;
	struct spdk_bs_opts	opts = {};
	int err;

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Loading blobstore from dev %p\n", dev);

	/* Metadata is written in SPDK_BS_PAGE_SIZE units, so the device
	 * block size must evenly divide a page. */
	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "unsupported dev block length of %d\n", dev->blocklen);
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, -EINVAL);
		return;
	}

	if (o) {
		opts = *o;
	} else {
		spdk_bs_opts_init(&opts);
	}

	if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) {
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, -EINVAL);
		return;
	}

	/* _spdk_bs_alloc takes ownership of dev on success. */
	err = _spdk_bs_alloc(dev, &opts, &bs);
	if (err) {
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, err);
		return;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	ctx->bs = bs;
	ctx->iter_cb_fn = opts.iter_cb_fn;
	ctx->iter_cb_arg = opts.iter_cb_arg;

	/* Allocate memory for the super block */
	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	if (!ctx->super) {
		free(ctx);
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
	cpl.u.bs_handle.cb_fn = cb_fn;
	cpl.u.bs_handle.cb_arg = cb_arg;
	cpl.u.bs_handle.bs = bs;

	ctx->seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
	if (!ctx->seq) {
		spdk_free(ctx->super);
		free(ctx);
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	/* Read the super block */
	spdk_bs_sequence_read_dev(ctx->seq, ctx->super, _spdk_bs_page_to_lba(bs, 0),
				  _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)),
				  _spdk_bs_load_super_cpl, ctx);
}
4013 
4014 /* END spdk_bs_load */
4015 
4016 /* START spdk_bs_dump */
4017 
/* Context carried through the async blobstore-dump state machine. */
struct spdk_bs_dump_ctx {
	struct spdk_blob_store		*bs;		/* in-memory blobstore being dumped */
	struct spdk_bs_super_block	*super;		/* DMA buffer holding the on-disk super block */
	uint32_t			cur_page;	/* index of the metadata page currently being read */
	struct spdk_blob_md_page	*page;		/* DMA buffer for one metadata page */
	spdk_bs_sequence_t		*seq;		/* NOTE(review): never assigned in the visible code -- confirm whether still needed */
	FILE				*fp;		/* output stream for the human-readable dump */
	spdk_bs_dump_print_xattr	print_xattr_fn;	/* caller-supplied printer for xattr values */
	char				xattr_name[4096];	/* scratch buffer for NUL-terminated xattr names */
};
4028 
/*
 * Tear down the dump context and complete the user's callback with
 * @bserrno.  The completion is stashed in the blobstore so it fires
 * only after the dev has been destroyed inside _spdk_bs_free().
 */
static void
_spdk_bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_dump_ctx *ctx, int bserrno)
{
	spdk_free(ctx->super);

	/*
	 * We need to defer calling spdk_bs_call_cpl() until after
	 * dev destruction, so tuck these away for later use.
	 */
	ctx->bs->unload_err = bserrno;
	memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;

	spdk_bs_sequence_finish(seq, 0);
	_spdk_bs_free(ctx->bs);
	free(ctx);
}
4046 
4047 static void _spdk_bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg);
4048 
4049 static void
4050 _spdk_bs_dump_print_md_page(struct spdk_bs_dump_ctx *ctx)
4051 {
4052 	uint32_t page_idx = ctx->cur_page;
4053 	struct spdk_blob_md_page *page = ctx->page;
4054 	struct spdk_blob_md_descriptor *desc;
4055 	size_t cur_desc = 0;
4056 	uint32_t crc;
4057 
4058 	fprintf(ctx->fp, "=========\n");
4059 	fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx);
4060 	fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id);
4061 
4062 	crc = _spdk_blob_md_page_calc_crc(page);
4063 	fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch");
4064 
4065 	desc = (struct spdk_blob_md_descriptor *)page->descriptors;
4066 	while (cur_desc < sizeof(page->descriptors)) {
4067 		if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) {
4068 			if (desc->length == 0) {
4069 				/* If padding and length are 0, this terminates the page */
4070 				break;
4071 			}
4072 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_RLE) {
4073 			struct spdk_blob_md_descriptor_extent_rle	*desc_extent_rle;
4074 			unsigned int				i;
4075 
4076 			desc_extent_rle = (struct spdk_blob_md_descriptor_extent_rle *)desc;
4077 
4078 			for (i = 0; i < desc_extent_rle->length / sizeof(desc_extent_rle->extents[0]); i++) {
4079 				if (desc_extent_rle->extents[i].cluster_idx != 0) {
4080 					fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
4081 						desc_extent_rle->extents[i].cluster_idx);
4082 				} else {
4083 					fprintf(ctx->fp, "Unallocated Extent - ");
4084 				}
4085 				fprintf(ctx->fp, " Length: %" PRIu32, desc_extent_rle->extents[i].length);
4086 				fprintf(ctx->fp, "\n");
4087 			}
4088 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT_PAGE) {
4089 			struct spdk_blob_md_descriptor_extent_page	*desc_extent;
4090 			unsigned int					i;
4091 
4092 			desc_extent = (struct spdk_blob_md_descriptor_extent_page *)desc;
4093 
4094 			for (i = 0; i < desc_extent->length / sizeof(desc_extent->cluster_idx[0]); i++) {
4095 				if (desc_extent->cluster_idx[i] != 0) {
4096 					fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32,
4097 						desc_extent->cluster_idx[i]);
4098 				} else {
4099 					fprintf(ctx->fp, "Unallocated Extent");
4100 				}
4101 				fprintf(ctx->fp, "\n");
4102 			}
4103 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) {
4104 			struct spdk_blob_md_descriptor_xattr *desc_xattr;
4105 			uint32_t i;
4106 
4107 			desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc;
4108 
4109 			if (desc_xattr->length !=
4110 			    sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) +
4111 			    desc_xattr->name_length + desc_xattr->value_length) {
4112 			}
4113 
4114 			memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length);
4115 			ctx->xattr_name[desc_xattr->name_length] = '\0';
4116 			fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name);
4117 			fprintf(ctx->fp, "       value = \"");
4118 			ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name,
4119 					    (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length),
4120 					    desc_xattr->value_length);
4121 			fprintf(ctx->fp, "\"\n");
4122 			for (i = 0; i < desc_xattr->value_length; i++) {
4123 				if (i % 16 == 0) {
4124 					fprintf(ctx->fp, "               ");
4125 				}
4126 				fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i));
4127 				if ((i + 1) % 16 == 0) {
4128 					fprintf(ctx->fp, "\n");
4129 				}
4130 			}
4131 			if (i % 16 != 0) {
4132 				fprintf(ctx->fp, "\n");
4133 			}
4134 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) {
4135 			/* TODO */
4136 		} else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) {
4137 			/* TODO */
4138 		} else {
4139 			/* Error */
4140 		}
4141 		/* Advance to the next descriptor */
4142 		cur_desc += sizeof(*desc) + desc->length;
4143 		if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) {
4144 			break;
4145 		}
4146 		desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc);
4147 	}
4148 }
4149 
/*
 * Completion for reading one metadata page during a dump.  Prints the
 * page if it is in use (non-zero blob id), then either reads the next
 * page or finishes the dump once all md_len pages have been visited.
 */
static void
_spdk_bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_dump_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		_spdk_bs_dump_finish(seq, ctx, bserrno);
		return;
	}

	/* Pages with id == 0 are unused; skip them silently. */
	if (ctx->page->id != 0) {
		_spdk_bs_dump_print_md_page(ctx);
	}

	ctx->cur_page++;

	if (ctx->cur_page < ctx->super->md_len) {
		_spdk_bs_dump_read_md_page(seq, ctx);
	} else {
		spdk_free(ctx->page);
		_spdk_bs_dump_finish(seq, ctx, 0);
	}
}
4173 
/*
 * Issue the read for metadata page ctx->cur_page into ctx->page; the
 * decode continues in _spdk_bs_dump_read_md_page_cpl().
 */
static void
_spdk_bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg)
{
	struct spdk_bs_dump_ctx *ctx = cb_arg;
	uint64_t lba;

	assert(ctx->cur_page < ctx->super->md_len);
	lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page);
	spdk_bs_sequence_read_dev(seq, ctx->page, lba,
				  _spdk_bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE),
				  _spdk_bs_dump_read_md_page_cpl, ctx);
}
4186 
4187 static void
4188 _spdk_bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4189 {
4190 	struct spdk_bs_dump_ctx *ctx = cb_arg;
4191 
4192 	fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature);
4193 	if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
4194 		   sizeof(ctx->super->signature)) != 0) {
4195 		fprintf(ctx->fp, "(Mismatch)\n");
4196 		_spdk_bs_dump_finish(seq, ctx, bserrno);
4197 		return;
4198 	} else {
4199 		fprintf(ctx->fp, "(OK)\n");
4200 	}
4201 	fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version);
4202 	fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc,
4203 		(ctx->super->crc == _spdk_blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch");
4204 	fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype);
4205 	fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size);
4206 	fprintf(ctx->fp, "Super Blob ID: ");
4207 	if (ctx->super->super_blob == SPDK_BLOBID_INVALID) {
4208 		fprintf(ctx->fp, "(None)\n");
4209 	} else {
4210 		fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob);
4211 	}
4212 	fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean);
4213 	fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start);
4214 	fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len);
4215 	fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start);
4216 	fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len);
4217 	fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start);
4218 	fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len);
4219 	fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start);
4220 	fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len);
4221 
4222 	ctx->cur_page = 0;
4223 	ctx->page = spdk_zmalloc(SPDK_BS_PAGE_SIZE, SPDK_BS_PAGE_SIZE,
4224 				 NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
4225 	if (!ctx->page) {
4226 		_spdk_bs_dump_finish(seq, ctx, -ENOMEM);
4227 		return;
4228 	}
4229 	_spdk_bs_dump_read_md_page(seq, ctx);
4230 }
4231 
/*
 * Dump a human-readable decode of the blobstore on @dev to @fp.
 *
 * @print_xattr_fn is invoked for each xattr value so the caller can
 * pretty-print application-specific data.  Takes ownership of @dev;
 * cb_fn is invoked with 0 or a negative errno when the dump completes.
 */
void
spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn,
	     spdk_bs_op_complete cb_fn, void *cb_arg)
{
	struct spdk_blob_store	*bs;
	struct spdk_bs_cpl	cpl;
	spdk_bs_sequence_t	*seq;
	struct spdk_bs_dump_ctx *ctx;
	struct spdk_bs_opts	opts = {};
	int err;

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Dumping blobstore from dev %p\n", dev);

	spdk_bs_opts_init(&opts);

	err = _spdk_bs_alloc(dev, &opts, &bs);
	if (err) {
		dev->destroy(dev);
		cb_fn(cb_arg, err);
		return;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		_spdk_bs_free(bs);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->bs = bs;
	ctx->fp = fp;
	ctx->print_xattr_fn = print_xattr_fn;

	/* Allocate memory for the super block */
	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	if (!ctx->super) {
		free(ctx);
		_spdk_bs_free(bs);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	cpl.u.bs_basic.cb_fn = cb_fn;
	cpl.u.bs_basic.cb_arg = cb_arg;

	seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
	if (!seq) {
		spdk_free(ctx->super);
		free(ctx);
		_spdk_bs_free(bs);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	/* Read the super block */
	spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(bs, 0),
				  _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)),
				  _spdk_bs_dump_super_cpl, ctx);
}
4293 
4294 /* END spdk_bs_dump */
4295 
4296 /* START spdk_bs_init */
4297 
/* Context for blobstore init/destroy sequences: the blobstore being
 * built (or torn down) and a DMA buffer for its super block. */
struct spdk_bs_init_ctx {
	struct spdk_blob_store		*bs;
	struct spdk_bs_super_block	*super;
};
4302 
/*
 * Final completion of spdk_bs_init(): the super block has been written,
 * so release the init context and finish the sequence, which fires the
 * user's BS_HANDLE completion.
 */
static void
_spdk_bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_init_ctx *ctx = cb_arg;

	spdk_free(ctx->super);
	free(ctx);

	spdk_bs_sequence_finish(seq, bserrno);
}
4313 
/*
 * Completion for the metadata-clear / data-trim batch during init.
 * Writes the freshly built super block to page 0 of the device.
 */
static void
_spdk_bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_init_ctx *ctx = cb_arg;

	/* Write super block */
	spdk_bs_sequence_write_dev(seq, ctx->super, _spdk_bs_page_to_lba(ctx->bs, 0),
				   _spdk_bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)),
				   _spdk_bs_init_persist_super_cpl, ctx);
}
4324 
/*
 * Initialize a brand-new blobstore on @dev.
 *
 * Lays out the on-disk format: super block in page 0, then the three
 * allocation masks (used md pages, used clusters, used blobids), then
 * the metadata region.  Clears the metadata space, optionally clears
 * the data region per opts.clear_method, and finally persists the super
 * block.  Takes ownership of @dev; cb_fn receives the new blobstore
 * handle or NULL plus a negative errno.
 */
void
spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o,
	     spdk_bs_op_with_handle_complete cb_fn, void *cb_arg)
{
	struct spdk_bs_init_ctx *ctx;
	struct spdk_blob_store	*bs;
	struct spdk_bs_cpl	cpl;
	spdk_bs_sequence_t	*seq;
	spdk_bs_batch_t		*batch;
	uint64_t		num_md_lba;
	uint64_t		num_md_pages;
	uint64_t		num_md_clusters;
	uint32_t		i;
	struct spdk_bs_opts	opts = {};
	int			rc;

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Initializing blobstore on dev %p\n", dev);

	/* Metadata is written in SPDK_BS_PAGE_SIZE units, so the device
	 * block size must evenly divide a page. */
	if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) {
		SPDK_ERRLOG("unsupported dev block length of %d\n",
			    dev->blocklen);
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, -EINVAL);
		return;
	}

	if (o) {
		opts = *o;
	} else {
		spdk_bs_opts_init(&opts);
	}

	if (_spdk_bs_opts_verify(&opts) != 0) {
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, -EINVAL);
		return;
	}

	rc = _spdk_bs_alloc(dev, &opts, &bs);
	if (rc) {
		dev->destroy(dev);
		cb_fn(cb_arg, NULL, rc);
		return;
	}

	if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) {
		/* By default, allocate 1 page per cluster.
		 * Technically, this over-allocates metadata
		 * because more metadata will reduce the number
		 * of usable clusters. This can be addressed with
		 * more complex math in the future.
		 */
		bs->md_len = bs->total_clusters;
	} else {
		bs->md_len = opts.num_md_pages;
	}
	rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len);
	if (rc < 0) {
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len);
	if (rc < 0) {
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	ctx->bs = bs;

	/* Allocate memory for the super block */
	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	if (!ctx->super) {
		free(ctx);
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}
	memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG,
	       sizeof(ctx->super->signature));
	ctx->super->version = SPDK_BS_VERSION;
	ctx->super->length = sizeof(*ctx->super);
	ctx->super->super_blob = bs->super_blob;
	ctx->super->clean = 0;
	ctx->super->cluster_size = bs->cluster_sz;
	ctx->super->io_unit_size = bs->io_unit_size;
	memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype));

	/* Calculate how many pages the metadata consumes at the front
	 * of the disk.
	 */

	/* The super block uses 1 page */
	num_md_pages = 1;

	/* The used_md_pages mask requires 1 bit per metadata page, rounded
	 * up to the nearest page, plus a header.
	 */
	ctx->super->used_page_mask_start = num_md_pages;
	ctx->super->used_page_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
					 spdk_divide_round_up(bs->md_len, 8),
					 SPDK_BS_PAGE_SIZE);
	num_md_pages += ctx->super->used_page_mask_len;

	/* The used_clusters mask requires 1 bit per cluster, rounded
	 * up to the nearest page, plus a header.
	 */
	ctx->super->used_cluster_mask_start = num_md_pages;
	ctx->super->used_cluster_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
					    spdk_divide_round_up(bs->total_clusters, 8),
					    SPDK_BS_PAGE_SIZE);
	num_md_pages += ctx->super->used_cluster_mask_len;

	/* The used_blobids mask requires 1 bit per metadata page, rounded
	 * up to the nearest page, plus a header.
	 */
	ctx->super->used_blobid_mask_start = num_md_pages;
	ctx->super->used_blobid_mask_len = spdk_divide_round_up(sizeof(struct spdk_bs_md_mask) +
					   spdk_divide_round_up(bs->md_len, 8),
					   SPDK_BS_PAGE_SIZE);
	num_md_pages += ctx->super->used_blobid_mask_len;

	/* The metadata region size was chosen above */
	ctx->super->md_start = bs->md_start = num_md_pages;
	ctx->super->md_len = bs->md_len;
	num_md_pages += bs->md_len;

	num_md_lba = _spdk_bs_page_to_lba(bs, num_md_pages);

	ctx->super->size = dev->blockcnt * dev->blocklen;

	/* CRC must be computed last, after every other super block field
	 * has been filled in. */
	ctx->super->crc = _spdk_blob_md_page_calc_crc(ctx->super);

	num_md_clusters = spdk_divide_round_up(num_md_pages, bs->pages_per_cluster);
	if (num_md_clusters > bs->total_clusters) {
		SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, "
			    "please decrease number of pages reserved for metadata "
			    "or increase cluster size.\n");
		spdk_free(ctx->super);
		free(ctx);
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}
	/* Claim all of the clusters used by the metadata */
	for (i = 0; i < num_md_clusters; i++) {
		_spdk_bs_claim_cluster(bs, i);
	}

	bs->total_data_clusters = bs->num_free_clusters;

	cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE;
	cpl.u.bs_handle.cb_fn = cb_fn;
	cpl.u.bs_handle.cb_arg = cb_arg;
	cpl.u.bs_handle.bs = bs;

	seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
	if (!seq) {
		spdk_free(ctx->super);
		free(ctx);
		_spdk_bs_free(bs);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	batch = spdk_bs_sequence_to_batch(seq, _spdk_bs_init_trim_cpl, ctx);

	/* Clear metadata space */
	spdk_bs_batch_write_zeroes_dev(batch, 0, num_md_lba);

	switch (opts.clear_method) {
	case BS_CLEAR_WITH_UNMAP:
		/* Trim data clusters */
		spdk_bs_batch_unmap_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba);
		break;
	case BS_CLEAR_WITH_WRITE_ZEROES:
		/* Write_zeroes to data clusters */
		spdk_bs_batch_write_zeroes_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba);
		break;
	case BS_CLEAR_WITH_NONE:
	default:
		break;
	}

	spdk_bs_batch_close(batch);
}
4521 
4522 /* END spdk_bs_init */
4523 
4524 /* START spdk_bs_destroy */
4525 
/*
 * Completion for zeroing the super block during spdk_bs_destroy().
 * Stashes the user's completion so it fires after the dev is destroyed
 * inside _spdk_bs_free(), then tears everything down.
 */
static void
_spdk_bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_init_ctx *ctx = cb_arg;
	struct spdk_blob_store *bs = ctx->bs;

	/*
	 * We need to defer calling spdk_bs_call_cpl() until after
	 * dev destruction, so tuck these away for later use.
	 */
	bs->unload_err = bserrno;
	memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;

	spdk_bs_sequence_finish(seq, bserrno);

	_spdk_bs_free(bs);
	free(ctx);
}
4545 
/*
 * Destroy the blobstore: zero its super block on disk (making it
 * unrecognizable to future loads) and free all in-memory state,
 * including the underlying dev.  Fails with -EBUSY if any blobs are
 * still open.
 */
void
spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn,
		void *cb_arg)
{
	struct spdk_bs_cpl	cpl;
	spdk_bs_sequence_t	*seq;
	struct spdk_bs_init_ctx *ctx;

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Destroying blobstore\n");

	if (!TAILQ_EMPTY(&bs->blobs)) {
		SPDK_ERRLOG("Blobstore still has open blobs\n");
		cb_fn(cb_arg, -EBUSY);
		return;
	}

	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	cpl.u.bs_basic.cb_fn = cb_fn;
	cpl.u.bs_basic.cb_arg = cb_arg;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->bs = bs;

	seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
	if (!seq) {
		free(ctx);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	/* Write zeroes to the super block */
	spdk_bs_sequence_write_zeroes_dev(seq,
					  _spdk_bs_page_to_lba(bs, 0),
					  _spdk_bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)),
					  _spdk_bs_destroy_trim_cpl, ctx);
}
4587 
4588 /* END spdk_bs_destroy */
4589 
4590 /* START spdk_bs_unload */
4591 
/*
 * Final step of spdk_bs_unload(): release the load context and complete
 * the user's callback with @bserrno.  The completion is stashed in the
 * blobstore so it fires only after the dev is destroyed in
 * _spdk_bs_free().
 */
static void
_spdk_bs_unload_finish(struct spdk_bs_load_ctx *ctx, int bserrno)
{
	spdk_bs_sequence_t *seq = ctx->seq;

	spdk_free(ctx->super);

	/*
	 * We need to defer calling spdk_bs_call_cpl() until after
	 * dev destruction, so tuck these away for later use.
	 */
	ctx->bs->unload_err = bserrno;
	memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl));
	seq->cpl.type = SPDK_BS_CPL_TYPE_NONE;

	spdk_bs_sequence_finish(seq, bserrno);

	_spdk_bs_free(ctx->bs);
	free(ctx);
}
4612 
/* Completion for the final super block write of an unload; just
 * forwards the result to the common finish path. */
static void
_spdk_bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx	*ctx = cb_arg;

	_spdk_bs_unload_finish(ctx, bserrno);
}
4620 
4621 static void
4622 _spdk_bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4623 {
4624 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4625 
4626 	spdk_free(ctx->mask);
4627 
4628 	if (bserrno != 0) {
4629 		_spdk_bs_unload_finish(ctx, bserrno);
4630 		return;
4631 	}
4632 
4633 	ctx->super->clean = 1;
4634 
4635 	_spdk_bs_write_super(seq, ctx->bs, ctx->super, _spdk_bs_unload_write_super_cpl, ctx);
4636 }
4637 
/*
 * Completion for persisting the used_blobids mask during unload; next
 * in the chain is the used_clusters mask.
 */
static void
_spdk_bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx	*ctx = cb_arg;

	spdk_free(ctx->mask);
	ctx->mask = NULL;

	if (bserrno != 0) {
		_spdk_bs_unload_finish(ctx, bserrno);
		return;
	}

	_spdk_bs_write_used_clusters(seq, ctx, _spdk_bs_unload_write_used_clusters_cpl);
}
4653 
/*
 * Completion for persisting the used metadata-page mask during unload;
 * next in the chain is the used_blobids mask.
 */
static void
_spdk_bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_load_ctx	*ctx = cb_arg;

	spdk_free(ctx->mask);
	ctx->mask = NULL;

	if (bserrno != 0) {
		_spdk_bs_unload_finish(ctx, bserrno);
		return;
	}

	_spdk_bs_write_used_blobids(seq, ctx, _spdk_bs_unload_write_used_blobids_cpl);
}
4669 
4670 static void
4671 _spdk_bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
4672 {
4673 	struct spdk_bs_load_ctx	*ctx = cb_arg;
4674 
4675 	if (bserrno != 0) {
4676 		_spdk_bs_unload_finish(ctx, bserrno);
4677 		return;
4678 	}
4679 
4680 	_spdk_bs_write_used_md(seq, cb_arg, _spdk_bs_unload_write_used_pages_cpl);
4681 }
4682 
/*
 * Cleanly unload the blobstore: re-read the super block, persist the
 * allocation masks, mark the super block clean, and free all in-memory
 * state including the dev.  Fails with -EBUSY if any blobs are still
 * open.
 */
void
spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg)
{
	struct spdk_bs_cpl	cpl;
	struct spdk_bs_load_ctx *ctx;

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blobstore\n");

	if (!TAILQ_EMPTY(&bs->blobs)) {
		SPDK_ERRLOG("Blobstore still has open blobs\n");
		cb_fn(cb_arg, -EBUSY);
		return;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->bs = bs;

	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	if (!ctx->super) {
		free(ctx);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	cpl.u.bs_basic.cb_fn = cb_fn;
	cpl.u.bs_basic.cb_arg = cb_arg;

	ctx->seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
	if (!ctx->seq) {
		spdk_free(ctx->super);
		free(ctx);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	/* Read super block */
	spdk_bs_sequence_read_dev(ctx->seq, ctx->super, _spdk_bs_page_to_lba(bs, 0),
				  _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)),
				  _spdk_bs_unload_read_super_cpl, ctx);
}
4730 
4731 /* END spdk_bs_unload */
4732 
4733 /* START spdk_bs_set_super */
4734 
/* Context for the set-super sequence: the blobstore and a DMA buffer
 * used to read-modify-write its super block. */
struct spdk_bs_set_super_ctx {
	struct spdk_blob_store		*bs;
	struct spdk_bs_super_block	*super;
};
4739 
/*
 * Completion for writing the updated super block in spdk_bs_set_super();
 * releases the context and finishes the sequence with the result.
 */
static void
_spdk_bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_set_super_ctx	*ctx = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("Unable to write to super block of blobstore\n");
	}

	spdk_free(ctx->super);

	spdk_bs_sequence_finish(seq, bserrno);

	free(ctx);
}
4755 
/*
 * Completion for reading the super block in spdk_bs_set_super();
 * rewrites it (picking up the new bs->super_blob id) on success.
 */
static void
_spdk_bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_bs_set_super_ctx	*ctx = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("Unable to read super block of blobstore\n");
		spdk_free(ctx->super);
		spdk_bs_sequence_finish(seq, bserrno);
		free(ctx);
		return;
	}

	_spdk_bs_write_super(seq, ctx->bs, ctx->super, _spdk_bs_set_super_write_cpl, ctx);
}
4771 
/*
 * Record @blobid as the blobstore's "super blob" and persist that in
 * the on-disk super block via a read-modify-write.  cb_fn is invoked
 * with 0 or a negative errno.
 */
void
spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid,
		  spdk_bs_op_complete cb_fn, void *cb_arg)
{
	struct spdk_bs_cpl		cpl;
	spdk_bs_sequence_t		*seq;
	struct spdk_bs_set_super_ctx	*ctx;

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Setting super blob id on blobstore\n");

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	ctx->bs = bs;

	ctx->super = spdk_zmalloc(sizeof(*ctx->super), 0x1000, NULL,
				  SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	if (!ctx->super) {
		free(ctx);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC;
	cpl.u.bs_basic.cb_fn = cb_fn;
	cpl.u.bs_basic.cb_arg = cb_arg;

	seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
	if (!seq) {
		spdk_free(ctx->super);
		free(ctx);
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	/* Update the in-memory id first; the write path reads it back
	 * when rebuilding the super block. */
	bs->super_blob = blobid;

	/* Read super block */
	spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(bs, 0),
				  _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)),
				  _spdk_bs_set_super_read_cpl, ctx);
}
4817 
4818 /* END spdk_bs_set_super */
4819 
4820 void
4821 spdk_bs_get_super(struct spdk_blob_store *bs,
4822 		  spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
4823 {
4824 	if (bs->super_blob == SPDK_BLOBID_INVALID) {
4825 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT);
4826 	} else {
4827 		cb_fn(cb_arg, bs->super_blob, 0);
4828 	}
4829 }
4830 
/* Return the blobstore's cluster size in bytes. */
uint64_t
spdk_bs_get_cluster_size(struct spdk_blob_store *bs)
{
	return bs->cluster_sz;
}
4836 
/* Return the metadata page size in bytes (a compile-time constant;
 * @bs is unused but kept for API symmetry). */
uint64_t
spdk_bs_get_page_size(struct spdk_blob_store *bs)
{
	return SPDK_BS_PAGE_SIZE;
}
4842 
/* Return the blobstore's I/O unit size in bytes. */
uint64_t
spdk_bs_get_io_unit_size(struct spdk_blob_store *bs)
{
	return bs->io_unit_size;
}
4848 
/* Return the number of clusters currently unallocated. */
uint64_t
spdk_bs_free_cluster_count(struct spdk_blob_store *bs)
{
	return bs->num_free_clusters;
}
4854 
/* Return the total number of clusters available for blob data
 * (i.e. excluding clusters reserved for metadata). */
uint64_t
spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs)
{
	return bs->total_data_clusters;
}
4860 
4861 static int
4862 spdk_bs_register_md_thread(struct spdk_blob_store *bs)
4863 {
4864 	bs->md_channel = spdk_get_io_channel(bs);
4865 	if (!bs->md_channel) {
4866 		SPDK_ERRLOG("Failed to get IO channel.\n");
4867 		return -1;
4868 	}
4869 
4870 	return 0;
4871 }
4872 
/* Release the metadata I/O channel acquired by
 * spdk_bs_register_md_thread().  Always succeeds. */
static int
spdk_bs_unregister_md_thread(struct spdk_blob_store *bs)
{
	spdk_put_io_channel(bs->md_channel);

	return 0;
}
4880 
/* Return the blob's id. */
spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob)
{
	assert(blob != NULL);

	return blob->id;
}
4887 
/* Return the blob's size expressed in metadata-sized pages, derived
 * from its currently active cluster count. */
uint64_t spdk_blob_get_num_pages(struct spdk_blob *blob)
{
	assert(blob != NULL);

	return _spdk_bs_cluster_to_page(blob->bs, blob->active.num_clusters);
}
4894 
/* Return the blob's size expressed in I/O units (pages scaled by the
 * blobstore's io-units-per-page factor). */
uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob)
{
	assert(blob != NULL);

	return spdk_blob_get_num_pages(blob) * _spdk_bs_io_unit_per_page(blob->bs);
}
4901 
/* Return the number of clusters in the blob's active (most recently
 * committed or in-progress) size. */
uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob)
{
	assert(blob != NULL);

	return blob->active.num_clusters;
}
4908 
4909 /* START spdk_bs_create_blob */
4910 
/*
 * Completion for persisting a newly created blob's metadata.  The
 * in-memory blob object is only needed during creation, so free it and
 * finish the sequence (which reports the new blob id to the caller).
 */
static void
_spdk_bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob *blob = cb_arg;

	_spdk_blob_free(blob);

	spdk_bs_sequence_finish(seq, bserrno);
}
4920 
4921 static int
4922 _spdk_blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs,
4923 		      bool internal)
4924 {
4925 	uint64_t i;
4926 	size_t value_len = 0;
4927 	int rc;
4928 	const void *value = NULL;
4929 	if (xattrs->count > 0 && xattrs->get_value == NULL) {
4930 		return -EINVAL;
4931 	}
4932 	for (i = 0; i < xattrs->count; i++) {
4933 		xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len);
4934 		if (value == NULL || value_len == 0) {
4935 			return -EINVAL;
4936 		}
4937 		rc = _spdk_blob_set_xattr(blob, xattrs->names[i], value, value_len, internal);
4938 		if (rc < 0) {
4939 			return rc;
4940 		}
4941 	}
4942 	return 0;
4943 }
4944 
4945 static void
4946 _spdk_bs_create_blob(struct spdk_blob_store *bs,
4947 		     const struct spdk_blob_opts *opts,
4948 		     const struct spdk_blob_xattr_opts *internal_xattrs,
4949 		     spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
4950 {
4951 	struct spdk_blob	*blob;
4952 	uint32_t		page_idx;
4953 	struct spdk_bs_cpl	cpl;
4954 	struct spdk_blob_opts	opts_default;
4955 	struct spdk_blob_xattr_opts internal_xattrs_default;
4956 	spdk_bs_sequence_t	*seq;
4957 	spdk_blob_id		id;
4958 	int rc;
4959 
4960 	assert(spdk_get_thread() == bs->md_thread);
4961 
4962 	page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0);
4963 	if (page_idx == UINT32_MAX) {
4964 		cb_fn(cb_arg, 0, -ENOMEM);
4965 		return;
4966 	}
4967 	spdk_bit_array_set(bs->used_blobids, page_idx);
4968 	_spdk_bs_claim_md_page(bs, page_idx);
4969 
4970 	id = _spdk_bs_page_to_blobid(page_idx);
4971 
4972 	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Creating blob with id %lu at page %u\n", id, page_idx);
4973 
4974 	blob = _spdk_blob_alloc(bs, id);
4975 	if (!blob) {
4976 		cb_fn(cb_arg, 0, -ENOMEM);
4977 		return;
4978 	}
4979 
4980 	if (!opts) {
4981 		spdk_blob_opts_init(&opts_default);
4982 		opts = &opts_default;
4983 	}
4984 
4985 	blob->use_extent_table = opts->use_extent_table;
4986 
4987 	if (!internal_xattrs) {
4988 		_spdk_blob_xattrs_init(&internal_xattrs_default);
4989 		internal_xattrs = &internal_xattrs_default;
4990 	}
4991 
4992 	rc = _spdk_blob_set_xattrs(blob, &opts->xattrs, false);
4993 	if (rc < 0) {
4994 		_spdk_blob_free(blob);
4995 		cb_fn(cb_arg, 0, rc);
4996 		return;
4997 	}
4998 
4999 	rc = _spdk_blob_set_xattrs(blob, internal_xattrs, true);
5000 	if (rc < 0) {
5001 		_spdk_blob_free(blob);
5002 		cb_fn(cb_arg, 0, rc);
5003 		return;
5004 	}
5005 
5006 	if (opts->thin_provision) {
5007 		_spdk_blob_set_thin_provision(blob);
5008 	}
5009 
5010 	_spdk_blob_set_clear_method(blob, opts->clear_method);
5011 
5012 	rc = _spdk_blob_resize(blob, opts->num_clusters);
5013 	if (rc < 0) {
5014 		_spdk_blob_free(blob);
5015 		cb_fn(cb_arg, 0, rc);
5016 		return;
5017 	}
5018 	cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
5019 	cpl.u.blobid.cb_fn = cb_fn;
5020 	cpl.u.blobid.cb_arg = cb_arg;
5021 	cpl.u.blobid.blobid = blob->id;
5022 
5023 	seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
5024 	if (!seq) {
5025 		_spdk_blob_free(blob);
5026 		cb_fn(cb_arg, 0, -ENOMEM);
5027 		return;
5028 	}
5029 
5030 	_spdk_blob_persist(seq, blob, _spdk_bs_create_blob_cpl, blob);
5031 }
5032 
/* Public entry point: create a blob with default options and no internal
 * xattrs. */
void spdk_bs_create_blob(struct spdk_blob_store *bs,
			 spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
{
	_spdk_bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg);
}
5038 
/* Public entry point: create a blob with caller-supplied options (size,
 * thin-provisioning, xattrs, ...). */
void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts,
			     spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
{
	_spdk_bs_create_blob(bs, opts, NULL, cb_fn, cb_arg);
}
5044 
5045 /* END spdk_bs_create_blob */
5046 
5047 /* START blob_cleanup */
5048 
/* Shared context for the snapshot, clone and inflate/decouple operations.
 * Allocated by the public entry points and freed in
 * _spdk_bs_clone_snapshot_cleanup_finish(). */
struct spdk_clone_snapshot_ctx {
	/* Completion that delivers the final result to the user callback. */
	struct spdk_bs_cpl      cpl;
	/* First error recorded during the operation; reported to the caller. */
	int bserrno;
	/* True once I/O on the original blob has been frozen (snapshot path). */
	bool frozen;

	/* Channel used for the zero-length touch writes during inflate. */
	struct spdk_io_channel *channel;

	/* Current cluster for inflate operation */
	uint64_t cluster;

	/* For inflation force allocation of all unallocated clusters and remove
	 * thin-provisioning. Otherwise only decouple parent and keep clone thin. */
	bool allocate_all;

	/* The blob the operation was requested on. */
	struct {
		spdk_blob_id id;
		struct spdk_blob *blob;
	} original;
	/* The newly created snapshot or clone. */
	struct {
		spdk_blob_id id;
		struct spdk_blob *blob;
	} new;

	/* xattrs specified for snapshot/clones only. They have no impact on
	 * the original blobs xattrs. */
	const struct spdk_blob_xattr_opts *xattrs;
};
5076 
/*
 * Final step of every clone/snapshot/inflate operation: report the first
 * recorded error (ctx->bserrno) through the completion stored in the context
 * and free the context.  A non-zero bserrno from the cleanup itself is only
 * logged if an earlier error was already recorded.
 */
static void
_spdk_bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = cb_arg;
	struct spdk_bs_cpl *cpl = &ctx->cpl;

	if (bserrno != 0) {
		if (ctx->bserrno != 0) {
			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
		} else {
			ctx->bserrno = bserrno;
		}
	}

	/* Dispatch on the completion type set by the public entry point:
	 * BLOBID for snapshot/clone creation, BLOB_BASIC for inflate. */
	switch (cpl->type) {
	case SPDK_BS_CPL_TYPE_BLOBID:
		cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno);
		break;
	case SPDK_BS_CPL_TYPE_BLOB_BASIC:
		cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno);
		break;
	default:
		SPDK_UNREACHABLE();
		break;
	}

	free(ctx);
}
5105 
/*
 * Completion of unfreezing I/O on the original blob.  Releases the blob's
 * operation lock and closes it; closing continues into
 * _spdk_bs_clone_snapshot_cleanup_finish().
 */
static void
_spdk_bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *origblob = ctx->original.blob;

	if (bserrno != 0) {
		if (ctx->bserrno != 0) {
			SPDK_ERRLOG("Unfreeze error %d\n", bserrno);
		} else {
			ctx->bserrno = bserrno;
		}
	}

	ctx->original.id = origblob->id;
	origblob->locked_operation_in_progress = false;

	spdk_blob_close(origblob, _spdk_bs_clone_snapshot_cleanup_finish, ctx);
}
5125 
/*
 * Cleanup path for the original blob: records the first error, then either
 * unfreezes outstanding I/O first (if the snapshot path froze it) or goes
 * straight to the unfreeze completion, which unlocks and closes the blob.
 */
static void
_spdk_bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *origblob = ctx->original.blob;

	if (bserrno != 0) {
		if (ctx->bserrno != 0) {
			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
		} else {
			ctx->bserrno = bserrno;
		}
	}

	if (ctx->frozen) {
		/* Unfreeze any outstanding I/O */
		_spdk_blob_unfreeze_io(origblob, _spdk_bs_snapshot_unfreeze_cpl, ctx);
	} else {
		_spdk_bs_snapshot_unfreeze_cpl(ctx, 0);
	}

}
5148 
/*
 * Cleanup path for the newly created snapshot/clone blob: records the first
 * error and closes the new blob, then continues with cleanup of the
 * original blob.
 */
static void
_spdk_bs_clone_snapshot_newblob_cleanup(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *newblob = ctx->new.blob;

	if (bserrno != 0) {
		if (ctx->bserrno != 0) {
			SPDK_ERRLOG("Cleanup error %d\n", bserrno);
		} else {
			ctx->bserrno = bserrno;
		}
	}

	ctx->new.id = newblob->id;
	spdk_blob_close(newblob, _spdk_bs_clone_snapshot_origblob_cleanup, ctx);
}
5166 
5167 /* END blob_cleanup */
5168 
5169 /* START spdk_bs_create_snapshot */
5170 
5171 static void
5172 _spdk_bs_snapshot_swap_cluster_maps(struct spdk_blob *blob1, struct spdk_blob *blob2)
5173 {
5174 	uint64_t *cluster_temp;
5175 	uint32_t *extent_page_temp;
5176 
5177 	cluster_temp = blob1->active.clusters;
5178 	blob1->active.clusters = blob2->active.clusters;
5179 	blob2->active.clusters = cluster_temp;
5180 
5181 	extent_page_temp = blob1->active.extent_pages;
5182 	blob1->active.extent_pages = blob2->active.extent_pages;
5183 	blob2->active.extent_pages = extent_page_temp;
5184 }
5185 
/*
 * Completion of syncing the original blob's metadata during snapshot
 * creation.  On success: drop the SNAPSHOT_IN_PROGRESS marker from the new
 * snapshot, mark it read-only and sync its metadata; the chain then finishes
 * through _spdk_bs_clone_snapshot_origblob_cleanup().
 */
static void
_spdk_bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *origblob = ctx->original.blob;
	struct spdk_blob *newblob = ctx->new.blob;

	if (bserrno != 0) {
		/* Undo the cluster-map swap done in the freeze completion so the
		 * original blob keeps its own clusters, then bail out. */
		_spdk_bs_snapshot_swap_cluster_maps(newblob, origblob);
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
		return;
	}

	/* Remove metadata descriptor SNAPSHOT_IN_PROGRESS */
	bserrno = _spdk_blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true);
	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
		return;
	}

	_spdk_bs_blob_list_add(ctx->original.blob);

	spdk_blob_set_read_only(newblob);

	/* sync snapshot metadata */
	spdk_blob_sync_md(newblob, _spdk_bs_clone_snapshot_origblob_cleanup, ctx);
}
5213 
/*
 * Completion of syncing the new snapshot's metadata.  On success, rewires the
 * original blob to be a thin-provisioned clone of the snapshot (BLOB_SNAPSHOT
 * xattr, parent_id, new back_bs_dev) and syncs the original blob's metadata.
 * Any failure swaps the cluster maps back before cleaning up.
 */
static void
_spdk_bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *origblob = ctx->original.blob;
	struct spdk_blob *newblob = ctx->new.blob;

	if (bserrno != 0) {
		/* return cluster map back to original */
		_spdk_bs_snapshot_swap_cluster_maps(newblob, origblob);
		_spdk_bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
		return;
	}

	/* Set internal xattr for snapshot id */
	bserrno = _spdk_blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true);
	if (bserrno != 0) {
		/* return cluster map back to original */
		_spdk_bs_snapshot_swap_cluster_maps(newblob, origblob);
		_spdk_bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
		return;
	}

	_spdk_bs_blob_list_remove(origblob);
	origblob->parent_id = newblob->id;

	/* Create new back_bs_dev for snapshot */
	origblob->back_bs_dev = spdk_bs_create_blob_bs_dev(newblob);
	if (origblob->back_bs_dev == NULL) {
		/* return cluster map back to original */
		_spdk_bs_snapshot_swap_cluster_maps(newblob, origblob);
		_spdk_bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL);
		return;
	}

	/* set clone blob as thin provisioned */
	_spdk_blob_set_thin_provision(origblob);

	_spdk_bs_blob_list_add(newblob);

	/* sync clone metadata */
	spdk_blob_sync_md(origblob, _spdk_bs_snapshot_origblob_sync_cpl, ctx);
}
5257 
/*
 * Completion of freezing I/O on the original blob.  With I/O quiesced, the
 * snapshot takes over the original's back_bs_dev, flags, parent and cluster
 * map (via swap), then its metadata is synced.  ctx->frozen is set so the
 * cleanup path knows it must unfreeze.
 */
static void
_spdk_bs_snapshot_freeze_cpl(void *cb_arg, int rc)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *origblob = ctx->original.blob;
	struct spdk_blob *newblob = ctx->new.blob;
	int bserrno;

	if (rc != 0) {
		_spdk_bs_clone_snapshot_newblob_cleanup(ctx, rc);
		return;
	}

	ctx->frozen = true;

	/* set new back_bs_dev for snapshot */
	newblob->back_bs_dev = origblob->back_bs_dev;
	/* Set invalid flags from origblob */
	newblob->invalid_flags = origblob->invalid_flags;

	/* inherit parent from original blob if set */
	newblob->parent_id = origblob->parent_id;
	if (origblob->parent_id != SPDK_BLOBID_INVALID) {
		/* Set internal xattr for snapshot id */
		bserrno = _spdk_blob_set_xattr(newblob, BLOB_SNAPSHOT,
					       &origblob->parent_id, sizeof(spdk_blob_id), true);
		if (bserrno != 0) {
			_spdk_bs_clone_snapshot_newblob_cleanup(ctx, bserrno);
			return;
		}
	}

	/* swap cluster maps */
	_spdk_bs_snapshot_swap_cluster_maps(newblob, origblob);

	/* Set the clear method on the new blob to match the original. */
	_spdk_blob_set_clear_method(newblob, origblob->clear_method);

	/* sync snapshot metadata */
	spdk_blob_sync_md(newblob, _spdk_bs_snapshot_newblob_sync_cpl, ctx);
}
5299 
/*
 * Completion of opening the newly created snapshot blob.  Records it in the
 * context, sanity-checks that it is thin-provisioned and has no allocated
 * clusters or extent pages yet, then freezes I/O on the original blob.
 */
static void
_spdk_bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *origblob = ctx->original.blob;
	struct spdk_blob *newblob = _blob;

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
		return;
	}

	ctx->new.blob = newblob;
	assert(spdk_blob_is_thin_provisioned(newblob));
	assert(spdk_mem_all_zero(newblob->active.clusters,
				 newblob->active.num_clusters * sizeof(*newblob->active.clusters)));
	assert(spdk_mem_all_zero(newblob->active.extent_pages,
				 newblob->active.num_extent_pages * sizeof(*newblob->active.extent_pages)));

	_spdk_blob_freeze_io(origblob, _spdk_bs_snapshot_freeze_cpl, ctx);
}
5321 
/*
 * Completion of creating the snapshot blob.  Stores the new blobid (also as
 * the eventual user-visible result) and opens the new blob to continue the
 * snapshot sequence.
 */
static void
_spdk_bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *origblob = ctx->original.blob;

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
		return;
	}

	ctx->new.id = blobid;
	ctx->cpl.u.blobid.blobid = blobid;

	spdk_bs_open_blob(origblob->bs, ctx->new.id, _spdk_bs_snapshot_newblob_open_cpl, ctx);
}
5338 
5339 
5340 static void
5341 _spdk_bs_xattr_snapshot(void *arg, const char *name,
5342 			const void **value, size_t *value_len)
5343 {
5344 	assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0);
5345 
5346 	struct spdk_blob *blob = (struct spdk_blob *)arg;
5347 	*value = &blob->id;
5348 	*value_len = sizeof(blob->id);
5349 }
5350 
/*
 * Completion of opening the blob to be snapshotted.  Rejects read-only blobs
 * and blobs with another locked operation in progress, then creates the
 * snapshot as a thin-provisioned blob of the same size carrying the
 * SNAPSHOT_IN_PROGRESS internal xattr (removed once the snapshot is
 * complete).
 */
static void
_spdk_bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob_opts opts;
	struct spdk_blob_xattr_opts internal_xattrs;
	char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS };

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_cleanup_finish(ctx, bserrno);
		return;
	}

	ctx->original.blob = _blob;

	if (_blob->data_ro || _blob->md_ro) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot from read only blob with id %lu\n",
			      _blob->id);
		ctx->bserrno = -EINVAL;
		spdk_blob_close(_blob, _spdk_bs_clone_snapshot_cleanup_finish, ctx);
		return;
	}

	if (_blob->locked_operation_in_progress) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot - another operation in progress\n");
		ctx->bserrno = -EBUSY;
		spdk_blob_close(_blob, _spdk_bs_clone_snapshot_cleanup_finish, ctx);
		return;
	}

	/* Lock the blob against concurrent locked operations; released in the
	 * unfreeze completion on the cleanup path. */
	_blob->locked_operation_in_progress = true;

	spdk_blob_opts_init(&opts);
	_spdk_blob_xattrs_init(&internal_xattrs);

	/* Change the size of new blob to the same as in original blob,
	 * but do not allocate clusters */
	opts.thin_provision = true;
	opts.num_clusters = spdk_blob_get_num_clusters(_blob);
	opts.use_extent_table = _blob->use_extent_table;

	/* If there are any xattrs specified for snapshot, set them now */
	if (ctx->xattrs) {
		memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
	}
	/* Set internal xattr SNAPSHOT_IN_PROGRESS */
	internal_xattrs.count = 1;
	internal_xattrs.ctx = _blob;
	internal_xattrs.names = xattrs_names;
	internal_xattrs.get_value = _spdk_bs_xattr_snapshot;

	_spdk_bs_create_blob(_blob->bs, &opts, &internal_xattrs,
			     _spdk_bs_snapshot_newblob_create_cpl, ctx);
}
5405 
5406 void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid,
5407 			     const struct spdk_blob_xattr_opts *snapshot_xattrs,
5408 			     spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5409 {
5410 	struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
5411 
5412 	if (!ctx) {
5413 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
5414 		return;
5415 	}
5416 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
5417 	ctx->cpl.u.blobid.cb_fn = cb_fn;
5418 	ctx->cpl.u.blobid.cb_arg = cb_arg;
5419 	ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
5420 	ctx->bserrno = 0;
5421 	ctx->frozen = false;
5422 	ctx->original.id = blobid;
5423 	ctx->xattrs = snapshot_xattrs;
5424 
5425 	spdk_bs_open_blob(bs, ctx->original.id, _spdk_bs_snapshot_origblob_open_cpl, ctx);
5426 }
5427 /* END spdk_bs_create_snapshot */
5428 
5429 /* START spdk_bs_create_clone */
5430 
5431 static void
5432 _spdk_bs_xattr_clone(void *arg, const char *name,
5433 		     const void **value, size_t *value_len)
5434 {
5435 	assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0);
5436 
5437 	struct spdk_blob *blob = (struct spdk_blob *)arg;
5438 	*value = &blob->id;
5439 	*value_len = sizeof(blob->id);
5440 }
5441 
5442 static void
5443 _spdk_bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
5444 {
5445 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5446 	struct spdk_blob *clone = _blob;
5447 
5448 	ctx->new.blob = clone;
5449 	_spdk_bs_blob_list_add(clone);
5450 
5451 	spdk_blob_close(clone, _spdk_bs_clone_snapshot_origblob_cleanup, ctx);
5452 }
5453 
5454 static void
5455 _spdk_bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno)
5456 {
5457 	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
5458 
5459 	ctx->cpl.u.blobid.blobid = blobid;
5460 	spdk_bs_open_blob(ctx->original.blob->bs, blobid, _spdk_bs_clone_newblob_open_cpl, ctx);
5461 }
5462 
/*
 * Completion of opening the snapshot to be cloned.  Clones may only be made
 * from fully read-only blobs; rejects writable blobs and blobs with another
 * locked operation in progress, then creates the clone as a thin-provisioned
 * blob of the same size carrying the internal BLOB_SNAPSHOT xattr that links
 * it to its parent snapshot.
 */
static void
_spdk_bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
{
	struct spdk_clone_snapshot_ctx	*ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob_opts		opts;
	struct spdk_blob_xattr_opts internal_xattrs;
	char *xattr_names[] = { BLOB_SNAPSHOT };

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_cleanup_finish(ctx, bserrno);
		return;
	}

	ctx->original.blob = _blob;

	if (!_blob->data_ro || !_blob->md_ro) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Clone not from read-only blob\n");
		ctx->bserrno = -EINVAL;
		spdk_blob_close(_blob, _spdk_bs_clone_snapshot_cleanup_finish, ctx);
		return;
	}

	if (_blob->locked_operation_in_progress) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create clone - another operation in progress\n");
		ctx->bserrno = -EBUSY;
		spdk_blob_close(_blob, _spdk_bs_clone_snapshot_cleanup_finish, ctx);
		return;
	}

	/* Lock the snapshot against concurrent locked operations; released in
	 * the unfreeze completion on the cleanup path. */
	_blob->locked_operation_in_progress = true;

	spdk_blob_opts_init(&opts);
	_spdk_blob_xattrs_init(&internal_xattrs);

	opts.thin_provision = true;
	opts.num_clusters = spdk_blob_get_num_clusters(_blob);
	opts.use_extent_table = _blob->use_extent_table;
	if (ctx->xattrs) {
		memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs));
	}

	/* Set internal xattr BLOB_SNAPSHOT */
	internal_xattrs.count = 1;
	internal_xattrs.ctx = _blob;
	internal_xattrs.names = xattr_names;
	internal_xattrs.get_value = _spdk_bs_xattr_clone;

	_spdk_bs_create_blob(_blob->bs, &opts, &internal_xattrs,
			     _spdk_bs_clone_newblob_create_cpl, ctx);
}
5513 
5514 void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid,
5515 			  const struct spdk_blob_xattr_opts *clone_xattrs,
5516 			  spdk_blob_op_with_id_complete cb_fn, void *cb_arg)
5517 {
5518 	struct spdk_clone_snapshot_ctx	*ctx = calloc(1, sizeof(*ctx));
5519 
5520 	if (!ctx) {
5521 		cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM);
5522 		return;
5523 	}
5524 
5525 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID;
5526 	ctx->cpl.u.blobid.cb_fn = cb_fn;
5527 	ctx->cpl.u.blobid.cb_arg = cb_arg;
5528 	ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID;
5529 	ctx->bserrno = 0;
5530 	ctx->xattrs = clone_xattrs;
5531 	ctx->original.id = blobid;
5532 
5533 	spdk_bs_open_blob(bs, ctx->original.id, _spdk_bs_clone_origblob_open_cpl, ctx);
5534 }
5535 
5536 /* END spdk_bs_create_clone */
5537 
5538 /* START spdk_bs_inflate_blob */
5539 
/*
 * Decouple-parent path: completion of opening the grandparent snapshot.
 * Re-points the inflated blob at its grandparent (_parent) - updates
 * parent_id, the BLOB_SNAPSHOT xattr and back_bs_dev - then syncs metadata.
 *
 * NOTE(review): the return values of _spdk_blob_set_xattr() and
 * spdk_bs_create_blob_bs_dev() are not checked here - confirm whether those
 * failures need handling as in _spdk_bs_snapshot_newblob_sync_cpl().
 */
static void
_spdk_bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *_blob = ctx->original.blob;

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
		return;
	}

	assert(_parent != NULL);

	_spdk_bs_blob_list_remove(_blob);
	_blob->parent_id = _parent->id;
	_spdk_blob_set_xattr(_blob, BLOB_SNAPSHOT, &_blob->parent_id,
			     sizeof(spdk_blob_id), true);

	_blob->back_bs_dev->destroy(_blob->back_bs_dev);
	_blob->back_bs_dev = spdk_bs_create_blob_bs_dev(_parent);
	_spdk_bs_blob_list_add(_blob);

	spdk_blob_sync_md(_blob, _spdk_bs_clone_snapshot_origblob_cleanup, ctx);
}
5564 
/*
 * All required clusters have been touched (allocated).  Finalize:
 *  - inflate (allocate_all): drop the parent link, the thin-provision flag
 *    and the back device entirely;
 *  - decouple-parent: if the parent itself has a parent, re-link to that
 *    grandparent (async, via _spdk_bs_inflate_blob_set_parent_cpl);
 *    otherwise detach and back the blob with a zeroes device.
 * Finishes by syncing the blob's metadata.
 */
static void
_spdk_bs_inflate_blob_done(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *_blob = ctx->original.blob;
	struct spdk_blob *_parent;

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
		return;
	}

	if (ctx->allocate_all) {
		/* remove thin provisioning */
		_spdk_bs_blob_list_remove(_blob);
		_spdk_blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
		_blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV;
		_blob->back_bs_dev->destroy(_blob->back_bs_dev);
		_blob->back_bs_dev = NULL;
		_blob->parent_id = SPDK_BLOBID_INVALID;
	} else {
		_parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob;
		if (_parent->parent_id != SPDK_BLOBID_INVALID) {
			/* We must change the parent of the inflated blob */
			spdk_bs_open_blob(_blob->bs, _parent->parent_id,
					  _spdk_bs_inflate_blob_set_parent_cpl, ctx);
			return;
		}

		_spdk_bs_blob_list_remove(_blob);
		_spdk_blob_remove_xattr(_blob, BLOB_SNAPSHOT, true);
		_blob->parent_id = SPDK_BLOBID_INVALID;
		_blob->back_bs_dev->destroy(_blob->back_bs_dev);
		_blob->back_bs_dev = spdk_bs_create_zeroes_dev();
	}

	_blob->state = SPDK_BLOB_STATE_DIRTY;
	spdk_blob_sync_md(_blob, _spdk_bs_clone_snapshot_origblob_cleanup, ctx);
}
5604 
5605 /* Check if cluster needs allocation */
5606 static inline bool
5607 _spdk_bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all)
5608 {
5609 	struct spdk_blob_bs_dev *b;
5610 
5611 	assert(blob != NULL);
5612 
5613 	if (blob->active.clusters[cluster] != 0) {
5614 		/* Cluster is already allocated */
5615 		return false;
5616 	}
5617 
5618 	if (blob->parent_id == SPDK_BLOBID_INVALID) {
5619 		/* Blob have no parent blob */
5620 		return allocate_all;
5621 	}
5622 
5623 	b = (struct spdk_blob_bs_dev *)blob->back_bs_dev;
5624 	return (allocate_all || b->blob->active.clusters[cluster] != 0);
5625 }
5626 
/*
 * Iterative step of inflate/decouple: scan forward from ctx->cluster for the
 * next cluster that needs allocation and issue a zero-length write to force
 * its allocation (and copy-up from the parent).  Re-enters itself as the
 * write completion; when no clusters remain, proceeds to
 * _spdk_bs_inflate_blob_done().
 */
static void
_spdk_bs_inflate_blob_touch_next(void *cb_arg, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	struct spdk_blob *_blob = ctx->original.blob;
	uint64_t offset;

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno);
		return;
	}

	for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) {
		if (_spdk_bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) {
			break;
		}
	}

	if (ctx->cluster < _blob->active.num_clusters) {
		offset = _spdk_bs_cluster_to_lba(_blob->bs, ctx->cluster);

		/* We may safely increment a cluster before write */
		ctx->cluster++;

		/* Use zero length write to touch a cluster */
		spdk_blob_io_write(_blob, ctx->channel, NULL, offset, 0,
				   _spdk_bs_inflate_blob_touch_next, ctx);
	} else {
		_spdk_bs_inflate_blob_done(cb_arg, bserrno);
	}
}
5658 
/*
 * Completion of opening the blob to inflate/decouple.  Validates the request
 * (operation lock free; decouple requires a parent; blob must be thin
 * provisioned), verifies up front that enough free clusters exist to satisfy
 * the whole operation, then starts touching clusters one by one.
 */
static void
_spdk_bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
{
	struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg;
	uint64_t lfc; /* lowest free cluster */
	uint64_t i;

	if (bserrno != 0) {
		_spdk_bs_clone_snapshot_cleanup_finish(ctx, bserrno);
		return;
	}

	ctx->original.blob = _blob;

	if (_blob->locked_operation_in_progress) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot inflate blob - another operation in progress\n");
		ctx->bserrno = -EBUSY;
		spdk_blob_close(_blob, _spdk_bs_clone_snapshot_cleanup_finish, ctx);
		return;
	}

	/* Lock released in the unfreeze completion on the cleanup path. */
	_blob->locked_operation_in_progress = true;

	if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) {
		/* This blob have no parent, so we cannot decouple it. */
		SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n");
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL);
		return;
	}

	if (spdk_blob_is_thin_provisioned(_blob) == false) {
		/* This is not thin provisioned blob. No need to inflate. */
		_spdk_bs_clone_snapshot_origblob_cleanup(ctx, 0);
		return;
	}

	/* Do two passes - one to verify that we can obtain enough clusters
	 * and another to actually claim them.
	 */
	lfc = 0;
	for (i = 0; i < _blob->active.num_clusters; i++) {
		if (_spdk_bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) {
			lfc = spdk_bit_array_find_first_clear(_blob->bs->used_clusters, lfc);
			if (lfc == UINT32_MAX) {
				/* No more free clusters. Cannot satisfy the request */
				_spdk_bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC);
				return;
			}
			lfc++;
		}
	}

	ctx->cluster = 0;
	_spdk_bs_inflate_blob_touch_next(ctx, 0);
}
5714 
5715 static void
5716 _spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
5717 		      spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg)
5718 {
5719 	struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx));
5720 
5721 	if (!ctx) {
5722 		cb_fn(cb_arg, -ENOMEM);
5723 		return;
5724 	}
5725 	ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
5726 	ctx->cpl.u.bs_basic.cb_fn = cb_fn;
5727 	ctx->cpl.u.bs_basic.cb_arg = cb_arg;
5728 	ctx->bserrno = 0;
5729 	ctx->original.id = blobid;
5730 	ctx->channel = channel;
5731 	ctx->allocate_all = allocate_all;
5732 
5733 	spdk_bs_open_blob(bs, ctx->original.id, _spdk_bs_inflate_blob_open_cpl, ctx);
5734 }
5735 
/* Public entry point: fully inflate a thin-provisioned blob - allocate every
 * cluster and remove the parent dependency. */
void
spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
		     spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	_spdk_bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg);
}
5742 
/* Public entry point: copy only the clusters backed by the immediate parent
 * and re-link the blob past it, keeping the blob thin provisioned. */
void
spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel,
			     spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	_spdk_bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg);
}
5749 /* END spdk_bs_inflate_blob */
5750 
5751 /* START spdk_blob_resize */
/* Context carrying a resize request across the freeze -> resize -> unfreeze
 * sequence.  Freed in _spdk_bs_resize_unfreeze_cpl(). */
struct spdk_bs_resize_ctx {
	spdk_blob_op_complete cb_fn;
	void *cb_arg;
	struct spdk_blob *blob;
	/* Requested new size, in clusters. */
	uint64_t sz;
	/* Result of _spdk_blob_resize(), recorded in the freeze completion. */
	int rc;
};
5759 
5760 static void
5761 _spdk_bs_resize_unfreeze_cpl(void *cb_arg, int rc)
5762 {
5763 	struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;
5764 
5765 	if (rc != 0) {
5766 		SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc);
5767 	}
5768 
5769 	if (ctx->rc != 0) {
5770 		SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc);
5771 		rc = ctx->rc;
5772 	}
5773 
5774 	ctx->blob->locked_operation_in_progress = false;
5775 
5776 	ctx->cb_fn(ctx->cb_arg, rc);
5777 	free(ctx);
5778 }
5779 
/*
 * Completion of freezing I/O for a resize.  With I/O quiesced, performs the
 * actual resize (stashing its result in ctx->rc for the unfreeze completion
 * to report) and then unfreezes.  A freeze failure aborts immediately.
 */
static void
_spdk_bs_resize_freeze_cpl(void *cb_arg, int rc)
{
	struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg;

	if (rc != 0) {
		ctx->blob->locked_operation_in_progress = false;
		ctx->cb_fn(ctx->cb_arg, rc);
		free(ctx);
		return;
	}

	ctx->rc = _spdk_blob_resize(ctx->blob, ctx->sz);

	_spdk_blob_unfreeze_io(ctx->blob, _spdk_bs_resize_unfreeze_cpl, ctx);
}
5796 
/*
 * Public entry point: resize a blob to `sz` clusters.  Fails with -EPERM on a
 * metadata-read-only blob and -EBUSY if another locked operation is running;
 * a no-op resize completes immediately with 0.  Otherwise I/O is frozen, the
 * resize is performed, and I/O is unfrozen before cb_fn is invoked.
 */
void
spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg)
{
	struct spdk_bs_resize_ctx *ctx;

	_spdk_blob_verify_md_op(blob);

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Resizing blob %lu to %lu clusters\n", blob->id, sz);

	if (blob->md_ro) {
		cb_fn(cb_arg, -EPERM);
		return;
	}

	if (sz == blob->active.num_clusters) {
		cb_fn(cb_arg, 0);
		return;
	}

	if (blob->locked_operation_in_progress) {
		cb_fn(cb_arg, -EBUSY);
		return;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}

	/* Lock released in _spdk_bs_resize_unfreeze_cpl() (or on freeze failure). */
	blob->locked_operation_in_progress = true;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->blob = blob;
	ctx->sz = sz;
	_spdk_blob_freeze_io(blob, _spdk_bs_resize_freeze_cpl, ctx);
}
5834 
5835 /* END spdk_blob_resize */
5836 
5837 
5838 /* START spdk_bs_delete_blob */
5839 
/* Final step of blob deletion: finish the sequence, which invokes the user's
 * completion with bserrno. */
static void
_spdk_bs_delete_close_cpl(void *cb_arg, int bserrno)
{
	spdk_bs_sequence_t *seq = cb_arg;

	spdk_bs_sequence_finish(seq, bserrno);
}
5847 
/*
 * Completion of persisting the deleted-blob metadata.  On success the blob is
 * closed (which frees it, as it was already removed from the blobstore's blob
 * list); on failure it must be freed directly here.
 */
static void
_spdk_bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob *blob = cb_arg;

	if (bserrno != 0) {
		/*
		 * We already removed this blob from the blobstore tailq, so
		 *  we need to free it here since this is the last reference
		 *  to it.
		 */
		_spdk_blob_free(blob);
		_spdk_bs_delete_close_cpl(seq, bserrno);
		return;
	}

	/*
	 * This will immediately decrement the ref_count and call
	 *  the completion routine since the metadata state is clean.
	 *  By calling spdk_blob_close, we reduce the number of call
	 *  points into code that touches the blob->open_ref count
	 *  and the blobstore's blob list.
	 */
	spdk_blob_close(blob, _spdk_bs_delete_close_cpl, seq);
}
5873 
/* Context for deleting a snapshot that has a clone: tracks both blobs, their
 * saved md_ro flags (restored on cleanup) and the first error.  Freed in
 * _spdk_delete_blob_cleanup_finish(). */
struct delete_snapshot_ctx {
	struct spdk_blob_list *parent_snapshot_entry;
	struct spdk_blob *snapshot;
	/* Saved md_ro state of the snapshot, restored during cleanup. */
	bool snapshot_md_ro;
	struct spdk_blob *clone;
	/* Saved md_ro state of the clone, restored during cleanup. */
	bool clone_md_ro;
	spdk_blob_op_with_handle_complete cb_fn;
	void *cb_arg;
	int bserrno;
};
5884 
/*
 * Final step of snapshot deletion cleanup: report the first recorded error
 * through the caller's completion (passing the snapshot handle) and free the
 * context.
 */
static void
_spdk_delete_blob_cleanup_finish(void *cb_arg, int bserrno)
{
	struct delete_snapshot_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		SPDK_ERRLOG("Snapshot cleanup error %d\n", bserrno);
	}

	assert(ctx != NULL);

	if (bserrno != 0 && ctx->bserrno == 0) {
		ctx->bserrno = bserrno;
	}

	ctx->cb_fn(ctx->cb_arg, ctx->snapshot, ctx->bserrno);
	free(ctx);
}
5903 
/*
 * Cleanup step for the snapshot being deleted: restore its list membership
 * (when still referenced by the clone), release its operation lock, restore
 * its md_ro flag and close it.
 */
static void
_spdk_delete_snapshot_cleanup_snapshot(void *cb_arg, int bserrno)
{
	struct delete_snapshot_ctx *ctx = cb_arg;

	if (bserrno != 0) {
		ctx->bserrno = bserrno;
		SPDK_ERRLOG("Clone cleanup error %d\n", bserrno);
	}

	/* open_ref == 1 means that only deletion context has opened this snapshot
	 * open_ref == 2 means that clone has opened this snapshot as well,
	 * so we have to add it back to the blobs list */
	if (ctx->snapshot->open_ref == 2) {
		TAILQ_INSERT_HEAD(&ctx->snapshot->bs->blobs, ctx->snapshot, link);
	}

	ctx->snapshot->locked_operation_in_progress = false;
	ctx->snapshot->md_ro = ctx->snapshot_md_ro;

	spdk_blob_close(ctx->snapshot, _spdk_delete_blob_cleanup_finish, ctx);
}
5926 
5927 static void
5928 _spdk_delete_snapshot_cleanup_clone(void *cb_arg, int bserrno)
5929 {
5930 	struct delete_snapshot_ctx *ctx = cb_arg;
5931 
5932 	ctx->clone->locked_operation_in_progress = false;
5933 	ctx->clone->md_ro = ctx->clone_md_ro;
5934 
5935 	spdk_blob_close(ctx->clone, _spdk_delete_snapshot_cleanup_snapshot, ctx);
5936 }
5937 
5938 static void
5939 _spdk_delete_snapshot_unfreeze_cpl(void *cb_arg, int bserrno)
5940 {
5941 	struct delete_snapshot_ctx *ctx = cb_arg;
5942 
5943 	if (bserrno) {
5944 		ctx->bserrno = bserrno;
5945 		_spdk_delete_snapshot_cleanup_clone(ctx, 0);
5946 		return;
5947 	}
5948 
5949 	ctx->clone->locked_operation_in_progress = false;
5950 	spdk_blob_close(ctx->clone, _spdk_delete_blob_cleanup_finish, ctx);
5951 }
5952 
/* Completion of the snapshot's final metadata sync during deletion.
 * Detaches the (single) clone entry from the snapshot being removed and,
 * if the snapshot itself was a clone, re-parents that entry onto the
 * snapshot's own parent.  Finishes by unfreezing I/O on the clone. */
static void
_spdk_delete_snapshot_sync_snapshot_cpl(void *cb_arg, int bserrno)
{
	struct delete_snapshot_ctx *ctx = cb_arg;
	struct spdk_blob_list *parent_snapshot_entry = NULL;
	struct spdk_blob_list *snapshot_entry = NULL;
	struct spdk_blob_list *clone_entry = NULL;
	struct spdk_blob_list *snapshot_clone_entry = NULL;

	if (bserrno) {
		SPDK_ERRLOG("Failed to sync MD on blob\n");
		ctx->bserrno = bserrno;
		_spdk_delete_snapshot_cleanup_clone(ctx, 0);
		return;
	}

	/* Get snapshot entry for the snapshot we want to remove */
	snapshot_entry = _spdk_bs_get_snapshot_entry(ctx->snapshot->bs, ctx->snapshot->id);

	assert(snapshot_entry != NULL);

	/* Remove clone entry in this snapshot (at this point there can be only one clone) */
	clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
	assert(clone_entry != NULL);
	TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link);
	snapshot_entry->clone_count--;
	assert(TAILQ_EMPTY(&snapshot_entry->clones));

	if (ctx->snapshot->parent_id != SPDK_BLOBID_INVALID) {
		/* This snapshot is at the same time a clone of another snapshot - we need to
		 * update parent snapshot (remove current clone, add new one inherited from
		 * the snapshot that is being removed) */

		/* Get snapshot entry for parent snapshot and clone entry within that snapshot for
		 * snapshot that we are removing */
		_spdk_blob_get_snapshot_and_clone_entries(ctx->snapshot, &parent_snapshot_entry,
				&snapshot_clone_entry);

		/* Switch clone entry in parent snapshot */
		TAILQ_INSERT_TAIL(&parent_snapshot_entry->clones, clone_entry, link)
		TAILQ_REMOVE(&parent_snapshot_entry->clones, snapshot_clone_entry, link);
		free(snapshot_clone_entry);
	} else {
		/* No parent snapshot - just remove clone entry */
		free(clone_entry);
	}

	/* Restore md_ro flags */
	ctx->clone->md_ro = ctx->clone_md_ro;
	ctx->snapshot->md_ro = ctx->snapshot_md_ro;

	_spdk_blob_unfreeze_io(ctx->clone, _spdk_delete_snapshot_unfreeze_cpl, ctx);
}
6006 
/* Completion of the clone's metadata sync while deleting its snapshot.
 * On success, zero the snapshot's cluster map entries that the clone has
 * taken over (see _spdk_delete_snapshot_sync_snapshot_xattr_cpl) and sync
 * the snapshot; on failure, undo the pending-removal xattr on the
 * snapshot and bail out through the cleanup path. */
static void
_spdk_delete_snapshot_sync_clone_cpl(void *cb_arg, int bserrno)
{
	struct delete_snapshot_ctx *ctx = cb_arg;
	uint64_t i;

	/* Keep the snapshot MD writable for the updates below; the original
	 * flag is restored later on the success or cleanup path. */
	ctx->snapshot->md_ro = false;

	if (bserrno) {
		SPDK_ERRLOG("Failed to sync MD on clone\n");
		ctx->bserrno = bserrno;

		/* Restore snapshot to previous state */
		bserrno = _spdk_blob_remove_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, true);
		if (bserrno != 0) {
			_spdk_delete_snapshot_cleanup_clone(ctx, bserrno);
			return;
		}

		spdk_blob_sync_md(ctx->snapshot, _spdk_delete_snapshot_cleanup_clone, ctx);
		return;
	}

	/* Clear cluster map entries for snapshot */
	/* Any cluster the clone now shares with the snapshot was copied over
	 * in the previous step, so drop it here to avoid double ownership. */
	for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
		if (ctx->clone->active.clusters[i] == ctx->snapshot->active.clusters[i]) {
			ctx->snapshot->active.clusters[i] = 0;
		}
	}

	ctx->snapshot->state = SPDK_BLOB_STATE_DIRTY;

	if (ctx->parent_snapshot_entry != NULL) {
		/* The snapshot's back_bs_dev was handed to the clone in the
		 * previous step - clear the now-stale reference. */
		ctx->snapshot->back_bs_dev = NULL;
	}

	spdk_blob_sync_md(ctx->snapshot, _spdk_delete_snapshot_sync_snapshot_cpl, ctx);
}
6045 
/* The snapshot's pending-removal xattr has been synced.  Transfer the
 * snapshot's clusters and backing device to the clone and re-point the
 * clone's parent (to the snapshot's parent, or to a zeroes device), then
 * sync the clone's metadata. */
static void
_spdk_delete_snapshot_sync_snapshot_xattr_cpl(void *cb_arg, int bserrno)
{
	struct delete_snapshot_ctx *ctx = cb_arg;
	uint64_t i;

	/* Temporarily override md_ro flag for clone for MD modification */
	ctx->clone_md_ro = ctx->clone->md_ro;
	ctx->clone->md_ro = false;

	if (bserrno) {
		SPDK_ERRLOG("Failed to sync MD with xattr on blob\n");
		ctx->bserrno = bserrno;
		_spdk_delete_snapshot_cleanup_clone(ctx, 0);
		return;
	}

	/* Copy snapshot map to clone map (only unallocated clusters in clone) */
	for (i = 0; i < ctx->snapshot->active.num_clusters && i < ctx->clone->active.num_clusters; i++) {
		if (ctx->clone->active.clusters[i] == 0) {
			ctx->clone->active.clusters[i] = ctx->snapshot->active.clusters[i];
		}
	}

	/* Delete old backing bs_dev from clone (related to snapshot that will be removed) */
	ctx->clone->back_bs_dev->destroy(ctx->clone->back_bs_dev);

	/* Set/remove snapshot xattr and switch parent ID and backing bs_dev on clone... */
	if (ctx->parent_snapshot_entry != NULL) {
		/* ...to parent snapshot */
		ctx->clone->parent_id = ctx->parent_snapshot_entry->id;
		ctx->clone->back_bs_dev = ctx->snapshot->back_bs_dev;
		_spdk_blob_set_xattr(ctx->clone, BLOB_SNAPSHOT, &ctx->parent_snapshot_entry->id,
				     sizeof(spdk_blob_id),
				     true);
	} else {
		/* ...to blobid invalid and zeroes dev */
		ctx->clone->parent_id = SPDK_BLOBID_INVALID;
		ctx->clone->back_bs_dev = spdk_bs_create_zeroes_dev();
		_spdk_blob_remove_xattr(ctx->clone, BLOB_SNAPSHOT, true);
	}

	spdk_blob_sync_md(ctx->clone, _spdk_delete_snapshot_sync_clone_cpl, ctx);
}
6090 
/* I/O on the clone is now frozen.  Mark the snapshot as pending removal
 * (recording the clone id for crash recovery) and sync that xattr before
 * any destructive changes are made. */
static void
_spdk_delete_snapshot_freeze_io_cb(void *cb_arg, int bserrno)
{
	struct delete_snapshot_ctx *ctx = cb_arg;

	if (bserrno) {
		SPDK_ERRLOG("Failed to freeze I/O on clone\n");
		ctx->bserrno = bserrno;
		_spdk_delete_snapshot_cleanup_clone(ctx, 0);
		return;
	}

	/* Temporarily override md_ro flag for snapshot for MD modification */
	ctx->snapshot_md_ro = ctx->snapshot->md_ro;
	ctx->snapshot->md_ro = false;

	/* Mark blob as pending for removal for power failure safety, use clone id for recovery */
	ctx->bserrno = _spdk_blob_set_xattr(ctx->snapshot, SNAPSHOT_PENDING_REMOVAL, &ctx->clone->id,
					    sizeof(spdk_blob_id), true);
	if (ctx->bserrno != 0) {
		_spdk_delete_snapshot_cleanup_clone(ctx, 0);
		return;
	}

	spdk_blob_sync_md(ctx->snapshot, _spdk_delete_snapshot_sync_snapshot_xattr_cpl, ctx);
}
6117 
/* Open completion for the clone of the snapshot being deleted.  Locks
 * the clone against concurrent blob operations and freezes its I/O so
 * the cluster maps can be rewritten safely. */
static void
_spdk_delete_snapshot_open_clone_cb(void *cb_arg, struct spdk_blob *clone, int bserrno)
{
	struct delete_snapshot_ctx *ctx = cb_arg;

	if (bserrno) {
		SPDK_ERRLOG("Failed to open clone\n");
		ctx->bserrno = bserrno;
		_spdk_delete_snapshot_cleanup_snapshot(ctx, 0);
		return;
	}

	ctx->clone = clone;

	if (clone->locked_operation_in_progress) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress on its clone\n");
		ctx->bserrno = -EBUSY;
		spdk_blob_close(ctx->clone, _spdk_delete_snapshot_cleanup_snapshot, ctx);
		return;
	}

	clone->locked_operation_in_progress = true;

	_spdk_blob_freeze_io(clone, _spdk_delete_snapshot_freeze_io_cb, ctx);
}
6143 
/* Entry point for deleting a snapshot that has exactly one clone: look up
 * that clone (and the parent snapshot, if any) and open the clone to
 * start the update chain. */
static void
_spdk_update_clone_on_snapshot_deletion(struct spdk_blob *snapshot, struct delete_snapshot_ctx *ctx)
{
	struct spdk_blob_list *snapshot_entry = NULL;
	struct spdk_blob_list *clone_entry = NULL;
	struct spdk_blob_list *snapshot_clone_entry = NULL;

	/* Get snapshot entry for the snapshot we want to remove */
	snapshot_entry = _spdk_bs_get_snapshot_entry(snapshot->bs, snapshot->id);

	assert(snapshot_entry != NULL);

	/* Get clone of the snapshot (at this point there can be only one clone) */
	clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
	assert(snapshot_entry->clone_count == 1);
	assert(clone_entry != NULL);

	/* Get snapshot entry for parent snapshot and clone entry within that snapshot for
	 * snapshot that we are removing */
	_spdk_blob_get_snapshot_and_clone_entries(snapshot, &ctx->parent_snapshot_entry,
			&snapshot_clone_entry);

	spdk_bs_open_blob(snapshot->bs, clone_entry->id, _spdk_delete_snapshot_open_clone_cb, ctx);
}
6168 
/* Final stage of blob deletion (invoked directly for a clone-less blob,
 * or as the callback once a snapshot's clone has been updated).  Releases
 * the blob id, truncates the blob to zero clusters and persists the
 * now-empty metadata. */
static void
_spdk_bs_delete_blob_finish(void *cb_arg, struct spdk_blob *blob, int bserrno)
{
	spdk_bs_sequence_t *seq = cb_arg;
	struct spdk_blob_list *snapshot_entry = NULL;
	uint32_t page_num;

	if (bserrno) {
		SPDK_ERRLOG("Failed to remove blob\n");
		spdk_bs_sequence_finish(seq, bserrno);
		return;
	}

	/* Remove snapshot from the list */
	snapshot_entry = _spdk_bs_get_snapshot_entry(blob->bs, blob->id);
	if (snapshot_entry != NULL) {
		TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link);
		free(snapshot_entry);
	}

	/* Release the blob's id and shrink it to nothing; the persist below
	 * writes out the empty metadata, completing the on-disk delete. */
	page_num = _spdk_bs_blobid_to_page(blob->id);
	spdk_bit_array_clear(blob->bs->used_blobids, page_num);
	blob->state = SPDK_BLOB_STATE_DIRTY;
	blob->active.num_pages = 0;
	_spdk_blob_resize(blob, 0);

	_spdk_blob_persist(seq, blob, _spdk_bs_delete_persist_cpl, blob);
}
6197 
/* Decide whether a blob may be deleted right now.
 *
 * Returns 0 when deletion is allowed and sets *update_clone:
 * true means the blob is a snapshot whose single clone must be updated
 * first.  Returns -EBUSY when the blob is open elsewhere or is a
 * snapshot with more than one clone. */
static int
_spdk_bs_is_blob_deletable(struct spdk_blob *blob, bool *update_clone)
{
	struct spdk_blob_list *snapshot_entry = NULL;
	struct spdk_blob_list *clone_entry = NULL;
	struct spdk_blob *clone = NULL;
	bool has_one_clone = false;

	/* Check if this is a snapshot with clones */
	snapshot_entry = _spdk_bs_get_snapshot_entry(blob->bs, blob->id);
	if (snapshot_entry != NULL) {
		if (snapshot_entry->clone_count > 1) {
			SPDK_ERRLOG("Cannot remove snapshot with more than one clone\n");
			return -EBUSY;
		} else if (snapshot_entry->clone_count == 1) {
			has_one_clone = true;
		}
	}

	/* Check if someone has this blob open (besides this delete context):
	 * - open_ref = 1 - only this context opened blob, so it is ok to remove it
	 * - open_ref <= 2 && has_one_clone = true - clone is holding snapshot
	 *	and that is ok, because we will update it accordingly */
	if (blob->open_ref <= 2 && has_one_clone) {
		clone_entry = TAILQ_FIRST(&snapshot_entry->clones);
		assert(clone_entry != NULL);
		clone = _spdk_blob_lookup(blob->bs, clone_entry->id);

		if (blob->open_ref == 2 && clone == NULL) {
			/* Clone is closed and someone else opened this blob */
			SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
			return -EBUSY;
		}

		*update_clone = true;
		return 0;
	}

	if (blob->open_ref > 1) {
		SPDK_ERRLOG("Cannot remove snapshot because it is open\n");
		return -EBUSY;
	}

	assert(has_one_clone == false);
	*update_clone = false;
	return 0;
}
6245 
6246 static void
6247 _spdk_bs_delete_enomem_close_cpl(void *cb_arg, int bserrno)
6248 {
6249 	spdk_bs_sequence_t *seq = cb_arg;
6250 
6251 	spdk_bs_sequence_finish(seq, -ENOMEM);
6252 }
6253 
/* Open completion for the blob being deleted.  Validates that deletion is
 * possible, locks the blob, hides it from _spdk_blob_lookup() and either
 * starts the snapshot/clone update chain or removes the blob directly. */
static void
_spdk_bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno)
{
	spdk_bs_sequence_t *seq = cb_arg;
	struct delete_snapshot_ctx *ctx;
	bool update_clone = false;

	if (bserrno != 0) {
		spdk_bs_sequence_finish(seq, bserrno);
		return;
	}

	_spdk_blob_verify_md_op(blob);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		/* No context - close the blob again and report -ENOMEM. */
		spdk_blob_close(blob, _spdk_bs_delete_enomem_close_cpl, seq);
		return;
	}

	ctx->snapshot = blob;
	ctx->cb_fn = _spdk_bs_delete_blob_finish;
	ctx->cb_arg = seq;

	/* Check if blob can be removed and if it is a snapshot with clone on top of it */
	ctx->bserrno = _spdk_bs_is_blob_deletable(blob, &update_clone);
	if (ctx->bserrno) {
		spdk_blob_close(blob, _spdk_delete_blob_cleanup_finish, ctx);
		return;
	}

	if (blob->locked_operation_in_progress) {
		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot remove blob - another operation in progress\n");
		ctx->bserrno = -EBUSY;
		spdk_blob_close(blob, _spdk_delete_blob_cleanup_finish, ctx);
		return;
	}

	blob->locked_operation_in_progress = true;

	/*
	 * Remove the blob from the blob_store list now, to ensure it does not
	 *  get returned after this point by _spdk_blob_lookup().
	 */
	TAILQ_REMOVE(&blob->bs->blobs, blob, link);

	if (update_clone) {
		/* This blob is a snapshot with active clone - update clone first */
		_spdk_update_clone_on_snapshot_deletion(blob, ctx);
	} else {
		/* This blob does not have any clones - just remove it */
		_spdk_bs_blob_list_remove(blob);
		_spdk_bs_delete_blob_finish(seq, blob, 0);
		free(ctx);
	}
}
6310 
6311 void
6312 spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
6313 		    spdk_blob_op_complete cb_fn, void *cb_arg)
6314 {
6315 	struct spdk_bs_cpl	cpl;
6316 	spdk_bs_sequence_t	*seq;
6317 
6318 	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Deleting blob %lu\n", blobid);
6319 
6320 	assert(spdk_get_thread() == bs->md_thread);
6321 
6322 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6323 	cpl.u.blob_basic.cb_fn = cb_fn;
6324 	cpl.u.blob_basic.cb_arg = cb_arg;
6325 
6326 	seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
6327 	if (!seq) {
6328 		cb_fn(cb_arg, -ENOMEM);
6329 		return;
6330 	}
6331 
6332 	spdk_bs_open_blob(bs, blobid, _spdk_bs_delete_open_cpl, seq);
6333 }
6334 
6335 /* END spdk_bs_delete_blob */
6336 
6337 /* START spdk_bs_open_blob */
6338 
6339 static void
6340 _spdk_bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
6341 {
6342 	struct spdk_blob *blob = cb_arg;
6343 
6344 	if (bserrno != 0) {
6345 		_spdk_blob_free(blob);
6346 		seq->cpl.u.blob_handle.blob = NULL;
6347 		spdk_bs_sequence_finish(seq, bserrno);
6348 		return;
6349 	}
6350 
6351 	blob->open_ref++;
6352 
6353 	TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link);
6354 
6355 	spdk_bs_sequence_finish(seq, bserrno);
6356 }
6357 
/* Common open path for spdk_bs_open_blob()/spdk_bs_open_blob_ext().
 * If the blob is already open, just bump its reference count; otherwise
 * allocate an in-memory blob and load its metadata from disk.
 * opts may be NULL, in which case defaults are used. */
static void _spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
			       struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
{
	struct spdk_blob		*blob;
	struct spdk_bs_cpl		cpl;
	struct spdk_blob_open_opts	opts_default;
	spdk_bs_sequence_t		*seq;
	uint32_t			page_num;

	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Opening blob %lu\n", blobid);
	assert(spdk_get_thread() == bs->md_thread);

	/* Reject ids whose metadata page is not marked as in use. */
	page_num = _spdk_bs_blobid_to_page(blobid);
	if (spdk_bit_array_get(bs->used_blobids, page_num) == false) {
		/* Invalid blobid */
		cb_fn(cb_arg, NULL, -ENOENT);
		return;
	}

	/* Already open - just take another reference. */
	blob = _spdk_blob_lookup(bs, blobid);
	if (blob) {
		blob->open_ref++;
		cb_fn(cb_arg, blob, 0);
		return;
	}

	blob = _spdk_blob_alloc(bs, blobid);
	if (!blob) {
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	if (!opts) {
		spdk_blob_open_opts_init(&opts_default);
		opts = &opts_default;
	}

	blob->clear_method = opts->clear_method;

	cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE;
	cpl.u.blob_handle.cb_fn = cb_fn;
	cpl.u.blob_handle.cb_arg = cb_arg;
	cpl.u.blob_handle.blob = blob;

	seq = spdk_bs_sequence_start(bs->md_channel, &cpl);
	if (!seq) {
		_spdk_blob_free(blob);
		cb_fn(cb_arg, NULL, -ENOMEM);
		return;
	}

	/* Read the blob's metadata; _spdk_bs_open_blob_cpl finishes the open. */
	_spdk_blob_load(seq, blob, _spdk_bs_open_blob_cpl, blob);
}
6411 
6412 void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid,
6413 		       spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
6414 {
6415 	_spdk_bs_open_blob(bs, blobid, NULL, cb_fn, cb_arg);
6416 }
6417 
6418 void spdk_bs_open_blob_ext(struct spdk_blob_store *bs, spdk_blob_id blobid,
6419 			   struct spdk_blob_open_opts *opts, spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
6420 {
6421 	_spdk_bs_open_blob(bs, blobid, opts, cb_fn, cb_arg);
6422 }
6423 
6424 /* END spdk_bs_open_blob */
6425 
6426 /* START spdk_blob_set_read_only */
6427 int spdk_blob_set_read_only(struct spdk_blob *blob)
6428 {
6429 	_spdk_blob_verify_md_op(blob);
6430 
6431 	blob->data_ro_flags |= SPDK_BLOB_READ_ONLY;
6432 
6433 	blob->state = SPDK_BLOB_STATE_DIRTY;
6434 	return 0;
6435 }
6436 /* END spdk_blob_set_read_only */
6437 
6438 /* START spdk_blob_sync_md */
6439 
6440 static void
6441 _spdk_blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
6442 {
6443 	struct spdk_blob *blob = cb_arg;
6444 
6445 	if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) {
6446 		blob->data_ro = true;
6447 		blob->md_ro = true;
6448 	}
6449 
6450 	spdk_bs_sequence_finish(seq, bserrno);
6451 }
6452 
6453 static void
6454 _spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
6455 {
6456 	struct spdk_bs_cpl	cpl;
6457 	spdk_bs_sequence_t	*seq;
6458 
6459 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6460 	cpl.u.blob_basic.cb_fn = cb_fn;
6461 	cpl.u.blob_basic.cb_arg = cb_arg;
6462 
6463 	seq = spdk_bs_sequence_start(blob->bs->md_channel, &cpl);
6464 	if (!seq) {
6465 		cb_fn(cb_arg, -ENOMEM);
6466 		return;
6467 	}
6468 
6469 	_spdk_blob_persist(seq, blob, _spdk_blob_sync_md_cpl, blob);
6470 }
6471 
6472 void
6473 spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
6474 {
6475 	_spdk_blob_verify_md_op(blob);
6476 
6477 	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blob %lu\n", blob->id);
6478 
6479 	if (blob->md_ro) {
6480 		assert(blob->state == SPDK_BLOB_STATE_CLEAN);
6481 		cb_fn(cb_arg, 0);
6482 		return;
6483 	}
6484 
6485 	_spdk_blob_sync_md(blob, cb_fn, cb_arg);
6486 }
6487 
6488 /* END spdk_blob_sync_md */
6489 
/* Message context for inserting a newly allocated cluster into a blob's
 * metadata on the blobstore's metadata thread. */
struct spdk_blob_insert_cluster_ctx {
	struct spdk_thread	*thread;	/* originating thread to reply to */
	struct spdk_blob	*blob;
	uint32_t		cluster_num;	/* cluster index in blob */
	uint32_t		cluster;	/* cluster on disk */
	uint32_t		extent_page;	/* extent page on disk */
	int			rc;		/* result handed back to the caller */
	spdk_blob_op_complete	cb_fn;
	void			*cb_arg;
};
6500 
6501 static void
6502 _spdk_blob_insert_cluster_msg_cpl(void *arg)
6503 {
6504 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
6505 
6506 	ctx->cb_fn(ctx->cb_arg, ctx->rc);
6507 	free(ctx);
6508 }
6509 
6510 static void
6511 _spdk_blob_insert_cluster_msg_cb(void *arg, int bserrno)
6512 {
6513 	struct spdk_blob_insert_cluster_ctx *ctx = arg;
6514 
6515 	ctx->rc = bserrno;
6516 	spdk_thread_send_msg(ctx->thread, _spdk_blob_insert_cluster_msg_cpl, ctx);
6517 }
6518 
6519 static void
6520 _spdk_blob_persist_extent_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
6521 {
6522 	struct spdk_blob_md_page        *page = cb_arg;
6523 
6524 	spdk_bs_sequence_finish(seq, bserrno);
6525 	spdk_free(page);
6526 }
6527 
/* Serialize and write out the single extent page covering cluster_num,
 * directly to its already-claimed md page on disk (no full blob persist). */
static void
_spdk_blob_insert_extent(struct spdk_blob *blob, uint32_t extent, uint64_t cluster_num,
			 spdk_blob_op_complete cb_fn, void *cb_arg)
{
	spdk_bs_sequence_t		*seq;
	struct spdk_bs_cpl		cpl;
	struct spdk_blob_md_page	*page = NULL;
	uint32_t			page_count = 0;
	int				rc;

	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
	cpl.u.blob_basic.cb_fn = cb_fn;
	cpl.u.blob_basic.cb_arg = cb_arg;

	seq = spdk_bs_sequence_start(blob->bs->md_channel, &cpl);
	if (!seq) {
		cb_fn(cb_arg, -ENOMEM);
		return;
	}
	/* NOTE(review): assumes *page stays NULL when this fails, so nothing
	 * leaks on the error return - confirm in _spdk_blob_serialize_add_page. */
	rc = _spdk_blob_serialize_add_page(blob, &page, &page_count, &page);
	if (rc < 0) {
		spdk_bs_sequence_finish(seq, rc);
		return;
	}

	_spdk_blob_serialize_extent_page(blob, cluster_num, page);

	page->crc = _spdk_blob_md_page_calc_crc(page);

	/* The extent page must have been claimed in used_md_pages already. */
	assert(spdk_bit_array_get(blob->bs->used_md_pages, extent) == true);

	spdk_bs_sequence_write_dev(seq, page, _spdk_bs_md_page_to_lba(blob->bs, extent),
				   _spdk_bs_byte_to_lba(blob->bs, SPDK_BS_PAGE_SIZE),
				   _spdk_blob_persist_extent_page_cpl, page);
}
6563 
/* Runs on the md thread: record an allocated cluster in the blob's
 * cluster map and persist the change - either via a full md sync
 * (extents_rle, or a newly allocated extent page) or by rewriting just
 * the one affected extent page. */
static void
_spdk_blob_insert_cluster_msg(void *arg)
{
	struct spdk_blob_insert_cluster_ctx *ctx = arg;
	uint32_t *extent_page;

	ctx->rc = _spdk_blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster);
	if (ctx->rc != 0) {
		/* Insert failed - reply to the originating thread immediately. */
		spdk_thread_send_msg(ctx->thread, _spdk_blob_insert_cluster_msg_cpl, ctx);
		return;
	}

	if (ctx->blob->use_extent_table == false) {
		/* Extent table is not used, proceed with sync of md that will only use extents_rle. */
		ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
		_spdk_blob_sync_md(ctx->blob, _spdk_blob_insert_cluster_msg_cb, ctx);
		return;
	}

	extent_page = _spdk_bs_cluster_to_extent_page(ctx->blob, ctx->cluster_num);
	if (*extent_page == 0) {
		/* Extent page requires allocation.
		 * It was already claimed in the used_md_pages map and placed in ctx.
		 * Blob persist will take care of writing out new extent page on disk. */
		assert(ctx->extent_page != 0);
		assert(spdk_bit_array_get(ctx->blob->bs->used_md_pages, ctx->extent_page) == true);
		*extent_page = ctx->extent_page;
		ctx->blob->state = SPDK_BLOB_STATE_DIRTY;
		_spdk_blob_sync_md(ctx->blob, _spdk_blob_insert_cluster_msg_cb, ctx);
	} else {
		assert(ctx->extent_page == 0);
		/* Extent page already allocated.
		 * Every cluster allocation, requires just an update of single extent page. */
		_spdk_blob_insert_extent(ctx->blob, ctx->extent_page, ctx->cluster_num,
					 _spdk_blob_insert_cluster_msg_cb, ctx);
	}
}
6601 
6602 static void
6603 _spdk_blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num,
6604 				       uint64_t cluster, uint32_t extent_page, spdk_blob_op_complete cb_fn, void *cb_arg)
6605 {
6606 	struct spdk_blob_insert_cluster_ctx *ctx;
6607 
6608 	ctx = calloc(1, sizeof(*ctx));
6609 	if (ctx == NULL) {
6610 		cb_fn(cb_arg, -ENOMEM);
6611 		return;
6612 	}
6613 
6614 	ctx->thread = spdk_get_thread();
6615 	ctx->blob = blob;
6616 	ctx->cluster_num = cluster_num;
6617 	ctx->cluster = cluster;
6618 	ctx->extent_page = extent_page;
6619 	ctx->cb_fn = cb_fn;
6620 	ctx->cb_arg = cb_arg;
6621 
6622 	spdk_thread_send_msg(blob->bs->md_thread, _spdk_blob_insert_cluster_msg, ctx);
6623 }
6624 
6625 /* START spdk_blob_close */
6626 
/* Persist completion for spdk_blob_close().  Drops the reference taken at
 * open time and frees the in-memory blob when the last reference is
 * released. */
static void
_spdk_blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno)
{
	struct spdk_blob *blob = cb_arg;

	if (bserrno == 0) {
		blob->open_ref--;
		if (blob->open_ref == 0) {
			/*
			 * Blobs with active.num_pages == 0 are deleted blobs.
			 *  these blobs are removed from the blob_store list
			 *  when the deletion process starts - so don't try to
			 *  remove them again.
			 */
			if (blob->active.num_pages > 0) {
				TAILQ_REMOVE(&blob->bs->blobs, blob, link);
			}
			_spdk_blob_free(blob);
		}
	}

	spdk_bs_sequence_finish(seq, bserrno);
}
6650 
6651 void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg)
6652 {
6653 	struct spdk_bs_cpl	cpl;
6654 	spdk_bs_sequence_t	*seq;
6655 
6656 	_spdk_blob_verify_md_op(blob);
6657 
6658 	SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Closing blob %lu\n", blob->id);
6659 
6660 	if (blob->open_ref == 0) {
6661 		cb_fn(cb_arg, -EBADF);
6662 		return;
6663 	}
6664 
6665 	cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC;
6666 	cpl.u.blob_basic.cb_fn = cb_fn;
6667 	cpl.u.blob_basic.cb_arg = cb_arg;
6668 
6669 	seq = spdk_bs_sequence_start(blob->bs->md_channel, &cpl);
6670 	if (!seq) {
6671 		cb_fn(cb_arg, -ENOMEM);
6672 		return;
6673 	}
6674 
6675 	/* Sync metadata */
6676 	_spdk_blob_persist(seq, blob, _spdk_blob_close_cpl, blob);
6677 }
6678 
6679 /* END spdk_blob_close */
6680 
/* Obtain an I/O channel for this blobstore on the calling thread. */
struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs)
{
	/* The blobstore pointer itself is the io_device handle. */
	return spdk_get_io_channel(bs);
}
6685 
/* Release an I/O channel previously obtained via spdk_bs_alloc_io_channel. */
void spdk_bs_free_io_channel(struct spdk_io_channel *channel)
{
	spdk_put_io_channel(channel);
}
6690 
6691 void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel,
6692 			uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
6693 {
6694 	_spdk_blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
6695 				     SPDK_BLOB_UNMAP);
6696 }
6697 
6698 void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel,
6699 			       uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg)
6700 {
6701 	_spdk_blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg,
6702 				     SPDK_BLOB_WRITE_ZEROES);
6703 }
6704 
6705 void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel,
6706 			void *payload, uint64_t offset, uint64_t length,
6707 			spdk_blob_op_complete cb_fn, void *cb_arg)
6708 {
6709 	_spdk_blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
6710 				     SPDK_BLOB_WRITE);
6711 }
6712 
6713 void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel,
6714 		       void *payload, uint64_t offset, uint64_t length,
6715 		       spdk_blob_op_complete cb_fn, void *cb_arg)
6716 {
6717 	_spdk_blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg,
6718 				     SPDK_BLOB_READ);
6719 }
6720 
6721 void spdk_blob_io_writev(struct spdk_blob *blob, struct spdk_io_channel *channel,
6722 			 struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
6723 			 spdk_blob_op_complete cb_fn, void *cb_arg)
6724 {
6725 	_spdk_blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false);
6726 }
6727 
6728 void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel,
6729 			struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length,
6730 			spdk_blob_op_complete cb_fn, void *cb_arg)
6731 {
6732 	_spdk_blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true);
6733 }
6734 
/* State for iterating over all blobs in a blobstore
 * (spdk_bs_iter_first/spdk_bs_iter_next). */
struct spdk_bs_iter_ctx {
	int64_t page_num;	/* current position in used_blobids; -1 before the first blob */
	struct spdk_blob_store *bs;

	spdk_blob_op_with_handle_complete cb_fn;	/* invoked once per opened blob */
	void *cb_arg;
};
6742 
/* Shared iterator step.  On a successful open, hand the blob to the user.
 * Otherwise (including the sentinel error used to start or advance the
 * iteration) move to the next allocated blob id and open it, or report
 * -ENOENT when the id space is exhausted. */
static void
_spdk_bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno)
{
	struct spdk_bs_iter_ctx *ctx = cb_arg;
	struct spdk_blob_store *bs = ctx->bs;
	spdk_blob_id id;

	if (bserrno == 0) {
		ctx->cb_fn(ctx->cb_arg, _blob, bserrno);
		free(ctx);
		return;
	}

	/* Advance to the next blob id marked in use. */
	ctx->page_num++;
	ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num);
	if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) {
		/* No more blobs - end of iteration. */
		ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT);
		free(ctx);
		return;
	}

	id = _spdk_bs_page_to_blobid(ctx->page_num);

	spdk_bs_open_blob(bs, id, _spdk_bs_iter_cpl, ctx);
}
6768 
6769 void
6770 spdk_bs_iter_first(struct spdk_blob_store *bs,
6771 		   spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
6772 {
6773 	struct spdk_bs_iter_ctx *ctx;
6774 
6775 	ctx = calloc(1, sizeof(*ctx));
6776 	if (!ctx) {
6777 		cb_fn(cb_arg, NULL, -ENOMEM);
6778 		return;
6779 	}
6780 
6781 	ctx->page_num = -1;
6782 	ctx->bs = bs;
6783 	ctx->cb_fn = cb_fn;
6784 	ctx->cb_arg = cb_arg;
6785 
6786 	_spdk_bs_iter_cpl(ctx, NULL, -1);
6787 }
6788 
6789 static void
6790 _spdk_bs_iter_close_cpl(void *cb_arg, int bserrno)
6791 {
6792 	struct spdk_bs_iter_ctx *ctx = cb_arg;
6793 
6794 	_spdk_bs_iter_cpl(ctx, NULL, -1);
6795 }
6796 
6797 void
6798 spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob,
6799 		  spdk_blob_op_with_handle_complete cb_fn, void *cb_arg)
6800 {
6801 	struct spdk_bs_iter_ctx *ctx;
6802 
6803 	assert(blob != NULL);
6804 
6805 	ctx = calloc(1, sizeof(*ctx));
6806 	if (!ctx) {
6807 		cb_fn(cb_arg, NULL, -ENOMEM);
6808 		return;
6809 	}
6810 
6811 	ctx->page_num = _spdk_bs_blobid_to_page(blob->id);
6812 	ctx->bs = bs;
6813 	ctx->cb_fn = cb_fn;
6814 	ctx->cb_arg = cb_arg;
6815 
6816 	/* Close the existing blob */
6817 	spdk_blob_close(blob, _spdk_bs_iter_close_cpl, ctx);
6818 }
6819 
6820 static int
6821 _spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
6822 		     uint16_t value_len, bool internal)
6823 {
6824 	struct spdk_xattr_tailq *xattrs;
6825 	struct spdk_xattr	*xattr;
6826 	size_t			desc_size;
6827 
6828 	_spdk_blob_verify_md_op(blob);
6829 
6830 	if (blob->md_ro) {
6831 		return -EPERM;
6832 	}
6833 
6834 	desc_size = sizeof(struct spdk_blob_md_descriptor_xattr) + strlen(name) + value_len;
6835 	if (desc_size > SPDK_BS_MAX_DESC_SIZE) {
6836 		SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Xattr '%s' of size %ld does not fix into single page %ld\n", name,
6837 			      desc_size, SPDK_BS_MAX_DESC_SIZE);
6838 		return -ENOMEM;
6839 	}
6840 
6841 	if (internal) {
6842 		xattrs = &blob->xattrs_internal;
6843 		blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR;
6844 	} else {
6845 		xattrs = &blob->xattrs;
6846 	}
6847 
6848 	TAILQ_FOREACH(xattr, xattrs, link) {
6849 		if (!strcmp(name, xattr->name)) {
6850 			free(xattr->value);
6851 			xattr->value_len = value_len;
6852 			xattr->value = malloc(value_len);
6853 			memcpy(xattr->value, value, value_len);
6854 
6855 			blob->state = SPDK_BLOB_STATE_DIRTY;
6856 
6857 			return 0;
6858 		}
6859 	}
6860 
6861 	xattr = calloc(1, sizeof(*xattr));
6862 	if (!xattr) {
6863 		return -ENOMEM;
6864 	}
6865 	xattr->name = strdup(name);
6866 	xattr->value_len = value_len;
6867 	xattr->value = malloc(value_len);
6868 	memcpy(xattr->value, value, value_len);
6869 	TAILQ_INSERT_TAIL(xattrs, xattr, link);
6870 
6871 	blob->state = SPDK_BLOB_STATE_DIRTY;
6872 
6873 	return 0;
6874 }
6875 
6876 int
6877 spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value,
6878 		    uint16_t value_len)
6879 {
6880 	return _spdk_blob_set_xattr(blob, name, value, value_len, false);
6881 }
6882 
6883 static int
6884 _spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal)
6885 {
6886 	struct spdk_xattr_tailq *xattrs;
6887 	struct spdk_xattr	*xattr;
6888 
6889 	_spdk_blob_verify_md_op(blob);
6890 
6891 	if (blob->md_ro) {
6892 		return -EPERM;
6893 	}
6894 	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
6895 
6896 	TAILQ_FOREACH(xattr, xattrs, link) {
6897 		if (!strcmp(name, xattr->name)) {
6898 			TAILQ_REMOVE(xattrs, xattr, link);
6899 			free(xattr->value);
6900 			free(xattr->name);
6901 			free(xattr);
6902 
6903 			if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) {
6904 				blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR;
6905 			}
6906 			blob->state = SPDK_BLOB_STATE_DIRTY;
6907 
6908 			return 0;
6909 		}
6910 	}
6911 
6912 	return -ENOENT;
6913 }
6914 
6915 int
6916 spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name)
6917 {
6918 	return _spdk_blob_remove_xattr(blob, name, false);
6919 }
6920 
6921 static int
6922 _spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
6923 			   const void **value, size_t *value_len, bool internal)
6924 {
6925 	struct spdk_xattr	*xattr;
6926 	struct spdk_xattr_tailq *xattrs;
6927 
6928 	xattrs = internal ? &blob->xattrs_internal : &blob->xattrs;
6929 
6930 	TAILQ_FOREACH(xattr, xattrs, link) {
6931 		if (!strcmp(name, xattr->name)) {
6932 			*value = xattr->value;
6933 			*value_len = xattr->value_len;
6934 			return 0;
6935 		}
6936 	}
6937 	return -ENOENT;
6938 }
6939 
6940 int
6941 spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name,
6942 			  const void **value, size_t *value_len)
6943 {
6944 	_spdk_blob_verify_md_op(blob);
6945 
6946 	return _spdk_blob_get_xattr_value(blob, name, value, value_len, false);
6947 }
6948 
/* Snapshot of a blob's xattr names, returned by spdk_blob_get_xattr_names().
 * The name pointers alias the blob's own xattr storage; the caller frees
 * only the container, via spdk_xattr_names_free(). */
struct spdk_xattr_names {
	uint32_t	count;		/* number of entries in names[] */
	/* C99 flexible array member (the [0] zero-length form is a GCC
	 * extension); sized at allocation time in _spdk_blob_get_xattr_names. */
	const char	*names[];
};
6953 
6954 static int
6955 _spdk_blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names)
6956 {
6957 	struct spdk_xattr	*xattr;
6958 	int			count = 0;
6959 
6960 	TAILQ_FOREACH(xattr, xattrs, link) {
6961 		count++;
6962 	}
6963 
6964 	*names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *));
6965 	if (*names == NULL) {
6966 		return -ENOMEM;
6967 	}
6968 
6969 	TAILQ_FOREACH(xattr, xattrs, link) {
6970 		(*names)->names[(*names)->count++] = xattr->name;
6971 	}
6972 
6973 	return 0;
6974 }
6975 
6976 int
6977 spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names)
6978 {
6979 	_spdk_blob_verify_md_op(blob);
6980 
6981 	return _spdk_blob_get_xattr_names(&blob->xattrs, names);
6982 }
6983 
6984 uint32_t
6985 spdk_xattr_names_get_count(struct spdk_xattr_names *names)
6986 {
6987 	assert(names != NULL);
6988 
6989 	return names->count;
6990 }
6991 
6992 const char *
6993 spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index)
6994 {
6995 	if (index >= names->count) {
6996 		return NULL;
6997 	}
6998 
6999 	return names->names[index];
7000 }
7001 
void
spdk_xattr_names_free(struct spdk_xattr_names *names)
{
	/* The container is a single allocation; the name pointers inside it
	 * are not owned here.  free(NULL) is a no-op, so NULL is safe. */
	free(names);
}
7007 
7008 struct spdk_bs_type
7009 spdk_bs_get_bstype(struct spdk_blob_store *bs)
7010 {
7011 	return bs->bstype;
7012 }
7013 
7014 void
7015 spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype)
7016 {
7017 	memcpy(&bs->bstype, &bstype, sizeof(bstype));
7018 }
7019 
7020 bool
7021 spdk_blob_is_read_only(struct spdk_blob *blob)
7022 {
7023 	assert(blob != NULL);
7024 	return (blob->data_ro || blob->md_ro);
7025 }
7026 
7027 bool
7028 spdk_blob_is_snapshot(struct spdk_blob *blob)
7029 {
7030 	struct spdk_blob_list *snapshot_entry;
7031 
7032 	assert(blob != NULL);
7033 
7034 	snapshot_entry = _spdk_bs_get_snapshot_entry(blob->bs, blob->id);
7035 	if (snapshot_entry == NULL) {
7036 		return false;
7037 	}
7038 
7039 	return true;
7040 }
7041 
7042 bool
7043 spdk_blob_is_clone(struct spdk_blob *blob)
7044 {
7045 	assert(blob != NULL);
7046 
7047 	if (blob->parent_id != SPDK_BLOBID_INVALID) {
7048 		assert(spdk_blob_is_thin_provisioned(blob));
7049 		return true;
7050 	}
7051 
7052 	return false;
7053 }
7054 
7055 bool
7056 spdk_blob_is_thin_provisioned(struct spdk_blob *blob)
7057 {
7058 	assert(blob != NULL);
7059 	return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV);
7060 }
7061 
7062 static void
7063 _spdk_blob_update_clear_method(struct spdk_blob *blob)
7064 {
7065 	enum blob_clear_method stored_cm;
7066 
7067 	assert(blob != NULL);
7068 
7069 	/* If BLOB_CLEAR_WITH_DEFAULT was passed in, use the setting stored
7070 	 * in metadata previously.  If something other than the default was
7071 	 * specified, ignore stored value and used what was passed in.
7072 	 */
7073 	stored_cm = ((blob->md_ro_flags & SPDK_BLOB_CLEAR_METHOD) >> SPDK_BLOB_CLEAR_METHOD_SHIFT);
7074 
7075 	if (blob->clear_method == BLOB_CLEAR_WITH_DEFAULT) {
7076 		blob->clear_method = stored_cm;
7077 	} else if (blob->clear_method != stored_cm) {
7078 		SPDK_WARNLOG("Using passed in clear method 0x%x instead of stored value of 0x%x\n",
7079 			     blob->clear_method, stored_cm);
7080 	}
7081 }
7082 
7083 spdk_blob_id
7084 spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id)
7085 {
7086 	struct spdk_blob_list *snapshot_entry = NULL;
7087 	struct spdk_blob_list *clone_entry = NULL;
7088 
7089 	TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) {
7090 		TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
7091 			if (clone_entry->id == blob_id) {
7092 				return snapshot_entry->id;
7093 			}
7094 		}
7095 	}
7096 
7097 	return SPDK_BLOBID_INVALID;
7098 }
7099 
7100 int
7101 spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids,
7102 		     size_t *count)
7103 {
7104 	struct spdk_blob_list *snapshot_entry, *clone_entry;
7105 	size_t n;
7106 
7107 	snapshot_entry = _spdk_bs_get_snapshot_entry(bs, blobid);
7108 	if (snapshot_entry == NULL) {
7109 		*count = 0;
7110 		return 0;
7111 	}
7112 
7113 	if (ids == NULL || *count < snapshot_entry->clone_count) {
7114 		*count = snapshot_entry->clone_count;
7115 		return -ENOMEM;
7116 	}
7117 	*count = snapshot_entry->clone_count;
7118 
7119 	n = 0;
7120 	TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) {
7121 		ids[n++] = clone_entry->id;
7122 	}
7123 
7124 	return 0;
7125 }
7126 
/* Register this file's debug log component under the trace-flag name "blob". */
SPDK_LOG_REGISTER_COMPONENT("blob", SPDK_LOG_BLOB)
7128