xref: /spdk/module/bdev/zone_block/vbdev_zone_block.c (revision a6dbe3721eb3b5990707fc3e378c95e505dd8ab5)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
4  *   All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "vbdev_zone_block.h"
10 
11 #include "spdk/config.h"
12 #include "spdk/nvme.h"
13 #include "spdk/bdev_zone.h"
14 
15 #include "spdk/log.h"
16 
17 static int zone_block_init(void);
18 static int zone_block_get_ctx_size(void);
19 static void zone_block_finish(void);
20 static int zone_block_config_json(struct spdk_json_write_ctx *w);
21 static void zone_block_examine(struct spdk_bdev *bdev);
22 
/* Module hooks registered with the bdev layer: init/fini lifecycle,
 * config save, examine-on-hotplug, and per-IO context sizing.
 */
static struct spdk_bdev_module bdev_zoned_if = {
	.name = "bdev_zoned_block",
	.module_init = zone_block_init,
	.module_fini = zone_block_finish,
	.config_json = zone_block_config_json,
	.examine_config = zone_block_examine,
	.get_ctx_size = zone_block_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
33 
34 /* List of block vbdev names and their base bdevs via configuration file.
35  * Used so we can parse the conf once at init and use this list in examine().
36  */
struct bdev_zone_block_config {
	char					*vbdev_name; /* name of the zoned vbdev to create */
	char					*bdev_name; /* name of the base bdev to stack on */
	uint64_t				zone_capacity; /* usable blocks per zone */
	uint64_t				optimal_open_zones; /* value reported to the bdev layer */
	TAILQ_ENTRY(bdev_zone_block_config)	link;
};
/* Global list of pending/active configurations, consumed by examine(). */
static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
45 
/* Per-zone state: the externally visible zone info plus a spinlock that
 * serializes state and write-pointer updates across IO channels.
 */
struct block_zone {
	struct spdk_bdev_zone_info zone_info;
	pthread_spinlock_t lock;
};
50 
/* List of block vbdevs and associated info for each. */
struct bdev_zone_block {
	struct spdk_bdev		bdev;    /* the block zoned bdev */
	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
	struct block_zone		*zones; /* array of zones */
	uint64_t			num_zones; /* number of zones */
	uint64_t			zone_capacity; /* zone capacity */
	uint64_t                        zone_shift; /* log2 of zone_size; enables LBA->zone lookup by shift */
	TAILQ_ENTRY(bdev_zone_block)	link;
	struct spdk_thread		*thread; /* thread where base device is opened */
};
/* Global list of active zoned vbdev nodes. */
static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
63 
/* Per-channel context: just a channel to the base bdev. */
struct zone_block_io_channel {
	struct spdk_io_channel	*base_ch; /* IO channel of base device */
};

/* Per-IO context reserved via get_ctx_size(). */
struct zone_block_io {
	/* vbdev to which IO was issued */
	struct bdev_zone_block *bdev_zone_block;
};
72 
/* Module init hook: this module needs no global setup. */
static int
zone_block_init(void)
{
	return 0;
}
78 
79 static void
80 zone_block_remove_config(struct bdev_zone_block_config *name)
81 {
82 	TAILQ_REMOVE(&g_bdev_configs, name, link);
83 	free(name->bdev_name);
84 	free(name->vbdev_name);
85 	free(name);
86 }
87 
88 static void
89 zone_block_finish(void)
90 {
91 	struct bdev_zone_block_config *name;
92 
93 	while ((name = TAILQ_FIRST(&g_bdev_configs))) {
94 		zone_block_remove_config(name);
95 	}
96 }
97 
98 static int
99 zone_block_get_ctx_size(void)
100 {
101 	return sizeof(struct zone_block_io);
102 }
103 
/* Emit, for "save_config", one bdev_zone_block_create RPC call per active
 * vbdev node so the configuration can be replayed at startup.
 */
static int
zone_block_config_json(struct spdk_json_write_ctx *w)
{
	struct bdev_zone_block *bdev_node;
	struct spdk_bdev *base_bdev = NULL;

	TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
		base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
		spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
		spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
		spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
		spdk_json_write_object_end(w); /* params */
		spdk_json_write_object_end(w); /* method object */
	}

	return 0;
}
125 
126 /* Callback for unregistering the IO device. */
127 static void
128 _device_unregister_cb(void *io_device)
129 {
130 	struct bdev_zone_block *bdev_node = io_device;
131 	uint64_t i;
132 
133 	free(bdev_node->bdev.name);
134 	for (i = 0; i < bdev_node->num_zones; i++) {
135 		pthread_spin_destroy(&bdev_node->zones[i].lock);
136 	}
137 	free(bdev_node->zones);
138 	free(bdev_node);
139 }
140 
/* Thread-message callback: close the base bdev descriptor on the thread
 * that originally opened it.
 */
static void
_zone_block_destruct(void *ctx)
{
	spdk_bdev_close((struct spdk_bdev_desc *)ctx);
}
148 
/* Bdev-layer destruct callback: tear down one zoned vbdev.
 * Ordering matters: unlink from the global list and release the claim
 * first, close the descriptor on its opening thread, and unregister the
 * io_device last (node memory is freed in _device_unregister_cb).
 */
static int
zone_block_destruct(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));

	/* Close the underlying bdev on its same opened thread. */
	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
	} else {
		spdk_bdev_close(bdev_node->base_desc);
	}

	/* Unregister the io_device; the node is freed in the callback. */
	spdk_io_device_unregister(bdev_node, _device_unregister_cb);

	return 0;
}
171 
172 static struct block_zone *
173 zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
174 {
175 	size_t index = lba >> bdev_node->zone_shift;
176 
177 	if (index >= bdev_node->num_zones) {
178 		return NULL;
179 	}
180 
181 	return &bdev_node->zones[index];
182 }
183 
184 static struct block_zone *
185 zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
186 {
187 	struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
188 
189 	if (zone && zone->zone_info.zone_id == start_lba) {
190 		return zone;
191 	} else {
192 		return NULL;
193 	}
194 }
195 
196 static int
197 zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
198 {
199 	struct block_zone *zone;
200 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
201 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
202 	size_t i;
203 
204 	/* User can request info for more zones than exist, need to check both internal and user
205 	 * boundaries
206 	 */
207 	for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
208 		zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
209 		if (!zone) {
210 			return -EINVAL;
211 		}
212 		memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
213 	}
214 
215 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
216 	return 0;
217 }
218 
219 static int
220 zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
221 {
222 	pthread_spin_lock(&zone->lock);
223 
224 	switch (zone->zone_info.state) {
225 	case SPDK_BDEV_ZONE_STATE_EMPTY:
226 	case SPDK_BDEV_ZONE_STATE_OPEN:
227 	case SPDK_BDEV_ZONE_STATE_CLOSED:
228 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
229 		pthread_spin_unlock(&zone->lock);
230 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
231 		return 0;
232 	default:
233 		pthread_spin_unlock(&zone->lock);
234 		return -EINVAL;
235 	}
236 }
237 
238 static void
239 _zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
240 {
241 	struct spdk_bdev_io *orig_io = cb_arg;
242 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
243 
244 	/* Complete the original IO and then free the one that we created here
245 	 * as a result of issuing an IO via submit_request.
246 	 */
247 	spdk_bdev_io_complete(orig_io, status);
248 	spdk_bdev_free_io(bdev_io);
249 }
250 
/* Reset a zone: rewind the write pointer to the zone start and mark it
 * EMPTY, then best-effort unmap the zone's blocks on the base bdev.
 */
static int
zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
		/* Already empty: resetting is a no-op success. */
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_FULL:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
		zone->zone_info.write_pointer = zone->zone_info.zone_id;
		pthread_spin_unlock(&zone->lock);

		/* The unmap isn't necessary, so if the base bdev doesn't support it, we're done */
		if (!spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(bdev_node->base_desc),
						 SPDK_BDEV_IO_TYPE_UNMAP)) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		/* Unmap the zone's capacity on the base bdev; the original IO
		 * completes in _zone_block_complete_unmap.
		 */
		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
					      zone->zone_info.zone_id, zone->zone_info.capacity,
					      _zone_block_complete_unmap, bdev_io);
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}
284 
285 static int
286 zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
287 {
288 	pthread_spin_lock(&zone->lock);
289 
290 	switch (zone->zone_info.state) {
291 	case SPDK_BDEV_ZONE_STATE_OPEN:
292 	case SPDK_BDEV_ZONE_STATE_CLOSED:
293 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
294 		pthread_spin_unlock(&zone->lock);
295 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
296 		return 0;
297 	default:
298 		pthread_spin_unlock(&zone->lock);
299 		return -EINVAL;
300 	}
301 }
302 
303 static int
304 zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
305 {
306 	pthread_spin_lock(&zone->lock);
307 
308 	zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
309 	zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
310 
311 	pthread_spin_unlock(&zone->lock);
312 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
313 	return 0;
314 }
315 
316 static int
317 zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
318 			   struct spdk_bdev_io *bdev_io)
319 {
320 	struct block_zone *zone;
321 
322 	zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
323 	if (!zone) {
324 		return -EINVAL;
325 	}
326 
327 	switch (bdev_io->u.zone_mgmt.zone_action) {
328 	case SPDK_BDEV_ZONE_RESET:
329 		return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
330 	case SPDK_BDEV_ZONE_OPEN:
331 		return zone_block_open_zone(zone, bdev_io);
332 	case SPDK_BDEV_ZONE_CLOSE:
333 		return zone_block_close_zone(zone, bdev_io);
334 	case SPDK_BDEV_ZONE_FINISH:
335 		return zone_block_finish_zone(zone, bdev_io);
336 	default:
337 		return -EINVAL;
338 	}
339 }
340 
341 static void
342 _zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
343 {
344 	struct spdk_bdev_io *orig_io = cb_arg;
345 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
346 
347 	if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
348 		orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
349 	}
350 
351 	/* Complete the original IO and then free the one that we created here
352 	 * as a result of issuing an IO via submit_request.
353 	 */
354 	spdk_bdev_io_complete(orig_io, status);
355 	spdk_bdev_free_io(bdev_io);
356 }
357 
/* Handle WRITE and ZONE_APPEND: validate the target zone and write
 * pointer under the zone lock, advance the write pointer, then forward
 * the data to the base bdev at the resolved LBA.
 */
static int
zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		 struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	uint64_t num_blocks_left, wp;
	int rc = 0;
	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;

	/* Appends address the zone by its start LBA; plain writes may name
	 * any LBA but it must equal the zone's current write pointer.
	 */
	if (is_append) {
		zone = zone_block_get_zone_by_slba(bdev_node, lba);
	} else {
		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	}
	if (!zone) {
		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_EMPTY:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		/* Writing implicitly transitions EMPTY/CLOSED zones to OPEN. */
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
		break;
	default:
		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
		rc = -EINVAL;
		goto write_fail;
	}

	wp = zone->zone_info.write_pointer;
	if (is_append) {
		/* Appends always land at the current write pointer. */
		lba = wp;
	} else {
		if (lba != wp) {
			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n",
				    lba, wp);
			rc = -EINVAL;
			goto write_fail;
		}
	}

	/* The write must fit within the zone's remaining capacity. */
	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
	if (len > num_blocks_left) {
		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64
			    ")\n", lba, len, wp);
		rc = -EINVAL;
		goto write_fail;
	}

	/* Advance the write pointer before submitting to the base bdev.
	 * NOTE(review): if the submission below fails (e.g. -ENOMEM) the
	 * pointer stays advanced — presumably acceptable for this emulation,
	 * but worth confirming against the bdev layer's retry semantics.
	 */
	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
	}
	pthread_spin_unlock(&zone->lock);

	rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     bdev_io->u.bdev.md_buf,
					     lba, bdev_io->u.bdev.num_blocks,
					     _zone_block_complete_write, bdev_io);

	return rc;

write_fail:
	pthread_spin_unlock(&zone->lock);
	return rc;
}
432 
433 static void
434 _zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
435 {
436 	struct spdk_bdev_io *orig_io = cb_arg;
437 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
438 
439 	/* Complete the original IO and then free the one that we created here
440 	 * as a result of issuing an IO via submit_request.
441 	 */
442 	spdk_bdev_io_complete(orig_io, status);
443 	spdk_bdev_free_io(bdev_io);
444 }
445 
446 static int
447 zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
448 		struct spdk_bdev_io *bdev_io)
449 {
450 	struct block_zone *zone;
451 	uint64_t len = bdev_io->u.bdev.num_blocks;
452 	uint64_t lba = bdev_io->u.bdev.offset_blocks;
453 	int rc;
454 
455 	zone = zone_block_get_zone_containing_lba(bdev_node, lba);
456 	if (!zone) {
457 		SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba);
458 		return -EINVAL;
459 	}
460 
461 	if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
462 		SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len);
463 		return -EINVAL;
464 	}
465 
466 	rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
467 					    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
468 					    bdev_io->u.bdev.md_buf,
469 					    lba, len,
470 					    _zone_block_complete_read, bdev_io);
471 
472 	return rc;
473 }
474 
/* Main IO dispatch entry point for the vbdev. Routes each IO type to its
 * handler; a nonzero handler return completes the IO as failed, with
 * -ENOMEM reported as NOMEM so the bdev layer can retry the IO.
 */
static void
zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
	struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
	int rc = 0;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = zone_block_get_zone_info(bdev_node, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = zone_block_write(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		rc = zone_block_read(bdev_node, dev_ch, bdev_io);
		break;
	default:
		SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type);
		rc = -ENOTSUP;
		break;
	}

	if (rc != 0) {
		if (rc == -ENOMEM) {
			SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}
512 
513 static bool
514 zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
515 {
516 	switch (io_type) {
517 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
518 	case SPDK_BDEV_IO_TYPE_WRITE:
519 	case SPDK_BDEV_IO_TYPE_READ:
520 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
521 		return true;
522 	default:
523 		return false;
524 	}
525 }
526 
/* Return an IO channel for this vbdev; the node pointer itself is the
 * registered io_device key.
 */
static struct spdk_io_channel *
zone_block_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(ctx);
}
534 
/* Dump this vbdev's properties under a "zoned_block" object for
 * get_bdevs-style introspection.
 */
static int
zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
	struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);

	spdk_json_write_name(w, "zoned_block");
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
	spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
	spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
	spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
	spdk_json_write_object_end(w);

	return 0;
}
551 
/* When we register our vbdev this is how we specify our entry points.
 * These are the callbacks the bdev layer invokes on this vbdev.
 */
static const struct spdk_bdev_fn_table zone_block_fn_table = {
	.destruct		= zone_block_destruct,
	.submit_request		= zone_block_submit_request,
	.io_type_supported	= zone_block_io_type_supported,
	.get_io_channel		= zone_block_get_io_channel,
	.dump_info_json		= zone_block_dump_info_json,
};
560 
561 static void
562 zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
563 {
564 	struct bdev_zone_block *bdev_node, *tmp;
565 
566 	TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
567 		if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
568 			spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
569 		}
570 	}
571 }
572 
573 static void
574 zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
575 			      void *event_ctx)
576 {
577 	switch (type) {
578 	case SPDK_BDEV_EVENT_REMOVE:
579 		zone_block_base_bdev_hotremove_cb(bdev);
580 		break;
581 	default:
582 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
583 		break;
584 	}
585 }
586 
587 static int
588 _zone_block_ch_create_cb(void *io_device, void *ctx_buf)
589 {
590 	struct zone_block_io_channel *bdev_ch = ctx_buf;
591 	struct bdev_zone_block *bdev_node = io_device;
592 
593 	bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
594 	if (!bdev_ch->base_ch) {
595 		return -ENOMEM;
596 	}
597 
598 	return 0;
599 }
600 
601 static void
602 _zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
603 {
604 	struct zone_block_io_channel *bdev_ch = ctx_buf;
605 
606 	spdk_put_io_channel(bdev_ch->base_ch);
607 }
608 
609 static int
610 zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
611 		       uint64_t optimal_open_zones)
612 {
613 	struct bdev_zone_block_config *name;
614 
615 	TAILQ_FOREACH(name, &g_bdev_configs, link) {
616 		if (strcmp(vbdev_name, name->vbdev_name) == 0) {
617 			SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
618 			return -EEXIST;
619 		}
620 		if (strcmp(bdev_name, name->bdev_name) == 0) {
621 			SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
622 			return -EEXIST;
623 		}
624 	}
625 
626 	name = calloc(1, sizeof(*name));
627 	if (!name) {
628 		SPDK_ERRLOG("could not allocate bdev_names\n");
629 		return -ENOMEM;
630 	}
631 
632 	name->bdev_name = strdup(bdev_name);
633 	if (!name->bdev_name) {
634 		SPDK_ERRLOG("could not allocate name->bdev_name\n");
635 		free(name);
636 		return -ENOMEM;
637 	}
638 
639 	name->vbdev_name = strdup(vbdev_name);
640 	if (!name->vbdev_name) {
641 		SPDK_ERRLOG("could not allocate name->vbdev_name\n");
642 		free(name->bdev_name);
643 		free(name);
644 		return -ENOMEM;
645 	}
646 
647 	name->zone_capacity = zone_capacity;
648 	name->optimal_open_zones = optimal_open_zones;
649 
650 	TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
651 
652 	return 0;
653 }
654 
655 static int
656 zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
657 {
658 	size_t i;
659 	struct block_zone *zone;
660 	int rc = 0;
661 
662 	for (i = 0; i < bdev_node->num_zones; i++) {
663 		zone = &bdev_node->zones[i];
664 		zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
665 		zone->zone_info.capacity = bdev_node->zone_capacity;
666 		zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
667 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
668 		zone->zone_info.type = SPDK_BDEV_ZONE_TYPE_SEQWR;
669 		if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
670 			SPDK_ERRLOG("pthread_spin_init() failed\n");
671 			rc = -ENOMEM;
672 			break;
673 		}
674 	}
675 
676 	if (rc) {
677 		for (; i > 0; i--) {
678 			pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
679 		}
680 	}
681 
682 	return rc;
683 }
684 
/* Create and register a zoned vbdev on top of base_bdev_name if a saved
 * config entry matches it. Returns 0 when nothing matches or on success,
 * -ENODEV when the base bdev does not exist yet (the config is kept so a
 * later examine() can retry), or a negative errno on failure (the
 * matching config entry is then removed).
 */
static int
zone_block_register(const char *base_bdev_name)
{
	struct spdk_bdev_desc *base_desc;
	struct spdk_bdev *base_bdev;
	struct bdev_zone_block_config *name, *tmp;
	struct bdev_zone_block *bdev_node;
	uint64_t zone_size;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the bdev_node & bdev accordingly.
	 */
	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
		if (strcmp(name->bdev_name, base_bdev_name) != 0) {
			continue;
		}

		rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb,
					NULL, &base_desc);
		if (rc == -ENODEV) {
			/* Not present yet: keep the config so examine() can retry later. */
			return -ENODEV;
		} else if (rc) {
			SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name);
			goto free_config;
		}

		base_bdev = spdk_bdev_desc_get_bdev(base_desc);

		/* Stacking zone emulation on an already-zoned bdev is not supported. */
		if (spdk_bdev_is_zoned(base_bdev)) {
			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name);
			rc = -EEXIST;
			goto zone_exist;
		}

		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
		if (!bdev_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node\n");
			goto zone_exist;
		}

		bdev_node->base_desc = base_desc;

		/* The base bdev that we're attaching to. */
		bdev_node->bdev.name = strdup(name->vbdev_name);
		if (!bdev_node->bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node name\n");
			goto strdup_failed;
		}

		/* Zone size is the capacity rounded up to a power of two so that
		 * LBA -> zone lookups can use a shift instead of a division.
		 */
		zone_size = spdk_align64pow2(name->zone_capacity);
		if (zone_size == 0) {
			rc = -EINVAL;
			SPDK_ERRLOG("invalid zone size\n");
			goto roundup_failed;
		}

		bdev_node->zone_shift = spdk_u64log2(zone_size);
		bdev_node->num_zones = base_bdev->blockcnt / zone_size;

		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
		if (!bdev_node->zones) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate zones\n");
			goto calloc_failed;
		}

		bdev_node->bdev.product_name = "zone_block";

		/* Copy some properties from the underlying base bdev. */
		bdev_node->bdev.write_cache = base_bdev->write_cache;
		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;

		bdev_node->bdev.blocklen = base_bdev->blocklen;
		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;

		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
			SPDK_DEBUGLOG(vbdev_zone_block,
				      "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n",
				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
		}

		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;

		/* DIF/metadata settings are passed through unchanged. */
		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
		bdev_node->bdev.md_len = base_bdev->md_len;
		bdev_node->bdev.dif_type = base_bdev->dif_type;
		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;

		bdev_node->bdev.zoned = true;
		bdev_node->bdev.ctxt = bdev_node;
		bdev_node->bdev.fn_table = &zone_block_fn_table;
		bdev_node->bdev.module = &bdev_zoned_if;

		/* bdev specific info */
		bdev_node->bdev.zone_size = zone_size;

		bdev_node->zone_capacity = name->zone_capacity;
		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
		bdev_node->bdev.max_open_zones = 0;
		rc = zone_block_init_zone_info(bdev_node);
		if (rc) {
			SPDK_ERRLOG("could not init zone info\n");
			goto zone_info_failed;
		}

		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);

		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
					sizeof(struct zone_block_io_channel),
					name->vbdev_name);

		/* Save the thread where the base device is opened */
		bdev_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name);
			goto claim_failed;
		}

		rc = spdk_bdev_register(&bdev_node->bdev);
		if (rc) {
			SPDK_ERRLOG("could not register zoned bdev\n");
			goto register_failed;
		}
	}

	return rc;

	/* Error unwinding: each label undoes the steps completed before the
	 * corresponding failure point, in reverse order.
	 */
register_failed:
	spdk_bdev_module_release_bdev(&bdev_node->bdev);
claim_failed:
	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
	spdk_io_device_unregister(bdev_node, NULL);
zone_info_failed:
	free(bdev_node->zones);
calloc_failed:
roundup_failed:
	free(bdev_node->bdev.name);
strdup_failed:
	free(bdev_node);
zone_exist:
	spdk_bdev_close(base_desc);
free_config:
	zone_block_remove_config(name);
	return rc;
}
837 
/* RPC entry point: validate parameters, save the configuration, then try
 * to build the vbdev immediately. A missing base bdev (-ENODEV) is not
 * an error — it may appear later and be picked up by examine().
 */
int
vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
			uint64_t optimal_open_zones)
{
	int rc;

	if (zone_capacity == 0) {
		SPDK_ERRLOG("Zone capacity can't be 0\n");
		return -EINVAL;
	}
	if (optimal_open_zones == 0) {
		SPDK_ERRLOG("Optimal open zones can't be 0\n");
		return -EINVAL;
	}

	/* Insert the bdev into our global name list even if it doesn't exist yet,
	 * it may show up soon...
	 */
	rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
	if (rc != 0) {
		return rc;
	}

	rc = zone_block_register(bdev_name);
	return rc == -ENODEV ? 0 : rc;
}
871 
872 void
873 vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
874 {
875 	struct bdev_zone_block_config *name_node;
876 	int rc;
877 
878 	rc = spdk_bdev_unregister_by_name(name, &bdev_zoned_if, cb_fn, cb_arg);
879 	if (rc == 0) {
880 		TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
881 			if (strcmp(name_node->vbdev_name, name) == 0) {
882 				zone_block_remove_config(name_node);
883 				break;
884 			}
885 		}
886 	} else {
887 		cb_fn(cb_arg, rc);
888 	}
889 }
890 
/* Examine hook: attempt to build any configured vbdev on top of the
 * newly registered bdev, then signal the bdev layer that examination is
 * complete (errors from register are intentionally not propagated here).
 */
static void
zone_block_examine(struct spdk_bdev *bdev)
{
	zone_block_register(bdev->name);

	spdk_bdev_module_examine_done(&bdev_zoned_if);
}
898 
899 SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
900