xref: /spdk/module/bdev/zone_block/vbdev_zone_block.c (revision 307b8c112ffd90a26d53dd15fad67bd9038ef526)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (c) Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include "vbdev_zone_block.h"
9 
10 #include "spdk/config.h"
11 #include "spdk/nvme.h"
12 #include "spdk/bdev_zone.h"
13 
14 #include "spdk/log.h"
15 
16 static int zone_block_init(void);
17 static int zone_block_get_ctx_size(void);
18 static void zone_block_finish(void);
19 static int zone_block_config_json(struct spdk_json_write_ctx *w);
20 static void zone_block_examine(struct spdk_bdev *bdev);
21 
/* Module registration table: hooks this vbdev module into the bdev layer
 * (init/fini, JSON config replay, examine callback and per-IO context size). */
static struct spdk_bdev_module bdev_zoned_if = {
	.name = "bdev_zoned_block",
	.module_init = zone_block_init,
	.module_fini = zone_block_finish,
	.config_json = zone_block_config_json,
	.examine_config = zone_block_examine,
	.get_ctx_size = zone_block_get_ctx_size,
};
30 
31 SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
32 
33 /* List of block vbdev names and their base bdevs via configuration file.
34  * Used so we can parse the conf once at init and use this list in examine().
35  */
struct bdev_zone_block_config {
	char					*vbdev_name; /* name of the zoned vbdev to create */
	char					*bdev_name; /* base bdev the vbdev stacks on */
	uint64_t				zone_capacity; /* usable blocks per zone */
	uint64_t				optimal_open_zones; /* value reported to the bdev layer */
	TAILQ_ENTRY(bdev_zone_block_config)	link;
};
static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
44 
/* Runtime state of a single zone: the info reported to callers plus a spinlock
 * serializing state and write-pointer updates. */
struct block_zone {
	struct spdk_bdev_zone_info zone_info;
	pthread_spinlock_t lock;
};
49 
50 /* List of block vbdevs and associated info for each. */
struct bdev_zone_block {
	struct spdk_bdev		bdev;    /* the block zoned bdev */
	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
	struct block_zone		*zones; /* array of zones */
	uint64_t			num_zones; /* number of zones */
	uint64_t			zone_capacity; /* zone capacity */
	uint64_t                        zone_shift; /* log2 of zone_size */
	TAILQ_ENTRY(bdev_zone_block)	link; /* entry in g_bdev_nodes */
	struct spdk_thread		*thread; /* thread where base device is opened */
};
static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
62 
/* Per-thread channel context: just the channel to the base bdev. */
struct zone_block_io_channel {
	struct spdk_io_channel	*base_ch; /* IO channel of base device */
};

/* Per-IO driver context reserved by the bdev layer (size reported by
 * zone_block_get_ctx_size()). */
struct zone_block_io {
	/* vbdev to which IO was issued */
	struct bdev_zone_block *bdev_zone_block;
};
71 
/* Module init hook: this module needs no global setup. */
static int
zone_block_init(void)
{
	return 0;
}
77 
78 static void
79 zone_block_remove_config(struct bdev_zone_block_config *name)
80 {
81 	TAILQ_REMOVE(&g_bdev_configs, name, link);
82 	free(name->bdev_name);
83 	free(name->vbdev_name);
84 	free(name);
85 }
86 
87 static void
88 zone_block_finish(void)
89 {
90 	struct bdev_zone_block_config *name;
91 
92 	while ((name = TAILQ_FIRST(&g_bdev_configs))) {
93 		zone_block_remove_config(name);
94 	}
95 }
96 
/* Per-IO context size the bdev layer must reserve for this module. */
static int
zone_block_get_ctx_size(void)
{
	return sizeof(struct zone_block_io);
}
102 
103 static int
104 zone_block_config_json(struct spdk_json_write_ctx *w)
105 {
106 	struct bdev_zone_block *bdev_node;
107 	struct spdk_bdev *base_bdev = NULL;
108 
109 	TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
110 		base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
111 		spdk_json_write_object_begin(w);
112 		spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
113 		spdk_json_write_named_object_begin(w, "params");
114 		spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
115 		spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
116 		spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
117 		spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
118 		spdk_json_write_object_end(w);
119 		spdk_json_write_object_end(w);
120 	}
121 
122 	return 0;
123 }
124 
125 /* Callback for unregistering the IO device. */
126 static void
127 _device_unregister_cb(void *io_device)
128 {
129 	struct bdev_zone_block *bdev_node = io_device;
130 	uint64_t i;
131 
132 	free(bdev_node->bdev.name);
133 	for (i = 0; i < bdev_node->num_zones; i++) {
134 		pthread_spin_destroy(&bdev_node->zones[i].lock);
135 	}
136 	free(bdev_node->zones);
137 	free(bdev_node);
138 }
139 
/* Thread-hop helper: close the base bdev descriptor on the thread it was
 * opened on (ctx is the spdk_bdev_desc to close). */
static void
_zone_block_destruct(void *ctx)
{
	struct spdk_bdev_desc *desc = ctx;

	spdk_bdev_close(desc);
}
147 
/* fn_table destruct callback: tear down one zoned vbdev. Unlinks the node,
 * releases the module's claim on the base bdev, closes the descriptor
 * (hopping back to the opening thread when needed) and unregisters the
 * io_device; the node's memory is freed later in _device_unregister_cb(). */
static int
zone_block_destruct(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));

	/* Close the underlying bdev on its same opened thread. */
	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
	} else {
		spdk_bdev_close(bdev_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(bdev_node, _device_unregister_cb);

	return 0;
}
170 
171 static struct block_zone *
172 zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
173 {
174 	size_t index = lba >> bdev_node->zone_shift;
175 
176 	if (index >= bdev_node->num_zones) {
177 		return NULL;
178 	}
179 
180 	return &bdev_node->zones[index];
181 }
182 
183 static struct block_zone *
184 zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
185 {
186 	struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
187 
188 	if (zone && zone->zone_info.zone_id == start_lba) {
189 		return zone;
190 	} else {
191 		return NULL;
192 	}
193 }
194 
195 static int
196 zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
197 {
198 	struct block_zone *zone;
199 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
200 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
201 	size_t i;
202 
203 	/* User can request info for more zones than exist, need to check both internal and user
204 	 * boundaries
205 	 */
206 	for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
207 		zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
208 		if (!zone) {
209 			return -EINVAL;
210 		}
211 		memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
212 	}
213 
214 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
215 	return 0;
216 }
217 
218 static int
219 zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
220 {
221 	pthread_spin_lock(&zone->lock);
222 
223 	switch (zone->zone_info.state) {
224 	case SPDK_BDEV_ZONE_STATE_EMPTY:
225 	case SPDK_BDEV_ZONE_STATE_OPEN:
226 	case SPDK_BDEV_ZONE_STATE_CLOSED:
227 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
228 		pthread_spin_unlock(&zone->lock);
229 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
230 		return 0;
231 	default:
232 		pthread_spin_unlock(&zone->lock);
233 		return -EINVAL;
234 	}
235 }
236 
237 static void
238 _zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
239 {
240 	struct spdk_bdev_io *orig_io = cb_arg;
241 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
242 
243 	/* Complete the original IO and then free the one that we created here
244 	 * as a result of issuing an IO via submit_request.
245 	 */
246 	spdk_bdev_io_complete(orig_io, status);
247 	spdk_bdev_free_io(bdev_io);
248 }
249 
/* Reset a zone: return it to EMPTY with the write pointer rewound to the zone
 * start, then best-effort unmap the zone's blocks on the base bdev. Resetting
 * an already-EMPTY zone is a successful no-op; OPEN/FULL/CLOSED are valid
 * sources; anything else fails with -EINVAL. */
static int
zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
		/* Already empty - nothing to rewind or unmap. */
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_FULL:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
		zone->zone_info.write_pointer = zone->zone_info.zone_id;
		pthread_spin_unlock(&zone->lock);

		/* The unmap isn't necessary, so if the base bdev doesn't support it, we're done */
		if (!spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(bdev_node->base_desc),
						 SPDK_BDEV_IO_TYPE_UNMAP)) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		/* IO completion is finished in _zone_block_complete_unmap(). */
		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
					      zone->zone_info.zone_id, zone->zone_info.capacity,
					      _zone_block_complete_unmap, bdev_io);
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}
283 
284 static int
285 zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
286 {
287 	pthread_spin_lock(&zone->lock);
288 
289 	switch (zone->zone_info.state) {
290 	case SPDK_BDEV_ZONE_STATE_OPEN:
291 	case SPDK_BDEV_ZONE_STATE_CLOSED:
292 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
293 		pthread_spin_unlock(&zone->lock);
294 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
295 		return 0;
296 	default:
297 		pthread_spin_unlock(&zone->lock);
298 		return -EINVAL;
299 	}
300 }
301 
302 static int
303 zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
304 {
305 	pthread_spin_lock(&zone->lock);
306 
307 	zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
308 	zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
309 
310 	pthread_spin_unlock(&zone->lock);
311 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
312 	return 0;
313 }
314 
315 static int
316 zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
317 			   struct spdk_bdev_io *bdev_io)
318 {
319 	struct block_zone *zone;
320 
321 	zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
322 	if (!zone) {
323 		return -EINVAL;
324 	}
325 
326 	switch (bdev_io->u.zone_mgmt.zone_action) {
327 	case SPDK_BDEV_ZONE_RESET:
328 		return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
329 	case SPDK_BDEV_ZONE_OPEN:
330 		return zone_block_open_zone(zone, bdev_io);
331 	case SPDK_BDEV_ZONE_CLOSE:
332 		return zone_block_close_zone(zone, bdev_io);
333 	case SPDK_BDEV_ZONE_FINISH:
334 		return zone_block_finish_zone(zone, bdev_io);
335 	default:
336 		return -EINVAL;
337 	}
338 }
339 
340 static void
341 _zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
342 {
343 	struct spdk_bdev_io *orig_io = cb_arg;
344 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
345 
346 	if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
347 		orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
348 	}
349 
350 	/* Complete the original IO and then free the one that we created here
351 	 * as a result of issuing an IO via submit_request.
352 	 */
353 	spdk_bdev_io_complete(orig_io, status);
354 	spdk_bdev_free_io(bdev_io);
355 }
356 
/* Handle WRITE and ZONE_APPEND requests. The target zone must be writable
 * (EMPTY/OPEN/CLOSED - it is implicitly transitioned to OPEN), a plain write
 * must land exactly on the write pointer, and the request must fit in the
 * remaining capacity. The write pointer is advanced (zone flips to FULL when
 * it reaches capacity) before the data is forwarded to the base bdev.
 * Returns 0 on successful submission, negative errno on validation failure. */
static int
zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		 struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	uint64_t num_blocks_left, wp;
	int rc = 0;
	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;

	/* Appends address the zone by its start LBA; writes may carry any LBA,
	 * which is validated against the write pointer below. */
	if (is_append) {
		zone = zone_block_get_zone_by_slba(bdev_node, lba);
	} else {
		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	}
	if (!zone) {
		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	pthread_spin_lock(&zone->lock);

	/* Writing implicitly opens an EMPTY or CLOSED zone. */
	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_EMPTY:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
		break;
	default:
		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
		rc = -EINVAL;
		goto write_fail;
	}

	/* Appends resolve their LBA from the current write pointer; plain
	 * writes must already be positioned on it. */
	wp = zone->zone_info.write_pointer;
	if (is_append) {
		lba = wp;
	} else {
		if (lba != wp) {
			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n",
				    lba, wp);
			rc = -EINVAL;
			goto write_fail;
		}
	}

	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
	if (len > num_blocks_left) {
		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64
			    ")\n", lba, len, wp);
		rc = -EINVAL;
		goto write_fail;
	}

	/* Advance the wp before submitting, so a concurrent writer on another
	 * channel sees the updated pointer while the data is in flight. */
	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
	}
	pthread_spin_unlock(&zone->lock);

	/* Forward to the base bdev; completion is in _zone_block_complete_write(). */
	if (bdev_io->u.bdev.md_buf == NULL) {
		rc = spdk_bdev_writev_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt, lba,
					     bdev_io->u.bdev.num_blocks, _zone_block_complete_write,
					     bdev_io);
	} else {
		rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
						     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						     bdev_io->u.bdev.md_buf,
						     lba, bdev_io->u.bdev.num_blocks,
						     _zone_block_complete_write, bdev_io);
	}

	return rc;

write_fail:
	pthread_spin_unlock(&zone->lock);
	return rc;
}
438 
439 static void
440 _zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
441 {
442 	struct spdk_bdev_io *orig_io = cb_arg;
443 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
444 
445 	/* Complete the original IO and then free the one that we created here
446 	 * as a result of issuing an IO via submit_request.
447 	 */
448 	spdk_bdev_io_complete(orig_io, status);
449 	spdk_bdev_free_io(bdev_io);
450 }
451 
452 static int
453 zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
454 		struct spdk_bdev_io *bdev_io)
455 {
456 	struct block_zone *zone;
457 	uint64_t len = bdev_io->u.bdev.num_blocks;
458 	uint64_t lba = bdev_io->u.bdev.offset_blocks;
459 	int rc;
460 
461 	zone = zone_block_get_zone_containing_lba(bdev_node, lba);
462 	if (!zone) {
463 		SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba);
464 		return -EINVAL;
465 	}
466 
467 	if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
468 		SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len);
469 		return -EINVAL;
470 	}
471 
472 	if (bdev_io->u.bdev.md_buf == NULL) {
473 		rc = spdk_bdev_readv_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
474 					    bdev_io->u.bdev.iovcnt, lba,
475 					    len, _zone_block_complete_read,
476 					    bdev_io);
477 	} else {
478 		rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
479 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
480 						    bdev_io->u.bdev.md_buf,
481 						    lba, len,
482 						    _zone_block_complete_read, bdev_io);
483 	}
484 
485 	return rc;
486 }
487 
/* fn_table submit_request callback: route an incoming IO to the matching
 * handler. A non-zero handler return means the IO was never forwarded, so it
 * is completed here: -ENOMEM completes with NOMEM status (the bdev layer
 * re-queues such IOs), everything else completes as FAILED. */
static void
zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
	struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
	int rc = 0;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = zone_block_get_zone_info(bdev_node, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = zone_block_write(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		rc = zone_block_read(bdev_node, dev_ch, bdev_io);
		break;
	default:
		SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type);
		rc = -ENOTSUP;
		break;
	}

	if (rc != 0) {
		if (rc == -ENOMEM) {
			SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}
525 
526 static bool
527 zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
528 {
529 	switch (io_type) {
530 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
531 	case SPDK_BDEV_IO_TYPE_WRITE:
532 	case SPDK_BDEV_IO_TYPE_READ:
533 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
534 		return true;
535 	default:
536 		return false;
537 	}
538 }
539 
/* fn_table get_io_channel callback: hand out the per-thread channel for this
 * vbdev (the node itself is the io_device registered at create time). */
static struct spdk_io_channel *
zone_block_get_io_channel(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	return spdk_get_io_channel(bdev_node);
}
547 
548 static int
549 zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
550 {
551 	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
552 	struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
553 
554 	spdk_json_write_name(w, "zoned_block");
555 	spdk_json_write_object_begin(w);
556 	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
557 	spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
558 	spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
559 	spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
560 	spdk_json_write_object_end(w);
561 
562 	return 0;
563 }
564 
/* When we register our vbdev this is how we specify our entry points.
 * Callbacks not listed here (e.g. write_config_json) are left NULL. */
static const struct spdk_bdev_fn_table zone_block_fn_table = {
	.destruct		= zone_block_destruct,
	.submit_request		= zone_block_submit_request,
	.io_type_supported	= zone_block_io_type_supported,
	.get_io_channel		= zone_block_get_io_channel,
	.dump_info_json		= zone_block_dump_info_json,
};
573 
574 static void
575 zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
576 {
577 	struct bdev_zone_block *bdev_node, *tmp;
578 
579 	TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
580 		if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
581 			spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
582 		}
583 	}
584 }
585 
586 static void
587 zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
588 			      void *event_ctx)
589 {
590 	switch (type) {
591 	case SPDK_BDEV_EVENT_REMOVE:
592 		zone_block_base_bdev_hotremove_cb(bdev);
593 		break;
594 	default:
595 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
596 		break;
597 	}
598 }
599 
600 static int
601 _zone_block_ch_create_cb(void *io_device, void *ctx_buf)
602 {
603 	struct zone_block_io_channel *bdev_ch = ctx_buf;
604 	struct bdev_zone_block *bdev_node = io_device;
605 
606 	bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
607 	if (!bdev_ch->base_ch) {
608 		return -ENOMEM;
609 	}
610 
611 	return 0;
612 }
613 
/* io_device channel-destroy callback: release the base bdev channel taken in
 * _zone_block_ch_create_cb(). */
static void
_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
{
	struct zone_block_io_channel *bdev_ch = ctx_buf;

	spdk_put_io_channel(bdev_ch->base_ch);
}
621 
622 static int
623 zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
624 		       uint64_t optimal_open_zones)
625 {
626 	struct bdev_zone_block_config *name;
627 
628 	TAILQ_FOREACH(name, &g_bdev_configs, link) {
629 		if (strcmp(vbdev_name, name->vbdev_name) == 0) {
630 			SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
631 			return -EEXIST;
632 		}
633 		if (strcmp(bdev_name, name->bdev_name) == 0) {
634 			SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
635 			return -EEXIST;
636 		}
637 	}
638 
639 	name = calloc(1, sizeof(*name));
640 	if (!name) {
641 		SPDK_ERRLOG("could not allocate bdev_names\n");
642 		return -ENOMEM;
643 	}
644 
645 	name->bdev_name = strdup(bdev_name);
646 	if (!name->bdev_name) {
647 		SPDK_ERRLOG("could not allocate name->bdev_name\n");
648 		free(name);
649 		return -ENOMEM;
650 	}
651 
652 	name->vbdev_name = strdup(vbdev_name);
653 	if (!name->vbdev_name) {
654 		SPDK_ERRLOG("could not allocate name->vbdev_name\n");
655 		free(name->bdev_name);
656 		free(name);
657 		return -ENOMEM;
658 	}
659 
660 	name->zone_capacity = zone_capacity;
661 	name->optimal_open_zones = optimal_open_zones;
662 
663 	TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
664 
665 	return 0;
666 }
667 
668 static int
669 zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
670 {
671 	size_t i;
672 	struct block_zone *zone;
673 	int rc = 0;
674 
675 	for (i = 0; i < bdev_node->num_zones; i++) {
676 		zone = &bdev_node->zones[i];
677 		zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
678 		zone->zone_info.capacity = bdev_node->zone_capacity;
679 		zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
680 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
681 		zone->zone_info.type = SPDK_BDEV_ZONE_TYPE_SEQWR;
682 		if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
683 			SPDK_ERRLOG("pthread_spin_init() failed\n");
684 			rc = -ENOMEM;
685 			break;
686 		}
687 	}
688 
689 	if (rc) {
690 		for (; i > 0; i--) {
691 			pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
692 		}
693 	}
694 
695 	return rc;
696 }
697 
/* Build and register a zoned vbdev on top of base_bdev_name if a matching
 * config entry exists. Opens and claims the base bdev, sizes the zone array
 * (zone size is zone_capacity rounded up to a power of two), registers the
 * io_device and finally the bdev itself. On any failure the goto chain
 * unwinds exactly what was set up and the config entry is removed.
 * Returns 0 on success or when no config matches; -ENODEV when the base bdev
 * does not exist yet (config entry is kept for a later examine). */
static int
zone_block_register(const char *base_bdev_name)
{
	struct spdk_bdev_desc *base_desc;
	struct spdk_bdev *base_bdev;
	struct bdev_zone_block_config *name, *tmp;
	struct bdev_zone_block *bdev_node;
	uint64_t zone_size;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the bdev_node & bdev accordingly.
	 */
	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
		if (strcmp(name->bdev_name, base_bdev_name) != 0) {
			continue;
		}

		rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb,
					NULL, &base_desc);
		if (rc == -ENODEV) {
			return -ENODEV;
		} else if (rc) {
			SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name);
			goto free_config;
		}

		base_bdev = spdk_bdev_desc_get_bdev(base_desc);

		/* Stacking a zoned vbdev on an already-zoned bdev is not supported. */
		if (spdk_bdev_is_zoned(base_bdev)) {
			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name);
			rc = -EEXIST;
			goto zone_exist;
		}

		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
		if (!bdev_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node\n");
			goto zone_exist;
		}

		bdev_node->base_desc = base_desc;

		/* The base bdev that we're attaching to. */
		bdev_node->bdev.name = strdup(name->vbdev_name);
		if (!bdev_node->bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node name\n");
			goto strdup_failed;
		}

		/* Zone size is the capacity rounded up to a power of two so LBA to
		 * zone index is a plain shift. */
		zone_size = spdk_align64pow2(name->zone_capacity);
		if (zone_size == 0) {
			rc = -EINVAL;
			SPDK_ERRLOG("invalid zone size\n");
			goto roundup_failed;
		}

		/* NOTE(review): num_zones is 0 when the base bdev is smaller than one
		 * zone; calloc(0, ...) and a zero-block bdev follow - confirm this is
		 * intended to be caught later rather than rejected here. */
		bdev_node->zone_shift = spdk_u64log2(zone_size);
		bdev_node->num_zones = base_bdev->blockcnt / zone_size;

		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
		if (!bdev_node->zones) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate zones\n");
			goto calloc_failed;
		}

		bdev_node->bdev.product_name = "zone_block";

		/* Copy some properties from the underlying base bdev. */
		bdev_node->bdev.write_cache = base_bdev->write_cache;
		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;

		bdev_node->bdev.blocklen = base_bdev->blocklen;
		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;

		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
			SPDK_DEBUGLOG(vbdev_zone_block,
				      "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n",
				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
		}

		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;

		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
		bdev_node->bdev.md_len = base_bdev->md_len;
		bdev_node->bdev.dif_type = base_bdev->dif_type;
		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;

		bdev_node->bdev.zoned = true;
		bdev_node->bdev.ctxt = bdev_node;
		bdev_node->bdev.fn_table = &zone_block_fn_table;
		bdev_node->bdev.module = &bdev_zoned_if;

		/* bdev specific info */
		bdev_node->bdev.zone_size = zone_size;

		bdev_node->zone_capacity = name->zone_capacity;
		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
		bdev_node->bdev.max_open_zones = 0;
		rc = zone_block_init_zone_info(bdev_node);
		if (rc) {
			SPDK_ERRLOG("could not init zone info\n");
			goto zone_info_failed;
		}

		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);

		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
					sizeof(struct zone_block_io_channel),
					name->vbdev_name);

		/* Save the thread where the base device is opened */
		bdev_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name);
			goto claim_failed;
		}

		rc = spdk_bdev_register(&bdev_node->bdev);
		if (rc) {
			SPDK_ERRLOG("could not register zoned bdev\n");
			goto register_failed;
		}
	}

	return rc;

	/* Error unwinding, in reverse order of setup. */
register_failed:
	spdk_bdev_module_release_bdev(&bdev_node->bdev);
claim_failed:
	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
	spdk_io_device_unregister(bdev_node, NULL);
zone_info_failed:
	free(bdev_node->zones);
calloc_failed:
roundup_failed:
	free(bdev_node->bdev.name);
strdup_failed:
	free(bdev_node);
zone_exist:
	spdk_bdev_close(base_desc);
free_config:
	zone_block_remove_config(name);
	return rc;
}
850 
/* Public entry point: remember the vbdev configuration and build the vbdev
 * immediately if the base bdev already exists. A missing base bdev is not an
 * error - creation is retried from examine() when the bdev appears. */
int
vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
			uint64_t optimal_open_zones)
{
	int rc;

	if (zone_capacity == 0) {
		SPDK_ERRLOG("Zone capacity can't be 0\n");
		return -EINVAL;
	}
	if (optimal_open_zones == 0) {
		SPDK_ERRLOG("Optimal open zones can't be 0\n");
		return -EINVAL;
	}

	/* Record the name pair first; the base bdev may only show up later. */
	rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
	if (rc != 0) {
		return rc;
	}

	rc = zone_block_register(bdev_name);

	/* -ENODEV just means the base bdev has not appeared yet; the config
	 * stays queued and examine() finishes the job later. */
	return rc == -ENODEV ? 0 : rc;
}
884 
885 void
886 vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
887 {
888 	struct bdev_zone_block_config *name_node;
889 	int rc;
890 
891 	rc = spdk_bdev_unregister_by_name(name, &bdev_zoned_if, cb_fn, cb_arg);
892 	if (rc == 0) {
893 		TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
894 			if (strcmp(name_node->vbdev_name, name) == 0) {
895 				zone_block_remove_config(name_node);
896 				break;
897 			}
898 		}
899 	} else {
900 		cb_fn(cb_arg, rc);
901 	}
902 }
903 
/* examine_config hook: try to build any configured vbdev on top of the newly
 * appeared bdev. The return code is deliberately ignored - on failure
 * zone_block_register() either leaves the config queued (-ENODEV) or removes
 * it itself. */
static void
zone_block_examine(struct spdk_bdev *bdev)
{
	zone_block_register(bdev->name);

	spdk_bdev_module_examine_done(&bdev_zoned_if);
}
911 
912 SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
913