/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "vbdev_zone_block.h"

#include "spdk/config.h"
#include "spdk/nvme.h"
#include "spdk/bdev_zone.h"

#include "spdk/log.h"

static int zone_block_init(void);
static int zone_block_get_ctx_size(void);
static void zone_block_finish(void);
static int zone_block_config_json(struct spdk_json_write_ctx *w);
static void zone_block_examine(struct spdk_bdev *bdev);

static struct spdk_bdev_module bdev_zoned_if = {
	.name = "bdev_zoned_block",
	.module_init = zone_block_init,
	.module_fini = zone_block_finish,
	.config_json = zone_block_config_json,
	.examine_config = zone_block_examine,
	.get_ctx_size = zone_block_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)

/* List of block vbdev names and their base bdevs, taken from the configuration.
 * Used so we can parse the config once at init and use this list in examine().
 */
struct bdev_zone_block_config {
	char					*vbdev_name;
	char					*bdev_name;
	uint64_t				zone_capacity;
	uint64_t				optimal_open_zones;
	TAILQ_ENTRY(bdev_zone_block_config)	link;
};
static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);

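/* Emulated zone. The spinlock protects the zone state and write pointer, which can be
 * updated from any thread/IO channel submitting IO to the vbdev.
 */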
struct block_zone {
	struct spdk_bdev_zone_info zone_info;
	pthread_spinlock_t lock;
};

/* List of block vbdevs and associated info for each. */
struct bdev_zone_block {
	struct spdk_bdev		bdev;    /* the block zoned bdev */
	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
	struct block_zone		*zones; /* array of zones */
	uint64_t			num_zones; /* number of zones */
	uint64_t			zone_capacity; /* zone capacity */
	uint64_t			zone_shift; /* log2 of zone_size */
	TAILQ_ENTRY(bdev_zone_block)	link;
	struct spdk_thread		*thread; /* thread where base device is opened */
};
static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);

struct zone_block_io_channel {
	struct spdk_io_channel	*base_ch; /* IO channel of base device */
};

struct zone_block_io {
	/* vbdev to which IO was issued */
	struct bdev_zone_block *bdev_zone_block;
};

static int
zone_block_init(void)
{
	return 0;
}

static void
zone_block_remove_config(struct bdev_zone_block_config *name)
{
	TAILQ_REMOVE(&g_bdev_configs, name, link);
	free(name->bdev_name);
	free(name->vbdev_name);
	free(name);
}

static void
zone_block_finish(void)
{
	struct bdev_zone_block_config *name;

	while ((name = TAILQ_FIRST(&g_bdev_configs))) {
		zone_block_remove_config(name);
	}
}

static int
zone_block_get_ctx_size(void)
{
	return sizeof(struct zone_block_io);
}

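/* Write out an RPC-style "bdev_zone_block_create" entry for each vbdev so the current
 * configuration can be saved and replayed.
 */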
static int
zone_block_config_json(struct spdk_json_write_ctx *w)
{
	struct bdev_zone_block *bdev_node;
	struct spdk_bdev *base_bdev = NULL;

	TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
		base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
		spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
		spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
		spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}

	return 0;
}

/* Callback for unregistering the IO device. */
static void
_device_unregister_cb(void *io_device)
{
	struct bdev_zone_block *bdev_node = io_device;
	uint64_t i;

	free(bdev_node->bdev.name);
	for (i = 0; i < bdev_node->num_zones; i++) {
		pthread_spin_destroy(&bdev_node->zones[i].lock);
	}
	free(bdev_node->zones);
	free(bdev_node);
}

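/* Helper sent to the thread that opened the base bdev so the descriptor is closed on
 * the same thread it was opened on.
 */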
static void
_zone_block_destruct(void *ctx)
{
	struct spdk_bdev_desc *desc = ctx;

	spdk_bdev_close(desc);
}

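/* Destruct callback for the vbdev: drop it from the global list, release the claim on
 * the base bdev, close the base descriptor on its opening thread and unregister the
 * io_device (which frees the node in _device_unregister_cb()).
 */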
static int
zone_block_destruct(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));

	/* Close the underlying bdev on the thread it was opened on. */
	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
	} else {
		spdk_bdev_close(bdev_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(bdev_node, _device_unregister_cb);

	return 0;
}

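/* Map an LBA to the zone containing it. zone_size is a power of two, so a shift is
 * sufficient. Returns NULL if the LBA lies beyond the last zone.
 */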
static struct block_zone *
zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
{
	size_t index = lba >> bdev_node->zone_shift;

	if (index >= bdev_node->num_zones) {
		return NULL;
	}

	return &bdev_node->zones[index];
}

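/* Look up a zone by its start LBA. Returns NULL unless the LBA is exactly a zone_id. */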
static struct block_zone *
zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
{
	struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);

	if (zone && zone->zone_info.zone_id == start_lba) {
		return zone;
	} else {
		return NULL;
	}
}

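/* Handle SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: copy the emulated zone state for each
 * requested zone into the caller's buffer.
 */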
static int
zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
	size_t i;

	/* The user can request info for more zones than exist, so we need to check both
	 * internal and user boundaries.
	 */
	for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
		zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
		if (!zone) {
			return -EINVAL;
		}
		memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
	}

	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	return 0;
}

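/* Explicit zone open (SPDK_BDEV_ZONE_OPEN). Only valid from the EMPTY, OPEN or
 * CLOSED states.
 */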
static int
zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}

static void
_zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *orig_io = cb_arg;
	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;

	/* Complete the original IO and then free the one that we created here
	 * as a result of issuing an IO via submit_request.
	 */
	spdk_bdev_io_complete(orig_io, status);
	spdk_bdev_free_io(bdev_io);
}

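/* Zone reset: rewind the write pointer to the zone start and mark the zone EMPTY.
 * If the base bdev supports unmap, also unmap the zone's blocks; otherwise the old
 * data is simply left in place.
 */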
static int
zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_FULL:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
		zone->zone_info.write_pointer = zone->zone_info.zone_id;
		pthread_spin_unlock(&zone->lock);

		/* The unmap isn't necessary, so if the base bdev doesn't support it, we're done */
		if (!spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(bdev_node->base_desc),
						 SPDK_BDEV_IO_TYPE_UNMAP)) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
					      zone->zone_info.zone_id, zone->zone_info.capacity,
					      _zone_block_complete_unmap, bdev_io);
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}

static int
zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}

static int
zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
	zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;

	pthread_spin_unlock(&zone->lock);
	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	return 0;
}

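/* Dispatch a zone management command (SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT) to the
 * matching handler. The zone_id must be the start LBA of a zone.
 */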
static int
zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
			   struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;

	zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
	if (!zone) {
		return -EINVAL;
	}

	switch (bdev_io->u.zone_mgmt.zone_action) {
	case SPDK_BDEV_ZONE_RESET:
		return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
	case SPDK_BDEV_ZONE_OPEN:
		return zone_block_open_zone(zone, bdev_io);
	case SPDK_BDEV_ZONE_CLOSE:
		return zone_block_close_zone(zone, bdev_io);
	case SPDK_BDEV_ZONE_FINISH:
		return zone_block_finish_zone(zone, bdev_io);
	default:
		return -EINVAL;
	}
}

static void
_zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *orig_io = cb_arg;
	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;

	if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
		orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
	}

	/* Complete the original IO and then free the one that we created here
	 * as a result of issuing an IO via submit_request.
	 */
	spdk_bdev_io_complete(orig_io, status);
	spdk_bdev_free_io(bdev_io);
}

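/* Handle WRITE and ZONE_APPEND. The zone must be writable (EMPTY, OPEN or CLOSED);
 * regular writes must land exactly on the write pointer, while appends are redirected
 * to it. The write pointer is advanced under the zone lock before the IO is forwarded
 * to the base bdev, and the zone goes FULL once the pointer reaches the zone capacity.
 */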
static int
zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		 struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	uint64_t num_blocks_left, wp;
	int rc = 0;
	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;

	if (is_append) {
		zone = zone_block_get_zone_by_slba(bdev_node, lba);
	} else {
		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	}
	if (!zone) {
		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_EMPTY:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
		break;
	default:
		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
		rc = -EINVAL;
		goto write_fail;
	}

	wp = zone->zone_info.write_pointer;
	if (is_append) {
		lba = wp;
	} else {
		if (lba != wp) {
			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n",
				    lba, wp);
			rc = -EINVAL;
			goto write_fail;
		}
	}

	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
	if (len > num_blocks_left) {
		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64
			    ")\n", lba, len, wp);
		rc = -EINVAL;
		goto write_fail;
	}

	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
	}
	pthread_spin_unlock(&zone->lock);

	if (bdev_io->u.bdev.md_buf == NULL) {
		rc = spdk_bdev_writev_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt, lba,
					     bdev_io->u.bdev.num_blocks, _zone_block_complete_write,
					     bdev_io);
	} else {
		rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
						     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						     bdev_io->u.bdev.md_buf,
						     lba, bdev_io->u.bdev.num_blocks,
						     _zone_block_complete_write, bdev_io);
	}

	return rc;

write_fail:
	pthread_spin_unlock(&zone->lock);
	return rc;
}

static void
_zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *orig_io = cb_arg;
	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;

	/* Complete the original IO and then free the one that we created here
	 * as a result of issuing an IO via submit_request.
	 */
	spdk_bdev_io_complete(orig_io, status);
	spdk_bdev_free_io(bdev_io);
}

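/* Handle READ. Reads may cross the write pointer but must not exceed the zone's
 * capacity; the IO is forwarded unmodified to the base bdev.
 */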
static int
zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	int rc;

	zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	if (!zone) {
		SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
		SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len);
		return -EINVAL;
	}

	if (bdev_io->u.bdev.md_buf == NULL) {
		rc = spdk_bdev_readv_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
					    bdev_io->u.bdev.iovcnt, lba,
					    len, _zone_block_complete_read,
					    bdev_io);
	} else {
		rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    bdev_io->u.bdev.md_buf,
						    lba, len,
						    _zone_block_complete_read, bdev_io);
	}

	return rc;
}

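/* Main IO entry point for the vbdev: route each IO type to its handler; submission
 * errors are reported back to the bdev layer as NOMEM or FAILED completions.
 */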
static void
zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
	struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
	int rc = 0;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = zone_block_get_zone_info(bdev_node, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = zone_block_write(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		rc = zone_block_read(bdev_node, dev_ch, bdev_io);
		break;
	default:
		SPDK_ERRLOG("zone_block: unknown I/O type %u\n", bdev_io->type);
		rc = -ENOTSUP;
		break;
	}

	if (rc != 0) {
		if (rc == -ENOMEM) {
			SPDK_WARNLOG("ENOMEM, queueing io for vbdev.\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static bool
zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		return true;
	default:
		return false;
	}
}

static struct spdk_io_channel *
zone_block_get_io_channel(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	return spdk_get_io_channel(bdev_node);
}

static int
zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
	struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);

	spdk_json_write_name(w, "zoned_block");
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
	spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
	spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
	spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
	spdk_json_write_object_end(w);

	return 0;
}

/* When we register our vbdev, this is how we specify our entry points. */
static const struct spdk_bdev_fn_table zone_block_fn_table = {
	.destruct		= zone_block_destruct,
	.submit_request		= zone_block_submit_request,
	.io_type_supported	= zone_block_io_type_supported,
	.get_io_channel		= zone_block_get_io_channel,
	.dump_info_json		= zone_block_dump_info_json,
};

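/* Called when the base bdev is hot-removed: unregister every zoned vbdev built on
 * top of it.
 */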
static void
zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
{
	struct bdev_zone_block *bdev_node, *tmp;

	TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
		if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
			spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
		}
	}
}

static void
zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
			      void *event_ctx)
{
	switch (type) {
	case SPDK_BDEV_EVENT_REMOVE:
		zone_block_base_bdev_hotremove_cb(bdev);
		break;
	default:
		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
		break;
	}
}

static int
_zone_block_ch_create_cb(void *io_device, void *ctx_buf)
{
	struct zone_block_io_channel *bdev_ch = ctx_buf;
	struct bdev_zone_block *bdev_node = io_device;

	bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
	if (!bdev_ch->base_ch) {
		return -ENOMEM;
	}

	return 0;
}

static void
_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
{
	struct zone_block_io_channel *bdev_ch = ctx_buf;

	spdk_put_io_channel(bdev_ch->base_ch);
}

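/* Remember a requested vbdev/base bdev pairing. Rejects duplicate vbdev names and base
 * bdevs that are already claimed by another zoned vbdev config.
 */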
static int
zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
		       uint64_t optimal_open_zones)
{
	struct bdev_zone_block_config *name;

	TAILQ_FOREACH(name, &g_bdev_configs, link) {
		if (strcmp(vbdev_name, name->vbdev_name) == 0) {
			SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
			return -EEXIST;
		}
		if (strcmp(bdev_name, name->bdev_name) == 0) {
			SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
			return -EEXIST;
		}
	}

	name = calloc(1, sizeof(*name));
	if (!name) {
		SPDK_ERRLOG("could not allocate bdev_names\n");
		return -ENOMEM;
	}

	name->bdev_name = strdup(bdev_name);
	if (!name->bdev_name) {
		SPDK_ERRLOG("could not allocate name->bdev_name\n");
		free(name);
		return -ENOMEM;
	}

	name->vbdev_name = strdup(vbdev_name);
	if (!name->vbdev_name) {
		SPDK_ERRLOG("could not allocate name->vbdev_name\n");
		free(name->bdev_name);
		free(name);
		return -ENOMEM;
	}

	name->zone_capacity = zone_capacity;
	name->optimal_open_zones = optimal_open_zones;

	TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);

	return 0;
}

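/* Initialize the emulated zones. Every zone starts out FULL with its write pointer at
 * the end of the zone capacity, so a zone must be reset before it can be written.
 */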
static int
zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
{
	size_t i;
	struct block_zone *zone;
	int rc = 0;

	for (i = 0; i < bdev_node->num_zones; i++) {
		zone = &bdev_node->zones[i];
		zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
		zone->zone_info.capacity = bdev_node->zone_capacity;
		zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
		if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
			SPDK_ERRLOG("pthread_spin_init() failed\n");
			rc = -ENOMEM;
			break;
		}
	}

	if (rc) {
		for (; i > 0; i--) {
			pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
		}
	}

	return rc;
}

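/* Create the zoned vbdev on top of the named base bdev if a matching config entry
 * exists. The zone size is the zone capacity rounded up to a power of two; any
 * trailing base bdev blocks that do not make up a full zone are not exposed. The base
 * bdev is opened and claimed, and the new zoned bdev is registered with the bdev layer.
 */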
static int
zone_block_register(const char *base_bdev_name)
{
	struct spdk_bdev_desc *base_desc;
	struct spdk_bdev *base_bdev;
	struct bdev_zone_block_config *name, *tmp;
	struct bdev_zone_block *bdev_node;
	uint64_t zone_size;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the bdev_node & bdev accordingly.
	 */
	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
		if (strcmp(name->bdev_name, base_bdev_name) != 0) {
			continue;
		}

		rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb,
					NULL, &base_desc);
		if (rc == -ENODEV) {
			return -ENODEV;
		} else if (rc) {
			SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name);
			goto free_config;
		}

		base_bdev = spdk_bdev_desc_get_bdev(base_desc);

		if (spdk_bdev_is_zoned(base_bdev)) {
			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name);
			rc = -EEXIST;
			goto zone_exist;
		}

		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
		if (!bdev_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node\n");
			goto zone_exist;
		}

		bdev_node->base_desc = base_desc;

		/* Name of the zoned vbdev we are creating. */
		bdev_node->bdev.name = strdup(name->vbdev_name);
		if (!bdev_node->bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node name\n");
			goto strdup_failed;
		}

		zone_size = spdk_align64pow2(name->zone_capacity);
		if (zone_size == 0) {
			rc = -EINVAL;
			SPDK_ERRLOG("invalid zone size\n");
			goto roundup_failed;
		}

		bdev_node->zone_shift = spdk_u64log2(zone_size);
		bdev_node->num_zones = base_bdev->blockcnt / zone_size;

		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
		if (!bdev_node->zones) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate zones\n");
			goto calloc_failed;
		}

		bdev_node->bdev.product_name = "zone_block";

		/* Copy some properties from the underlying base bdev. */
		bdev_node->bdev.write_cache = base_bdev->write_cache;
		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;

		bdev_node->bdev.blocklen = base_bdev->blocklen;
		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;

		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
			SPDK_DEBUGLOG(vbdev_zone_block,
				      "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n",
				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
		}

		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;

		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
		bdev_node->bdev.md_len = base_bdev->md_len;
		bdev_node->bdev.dif_type = base_bdev->dif_type;
		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;

		bdev_node->bdev.zoned = true;
		bdev_node->bdev.ctxt = bdev_node;
		bdev_node->bdev.fn_table = &zone_block_fn_table;
		bdev_node->bdev.module = &bdev_zoned_if;

		/* bdev specific info */
		bdev_node->bdev.zone_size = zone_size;

		bdev_node->zone_capacity = name->zone_capacity;
		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
		bdev_node->bdev.max_open_zones = 0;
		rc = zone_block_init_zone_info(bdev_node);
		if (rc) {
			SPDK_ERRLOG("could not init zone info\n");
			goto zone_info_failed;
		}

		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);

		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
					sizeof(struct zone_block_io_channel),
					name->vbdev_name);

		/* Save the thread where the base device is opened */
		bdev_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name);
			goto claim_failed;
		}

		rc = spdk_bdev_register(&bdev_node->bdev);
		if (rc) {
			SPDK_ERRLOG("could not register zoned bdev\n");
			goto register_failed;
		}
	}

	return rc;

register_failed:
	spdk_bdev_module_release_bdev(&bdev_node->bdev);
claim_failed:
	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
	spdk_io_device_unregister(bdev_node, NULL);
zone_info_failed:
	free(bdev_node->zones);
calloc_failed:
roundup_failed:
	free(bdev_node->bdev.name);
strdup_failed:
	free(bdev_node);
zone_exist:
	spdk_bdev_close(base_desc);
free_config:
	zone_block_remove_config(name);
	return rc;
}

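/* Create a zoned vbdev named "vbdev_name" on top of base bdev "bdev_name". Each zone
 * holds "zone_capacity" blocks and the zone size is that capacity rounded up to a
 * power of two. The base bdev does not have to exist yet; registration also happens
 * later from examine() when it appears.
 *
 * Illustrative JSON-RPC request (bdev names are examples only):
 *   {"method": "bdev_zone_block_create",
 *    "params": {"name": "zone0", "base_bdev": "Malloc0",
 *               "zone_capacity": 4096, "optimal_open_zones": 1}}
 */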
int
vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
			uint64_t optimal_open_zones)
{
	int rc = 0;

	if (zone_capacity == 0) {
		SPDK_ERRLOG("Zone capacity can't be 0\n");
		return -EINVAL;
	}

	if (optimal_open_zones == 0) {
		SPDK_ERRLOG("Optimal open zones can't be 0\n");
		return -EINVAL;
	}

	/* Insert the bdev into our global name list even if it doesn't exist yet;
	 * it may show up soon...
	 */
	rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
	if (rc) {
		return rc;
	}

	rc = zone_block_register(bdev_name);
	if (rc == -ENODEV) {
		/* This is not an error; even though the bdev is not present at this time,
		 * it may still show up later.
		 */
		rc = 0;
	}
	return rc;
}

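/* Delete a zoned vbdev by name: drop its saved config entry (so it is not recreated by
 * examine()) and unregister the bdev. cb_fn is called with -ENODEV if no such vbdev
 * exists.
 */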
void
vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct bdev_zone_block_config *name_node;
	struct spdk_bdev *bdev = NULL;

	bdev = spdk_bdev_get_by_name(name);
	if (!bdev || bdev->module != &bdev_zoned_if) {
		cb_fn(cb_arg, -ENODEV);
		return;
	}

	TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
		if (strcmp(name_node->vbdev_name, bdev->name) == 0) {
			zone_block_remove_config(name_node);
			break;
		}
	}

	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
}

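/* examine_config callback: if a saved config entry references this bdev, build the
 * zoned vbdev on top of it now that it exists.
 */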
static void
zone_block_examine(struct spdk_bdev *bdev)
{
	zone_block_register(bdev->name);

	spdk_bdev_module_examine_done(&bdev_zoned_if);
}

SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
943