xref: /spdk/module/bdev/zone_block/vbdev_zone_block.c (revision b30d57cdad6d2bc75cc1e4e2ebbcebcb0d98dcfa)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "vbdev_zone_block.h"
37 
38 #include "spdk/config.h"
39 #include "spdk/nvme.h"
40 #include "spdk/bdev_zone.h"
41 
42 #include "spdk/log.h"
43 
44 static int zone_block_init(void);
45 static int zone_block_get_ctx_size(void);
46 static void zone_block_finish(void);
47 static int zone_block_config_json(struct spdk_json_write_ctx *w);
48 static void zone_block_examine(struct spdk_bdev *bdev);
49 
/* Module hooks handed to the bdev layer via SPDK_BDEV_MODULE_REGISTER below:
 * init/fini lifecycle, JSON config dump, examine of newly-appeared bdevs,
 * and per-IO context sizing.
 */
static struct spdk_bdev_module bdev_zoned_if = {
	.name = "bdev_zoned_block",
	.module_init = zone_block_init,
	.module_fini = zone_block_finish,
	.config_json = zone_block_config_json,
	.examine_config = zone_block_examine,
	.get_ctx_size = zone_block_get_ctx_size,
};
58 
59 SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
60 
/* List of block vbdev names and their base bdevs via configuration file.
 * Used so we can parse the conf once at init and use this list in examine().
 */
struct bdev_zone_block_config {
	char					*vbdev_name;	/* name of the zoned vbdev to create */
	char					*bdev_name;	/* name of the base bdev it wraps */
	uint64_t				zone_capacity;	/* usable blocks per zone */
	uint64_t				optimal_open_zones;	/* reported optimal number of open zones */
	TAILQ_ENTRY(bdev_zone_block_config)	link;
};
/* All registered configs, both pending and already instantiated. */
static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
72 
/* Per-zone state: the info reported to callers plus a spinlock serializing
 * state and write-pointer updates across IO channels. */
struct block_zone {
	struct spdk_bdev_zone_info zone_info;
	pthread_spinlock_t lock;
};
77 
/* List of block vbdevs and associated info for each. */
struct bdev_zone_block {
	struct spdk_bdev		bdev;    /* the block zoned bdev */
	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
	struct block_zone		*zones; /* array of zones */
	uint64_t			num_zones; /* number of zones */
	uint64_t			zone_capacity; /* zone capacity */
	uint64_t			zone_shift; /* log2 of zone_size, used to map an LBA to its zone */
	TAILQ_ENTRY(bdev_zone_block)	link;
	struct spdk_thread		*thread; /* thread where base device is opened */
};
/* All currently instantiated zoned vbdevs. */
static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
90 
/* Per-channel context: the base bdev channel that all forwarded IO uses. */
struct zone_block_io_channel {
	struct spdk_io_channel	*base_ch; /* IO channel of base device */
};
94 
/* Per-IO context; size is reported via zone_block_get_ctx_size(). */
struct zone_block_io {
	/* vbdev to which IO was issued */
	struct bdev_zone_block *bdev_zone_block;
};
99 
/* Module init hook: this vbdev needs no global setup. */
static int
zone_block_init(void)
{
	int rc = 0;

	return rc;
}
105 
/* Unlink a config entry from the global list and free it along with its
 * duplicated name strings. */
static void
zone_block_remove_config(struct bdev_zone_block_config *name)
{
	TAILQ_REMOVE(&g_bdev_configs, name, link);
	free(name->bdev_name);
	free(name->vbdev_name);
	free(name);
}
114 
115 static void
116 zone_block_finish(void)
117 {
118 	struct bdev_zone_block_config *name;
119 
120 	while ((name = TAILQ_FIRST(&g_bdev_configs))) {
121 		zone_block_remove_config(name);
122 	}
123 }
124 
/* Size of the per-IO context the bdev layer should allocate for us. */
static int
zone_block_get_ctx_size(void)
{
	return sizeof(struct zone_block_io);
}
130 
131 static int
132 zone_block_config_json(struct spdk_json_write_ctx *w)
133 {
134 	struct bdev_zone_block *bdev_node;
135 	struct spdk_bdev *base_bdev = NULL;
136 
137 	TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
138 		base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
139 		spdk_json_write_object_begin(w);
140 		spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
141 		spdk_json_write_named_object_begin(w, "params");
142 		spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
143 		spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
144 		spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
145 		spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
146 		spdk_json_write_object_end(w);
147 		spdk_json_write_object_end(w);
148 	}
149 
150 	return 0;
151 }
152 
153 /* Callback for unregistering the IO device. */
154 static void
155 _device_unregister_cb(void *io_device)
156 {
157 	struct bdev_zone_block *bdev_node = io_device;
158 	uint64_t i;
159 
160 	free(bdev_node->bdev.name);
161 	for (i = 0; i < bdev_node->num_zones; i++) {
162 		pthread_spin_destroy(&bdev_node->zones[i].lock);
163 	}
164 	free(bdev_node->zones);
165 	free(bdev_node);
166 }
167 
/* Message handler run on the thread that originally opened the base bdev;
 * the bdev API requires close on the opening thread (see zone_block_destruct). */
static void
_zone_block_destruct(void *ctx)
{
	struct spdk_bdev_desc *desc = ctx;

	spdk_bdev_close(desc);
}
175 
/* fn_table destruct hook, invoked when our vbdev is unregistered: remove the
 * node from the global list, release the claim on the base bdev, close the
 * descriptor (bouncing to the opening thread if needed), and unregister the
 * io_device. Node memory and zones are freed in _device_unregister_cb().
 * Returns 0 (synchronous destruct). */
static int
zone_block_destruct(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));

	/* Close the underlying bdev on its same opened thread. */
	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
	} else {
		spdk_bdev_close(bdev_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(bdev_node, _device_unregister_cb);

	return 0;
}
198 
199 static struct block_zone *
200 zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
201 {
202 	size_t index = lba >> bdev_node->zone_shift;
203 
204 	if (index >= bdev_node->num_zones) {
205 		return NULL;
206 	}
207 
208 	return &bdev_node->zones[index];
209 }
210 
211 static struct block_zone *
212 zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
213 {
214 	struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
215 
216 	if (zone && zone->zone_info.zone_id == start_lba) {
217 		return zone;
218 	} else {
219 		return NULL;
220 	}
221 }
222 
223 static int
224 zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
225 {
226 	struct block_zone *zone;
227 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
228 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
229 	size_t i;
230 
231 	/* User can request info for more zones than exist, need to check both internal and user
232 	 * boundaries
233 	 */
234 	for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
235 		zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
236 		if (!zone) {
237 			return -EINVAL;
238 		}
239 		memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
240 	}
241 
242 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
243 	return 0;
244 }
245 
246 static int
247 zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
248 {
249 	pthread_spin_lock(&zone->lock);
250 
251 	switch (zone->zone_info.state) {
252 	case SPDK_BDEV_ZONE_STATE_EMPTY:
253 	case SPDK_BDEV_ZONE_STATE_OPEN:
254 	case SPDK_BDEV_ZONE_STATE_CLOSED:
255 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
256 		pthread_spin_unlock(&zone->lock);
257 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
258 		return 0;
259 	default:
260 		pthread_spin_unlock(&zone->lock);
261 		return -EINVAL;
262 	}
263 }
264 
265 static void
266 _zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
267 {
268 	struct spdk_bdev_io *orig_io = cb_arg;
269 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
270 
271 	/* Complete the original IO and then free the one that we created here
272 	 * as a result of issuing an IO via submit_reqeust.
273 	 */
274 	spdk_bdev_io_complete(orig_io, status);
275 	spdk_bdev_free_io(bdev_io);
276 }
277 
/* Handle SPDK_BDEV_ZONE_RESET: return the zone to EMPTY, rewind its write
 * pointer and unmap the zone's capacity on the base bdev. The zone state is
 * updated before the unmap is submitted; the original IO completes in
 * _zone_block_complete_unmap(). Returns 0 on submission, negative errno
 * otherwise. */
static int
zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
		/* Resetting an already-empty zone is a no-op. */
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_FULL:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
		zone->zone_info.write_pointer = zone->zone_info.zone_id;
		pthread_spin_unlock(&zone->lock);
		/* Discard the old data on the base bdev. */
		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
					      zone->zone_info.zone_id, zone->zone_info.capacity,
					      _zone_block_complete_unmap, bdev_io);
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}
303 
304 static int
305 zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
306 {
307 	pthread_spin_lock(&zone->lock);
308 
309 	switch (zone->zone_info.state) {
310 	case SPDK_BDEV_ZONE_STATE_OPEN:
311 	case SPDK_BDEV_ZONE_STATE_CLOSED:
312 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
313 		pthread_spin_unlock(&zone->lock);
314 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
315 		return 0;
316 	default:
317 		pthread_spin_unlock(&zone->lock);
318 		return -EINVAL;
319 	}
320 }
321 
322 static int
323 zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
324 {
325 	pthread_spin_lock(&zone->lock);
326 
327 	zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
328 	zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
329 
330 	pthread_spin_unlock(&zone->lock);
331 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
332 	return 0;
333 }
334 
/* Handle SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: resolve the target zone by its
 * start LBA and dispatch on the requested action. Returns 0 on success or
 * submission, -EINVAL for an unknown zone, action, or invalid state. */
static int
zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
			   struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;

	/* Management commands must address a zone by its exact start LBA. */
	zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
	if (!zone) {
		return -EINVAL;
	}

	switch (bdev_io->u.zone_mgmt.zone_action) {
	case SPDK_BDEV_ZONE_RESET:
		return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
	case SPDK_BDEV_ZONE_OPEN:
		return zone_block_open_zone(zone, bdev_io);
	case SPDK_BDEV_ZONE_CLOSE:
		return zone_block_close_zone(zone, bdev_io);
	case SPDK_BDEV_ZONE_FINISH:
		return zone_block_finish_zone(zone, bdev_io);
	default:
		return -EINVAL;
	}
}
359 
360 static void
361 _zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
362 {
363 	struct spdk_bdev_io *orig_io = cb_arg;
364 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
365 
366 	if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
367 		orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
368 	}
369 
370 	/* Complete the original IO and then free the one that we created here
371 	 * as a result of issuing an IO via submit_reqeust.
372 	 */
373 	spdk_bdev_io_complete(orig_io, status);
374 	spdk_bdev_free_io(bdev_io);
375 }
376 
/* Handle SPDK_BDEV_IO_TYPE_WRITE and SPDK_BDEV_IO_TYPE_ZONE_APPEND.
 * Validates the target zone and its state, advances the write pointer under
 * the zone spinlock, then forwards the data to the base bdev (with metadata
 * when present). For appends the LBA actually written is reported back in
 * _zone_block_complete_write(). Returns 0 on successful submission or a
 * negative errno on validation/submission failure. */
static int
zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		 struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	uint64_t num_blocks_left, wp;
	int rc = 0;
	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;

	/* Appends address a zone by its start LBA; plain writes may carry any
	 * in-zone LBA (checked against the write pointer below). */
	if (is_append) {
		zone = zone_block_get_zone_by_slba(bdev_node, lba);
	} else {
		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	}
	if (!zone) {
		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	pthread_spin_lock(&zone->lock);

	/* Writing implicitly opens an EMPTY or CLOSED zone. */
	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_EMPTY:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
		break;
	default:
		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
		rc = -EINVAL;
		goto write_fail;
	}

	wp = zone->zone_info.write_pointer;
	if (is_append) {
		/* Appends always land at the current write pointer. */
		lba = wp;
	} else {
		/* Sequential-write rule: a write must start exactly at the wp. */
		if (lba != wp) {
			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n",
				    lba, wp);
			rc = -EINVAL;
			goto write_fail;
		}
	}

	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
	if (len > num_blocks_left) {
		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64
			    ")\n", lba, len, wp);
		rc = -EINVAL;
		goto write_fail;
	}

	/* Advance the wp before submitting; reaching capacity makes the zone
	 * FULL. The lock is dropped before forwarding to the base bdev. */
	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
	}
	pthread_spin_unlock(&zone->lock);

	if (bdev_io->u.bdev.md_buf == NULL) {
		rc = spdk_bdev_writev_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt, lba,
					     bdev_io->u.bdev.num_blocks, _zone_block_complete_write,
					     bdev_io);
	} else {
		rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
						     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						     bdev_io->u.bdev.md_buf,
						     lba, bdev_io->u.bdev.num_blocks,
						     _zone_block_complete_write, bdev_io);
	}

	return rc;

write_fail:
	pthread_spin_unlock(&zone->lock);
	return rc;
}
458 
459 static void
460 _zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
461 {
462 	struct spdk_bdev_io *orig_io = cb_arg;
463 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
464 
465 	/* Complete the original IO and then free the one that we created here
466 	 * as a result of issuing an IO via submit_reqeust.
467 	 */
468 	spdk_bdev_io_complete(orig_io, status);
469 	spdk_bdev_free_io(bdev_io);
470 }
471 
/* Handle SPDK_BDEV_IO_TYPE_READ: validate the range lies within a single
 * zone's capacity, then forward the read to the base bdev (with metadata
 * when present). Returns 0 on successful submission or a negative errno. */
static int
zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	int rc;

	zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	if (!zone) {
		SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	/* A read must not cross past the zone's usable capacity. */
	if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
		SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len);
		return -EINVAL;
	}

	if (bdev_io->u.bdev.md_buf == NULL) {
		rc = spdk_bdev_readv_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
					    bdev_io->u.bdev.iovcnt, lba,
					    len, _zone_block_complete_read,
					    bdev_io);
	} else {
		rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    bdev_io->u.bdev.md_buf,
						    lba, len,
						    _zone_block_complete_read, bdev_io);
	}

	return rc;
}
507 
/* fn_table submit_request hook: dispatch an incoming IO by type and translate
 * any synchronous submission error into an IO completion status. -ENOMEM is
 * completed as NOMEM so the bdev layer re-queues and retries the IO;
 * everything else fails the IO. */
static void
zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
	struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
	int rc = 0;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = zone_block_get_zone_info(bdev_node, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = zone_block_write(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		rc = zone_block_read(bdev_node, dev_ch, bdev_io);
		break;
	default:
		SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type);
		rc = -ENOTSUP;
		break;
	}

	if (rc != 0) {
		if (rc == -ENOMEM) {
			/* NOTE(review): despite the log text, the IO is not queued
			 * here; completing with NOMEM lets the bdev layer retry. */
			SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}
545 
546 static bool
547 zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
548 {
549 	switch (io_type) {
550 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
551 	case SPDK_BDEV_IO_TYPE_WRITE:
552 	case SPDK_BDEV_IO_TYPE_READ:
553 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
554 		return true;
555 	default:
556 		return false;
557 	}
558 }
559 
/* fn_table get_io_channel hook: hand out a channel for our io_device; the
 * per-channel base bdev channel is set up in _zone_block_ch_create_cb(). */
static struct spdk_io_channel *
zone_block_get_io_channel(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	return spdk_get_io_channel(bdev_node);
}
567 
568 static int
569 zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
570 {
571 	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
572 	struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
573 
574 	spdk_json_write_name(w, "zoned_block");
575 	spdk_json_write_object_begin(w);
576 	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
577 	spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
578 	spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
579 	spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
580 	spdk_json_write_object_end(w);
581 
582 	return 0;
583 }
584 
/* When we register our vbdev this is how we specify our entry points. */
static const struct spdk_bdev_fn_table zone_block_fn_table = {
	.destruct		= zone_block_destruct,
	.submit_request		= zone_block_submit_request,
	.io_type_supported	= zone_block_io_type_supported,
	.get_io_channel		= zone_block_get_io_channel,
	.dump_info_json		= zone_block_dump_info_json,
};
593 
/* Unregister every zoned vbdev that sits on top of the removed base bdev.
 * SAFE iteration is required because unregister may remove the node from
 * g_bdev_nodes via zone_block_destruct(). */
static void
zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
{
	struct bdev_zone_block *bdev_node, *tmp;

	TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
		if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
			spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
		}
	}
}
605 
606 static void
607 zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
608 			      void *event_ctx)
609 {
610 	switch (type) {
611 	case SPDK_BDEV_EVENT_REMOVE:
612 		zone_block_base_bdev_hotremove_cb(bdev);
613 		break;
614 	default:
615 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
616 		break;
617 	}
618 }
619 
620 static int
621 _zone_block_ch_create_cb(void *io_device, void *ctx_buf)
622 {
623 	struct zone_block_io_channel *bdev_ch = ctx_buf;
624 	struct bdev_zone_block *bdev_node = io_device;
625 
626 	bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
627 	if (!bdev_ch->base_ch) {
628 		return -ENOMEM;
629 	}
630 
631 	return 0;
632 }
633 
/* io_device channel-destroy callback: release the base bdev channel taken
 * in _zone_block_ch_create_cb(). */
static void
_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
{
	struct zone_block_io_channel *bdev_ch = ctx_buf;

	spdk_put_io_channel(bdev_ch->base_ch);
}
641 
642 static int
643 zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
644 		       uint64_t optimal_open_zones)
645 {
646 	struct bdev_zone_block_config *name;
647 
648 	TAILQ_FOREACH(name, &g_bdev_configs, link) {
649 		if (strcmp(vbdev_name, name->vbdev_name) == 0) {
650 			SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
651 			return -EEXIST;
652 		}
653 		if (strcmp(bdev_name, name->bdev_name) == 0) {
654 			SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
655 			return -EEXIST;
656 		}
657 	}
658 
659 	name = calloc(1, sizeof(*name));
660 	if (!name) {
661 		SPDK_ERRLOG("could not allocate bdev_names\n");
662 		return -ENOMEM;
663 	}
664 
665 	name->bdev_name = strdup(bdev_name);
666 	if (!name->bdev_name) {
667 		SPDK_ERRLOG("could not allocate name->bdev_name\n");
668 		free(name);
669 		return -ENOMEM;
670 	}
671 
672 	name->vbdev_name = strdup(vbdev_name);
673 	if (!name->vbdev_name) {
674 		SPDK_ERRLOG("could not allocate name->vbdev_name\n");
675 		free(name->bdev_name);
676 		free(name);
677 		return -ENOMEM;
678 	}
679 
680 	name->zone_capacity = zone_capacity;
681 	name->optimal_open_zones = optimal_open_zones;
682 
683 	TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
684 
685 	return 0;
686 }
687 
/* Initialize per-zone bookkeeping for a new vbdev node. Every zone starts
 * FULL with its write pointer at the end of its capacity (the existing base
 * bdev contents are treated as valid data). If a spinlock fails to init,
 * the locks created so far are destroyed and -ENOMEM is returned. */
static int
zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
{
	size_t i;
	struct block_zone *zone;
	int rc = 0;

	for (i = 0; i < bdev_node->num_zones; i++) {
		zone = &bdev_node->zones[i];
		zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
		zone->zone_info.capacity = bdev_node->zone_capacity;
		zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
		if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
			SPDK_ERRLOG("pthread_spin_init() failed\n");
			rc = -ENOMEM;
			break;
		}
	}

	if (rc) {
		/* Roll back the locks initialized before the failure. */
		for (; i > 0; i--) {
			pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
		}
	}

	return rc;
}
716 
/* Build and register a zoned vbdev for any config entry whose base bdev name
 * matches base_bdev_name. Opens and claims the base bdev, sizes the zone
 * array (zone size is the capacity rounded up to a power of two; zone count
 * is trimmed to a multiple of optimal_open_zones), copies base bdev
 * properties, and registers the new bdev. On failure the goto ladder
 * unwinds exactly the resources acquired so far and the matching config
 * entry is removed. Returns 0 on success, -ENODEV if the base bdev is not
 * present yet (caller may retry from examine()), or another negative errno.
 */
static int
zone_block_register(const char *base_bdev_name)
{
	struct spdk_bdev_desc *base_desc;
	struct spdk_bdev *base_bdev;
	struct bdev_zone_block_config *name, *tmp;
	struct bdev_zone_block *bdev_node;
	uint64_t zone_size;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the bdev_node & bdev accordingly.
	 */
	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
		if (strcmp(name->bdev_name, base_bdev_name) != 0) {
			continue;
		}

		rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb,
					NULL, &base_desc);
		if (rc == -ENODEV) {
			/* Base bdev not here yet; keep the config entry. */
			return -ENODEV;
		} else if (rc) {
			SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name);
			goto free_config;
		}

		base_bdev = spdk_bdev_desc_get_bdev(base_desc);

		/* Stacking a zoned vbdev on a zoned bdev is not supported. */
		if (spdk_bdev_is_zoned(base_bdev)) {
			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name);
			rc = -EEXIST;
			goto zone_exist;
		}

		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
		if (!bdev_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node\n");
			goto zone_exist;
		}

		bdev_node->base_desc = base_desc;

		/* The base bdev that we're attaching to. */
		bdev_node->bdev.name = strdup(name->vbdev_name);
		if (!bdev_node->bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node name\n");
			goto strdup_failed;
		}

		/* Zone size is the capacity rounded up to a power of two so
		 * LBA-to-zone mapping can use a shift. */
		zone_size = spdk_align64pow2(name->zone_capacity);
		if (zone_size == 0) {
			rc = -EINVAL;
			SPDK_ERRLOG("invalid zone size\n");
			goto roundup_failed;
		}

		bdev_node->zone_shift = spdk_u64log2(zone_size);
		bdev_node->num_zones = base_bdev->blockcnt / zone_size;

		/* Align num_zones to optimal_open_zones */
		bdev_node->num_zones -= bdev_node->num_zones % name->optimal_open_zones;
		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
		if (!bdev_node->zones) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate zones\n");
			goto calloc_failed;
		}

		bdev_node->bdev.product_name = "zone_block";

		/* Copy some properties from the underlying base bdev. */
		bdev_node->bdev.write_cache = base_bdev->write_cache;
		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;

		bdev_node->bdev.blocklen = base_bdev->blocklen;
		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;

		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
			SPDK_DEBUGLOG(vbdev_zone_block,
				      "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n",
				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
		}

		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;

		/* Metadata / DIF settings are inherited unchanged. */
		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
		bdev_node->bdev.md_len = base_bdev->md_len;
		bdev_node->bdev.dif_type = base_bdev->dif_type;
		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;

		bdev_node->bdev.zoned = true;
		bdev_node->bdev.ctxt = bdev_node;
		bdev_node->bdev.fn_table = &zone_block_fn_table;
		bdev_node->bdev.module = &bdev_zoned_if;

		/* bdev specific info */
		bdev_node->bdev.zone_size = zone_size;

		bdev_node->zone_capacity = name->zone_capacity;
		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
		bdev_node->bdev.max_open_zones = 0;
		rc = zone_block_init_zone_info(bdev_node);
		if (rc) {
			SPDK_ERRLOG("could not init zone info\n");
			goto zone_info_failed;
		}

		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);

		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
					sizeof(struct zone_block_io_channel),
					name->vbdev_name);

		/* Save the thread where the base device is opened */
		bdev_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name);
			goto claim_failed;
		}

		rc = spdk_bdev_register(&bdev_node->bdev);
		if (rc) {
			SPDK_ERRLOG("could not register zoned bdev\n");
			goto register_failed;
		}
	}

	return rc;

	/* Error unwind: each label releases exactly what was acquired above it. */
register_failed:
	spdk_bdev_module_release_bdev(&bdev_node->bdev);
claim_failed:
	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
	spdk_io_device_unregister(bdev_node, NULL);
zone_info_failed:
	free(bdev_node->zones);
calloc_failed:
roundup_failed:
	free(bdev_node->bdev.name);
strdup_failed:
	free(bdev_node);
zone_exist:
	spdk_bdev_close(base_desc);
free_config:
	zone_block_remove_config(name);
	return rc;
}
871 
/* Public entry point (RPC): create a zoned vbdev named vbdev_name on top of
 * bdev_name with the given zone capacity and optimal open zone count.
 * Returns 0 on success (including when the base bdev has not appeared yet),
 * -EINVAL for zero parameters, or an error from name insertion/registration.
 */
int
vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
			uint64_t optimal_open_zones)
{
	int rc;

	if (zone_capacity == 0) {
		SPDK_ERRLOG("Zone capacity can't be 0\n");
		return -EINVAL;
	}

	if (optimal_open_zones == 0) {
		SPDK_ERRLOG("Optimal open zones can't be 0\n");
		return -EINVAL;
	}

	/* Record the pairing up front even if the base bdev does not exist
	 * yet; examine() instantiates it when the bdev appears. */
	rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
	if (rc != 0) {
		return rc;
	}

	rc = zone_block_register(bdev_name);

	/* A missing base bdev is not an error: it may still hot-plug later. */
	return rc == -ENODEV ? 0 : rc;
}
905 
/* Public entry point (RPC): delete the zoned vbdev called `name`. Removes
 * its config entry (so it will not be re-created on examine) and unregisters
 * the bdev; cb_fn is invoked with the result, or with -ENODEV if no such
 * vbdev of ours exists. */
void
vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct bdev_zone_block_config *name_node;
	struct spdk_bdev *bdev = NULL;

	bdev = spdk_bdev_get_by_name(name);
	/* Only delete bdevs owned by this module. */
	if (!bdev || bdev->module != &bdev_zoned_if) {
		cb_fn(cb_arg, -ENODEV);
		return;
	}

	TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
		if (strcmp(name_node->vbdev_name, bdev->name) == 0) {
			zone_block_remove_config(name_node);
			break;
		}
	}

	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
}
927 
/* examine_config hook: attempt to build a vbdev for any config entry that
 * matches this newly-appeared bdev, then tell the bdev layer we are done
 * examining it. */
static void
zone_block_examine(struct spdk_bdev *bdev)
{
	zone_block_register(bdev->name);

	spdk_bdev_module_examine_done(&bdev_zoned_if);
}
935 
936 SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
937