xref: /spdk/module/bdev/zone_block/vbdev_zone_block.c (revision ceea3088870a3919d6bdfe61d7adba11b9733fb7)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "vbdev_zone_block.h"
37 
38 #include "spdk/config.h"
39 #include "spdk/nvme.h"
40 #include "spdk/bdev_zone.h"
41 
42 #include "spdk/log.h"
43 
/* Forward declarations for the module callbacks registered in bdev_zoned_if below. */
static int zone_block_init(void);
static int zone_block_get_ctx_size(void);
static void zone_block_finish(void);
static int zone_block_config_json(struct spdk_json_write_ctx *w);
static void zone_block_examine(struct spdk_bdev *bdev);
49 
/* Module descriptor: hooks this vbdev into the SPDK bdev layer lifecycle
 * (init/fini, JSON config dump, examine of new bdevs, per-IO context size).
 */
static struct spdk_bdev_module bdev_zoned_if = {
	.name = "bdev_zoned_block",
	.module_init = zone_block_init,
	.module_fini = zone_block_finish,
	.config_text = NULL,
	.config_json = zone_block_config_json,
	.examine_config = zone_block_examine,
	.get_ctx_size = zone_block_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
61 
62 /* List of block vbdev names and their base bdevs via configuration file.
63  * Used so we can parse the conf once at init and use this list in examine().
64  */
/* One configured (vbdev name, base bdev name) pair plus zone parameters.
 * Entries may exist before the base bdev appears; examine() matches against them.
 */
struct bdev_zone_block_config {
	char					*vbdev_name;   /* name of the zoned vbdev to create */
	char					*bdev_name;    /* name of the base bdev to build on */
	uint64_t				zone_capacity; /* usable blocks per zone */
	uint64_t				optimal_open_zones;
	TAILQ_ENTRY(bdev_zone_block_config)	link;
};
static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
73 
/* Per-zone runtime state; the spinlock serializes state/write-pointer updates. */
struct block_zone {
	struct spdk_bdev_zone_info zone_info;
	pthread_spinlock_t lock;
};
78 
79 /* List of block vbdevs and associated info for each. */
80 struct bdev_zone_block {
81 	struct spdk_bdev		bdev;    /* the block zoned bdev */
82 	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
83 	struct block_zone		*zones; /* array of zones */
84 	uint64_t			num_zones; /* number of zones */
85 	uint64_t			zone_capacity; /* zone capacity */
86 	uint64_t                        zone_shift; /* log2 of zone_size */
87 	TAILQ_ENTRY(bdev_zone_block)	link;
88 	struct spdk_thread		*thread; /* thread where base device is opened */
89 };
90 static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
91 
/* Per-channel context: holds the base bdev channel that IO is relayed on. */
struct zone_block_io_channel {
	struct spdk_io_channel	*base_ch; /* IO channel of base device */
};

/* Per-IO context reserved via get_ctx_size(). */
struct zone_block_io {
	/* vbdev to which IO was issued */
	struct bdev_zone_block *bdev_zone_block;
};
100 
/* Module init hook: nothing to set up; configs are added via RPC. */
static int
zone_block_init(void)
{
	return 0;
}
106 
107 static void
108 zone_block_remove_config(struct bdev_zone_block_config *name)
109 {
110 	TAILQ_REMOVE(&g_bdev_configs, name, link);
111 	free(name->bdev_name);
112 	free(name->vbdev_name);
113 	free(name);
114 }
115 
116 static void
117 zone_block_finish(void)
118 {
119 	struct bdev_zone_block_config *name;
120 
121 	while ((name = TAILQ_FIRST(&g_bdev_configs))) {
122 		zone_block_remove_config(name);
123 	}
124 }
125 
/* Tell the bdev layer how much per-IO context to reserve for this module. */
static int
zone_block_get_ctx_size(void)
{
	return sizeof(struct zone_block_io);
}
131 
/* Emit one "bdev_zone_block_create" RPC call per live vbdev so the current
 * configuration can be replayed from a saved JSON config.
 */
static int
zone_block_config_json(struct spdk_json_write_ctx *w)
{
	struct bdev_zone_block *bdev_node;
	struct spdk_bdev *base_bdev = NULL;

	TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
		base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
		spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
		spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
		spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}

	return 0;
}
153 
154 /* Callback for unregistering the IO device. */
155 static void
156 _device_unregister_cb(void *io_device)
157 {
158 	struct bdev_zone_block *bdev_node = io_device;
159 	uint64_t i;
160 
161 	free(bdev_node->bdev.name);
162 	for (i = 0; i < bdev_node->num_zones; i++) {
163 		pthread_spin_destroy(&bdev_node->zones[i].lock);
164 	}
165 	free(bdev_node->zones);
166 	free(bdev_node);
167 }
168 
/* Message handler: closes the base bdev descriptor on the thread it was opened on. */
static void
_zone_block_destruct(void *ctx)
{
	struct spdk_bdev_desc *desc = ctx;

	spdk_bdev_close(desc);
}
176 
/* fn_table destruct callback: tear down one vbdev. Removes it from the global
 * list, releases the claim on the base bdev, closes the base descriptor
 * (bounced to the opening thread if needed, since descriptors must be closed
 * on the thread that opened them), and unregisters the io_device.
 */
static int
zone_block_destruct(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));

	/* Close the underlying bdev on its same opened thread. */
	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
	} else {
		spdk_bdev_close(bdev_node->base_desc);
	}

	/* Unregister the io_device. Memory is freed in _device_unregister_cb. */
	spdk_io_device_unregister(bdev_node, _device_unregister_cb);

	return 0;
}
199 
200 static struct block_zone *
201 zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
202 {
203 	size_t index = lba >> bdev_node->zone_shift;
204 
205 	if (index >= bdev_node->num_zones) {
206 		return NULL;
207 	}
208 
209 	return &bdev_node->zones[index];
210 }
211 
212 static struct block_zone *
213 zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
214 {
215 	struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
216 
217 	if (zone && zone->zone_info.zone_id == start_lba) {
218 		return zone;
219 	} else {
220 		return NULL;
221 	}
222 }
223 
224 static int
225 zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
226 {
227 	struct block_zone *zone;
228 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
229 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
230 	size_t i;
231 
232 	/* User can request info for more zones than exist, need to check both internal and user
233 	 * boundaries
234 	 */
235 	for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
236 		zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
237 		if (!zone) {
238 			return -EINVAL;
239 		}
240 		memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
241 	}
242 
243 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
244 	return 0;
245 }
246 
247 static int
248 zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
249 {
250 	pthread_spin_lock(&zone->lock);
251 
252 	switch (zone->zone_info.state) {
253 	case SPDK_BDEV_ZONE_STATE_EMPTY:
254 	case SPDK_BDEV_ZONE_STATE_OPEN:
255 	case SPDK_BDEV_ZONE_STATE_CLOSED:
256 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
257 		pthread_spin_unlock(&zone->lock);
258 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
259 		return 0;
260 	default:
261 		pthread_spin_unlock(&zone->lock);
262 		return -EINVAL;
263 	}
264 }
265 
266 static void
267 _zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
268 {
269 	struct spdk_bdev_io *orig_io = cb_arg;
270 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
271 
272 	/* Complete the original IO and then free the one that we created here
273 	 * as a result of issuing an IO via submit_reqeust.
274 	 */
275 	spdk_bdev_io_complete(orig_io, status);
276 	spdk_bdev_free_io(bdev_io);
277 }
278 
/* ZONE_RESET handler: return the zone to EMPTY and unmap its blocks on the
 * base bdev. Resetting an already-EMPTY zone is a no-op success. Note the
 * zone state and write pointer are updated before the unmap is submitted;
 * the parent IO completes from _zone_block_complete_unmap.
 */
static int
zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_FULL:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
		zone->zone_info.write_pointer = zone->zone_info.zone_id;
		pthread_spin_unlock(&zone->lock);
		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
					      zone->zone_info.zone_id, zone->zone_info.capacity,
					      _zone_block_complete_unmap, bdev_io);
	default:
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}
304 
305 static int
306 zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
307 {
308 	pthread_spin_lock(&zone->lock);
309 
310 	switch (zone->zone_info.state) {
311 	case SPDK_BDEV_ZONE_STATE_OPEN:
312 	case SPDK_BDEV_ZONE_STATE_CLOSED:
313 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
314 		pthread_spin_unlock(&zone->lock);
315 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
316 		return 0;
317 	default:
318 		pthread_spin_unlock(&zone->lock);
319 		return -EINVAL;
320 	}
321 }
322 
323 static int
324 zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
325 {
326 	pthread_spin_lock(&zone->lock);
327 
328 	zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
329 	zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
330 
331 	pthread_spin_unlock(&zone->lock);
332 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
333 	return 0;
334 }
335 
336 static int
337 zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
338 			   struct spdk_bdev_io *bdev_io)
339 {
340 	struct block_zone *zone;
341 
342 	zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
343 	if (!zone) {
344 		return -EINVAL;
345 	}
346 
347 	switch (bdev_io->u.zone_mgmt.zone_action) {
348 	case SPDK_BDEV_ZONE_RESET:
349 		return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
350 	case SPDK_BDEV_ZONE_OPEN:
351 		return zone_block_open_zone(zone, bdev_io);
352 	case SPDK_BDEV_ZONE_CLOSE:
353 		return zone_block_close_zone(zone, bdev_io);
354 	case SPDK_BDEV_ZONE_FINISH:
355 		return zone_block_finish_zone(zone, bdev_io);
356 	default:
357 		return -EINVAL;
358 	}
359 }
360 
361 static void
362 _zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
363 {
364 	struct spdk_bdev_io *orig_io = cb_arg;
365 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
366 
367 	if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
368 		orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
369 	}
370 
371 	/* Complete the original IO and then free the one that we created here
372 	 * as a result of issuing an IO via submit_reqeust.
373 	 */
374 	spdk_bdev_io_complete(orig_io, status);
375 	spdk_bdev_free_io(bdev_io);
376 }
377 
378 static int
379 zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
380 		 struct spdk_bdev_io *bdev_io)
381 {
382 	struct block_zone *zone;
383 	uint64_t len = bdev_io->u.bdev.num_blocks;
384 	uint64_t lba = bdev_io->u.bdev.offset_blocks;
385 	uint64_t num_blocks_left, wp;
386 	int rc = 0;
387 	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;
388 
389 	if (is_append) {
390 		zone = zone_block_get_zone_by_slba(bdev_node, lba);
391 	} else {
392 		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
393 	}
394 	if (!zone) {
395 		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%lx)\n", lba);
396 		return -EINVAL;
397 	}
398 
399 	pthread_spin_lock(&zone->lock);
400 
401 	switch (zone->zone_info.state) {
402 	case SPDK_BDEV_ZONE_STATE_OPEN:
403 	case SPDK_BDEV_ZONE_STATE_EMPTY:
404 	case SPDK_BDEV_ZONE_STATE_CLOSED:
405 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
406 		break;
407 	default:
408 		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
409 		rc = -EINVAL;
410 		goto write_fail;
411 	}
412 
413 	wp = zone->zone_info.write_pointer;
414 	if (is_append) {
415 		lba = wp;
416 	} else {
417 		if (lba != wp) {
418 			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%lx, wp 0x%lx)\n", lba, wp);
419 			rc = -EINVAL;
420 			goto write_fail;
421 		}
422 	}
423 
424 	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
425 	if (len > num_blocks_left) {
426 		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIu64 ", len 0x%lx, wp 0x%lx)\n", lba, len, wp);
427 		rc = -EINVAL;
428 		goto write_fail;
429 	}
430 
431 	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
432 	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
433 	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
434 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
435 	}
436 	pthread_spin_unlock(&zone->lock);
437 
438 	if (bdev_io->u.bdev.md_buf == NULL) {
439 		rc = spdk_bdev_writev_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
440 					     bdev_io->u.bdev.iovcnt, lba,
441 					     bdev_io->u.bdev.num_blocks, _zone_block_complete_write,
442 					     bdev_io);
443 	} else {
444 		rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
445 						     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
446 						     bdev_io->u.bdev.md_buf,
447 						     lba, bdev_io->u.bdev.num_blocks,
448 						     _zone_block_complete_write, bdev_io);
449 	}
450 
451 	return rc;
452 
453 write_fail:
454 	pthread_spin_unlock(&zone->lock);
455 	return rc;
456 }
457 
458 static void
459 _zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
460 {
461 	struct spdk_bdev_io *orig_io = cb_arg;
462 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
463 
464 	/* Complete the original IO and then free the one that we created here
465 	 * as a result of issuing an IO via submit_reqeust.
466 	 */
467 	spdk_bdev_io_complete(orig_io, status);
468 	spdk_bdev_free_io(bdev_io);
469 }
470 
471 static int
472 zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
473 		struct spdk_bdev_io *bdev_io)
474 {
475 	struct block_zone *zone;
476 	uint64_t len = bdev_io->u.bdev.num_blocks;
477 	uint64_t lba = bdev_io->u.bdev.offset_blocks;
478 	int rc;
479 
480 	zone = zone_block_get_zone_containing_lba(bdev_node, lba);
481 	if (!zone) {
482 		SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%lx)\n", lba);
483 		return -EINVAL;
484 	}
485 
486 	if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
487 		SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%lx, len 0x%lx)\n", lba, len);
488 		return -EINVAL;
489 	}
490 
491 	if (bdev_io->u.bdev.md_buf == NULL) {
492 		rc = spdk_bdev_readv_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
493 					    bdev_io->u.bdev.iovcnt, lba,
494 					    len, _zone_block_complete_read,
495 					    bdev_io);
496 	} else {
497 		rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
498 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
499 						    bdev_io->u.bdev.md_buf,
500 						    lba, len,
501 						    _zone_block_complete_read, bdev_io);
502 	}
503 
504 	return rc;
505 }
506 
/* fn_table submit_request callback: route each IO type to its handler.
 * Handlers return 0 once the IO is submitted or completed; any non-zero
 * return is translated to an IO completion here (NOMEM so the bdev layer can
 * retry, FAILED otherwise).
 */
static void
zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
	struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
	int rc = 0;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = zone_block_get_zone_info(bdev_node, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = zone_block_write(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		rc = zone_block_read(bdev_node, dev_ch, bdev_io);
		break;
	default:
		SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type);
		rc = -ENOTSUP;
		break;
	}

	if (rc != 0) {
		if (rc == -ENOMEM) {
			SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}
544 
545 static bool
546 zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
547 {
548 	switch (io_type) {
549 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
550 	case SPDK_BDEV_IO_TYPE_WRITE:
551 	case SPDK_BDEV_IO_TYPE_READ:
552 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
553 		return true;
554 	default:
555 		return false;
556 	}
557 }
558 
/* fn_table get_io_channel callback: hand out a channel for this io_device;
 * the per-channel base_ch is set up in _zone_block_ch_create_cb.
 */
static struct spdk_io_channel *
zone_block_get_io_channel(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	return spdk_get_io_channel(bdev_node);
}
566 
/* fn_table dump_info_json callback: emit this vbdev's parameters under a
 * "zoned_block" object for informational output (e.g. bdev_get_bdevs).
 */
static int
zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
	struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);

	spdk_json_write_name(w, "zoned_block");
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
	spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
	spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
	spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
	spdk_json_write_object_end(w);

	return 0;
}
583 
584 /* When we register our vbdev this is how we specify our entry points. */
/* When we register our vbdev this is how we specify our entry points. */
static const struct spdk_bdev_fn_table zone_block_fn_table = {
	.destruct		= zone_block_destruct,
	.submit_request		= zone_block_submit_request,
	.io_type_supported	= zone_block_io_type_supported,
	.get_io_channel		= zone_block_get_io_channel,
	.dump_info_json		= zone_block_dump_info_json,
};
592 
593 static void
594 zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
595 {
596 	struct bdev_zone_block *bdev_node, *tmp;
597 
598 	TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
599 		if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
600 			spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
601 		}
602 	}
603 }
604 
605 static void
606 zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
607 			      void *event_ctx)
608 {
609 	switch (type) {
610 	case SPDK_BDEV_EVENT_REMOVE:
611 		zone_block_base_bdev_hotremove_cb(bdev);
612 		break;
613 	default:
614 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
615 		break;
616 	}
617 }
618 
619 static int
620 _zone_block_ch_create_cb(void *io_device, void *ctx_buf)
621 {
622 	struct zone_block_io_channel *bdev_ch = ctx_buf;
623 	struct bdev_zone_block *bdev_node = io_device;
624 
625 	bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
626 	if (!bdev_ch->base_ch) {
627 		return -ENOMEM;
628 	}
629 
630 	return 0;
631 }
632 
/* io_device channel-destroy callback: release the base bdev channel taken
 * in _zone_block_ch_create_cb.
 */
static void
_zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
{
	struct zone_block_io_channel *bdev_ch = ctx_buf;

	spdk_put_io_channel(bdev_ch->base_ch);
}
640 
641 static int
642 zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
643 		       uint64_t optimal_open_zones)
644 {
645 	struct bdev_zone_block_config *name;
646 
647 	TAILQ_FOREACH(name, &g_bdev_configs, link) {
648 		if (strcmp(vbdev_name, name->vbdev_name) == 0) {
649 			SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
650 			return -EEXIST;
651 		}
652 		if (strcmp(bdev_name, name->bdev_name) == 0) {
653 			SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
654 			return -EEXIST;
655 		}
656 	}
657 
658 	name = calloc(1, sizeof(*name));
659 	if (!name) {
660 		SPDK_ERRLOG("could not allocate bdev_names\n");
661 		return -ENOMEM;
662 	}
663 
664 	name->bdev_name = strdup(bdev_name);
665 	if (!name->bdev_name) {
666 		SPDK_ERRLOG("could not allocate name->bdev_name\n");
667 		free(name);
668 		return -ENOMEM;
669 	}
670 
671 	name->vbdev_name = strdup(vbdev_name);
672 	if (!name->vbdev_name) {
673 		SPDK_ERRLOG("could not allocate name->vbdev_name\n");
674 		free(name->bdev_name);
675 		free(name);
676 		return -ENOMEM;
677 	}
678 
679 	name->zone_capacity = zone_capacity;
680 	name->optimal_open_zones = optimal_open_zones;
681 
682 	TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
683 
684 	return 0;
685 }
686 
687 static int
688 zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
689 {
690 	size_t i;
691 	struct block_zone *zone;
692 	int rc = 0;
693 
694 	for (i = 0; i < bdev_node->num_zones; i++) {
695 		zone = &bdev_node->zones[i];
696 		zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
697 		zone->zone_info.capacity = bdev_node->zone_capacity;
698 		zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
699 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
700 		if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
701 			SPDK_ERRLOG("pthread_spin_init() failed\n");
702 			rc = -ENOMEM;
703 			break;
704 		}
705 	}
706 
707 	if (rc) {
708 		for (; i > 0; i--) {
709 			pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
710 		}
711 	}
712 
713 	return rc;
714 }
715 
/* Create and register the zoned vbdev(s) configured on top of base_bdev_name.
 * Walks the config list for matches; for each match it opens and claims the
 * base bdev, builds the bdev_zone_block node (zone_size is the zone capacity
 * rounded up to a power of two so LBA->zone mapping is a shift), registers
 * the io_device and the bdev. Returns -ENODEV (without consuming the config)
 * when the base bdev does not exist yet, so a later examine() can retry; on
 * any other failure the goto chain unwinds exactly what was set up and the
 * matching config entry is removed.
 */
static int
zone_block_register(const char *base_bdev_name)
{
	struct spdk_bdev_desc *base_desc;
	struct spdk_bdev *base_bdev;
	struct bdev_zone_block_config *name, *tmp;
	struct bdev_zone_block *bdev_node;
	uint64_t zone_size;
	int rc = 0;

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the bdev_node & bdev accordingly.
	 */
	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
		if (strcmp(name->bdev_name, base_bdev_name) != 0) {
			continue;
		}

		rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb,
					NULL, &base_desc);
		if (rc == -ENODEV) {
			/* Base bdev not present yet; keep the config for later. */
			return -ENODEV;
		} else if (rc) {
			SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name);
			goto free_config;
		}

		base_bdev = spdk_bdev_desc_get_bdev(base_desc);

		/* Stacking a zoned vbdev on an already-zoned bdev is not supported. */
		if (spdk_bdev_is_zoned(base_bdev)) {
			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name);
			rc = -EEXIST;
			goto zone_exist;
		}

		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
		if (!bdev_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node\n");
			goto zone_exist;
		}

		bdev_node->base_desc = base_desc;

		/* The base bdev that we're attaching to. */
		bdev_node->bdev.name = strdup(name->vbdev_name);
		if (!bdev_node->bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node name\n");
			goto strdup_failed;
		}

		/* Round capacity up to a power of two so zone lookup is a shift. */
		zone_size = spdk_align64pow2(name->zone_capacity);
		if (zone_size == 0) {
			rc = -EINVAL;
			SPDK_ERRLOG("invalid zone size\n");
			goto roundup_failed;
		}

		bdev_node->zone_shift = spdk_u64log2(zone_size);
		bdev_node->num_zones = base_bdev->blockcnt / zone_size;

		/* Align num_zones to optimal_open_zones */
		bdev_node->num_zones -= bdev_node->num_zones % name->optimal_open_zones;
		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
		if (!bdev_node->zones) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate zones\n");
			goto calloc_failed;
		}

		bdev_node->bdev.product_name = "zone_block";

		/* Copy some properties from the underlying base bdev. */
		bdev_node->bdev.write_cache = base_bdev->write_cache;
		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;

		bdev_node->bdev.blocklen = base_bdev->blocklen;
		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;

		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
			SPDK_DEBUGLOG(vbdev_zone_block,
				      "Lost %lu blocks due to zone capacity and base bdev size misalignment\n",
				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
		}

		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;

		/* Inherit metadata/DIF layout from the base bdev. */
		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
		bdev_node->bdev.md_len = base_bdev->md_len;
		bdev_node->bdev.dif_type = base_bdev->dif_type;
		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;

		bdev_node->bdev.zoned = true;
		bdev_node->bdev.ctxt = bdev_node;
		bdev_node->bdev.fn_table = &zone_block_fn_table;
		bdev_node->bdev.module = &bdev_zoned_if;

		/* bdev specific info */
		bdev_node->bdev.zone_size = zone_size;

		bdev_node->zone_capacity = name->zone_capacity;
		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
		bdev_node->bdev.max_open_zones = 0;
		rc = zone_block_init_zone_info(bdev_node);
		if (rc) {
			SPDK_ERRLOG("could not init zone info\n");
			goto zone_info_failed;
		}

		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);

		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
					sizeof(struct zone_block_io_channel),
					name->vbdev_name);

		/* Save the thread where the base device is opened */
		bdev_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name);
			goto claim_failed;
		}

		rc = spdk_bdev_register(&bdev_node->bdev);
		if (rc) {
			SPDK_ERRLOG("could not register zoned bdev\n");
			goto register_failed;
		}
	}

	return rc;

	/* Unwind in reverse order of setup; each label releases one resource. */
register_failed:
	spdk_bdev_module_release_bdev(&bdev_node->bdev);
claim_failed:
	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
	spdk_io_device_unregister(bdev_node, NULL);
zone_info_failed:
	free(bdev_node->zones);
calloc_failed:
roundup_failed:
	free(bdev_node->bdev.name);
strdup_failed:
	free(bdev_node);
zone_exist:
	spdk_bdev_close(base_desc);
free_config:
	zone_block_remove_config(name);
	return rc;
}
870 
/* RPC entry point: record a zoned vbdev configuration and create the vbdev if
 * the base bdev already exists. zone_capacity and optimal_open_zones must be
 * non-zero. A missing base bdev is not an error; creation is deferred until
 * it appears (via examine).
 */
int
vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
			uint64_t optimal_open_zones)
{
	int rc = 0;

	if (zone_capacity == 0) {
		SPDK_ERRLOG("Zone capacity can't be 0\n");
		return -EINVAL;
	}

	if (optimal_open_zones == 0) {
		SPDK_ERRLOG("Optimal open zones can't be 0\n");
		return -EINVAL;
	}

	/* Insert the bdev into our global name list even if it doesn't exist yet,
	 * it may show up soon...
	 */
	rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
	if (rc) {
		return rc;
	}

	rc = zone_block_register(bdev_name);
	if (rc == -ENODEV) {
		/* This is not an error, even though the bdev is not present at this time it may
		 * still show up later.
		 */
		rc = 0;
	}
	return rc;
}
904 
905 void
906 vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
907 {
908 	struct bdev_zone_block_config *name_node;
909 	struct spdk_bdev *bdev = NULL;
910 
911 	bdev = spdk_bdev_get_by_name(name);
912 	if (!bdev || bdev->module != &bdev_zoned_if) {
913 		cb_fn(cb_arg, -ENODEV);
914 		return;
915 	}
916 
917 	TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
918 		if (strcmp(name_node->vbdev_name, bdev->name) == 0) {
919 			zone_block_remove_config(name_node);
920 			break;
921 		}
922 	}
923 
924 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
925 }
926 
/* Module examine_config hook: called for each newly registered bdev; creates
 * any configured vbdev that was waiting for this base bdev, then signals the
 * bdev layer that examination is done.
 */
static void
zone_block_examine(struct spdk_bdev *bdev)
{
	zone_block_register(bdev->name);

	spdk_bdev_module_examine_done(&bdev_zoned_if);
}

SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
936