xref: /spdk/module/bdev/zone_block/vbdev_zone_block.c (revision 4e8e97c886e47e337dc470ac8c1ffa044d729af0)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "spdk/stdinc.h"
35 
36 #include "vbdev_zone_block.h"
37 
38 #include "spdk/config.h"
39 #include "spdk/nvme.h"
40 #include "spdk/bdev_zone.h"
41 
42 #include "spdk/log.h"
43 
44 static int zone_block_init(void);
45 static int zone_block_get_ctx_size(void);
46 static void zone_block_finish(void);
47 static int zone_block_config_json(struct spdk_json_write_ctx *w);
48 static void zone_block_examine(struct spdk_bdev *bdev);
49 
50 static struct spdk_bdev_module bdev_zoned_if = {
51 	.name = "bdev_zoned_block",
52 	.module_init = zone_block_init,
53 	.module_fini = zone_block_finish,
54 	.config_text = NULL,
55 	.config_json = zone_block_config_json,
56 	.examine_config = zone_block_examine,
57 	.get_ctx_size = zone_block_get_ctx_size,
58 };
59 
60 SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
61 
62 /* List of zoned vbdev names and their base bdevs, recorded when a vbdev is created.
63  * Kept so that examine() can match base bdevs that register later.
64  */
65 struct bdev_zone_block_config {
66 	char					*vbdev_name;
67 	char					*bdev_name;
68 	uint64_t				zone_capacity;
69 	uint64_t				optimal_open_zones;
70 	TAILQ_ENTRY(bdev_zone_block_config)	link;
71 };
72 static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
73 
74 struct block_zone {
75 	struct spdk_bdev_zone_info zone_info;
76 	pthread_spinlock_t lock;
77 };
78 
79 /* List of block vbdevs and associated info for each. */
80 struct bdev_zone_block {
81 	struct spdk_bdev		bdev;    /* the block zoned bdev */
82 	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
83 	struct block_zone		*zones; /* array of zones */
84 	uint64_t			num_zones; /* number of zones */
85 	uint64_t			zone_capacity; /* zone capacity */
86 	uint64_t			zone_shift; /* log2 of zone_size */
87 	TAILQ_ENTRY(bdev_zone_block)	link;
88 	struct spdk_thread		*thread; /* thread where base device is opened */
89 };
90 static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
91 
92 struct zone_block_io_channel {
93 	struct spdk_io_channel	*base_ch; /* IO channel of base device */
94 };
95 
96 struct zone_block_io {
97 	/* vbdev to which IO was issued */
98 	struct bdev_zone_block *bdev_zone_block;
99 };
100 
101 static int
102 zone_block_init(void)
103 {
104 	return 0;
105 }
106 
107 static void
108 zone_block_remove_config(struct bdev_zone_block_config *name)
109 {
110 	TAILQ_REMOVE(&g_bdev_configs, name, link);
111 	free(name->bdev_name);
112 	free(name->vbdev_name);
113 	free(name);
114 }
115 
116 static void
117 zone_block_finish(void)
118 {
119 	struct bdev_zone_block_config *name;
120 
121 	while ((name = TAILQ_FIRST(&g_bdev_configs))) {
122 		zone_block_remove_config(name);
123 	}
124 }
125 
126 static int
127 zone_block_get_ctx_size(void)
128 {
129 	return sizeof(struct zone_block_io);
130 }
131 
132 static int
133 zone_block_config_json(struct spdk_json_write_ctx *w)
134 {
135 	struct bdev_zone_block *bdev_node;
136 	struct spdk_bdev *base_bdev = NULL;
137 
138 	TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
139 		base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
140 		spdk_json_write_object_begin(w);
141 		spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
142 		spdk_json_write_named_object_begin(w, "params");
143 		spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
144 		spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
145 		spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
146 		spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
147 		spdk_json_write_object_end(w);
148 		spdk_json_write_object_end(w);
149 	}
150 
151 	return 0;
152 }
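/*
 * For illustration, the per-vbdev object written above looks roughly like the
 * following (bdev names and values are hypothetical):
 *
 *   {
 *     "method": "bdev_zone_block_create",
 *     "params": {
 *       "base_bdev": "Nvme0n1",
 *       "name": "zone0",
 *       "zone_capacity": 4096,
 *       "optimal_open_zones": 1
 *     }
 *   }
 */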
153 
154 /* Callback invoked once the io_device is unregistered; frees per-device resources. */
155 static void
156 _device_unregister_cb(void *io_device)
157 {
158 	struct bdev_zone_block *bdev_node = io_device;
159 	uint64_t i;
160 
161 	free(bdev_node->bdev.name);
162 	for (i = 0; i < bdev_node->num_zones; i++) {
163 		pthread_spin_destroy(&bdev_node->zones[i].lock);
164 	}
165 	free(bdev_node->zones);
166 	free(bdev_node);
167 }
168 
169 static void
170 _zone_block_destruct(void *ctx)
171 {
172 	struct spdk_bdev_desc *desc = ctx;
173 
174 	spdk_bdev_close(desc);
175 }
176 
177 static int
178 zone_block_destruct(void *ctx)
179 {
180 	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
181 
182 	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
183 
184 	/* Unclaim the underlying bdev. */
185 	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));
186 
187 	/* Close the underlying bdev on the thread it was opened on. */
188 	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
189 		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
190 	} else {
191 		spdk_bdev_close(bdev_node->base_desc);
192 	}
193 
194 	/* Unregister the io_device. */
195 	spdk_io_device_unregister(bdev_node, _device_unregister_cb);
196 
197 	return 0;
198 }
199 
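/*
 * Zone lookup helpers: the emulated zone_size is a power of two, so the zone
 * index of an LBA is simply lba >> zone_shift. For example, with
 * zone_size = 0x1000 (zone_shift = 12), lba 0x2345 falls in zone index 2
 * (zone_id 0x2000).
 */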
200 static struct block_zone *
201 zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
202 {
203 	size_t index = lba >> bdev_node->zone_shift;
204 
205 	if (index >= bdev_node->num_zones) {
206 		return NULL;
207 	}
208 
209 	return &bdev_node->zones[index];
210 }
211 
212 static struct block_zone *
213 zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
214 {
215 	struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
216 
217 	if (zone && zone->zone_info.zone_id == start_lba) {
218 		return zone;
219 	} else {
220 		return NULL;
221 	}
222 }
223 
224 static int
225 zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
226 {
227 	struct block_zone *zone;
228 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
229 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
230 	size_t i;
231 
232 	/* The user can request info for more zones than exist, so validate every requested
233 	 * zone ID against the device's zone layout before copying.
234 	 */
235 	for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
236 		zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
237 		if (!zone) {
238 			return -EINVAL;
239 		}
240 		memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
241 	}
242 
243 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
244 	return 0;
245 }
246 
247 static int
248 zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
249 {
250 	pthread_spin_lock(&zone->lock);
251 
252 	switch (zone->zone_info.state) {
253 	case SPDK_BDEV_ZONE_STATE_EMPTY:
254 	case SPDK_BDEV_ZONE_STATE_OPEN:
255 	case SPDK_BDEV_ZONE_STATE_CLOSED:
256 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
257 		pthread_spin_unlock(&zone->lock);
258 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
259 		return 0;
260 	default:
261 		pthread_spin_unlock(&zone->lock);
262 		return -EINVAL;
263 	}
264 }
265 
266 static void
267 _zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
268 {
269 	struct spdk_bdev_io *orig_io = cb_arg;
270 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
271 
272 	/* Complete the original IO and then free the one that we created here
273 	 * as a result of issuing an IO via submit_request.
274 	 */
275 	spdk_bdev_io_complete(orig_io, status);
276 	spdk_bdev_free_io(bdev_io);
277 }
278 
279 static int
280 zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
281 		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
282 {
283 	pthread_spin_lock(&zone->lock);
284 
285 	switch (zone->zone_info.state) {
286 	case SPDK_BDEV_ZONE_STATE_EMPTY:
287 		pthread_spin_unlock(&zone->lock);
288 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
289 		return 0;
290 	case SPDK_BDEV_ZONE_STATE_OPEN:
291 	case SPDK_BDEV_ZONE_STATE_FULL:
292 	case SPDK_BDEV_ZONE_STATE_CLOSED:
293 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
294 		zone->zone_info.write_pointer = zone->zone_info.zone_id;
295 		pthread_spin_unlock(&zone->lock);
296 		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
297 					      zone->zone_info.zone_id, zone->zone_info.capacity,
298 					      _zone_block_complete_unmap, bdev_io);
299 	default:
300 		pthread_spin_unlock(&zone->lock);
301 		return -EINVAL;
302 	}
303 }
304 
305 static int
306 zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
307 {
308 	pthread_spin_lock(&zone->lock);
309 
310 	switch (zone->zone_info.state) {
311 	case SPDK_BDEV_ZONE_STATE_OPEN:
312 	case SPDK_BDEV_ZONE_STATE_CLOSED:
313 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
314 		pthread_spin_unlock(&zone->lock);
315 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
316 		return 0;
317 	default:
318 		pthread_spin_unlock(&zone->lock);
319 		return -EINVAL;
320 	}
321 }
322 
323 static int
324 zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
325 {
326 	pthread_spin_lock(&zone->lock);
327 
328 	zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
329 	zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
330 
331 	pthread_spin_unlock(&zone->lock);
332 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
333 	return 0;
334 }
335 
336 static int
337 zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
338 			   struct spdk_bdev_io *bdev_io)
339 {
340 	struct block_zone *zone;
341 
342 	zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
343 	if (!zone) {
344 		return -EINVAL;
345 	}
346 
347 	switch (bdev_io->u.zone_mgmt.zone_action) {
348 	case SPDK_BDEV_ZONE_RESET:
349 		return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
350 	case SPDK_BDEV_ZONE_OPEN:
351 		return zone_block_open_zone(zone, bdev_io);
352 	case SPDK_BDEV_ZONE_CLOSE:
353 		return zone_block_close_zone(zone, bdev_io);
354 	case SPDK_BDEV_ZONE_FINISH:
355 		return zone_block_finish_zone(zone, bdev_io);
356 	default:
357 		return -EINVAL;
358 	}
359 }
360 
361 static void
362 _zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
363 {
364 	struct spdk_bdev_io *orig_io = cb_arg;
365 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
366 
367 	if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
368 		orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
369 	}
370 
371 	/* Complete the original IO and then free the one that we created here
372 	 * as a result of issuing an IO via submit_request.
373 	 */
374 	spdk_bdev_io_complete(orig_io, status);
375 	spdk_bdev_free_io(bdev_io);
376 }
377 
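/*
 * Handle WRITE and ZONE_APPEND. A regular write must start exactly at the
 * zone's current write pointer; an append is redirected to the write pointer
 * and the LBA that was actually written is reported back through
 * offset_blocks in _zone_block_complete_write(). The write pointer is
 * advanced (and the zone possibly marked FULL) before the I/O is forwarded
 * to the base bdev.
 */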
378 static int
379 zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
380 		 struct spdk_bdev_io *bdev_io)
381 {
382 	struct block_zone *zone;
383 	uint64_t len = bdev_io->u.bdev.num_blocks;
384 	uint64_t lba = bdev_io->u.bdev.offset_blocks;
385 	uint64_t num_blocks_left, wp;
386 	int rc = 0;
387 	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;
388 
389 	if (is_append) {
390 		zone = zone_block_get_zone_by_slba(bdev_node, lba);
391 	} else {
392 		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
393 	}
394 	if (!zone) {
395 		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba);
396 		return -EINVAL;
397 	}
398 
399 	pthread_spin_lock(&zone->lock);
400 
401 	switch (zone->zone_info.state) {
402 	case SPDK_BDEV_ZONE_STATE_OPEN:
403 	case SPDK_BDEV_ZONE_STATE_EMPTY:
404 	case SPDK_BDEV_ZONE_STATE_CLOSED:
405 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
406 		break;
407 	default:
408 		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
409 		rc = -EINVAL;
410 		goto write_fail;
411 	}
412 
413 	wp = zone->zone_info.write_pointer;
414 	if (is_append) {
415 		lba = wp;
416 	} else {
417 		if (lba != wp) {
418 			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n", lba, wp);
419 			rc = -EINVAL;
420 			goto write_fail;
421 		}
422 	}
423 
424 	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
425 	if (len > num_blocks_left) {
426 		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n", lba, len, wp);
427 		rc = -EINVAL;
428 		goto write_fail;
429 	}
430 
431 	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
432 	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
433 	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
434 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
435 	}
436 	pthread_spin_unlock(&zone->lock);
437 
438 	if (bdev_io->u.bdev.md_buf == NULL) {
439 		rc = spdk_bdev_writev_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
440 					     bdev_io->u.bdev.iovcnt, lba,
441 					     bdev_io->u.bdev.num_blocks, _zone_block_complete_write,
442 					     bdev_io);
443 	} else {
444 		rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
445 						     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
446 						     bdev_io->u.bdev.md_buf,
447 						     lba, bdev_io->u.bdev.num_blocks,
448 						     _zone_block_complete_write, bdev_io);
449 	}
450 
451 	return rc;
452 
453 write_fail:
454 	pthread_spin_unlock(&zone->lock);
455 	return rc;
456 }
457 
458 static void
459 _zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
460 {
461 	struct spdk_bdev_io *orig_io = cb_arg;
462 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
463 
464 	/* Complete the original IO and then free the one that we created here
465 	 * as a result of issuing an IO via submit_request.
466 	 */
467 	spdk_bdev_io_complete(orig_io, status);
468 	spdk_bdev_free_io(bdev_io);
469 }
470 
471 static int
472 zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
473 		struct spdk_bdev_io *bdev_io)
474 {
475 	struct block_zone *zone;
476 	uint64_t len = bdev_io->u.bdev.num_blocks;
477 	uint64_t lba = bdev_io->u.bdev.offset_blocks;
478 	int rc;
479 
480 	zone = zone_block_get_zone_containing_lba(bdev_node, lba);
481 	if (!zone) {
482 		SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba);
483 		return -EINVAL;
484 	}
485 
486 	if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
487 		SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len);
488 		return -EINVAL;
489 	}
490 
491 	if (bdev_io->u.bdev.md_buf == NULL) {
492 		rc = spdk_bdev_readv_blocks(bdev_node->base_desc, ch->base_ch, bdev_io->u.bdev.iovs,
493 					    bdev_io->u.bdev.iovcnt, lba,
494 					    len, _zone_block_complete_read,
495 					    bdev_io);
496 	} else {
497 		rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
498 						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
499 						    bdev_io->u.bdev.md_buf,
500 						    lba, len,
501 						    _zone_block_complete_read, bdev_io);
502 	}
503 
504 	return rc;
505 }
506 
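/*
 * Main I/O entry point. Zone management and zone-info requests are handled
 * within this module (a reset additionally unmaps the zone on the base bdev);
 * reads, writes and appends are validated against the zone state and then
 * passed through to the base bdev. Any nonzero return code from a handler
 * completes the I/O as NOMEM or FAILED.
 */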
507 static void
508 zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
509 {
510 	struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
511 	struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
512 	int rc = 0;
513 
514 	switch (bdev_io->type) {
515 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
516 		rc = zone_block_get_zone_info(bdev_node, bdev_io);
517 		break;
518 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
519 		rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
520 		break;
521 	case SPDK_BDEV_IO_TYPE_WRITE:
522 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
523 		rc = zone_block_write(bdev_node, dev_ch, bdev_io);
524 		break;
525 	case SPDK_BDEV_IO_TYPE_READ:
526 		rc = zone_block_read(bdev_node, dev_ch, bdev_io);
527 		break;
528 	default:
529 		SPDK_ERRLOG("vbdev_zone_block: unsupported I/O type %u\n", bdev_io->type);
530 		rc = -ENOTSUP;
531 		break;
532 	}
533 
534 	if (rc != 0) {
535 		if (rc == -ENOMEM) {
536 			SPDK_WARNLOG("ENOMEM, completing IO as NOMEM so the bdev layer can retry it.\n");
537 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
538 		} else {
539 			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
540 			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
541 		}
542 	}
543 }
544 
545 static bool
546 zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
547 {
548 	switch (io_type) {
549 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
550 	case SPDK_BDEV_IO_TYPE_WRITE:
551 	case SPDK_BDEV_IO_TYPE_READ:
552 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
553 		return true;
554 	default:
555 		return false;
556 	}
557 }
558 
559 static struct spdk_io_channel *
560 zone_block_get_io_channel(void *ctx)
561 {
562 	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
563 
564 	return spdk_get_io_channel(bdev_node);
565 }
566 
567 static int
568 zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
569 {
570 	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
571 	struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
572 
573 	spdk_json_write_name(w, "zoned_block");
574 	spdk_json_write_object_begin(w);
575 	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
576 	spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
577 	spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
578 	spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
579 	spdk_json_write_object_end(w);
580 
581 	return 0;
582 }
583 
584 /* When we register our vbdev this is how we specify our entry points. */
585 static const struct spdk_bdev_fn_table zone_block_fn_table = {
586 	.destruct		= zone_block_destruct,
587 	.submit_request		= zone_block_submit_request,
588 	.io_type_supported	= zone_block_io_type_supported,
589 	.get_io_channel		= zone_block_get_io_channel,
590 	.dump_info_json		= zone_block_dump_info_json,
591 };
592 
593 static void
594 zone_block_base_bdev_hotremove_cb(void *ctx)
595 {
596 	struct bdev_zone_block *bdev_node, *tmp;
597 	struct spdk_bdev *bdev_find = ctx;
598 
599 	TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
600 		if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
601 			spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
602 		}
603 	}
604 }
605 
606 static int
607 _zone_block_ch_create_cb(void *io_device, void *ctx_buf)
608 {
609 	struct zone_block_io_channel *bdev_ch = ctx_buf;
610 	struct bdev_zone_block *bdev_node = io_device;
611 
612 	bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
613 	if (!bdev_ch->base_ch) {
614 		return -ENOMEM;
615 	}
616 
617 	return 0;
618 }
619 
620 static void
621 _zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
622 {
623 	struct zone_block_io_channel *bdev_ch = ctx_buf;
624 
625 	spdk_put_io_channel(bdev_ch->base_ch);
626 }
627 
628 static int
629 zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
630 		       uint64_t optimal_open_zones)
631 {
632 	struct bdev_zone_block_config *name;
633 
634 	TAILQ_FOREACH(name, &g_bdev_configs, link) {
635 		if (strcmp(vbdev_name, name->vbdev_name) == 0) {
636 			SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
637 			return -EEXIST;
638 		}
639 		if (strcmp(bdev_name, name->bdev_name) == 0) {
640 			SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
641 			return -EEXIST;
642 		}
643 	}
644 
645 	name = calloc(1, sizeof(*name));
646 	if (!name) {
647 		SPDK_ERRLOG("could not allocate bdev_names\n");
648 		return -ENOMEM;
649 	}
650 
651 	name->bdev_name = strdup(bdev_name);
652 	if (!name->bdev_name) {
653 		SPDK_ERRLOG("could not allocate name->bdev_name\n");
654 		free(name);
655 		return -ENOMEM;
656 	}
657 
658 	name->vbdev_name = strdup(vbdev_name);
659 	if (!name->vbdev_name) {
660 		SPDK_ERRLOG("could not allocate name->vbdev_name\n");
661 		free(name->bdev_name);
662 		free(name);
663 		return -ENOMEM;
664 	}
665 
666 	name->zone_capacity = zone_capacity;
667 	name->optimal_open_zones = optimal_open_zones;
668 
669 	TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
670 
671 	return 0;
672 }
673 
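/*
 * Every zone starts out FULL with its write pointer at the end of the zone,
 * since the contents of the base bdev are unknown; a zone must be reset
 * (which unmaps its blocks on the base bdev) before it can be written.
 */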
674 static int
675 zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
676 {
677 	size_t i;
678 	struct block_zone *zone;
679 	int rc = 0;
680 
681 	for (i = 0; i < bdev_node->num_zones; i++) {
682 		zone = &bdev_node->zones[i];
683 		zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
684 		zone->zone_info.capacity = bdev_node->zone_capacity;
685 		zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
686 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
687 		if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
688 			SPDK_ERRLOG("pthread_spin_init() failed\n");
689 			rc = -ENOMEM;
690 			break;
691 		}
692 	}
693 
694 	if (rc) {
695 		for (; i > 0; i--) {
696 			pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
697 		}
698 	}
699 
700 	return rc;
701 }
702 
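/*
 * Create and register a zoned vbdev on top of base_bdev for each matching
 * config entry. The emulated zone_size is zone_capacity rounded up to a
 * power of two, and num_zones is rounded down to a multiple of
 * optimal_open_zones, so trailing blocks of the base bdev may go unused.
 */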
703 static int
704 zone_block_register(struct spdk_bdev *base_bdev)
705 {
706 	struct bdev_zone_block_config *name, *tmp;
707 	struct bdev_zone_block *bdev_node;
708 	uint64_t zone_size;
709 	int rc = 0;
710 
711 	/* Check our list of names from config versus this bdev and if
712 	 * there's a match, create the bdev_node & bdev accordingly.
713 	 */
714 	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
715 		if (strcmp(name->bdev_name, base_bdev->name) != 0) {
716 			continue;
717 		}
718 
719 		if (spdk_bdev_is_zoned(base_bdev)) {
720 			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev->name);
721 			rc = -EEXIST;
722 			goto free_config;
723 		}
724 
725 		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
726 		if (!bdev_node) {
727 			rc = -ENOMEM;
728 			SPDK_ERRLOG("could not allocate bdev_node\n");
729 			goto free_config;
730 		}
731 
732 		/* Name of the vbdev node we are creating on top of this base bdev. */
733 		bdev_node->bdev.name = strdup(name->vbdev_name);
734 		if (!bdev_node->bdev.name) {
735 			rc = -ENOMEM;
736 			SPDK_ERRLOG("could not allocate bdev_node name\n");
737 			goto strdup_failed;
738 		}
739 
740 		zone_size = spdk_align64pow2(name->zone_capacity);
741 		if (zone_size == 0) {
742 			rc = -EINVAL;
743 			SPDK_ERRLOG("invalid zone size\n");
744 			goto roundup_failed;
745 		}
746 
747 		bdev_node->zone_shift = spdk_u64log2(zone_size);
748 		bdev_node->num_zones = base_bdev->blockcnt / zone_size;
749 
750 		/* Round num_zones down to a multiple of optimal_open_zones */
751 		bdev_node->num_zones -= bdev_node->num_zones % name->optimal_open_zones;
752 		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
753 		if (!bdev_node->zones) {
754 			rc = -ENOMEM;
755 			SPDK_ERRLOG("could not allocate zones\n");
756 			goto calloc_failed;
757 		}
758 
759 		bdev_node->bdev.product_name = "zone_block";
760 
761 		/* Copy some properties from the underlying base bdev. */
762 		bdev_node->bdev.write_cache = base_bdev->write_cache;
763 		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
764 		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;
765 
766 		bdev_node->bdev.blocklen = base_bdev->blocklen;
767 		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;
768 
769 		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
770 			SPDK_DEBUGLOG(vbdev_zone_block,
771 				      "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n",
772 				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
773 		}
774 
775 		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;
776 
777 		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
778 		bdev_node->bdev.md_len = base_bdev->md_len;
779 		bdev_node->bdev.dif_type = base_bdev->dif_type;
780 		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
781 		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;
782 
783 		bdev_node->bdev.zoned = true;
784 		bdev_node->bdev.ctxt = bdev_node;
785 		bdev_node->bdev.fn_table = &zone_block_fn_table;
786 		bdev_node->bdev.module = &bdev_zoned_if;
787 
788 		/* bdev specific info */
789 		bdev_node->bdev.zone_size = zone_size;
790 
791 		bdev_node->zone_capacity = name->zone_capacity;
792 		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
793 		bdev_node->bdev.max_open_zones = 0;
794 		rc = zone_block_init_zone_info(bdev_node);
795 		if (rc) {
796 			SPDK_ERRLOG("could not init zone info\n");
797 			goto zone_info_failed;
798 		}
799 
800 		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);
801 
802 		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
803 					sizeof(struct zone_block_io_channel),
804 					name->vbdev_name);
805 
806 		rc = spdk_bdev_open(base_bdev, true, zone_block_base_bdev_hotremove_cb,
807 				    base_bdev, &bdev_node->base_desc);
808 		if (rc) {
809 			SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(base_bdev));
810 			goto open_failed;
811 		}
812 
813 		/* Save the thread where the base device is opened */
814 		bdev_node->thread = spdk_get_thread();
815 
816 		rc = spdk_bdev_module_claim_bdev(base_bdev, bdev_node->base_desc, bdev_node->bdev.module);
817 		if (rc) {
818 			SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(base_bdev));
819 			goto claim_failed;
820 		}
821 
822 		rc = spdk_bdev_register(&bdev_node->bdev);
823 		if (rc) {
824 			SPDK_ERRLOG("could not register zoned bdev\n");
825 			goto register_failed;
826 		}
827 	}
828 
829 	return rc;
830 
831 register_failed:
832 	spdk_bdev_module_release_bdev(&bdev_node->bdev);
833 claim_failed:
834 	spdk_bdev_close(bdev_node->base_desc);
835 open_failed:
836 	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
837 	spdk_io_device_unregister(bdev_node, NULL);
838 zone_info_failed:
839 	free(bdev_node->zones);
840 calloc_failed:
841 roundup_failed:
842 	free(bdev_node->bdev.name);
843 strdup_failed:
844 	free(bdev_node);
845 free_config:
846 	zone_block_remove_config(name);
847 	return rc;
848 }
849 
850 int
851 vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
852 			uint64_t optimal_open_zones)
853 {
854 	struct spdk_bdev *bdev = NULL;
855 	int rc = 0;
856 
857 	if (zone_capacity == 0) {
858 		SPDK_ERRLOG("Zone capacity can't be 0\n");
859 		return -EINVAL;
860 	}
861 
862 	if (optimal_open_zones == 0) {
863 		SPDK_ERRLOG("Optimal open zones can't be 0\n");
864 		return -EINVAL;
865 	}
866 
867 	/* Insert the name into our global config list even if the base bdev doesn't
868 	 * exist yet; it may show up soon...
869 	 */
870 	rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
871 	if (rc) {
872 		return rc;
873 	}
874 
875 	bdev = spdk_bdev_get_by_name(bdev_name);
876 	if (!bdev) {
877 		/* This is not an error; even though the bdev is not present at this time,
878 		 * it may still show up later.
879 		 */
880 		return 0;
881 	}
882 
883 	return zone_block_register(bdev);
884 }
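/*
 * Example (hypothetical names/values): create a zoned vbdev "zone0" with a
 * 4096-block zone capacity and one optimal open zone on top of base bdev
 * "Nvme0n1":
 *
 *   rc = vbdev_zone_block_create("Nvme0n1", "zone0", 4096, 1);
 */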
885 
886 void
887 vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
888 {
889 	struct bdev_zone_block_config *name_node;
890 	struct spdk_bdev *bdev = NULL;
891 
892 	bdev = spdk_bdev_get_by_name(name);
893 	if (!bdev || bdev->module != &bdev_zoned_if) {
894 		cb_fn(cb_arg, -ENODEV);
895 		return;
896 	}
897 
898 	TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
899 		if (strcmp(name_node->vbdev_name, bdev->name) == 0) {
900 			zone_block_remove_config(name_node);
901 			break;
902 		}
903 	}
904 
905 	spdk_bdev_unregister(bdev, cb_fn, cb_arg);
906 }
907 
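/*
 * examine_config callback: attempt to build a vbdev on top of every newly
 * registered bdev that matches a stored config entry, then immediately
 * signal that examine is done.
 */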
908 static void
909 zone_block_examine(struct spdk_bdev *bdev)
910 {
911 	zone_block_register(bdev);
912 
913 	spdk_bdev_module_examine_done(&bdev_zoned_if);
914 }
915 
916 SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
917