xref: /spdk/module/bdev/zone_block/vbdev_zone_block.c (revision 8afdeef3becfe9409cc9e7372bd0bc10e8b7d46d)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES.
4  *   All rights reserved.
5  */
6 
7 #include "spdk/stdinc.h"
8 
9 #include "vbdev_zone_block.h"
10 
11 #include "spdk/config.h"
12 #include "spdk/nvme.h"
13 #include "spdk/bdev_zone.h"
14 
15 #include "spdk/log.h"
16 
17 /* This namespace UUID was generated using uuid_generate() method. */
18 #define BDEV_ZONE_BLOCK_NAMESPACE_UUID "5f3f485a-d6bb-4443-9de7-023683b77389"
19 
20 static int zone_block_init(void);
21 static int zone_block_get_ctx_size(void);
22 static void zone_block_finish(void);
23 static int zone_block_config_json(struct spdk_json_write_ctx *w);
24 static void zone_block_examine(struct spdk_bdev *bdev);
25 
/* Module interface registered with the bdev layer: init/fini hooks, JSON
 * config dump, an examine callback for base bdevs that appear after the
 * config was recorded, and the per-IO context size.
 */
static struct spdk_bdev_module bdev_zoned_if = {
	.name = "bdev_zoned_block",
	.module_init = zone_block_init,
	.module_fini = zone_block_finish,
	.config_json = zone_block_config_json,
	.examine_config = zone_block_examine,
	.get_ctx_size = zone_block_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(bdev_zoned_block, &bdev_zoned_if)
36 
/* List of block vbdev names and their base bdevs via configuration file.
 * Used so we can parse the conf once at init and use this list in examine().
 */
struct bdev_zone_block_config {
	char					*vbdev_name;    /* name of the zoned vbdev to create */
	char					*bdev_name;     /* base bdev the vbdev stacks on */
	uint64_t				zone_capacity;  /* writable blocks per zone */
	uint64_t				optimal_open_zones;
	TAILQ_ENTRY(bdev_zone_block_config)	link;
};
static TAILQ_HEAD(, bdev_zone_block_config) g_bdev_configs = TAILQ_HEAD_INITIALIZER(g_bdev_configs);
48 
/* One emulated zone: the info reported to callers plus a spinlock that
 * serializes state and write-pointer updates across channels/threads.
 */
struct block_zone {
	struct spdk_bdev_zone_info zone_info;
	pthread_spinlock_t lock;
};
53 
/* List of block vbdevs and associated info for each. */
struct bdev_zone_block {
	struct spdk_bdev		bdev;    /* the block zoned bdev */
	struct spdk_bdev_desc		*base_desc; /* its descriptor we get from open */
	struct block_zone		*zones; /* array of zones */
	uint64_t			num_zones; /* number of zones */
	uint64_t			zone_capacity; /* zone capacity */
	uint64_t                        zone_shift; /* log2 of zone_size; zone lookup is a shift */
	TAILQ_ENTRY(bdev_zone_block)	link;
	struct spdk_thread		*thread; /* thread where base device is opened */
};
static TAILQ_HEAD(, bdev_zone_block) g_bdev_nodes = TAILQ_HEAD_INITIALIZER(g_bdev_nodes);
66 
/* Per-channel context: each vbdev channel wraps a channel on the base bdev. */
struct zone_block_io_channel {
	struct spdk_io_channel	*base_ch; /* IO channel of base device */
};

/* Per-IO context reserved by the bdev layer (see zone_block_get_ctx_size). */
struct zone_block_io {
	/* vbdev to which IO was issued */
	struct bdev_zone_block *bdev_zone_block;
};
75 
/* Module init hook: this module needs no global setup. */
static int
zone_block_init(void)
{
	return 0;
}
81 
82 static void
83 zone_block_remove_config(struct bdev_zone_block_config *name)
84 {
85 	TAILQ_REMOVE(&g_bdev_configs, name, link);
86 	free(name->bdev_name);
87 	free(name->vbdev_name);
88 	free(name);
89 }
90 
91 static void
92 zone_block_finish(void)
93 {
94 	struct bdev_zone_block_config *name;
95 
96 	while ((name = TAILQ_FIRST(&g_bdev_configs))) {
97 		zone_block_remove_config(name);
98 	}
99 }
100 
101 static int
102 zone_block_get_ctx_size(void)
103 {
104 	return sizeof(struct zone_block_io);
105 }
106 
107 static int
108 zone_block_config_json(struct spdk_json_write_ctx *w)
109 {
110 	struct bdev_zone_block *bdev_node;
111 	struct spdk_bdev *base_bdev = NULL;
112 
113 	TAILQ_FOREACH(bdev_node, &g_bdev_nodes, link) {
114 		base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
115 		spdk_json_write_object_begin(w);
116 		spdk_json_write_named_string(w, "method", "bdev_zone_block_create");
117 		spdk_json_write_named_object_begin(w, "params");
118 		spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
119 		spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
120 		spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
121 		spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
122 		spdk_json_write_object_end(w);
123 		spdk_json_write_object_end(w);
124 	}
125 
126 	return 0;
127 }
128 
129 /* Callback for unregistering the IO device. */
130 static void
131 _device_unregister_cb(void *io_device)
132 {
133 	struct bdev_zone_block *bdev_node = io_device;
134 	uint64_t i;
135 
136 	free(bdev_node->bdev.name);
137 	for (i = 0; i < bdev_node->num_zones; i++) {
138 		pthread_spin_destroy(&bdev_node->zones[i].lock);
139 	}
140 	free(bdev_node->zones);
141 	free(bdev_node);
142 }
143 
/* Thread-hopped helper: close the base bdev descriptor on the thread that
 * originally opened it (required by the bdev API).
 */
static void
_zone_block_destruct(void *ctx)
{
	spdk_bdev_close((struct spdk_bdev_desc *)ctx);
}
151 
/* bdev destruct entry point: tear down one zoned vbdev. Releases the claim
 * on the base bdev, closes the base descriptor (hopping to the opening
 * thread when necessary), and unregisters the io_device. Memory is freed
 * later in _device_unregister_cb once all channels are released.
 */
static int
zone_block_destruct(void *ctx)
{
	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;

	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);

	/* Unclaim the underlying bdev. */
	spdk_bdev_module_release_bdev(spdk_bdev_desc_get_bdev(bdev_node->base_desc));

	/* Close the underlying bdev on its same opened thread. */
	if (bdev_node->thread && bdev_node->thread != spdk_get_thread()) {
		spdk_thread_send_msg(bdev_node->thread, _zone_block_destruct, bdev_node->base_desc);
	} else {
		spdk_bdev_close(bdev_node->base_desc);
	}

	/* Unregister the io_device. */
	spdk_io_device_unregister(bdev_node, _device_unregister_cb);

	return 0;
}
174 
175 static struct block_zone *
176 zone_block_get_zone_containing_lba(struct bdev_zone_block *bdev_node, uint64_t lba)
177 {
178 	size_t index = lba >> bdev_node->zone_shift;
179 
180 	if (index >= bdev_node->num_zones) {
181 		return NULL;
182 	}
183 
184 	return &bdev_node->zones[index];
185 }
186 
187 static struct block_zone *
188 zone_block_get_zone_by_slba(struct bdev_zone_block *bdev_node, uint64_t start_lba)
189 {
190 	struct block_zone *zone = zone_block_get_zone_containing_lba(bdev_node, start_lba);
191 
192 	if (zone && zone->zone_info.zone_id == start_lba) {
193 		return zone;
194 	} else {
195 		return NULL;
196 	}
197 }
198 
199 static int
200 zone_block_get_zone_info(struct bdev_zone_block *bdev_node, struct spdk_bdev_io *bdev_io)
201 {
202 	struct block_zone *zone;
203 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
204 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
205 	size_t i;
206 
207 	/* User can request info for more zones than exist, need to check both internal and user
208 	 * boundaries
209 	 */
210 	for (i = 0; i < bdev_io->u.zone_mgmt.num_zones; i++, zone_id += bdev_node->bdev.zone_size) {
211 		zone = zone_block_get_zone_by_slba(bdev_node, zone_id);
212 		if (!zone) {
213 			return -EINVAL;
214 		}
215 		memcpy(&zone_info[i], &zone->zone_info, sizeof(*zone_info));
216 	}
217 
218 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
219 	return 0;
220 }
221 
222 static int
223 zone_block_open_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
224 {
225 	pthread_spin_lock(&zone->lock);
226 
227 	switch (zone->zone_info.state) {
228 	case SPDK_BDEV_ZONE_STATE_EMPTY:
229 	case SPDK_BDEV_ZONE_STATE_OPEN:
230 	case SPDK_BDEV_ZONE_STATE_CLOSED:
231 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
232 		pthread_spin_unlock(&zone->lock);
233 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
234 		return 0;
235 	default:
236 		pthread_spin_unlock(&zone->lock);
237 		return -EINVAL;
238 	}
239 }
240 
241 static void
242 _zone_block_complete_unmap(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
243 {
244 	struct spdk_bdev_io *orig_io = cb_arg;
245 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
246 
247 	/* Complete the original IO and then free the one that we created here
248 	 * as a result of issuing an IO via submit_request.
249 	 */
250 	spdk_bdev_io_complete(orig_io, status);
251 	spdk_bdev_free_io(bdev_io);
252 }
253 
/* Handle SPDK_BDEV_ZONE_RESET: return the zone to EMPTY, rewind its write
 * pointer, then best-effort unmap the zone's blocks on the base bdev. The
 * in-memory state is updated (and the lock dropped) before the async unmap
 * is issued; the IO then completes from _zone_block_complete_unmap, or
 * immediately when unmap is unnecessary or unsupported.
 */
static int
zone_block_reset_zone(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		      struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_EMPTY:
		/* Resetting an already-empty zone is a no-op. */
		pthread_spin_unlock(&zone->lock);
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
		return 0;
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_FULL:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_EMPTY;
		zone->zone_info.write_pointer = zone->zone_info.zone_id;
		pthread_spin_unlock(&zone->lock);

		/* The unmap isn't necessary, so if the base bdev doesn't support it, we're done */
		if (!spdk_bdev_io_type_supported(spdk_bdev_desc_get_bdev(bdev_node->base_desc),
						 SPDK_BDEV_IO_TYPE_UNMAP)) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
			return 0;
		}

		return spdk_bdev_unmap_blocks(bdev_node->base_desc, ch->base_ch,
					      zone->zone_info.zone_id, zone->zone_info.capacity,
					      _zone_block_complete_unmap, bdev_io);
	default:
		/* Any other state cannot be reset. */
		pthread_spin_unlock(&zone->lock);
		return -EINVAL;
	}
}
287 
288 static int
289 zone_block_close_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
290 {
291 	pthread_spin_lock(&zone->lock);
292 
293 	switch (zone->zone_info.state) {
294 	case SPDK_BDEV_ZONE_STATE_OPEN:
295 	case SPDK_BDEV_ZONE_STATE_CLOSED:
296 		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_CLOSED;
297 		pthread_spin_unlock(&zone->lock);
298 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
299 		return 0;
300 	default:
301 		pthread_spin_unlock(&zone->lock);
302 		return -EINVAL;
303 	}
304 }
305 
/* Handle SPDK_BDEV_ZONE_FINISH: advance the write pointer to the end of the
 * zone's capacity and mark the zone FULL.
 * NOTE(review): unlike open/close/reset, no state validation is performed —
 * the transition is applied from any current state; confirm intentional.
 */
static int
zone_block_finish_zone(struct block_zone *zone, struct spdk_bdev_io *bdev_io)
{
	pthread_spin_lock(&zone->lock);

	zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
	zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;

	pthread_spin_unlock(&zone->lock);
	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	return 0;
}
318 
319 static int
320 zone_block_zone_management(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
321 			   struct spdk_bdev_io *bdev_io)
322 {
323 	struct block_zone *zone;
324 
325 	zone = zone_block_get_zone_by_slba(bdev_node, bdev_io->u.zone_mgmt.zone_id);
326 	if (!zone) {
327 		return -EINVAL;
328 	}
329 
330 	switch (bdev_io->u.zone_mgmt.zone_action) {
331 	case SPDK_BDEV_ZONE_RESET:
332 		return zone_block_reset_zone(bdev_node, ch, zone, bdev_io);
333 	case SPDK_BDEV_ZONE_OPEN:
334 		return zone_block_open_zone(zone, bdev_io);
335 	case SPDK_BDEV_ZONE_CLOSE:
336 		return zone_block_close_zone(zone, bdev_io);
337 	case SPDK_BDEV_ZONE_FINISH:
338 		return zone_block_finish_zone(zone, bdev_io);
339 	default:
340 		return -EINVAL;
341 	}
342 }
343 
344 static void
345 _zone_block_complete_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
346 {
347 	struct spdk_bdev_io *orig_io = cb_arg;
348 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
349 
350 	if (success && orig_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND) {
351 		orig_io->u.bdev.offset_blocks = bdev_io->u.bdev.offset_blocks;
352 	}
353 
354 	/* Complete the original IO and then free the one that we created here
355 	 * as a result of issuing an IO via submit_request.
356 	 */
357 	spdk_bdev_io_complete(orig_io, status);
358 	spdk_bdev_free_io(bdev_io);
359 }
360 
/* Handle WRITE and ZONE_APPEND. Validates zone state and write pointer,
 * advances the write pointer under the zone spinlock, then issues the data
 * write to the base bdev. For appends, the target LBA is the current write
 * pointer and the landed LBA is reported back in _zone_block_complete_write.
 * Note the write pointer is advanced before the base write completes; the
 * completion callback does not roll it back on failure.
 */
static int
zone_block_write(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
		 struct spdk_bdev_io *bdev_io)
{
	struct block_zone *zone;
	uint64_t len = bdev_io->u.bdev.num_blocks;
	uint64_t lba = bdev_io->u.bdev.offset_blocks;
	uint64_t num_blocks_left, wp;
	int rc = 0;
	bool is_append = bdev_io->type == SPDK_BDEV_IO_TYPE_ZONE_APPEND;

	/* Appends address the zone by its start LBA; writes by any LBA inside it. */
	if (is_append) {
		zone = zone_block_get_zone_by_slba(bdev_node, lba);
	} else {
		zone = zone_block_get_zone_containing_lba(bdev_node, lba);
	}
	if (!zone) {
		SPDK_ERRLOG("Trying to write to invalid zone (lba 0x%" PRIx64 ")\n", lba);
		return -EINVAL;
	}

	pthread_spin_lock(&zone->lock);

	/* A write implicitly opens an EMPTY or CLOSED zone. */
	switch (zone->zone_info.state) {
	case SPDK_BDEV_ZONE_STATE_OPEN:
	case SPDK_BDEV_ZONE_STATE_EMPTY:
	case SPDK_BDEV_ZONE_STATE_CLOSED:
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_OPEN;
		break;
	default:
		SPDK_ERRLOG("Trying to write to zone in invalid state %u\n", zone->zone_info.state);
		rc = -EINVAL;
		goto write_fail;
	}

	wp = zone->zone_info.write_pointer;
	if (is_append) {
		/* Append always lands at the current write pointer. */
		lba = wp;
	} else {
		/* Sequential-write-required: regular writes must start at the wp. */
		if (lba != wp) {
			SPDK_ERRLOG("Trying to write to zone with invalid address (lba 0x%" PRIx64 ", wp 0x%" PRIx64 ")\n",
				    lba, wp);
			rc = -EINVAL;
			goto write_fail;
		}
	}

	num_blocks_left = zone->zone_info.zone_id + zone->zone_info.capacity - wp;
	if (len > num_blocks_left) {
		SPDK_ERRLOG("Write exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ", wp 0x%" PRIx64
			    ")\n", lba, len, wp);
		rc = -EINVAL;
		goto write_fail;
	}

	/* Reserve the range under the lock; zone becomes FULL when capacity is reached. */
	zone->zone_info.write_pointer += bdev_io->u.bdev.num_blocks;
	assert(zone->zone_info.write_pointer <= zone->zone_info.zone_id + zone->zone_info.capacity);
	if (zone->zone_info.write_pointer == zone->zone_info.zone_id + zone->zone_info.capacity) {
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
	}
	pthread_spin_unlock(&zone->lock);

	rc = spdk_bdev_writev_blocks_with_md(bdev_node->base_desc, ch->base_ch,
					     bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					     bdev_io->u.bdev.md_buf,
					     lba, bdev_io->u.bdev.num_blocks,
					     _zone_block_complete_write, bdev_io);

	return rc;

write_fail:
	pthread_spin_unlock(&zone->lock);
	return rc;
}
435 
436 static void
437 _zone_block_complete_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
438 {
439 	struct spdk_bdev_io *orig_io = cb_arg;
440 	int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
441 
442 	/* Complete the original IO and then free the one that we created here
443 	 * as a result of issuing an IO via submit_request.
444 	 */
445 	spdk_bdev_io_complete(orig_io, status);
446 	spdk_bdev_free_io(bdev_io);
447 }
448 
449 static int
450 zone_block_read(struct bdev_zone_block *bdev_node, struct zone_block_io_channel *ch,
451 		struct spdk_bdev_io *bdev_io)
452 {
453 	struct block_zone *zone;
454 	uint64_t len = bdev_io->u.bdev.num_blocks;
455 	uint64_t lba = bdev_io->u.bdev.offset_blocks;
456 	int rc;
457 
458 	zone = zone_block_get_zone_containing_lba(bdev_node, lba);
459 	if (!zone) {
460 		SPDK_ERRLOG("Trying to read from invalid zone (lba 0x%" PRIx64 ")\n", lba);
461 		return -EINVAL;
462 	}
463 
464 	if ((lba + len) > (zone->zone_info.zone_id + zone->zone_info.capacity)) {
465 		SPDK_ERRLOG("Read exceeds zone capacity (lba 0x%" PRIx64 ", len 0x%" PRIx64 ")\n", lba, len);
466 		return -EINVAL;
467 	}
468 
469 	rc = spdk_bdev_readv_blocks_with_md(bdev_node->base_desc, ch->base_ch,
470 					    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
471 					    bdev_io->u.bdev.md_buf,
472 					    lba, len,
473 					    _zone_block_complete_read, bdev_io);
474 
475 	return rc;
476 }
477 
/* Main IO dispatch for the vbdev: route each IO type to its handler. A
 * nonzero handler return fails the IO here; -ENOMEM maps to NOMEM status so
 * the bdev layer can retry the IO later.
 * NOTE(review): the ENOMEM log text says the IO will be queued, but this
 * path completes it with SPDK_BDEV_IO_STATUS_NOMEM rather than queueing
 * locally — the message is misleading; confirm before relying on it.
 */
static void
zone_block_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct bdev_zone_block *bdev_node = SPDK_CONTAINEROF(bdev_io->bdev, struct bdev_zone_block, bdev);
	struct zone_block_io_channel *dev_ch = spdk_io_channel_get_ctx(ch);
	int rc = 0;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = zone_block_get_zone_info(bdev_node, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = zone_block_zone_management(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = zone_block_write(bdev_node, dev_ch, bdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_READ:
		rc = zone_block_read(bdev_node, dev_ch, bdev_io);
		break;
	default:
		SPDK_ERRLOG("vbdev_block: unknown I/O type %u\n", bdev_io->type);
		rc = -ENOTSUP;
		break;
	}

	if (rc != 0) {
		if (rc == -ENOMEM) {
			SPDK_WARNLOG("ENOMEM, start to queue io for vbdev.\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			SPDK_ERRLOG("ERROR on bdev_io submission!\n");
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}
515 
516 static bool
517 zone_block_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
518 {
519 	switch (io_type) {
520 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
521 	case SPDK_BDEV_IO_TYPE_WRITE:
522 	case SPDK_BDEV_IO_TYPE_READ:
523 	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
524 		return true;
525 	default:
526 		return false;
527 	}
528 }
529 
/* Hand out an IO channel for this vbdev's registered io_device. */
static struct spdk_io_channel *
zone_block_get_io_channel(void *ctx)
{
	return spdk_get_io_channel((struct bdev_zone_block *)ctx);
}
537 
538 static int
539 zone_block_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
540 {
541 	struct bdev_zone_block *bdev_node = (struct bdev_zone_block *)ctx;
542 	struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(bdev_node->base_desc);
543 
544 	spdk_json_write_name(w, "zoned_block");
545 	spdk_json_write_object_begin(w);
546 	spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&bdev_node->bdev));
547 	spdk_json_write_named_string(w, "base_bdev", spdk_bdev_get_name(base_bdev));
548 	spdk_json_write_named_uint64(w, "zone_capacity", bdev_node->zone_capacity);
549 	spdk_json_write_named_uint64(w, "optimal_open_zones", bdev_node->bdev.optimal_open_zones);
550 	spdk_json_write_object_end(w);
551 
552 	return 0;
553 }
554 
/* When we register our vbdev this is how we specify our entry points.
 * destruct/teardown is asynchronous: memory is freed in
 * _device_unregister_cb after the io_device unregister completes.
 */
static const struct spdk_bdev_fn_table zone_block_fn_table = {
	.destruct		= zone_block_destruct,
	.submit_request		= zone_block_submit_request,
	.io_type_supported	= zone_block_io_type_supported,
	.get_io_channel		= zone_block_get_io_channel,
	.dump_info_json		= zone_block_dump_info_json,
};
563 
564 static void
565 zone_block_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
566 {
567 	struct bdev_zone_block *bdev_node, *tmp;
568 
569 	TAILQ_FOREACH_SAFE(bdev_node, &g_bdev_nodes, link, tmp) {
570 		if (bdev_find == spdk_bdev_desc_get_bdev(bdev_node->base_desc)) {
571 			spdk_bdev_unregister(&bdev_node->bdev, NULL, NULL);
572 		}
573 	}
574 }
575 
576 static void
577 zone_block_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
578 			      void *event_ctx)
579 {
580 	switch (type) {
581 	case SPDK_BDEV_EVENT_REMOVE:
582 		zone_block_base_bdev_hotremove_cb(bdev);
583 		break;
584 	default:
585 		SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
586 		break;
587 	}
588 }
589 
590 static int
591 _zone_block_ch_create_cb(void *io_device, void *ctx_buf)
592 {
593 	struct zone_block_io_channel *bdev_ch = ctx_buf;
594 	struct bdev_zone_block *bdev_node = io_device;
595 
596 	bdev_ch->base_ch = spdk_bdev_get_io_channel(bdev_node->base_desc);
597 	if (!bdev_ch->base_ch) {
598 		return -ENOMEM;
599 	}
600 
601 	return 0;
602 }
603 
604 static void
605 _zone_block_ch_destroy_cb(void *io_device, void *ctx_buf)
606 {
607 	struct zone_block_io_channel *bdev_ch = ctx_buf;
608 
609 	spdk_put_io_channel(bdev_ch->base_ch);
610 }
611 
612 static int
613 zone_block_insert_name(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
614 		       uint64_t optimal_open_zones)
615 {
616 	struct bdev_zone_block_config *name;
617 
618 	TAILQ_FOREACH(name, &g_bdev_configs, link) {
619 		if (strcmp(vbdev_name, name->vbdev_name) == 0) {
620 			SPDK_ERRLOG("block zoned bdev %s already exists\n", vbdev_name);
621 			return -EEXIST;
622 		}
623 		if (strcmp(bdev_name, name->bdev_name) == 0) {
624 			SPDK_ERRLOG("base bdev %s already claimed\n", bdev_name);
625 			return -EEXIST;
626 		}
627 	}
628 
629 	name = calloc(1, sizeof(*name));
630 	if (!name) {
631 		SPDK_ERRLOG("could not allocate bdev_names\n");
632 		return -ENOMEM;
633 	}
634 
635 	name->bdev_name = strdup(bdev_name);
636 	if (!name->bdev_name) {
637 		SPDK_ERRLOG("could not allocate name->bdev_name\n");
638 		free(name);
639 		return -ENOMEM;
640 	}
641 
642 	name->vbdev_name = strdup(vbdev_name);
643 	if (!name->vbdev_name) {
644 		SPDK_ERRLOG("could not allocate name->vbdev_name\n");
645 		free(name->bdev_name);
646 		free(name);
647 		return -ENOMEM;
648 	}
649 
650 	name->zone_capacity = zone_capacity;
651 	name->optimal_open_zones = optimal_open_zones;
652 
653 	TAILQ_INSERT_TAIL(&g_bdev_configs, name, link);
654 
655 	return 0;
656 }
657 
/* Populate the in-memory zone table: every zone starts out FULL with its
 * write pointer at the end of its capacity, and gets its own spinlock. On
 * spinlock-init failure, the locks created so far are destroyed in reverse
 * before returning -ENOMEM.
 */
static int
zone_block_init_zone_info(struct bdev_zone_block *bdev_node)
{
	size_t i;
	struct block_zone *zone;
	int rc = 0;

	for (i = 0; i < bdev_node->num_zones; i++) {
		zone = &bdev_node->zones[i];
		zone->zone_info.zone_id = bdev_node->bdev.zone_size * i;
		zone->zone_info.capacity = bdev_node->zone_capacity;
		zone->zone_info.write_pointer = zone->zone_info.zone_id + zone->zone_info.capacity;
		zone->zone_info.state = SPDK_BDEV_ZONE_STATE_FULL;
		zone->zone_info.type = SPDK_BDEV_ZONE_TYPE_SEQWR;
		if (pthread_spin_init(&zone->lock, PTHREAD_PROCESS_PRIVATE)) {
			SPDK_ERRLOG("pthread_spin_init() failed\n");
			rc = -ENOMEM;
			break;
		}
	}

	if (rc) {
		/* Unwind: zone i's lock was never created, so destroy [0, i). */
		for (; i > 0; i--) {
			pthread_spin_destroy(&bdev_node->zones[i - 1].lock);
		}
	}

	return rc;
}
687 
/* Create the zoned vbdev on top of base_bdev_name if a matching config
 * entry exists: open and claim the base bdev, size the zone table (zone
 * size is the capacity rounded up to a power of two, so trailing base
 * blocks may be lost), register the io_device and the new bdev. Returns
 * -ENODEV without consuming the config when the base bdev is not present
 * yet (examine() will retry later); any other failure unwinds via the goto
 * ladder below and removes the config entry.
 */
static int
zone_block_register(const char *base_bdev_name)
{
	struct spdk_bdev_desc *base_desc;
	struct spdk_bdev *base_bdev;
	struct bdev_zone_block_config *name, *tmp;
	struct bdev_zone_block *bdev_node;
	struct spdk_uuid ns_uuid;
	uint64_t zone_size;
	int rc = 0;

	spdk_uuid_parse(&ns_uuid, BDEV_ZONE_BLOCK_NAMESPACE_UUID);

	/* Check our list of names from config versus this bdev and if
	 * there's a match, create the bdev_node & bdev accordingly.
	 */
	TAILQ_FOREACH_SAFE(name, &g_bdev_configs, link, tmp) {
		if (strcmp(name->bdev_name, base_bdev_name) != 0) {
			continue;
		}

		rc = spdk_bdev_open_ext(base_bdev_name, true, zone_block_base_bdev_event_cb,
					NULL, &base_desc);
		if (rc == -ENODEV) {
			/* Keep the config entry; the base bdev may appear later. */
			return -ENODEV;
		} else if (rc) {
			SPDK_ERRLOG("could not open bdev %s\n", base_bdev_name);
			goto free_config;
		}

		base_bdev = spdk_bdev_desc_get_bdev(base_desc);

		if (spdk_bdev_is_zoned(base_bdev)) {
			SPDK_ERRLOG("Base bdev %s is already a zoned bdev\n", base_bdev_name);
			rc = -EEXIST;
			goto zone_exist;
		}

		bdev_node = calloc(1, sizeof(struct bdev_zone_block));
		if (!bdev_node) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node\n");
			goto zone_exist;
		}

		bdev_node->base_desc = base_desc;

		/* The base bdev that we're attaching to. */
		bdev_node->bdev.name = strdup(name->vbdev_name);
		if (!bdev_node->bdev.name) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate bdev_node name\n");
			goto strdup_failed;
		}

		/* Zone size is capacity rounded up to the next power of two. */
		zone_size = spdk_align64pow2(name->zone_capacity);
		if (zone_size == 0) {
			rc = -EINVAL;
			SPDK_ERRLOG("invalid zone size\n");
			goto roundup_failed;
		}

		bdev_node->zone_shift = spdk_u64log2(zone_size);
		bdev_node->num_zones = base_bdev->blockcnt / zone_size;

		bdev_node->zones = calloc(bdev_node->num_zones, sizeof(struct block_zone));
		if (!bdev_node->zones) {
			rc = -ENOMEM;
			SPDK_ERRLOG("could not allocate zones\n");
			goto calloc_failed;
		}

		bdev_node->bdev.product_name = "zone_block";

		/* Copy some properties from the underlying base bdev. */
		bdev_node->bdev.write_cache = base_bdev->write_cache;
		bdev_node->bdev.required_alignment = base_bdev->required_alignment;
		bdev_node->bdev.optimal_io_boundary = base_bdev->optimal_io_boundary;

		bdev_node->bdev.blocklen = base_bdev->blocklen;
		bdev_node->bdev.blockcnt = bdev_node->num_zones * zone_size;

		if (bdev_node->num_zones * name->zone_capacity != base_bdev->blockcnt) {
			SPDK_DEBUGLOG(vbdev_zone_block,
				      "Lost %" PRIu64 " blocks due to zone capacity and base bdev size misalignment\n",
				      base_bdev->blockcnt - bdev_node->num_zones * name->zone_capacity);
		}

		bdev_node->bdev.write_unit_size = base_bdev->write_unit_size;

		bdev_node->bdev.md_interleave = base_bdev->md_interleave;
		bdev_node->bdev.md_len = base_bdev->md_len;
		bdev_node->bdev.dif_type = base_bdev->dif_type;
		bdev_node->bdev.dif_is_head_of_md = base_bdev->dif_is_head_of_md;
		bdev_node->bdev.dif_check_flags = base_bdev->dif_check_flags;
		bdev_node->bdev.dif_pi_format = base_bdev->dif_pi_format;

		bdev_node->bdev.zoned = true;
		bdev_node->bdev.ctxt = bdev_node;
		bdev_node->bdev.fn_table = &zone_block_fn_table;
		bdev_node->bdev.module = &bdev_zoned_if;

		/* Generate UUID based on namespace UUID + base bdev UUID. */
		rc = spdk_uuid_generate_sha1(&bdev_node->bdev.uuid, &ns_uuid,
					     (const char *)&base_bdev->uuid, sizeof(struct spdk_uuid));
		if (rc) {
			SPDK_ERRLOG("Unable to generate new UUID for zone block bdev\n");
			goto uuid_generation_failed;
		}

		/* bdev specific info */
		bdev_node->bdev.zone_size = zone_size;

		bdev_node->zone_capacity = name->zone_capacity;
		bdev_node->bdev.optimal_open_zones = name->optimal_open_zones;
		bdev_node->bdev.max_open_zones = 0;
		rc = zone_block_init_zone_info(bdev_node);
		if (rc) {
			SPDK_ERRLOG("could not init zone info\n");
			goto zone_info_failed;
		}

		TAILQ_INSERT_TAIL(&g_bdev_nodes, bdev_node, link);

		spdk_io_device_register(bdev_node, _zone_block_ch_create_cb, _zone_block_ch_destroy_cb,
					sizeof(struct zone_block_io_channel),
					name->vbdev_name);

		/* Save the thread where the base device is opened */
		bdev_node->thread = spdk_get_thread();

		rc = spdk_bdev_module_claim_bdev(base_bdev, base_desc, bdev_node->bdev.module);
		if (rc) {
			SPDK_ERRLOG("could not claim bdev %s\n", base_bdev_name);
			goto claim_failed;
		}

		rc = spdk_bdev_register(&bdev_node->bdev);
		if (rc) {
			SPDK_ERRLOG("could not register zoned bdev\n");
			goto register_failed;
		}
	}

	return rc;

	/* Error unwind: each label releases everything acquired after the
	 * corresponding failure point, in reverse order of acquisition.
	 */
register_failed:
	spdk_bdev_module_release_bdev(&bdev_node->bdev);
claim_failed:
	TAILQ_REMOVE(&g_bdev_nodes, bdev_node, link);
	spdk_io_device_unregister(bdev_node, NULL);
zone_info_failed:
uuid_generation_failed:
	free(bdev_node->zones);
calloc_failed:
roundup_failed:
	free(bdev_node->bdev.name);
strdup_failed:
	free(bdev_node);
zone_exist:
	spdk_bdev_close(base_desc);
free_config:
	zone_block_remove_config(name);
	return rc;
}
853 
/* Public RPC entry point: validate parameters, record the configuration,
 * and create the vbdev immediately if the base bdev already exists.
 */
int
vbdev_zone_block_create(const char *bdev_name, const char *vbdev_name, uint64_t zone_capacity,
			uint64_t optimal_open_zones)
{
	int rc;

	if (zone_capacity == 0) {
		SPDK_ERRLOG("Zone capacity can't be 0\n");
		return -EINVAL;
	}
	if (optimal_open_zones == 0) {
		SPDK_ERRLOG("Optimal open zones can't be 0\n");
		return -EINVAL;
	}

	/* Insert the bdev into our global name list even if it doesn't exist yet,
	 * it may show up soon...
	 */
	rc = zone_block_insert_name(bdev_name, vbdev_name, zone_capacity, optimal_open_zones);
	if (rc != 0) {
		return rc;
	}

	rc = zone_block_register(bdev_name);
	if (rc == -ENODEV) {
		/* This is not an error, even though the bdev is not present at this time it may
		 * still show up later.
		 */
		rc = 0;
	}
	return rc;
}
887 
888 void
889 vbdev_zone_block_delete(const char *name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
890 {
891 	struct bdev_zone_block_config *name_node;
892 	int rc;
893 
894 	rc = spdk_bdev_unregister_by_name(name, &bdev_zoned_if, cb_fn, cb_arg);
895 	if (rc == 0) {
896 		TAILQ_FOREACH(name_node, &g_bdev_configs, link) {
897 			if (strcmp(name_node->vbdev_name, name) == 0) {
898 				zone_block_remove_config(name_node);
899 				break;
900 			}
901 		}
902 	} else {
903 		cb_fn(cb_arg, rc);
904 	}
905 }
906 
/* examine_config hook: attempt to create any configured vbdev on top of
 * the newly discovered bdev (the return value is intentionally ignored —
 * a non-matching or failed register leaves the bdev usable as-is), then
 * signal the bdev layer that examination is done.
 */
static void
zone_block_examine(struct spdk_bdev *bdev)
{
	zone_block_register(bdev->name);

	spdk_bdev_module_examine_done(&bdev_zoned_if);
}
914 
915 SPDK_LOG_REGISTER_COMPONENT(vbdev_zone_block)
916