xref: /spdk/module/bdev/uring/bdev_uring.c (revision 8afdeef3becfe9409cc9e7372bd0bc10e8b7d46d)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "bdev_uring.h"
7 
8 #include "spdk/stdinc.h"
9 #include "spdk/config.h"
10 #include "spdk/barrier.h"
11 #include "spdk/bdev.h"
12 #include "spdk/env.h"
13 #include "spdk/fd.h"
14 #include "spdk/likely.h"
15 #include "spdk/thread.h"
16 #include "spdk/json.h"
17 #include "spdk/util.h"
18 #include "spdk/string.h"
19 #include "spdk/file.h"
20 
21 #include "spdk/log.h"
22 #include "spdk_internal/uring.h"
23 
24 #ifdef SPDK_CONFIG_URING_ZNS
25 #include <linux/blkzoned.h>
26 #define SECTOR_SHIFT 9
27 #endif
28 
/* Zoned-device geometry captured at bdev creation time
 * (see bdev_uring_check_zoned_support()). */
struct bdev_uring_zoned_dev {
	uint64_t		num_zones;	/* total zones reported by BLKGETNRZONES */
	uint32_t		zone_shift;	/* log2 of zone size in bdev blocks */
	uint32_t		lba_shift;	/* shift between 512B sectors and bdev blocks */
};

/* Per-bdev, per-thread channel: only a reference to the shared
 * per-thread group channel that owns the actual io_uring. */
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel		*group_ch;
};

/* Per-thread state shared by all uring bdevs on that thread: one io_uring
 * plus the poller that submits pending SQEs and reaps completions. */
struct bdev_uring_group_channel {
	uint64_t				io_inflight;	/* submitted, awaiting completion */
	uint64_t				io_pending;	/* SQEs queued, not yet submitted */
	struct spdk_poller			*poller;
	struct io_uring				uring;
};

/* Per-I/O context stored in spdk_bdev_io->driver_ctx. */
struct bdev_uring_task {
	uint64_t			len;	/* expected byte count, compared against cqe->res */
	struct bdev_uring_io_channel	*ch;
	TAILQ_ENTRY(bdev_uring_task)	link;
};

/* One uring bdev backed by a file or block device. */
struct bdev_uring {
	struct spdk_bdev	bdev;
	struct bdev_uring_zoned_dev	zd;
	char			*filename;
	int			fd;	/* open file descriptor, -1 when closed */
	TAILQ_ENTRY(bdev_uring)  link;
};
59 
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);
/* Global list of every uring bdev created by this module. */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head);

/* Queue depth of each per-thread io_uring instance. */
#define SPDK_URING_QUEUE_DEPTH 512
/* NOTE(review): MAX_EVENTS_PER_POLL appears unused in this file. */
#define MAX_EVENTS_PER_POLL 32
67 
/* Per-I/O driver context size requested from the bdev layer; one
 * bdev_uring_task is allocated alongside every spdk_bdev_io. */
static int
bdev_uring_get_ctx_size(void)
{
	return sizeof(struct bdev_uring_task);
}
73 
/* Module descriptor registered with the bdev layer: wires up module
 * init/fini and the per-I/O context size. */
static struct spdk_bdev_module uring_if = {
	.name		= "uring",
	.module_init	= bdev_uring_init,
	.module_fini	= bdev_uring_fini,
	.get_ctx_size	= bdev_uring_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
82 
83 static int
84 bdev_uring_open(struct bdev_uring *bdev)
85 {
86 	int fd;
87 
88 	fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
89 	if (fd < 0) {
90 		/* Try without O_DIRECT for non-disk files */
91 		fd = open(bdev->filename, O_RDWR | O_NOATIME);
92 		if (fd < 0) {
93 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
94 				    bdev->filename, errno, spdk_strerror(errno));
95 			bdev->fd = -1;
96 			return -1;
97 		}
98 	}
99 
100 	bdev->fd = fd;
101 
102 	return 0;
103 }
104 
/* No-op event callback for the short-lived descriptor opened by
 * bdev_uring_rescan(); bdev events are intentionally ignored there. */
static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}
109 
110 int
111 bdev_uring_rescan(const char *name)
112 {
113 	struct spdk_bdev_desc *desc;
114 	struct spdk_bdev *bdev;
115 	struct bdev_uring *uring;
116 	uint64_t uring_size, blockcnt;
117 	int rc;
118 
119 	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
120 	if (rc != 0) {
121 		return rc;
122 	}
123 
124 	bdev = spdk_bdev_desc_get_bdev(desc);
125 	if (bdev->module != &uring_if) {
126 		rc = -ENODEV;
127 		goto exit;
128 	}
129 
130 	uring = SPDK_CONTAINEROF(bdev, struct bdev_uring, bdev);
131 	uring_size = spdk_fd_get_size(uring->fd);
132 	blockcnt = uring_size / bdev->blocklen;
133 
134 	if (bdev->blockcnt != blockcnt) {
135 		SPDK_NOTICELOG("URING device is resized: bdev name %s, old block count %" PRIu64
136 			       ", new block count %"
137 			       PRIu64 "\n",
138 			       uring->filename,
139 			       bdev->blockcnt,
140 			       blockcnt);
141 		rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
142 		if (rc != 0) {
143 			SPDK_ERRLOG("Could not change num blocks for uring bdev: name %s, errno: %d.\n",
144 				    uring->filename, rc);
145 			goto exit;
146 		}
147 	}
148 
149 exit:
150 	spdk_bdev_close(desc);
151 	return rc;
152 }
153 
154 static int
155 bdev_uring_close(struct bdev_uring *bdev)
156 {
157 	int rc;
158 
159 	if (bdev->fd == -1) {
160 		return 0;
161 	}
162 
163 	rc = close(bdev->fd);
164 	if (rc < 0) {
165 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
166 			    bdev->fd, errno, spdk_strerror(errno));
167 		return -1;
168 	}
169 
170 	bdev->fd = -1;
171 
172 	return 0;
173 }
174 
175 static int64_t
176 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
177 		 struct bdev_uring_task *uring_task,
178 		 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
179 {
180 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
181 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
182 	struct io_uring_sqe *sqe;
183 
184 	sqe = io_uring_get_sqe(&group_ch->uring);
185 	if (!sqe) {
186 		SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
187 		return -ENOMEM;
188 	}
189 
190 	io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
191 	io_uring_sqe_set_data(sqe, uring_task);
192 	uring_task->len = nbytes;
193 	uring_task->ch = uring_ch;
194 
195 	SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n",
196 		      iovcnt, nbytes, offset);
197 
198 	group_ch->io_pending++;
199 	return nbytes;
200 }
201 
202 static int64_t
203 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
204 		  struct bdev_uring_task *uring_task,
205 		  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
206 {
207 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
208 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
209 	struct io_uring_sqe *sqe;
210 
211 	sqe = io_uring_get_sqe(&group_ch->uring);
212 	if (!sqe) {
213 		SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
214 		return -ENOMEM;
215 	}
216 
217 	io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
218 	io_uring_sqe_set_data(sqe, uring_task);
219 	uring_task->len = nbytes;
220 	uring_task->ch = uring_ch;
221 
222 	SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n",
223 		      iovcnt, nbytes, offset);
224 
225 	group_ch->io_pending++;
226 	return nbytes;
227 }
228 
229 static int
230 bdev_uring_destruct(void *ctx)
231 {
232 	struct bdev_uring *uring = ctx;
233 	int rc = 0;
234 
235 	TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
236 	rc = bdev_uring_close(uring);
237 	if (rc < 0) {
238 		SPDK_ERRLOG("bdev_uring_close() failed\n");
239 	}
240 	spdk_io_device_unregister(uring, NULL);
241 	uring_free_bdev(uring);
242 	return rc;
243 }
244 
/* Drain up to 'max' completions from the ring and complete the matching
 * bdev I/Os.
 * Returns the number of completions processed, or the negative value from
 * io_uring_peek_cqe() when peeking fails. NOTE(review): that negative value
 * propagates to bdev_uring_group_poll() as if it were a count — confirm
 * this is intentional. */
static int
bdev_uring_reap(struct io_uring *ring, int max)
{
	int i, count, ret;
	struct io_uring_cqe *cqe;
	struct bdev_uring_task *uring_task;
	enum spdk_bdev_io_status status;

	count = 0;
	for (i = 0; i < max; i++) {
		ret = io_uring_peek_cqe(ring, &cqe);
		if (ret != 0) {
			return ret;
		}

		if (cqe == NULL) {
			/* Completion queue empty — done for this iteration. */
			return count;
		}

		/* user_data was set to the task pointer at prep time
		 * (io_uring_sqe_set_data in the readv/writev paths). */
		uring_task = (struct bdev_uring_task *)cqe->user_data;
		/* Anything other than a full-length transfer (including a
		 * negative errno in res) is treated as a failure. */
		if (cqe->res != (signed)uring_task->len) {
			status = SPDK_BDEV_IO_STATUS_FAILED;
		} else {
			status = SPDK_BDEV_IO_STATUS_SUCCESS;
		}

		uring_task->ch->group_ch->io_inflight--;
		/* Mark the CQE consumed before completing the I/O. */
		io_uring_cqe_seen(ring, cqe);
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
		count++;
	}

	return count;
}
279 
/* Per-thread poller: submit any SQEs queued since the last poll, then reap
 * completions for in-flight I/O. Returns BUSY when any work was done or
 * attempted, IDLE otherwise. */
static int
bdev_uring_group_poll(void *arg)
{
	struct bdev_uring_group_channel *group_ch = arg;
	int to_complete, to_submit;
	int count, ret;

	to_submit = group_ch->io_pending;

	if (to_submit > 0) {
		/* If there are I/O to submit, use io_uring_submit here.
		 * It will automatically call spdk_io_uring_enter appropriately. */
		ret = io_uring_submit(&group_ch->uring);
		if (ret < 0) {
			/* Submission failed; io_pending is left untouched so the
			 * same SQEs are retried on the next poll. */
			return SPDK_POLLER_BUSY;
		}

		group_ch->io_pending = 0;
		group_ch->io_inflight += to_submit;
	}

	to_complete = group_ch->io_inflight;
	count = 0;
	if (to_complete > 0) {
		count = bdev_uring_reap(&group_ch->uring, to_complete);
	}

	/* NOTE(review): bdev_uring_reap() can return a negative errno, which is
	 * folded into this busy/idle decision unchanged. */
	if (count + to_submit > 0) {
		return SPDK_POLLER_BUSY;
	} else {
		return SPDK_POLLER_IDLE;
	}
}
313 
/* spdk_bdev_io_get_buf() callback: buffers are now aligned to the bdev's
 * required_alignment, so issue the actual readv/writev. If the ring is out
 * of SQEs (-ENOMEM), complete with NOMEM so the bdev layer will retry. */
static void
bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		      bool success)
{
	int64_t ret = 0;

	if (!success) {
		/* Buffer allocation failed. */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		ret = bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
				       ch,
				       (struct bdev_uring_task *)bdev_io->driver_ctx,
				       bdev_io->u.bdev.iovs,
				       bdev_io->u.bdev.iovcnt,
				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
				       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		ret = bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
					ch,
					(struct bdev_uring_task *)bdev_io->driver_ctx,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
					bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
		break;
	default:
		/* NOTE(review): unreachable — _bdev_uring_submit_request() only
		 * requests buffers for READ/WRITE. If ever hit, the bdev_io is
		 * not completed here. */
		SPDK_ERRLOG("Wrong io type\n");
		break;
	}

	if (ret == -ENOMEM) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
	}
}
353 
354 #ifdef SPDK_CONFIG_URING_ZNS
355 static int
356 bdev_uring_fill_zone_type(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
357 {
358 	switch (zones_rep->type) {
359 	case BLK_ZONE_TYPE_CONVENTIONAL:
360 		zone_info->type = SPDK_BDEV_ZONE_TYPE_CNV;
361 		break;
362 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
363 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
364 		break;
365 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
366 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWP;
367 		break;
368 	default:
369 		SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", zones_rep->type);
370 		return -EIO;
371 	}
372 	return 0;
373 }
374 
375 static int
376 bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
377 {
378 	switch (zones_rep->cond) {
379 	case BLK_ZONE_COND_EMPTY:
380 		zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
381 		break;
382 	case BLK_ZONE_COND_IMP_OPEN:
383 		zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
384 		break;
385 	case BLK_ZONE_COND_EXP_OPEN:
386 		zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
387 		break;
388 	case BLK_ZONE_COND_CLOSED:
389 		zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
390 		break;
391 	case BLK_ZONE_COND_READONLY:
392 		zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
393 		break;
394 	case BLK_ZONE_COND_FULL:
395 		zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
396 		break;
397 	case BLK_ZONE_COND_OFFLINE:
398 		zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
399 		break;
400 	case BLK_ZONE_COND_NOT_WP:
401 		zone_info->state = SPDK_BDEV_ZONE_STATE_NOT_WP;
402 		break;
403 	default:
404 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond);
405 		return -EIO;
406 	}
407 	return 0;
408 }
409 
/* Execute a zone management action (reset/open/close/finish) via the
 * corresponding BLK*ZONE ioctl. Completes the bdev_io with SUCCESS on
 * success; returns a negative value on failure (caller fails the I/O). */
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
	struct bdev_uring *uring;
	struct blk_zone_range range;
	long unsigned zone_mgmt_op;
	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;

	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;

	/* Map the SPDK zone action to the kernel ioctl request code. */
	switch (bdev_io->u.zone_mgmt.zone_action) {
	case SPDK_BDEV_ZONE_RESET:
		zone_mgmt_op = BLKRESETZONE;
		break;
	case SPDK_BDEV_ZONE_OPEN:
		zone_mgmt_op = BLKOPENZONE;
		break;
	case SPDK_BDEV_ZONE_CLOSE:
		zone_mgmt_op = BLKCLOSEZONE;
		break;
	case SPDK_BDEV_ZONE_FINISH:
		zone_mgmt_op = BLKFINISHZONE;
		break;
	default:
		return -EINVAL;
	}

	/* Convert zone id / zone size from bdev blocks to 512B sectors,
	 * which is what the blkzoned ioctls expect. */
	range.sector = (zone_id << uring->zd.lba_shift);
	range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift);

	if (ioctl(uring->fd, zone_mgmt_op, &range)) {
		SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n",
			    bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno));
		return -EINVAL;
	}

	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);

	return 0;
}
450 
/* Service GET_ZONE_INFO: query the kernel with BLKREPORTZONE and translate
 * up to num_zones zone descriptors into bdev_io->u.zone_mgmt.buf.
 * Completes the bdev_io with SUCCESS; returns a negative value on error
 * (caller fails the I/O). */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	struct bdev_uring *uring;
	struct blk_zone *zones;
	struct blk_zone_report *rep;
	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
	size_t repsize;
	uint32_t i, shift;
	uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones;
	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;

	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
	/* shift converts 512B-sector values from the kernel to bdev blocks. */
	shift = uring->zd.lba_shift;

	if ((num_zones > uring->zd.num_zones) || !num_zones) {
		return -EINVAL;
	}

	/* One report header followed by num_zones zone descriptors. */
	repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones);
	rep = (struct blk_zone_report *)malloc(repsize);
	if (!rep) {
		return -ENOMEM;
	}

	zones = (struct blk_zone *)(rep + 1);

	/* NOTE(review): num_zones is decremented inside the loop, so this
	 * condition compares the next zone index against the *remaining*
	 * count — confirm this matches the intended termination behavior. */
	while (num_zones && ((zone_id >> uring->zd.zone_shift) <= num_zones)) {
		memset(rep, 0, repsize);
		rep->sector = zone_id;
		rep->nr_zones = num_zones;

		if (ioctl(uring->fd, BLKREPORTZONE, rep)) {
			SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n",
				    errno, strerror(errno));
			free(rep);
			return -EINVAL;
		}

		if (!rep->nr_zones) {
			/* Kernel reported no further zones. */
			break;
		}

		for (i = 0; i < rep->nr_zones; i++) {
			/* Convert sector-based fields to bdev blocks. */
			zone_info->zone_id = ((zones + i)->start >> shift);
			zone_info->write_pointer = ((zones + i)->wp >> shift);
			zone_info->capacity = ((zones + i)->capacity >> shift);

			/* NOTE(review): the -EIO returns of these helpers are
			 * ignored; an unknown state/type leaves the field unset. */
			bdev_uring_fill_zone_state(zone_info, zones + i);
			bdev_uring_fill_zone_type(zone_info, zones + i);

			/* Advance to the first block past this zone. */
			zone_id = ((zones + i)->start + (zones + i)->len) >> shift;
			zone_info++;
			num_zones--;
		}
	}

	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	free(rep);
	return 0;
}
512 
513 static int
514 bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
515 {
516 	char *filename_dup = NULL, *base;
517 	char *str = NULL;
518 	uint32_t val;
519 	uint32_t zinfo;
520 	int retval = -1;
521 	struct stat sb;
522 	char resolved_path[PATH_MAX], *rp;
523 	char *sysfs_path = NULL;
524 
525 	uring->bdev.zoned = false;
526 
527 	/* Follow symlink */
528 	if ((rp = realpath(filename, resolved_path))) {
529 		filename = rp;
530 	}
531 
532 	/* Perform check on block devices only */
533 	if (stat(filename, &sb) == 0 && S_ISBLK(sb.st_mode)) {
534 		return 0;
535 	}
536 
537 	/* strdup() because basename() may modify the passed parameter */
538 	filename_dup = strdup(filename);
539 	if (filename_dup == NULL) {
540 		SPDK_ERRLOG("Could not duplicate string %s\n", filename);
541 		return -1;
542 	}
543 
544 	base = basename(filename_dup);
545 	free(filename_dup);
546 	sysfs_path = spdk_sprintf_alloc("/sys/block/%s/queue/zoned", base);
547 	retval = spdk_read_sysfs_attribute(&str, "%s", sysfs_path);
548 	/* Check if this is a zoned block device */
549 	if (retval < 0) {
550 		SPDK_ERRLOG("Unable to open file %s. errno: %d\n", sysfs_path, retval);
551 	} else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) {
552 		/* Only host-aware & host-managed zns devices */
553 		uring->bdev.zoned = true;
554 
555 		if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) {
556 			SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno));
557 			goto err_ret;
558 		}
559 		uring->zd.num_zones = zinfo;
560 
561 		if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) {
562 			SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno));
563 			goto err_ret;
564 		}
565 
566 		uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT;
567 		uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift);
568 		uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift);
569 
570 		retval = spdk_read_sysfs_attribute_uint32(&val, "/sys/block/%s/queue/max_open_zones", base);
571 		if (retval < 0) {
572 			SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", retval, strerror(-retval));
573 			goto err_ret;
574 		}
575 		uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = val;
576 
577 		retval = spdk_read_sysfs_attribute_uint32(&val, "/sys/block/%s/queue/max_active_zones", base);
578 		if (retval < 0) {
579 			SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", retval, strerror(-retval));
580 			goto err_ret;
581 		}
582 		uring->bdev.max_active_zones = val;
583 		retval = 0;
584 	} else {
585 		retval = 0;        /* queue/zoned=none */
586 	}
587 err_ret:
588 	free(str);
589 	free(sysfs_path);
590 	return retval;
591 }
592 #else
593 /* No support for zoned devices */
/* Stub used when built without SPDK_CONFIG_URING_ZNS: zone management is
 * never advertised (see bdev_uring_io_type_supported), so always fail. */
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
	return -1;
}
599 
/* Stub used when built without SPDK_CONFIG_URING_ZNS: zone info is never
 * advertised, so always fail. */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	return -1;
}
605 
/* Stub used when built without SPDK_CONFIG_URING_ZNS: treat every device
 * as non-zoned and report success. */
static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
	return 0;
}
611 #endif
612 
613 static int
614 _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
615 {
616 
617 	switch (bdev_io->type) {
618 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
619 		return bdev_uring_zone_get_info(bdev_io);
620 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
621 		return bdev_uring_zone_management_op(bdev_io);
622 	/* Read and write operations must be performed on buffers aligned to
623 	 * bdev->required_alignment. If user specified unaligned buffers,
624 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
625 	case SPDK_BDEV_IO_TYPE_READ:
626 	case SPDK_BDEV_IO_TYPE_WRITE:
627 		spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
628 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
629 		return 0;
630 	default:
631 		return -1;
632 	}
633 }
634 
635 static void
636 bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
637 {
638 	if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
639 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
640 	}
641 }
642 
643 static bool
644 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
645 {
646 	switch (io_type) {
647 #ifdef SPDK_CONFIG_URING_ZNS
648 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
649 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
650 #endif
651 	case SPDK_BDEV_IO_TYPE_READ:
652 	case SPDK_BDEV_IO_TYPE_WRITE:
653 		return true;
654 	default:
655 		return false;
656 	}
657 }
658 
659 static int
660 bdev_uring_create_cb(void *io_device, void *ctx_buf)
661 {
662 	struct bdev_uring_io_channel *ch = ctx_buf;
663 
664 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
665 
666 	return 0;
667 }
668 
669 static void
670 bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
671 {
672 	struct bdev_uring_io_channel *ch = ctx_buf;
673 
674 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
675 }
676 
/* fn_table hook: return this thread's channel for the bdev's io_device
 * (the bdev_uring pointer itself). */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	struct bdev_uring *uring_bdev = ctx;

	return spdk_get_io_channel(uring_bdev);
}
684 
685 static int
686 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
687 {
688 	struct bdev_uring *uring = ctx;
689 
690 	spdk_json_write_named_object_begin(w, "uring");
691 
692 	spdk_json_write_named_string(w, "filename", uring->filename);
693 
694 	spdk_json_write_object_end(w);
695 
696 	return 0;
697 }
698 
699 static void
700 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
701 {
702 	struct bdev_uring *uring = bdev->ctxt;
703 	char uuid_str[SPDK_UUID_STRING_LEN];
704 
705 	spdk_json_write_object_begin(w);
706 
707 	spdk_json_write_named_string(w, "method", "bdev_uring_create");
708 
709 	spdk_json_write_named_object_begin(w, "params");
710 	spdk_json_write_named_string(w, "name", bdev->name);
711 	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
712 	spdk_json_write_named_string(w, "filename", uring->filename);
713 	spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
714 	spdk_json_write_named_string(w, "uuid", uuid_str);
715 	spdk_json_write_object_end(w);
716 
717 	spdk_json_write_object_end(w);
718 }
719 
/* Dispatch table registered with the bdev layer for every uring bdev. */
static const struct spdk_bdev_fn_table uring_fn_table = {
	.destruct		= bdev_uring_destruct,
	.submit_request		= bdev_uring_submit_request,
	.io_type_supported	= bdev_uring_io_type_supported,
	.get_io_channel		= bdev_uring_get_io_channel,
	.dump_info_json		= bdev_uring_dump_info_json,
	.write_config_json	= bdev_uring_write_json_config,
};
728 
729 static void
730 uring_free_bdev(struct bdev_uring *uring)
731 {
732 	if (uring == NULL) {
733 		return;
734 	}
735 	free(uring->filename);
736 	free(uring->bdev.name);
737 	free(uring);
738 }
739 
/* Group (per-thread) channel constructor: create the shared io_uring and
 * register the submit/reap poller. Returns 0 on success, -1 on failure. */
static int
bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
{
	struct bdev_uring_group_channel *ch = ctx_buf;

	/* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only
	 * local devices but also devices attached from remote target */
	if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) {
		SPDK_ERRLOG("uring I/O context setup failure\n");
		return -1;
	}

	/* Period 0: poll on every thread iteration.
	 * NOTE(review): the poller pointer is not checked for NULL here. */
	ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
	return 0;
}
755 
756 static void
757 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
758 {
759 	struct bdev_uring_group_channel *ch = ctx_buf;
760 
761 	io_uring_queue_exit(&ch->uring);
762 
763 	spdk_poller_unregister(&ch->poller);
764 }
765 
766 struct spdk_bdev *
767 create_uring_bdev(const struct bdev_uring_opts *opts)
768 {
769 	struct bdev_uring *uring;
770 	uint32_t detected_block_size;
771 	uint64_t bdev_size;
772 	int rc;
773 	uint32_t block_size = opts->block_size;
774 
775 	uring = calloc(1, sizeof(*uring));
776 	if (!uring) {
777 		SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
778 		return NULL;
779 	}
780 
781 	uring->filename = strdup(opts->filename);
782 	if (!uring->filename) {
783 		goto error_return;
784 	}
785 
786 	if (bdev_uring_open(uring)) {
787 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", opts->filename, uring->fd, errno);
788 		goto error_return;
789 	}
790 
791 	bdev_size = spdk_fd_get_size(uring->fd);
792 
793 	uring->bdev.name = strdup(opts->name);
794 	if (!uring->bdev.name) {
795 		goto error_return;
796 	}
797 	uring->bdev.product_name = "URING bdev";
798 	uring->bdev.module = &uring_if;
799 
800 	uring->bdev.write_cache = 0;
801 
802 	detected_block_size = spdk_fd_get_blocklen(uring->fd);
803 	if (block_size == 0) {
804 		/* User did not specify block size - use autodetected block size. */
805 		if (detected_block_size == 0) {
806 			SPDK_ERRLOG("Block size could not be auto-detected\n");
807 			goto error_return;
808 		}
809 		block_size = detected_block_size;
810 	} else {
811 		if (block_size < detected_block_size) {
812 			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
813 				    "auto-detected block size %" PRIu32 "\n",
814 				    block_size, detected_block_size);
815 			goto error_return;
816 		} else if (detected_block_size != 0 && block_size != detected_block_size) {
817 			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
818 				     "auto-detected block size %" PRIu32 "\n",
819 				     block_size, detected_block_size);
820 		}
821 	}
822 
823 	if (block_size < 512) {
824 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
825 		goto error_return;
826 	}
827 
828 	if (!spdk_u32_is_pow2(block_size)) {
829 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
830 		goto error_return;
831 	}
832 
833 	uring->bdev.blocklen = block_size;
834 	uring->bdev.required_alignment = spdk_u32log2(block_size);
835 
836 	rc = bdev_uring_check_zoned_support(uring, opts->name, opts->filename);
837 	if (rc) {
838 		goto error_return;
839 	}
840 
841 	if (bdev_size % uring->bdev.blocklen != 0) {
842 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
843 			    bdev_size, uring->bdev.blocklen);
844 		goto error_return;
845 	}
846 
847 	uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
848 	uring->bdev.ctxt = uring;
849 
850 	uring->bdev.fn_table = &uring_fn_table;
851 
852 	if (!spdk_mem_all_zero(&opts->uuid, sizeof(opts->uuid))) {
853 		spdk_uuid_copy(&uring->bdev.uuid, &opts->uuid);
854 	}
855 
856 	spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
857 				sizeof(struct bdev_uring_io_channel),
858 				uring->bdev.name);
859 	rc = spdk_bdev_register(&uring->bdev);
860 	if (rc) {
861 		spdk_io_device_unregister(uring, NULL);
862 		goto error_return;
863 	}
864 
865 	TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
866 	return &uring->bdev;
867 
868 error_return:
869 	bdev_uring_close(uring);
870 	uring_free_bdev(uring);
871 	return NULL;
872 }
873 
/* Context carried across the asynchronous unregister started by
 * delete_uring_bdev(); freed in uring_bdev_unregister_cb(). */
struct delete_uring_bdev_ctx {
	spdk_delete_uring_complete cb_fn;
	void *cb_arg;
};
878 
879 static void
880 uring_bdev_unregister_cb(void *arg, int bdeverrno)
881 {
882 	struct delete_uring_bdev_ctx *ctx = arg;
883 
884 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
885 	free(ctx);
886 }
887 
888 void
889 delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg)
890 {
891 	struct delete_uring_bdev_ctx *ctx;
892 	int rc;
893 
894 	ctx = calloc(1, sizeof(*ctx));
895 	if (ctx == NULL) {
896 		cb_fn(cb_arg, -ENOMEM);
897 		return;
898 	}
899 
900 	ctx->cb_fn = cb_fn;
901 	ctx->cb_arg = cb_arg;
902 	rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx);
903 	if (rc != 0) {
904 		uring_bdev_unregister_cb(ctx, rc);
905 	}
906 }
907 
/* Module init: register the module-wide io_device whose per-thread channels
 * own the shared io_uring (see bdev_uring_group_create_cb). */
static int
bdev_uring_init(void)
{
	spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
				sizeof(struct bdev_uring_group_channel), "uring_module");

	return 0;
}
916 
/* Module fini: tear down the module-wide io_device registered in
 * bdev_uring_init(). */
static void
bdev_uring_fini(void)
{
	spdk_io_device_unregister(&uring_if, NULL);
}
922 
923 SPDK_LOG_REGISTER_COMPONENT(uring)
924