xref: /spdk/module/bdev/uring/bdev_uring.c (revision b02581a89058ebaebe03bd0e16e3b58adfe406c1)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "bdev_uring.h"
7 
8 #include "spdk/stdinc.h"
9 #include "spdk/config.h"
10 #include "spdk/barrier.h"
11 #include "spdk/bdev.h"
12 #include "spdk/env.h"
13 #include "spdk/fd.h"
14 #include "spdk/likely.h"
15 #include "spdk/thread.h"
16 #include "spdk/json.h"
17 #include "spdk/util.h"
18 #include "spdk/string.h"
19 
20 #include "spdk/log.h"
21 #include "spdk_internal/uring.h"
22 
23 #ifdef SPDK_CONFIG_URING_ZNS
24 #include <linux/blkzoned.h>
25 #define SECTOR_SHIFT 9
26 #endif
27 
/* Zone geometry cached for a zoned backing device. */
struct bdev_uring_zoned_dev {
	/* Zone count obtained from the BLKGETNRZONES ioctl. */
	uint64_t	num_zones;
	/* log2 of the zone size expressed in bdev blocks. */
	uint32_t	zone_shift;
	/* log2(block size) - SECTOR_SHIFT: converts bdev blocks to 512-byte sectors. */
	uint32_t	lba_shift;
};
33 
/* Per-bdev I/O channel context: just a reference to the group channel used for submission. */
struct bdev_uring_io_channel {
	struct bdev_uring_group_channel	*group_ch;
};
37 
38 struct bdev_uring_group_channel {
39 	uint64_t				io_inflight;
40 	uint64_t				io_pending;
41 	struct spdk_poller			*poller;
42 	struct io_uring				uring;
43 };
44 
/* Per-I/O driver context; its size is what bdev_uring_get_ctx_size() reports. */
struct bdev_uring_task {
	/* Expected transfer size in bytes; the completion result is compared against it. */
	uint64_t			len;
	/* Channel the request was queued on. */
	struct bdev_uring_io_channel	*ch;
	TAILQ_ENTRY(bdev_uring_task)	link;
};
50 
51 struct bdev_uring {
52 	struct spdk_bdev	bdev;
53 	struct bdev_uring_zoned_dev	zd;
54 	char			*filename;
55 	int			fd;
56 	TAILQ_ENTRY(bdev_uring)  link;
57 };
58 
/* Module hooks and helpers that are defined later in this file. */
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);

/* List of every uring bdev that has been registered successfully. */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head);

/* Entry count passed to io_uring_queue_init() for each group channel. */
#define SPDK_URING_QUEUE_DEPTH 512
#define MAX_EVENTS_PER_POLL 32
66 
67 static int
68 bdev_uring_get_ctx_size(void)
69 {
70 	return sizeof(struct bdev_uring_task);
71 }
72 
73 static struct spdk_bdev_module uring_if = {
74 	.name		= "uring",
75 	.module_init	= bdev_uring_init,
76 	.module_fini	= bdev_uring_fini,
77 	.get_ctx_size	= bdev_uring_get_ctx_size,
78 };
79 
80 SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
81 
82 static int
83 bdev_uring_open(struct bdev_uring *bdev)
84 {
85 	int fd;
86 
87 	fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
88 	if (fd < 0) {
89 		/* Try without O_DIRECT for non-disk files */
90 		fd = open(bdev->filename, O_RDWR | O_NOATIME);
91 		if (fd < 0) {
92 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
93 				    bdev->filename, errno, spdk_strerror(errno));
94 			bdev->fd = -1;
95 			return -1;
96 		}
97 	}
98 
99 	bdev->fd = fd;
100 
101 	return 0;
102 }
103 
104 static int
105 bdev_uring_close(struct bdev_uring *bdev)
106 {
107 	int rc;
108 
109 	if (bdev->fd == -1) {
110 		return 0;
111 	}
112 
113 	rc = close(bdev->fd);
114 	if (rc < 0) {
115 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
116 			    bdev->fd, errno, spdk_strerror(errno));
117 		return -1;
118 	}
119 
120 	bdev->fd = -1;
121 
122 	return 0;
123 }
124 
125 static int64_t
126 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
127 		 struct bdev_uring_task *uring_task,
128 		 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
129 {
130 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
131 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
132 	struct io_uring_sqe *sqe;
133 
134 	sqe = io_uring_get_sqe(&group_ch->uring);
135 	if (!sqe) {
136 		SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
137 		return -ENOMEM;
138 	}
139 
140 	io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
141 	io_uring_sqe_set_data(sqe, uring_task);
142 	uring_task->len = nbytes;
143 	uring_task->ch = uring_ch;
144 
145 	SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n",
146 		      iovcnt, nbytes, offset);
147 
148 	group_ch->io_pending++;
149 	return nbytes;
150 }
151 
152 static int64_t
153 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
154 		  struct bdev_uring_task *uring_task,
155 		  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
156 {
157 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
158 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
159 	struct io_uring_sqe *sqe;
160 
161 	sqe = io_uring_get_sqe(&group_ch->uring);
162 	if (!sqe) {
163 		SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
164 		return -ENOMEM;
165 	}
166 
167 	io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
168 	io_uring_sqe_set_data(sqe, uring_task);
169 	uring_task->len = nbytes;
170 	uring_task->ch = uring_ch;
171 
172 	SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n",
173 		      iovcnt, nbytes, offset);
174 
175 	group_ch->io_pending++;
176 	return nbytes;
177 }
178 
179 static int
180 bdev_uring_destruct(void *ctx)
181 {
182 	struct bdev_uring *uring = ctx;
183 	int rc = 0;
184 
185 	TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
186 	rc = bdev_uring_close(uring);
187 	if (rc < 0) {
188 		SPDK_ERRLOG("bdev_uring_close() failed\n");
189 	}
190 	spdk_io_device_unregister(uring, NULL);
191 	uring_free_bdev(uring);
192 	return rc;
193 }
194 
195 static int
196 bdev_uring_reap(struct io_uring *ring, int max)
197 {
198 	int i, count, ret;
199 	struct io_uring_cqe *cqe;
200 	struct bdev_uring_task *uring_task;
201 	enum spdk_bdev_io_status status;
202 
203 	count = 0;
204 	for (i = 0; i < max; i++) {
205 		ret = io_uring_peek_cqe(ring, &cqe);
206 		if (ret != 0) {
207 			return ret;
208 		}
209 
210 		if (cqe == NULL) {
211 			return count;
212 		}
213 
214 		uring_task = (struct bdev_uring_task *)cqe->user_data;
215 		if (cqe->res != (signed)uring_task->len) {
216 			status = SPDK_BDEV_IO_STATUS_FAILED;
217 		} else {
218 			status = SPDK_BDEV_IO_STATUS_SUCCESS;
219 		}
220 
221 		uring_task->ch->group_ch->io_inflight--;
222 		io_uring_cqe_seen(ring, cqe);
223 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
224 		count++;
225 	}
226 
227 	return count;
228 }
229 
230 static int
231 bdev_uring_group_poll(void *arg)
232 {
233 	struct bdev_uring_group_channel *group_ch = arg;
234 	int to_complete, to_submit;
235 	int count, ret;
236 
237 	to_submit = group_ch->io_pending;
238 
239 	if (to_submit > 0) {
240 		/* If there are I/O to submit, use io_uring_submit here.
241 		 * It will automatically call spdk_io_uring_enter appropriately. */
242 		ret = io_uring_submit(&group_ch->uring);
243 		if (ret < 0) {
244 			return SPDK_POLLER_BUSY;
245 		}
246 
247 		group_ch->io_pending = 0;
248 		group_ch->io_inflight += to_submit;
249 	}
250 
251 	to_complete = group_ch->io_inflight;
252 	count = 0;
253 	if (to_complete > 0) {
254 		count = bdev_uring_reap(&group_ch->uring, to_complete);
255 	}
256 
257 	if (count + to_submit > 0) {
258 		return SPDK_POLLER_BUSY;
259 	} else {
260 		return SPDK_POLLER_IDLE;
261 	}
262 }
263 
264 static void
265 bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
266 		      bool success)
267 {
268 	int64_t ret = 0;
269 
270 	if (!success) {
271 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
272 		return;
273 	}
274 
275 	switch (bdev_io->type) {
276 	case SPDK_BDEV_IO_TYPE_READ:
277 		ret = bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
278 				       ch,
279 				       (struct bdev_uring_task *)bdev_io->driver_ctx,
280 				       bdev_io->u.bdev.iovs,
281 				       bdev_io->u.bdev.iovcnt,
282 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
283 				       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
284 		break;
285 	case SPDK_BDEV_IO_TYPE_WRITE:
286 		ret = bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
287 					ch,
288 					(struct bdev_uring_task *)bdev_io->driver_ctx,
289 					bdev_io->u.bdev.iovs,
290 					bdev_io->u.bdev.iovcnt,
291 					bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
292 					bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
293 		break;
294 	default:
295 		SPDK_ERRLOG("Wrong io type\n");
296 		break;
297 	}
298 
299 	if (ret == -ENOMEM) {
300 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
301 	}
302 }
303 
304 #ifdef SPDK_CONFIG_URING_ZNS
/*
 * Read the first line of /sys/block/<basename(devname)>/<attr> into str, with the
 * trailing newline removed.
 *
 * Returns 0 on success, -EINVAL if a string could not be allocated or nothing could
 * be read, and -ENOENT if the attribute file could not be opened.
 */
static int
bdev_uring_read_sysfs_attr(const char *devname, const char *attr, char *str, int str_len)
{
	char *dev_copy;
	char *sysfs_path;
	FILE *fp;
	int rc = 0;

	/* basename() is given a private copy rather than the caller's const string. */
	dev_copy = strdup(devname);
	if (dev_copy == NULL) {
		return -EINVAL;
	}

	sysfs_path = spdk_sprintf_alloc("/sys/block/%s/%s", basename(dev_copy), attr);
	free(dev_copy);
	if (sysfs_path == NULL) {
		return -EINVAL;
	}

	fp = fopen(sysfs_path, "r");
	free(sysfs_path);
	if (fp == NULL) {
		return -ENOENT;
	}

	if (fgets(str, str_len, fp) != NULL) {
		spdk_str_chomp(str);
	} else {
		rc = -EINVAL;
	}

	fclose(fp);
	return rc;
}
343 
/*
 * Read a sysfs attribute of devname and parse it as a non-negative decimal number.
 *
 * On success 0 is returned and the parsed value is stored in *val. If the attribute
 * cannot be read, or spdk_strtol() rejects its contents (it reports failures as a
 * negative errno), that negative value is returned and *val is left untouched.
 */
static int
bdev_uring_read_sysfs_attr_long(const char *devname, const char *attr, long *val)
{
	char str[128];
	long parsed;
	int ret;

	ret = bdev_uring_read_sysfs_attr(devname, attr, str, sizeof(str));
	if (ret) {
		return ret;
	}

	parsed = spdk_strtol(str, 10);
	if (parsed < 0) {
		/* Do not hand a negative errno to the caller disguised as an attribute value. */
		return (int)parsed;
	}

	*val = parsed;

	return 0;
}
359 
360 static int
361 bdev_uring_fill_zone_type(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
362 {
363 	switch (zones_rep->type) {
364 	case BLK_ZONE_TYPE_CONVENTIONAL:
365 		zone_info->type = SPDK_BDEV_ZONE_TYPE_CNV;
366 		break;
367 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
368 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
369 		break;
370 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
371 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWP;
372 		break;
373 	default:
374 		SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", zones_rep->type);
375 		return -EIO;
376 	}
377 	return 0;
378 }
379 
380 static int
381 bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
382 {
383 	switch (zones_rep->cond) {
384 	case BLK_ZONE_COND_EMPTY:
385 		zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
386 		break;
387 	case BLK_ZONE_COND_IMP_OPEN:
388 		zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
389 		break;
390 	case BLK_ZONE_COND_EXP_OPEN:
391 		zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
392 		break;
393 	case BLK_ZONE_COND_CLOSED:
394 		zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
395 		break;
396 	case BLK_ZONE_COND_READONLY:
397 		zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
398 		break;
399 	case BLK_ZONE_COND_FULL:
400 		zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
401 		break;
402 	case BLK_ZONE_COND_OFFLINE:
403 		zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
404 		break;
405 	case BLK_ZONE_COND_NOT_WP:
406 		zone_info->state = SPDK_BDEV_ZONE_STATE_NOT_WP;
407 		break;
408 	default:
409 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond);
410 		return -EIO;
411 	}
412 	return 0;
413 }
414 
415 static int
416 bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
417 {
418 	struct bdev_uring *uring;
419 	struct blk_zone_range range;
420 	long unsigned zone_mgmt_op;
421 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
422 
423 	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
424 
425 	switch (bdev_io->u.zone_mgmt.zone_action) {
426 	case SPDK_BDEV_ZONE_RESET:
427 		zone_mgmt_op = BLKRESETZONE;
428 		break;
429 	case SPDK_BDEV_ZONE_OPEN:
430 		zone_mgmt_op = BLKOPENZONE;
431 		break;
432 	case SPDK_BDEV_ZONE_CLOSE:
433 		zone_mgmt_op = BLKCLOSEZONE;
434 		break;
435 	case SPDK_BDEV_ZONE_FINISH:
436 		zone_mgmt_op = BLKFINISHZONE;
437 		break;
438 	default:
439 		return -EINVAL;
440 	}
441 
442 	range.sector = (zone_id << uring->zd.lba_shift);
443 	range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift);
444 
445 	if (ioctl(uring->fd, zone_mgmt_op, &range)) {
446 		SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n",
447 			    bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno));
448 		return -EINVAL;
449 	}
450 
451 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
452 
453 	return 0;
454 }
455 
456 static int
457 bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
458 {
459 	struct bdev_uring *uring;
460 	struct blk_zone *zones;
461 	struct blk_zone_report *rep;
462 	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
463 	size_t repsize;
464 	uint32_t i, shift;
465 	uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones;
466 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
467 
468 	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
469 	shift = uring->zd.lba_shift;
470 
471 	if ((num_zones > uring->zd.num_zones) || !num_zones) {
472 		return -EINVAL;
473 	}
474 
475 	repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones);
476 	rep = (struct blk_zone_report *)malloc(repsize);
477 	if (!rep) {
478 		return -ENOMEM;
479 	}
480 
481 	zones = (struct blk_zone *)(rep + 1);
482 
483 	while (num_zones && ((zone_id >> uring->zd.zone_shift) <= num_zones)) {
484 		memset(rep, 0, repsize);
485 		rep->sector = zone_id;
486 		rep->nr_zones = num_zones;
487 
488 		if (ioctl(uring->fd, BLKREPORTZONE, rep)) {
489 			SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n",
490 				    errno, strerror(errno));
491 			free(rep);
492 			return -EINVAL;
493 		}
494 
495 		if (!rep->nr_zones) {
496 			break;
497 		}
498 
499 		for (i = 0; i < rep->nr_zones; i++) {
500 			zone_info->zone_id = ((zones + i)->start >> shift);
501 			zone_info->write_pointer = ((zones + i)->wp >> shift);
502 			zone_info->capacity = ((zones + i)->capacity >> shift);
503 
504 			bdev_uring_fill_zone_state(zone_info, zones + i);
505 			bdev_uring_fill_zone_type(zone_info, zones + i);
506 
507 			zone_id = ((zones + i)->start + (zones + i)->len) >> shift;
508 			zone_info++;
509 			num_zones--;
510 		}
511 	}
512 
513 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
514 	free(rep);
515 	return 0;
516 }
517 
518 static int
519 bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
520 {
521 	char str[128];
522 	long int val = 0;
523 	uint32_t zinfo;
524 	int retval = -1;
525 
526 	uring->bdev.zoned = false;
527 
528 	/* Check if this is a zoned block device */
529 	if (bdev_uring_read_sysfs_attr(filename, "queue/zoned", str, sizeof(str))) {
530 		SPDK_ERRLOG("Unable to open file %s/queue/zoned. errno: %d\n", filename, errno);
531 	} else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) {
532 		/* Only host-aware & host-managed zns devices */
533 		uring->bdev.zoned = true;
534 
535 		if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) {
536 			SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno));
537 			goto err_ret;
538 		}
539 		uring->zd.num_zones = zinfo;
540 
541 		if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) {
542 			SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno));
543 			goto err_ret;
544 		}
545 
546 		uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT;
547 		uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift);
548 		uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift);
549 
550 		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_open_zones", &val)) {
551 			SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", errno, strerror(errno));
552 			goto err_ret;
553 		}
554 		uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = (uint32_t)val;
555 
556 		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_active_zones", &val)) {
557 			SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", errno, strerror(errno));
558 			goto err_ret;
559 		}
560 		uring->bdev.max_active_zones = (uint32_t)val;
561 		retval = 0;
562 	} else {
563 		retval = 0;        /* queue/zoned=none */
564 	}
565 
566 err_ret:
567 	return retval;
568 }
569 #else
570 /* No support for zoned devices */
/* Built without SPDK_CONFIG_URING_ZNS: zone management always fails with -1. */
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
	(void)bdev_io;

	return -1;
}
576 
/* Built without SPDK_CONFIG_URING_ZNS: zone reports always fail with -1. */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	(void)bdev_io;

	return -1;
}
582 
/* Built without SPDK_CONFIG_URING_ZNS: nothing is probed and 0 is always returned. */
static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
	(void)uring;
	(void)name;
	(void)filename;

	return 0;
}
588 #endif
589 
590 static int
591 _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
592 {
593 
594 	switch (bdev_io->type) {
595 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
596 		return bdev_uring_zone_get_info(bdev_io);
597 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
598 		return bdev_uring_zone_management_op(bdev_io);
599 	/* Read and write operations must be performed on buffers aligned to
600 	 * bdev->required_alignment. If user specified unaligned buffers,
601 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
602 	case SPDK_BDEV_IO_TYPE_READ:
603 	case SPDK_BDEV_IO_TYPE_WRITE:
604 		spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
605 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
606 		return 0;
607 	default:
608 		return -1;
609 	}
610 }
611 
612 static void
613 bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
614 {
615 	if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
616 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
617 	}
618 }
619 
620 static bool
621 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
622 {
623 	switch (io_type) {
624 #ifdef SPDK_CONFIG_URING_ZNS
625 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
626 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
627 #endif
628 	case SPDK_BDEV_IO_TYPE_READ:
629 	case SPDK_BDEV_IO_TYPE_WRITE:
630 		return true;
631 	default:
632 		return false;
633 	}
634 }
635 
636 static int
637 bdev_uring_create_cb(void *io_device, void *ctx_buf)
638 {
639 	struct bdev_uring_io_channel *ch = ctx_buf;
640 
641 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
642 
643 	return 0;
644 }
645 
646 static void
647 bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
648 {
649 	struct bdev_uring_io_channel *ch = ctx_buf;
650 
651 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
652 }
653 
/* fn_table hook: the bdev_uring object itself is the io_device key for its channels. */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(ctx);
}
661 
662 static int
663 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
664 {
665 	struct bdev_uring *uring = ctx;
666 
667 	spdk_json_write_named_object_begin(w, "uring");
668 
669 	spdk_json_write_named_string(w, "filename", uring->filename);
670 
671 	spdk_json_write_object_end(w);
672 
673 	return 0;
674 }
675 
676 static void
677 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
678 {
679 	struct bdev_uring *uring = bdev->ctxt;
680 
681 	spdk_json_write_object_begin(w);
682 
683 	spdk_json_write_named_string(w, "method", "bdev_uring_create");
684 
685 	spdk_json_write_named_object_begin(w, "params");
686 	spdk_json_write_named_string(w, "name", bdev->name);
687 	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
688 	spdk_json_write_named_string(w, "filename", uring->filename);
689 	spdk_json_write_object_end(w);
690 
691 	spdk_json_write_object_end(w);
692 }
693 
694 static const struct spdk_bdev_fn_table uring_fn_table = {
695 	.destruct		= bdev_uring_destruct,
696 	.submit_request		= bdev_uring_submit_request,
697 	.io_type_supported	= bdev_uring_io_type_supported,
698 	.get_io_channel		= bdev_uring_get_io_channel,
699 	.dump_info_json		= bdev_uring_dump_info_json,
700 	.write_config_json	= bdev_uring_write_json_config,
701 };
702 
703 static void
704 uring_free_bdev(struct bdev_uring *uring)
705 {
706 	if (uring == NULL) {
707 		return;
708 	}
709 	free(uring->filename);
710 	free(uring->bdev.name);
711 	free(uring);
712 }
713 
714 static int
715 bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
716 {
717 	struct bdev_uring_group_channel *ch = ctx_buf;
718 
719 	/* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only
720 	 * local devices but also devices attached from remote target */
721 	if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) {
722 		SPDK_ERRLOG("uring I/O context setup failure\n");
723 		return -1;
724 	}
725 
726 	ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
727 	return 0;
728 }
729 
730 static void
731 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
732 {
733 	struct bdev_uring_group_channel *ch = ctx_buf;
734 
735 	io_uring_queue_exit(&ch->uring);
736 
737 	spdk_poller_unregister(&ch->poller);
738 }
739 
740 struct spdk_bdev *
741 create_uring_bdev(const char *name, const char *filename, uint32_t block_size)
742 {
743 	struct bdev_uring *uring;
744 	uint32_t detected_block_size;
745 	uint64_t bdev_size;
746 	int rc;
747 
748 	uring = calloc(1, sizeof(*uring));
749 	if (!uring) {
750 		SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
751 		return NULL;
752 	}
753 
754 	uring->filename = strdup(filename);
755 	if (!uring->filename) {
756 		goto error_return;
757 	}
758 
759 	if (bdev_uring_open(uring)) {
760 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, uring->fd, errno);
761 		goto error_return;
762 	}
763 
764 	bdev_size = spdk_fd_get_size(uring->fd);
765 
766 	uring->bdev.name = strdup(name);
767 	if (!uring->bdev.name) {
768 		goto error_return;
769 	}
770 	uring->bdev.product_name = "URING bdev";
771 	uring->bdev.module = &uring_if;
772 
773 	uring->bdev.write_cache = 0;
774 
775 	detected_block_size = spdk_fd_get_blocklen(uring->fd);
776 	if (block_size == 0) {
777 		/* User did not specify block size - use autodetected block size. */
778 		if (detected_block_size == 0) {
779 			SPDK_ERRLOG("Block size could not be auto-detected\n");
780 			goto error_return;
781 		}
782 		block_size = detected_block_size;
783 	} else {
784 		if (block_size < detected_block_size) {
785 			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
786 				    "auto-detected block size %" PRIu32 "\n",
787 				    block_size, detected_block_size);
788 			goto error_return;
789 		} else if (detected_block_size != 0 && block_size != detected_block_size) {
790 			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
791 				     "auto-detected block size %" PRIu32 "\n",
792 				     block_size, detected_block_size);
793 		}
794 	}
795 
796 	if (block_size < 512) {
797 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
798 		goto error_return;
799 	}
800 
801 	if (!spdk_u32_is_pow2(block_size)) {
802 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
803 		goto error_return;
804 	}
805 
806 	uring->bdev.blocklen = block_size;
807 	uring->bdev.required_alignment = spdk_u32log2(block_size);
808 
809 	rc = bdev_uring_check_zoned_support(uring, name, filename);
810 	if (rc) {
811 		goto error_return;
812 	}
813 
814 	if (bdev_size % uring->bdev.blocklen != 0) {
815 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
816 			    bdev_size, uring->bdev.blocklen);
817 		goto error_return;
818 	}
819 
820 	uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
821 	uring->bdev.ctxt = uring;
822 
823 	uring->bdev.fn_table = &uring_fn_table;
824 
825 	spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
826 				sizeof(struct bdev_uring_io_channel),
827 				uring->bdev.name);
828 	rc = spdk_bdev_register(&uring->bdev);
829 	if (rc) {
830 		spdk_io_device_unregister(uring, NULL);
831 		goto error_return;
832 	}
833 
834 	TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
835 	return &uring->bdev;
836 
837 error_return:
838 	bdev_uring_close(uring);
839 	uring_free_bdev(uring);
840 	return NULL;
841 }
842 
843 struct delete_uring_bdev_ctx {
844 	spdk_delete_uring_complete cb_fn;
845 	void *cb_arg;
846 };
847 
848 static void
849 uring_bdev_unregister_cb(void *arg, int bdeverrno)
850 {
851 	struct delete_uring_bdev_ctx *ctx = arg;
852 
853 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
854 	free(ctx);
855 }
856 
857 void
858 delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg)
859 {
860 	struct delete_uring_bdev_ctx *ctx;
861 	int rc;
862 
863 	ctx = calloc(1, sizeof(*ctx));
864 	if (ctx == NULL) {
865 		cb_fn(cb_arg, -ENOMEM);
866 		return;
867 	}
868 
869 	ctx->cb_fn = cb_fn;
870 	ctx->cb_arg = cb_arg;
871 	rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx);
872 	if (rc != 0) {
873 		uring_bdev_unregister_cb(ctx, rc);
874 	}
875 }
876 
877 static int
878 bdev_uring_init(void)
879 {
880 	spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
881 				sizeof(struct bdev_uring_group_channel), "uring_module");
882 
883 	return 0;
884 }
885 
886 static void
887 bdev_uring_fini(void)
888 {
889 	spdk_io_device_unregister(&uring_if, NULL);
890 }
891 
892 SPDK_LOG_REGISTER_COMPONENT(uring)
893