xref: /spdk/module/bdev/uring/bdev_uring.c (revision f387b7fe187572d4505323dfb7a5dc1318638dda)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2019 Intel Corporation.
3  *   All rights reserved.
4  */
5 
6 #include "bdev_uring.h"
7 
8 #include "spdk/stdinc.h"
9 #include "spdk/config.h"
10 #include "spdk/barrier.h"
11 #include "spdk/bdev.h"
12 #include "spdk/env.h"
13 #include "spdk/fd.h"
14 #include "spdk/likely.h"
15 #include "spdk/thread.h"
16 #include "spdk/json.h"
17 #include "spdk/util.h"
18 #include "spdk/string.h"
19 
20 #include "spdk/log.h"
21 #include "spdk_internal/uring.h"
22 
23 #ifdef SPDK_CONFIG_URING_ZNS
24 #include <linux/blkzoned.h>
25 #define SECTOR_SHIFT 9
26 #endif
27 
/* Zoned-device geometry kept alongside a uring bdev. The fields are filled in
 * by bdev_uring_check_zoned_support() when the device reports a zoned model. */
struct bdev_uring_zoned_dev {
	/* Zone count as returned by the BLKGETNRZONES ioctl. */
	uint64_t		num_zones;
	/* log2 of the zone size expressed in logical blocks. */
	uint32_t		zone_shift;
	/* log2(block size) minus SECTOR_SHIFT; shifts between logical blocks and sectors. */
	uint32_t		lba_shift;
};
33 
/* Context of an I/O channel opened on a uring bdev (its size is passed to
 * spdk_io_device_register() in create_uring_bdev()). */
struct bdev_uring_io_channel {
	/* Group channel context looked up in bdev_uring_create_cb(). */
	struct bdev_uring_group_channel		*group_ch;
};
37 
/* Context of a channel on the module-level io_device (&uring_if): one io_uring
 * instance plus the poller that submits to it and reaps completions from it. */
struct bdev_uring_group_channel {
	/* Requests handed to io_uring_submit() whose completions were not reaped yet. */
	uint64_t				io_inflight;
	/* SQEs prepared by readv/writev since the last successful io_uring_submit(). */
	uint64_t				io_pending;
	/* Poller running bdev_uring_group_poll() for this channel. */
	struct spdk_poller			*poller;
	/* The ring set up by io_uring_queue_init() in bdev_uring_group_create_cb(). */
	struct io_uring				uring;
};
44 
/* Per-I/O context stored in bdev_io->driver_ctx and attached to each SQE as
 * user data. */
struct bdev_uring_task {
	/* Expected transfer size in bytes; compared against cqe->res on completion. */
	uint64_t			len;
	/* Channel the request was queued on; used to reach the group channel counters. */
	struct bdev_uring_io_channel	*ch;
	/* List linkage. NOTE(review): no code visible in this file puts tasks on a list. */
	TAILQ_ENTRY(bdev_uring_task)	link;
};
50 
/* One uring-backed block device. */
struct bdev_uring {
	/* Generic bdev; SPDK_CONTAINEROF() is used to get back to this struct from it. */
	struct spdk_bdev	bdev;
	/* Zone geometry, populated only when the device is detected as zoned. */
	struct bdev_uring_zoned_dev	zd;
	/* Heap copy of the backing file path (released in uring_free_bdev()). */
	char			*filename;
	/* Descriptor of the backing file; bdev_uring_open()/bdev_uring_close() set it to -1 when not open. */
	int			fd;
	/* Linkage on g_uring_bdev_head. */
	TAILQ_ENTRY(bdev_uring)  link;
};
58 
/* Forward declarations for functions referenced before their definitions. */
static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);
/* All registered uring bdevs: entries are appended in create_uring_bdev() and
 * removed in bdev_uring_destruct(). */
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head);

/* Number of entries requested from io_uring_queue_init() for each group channel. */
#define SPDK_URING_QUEUE_DEPTH 512
/* NOTE(review): not referenced anywhere in the visible code. */
#define MAX_EVENTS_PER_POLL 32
66 
67 static int
68 bdev_uring_get_ctx_size(void)
69 {
70 	return sizeof(struct bdev_uring_task);
71 }
72 
/* Module descriptor for the "uring" bdev module. Its address also serves as the
 * module-level io_device registered in bdev_uring_init(). */
static struct spdk_bdev_module uring_if = {
	.name		= "uring",
	.module_init	= bdev_uring_init,
	.module_fini	= bdev_uring_fini,
	.get_ctx_size	= bdev_uring_get_ctx_size,
};

/* Register the module descriptor above with the bdev layer. */
SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)
81 
82 static int
83 bdev_uring_open(struct bdev_uring *bdev)
84 {
85 	int fd;
86 
87 	fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
88 	if (fd < 0) {
89 		/* Try without O_DIRECT for non-disk files */
90 		fd = open(bdev->filename, O_RDWR | O_NOATIME);
91 		if (fd < 0) {
92 			SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
93 				    bdev->filename, errno, spdk_strerror(errno));
94 			bdev->fd = -1;
95 			return -1;
96 		}
97 	}
98 
99 	bdev->fd = fd;
100 
101 	return 0;
102 }
103 
/* Event callback handed to spdk_bdev_open_ext() by bdev_uring_rescan(); it
 * intentionally ignores every event. */
static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}
108 
109 int
110 bdev_uring_rescan(const char *name)
111 {
112 	struct spdk_bdev_desc *desc;
113 	struct spdk_bdev *bdev;
114 	struct bdev_uring *uring;
115 	uint64_t uring_size, blockcnt;
116 	int rc;
117 
118 	rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
119 	if (rc != 0) {
120 		return rc;
121 	}
122 
123 	bdev = spdk_bdev_desc_get_bdev(desc);
124 	if (bdev->module != &uring_if) {
125 		rc = -ENODEV;
126 		goto exit;
127 	}
128 
129 	uring = SPDK_CONTAINEROF(bdev, struct bdev_uring, bdev);
130 	uring_size = spdk_fd_get_size(uring->fd);
131 	blockcnt = uring_size / bdev->blocklen;
132 
133 	if (bdev->blockcnt != blockcnt) {
134 		SPDK_NOTICELOG("URING device is resized: bdev name %s, old block count %" PRIu64
135 			       ", new block count %"
136 			       PRIu64 "\n",
137 			       uring->filename,
138 			       bdev->blockcnt,
139 			       blockcnt);
140 		rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
141 		if (rc != 0) {
142 			SPDK_ERRLOG("Could not change num blocks for uring bdev: name %s, errno: %d.\n",
143 				    uring->filename, rc);
144 			goto exit;
145 		}
146 	}
147 
148 exit:
149 	spdk_bdev_close(desc);
150 	return rc;
151 }
152 
153 static int
154 bdev_uring_close(struct bdev_uring *bdev)
155 {
156 	int rc;
157 
158 	if (bdev->fd == -1) {
159 		return 0;
160 	}
161 
162 	rc = close(bdev->fd);
163 	if (rc < 0) {
164 		SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
165 			    bdev->fd, errno, spdk_strerror(errno));
166 		return -1;
167 	}
168 
169 	bdev->fd = -1;
170 
171 	return 0;
172 }
173 
174 static int64_t
175 bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
176 		 struct bdev_uring_task *uring_task,
177 		 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
178 {
179 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
180 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
181 	struct io_uring_sqe *sqe;
182 
183 	sqe = io_uring_get_sqe(&group_ch->uring);
184 	if (!sqe) {
185 		SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
186 		return -ENOMEM;
187 	}
188 
189 	io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
190 	io_uring_sqe_set_data(sqe, uring_task);
191 	uring_task->len = nbytes;
192 	uring_task->ch = uring_ch;
193 
194 	SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n",
195 		      iovcnt, nbytes, offset);
196 
197 	group_ch->io_pending++;
198 	return nbytes;
199 }
200 
201 static int64_t
202 bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
203 		  struct bdev_uring_task *uring_task,
204 		  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
205 {
206 	struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
207 	struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
208 	struct io_uring_sqe *sqe;
209 
210 	sqe = io_uring_get_sqe(&group_ch->uring);
211 	if (!sqe) {
212 		SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
213 		return -ENOMEM;
214 	}
215 
216 	io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
217 	io_uring_sqe_set_data(sqe, uring_task);
218 	uring_task->len = nbytes;
219 	uring_task->ch = uring_ch;
220 
221 	SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n",
222 		      iovcnt, nbytes, offset);
223 
224 	group_ch->io_pending++;
225 	return nbytes;
226 }
227 
228 static int
229 bdev_uring_destruct(void *ctx)
230 {
231 	struct bdev_uring *uring = ctx;
232 	int rc = 0;
233 
234 	TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
235 	rc = bdev_uring_close(uring);
236 	if (rc < 0) {
237 		SPDK_ERRLOG("bdev_uring_close() failed\n");
238 	}
239 	spdk_io_device_unregister(uring, NULL);
240 	uring_free_bdev(uring);
241 	return rc;
242 }
243 
244 static int
245 bdev_uring_reap(struct io_uring *ring, int max)
246 {
247 	int i, count, ret;
248 	struct io_uring_cqe *cqe;
249 	struct bdev_uring_task *uring_task;
250 	enum spdk_bdev_io_status status;
251 
252 	count = 0;
253 	for (i = 0; i < max; i++) {
254 		ret = io_uring_peek_cqe(ring, &cqe);
255 		if (ret != 0) {
256 			return ret;
257 		}
258 
259 		if (cqe == NULL) {
260 			return count;
261 		}
262 
263 		uring_task = (struct bdev_uring_task *)cqe->user_data;
264 		if (cqe->res != (signed)uring_task->len) {
265 			status = SPDK_BDEV_IO_STATUS_FAILED;
266 		} else {
267 			status = SPDK_BDEV_IO_STATUS_SUCCESS;
268 		}
269 
270 		uring_task->ch->group_ch->io_inflight--;
271 		io_uring_cqe_seen(ring, cqe);
272 		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
273 		count++;
274 	}
275 
276 	return count;
277 }
278 
279 static int
280 bdev_uring_group_poll(void *arg)
281 {
282 	struct bdev_uring_group_channel *group_ch = arg;
283 	int to_complete, to_submit;
284 	int count, ret;
285 
286 	to_submit = group_ch->io_pending;
287 
288 	if (to_submit > 0) {
289 		/* If there are I/O to submit, use io_uring_submit here.
290 		 * It will automatically call spdk_io_uring_enter appropriately. */
291 		ret = io_uring_submit(&group_ch->uring);
292 		if (ret < 0) {
293 			return SPDK_POLLER_BUSY;
294 		}
295 
296 		group_ch->io_pending = 0;
297 		group_ch->io_inflight += to_submit;
298 	}
299 
300 	to_complete = group_ch->io_inflight;
301 	count = 0;
302 	if (to_complete > 0) {
303 		count = bdev_uring_reap(&group_ch->uring, to_complete);
304 	}
305 
306 	if (count + to_submit > 0) {
307 		return SPDK_POLLER_BUSY;
308 	} else {
309 		return SPDK_POLLER_IDLE;
310 	}
311 }
312 
313 static void
314 bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
315 		      bool success)
316 {
317 	int64_t ret = 0;
318 
319 	if (!success) {
320 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
321 		return;
322 	}
323 
324 	switch (bdev_io->type) {
325 	case SPDK_BDEV_IO_TYPE_READ:
326 		ret = bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
327 				       ch,
328 				       (struct bdev_uring_task *)bdev_io->driver_ctx,
329 				       bdev_io->u.bdev.iovs,
330 				       bdev_io->u.bdev.iovcnt,
331 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
332 				       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
333 		break;
334 	case SPDK_BDEV_IO_TYPE_WRITE:
335 		ret = bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
336 					ch,
337 					(struct bdev_uring_task *)bdev_io->driver_ctx,
338 					bdev_io->u.bdev.iovs,
339 					bdev_io->u.bdev.iovcnt,
340 					bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
341 					bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
342 		break;
343 	default:
344 		SPDK_ERRLOG("Wrong io type\n");
345 		break;
346 	}
347 
348 	if (ret == -ENOMEM) {
349 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
350 	}
351 }
352 
353 #ifdef SPDK_CONFIG_URING_ZNS
/*
 * Read the first line of /sys/block/<basename(devname)>/<attr> into `str`.
 *
 * `str` must have room for `str_len` bytes; the line is passed through
 * spdk_str_chomp() before returning.
 *
 * Returns 0 on success, -EINVAL when the path cannot be built or nothing can
 * be read, and -ENOENT when the sysfs file cannot be opened.
 */
static int
bdev_uring_read_sysfs_attr(const char *devname, const char *attr, char *str, int str_len)
{
	char *devname_copy;
	char *sysfs_path;
	FILE *fp;
	int rc = 0;

	/* basename() is given a private copy rather than the caller's string. */
	devname_copy = strdup(devname);
	if (devname_copy == NULL) {
		return -EINVAL;
	}

	sysfs_path = spdk_sprintf_alloc("/sys/block/%s/%s", basename(devname_copy), attr);
	free(devname_copy);
	if (sysfs_path == NULL) {
		return -EINVAL;
	}

	fp = fopen(sysfs_path, "r");
	if (fp == NULL) {
		free(sysfs_path);
		return -ENOENT;
	}

	if (fgets(str, str_len, fp) != NULL) {
		spdk_str_chomp(str);
	} else {
		rc = -EINVAL;
	}

	free(sysfs_path);
	fclose(fp);
	return rc;
}
392 
/*
 * Read a sysfs attribute of `devname` and parse it as a base-10 number.
 *
 * On success the parsed value is stored in *val and 0 is returned. A failure
 * to read the attribute, or a negative result from spdk_strtol(), is returned
 * as a negative value and *val is left untouched. Treating every negative
 * result as an error assumes spdk_strtol() reports parse failures as a
 * negative errno — see its documentation.
 */
static int
bdev_uring_read_sysfs_attr_long(const char *devname, const char *attr, long *val)
{
	char str[128];
	long parsed;
	int ret;

	ret = bdev_uring_read_sysfs_attr(devname, attr, str, sizeof(str));
	if (ret) {
		return ret;
	}

	parsed = spdk_strtol(str, 10);
	if (parsed < 0) {
		/* Do not hand a parse error back to the caller as if it were a value. */
		return (int)parsed;
	}

	*val = parsed;

	return 0;
}
408 
409 static int
410 bdev_uring_fill_zone_type(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
411 {
412 	switch (zones_rep->type) {
413 	case BLK_ZONE_TYPE_CONVENTIONAL:
414 		zone_info->type = SPDK_BDEV_ZONE_TYPE_CNV;
415 		break;
416 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
417 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
418 		break;
419 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
420 		zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWP;
421 		break;
422 	default:
423 		SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", zones_rep->type);
424 		return -EIO;
425 	}
426 	return 0;
427 }
428 
429 static int
430 bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
431 {
432 	switch (zones_rep->cond) {
433 	case BLK_ZONE_COND_EMPTY:
434 		zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
435 		break;
436 	case BLK_ZONE_COND_IMP_OPEN:
437 		zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
438 		break;
439 	case BLK_ZONE_COND_EXP_OPEN:
440 		zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
441 		break;
442 	case BLK_ZONE_COND_CLOSED:
443 		zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
444 		break;
445 	case BLK_ZONE_COND_READONLY:
446 		zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
447 		break;
448 	case BLK_ZONE_COND_FULL:
449 		zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
450 		break;
451 	case BLK_ZONE_COND_OFFLINE:
452 		zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
453 		break;
454 	case BLK_ZONE_COND_NOT_WP:
455 		zone_info->state = SPDK_BDEV_ZONE_STATE_NOT_WP;
456 		break;
457 	default:
458 		SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond);
459 		return -EIO;
460 	}
461 	return 0;
462 }
463 
464 static int
465 bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
466 {
467 	struct bdev_uring *uring;
468 	struct blk_zone_range range;
469 	long unsigned zone_mgmt_op;
470 	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
471 
472 	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
473 
474 	switch (bdev_io->u.zone_mgmt.zone_action) {
475 	case SPDK_BDEV_ZONE_RESET:
476 		zone_mgmt_op = BLKRESETZONE;
477 		break;
478 	case SPDK_BDEV_ZONE_OPEN:
479 		zone_mgmt_op = BLKOPENZONE;
480 		break;
481 	case SPDK_BDEV_ZONE_CLOSE:
482 		zone_mgmt_op = BLKCLOSEZONE;
483 		break;
484 	case SPDK_BDEV_ZONE_FINISH:
485 		zone_mgmt_op = BLKFINISHZONE;
486 		break;
487 	default:
488 		return -EINVAL;
489 	}
490 
491 	range.sector = (zone_id << uring->zd.lba_shift);
492 	range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift);
493 
494 	if (ioctl(uring->fd, zone_mgmt_op, &range)) {
495 		SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n",
496 			    bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno));
497 		return -EINVAL;
498 	}
499 
500 	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
501 
502 	return 0;
503 }
504 
/*
 * Serve a GET_ZONE_INFO request by issuing BLKREPORTZONE ioctls and converting
 * the reported zones into the caller's spdk_bdev_zone_info array
 * (bdev_io->u.zone_mgmt.buf).
 *
 * Returns 0 after completing the bdev_io with SUCCESS. Returns -EINVAL when
 * the requested zone count is 0 or larger than the device's zone count, or
 * when the ioctl fails, and -ENOMEM when the report buffer cannot be
 * allocated; in those cases the bdev_io is not completed by this function.
 */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	struct bdev_uring *uring;
	struct blk_zone *zones;
	struct blk_zone_report *rep;
	struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
	size_t repsize;
	uint32_t i, shift;
	uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones;
	uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;

	uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
	shift = uring->zd.lba_shift;

	if ((num_zones > uring->zd.num_zones) || !num_zones) {
		return -EINVAL;
	}

	/* One report header followed by room for every requested zone descriptor. */
	repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones);
	rep = (struct blk_zone_report *)malloc(repsize);
	if (!rep) {
		return -ENOMEM;
	}

	/* The zone descriptors start right after the report header. */
	zones = (struct blk_zone *)(rep + 1);

	/* Keep asking for reports until every requested zone has been filled in.
	 * NOTE(review): the second condition compares a zone index derived from
	 * zone_id with the remaining zone count - confirm this is intended. */
	while (num_zones && ((zone_id >> uring->zd.zone_shift) <= num_zones)) {
		memset(rep, 0, repsize);
		/* NOTE(review): zone_id is assigned unshifted here, while the values read
		 * back below are shifted right by lba_shift - confirm the units match. */
		rep->sector = zone_id;
		rep->nr_zones = num_zones;

		if (ioctl(uring->fd, BLKREPORTZONE, rep)) {
			SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n",
				    errno, strerror(errno));
			free(rep);
			return -EINVAL;
		}

		/* The kernel reported no further zones: stop. */
		if (!rep->nr_zones) {
			break;
		}

		for (i = 0; i < rep->nr_zones; i++) {
			/* Reported values are shifted right by lba_shift before being stored. */
			zone_info->zone_id = ((zones + i)->start >> shift);
			zone_info->write_pointer = ((zones + i)->wp >> shift);
			zone_info->capacity = ((zones + i)->capacity >> shift);

			/* The return values of both helpers are ignored, so an unknown
			 * state/type is only logged and the field is left unset. */
			bdev_uring_fill_zone_state(zone_info, zones + i);
			bdev_uring_fill_zone_type(zone_info, zones + i);

			/* Continue the next report right after the zone just handled. */
			zone_id = ((zones + i)->start + (zones + i)->len) >> shift;
			zone_info++;
			num_zones--;
		}
	}

	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	free(rep);
	return 0;
}
566 
567 static int
568 bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
569 {
570 	char str[128];
571 	long int val = 0;
572 	uint32_t zinfo;
573 	int retval = -1;
574 	struct stat sb;
575 	char resolved_path[PATH_MAX], *rp;
576 
577 	uring->bdev.zoned = false;
578 
579 	/* Follow symlink */
580 	if ((rp = realpath(filename, resolved_path))) {
581 		filename = rp;
582 	}
583 
584 	/* Perform check on block devices only */
585 	if (stat(filename, &sb) == 0 && S_ISBLK(sb.st_mode)) {
586 		return 0;
587 	}
588 
589 	/* Check if this is a zoned block device */
590 	if (bdev_uring_read_sysfs_attr(filename, "queue/zoned", str, sizeof(str))) {
591 		SPDK_ERRLOG("Unable to open file %s/queue/zoned. errno: %d\n", filename, errno);
592 	} else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) {
593 		/* Only host-aware & host-managed zns devices */
594 		uring->bdev.zoned = true;
595 
596 		if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) {
597 			SPDK_ERRLOG("ioctl BLKNRZONES failed %d (%s)\n", errno, strerror(errno));
598 			goto err_ret;
599 		}
600 		uring->zd.num_zones = zinfo;
601 
602 		if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) {
603 			SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno));
604 			goto err_ret;
605 		}
606 
607 		uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT;
608 		uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift);
609 		uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift);
610 
611 		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_open_zones", &val)) {
612 			SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", errno, strerror(errno));
613 			goto err_ret;
614 		}
615 		uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = (uint32_t)val;
616 
617 		if (bdev_uring_read_sysfs_attr_long(filename, "queue/max_active_zones", &val)) {
618 			SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", errno, strerror(errno));
619 			goto err_ret;
620 		}
621 		uring->bdev.max_active_zones = (uint32_t)val;
622 		retval = 0;
623 	} else {
624 		retval = 0;        /* queue/zoned=none */
625 	}
626 
627 err_ret:
628 	return retval;
629 }
630 #else
631 /* No support for zoned devices */
/* Stub used when SPDK_CONFIG_URING_ZNS is not defined: zone management
 * requests always fail with -1. */
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
	return -1;
}
637 
/* Stub used when SPDK_CONFIG_URING_ZNS is not defined: zone info requests
 * always fail with -1. */
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
	return -1;
}
643 
/* Stub used when SPDK_CONFIG_URING_ZNS is not defined: nothing is probed and
 * 0 is returned, leaving the bdev's zoned fields untouched. */
static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
	return 0;
}
649 #endif
650 
651 static int
652 _bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
653 {
654 
655 	switch (bdev_io->type) {
656 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
657 		return bdev_uring_zone_get_info(bdev_io);
658 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
659 		return bdev_uring_zone_management_op(bdev_io);
660 	/* Read and write operations must be performed on buffers aligned to
661 	 * bdev->required_alignment. If user specified unaligned buffers,
662 	 * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
663 	case SPDK_BDEV_IO_TYPE_READ:
664 	case SPDK_BDEV_IO_TYPE_WRITE:
665 		spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
666 				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
667 		return 0;
668 	default:
669 		return -1;
670 	}
671 }
672 
673 static void
674 bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
675 {
676 	if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
677 		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
678 	}
679 }
680 
681 static bool
682 bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
683 {
684 	switch (io_type) {
685 #ifdef SPDK_CONFIG_URING_ZNS
686 	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
687 	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
688 #endif
689 	case SPDK_BDEV_IO_TYPE_READ:
690 	case SPDK_BDEV_IO_TYPE_WRITE:
691 		return true;
692 	default:
693 		return false;
694 	}
695 }
696 
697 static int
698 bdev_uring_create_cb(void *io_device, void *ctx_buf)
699 {
700 	struct bdev_uring_io_channel *ch = ctx_buf;
701 
702 	ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));
703 
704 	return 0;
705 }
706 
707 static void
708 bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
709 {
710 	struct bdev_uring_io_channel *ch = ctx_buf;
711 
712 	spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
713 }
714 
/* fn_table .get_io_channel hook: `ctx` is the struct bdev_uring, which is also
 * the io_device the channel is requested for. */
static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
	return spdk_get_io_channel(ctx);
}
722 
723 static int
724 bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
725 {
726 	struct bdev_uring *uring = ctx;
727 
728 	spdk_json_write_named_object_begin(w, "uring");
729 
730 	spdk_json_write_named_string(w, "filename", uring->filename);
731 
732 	spdk_json_write_object_end(w);
733 
734 	return 0;
735 }
736 
737 static void
738 bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
739 {
740 	struct bdev_uring *uring = bdev->ctxt;
741 	char uuid_str[SPDK_UUID_STRING_LEN];
742 
743 	spdk_json_write_object_begin(w);
744 
745 	spdk_json_write_named_string(w, "method", "bdev_uring_create");
746 
747 	spdk_json_write_named_object_begin(w, "params");
748 	spdk_json_write_named_string(w, "name", bdev->name);
749 	spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
750 	spdk_json_write_named_string(w, "filename", uring->filename);
751 	spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
752 	spdk_json_write_named_string(w, "uuid", uuid_str);
753 	spdk_json_write_object_end(w);
754 
755 	spdk_json_write_object_end(w);
756 }
757 
/* Function table installed on every uring bdev in create_uring_bdev(). */
static const struct spdk_bdev_fn_table uring_fn_table = {
	.destruct		= bdev_uring_destruct,
	.submit_request		= bdev_uring_submit_request,
	.io_type_supported	= bdev_uring_io_type_supported,
	.get_io_channel		= bdev_uring_get_io_channel,
	.dump_info_json		= bdev_uring_dump_info_json,
	.write_config_json	= bdev_uring_write_json_config,
};
766 
767 static void
768 uring_free_bdev(struct bdev_uring *uring)
769 {
770 	if (uring == NULL) {
771 		return;
772 	}
773 	free(uring->filename);
774 	free(uring->bdev.name);
775 	free(uring);
776 }
777 
778 static int
779 bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
780 {
781 	struct bdev_uring_group_channel *ch = ctx_buf;
782 
783 	/* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only
784 	 * local devices but also devices attached from remote target */
785 	if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) {
786 		SPDK_ERRLOG("uring I/O context setup failure\n");
787 		return -1;
788 	}
789 
790 	ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
791 	return 0;
792 }
793 
794 static void
795 bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
796 {
797 	struct bdev_uring_group_channel *ch = ctx_buf;
798 
799 	io_uring_queue_exit(&ch->uring);
800 
801 	spdk_poller_unregister(&ch->poller);
802 }
803 
804 struct spdk_bdev *
805 create_uring_bdev(const struct bdev_uring_opts *opts)
806 {
807 	struct bdev_uring *uring;
808 	uint32_t detected_block_size;
809 	uint64_t bdev_size;
810 	int rc;
811 	uint32_t block_size = opts->block_size;
812 
813 	uring = calloc(1, sizeof(*uring));
814 	if (!uring) {
815 		SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
816 		return NULL;
817 	}
818 
819 	uring->filename = strdup(opts->filename);
820 	if (!uring->filename) {
821 		goto error_return;
822 	}
823 
824 	if (bdev_uring_open(uring)) {
825 		SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", opts->filename, uring->fd, errno);
826 		goto error_return;
827 	}
828 
829 	bdev_size = spdk_fd_get_size(uring->fd);
830 
831 	uring->bdev.name = strdup(opts->name);
832 	if (!uring->bdev.name) {
833 		goto error_return;
834 	}
835 	uring->bdev.product_name = "URING bdev";
836 	uring->bdev.module = &uring_if;
837 
838 	uring->bdev.write_cache = 0;
839 
840 	detected_block_size = spdk_fd_get_blocklen(uring->fd);
841 	if (block_size == 0) {
842 		/* User did not specify block size - use autodetected block size. */
843 		if (detected_block_size == 0) {
844 			SPDK_ERRLOG("Block size could not be auto-detected\n");
845 			goto error_return;
846 		}
847 		block_size = detected_block_size;
848 	} else {
849 		if (block_size < detected_block_size) {
850 			SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
851 				    "auto-detected block size %" PRIu32 "\n",
852 				    block_size, detected_block_size);
853 			goto error_return;
854 		} else if (detected_block_size != 0 && block_size != detected_block_size) {
855 			SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
856 				     "auto-detected block size %" PRIu32 "\n",
857 				     block_size, detected_block_size);
858 		}
859 	}
860 
861 	if (block_size < 512) {
862 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
863 		goto error_return;
864 	}
865 
866 	if (!spdk_u32_is_pow2(block_size)) {
867 		SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
868 		goto error_return;
869 	}
870 
871 	uring->bdev.blocklen = block_size;
872 	uring->bdev.required_alignment = spdk_u32log2(block_size);
873 
874 	rc = bdev_uring_check_zoned_support(uring, opts->name, opts->filename);
875 	if (rc) {
876 		goto error_return;
877 	}
878 
879 	if (bdev_size % uring->bdev.blocklen != 0) {
880 		SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
881 			    bdev_size, uring->bdev.blocklen);
882 		goto error_return;
883 	}
884 
885 	uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
886 	uring->bdev.ctxt = uring;
887 
888 	uring->bdev.fn_table = &uring_fn_table;
889 
890 	if (!spdk_mem_all_zero(&opts->uuid, sizeof(opts->uuid))) {
891 		spdk_uuid_copy(&uring->bdev.uuid, &opts->uuid);
892 	}
893 
894 	spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
895 				sizeof(struct bdev_uring_io_channel),
896 				uring->bdev.name);
897 	rc = spdk_bdev_register(&uring->bdev);
898 	if (rc) {
899 		spdk_io_device_unregister(uring, NULL);
900 		goto error_return;
901 	}
902 
903 	TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
904 	return &uring->bdev;
905 
906 error_return:
907 	bdev_uring_close(uring);
908 	uring_free_bdev(uring);
909 	return NULL;
910 }
911 
/* Carries the caller's completion callback and its argument through the
 * unregister request started by delete_uring_bdev(). */
struct delete_uring_bdev_ctx {
	/* Callback to invoke once the unregister has finished (or failed). */
	spdk_delete_uring_complete cb_fn;
	/* Opaque argument passed back to cb_fn. */
	void *cb_arg;
};
916 
917 static void
918 uring_bdev_unregister_cb(void *arg, int bdeverrno)
919 {
920 	struct delete_uring_bdev_ctx *ctx = arg;
921 
922 	ctx->cb_fn(ctx->cb_arg, bdeverrno);
923 	free(ctx);
924 }
925 
926 void
927 delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg)
928 {
929 	struct delete_uring_bdev_ctx *ctx;
930 	int rc;
931 
932 	ctx = calloc(1, sizeof(*ctx));
933 	if (ctx == NULL) {
934 		cb_fn(cb_arg, -ENOMEM);
935 		return;
936 	}
937 
938 	ctx->cb_fn = cb_fn;
939 	ctx->cb_arg = cb_arg;
940 	rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx);
941 	if (rc != 0) {
942 		uring_bdev_unregister_cb(ctx, rc);
943 	}
944 }
945 
/* Module init hook: register the module descriptor itself as an io_device
 * whose channels carry a struct bdev_uring_group_channel. Always returns 0. */
static int
bdev_uring_init(void)
{
	spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
				sizeof(struct bdev_uring_group_channel), "uring_module");

	return 0;
}
954 
/* Module fini hook: unregister the module-level io_device registered in
 * bdev_uring_init(); no completion callback is requested. */
static void
bdev_uring_fini(void)
{
	spdk_io_device_unregister(&uring_if, NULL);
}
960 
/* Registers the "uring" log component named by the SPDK_DEBUGLOG(uring, ...) calls in this file. */
SPDK_LOG_REGISTER_COMPONENT(uring)
962